
// . TODO: do not cache if less than the 20k thing again.
// . TODO: nuke doledb every couple hours.
// CollectionRec::m_doledbRefreshRateInSecs. but how would this work
// for crawlbot jobs where we got 10,000 collections? i'd turn this off.
// we could selectively update certain firstips in doledb that have
// been in doledb for a long time.
// i'd like to see how many collections are actually active
// for diffbot first though.
// TODO: add m_downloadTimeTable to measure download speed of an IP
// TODO: consider a "latestpubdateage" in url filters for pages that are
// adding new dates (not clocks) all the time
#include "gb-include.h"
#include "Spider.h"
#include "Msg5.h"
#include "Collectiondb.h"
#include "XmlDoc.h" // score8to32()
#include "Stats.h"
#include "SafeBuf.h"
#include "Repair.h"
#include "CountryCode.h"
#include "DailyMerge.h"
#include "Process.h"
#include "Test.h" // g_test
#include "Threads.h"
#include "XmlDoc.h"
#include "HttpServer.h"
#include "Pages.h"
#include "Parms.h"
#include "Rebalance.h"
void testWinnerTreeKey ( ) ;
// . this was 10 but cpu is getting pegged, so i set to 45
// . we consider the collection done spidering when no urls to spider
// for this many seconds
// . i'd like to set back to 10 for speed... maybe even 5 or less
// . back to 30 from 20 to try to fix crawls thinking they are done
// maybe because of the empty doledb logic taking too long?
//#define SPIDER_DONE_TIMER 30
// try 45 to prevent false revivals
//#define SPIDER_DONE_TIMER 45
// try 30 again since we have new localcrawlinfo update logic much faster
//#define SPIDER_DONE_TIMER 30
// neo under heavy load go to 60
//#define SPIDER_DONE_TIMER 60
// super overloaded
//#define SPIDER_DONE_TIMER 90
#define SPIDER_DONE_TIMER 20
// seems like timecity.com has gigabytes of spiderdb data so up from 40 to 400
//#define MAX_WINNER_NODES 400
// up it to 2000 because shard #15 has slow disk reads and some collections
// are taking forever to spider because the spiderdb scan is so slow.
// we reduce this below if the spiderdb is smaller.
#define MAX_WINNER_NODES 2000
Doledb g_doledb;
RdbTree *g_tree = NULL;
SpiderRequest *g_sreq = NULL;
int32_t g_corruptCount = 0;
static char s_countsAreValid = 1;
/////////////////////////
///////////////////////// SPIDEREC
/////////////////////////
void SpiderRequest::setKey (int32_t firstIp,
int64_t parentDocId,
int64_t uh48,
bool isDel) {
// sanity
if ( firstIp == 0 || firstIp == -1 ) { char *xx=NULL;*xx=0; }
m_key = g_spiderdb.makeKey ( firstIp,uh48,true,parentDocId , isDel );
// set dataSize too!
setDataSize();
}
void SpiderRequest::setDataSize ( ) {
m_dataSize = (m_url - (char *)this) + gbstrlen(m_url) + 1
// subtract m_key and m_dataSize
- sizeof(key128_t) - 4 ;
}
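// Record layout sketch (derived from the arithmetic above, not from
// separate docs): an on-disk SpiderRequest is
//   [ key128_t (16 bytes) ][ dataSize (4 bytes) ][ fixed fields ... m_url\0 ]
// and m_dataSize counts only the bytes after the 4-byte size field, i.e.
// everything from the first fixed field through m_url's NUL terminator.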
int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
SafeBuf *sb = sbarg;
SafeBuf tmp;
if ( ! sb ) sb = &tmp;
//sb->safePrintf("k.n1=0x%"XINT64" ",m_key.n1);
//sb->safePrintf("k.n0=0x%"XINT64" ",m_key.n0);
sb->safePrintf("k=%s ",KEYSTR(this,
getKeySizeFromRdbId(RDB_SPIDERDB)));
// indicate it's a request not a reply
sb->safePrintf("REQ ");
sb->safePrintf("uh48=%"UINT64" ",getUrlHash48());
// if negative (a delete) bail early now
if ( (m_key.n0 & 0x01) == 0x00 ) {
sb->safePrintf("[DELETE]");
if ( ! sbarg ) printf("%s",sb->getBufStart() );
return sb->length();
}
sb->safePrintf("recsize=%"INT32" ",getRecSize());
sb->safePrintf("parentDocId=%"UINT64" ",getParentDocId());
sb->safePrintf("firstip=%s ",iptoa(m_firstIp) );
sb->safePrintf("hostHash32=0x%"XINT32" ",m_hostHash32 );
sb->safePrintf("domHash32=0x%"XINT32" ",m_domHash32 );
sb->safePrintf("siteHash32=0x%"XINT32" ",m_siteHash32 );
sb->safePrintf("siteNumInlinks=%"INT32" ",m_siteNumInlinks );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
char time[256];
time_t ts = (time_t)m_addedTime;
timeStruct = gmtime ( &ts );
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
sb->safePrintf("addedTime=%s(%"UINT32") ",time,(uint32_t)m_addedTime );
//sb->safePrintf("parentFirstIp=%s ",iptoa(m_parentFirstIp) );
sb->safePrintf("pageNumInlinks=%i ",(int)m_pageNumInlinks);
sb->safePrintf("parentHostHash32=0x%"XINT32" ",m_parentHostHash32 );
sb->safePrintf("parentDomHash32=0x%"XINT32" ",m_parentDomHash32 );
sb->safePrintf("parentSiteHash32=0x%"XINT32" ",m_parentSiteHash32 );
sb->safePrintf("hopCount=%"INT32" ",(int32_t)m_hopCount );
//timeStruct = gmtime ( &m_spiderTime );
//time[0] = 0;
//if ( m_spiderTime ) strftime (time,256,"%b %e %T %Y UTC",timeStruct);
//sb->safePrintf("spiderTime=%s(%"UINT32") ",time,m_spiderTime);
//timeStruct = gmtime ( &m_pubDate );
//time[0] = 0;
//if ( m_pubDate ) strftime (time,256,"%b %e %T %Y UTC",timeStruct);
//sb->safePrintf("pubDate=%s(%"UINT32") ",time,m_pubDate );
sb->safePrintf("ufn=%"INT32" ", (int32_t)m_ufn);
// why was this unsigned?
sb->safePrintf("priority=%"INT32" ", (int32_t)m_priority);
//sb->safePrintf("errCode=%s(%"UINT32") ",mstrerror(m_errCode),m_errCode );
//sb->safePrintf("crawlDelay=%"INT32"ms ",m_crawlDelay );
//sb->safePrintf("httpStatus=%"INT32" ",(int32_t)m_httpStatus );
//sb->safePrintf("retryNum=%"INT32" ",(int32_t)m_retryNum );
//sb->safePrintf("langId=%s(%"INT32") ",
// getLanguageString(m_langId),(int32_t)m_langId );
//sb->safePrintf("percentChanged=%"INT32"%% ",(int32_t)m_percentChanged );
if ( m_isNewOutlink ) sb->safePrintf("ISNEWOUTLINK ");
if ( m_isAddUrl ) sb->safePrintf("ISADDURL ");
if ( m_isPageReindex ) sb->safePrintf("ISPAGEREINDEX ");
if ( m_isPageParser ) sb->safePrintf("ISPAGEPARSER ");
if ( m_urlIsDocId ) sb->safePrintf("URLISDOCID ");
if ( m_isRSSExt ) sb->safePrintf("ISRSSEXT ");
if ( m_isUrlPermalinkFormat ) sb->safePrintf("ISURLPERMALINKFORMAT ");
if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER ");
if ( m_fakeFirstIp ) sb->safePrintf("ISFAKEFIRSTIP ");
if ( m_isInjecting ) sb->safePrintf("ISINJECTING ");
if ( m_forceDelete ) sb->safePrintf("FORCEDELETE ");
if ( m_sameDom ) sb->safePrintf("SAMEDOM ");
if ( m_sameHost ) sb->safePrintf("SAMEHOST ");
if ( m_sameSite ) sb->safePrintf("SAMESITE ");
if ( m_wasParentIndexed ) sb->safePrintf("WASPARENTINDEXED ");
if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS ");
if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK ");
if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER ");
if ( m_parentIsSiteMap ) sb->safePrintf("PARENTISSITEMAP ");
if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK ");
if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS ");
//if ( m_fromSections ) sb->safePrintf("FROMSECTIONS ");
if ( m_isScraping ) sb->safePrintf("ISSCRAPING ");
if ( m_hasContent ) sb->safePrintf("HASCONTENT ");
if ( m_inGoogle ) sb->safePrintf("INGOOGLE ");
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
if ( m_isWWWSubdomain ) sb->safePrintf("WWWSUBDOMAIN ");
if ( m_avoidSpiderLinks ) sb->safePrintf("AVOIDSPIDERLINKS ");
//if ( m_inOrderTree ) sb->safePrintf("INORDERTREE ");
//if ( m_doled ) sb->safePrintf("DOLED ");
//uint32_t gid = g_spiderdb.getGroupId(m_firstIp);
int32_t shardNum = g_hostdb.getShardNum(RDB_SPIDERDB,this);
sb->safePrintf("shardnum=%"UINT32" ",(uint32_t)shardNum);
sb->safePrintf("url=%s",m_url);
if ( ! sbarg )
printf("%s",sb->getBufStart() );
return sb->length();
}
void SpiderReply::setKey (int32_t firstIp,
int64_t parentDocId,
int64_t uh48,
bool isDel) {
m_key = g_spiderdb.makeKey ( firstIp,uh48,false,parentDocId , isDel );
// set dataSize too!
m_dataSize = sizeof(SpiderReply) - sizeof(key128_t) - 4;
}
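// note (derived from the line above): a SpiderReply has no variable-length
// url stored with it, so its dataSize is just the fixed struct minus the
// key and the 4-byte size field, unlike SpiderRequest::setDataSize().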
int32_t SpiderReply::print ( SafeBuf *sbarg ) {
SafeBuf *sb = sbarg;
SafeBuf tmp;
if ( ! sb ) sb = &tmp;
//sb->safePrintf("k.n1=0x%llx ",m_key.n1);
//sb->safePrintf("k.n0=0x%llx ",m_key.n0);
sb->safePrintf("k=%s ",KEYSTR(this,sizeof(SPIDERDBKEY)));
// indicate it's a reply
sb->safePrintf("REP ");
sb->safePrintf("uh48=%"UINT64" ",getUrlHash48());
sb->safePrintf("parentDocId=%"UINT64" ",getParentDocId());
// if negative (a delete) bail early now
if ( (m_key.n0 & 0x01) == 0x00 ) {
sb->safePrintf("[DELETE]");
if ( ! sbarg ) printf("%s",sb->getBufStart() );
return sb->length();
}
sb->safePrintf("firstip=%s ",iptoa(m_firstIp) );
sb->safePrintf("percentChangedPerDay=%.02f%% ",m_percentChangedPerDay);
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
char time[256];
time_t ts = (time_t)m_spideredTime;
timeStruct = gmtime ( &ts );
time[0] = 0;
if ( m_spideredTime ) strftime (time,256,"%b %e %T %Y UTC",timeStruct);
sb->safePrintf("spideredTime=%s(%"UINT32") ",time,
(uint32_t)m_spideredTime);
sb->safePrintf("siteNumInlinks=%"INT32" ",m_siteNumInlinks );
time_t ts2 = (time_t)m_pubDate;
timeStruct = gmtime ( &ts2 );
time[0] = 0;
if ( m_pubDate != 0 && m_pubDate != -1 )
strftime (time,256,"%b %e %T %Y UTC",timeStruct);
sb->safePrintf("pubDate=%s(%"INT32") ",time,m_pubDate );
//sb->safePrintf("newRequests=%"INT32" ",m_newRequests );
sb->safePrintf("ch32=%"UINT32" ",(uint32_t)m_contentHash32);
sb->safePrintf("crawldelayms=%"INT32"ms ",m_crawlDelayMS );
sb->safePrintf("httpStatus=%"INT32" ",(int32_t)m_httpStatus );
sb->safePrintf("langId=%s(%"INT32") ",
getLanguageString(m_langId),(int32_t)m_langId );
if ( m_errCount )
sb->safePrintf("errCount=%"INT32" ",(int32_t)m_errCount);
sb->safePrintf("errCode=%s(%"UINT32") ",mstrerror(m_errCode),
(uint32_t)m_errCode );
//if ( m_isSpam ) sb->safePrintf("ISSPAM ");
if ( m_isRSS ) sb->safePrintf("ISRSS ");
if ( m_isPermalink ) sb->safePrintf("ISPERMALINK ");
if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER ");
//if ( m_deleted ) sb->safePrintf("DELETED ");
if ( m_isIndexed && ! m_isIndexedINValid) sb->safePrintf("ISINDEXED ");
if ( m_hasAddress ) sb->safePrintf("HASADDRESS ");
if ( m_hasTOD ) sb->safePrintf("HASTOD ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
//sb->safePrintf("url=%s",m_url);
if ( ! sbarg )
printf("%s",sb->getBufStart() );
return sb->length();
}
int32_t SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
XmlDoc *xd , int32_t row ) {
sb->safePrintf("<tr bgcolor=#%s>\n",LIGHT_BLUE);
// show elapsed time
if ( xd ) {
int64_t now = gettimeofdayInMilliseconds();
int64_t elapsed = now - xd->m_startTime;
sb->safePrintf(" <td>%"INT32"</td>\n",row);
sb->safePrintf(" <td>%"INT64"ms</td>\n",elapsed);
collnum_t collnum = xd->m_collnum;
CollectionRec *cr = g_collectiondb.getRec(collnum);
char *cs = ""; if ( cr ) cs = cr->m_coll;
// sb->safePrintf(" <td><a href=/crawlbot?c=%s>%"INT32"</a></td>\n",
// cs,(int32_t)collnum);
//sb->safePrintf(" <td><a href=/crawlbot?c=%s>%s</a></td>\n",
// cs,cs);
sb->safePrintf(" <td><a href=/search?c=%s&q=url%%3A%s>%s</a>"
"</td>\n",cs,m_url,cs);
}
sb->safePrintf(" <td><a href=%s><nobr>",m_url);
sb->safeTruncateEllipsis ( m_url , 64 );
sb->safePrintf("</nobr></a></td>\n");
sb->safePrintf(" <td><nobr>%s</nobr></td>\n",status );
sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_priority);
sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_ufn);
sb->safePrintf(" <td>%s</td>\n",iptoa(m_firstIp) );
sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_errCount );
sb->safePrintf(" <td>%"UINT64"</td>\n",getUrlHash48());
//sb->safePrintf(" <td>0x%"XINT32"</td>\n",m_hostHash32 );
//sb->safePrintf(" <td>0x%"XINT32"</td>\n",m_domHash32 );
//sb->safePrintf(" <td>0x%"XINT32"</td>\n",m_siteHash32 );
sb->safePrintf(" <td>%"INT32"</td>\n",m_siteNumInlinks );
//sb->safePrintf(" <td>%"INT32"</td>\n",m_pageNumInlinks );
sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_hopCount );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
char time[256];
time_t ts3 = (time_t)m_addedTime;
timeStruct = gmtime ( &ts3 );
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
sb->safePrintf(" <td><nobr>%s(%"UINT32")</nobr></td>\n",time,
(uint32_t)m_addedTime);
//timeStruct = gmtime ( &m_pubDate );
//time[0] = 0;
//if ( m_pubDate ) strftime (time,256,"%b %e %T %Y UTC",timeStruct);
//sb->safePrintf(" <td>%s(%"UINT32")</td>\n",time,m_pubDate );
//sb->safePrintf(" <td>%s(%"UINT32")</td>\n",mstrerror(m_errCode),m_errCode);
//sb->safePrintf(" <td>%"INT32"ms</td>\n",m_crawlDelay );
sb->safePrintf(" <td>%i</td>\n",(int)m_pageNumInlinks);
sb->safePrintf(" <td>%"UINT64"</td>\n",getParentDocId() );
//sb->safePrintf(" <td>0x%"XINT32"</td>\n",m_parentHostHash32);
//sb->safePrintf(" <td>0x%"XINT32"</td>\n",m_parentDomHash32 );
//sb->safePrintf(" <td>0x%"XINT32"</td>\n",m_parentSiteHash32 );
//sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_httpStatus );
//sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_retryNum );
//sb->safePrintf(" <td>%s(%"INT32")</td>\n",
// getLanguageString(m_langId),(int32_t)m_langId );
//sb->safePrintf(" <td>%"INT32"%%</td>\n",(int32_t)m_percentChanged );
sb->safePrintf(" <td><nobr>");
if ( m_isNewOutlink ) sb->safePrintf("ISNEWOUTLINK ");
if ( m_isAddUrl ) sb->safePrintf("ISADDURL ");
if ( m_isPageReindex ) sb->safePrintf("ISPAGEREINDEX ");
if ( m_isPageParser ) sb->safePrintf("ISPAGEPARSER ");
if ( m_urlIsDocId ) sb->safePrintf("URLISDOCID ");
if ( m_isRSSExt ) sb->safePrintf("ISRSSEXT ");
if ( m_isUrlPermalinkFormat ) sb->safePrintf("ISURLPERMALINKFORMAT ");
if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER ");
if ( m_isInjecting ) sb->safePrintf("ISINJECTING ");
if ( m_forceDelete ) sb->safePrintf("FORCEDELETE ");
if ( m_sameDom ) sb->safePrintf("SAMEDOM ");
if ( m_sameHost ) sb->safePrintf("SAMEHOST ");
if ( m_sameSite ) sb->safePrintf("SAMESITE ");
if ( m_wasParentIndexed ) sb->safePrintf("WASPARENTINDEXED ");
if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS ");
if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK ");
if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER ");
if ( m_parentIsSiteMap ) sb->safePrintf("PARENTISSITEMAP ");
if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK ");
if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS ");
//if ( m_fromSections ) sb->safePrintf("FROMSECTIONS ");
if ( m_isScraping ) sb->safePrintf("ISSCRAPING ");
if ( m_hasContent ) sb->safePrintf("HASCONTENT ");
if ( m_inGoogle ) sb->safePrintf("INGOOGLE ");
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
//if ( m_inOrderTree ) sb->safePrintf("INORDERTREE ");
//if ( m_doled ) sb->safePrintf("DOLED ");
sb->safePrintf("</nobr></td>\n");
sb->safePrintf("</tr>\n");
return sb->length();
}
int32_t SpiderRequest::printTableHeaderSimple ( SafeBuf *sb ,
bool currentlySpidering) {
sb->safePrintf("<tr bgcolor=#%s>\n",DARK_BLUE);
// how long it's been spidering
if ( currentlySpidering ) {
sb->safePrintf(" <td><b>#</b></td>\n");
sb->safePrintf(" <td><b>elapsed</b></td>\n");
sb->safePrintf(" <td><b>coll</b></td>\n");
}
sb->safePrintf(" <td><b>url</b></td>\n");
sb->safePrintf(" <td><b>status</b></td>\n");
sb->safePrintf(" <td><b>first IP</b></td>\n");
sb->safePrintf(" <td><b>crawlDelay</b></td>\n");
sb->safePrintf(" <td><b>pri</b></td>\n");
sb->safePrintf(" <td><b>errCount</b></td>\n");
sb->safePrintf(" <td><b>hops</b></td>\n");
sb->safePrintf(" <td><b>addedTime</b></td>\n");
//sb->safePrintf(" <td><b>flags</b></td>\n");
sb->safePrintf("</tr>\n");
return sb->length();
}
int32_t SpiderRequest::printToTableSimple ( SafeBuf *sb , char *status ,
XmlDoc *xd , int32_t row ) {
sb->safePrintf("<tr bgcolor=#%s>\n",LIGHT_BLUE);
// show elapsed time
if ( xd ) {
int64_t now = gettimeofdayInMilliseconds();
int64_t elapsed = now - xd->m_startTime;
sb->safePrintf(" <td>%"INT32"</td>\n",row);
sb->safePrintf(" <td>%"INT64"ms</td>\n",elapsed);
// print collection
CollectionRec *cr = g_collectiondb.getRec ( xd->m_collnum );
char *coll = "";
if ( cr ) coll = cr->m_coll;
sb->safePrintf("<td>%s</td>",coll);
}
sb->safePrintf(" <td><nobr>");
sb->safeTruncateEllipsis ( m_url , 64 );
sb->safePrintf("</nobr></td>\n");
sb->safePrintf(" <td><nobr>%s</nobr></td>\n",status );
sb->safePrintf(" <td>%s</td>\n",iptoa(m_firstIp));
if ( xd && xd->m_crawlDelayValid && xd->m_crawlDelay >= 0 )
sb->safePrintf(" <td>%"INT32" ms</td>\n",xd->m_crawlDelay);
else
sb->safePrintf(" <td>--</td>\n");
sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_priority);
sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_errCount );
sb->safePrintf(" <td>%"INT32"</td>\n",(int32_t)m_hopCount );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
char time[256];
time_t ts4 = (time_t)m_addedTime;
timeStruct = gmtime ( &ts4 );
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
sb->safePrintf(" <td><nobr>%s(%"UINT32")</nobr></td>\n",time,
(uint32_t)m_addedTime);
/*
sb->safePrintf(" <td><nobr>");
if ( m_isNewOutlink ) sb->safePrintf("ISNEWOUTLINK ");
if ( m_isAddUrl ) sb->safePrintf("ISADDURL ");
if ( m_isPageReindex ) sb->safePrintf("ISPAGEREINDEX ");
if ( m_isPageParser ) sb->safePrintf("ISPAGEPARSER ");
if ( m_urlIsDocId ) sb->safePrintf("URLISDOCID ");
if ( m_isRSSExt ) sb->safePrintf("ISRSSEXT ");
if ( m_isUrlPermalinkFormat ) sb->safePrintf("ISURLPERMALINKFORMAT ");
if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER ");
if ( m_isInjecting ) sb->safePrintf("ISINJECTING ");
if ( m_forceDelete ) sb->safePrintf("FORCEDELETE ");
if ( m_sameDom ) sb->safePrintf("SAMEDOM ");
if ( m_sameHost ) sb->safePrintf("SAMEHOST ");
if ( m_sameSite ) sb->safePrintf("SAMESITE ");
if ( m_wasParentIndexed ) sb->safePrintf("WASPARENTINDEXED ");
if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS ");
if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK ");
if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER ");
if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK ");
if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS ");
//if ( m_fromSections ) sb->safePrintf("FROMSECTIONS ");
if ( m_isScraping ) sb->safePrintf("ISSCRAPING ");
if ( m_hasContent ) sb->safePrintf("HASCONTENT ");
if ( m_inGoogle ) sb->safePrintf("INGOOGLE ");
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
sb->safePrintf("</nobr></td>\n");
*/
sb->safePrintf("</tr>\n");
return sb->length();
}
int32_t SpiderRequest::printTableHeader ( SafeBuf *sb , bool currentlySpidering) {
sb->safePrintf("<tr bgcolor=#%s>\n",DARK_BLUE);
// how long it's been spidering
if ( currentlySpidering ) {
sb->safePrintf(" <td><b>#</b></td>\n");
sb->safePrintf(" <td><b>elapsed</b></td>\n");
sb->safePrintf(" <td><b>coll</b></td>\n");
}
sb->safePrintf(" <td><b>url</b></td>\n");
sb->safePrintf(" <td><b>status</b></td>\n");
sb->safePrintf(" <td><b>pri</b></td>\n");
sb->safePrintf(" <td><b>ufn</b></td>\n");
sb->safePrintf(" <td><b>firstIp</b></td>\n");
sb->safePrintf(" <td><b>errCount</b></td>\n");
sb->safePrintf(" <td><b>urlHash48</b></td>\n");
//sb->safePrintf(" <td><b>hostHash32</b></td>\n");
//sb->safePrintf(" <td><b>domHash32</b></td>\n");
//sb->safePrintf(" <td><b>siteHash32</b></td>\n");
sb->safePrintf(" <td><b>siteInlinks</b></td>\n");
//sb->safePrintf(" <td><b>pageNumInlinks</b></td>\n");
sb->safePrintf(" <td><b>hops</b></td>\n");
sb->safePrintf(" <td><b>addedTime</b></td>\n");
//sb->safePrintf(" <td><b>lastAttempt</b></td>\n");
//sb->safePrintf(" <td><b>pubDate</b></td>\n");
//sb->safePrintf(" <td><b>errCode</b></td>\n");
//sb->safePrintf(" <td><b>crawlDelay</b></td>\n");
sb->safePrintf(" <td><b>parentIp</b></td>\n");
sb->safePrintf(" <td><b>parentDocId</b></td>\n");
//sb->safePrintf(" <td><b>parentHostHash32</b></td>\n");
//sb->safePrintf(" <td><b>parentDomHash32</b></td>\n");
//sb->safePrintf(" <td><b>parentSiteHash32</b></td>\n");
//sb->safePrintf(" <td><b>httpStatus</b></td>\n");
//sb->safePrintf(" <td><b>retryNum</b></td>\n");
//sb->safePrintf(" <td><b>langId</b></td>\n");
//sb->safePrintf(" <td><b>percentChanged</b></td>\n");
sb->safePrintf(" <td><b>flags</b></td>\n");
sb->safePrintf("</tr>\n");
return sb->length();
}
/////////////////////////
///////////////////////// SPIDERDB
/////////////////////////
// a global class extern'd in .h file
Spiderdb g_spiderdb;
Spiderdb g_spiderdb2;
// reset rdb
void Spiderdb::reset() { m_rdb.reset(); }
// print the spider rec
int32_t Spiderdb::print( char *srec , SafeBuf *sb ) {
// get if request or reply and print it
if ( isSpiderRequest ( (key128_t *)srec ) )
((SpiderRequest *)srec)->print(sb);
else
((SpiderReply *)srec)->print(sb);
return 0;
}
bool Spiderdb::init ( ) {
int32_t maxMem = 200000000;
// . what's max # of tree nodes?
// . assume avg spider rec size (url) is about 45
// . 45 + 33 bytes overhead in tree is 78
int32_t maxTreeNodes = maxMem / 78;
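// e.g. with maxMem = 200MB that is roughly 200000000 / 78 ~= 2.5 million
// tree nodes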
// . really we just cache the first 64k of each priority list
// . used only by SpiderLoop
//int32_t maxCacheNodes = 32;
// we use the same disk page size as indexdb (for rdbmap.cpp)
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// disk page cache mem, 100MB on gk0 now
//int32_t pcmem = 20000000;//g_conf.m_spiderdbMaxDiskPageCacheMem;
// keep this low if we are the tmp cluster
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// turn off to prevent blocking up cpu
//pcmem = 0;
// key parser checks
//int32_t ip = 0x1234;
char priority = 12;
int32_t spiderTime = 0x3fe96610;
int64_t urlHash48 = 0x1234567887654321LL & 0x0000ffffffffffffLL;
//int64_t pdocid = 0x567834222LL;
//key192_t k = makeOrderKey ( ip,priority,spiderTime,urlHash48,pdocid);
//if (getOrderKeyUrlHash48 (&k)!=urlHash48 ){char*xx=NULL;*xx=0;}
//if (getOrderKeySpiderTime (&k)!=spiderTime){char*xx=NULL;*xx=0;}
//if (getOrderKeyPriority (&k)!=priority ){char*xx=NULL;*xx=0;}
//if (getOrderKeyIp (&k)!=ip ){char*xx=NULL;*xx=0;}
//if (getOrderKeyParentDocId(&k)!=pdocid ){char*xx=NULL;*xx=0;}
// doledb key test
key_t dk = g_doledb.makeKey(priority,spiderTime,urlHash48,false);
if(g_doledb.getPriority(&dk)!=priority){char*xx=NULL;*xx=0;}
if(g_doledb.getSpiderTime(&dk)!=spiderTime){char*xx=NULL;*xx=0;}
if(g_doledb.getUrlHash48(&dk)!=urlHash48){char*xx=NULL;*xx=0;}
if(g_doledb.getIsDel(&dk)!= 0){char*xx=NULL;*xx=0;}
// spiderdb key test
int64_t docId = 123456789;
int32_t firstIp = 0x23991688;
key128_t sk = g_spiderdb.makeKey ( firstIp,urlHash48,1,docId,false);
if ( ! g_spiderdb.isSpiderRequest (&sk) ) { char *xx=NULL;*xx=0; }
if ( g_spiderdb.getUrlHash48(&sk) != urlHash48){char *xx=NULL;*xx=0;}
if ( g_spiderdb.getFirstIp(&sk) != firstIp) {char *xx=NULL;*xx=0;}
testWinnerTreeKey();
// we now use a page cache
// if ( ! m_pc.init ( "spiderdb",
// RDB_SPIDERDB ,
// pcmem ,
// pageSize ))
// return log(LOG_INIT,"spiderdb: Init failed.");
// initialize our own internal rdb
return m_rdb.init ( g_hostdb.m_dir ,
"spiderdb" ,
true , // dedup
-1 , // fixedDataSize
// now that we have MAX_WINNER_NODES allowed in doledb
// we don't have to keep spiderdb so tightly merged i guess..
// MDW: it seems to slow performance when not tightly merged
// so put this back to "2"...
2,//g_conf.m_spiderdbMinFilesToMerge , mintomerge
maxMem,//g_conf.m_spiderdbMaxTreeMem ,
maxTreeNodes ,
true , // balance tree?
0,//g_conf.m_spiderdbMaxCacheMem,
0,//maxCacheNodes ,
false , // half keys?
false , // save cache?
NULL,//&m_pc ,
false ,
false ,
sizeof(key128_t) );
}
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool Spiderdb::init2 ( int32_t treeMem ) {
// . what's max # of tree nodes?
// . assume avg spider rec size (url) is about 45
// . 45 + 33 bytes overhead in tree is 78
int32_t maxTreeNodes = treeMem / 78;
// initialize our own internal rdb
return m_rdb.init ( g_hostdb.m_dir ,
"spiderdbRebuild" ,
true , // dedup
-1 , // fixedDataSize
200 , // g_conf.m_spiderdbMinFilesToMerge
treeMem , // g_conf.m_spiderdbMaxTreeMem ,
maxTreeNodes ,
true , // balance tree?
0 , // m_spiderdbMaxCacheMem,
0 , // maxCacheNodes ,
false , // half keys?
false , // save cache?
NULL , // &m_pc
false , // isTitledb?
false , // preload diskpagecache
sizeof(key128_t));
}
/*
bool Spiderdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;
// if not allowing scale, return false
if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Spiderdb::verify ( char *coll ) {
//return true;
log ( LOG_DEBUG, "db: Verifying Spiderdb for coll %s...", coll );
g_threads.disableThreads();
Msg5 msg5;
Msg5 msg5b;
RdbList list;
key128_t startKey;
key128_t endKey;
startKey.setMin();
endKey.setMax();
//int32_t minRecSizes = 64000;
CollectionRec *cr = g_collectiondb.getRec(coll);
if ( ! msg5.getList ( RDB_SPIDERDB ,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
64000 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
&msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
}
int32_t count = 0;
int32_t got = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
char *k = list.getCurrentRec();
//key_t k = list.getCurrentKey();
count++;
// what group's spiderdb should hold this rec
//uint32_t groupId = g_hostdb.getGroupId ( RDB_SPIDERDB , k );
//if ( groupId == g_hostdb.m_groupId ) got++;
int32_t shardNum = g_hostdb.getShardNum(RDB_SPIDERDB,k);
if ( shardNum == g_hostdb.getMyShardNum() ) got++;
}
if ( got != count ) {
// tally it up
g_rebalance.m_numForeignRecs += count - got;
log ("db: Out of first %"INT32" records in spiderdb, "
"only %"INT32" belong to our shard.",count,got);
// exit if NONE, we probably got the wrong data
if ( got == 0 ) log("db: Are you sure you have the "
"right "
"data in the right directory? "
"Exiting.");
log ( "db: Exiting due to Spiderdb inconsistency." );
g_threads.enableThreads();
return g_conf.m_bypassValidation;
}
log (LOG_DEBUG,"db: Spiderdb passed verification successfully for %"INT32" "
"recs.", count );
// DONE
g_threads.enableThreads();
return true;
}
key128_t Spiderdb::makeKey ( int32_t firstIp ,
int64_t urlHash48 ,
bool isRequest ,
// MDW: now we use timestamp instead of parentdocid
// for spider replies. so they do not dedup...
int64_t parentDocId ,
bool isDel ) {
key128_t k;
k.n1 = (uint32_t)firstIp;
// push ip to top 32 bits
k.n1 <<= 32;
// . top 32 bits of url hash are in the lower 32 bits of k.n1
// . often the urlhash48 has top bits set that shouldn't be so mask
// it to 48 bits
k.n1 |= (urlHash48 >> 16) & 0xffffffff;
// remaining 16 bits
k.n0 = urlHash48 & 0xffff;
// room for isRequest
k.n0 <<= 1;
if ( isRequest ) k.n0 |= 0x01;
// parent docid
k.n0 <<= 38;
// if we are making a spider reply key just leave the parentdocid as 0
// so we only store one reply per url. the last reply we got.
// if ( isRequest ) k.n0 |= parentDocId & DOCID_MASK;
k.n0 |= parentDocId & DOCID_MASK;
// reserved (padding)
k.n0 <<= 8;
// del bit
k.n0 <<= 1;
if ( ! isDel ) k.n0 |= 0x01;
return k;
}
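// Bit layout of the 128-bit spiderdb key as assembled above (a sketch
// derived from the shifts in makeKey, not separate documentation):
//   n1: [ 32 bits firstIp ][ 32 bits urlHash48 bits 47..16 ]
//   n0: [ 16 bits urlHash48 bits 15..0 ][ 1 bit isRequest ]
//       [ 38 bits parentDocId ][ 8 bits reserved ][ 1 bit positive flag ]
// the low bit is SET for positive (non-delete) records, which is why
// print() treats (m_key.n0 & 0x01) == 0 as a [DELETE].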
/////////////////////////
///////////////////////// DOLEDB
/////////////////////////
// reset rdb
void Doledb::reset() { m_rdb.reset(); }
bool Doledb::init ( ) {
// . what's max # of tree nodes?
// . assume avg spider rec size (url) is about 45
// . 45 + 33 bytes overhead in tree is 78
// . use 150MB for the tree
int32_t maxTreeMem = 150000000; // 150MB
int32_t maxTreeNodes = maxTreeMem / 78;
// we use the same disk page size as indexdb (for rdbmap.cpp)
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// disk page cache mem, hard code to 5MB
// int32_t pcmem = 5000000; // g_conf.m_spiderdbMaxDiskPageCacheMem;
// keep this low if we are the tmp cluster
// if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// we no longer dump doledb to disk, Rdb::dumpTree() does not allow it
// pcmem = 0;
// we now use a page cache
// if ( ! m_pc.init ( "doledb" ,
// RDB_DOLEDB ,
// pcmem ,
// pageSize ))
// return log(LOG_INIT,"doledb: Init failed.");
// initialize our own internal rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"doledb" ,
true , // dedup
-1 , // fixedDataSize
2 , // MinFilesToMerge
maxTreeMem ,
maxTreeNodes ,
true , // balance tree?
0 , // spiderdbMaxCacheMe
0 , // maxCacheNodes
false , // half keys?
false , // save cache?
NULL))//&m_pc ))
return false;
return true;
}
/*
bool Doledb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
//if ( ! doVerify ) return true;
// verify
//if ( verify(coll) ) return true;
// if not allowing scale, return false
//if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
//log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
/////////////////////////
///////////////////////// SpiderCache
/////////////////////////
// . reload everything this many seconds
// . this was originally done as a lazy compensation for a bug but
// now i do not add too many of the same domain if the same domain wait
// is ample and we know we'll be refreshed in X seconds anyway
//#define DEFAULT_SPIDER_RELOAD_RATE (3*60*60)
// . size of spiderecs to load in one call to readList
// . i increased it to 1MB to speed everything up, seems like cache is
// getting loaded up way too slow
#define SR_READ_SIZE (512*1024)
// for caching in s_ufnTree
//#define MAX_NODES (30)
// a global class extern'd in .h file
SpiderCache g_spiderCache;
SpiderCache::SpiderCache ( ) {
//m_numSpiderColls = 0;
//m_isSaving = false;
}
// returns false and set g_errno on error
bool SpiderCache::init ( ) {
//for ( int32_t i = 0 ; i < MAX_COLL_RECS ; i++ )
// m_spiderColls[i] = NULL;
// success
return true;
}
/*
static void doneSavingWrapper ( void *state ) {
SpiderCache *THIS = (SpiderCache *)state;
log("spcache: done saving something");
//THIS->doneSaving();
// . call the callback if any
// . this let's PageMaster.cpp know when we're closed
//if (THIS->m_closeCallback) THIS->m_closeCallback(THIS->m_closeState);
}
void SpiderCache::doneSaving ( ) {
// bail if g_errno was set
if ( g_errno ) {
log("spider: Had error saving waitingtree.dat or doleiptable: "
"%s.",
mstrerror(g_errno));
g_errno = 0;
}
else {
// display any error, if any, otherwise prints "Success"
logf(LOG_INFO,"db: Successfully saved waitingtree and "
"doleiptable");
}
// if still more need to save, not done yet
if ( needsSave ( ) ) return;
// ok, call callback that initiaed the save
if ( m_callback ) m_callback ( THIS->m_state );
// ok, we are done!
//m_isSaving = false;
}
*/
// return false if any tree save blocked
void SpiderCache::save ( bool useThread ) {
// bail if already saving
//if ( m_isSaving ) return true;
// assume saving
//m_isSaving = true;
// loop over all SpiderColls and get the best
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
SpiderColl *sc = getSpiderCollIffNonNull(i);//m_spiderColls[i];
if ( ! sc ) continue;
RdbTree *tree = &sc->m_waitingTree;
if ( ! tree->m_needsSave ) continue;
// if already saving from a thread
if ( tree->m_isSaving ) continue;
char *filename = "waitingtree";
char dir[1024];
sprintf(dir,"%scoll.%s.%"INT32"",g_hostdb.m_dir,
sc->m_coll,(int32_t)sc->m_collnum);
// log it for now
log("spider: saving waiting tree for cn=%"INT32"",(int32_t)i);
// returns false if it blocked, callback will be called
tree->fastSave ( dir, // g_hostdb.m_dir ,
filename ,
useThread ,
NULL,//this ,
NULL);//doneSavingWrapper );
// also the doleIpTable
/*
filename = "doleiptable.dat";
sc->m_doleIpTable.fastSave(useThread,
dir,
filename,
NULL,
0,
NULL,//this,
NULL);//doneSavingWrapper );
*/
// . crap, this is made at startup from waitingtree!
/*
// waiting table
filename = "waitingtable.dat";
if ( sc->m_waitingTable.m_needsSave )
logf(LOG_INFO,"db: Saving %s/%s",dir,
filename);
sc->m_waitingTable.fastSave(useThread,
dir,
filename,
NULL,
0,
NULL,//this,
NULL );//doneSavingWrapper );
*/
}
// if still needs save, not done yet, return false to indicate blocked
//if ( blocked ) return false;
// all done
//m_isSaving = false;
// did not block
//return true;
}
bool SpiderCache::needsSave ( ) {
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
SpiderColl *sc = getSpiderCollIffNonNull(i);//m_spiderColls[i];
if ( ! sc ) continue;
if ( sc->m_waitingTree.m_needsSave ) return true;
// also the doleIpTable
//if ( sc->m_doleIpTable.m_needsSave ) return true;
}
return false;
}
void SpiderCache::reset ( ) {
log(LOG_DEBUG,"spider: resetting spidercache");
// loop over all SpiderColls and get the best
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
SpiderColl *sc = getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->reset();
mdelete ( sc , sizeof(SpiderColl) , "SpiderCache" );
delete ( sc );
//m_spiderColls[i] = NULL;
CollectionRec *cr = g_collectiondb.getRec(i);
cr->m_spiderColl = NULL;
}
//m_numSpiderColls = 0;
}
SpiderColl *SpiderCache::getSpiderCollIffNonNull ( collnum_t collnum ) {
// "coll" must be invalid
if ( collnum < 0 ) return NULL;
if ( collnum >= g_collectiondb.m_numRecs ) return NULL;
// shortcut
CollectionRec *cr = g_collectiondb.m_recs[collnum];
// empty?
if ( ! cr ) return NULL;
// return it if non-NULL
return cr->m_spiderColl;
}
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) {
// if not being deleted return false
if ( ! sc->m_deleteMyself ) return false;
// otherwise always return true
if ( sc->m_msg5b.m_waitingForList ) {
log("spider: deleting sc=0x%"PTRFMT" for collnum=%"INT32" "
"waiting1",
(PTRTYPE)sc,(int32_t)sc->m_collnum);
return true;
}
// if ( sc->m_msg1.m_mcast.m_inUse ) {
// log("spider: deleting sc=0x%"PTRFMT" for collnum=%"INT32" "
// "waiting2",
// (PTRTYPE)sc,(int32_t)sc->m_collnum);
// return true;
// }
if ( sc->m_isLoading ) {
log("spider: deleting sc=0x%"PTRFMT" for collnum=%"INT32" "
"waiting3",
(PTRTYPE)sc,(int32_t)sc->m_collnum);
return true;
}
// this means msg5 is out
if ( sc->m_msg5.m_waitingForList ) {
log("spider: deleting sc=0x%"PTRFMT" for collnum=%"INT32" "
"waiting4",
(PTRTYPE)sc,(int32_t)sc->m_collnum);
return true;
}
// if ( sc->m_gettingList1 ) {
// log("spider: deleting sc=0x%"PTRFMT" for collnum=%"INT32"
//"waiting5",
// (int32_t)sc,(int32_t)sc->m_collnum);
// return true;
// }
// if ( sc->m_gettingList2 ) {
// log("spider: deleting sc=0x%"PTRFMT" for collnum=%"INT32"
//"waiting6",
// (int32_t)sc,(int32_t)sc->m_collnum);
// return true;
// }
// there's still a core of someone trying to write to something
// in "sc" so we have to try to fix that. somewhere in xmldoc.cpp
// or spider.cpp. everyone should get sc from cr every time i'd think
log("spider: deleting sc=0x%"PTRFMT" for collnum=%"INT32" (msg=%s)",
(PTRTYPE)sc,(int32_t)sc->m_collnum,msg);
// . make sure nobody has it
// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
// have nuked it
//CollectionRec *cr = sc->m_cr;
// use fake ptrs for easier debugging
//if ( cr ) cr->m_spiderColl = (SpiderColl *)0x987654;//NULL;
mdelete ( sc , sizeof(SpiderColl),"postdel1");
delete ( sc );
return true;
}
// . get SpiderColl for a collection
// . if it is NULL for that collection then make a new one
SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
// "coll" must be invalid
if ( collnum < 0 ) return NULL;
// return it if non-NULL
//if ( m_spiderColls [ collnum ] ) return m_spiderColls [ collnum ];
// if spidering disabled, do not bother creating this!
//if ( ! g_conf.m_spideringEnabled ) return NULL;
// shortcut
CollectionRec *cr = g_collectiondb.m_recs[collnum];
// collection might have been reset in which case collnum changes
if ( ! cr ) return NULL;
// return it if non-NULL
SpiderColl *sc = cr->m_spiderColl;
if ( sc ) return sc;
// if spidering disabled, do not bother creating this!
//if ( ! cr->m_spideringEnabled ) return NULL;
// cast it
//SpiderColl *sc;
// make it
try { sc = new(SpiderColl); }
catch ( ... ) {
log("spider: failed to make SpiderColl for collnum=%"INT32"",
(int32_t)collnum);
return NULL;
}
// register it
mnew ( sc , sizeof(SpiderColl), "spcoll" );
// store it
//m_spiderColls [ collnum ] = sc;
cr->m_spiderColl = sc;
// note it
logf(LOG_DEBUG,"spider: made spidercoll=%"PTRFMT" for cr=%"PTRFMT"",
(PTRTYPE)sc,(PTRTYPE)cr);
// update this
//if ( m_numSpiderColls < collnum + 1 )
// m_numSpiderColls = collnum + 1;
// set this
sc->m_collnum = collnum;
// save this
strcpy ( sc->m_coll , cr->m_coll );
// set this
if ( ! strcmp ( cr->m_coll,"qatest123" ) ) sc->m_isTestColl = true;
else sc->m_isTestColl = false;
// set this
sc->setCollectionRec ( cr ); // sc->m_cr = cr;
// set first doledb scan key
sc->m_nextDoledbKey.setMin();
// turn off quickpolling while loading in case a parm update comes in
bool saved = g_conf.m_useQuickpoll;
g_conf.m_useQuickpoll = false;
// mark it as loading so it can't be deleted while loading
sc->m_isLoading = true;
// . load its tables from disk
// . crap i think this might call quickpoll and we get a parm
// update to delete this spider coll!
sc->load();
// done loading now
sc->m_isLoading = false;
// restore
g_conf.m_useQuickpoll = saved;
// did crawlbottesting delete it right away?
if ( tryToDeleteSpiderColl( sc ,"1") ) return NULL;
// sanity check
//if ( ! cr ) { char *xx=NULL;*xx=0; }
// deleted right away?
//if ( sc->getCollectionRec() == NULL ) { char *xx=NULL;*xx=0; }
// note it!
log(LOG_DEBUG,"spider: adding new spider collection for %s",
cr->m_coll);
// that was it
return sc;
}
/////////////////////////
///////////////////////// SpiderColl
/////////////////////////
void SpiderColl::setCollectionRec ( CollectionRec *cr ) {
m_cr = cr;
// this was useful for debugging a null m_cr bug
//log("sc: sc 0x%"PTRFMT" setting cr to 0x%"PTRFMT""
//,(int32_t)this,(int32_t)cr);
}
CollectionRec *SpiderColl::getCollectionRec ( ) {
//log("sc: sc 0x%"PTRFMT" getting cr of 0x%"PTRFMT""
//,(int32_t)this,(int32_t)m_cr);
return m_cr;
}
SpiderColl::SpiderColl () {
m_overflowList = NULL;
m_lastOverflowFirstIp = 0;
m_lastPrinted = 0;
m_deleteMyself = false;
m_isLoading = false;
m_gettingList1 = false;
m_gettingList2 = false;
m_lastScanTime = 0;
m_isPopulatingDoledb = false;
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
m_siteListIsEmptyValid = false;
m_cr = NULL;
//m_lastSpiderAttempt = 0;
//m_lastSpiderCouldLaunch = 0;
//m_numRoundsDone = 0;
//m_lastDoledbReadEmpty = false; // over all priorities in this coll
// re-set this to min and set m_needsWaitingTreeRebuild to true
// when the admin updates the url filters page
m_waitingTreeNeedsRebuild = false;
m_nextKey2.setMin();
m_endKey2.setMax();
m_spidersOut = 0;
m_coll[0] = '\0';// = NULL;
reset();
// reset this
memset ( m_outstandingSpiders , 0 , 4 * MAX_SPIDER_PRIORITIES );
// start off sending all colls local crawl info to all hosts to
// be sure we are in sync
memset ( m_sendLocalCrawlInfoToHost , 1 , MAX_HOSTS );
}
int32_t SpiderColl::getTotalOutstandingSpiders ( ) {
int32_t sum = 0;
for ( int32_t i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ )
sum += m_outstandingSpiders[i];
return sum;
}
void nukeDoledb ( collnum_t collnum ) ;
// load the tables that we set when m_doInitialScan is true
bool SpiderColl::load ( ) {
// error?
int32_t err = 0;
// make the dir
char *coll = g_collectiondb.getColl(m_collnum);
// sanity check
if ( ! coll || coll[0]=='\0' ) {
log("spider: bad collnum of %"INT32"",(int32_t)m_collnum);
g_errno = ENOCOLLREC;
return false;
//char *xx=NULL;*xx=0; }
}
// reset this once
//m_msg1Avail = true;
m_isPopulatingDoledb = false;
// keep it kinda low if we got a ton of collections
int32_t maxMem = 15000;
int32_t maxNodes = 500;
if ( g_collectiondb.m_numRecsUsed > 500 ) {
maxNodes = 100;
maxMem = maxNodes * 20;
}
if ( ! m_lastDownloadCache.init ( maxMem , // maxcachemem,
8 , // fixed data size (MS)
false , // support lists?
maxNodes , // max nodes
false , // use half keys?
"downcache", // dbname
false , // load from disk?
12 , // key size (firstip)
12 , // data key size?
-1 ))// numPtrsMax
return log("spider: dcache init failed");
// this has a quickpoll in it, so that quickpoll processes
// a restart request from crawlbottesting for this collnum which
// calls Collectiondb::resetColl2() which calls deleteSpiderColl()
// on THIS spidercoll, but our m_isLoading flag is set
if (!m_sniTable.set ( 4,8,0,NULL,0,false,MAX_NICENESS,"snitbl") )
return false;
if (!m_cdTable.set (4,4,0,NULL,0,false,MAX_NICENESS,"cdtbl"))
return false;
// doledb seems to have like 32000 entries in it
int32_t numSlots = 0; // was 128000
if(!m_doleIpTable.set(4,4,numSlots,NULL,0,false,MAX_NICENESS,"doleip"))
return false;
// this should grow dynamically...
if (!m_waitingTable.set (4,8,16,NULL,0,false,MAX_NICENESS,"waittbl"))
return false;
// . a tree of keys, key is earliestSpiderTime|ip (key=12 bytes)
// . earliestSpiderTime is 0 if unknown
// . max nodes is 1M but we should grow dynamically! TODO
// . let's up this to 5M because we are hitting the limit in some
// test runs...
// . try going to 20M now since we hit it again...
// . start off at just 10 nodes since we grow dynamically now
if (!m_waitingTree.set(0,10,true,-1,true,"waittree2",
false,"waitingtree",sizeof(key_t)))return false;
m_waitingTreeKeyValid = false;
m_scanningIp = 0;
// prevent core with this
//m_waitingTree.m_rdbId = RDB_NONE;
// make dir
char dir[500];
sprintf(dir,"%scoll.%s.%"INT32"",g_hostdb.m_dir,coll,(int32_t)m_collnum);
// load up all the tables
if ( ! m_cdTable .load(dir,"crawldelay.dat" ) ) err = g_errno;
if ( ! m_sniTable.load(dir,"siteinlinks.dat" ) ) err = g_errno;
// and its doledb data
//if ( ! initializeDoleTables( ) ) err = g_errno;
// our table that has how many of each firstIP are in doledb
//if ( ! m_doleIpTable.load(dir,"doleiptable.dat") ) err = g_errno;
// load in the waiting tree, IPs waiting to get into doledb
BigFile file;
file.set ( dir , "waitingtree-saved.dat" , NULL );
bool treeExists = file.doesExist() > 0;
// load the table with file named "THISDIR/saved"
if ( treeExists && ! m_waitingTree.fastLoad(&file,&m_waitingMem) )
err = g_errno;
// init wait table. scan wait tree and add the ips into table.
if ( ! makeWaitingTable() ) err = g_errno;
// save it
g_errno = err;
// return false on error
if ( g_errno )
// note it
return log("spider: had error loading initial table: %s",
mstrerror(g_errno));
// if we had doledb0001.dat files on disk then nuke doledb
// and waiting tree and rebuild now with a tree-only doledb.
RdbBase *base = getRdbBase ( RDB_DOLEDB , m_collnum );
// . do this now just to keep everything somewhat in sync
// . we lost dmoz.org and could not get it back in because it was
// in the doleip table but NOT in doledb!!!
if ( ! makeDoleIPTable() ) return false;
// delete the files and doledb-saved.dat and the waitingtree
// and set the waitingtree into rebuild mode.
if ( base && base->m_numFiles > 0 ) {
nukeDoledb ( m_collnum );
return true;
}
// otherwise true
return true;
}
// . scan all spiderRequests in doledb at startup and add them to our tables
// . then, when we scan spiderdb and add to orderTree/urlhashtable it will
// see that the request is in doledb and set m_doled...
// . initialize the dole table for that then
// quickly scan doledb and add the doledb records to our trees and
// tables. that way if we receive a SpiderReply() then addSpiderReply()
// will be able to find the associated SpiderRequest.
// MAKE SURE to put each spiderrequest into m_doleTable... and into
// maybe m_urlHashTable too???
// this should block since we are at startup...
bool SpiderColl::makeDoleIPTable ( ) {
log(LOG_DEBUG,"spider: making dole ip table for %s",m_coll);
key_t startKey ; startKey.setMin();
key_t endKey ; endKey.setMax();
key_t lastKey ; lastKey.setMin();
// turn off threads for this so it blocks
bool enabled = g_threads.areThreadsEnabled();
// turn off regardless
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
Msg5 msg5b;
RdbList list;
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_DOLEDB ,
m_collnum ,
&list ,
startKey ,
endKey ,
minRecSizes ,
true , // includeTree?
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0,//MAX_NICENESS , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b )){
log(LOG_LOGIC,"spider: getList did not block.");
return false;
}
// shortcut
int32_t minSize=(int32_t)(sizeof(SpiderRequest)+sizeof(key_t)+4-MAX_URL_LEN);
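// sanity note (derived from the cast below): a doledb rec here is
// [ doledb key_t (12) ][ dataSize (4) ][ full SpiderRequest record ],
// and since only the used portion of the url buffer is stored, any rec
// that is not bigger than minSize must be corrupt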
// all done if empty
if ( list.isEmpty() ) goto done;
// loop over entries in list
for (list.resetListPtr();!list.isExhausted();list.skipCurrentRecord()){
// get rec
char *rec = list.getCurrentRec();
// get key
key_t k = list.getCurrentKey();
// skip deletes -- how did this happen?
if ( (k.n0 & 0x01) == 0) continue;
// check this out
int32_t recSize = list.getCurrentRecSize();
// zero?
if ( recSize <= 0 ) { char *xx=NULL;*xx=0; }
// 16 is bad too... wtf is this?
if ( recSize <= 16 ) continue;
// crazy?
if ( recSize<=minSize) {char *xx=NULL;*xx=0;}
// . doledb key is 12 bytes, followed by a 4 byte datasize
// . so skip that key and dataSize to point to spider request
SpiderRequest *sreq = (SpiderRequest *)(rec+sizeof(key_t)+4);
// add to dole tables
if ( ! addToDoleTable ( sreq ) )
// return false with g_errno set on error
return false;
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey >= *(key_t *)list.getLastKey() ) goto loop;
done:
log(LOG_DEBUG,"spider: making dole ip table done.");
// re-enable threads
if ( enabled ) g_threads.enableThreads();
// we wrapped, all done
return true;
}
key_t makeWaitingTreeKey ( uint64_t spiderTimeMS , int32_t firstIp ) {
// sanity
if ( ((int64_t)spiderTimeMS) < 0 ) { char *xx=NULL;*xx=0; }
// make the wait tree key
key_t wk;
wk.n1 = (spiderTimeMS>>32);
wk.n0 = (spiderTimeMS&0xffffffff);
wk.n0 <<= 32;
wk.n0 |= (uint32_t)firstIp;
// sanity
if ( wk.n1 & 0x8000000000000000LL ) { char *xx=NULL;*xx=0; }
return wk;
}
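// resulting 96-bit waiting tree key layout (derived from the assembly
// above): n1 holds the top 32 bits of spiderTimeMS, the upper half of n0
// holds the low 32 bits of spiderTimeMS, and the lower half of n0 holds
// firstIp, so the tree sorts by spider time first, then by IP.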
CollectionRec *SpiderColl::getCollRec() {
CollectionRec *cr = g_collectiondb.m_recs[m_collnum];
if ( ! cr ) log("spider: lost coll rec");
return cr;
}
char *SpiderColl::getCollName() {
CollectionRec *cr = getCollRec();
if ( ! cr ) return "lostcollection";
return cr->m_coll;
}
//
// remove all recs from doledb for the given collection
//
static void nukeDoledbWrapper ( int fd , void *state ) {
g_loop.unregisterSleepCallback ( state , nukeDoledbWrapper );
collnum_t collnum = *(collnum_t *)state;
nukeDoledb ( collnum );
}
void nukeDoledb ( collnum_t collnum ) {
//g_spiderLoop.m_winnerListCache.verify();
// in case we changed url filters for this collection #
g_spiderLoop.m_winnerListCache.clear ( collnum );
//g_spiderLoop.m_winnerListCache.verify();
//WaitEntry *we = (WaitEntry *)state;
//if ( we->m_registered )
// g_loop.unregisterSleepCallback ( we , doDoledbNuke );
// . nuke doledb for this collnum
// . it will unlink the files and maps for doledb for this collnum
// . it will remove all recs of this collnum from its tree too
if ( g_doledb.getRdb()->isSavingTree () ) {
g_loop.registerSleepCallback(100,&collnum,nukeDoledbWrapper);
//we->m_registered = true;
return;
}
// . ok, tree is not saving, it should complete entirely from this call
g_doledb.getRdb()->deleteAllRecs ( collnum );
// re-add it back so the RdbBase is new'd
//g_doledb.getRdb()->addColl2 ( we->m_collnum );
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( collnum );
if ( sc ) {
sc->m_lastUrlFiltersUpdate = getTimeLocal();//GlobalNoCore();
// . make sure to nuke m_doleIpTable as well
sc->m_doleIpTable.clear();
// need to recompute this!
//sc->m_ufnMapValid = false;
// reset this cache
//clearUfnTable();
// log it
log("spider: rebuilding %s from doledb nuke",
sc->getCollName());
// activate a scan if not already activated
sc->m_waitingTreeNeedsRebuild = true;
// if a scan is ongoing, this will re-set it
sc->m_nextKey2.setMin();
// clear it?
sc->m_waitingTree.clear();
sc->m_waitingTable.clear();
// kick off the spiderdb scan to repopulate waiting tree and doledb
sc->populateWaitingTreeFromSpiderdb(false);
}
// nuke this state
//mfree ( we , sizeof(WaitEntry) , "waitet" );
// note it
log("spider: finished nuking doledb for coll (%"INT32")",
(int32_t)collnum);
}
// . call this when changing the url filters
// . will make all entries in waiting tree have zero time basically
// . and makes us repopulate doledb from these waiting tree entries
/*
void SpiderColl::urlFiltersChanged ( ) {
// log it
log("spider: rebuilding doledb/waitingtree for coll=%s",getCollName());
WaitEntry *we = (WaitEntry *)mmalloc ( sizeof(WaitEntry) , "waite2" );
if ( ! we ) {
log("spider: wait entry alloc: %s",mstrerror(g_errno));
g_errno = 0;
return;
}
// prepare our state in case the purge operation would block
we->m_registered = false;
we->m_cr = m_cr;
we->m_collnum = m_cr->m_collnum;
//we->m_callback = doDoledbNuke2;
//we->m_state = NULL;
// remove all recs from doledb for the given collection
doDoledbNuke ( 0 , we );
}
*/
// this one has to scan all of spiderdb
bool SpiderColl::makeWaitingTree ( ) {
log(LOG_DEBUG,"spider: making waiting tree for %s",m_coll);
key128_t startKey ; startKey.setMin();
key128_t endKey ; endKey.setMax();
key128_t lastKey ; lastKey.setMin();
// turn off threads for this so it blocks
bool enabled = g_threads.areThreadsEnabled();
// turn off regardless
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
Msg5 msg5b;
RdbList list;
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_SPIDERDB ,
m_collnum ,
&list ,
&startKey ,
&endKey ,
minRecSizes ,
true , // includeTree?
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
MAX_NICENESS , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b )){
log(LOG_LOGIC,"spider: getList did not block.");
return false;
}
// all done if empty
if ( list.isEmpty() ) goto done;
// loop over entries in list
for (list.resetListPtr();!list.isExhausted();list.skipCurrentRecord()){
// get rec
char *rec = list.getCurrentRec();
// get key
key128_t k; list.getCurrentKey(&k);
// skip deletes -- how did this happen?
if ( (k.n0 & 0x01) == 0) continue;
// check this out
int32_t recSize = list.getCurrentRecSize();
// zero?
if ( recSize <= 0 ) { char *xx=NULL;*xx=0; }
// 16 is bad too... wtf is this?
if ( recSize <= 16 ) continue;
// skip replies
if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) continue;
// get request
SpiderRequest *sreq = (SpiderRequest *)rec;
// skip if not assigned to us
if ( ! isAssignedToUs ( sreq->m_firstIp ) ) continue;
// get first ip
int32_t firstIp = sreq->m_firstIp;
// skip if in dole ip table
if ( m_doleIpTable.isInTable ( &firstIp ) ) continue;
// make the key. use 1 for spiderTimeMS. this tells the
// spider loop that it is temporary and should be updated
key_t wk = makeWaitingTreeKey ( 1 , firstIp );
// ok, add to waiting tree
int32_t wn = m_waitingTree.addKey ( &wk );
if ( wn < 0 ) {
log("spider: makeWaitTree: %s",mstrerror(g_errno));
return false;
}
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: added time=1 ip=%s to waiting "
"tree (node#=%"INT32")", iptoa(firstIp),wn);
// a tmp var
int64_t fakeone = 1LL;
// add to table now since its in the tree
if ( ! m_waitingTable.addKey ( &firstIp , &fakeone ) ) {
log("spider: makeWaitTree2: %s",mstrerror(g_errno));
m_waitingTree.deleteNode3 ( wn , true );
//log("sper: 6 del node %"INT32" for %s",wn,iptoa(firstIp));
return false;
}
}
startKey = *(key128_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey >= *(key128_t *)list.getLastKey() ) goto loop;
done:
log(LOG_DEBUG,"spider: making waiting tree done.");
// re-enable threads
if ( enabled ) g_threads.enableThreads();
// we wrapped, all done
return true;
}
// for debugging query reindex i guess
int64_t SpiderColl::getEarliestSpiderTimeFromWaitingTree ( int32_t firstIp ) {
// make the key. use 0 as the time...
key_t wk = makeWaitingTreeKey ( 0, firstIp );
// set node from wait tree key. this way we can resume from a prev key
int32_t node = m_waitingTree.getNextNode ( 0, (char *)&wk );
// if empty, stop
if ( node < 0 ) return -1;
// breathe
QUICKPOLL(MAX_NICENESS);
// get the key
key_t *k = (key_t *)m_waitingTree.getKey ( node );
// ok, we got one
int32_t storedFirstIp = (k->n0) & 0xffffffff;
// match? we call this with a firstIp of 0 below to indicate
// any IP, we just want to get the next spider time.
if ( firstIp != 0 && storedFirstIp != firstIp ) return -1;
// get the time
uint64_t spiderTimeMS = k->n1;
// shift up
spiderTimeMS <<= 32;
// or in
spiderTimeMS |= (k->n0 >> 32);
// return the time in milliseconds
return spiderTimeMS;
}
bool SpiderColl::makeWaitingTable ( ) {
log(LOG_DEBUG,"spider: making waiting table for %s.",m_coll);
int32_t node = m_waitingTree.getFirstNode();
for ( ; node >= 0 ; node = m_waitingTree.getNextNode(node) ) {
// breathe
QUICKPOLL(MAX_NICENESS);
// get key
key_t *key = (key_t *)m_waitingTree.getKey(node);
// get ip from that
int32_t ip = (key->n0) & 0xffffffff;
// spider time is up top
uint64_t spiderTimeMS = (key->n1);
spiderTimeMS <<= 32;
spiderTimeMS |= ((key->n0) >> 32);
// store in waiting table
if ( ! m_waitingTable.addKey(&ip,&spiderTimeMS) ) return false;
}
log(LOG_DEBUG,"spider: making waiting table done.");
return true;
}
SpiderColl::~SpiderColl () {
reset();
}
// we call this now instead of reset when Collectiondb::resetColl() is used
void SpiderColl::clearLocks ( ) {
// remove locks from locktable for all spiders out i guess
HashTableX *ht = &g_spiderLoop.m_lockTable;
top:
// scan the slots
int32_t ns = ht->m_numSlots;
for ( int32_t i = 0 ; i < ns ; i++ ) {
// skip if empty
if ( ! ht->m_flags[i] ) continue;
// cast lock
UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
// skip if not our collnum
if ( lock->m_collnum != m_collnum ) continue;
// nuke it!
ht->removeSlot(i);
// restart since cells may have shifted
goto top;
}
/*
// reset these for SpiderLoop;
m_nextDoledbKey.setMin();
m_didRound = false;
// set this to -1 here, when we enter spiderDoledUrls() it will
// see that its -1 and set the m_msg5StartKey
m_pri2 = -1; // MAX_SPIDER_PRIORITIES - 1;
m_twinDied = false;
m_lastUrlFiltersUpdate = 0;
char *coll = "unknown";
if ( m_coll[0] ) coll = m_coll;
logf(LOG_DEBUG,"spider: CLEARING spider cache coll=%s",coll);
m_ufnMapValid = false;
m_doleIpTable .clear();
m_cdTable .clear();
m_sniTable .clear();
m_waitingTable.clear();
m_waitingTree .clear();
m_waitingMem .clear();
//m_lastDownloadCache.clear ( m_collnum );
// copied from reset() below
for ( int32_t i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) {
m_nextKeys[i] = g_doledb.makeFirstKey2 ( i );
m_isDoledbEmpty[i] = 0;
}
// assume the whole thing is not empty
m_allDoledbPrioritiesEmpty = 0;//false;
m_lastEmptyCheck = 0;
*/
}
void SpiderColl::reset ( ) {
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// m_numSuccessReplies = 0;
// m_numFailedReplies = 0;
// reset these for SpiderLoop;
m_nextDoledbKey.setMin();
//m_didRound = false;
// set this to -1 here, when we enter spiderDoledUrls() it will
// see that its -1 and set the m_msg5StartKey
m_pri2 = -1; // MAX_SPIDER_PRIORITIES - 1;
m_twinDied = false;
m_lastUrlFiltersUpdate = 0;
m_isPopulatingDoledb = false;
char *coll = "unknown";
if ( m_coll[0] ) coll = m_coll;
log(LOG_DEBUG,"spider: resetting spider cache coll=%s",coll);
m_ufnMapValid = false;
m_doleIpTable .reset();
m_cdTable .reset();
m_sniTable .reset();
m_waitingTable.reset();
m_waitingTree .reset();
m_waitingMem .reset();
m_winnerTree .reset();
m_winnerTable .reset();
m_dupCache .reset();
if ( m_overflowList ) {
mfree ( m_overflowList , OVERFLOWLISTSIZE * 4 ,"olist" );
m_overflowList = NULL;
}
// each spider priority in the collection has essentially a cursor
// that references the next spider rec in doledb to spider. it is
// used as a performance hack to avoid the massive positive/negative
// key annihilations related to starting at the top of the priority
// queue every time we scan it, which causes us to do upwards of
// 300 re-reads!
for ( int32_t i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) {
m_nextKeys[i] = g_doledb.makeFirstKey2 ( i );
m_isDoledbEmpty[i] = 0;
}
// assume the whole thing is not empty
//m_allDoledbPrioritiesEmpty = 0;//false;
//m_lastEmptyCheck = 0;
}
bool SpiderColl::updateSiteNumInlinksTable ( int32_t siteHash32,
int32_t sni,
time_t timestamp ) {
// do not update if invalid
if ( sni == -1 ) return true;
// . get entry for siteNumInlinks table
// . use 32-bit key specialized lookup for speed
uint64_t *val = (uint64_t *)m_sniTable.getValue32(siteHash32);
// bail?
if ( val && ((*val)&0xffffffff) > (uint32_t)timestamp ) return true;
// . make new data for this key
// . lower 32 bits is the addedTime
// . upper 32 bits is the siteNumInlinks
uint64_t nv = (uint32_t)sni;
// shift up
nv <<= 32;
// or in time
nv |= (uint32_t)timestamp;//sreq->m_addedTime;
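// e.g. sni=12 (0xc) and timestamp=0x55aa1234 pack to
// nv = 0x0000000c55aa1234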
// just direct update if faster
if ( val ) *val = nv;
// store it anew otherwise
else if ( ! m_sniTable.addKey(&siteHash32,&nv) )
// return false with g_errno set on error
return false;
// success
return true;
}
// . we call this when we receive a spider reply in Rdb.cpp
// . returns false and sets g_errno on error
// . xmldoc.cpp adds reply AFTER the negative doledb rec since we decrement
// the count in m_doleIpTable here
bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
////
//
// skip if not assigned to us for doling
//
////
if ( ! isAssignedToUs ( srep->m_firstIp ) )
return true;
/////////
//
// remove the lock here
//
//////
int64_t lockKey = makeLockTableKey ( srep );
// shortcut
HashTableX *ht = &g_spiderLoop.m_lockTable;
UrlLock *lock = (UrlLock *)ht->getValue ( &lockKey );
time_t nowGlobal = getTimeGlobal();
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: removing lock uh48=%"INT64" "
"lockKey=%"UINT64"",
srep->getUrlHash48(),
lockKey );
// test it
//if ( m_nowGlobal == 0 && lock )
// m_nowGlobal = getTimeGlobal();
// we do it this way rather than remove it ourselves
// because a lock request for this guy
// might be currently outstanding, and it will end up
// being granted the lock even though we have by now removed
// it from doledb, because it read doledb before we removed
// it! so wait a couple of seconds for the doledb negative key to
// be absorbed to prevent a url we just spidered from being
// re-spidered right away because of this sync issue.
// . if we wait too long then the round end time, SPIDER_DONE_TIMER,
// will kick in before us and end the round, then we end up
// spidering a previously locked url right after and DOUBLE
// increment the round!
if ( lock ) lock->m_expires = nowGlobal + 2;
/////
//
// but do note that its spider has returned for populating the
// waiting tree. addToWaitingTree should not add an entry if
// a spiderReply is still pending according to the lock table,
// UNLESS, maxSpidersPerIP is more than what the lock table says
// is currently being spidered.
//
/////
if ( lock ) lock->m_spiderOutstanding = false;
// bitch if not in there
// when "rebuilding" (Rebuild.cpp) this msg gets triggered too much...
// so only show it when in debug mode.
if ( !lock && g_conf.m_logDebugSpider)//ht->isInTable(&lockKey))
logf(LOG_DEBUG,"spider: rdb: lockKey=%"UINT64" "
"was not in lock table",lockKey);
// now just remove it since we only spider our own urls
// and doledb is in memory
g_spiderLoop.m_lockTable.removeKey ( &lockKey );
// update the latest siteNumInlinks count for this "site" (repeatbelow)
updateSiteNumInlinksTable ( srep->m_siteHash32,
srep->m_siteNumInlinks,
srep->m_spideredTime );
// . skip the rest if injecting
// . otherwise it triggers a lookup for this firstip in spiderdb to
// get a new spider request to add to doledb
// . no, because there might be more on disk from the same firstip
// so comment this out again
//if ( srep->m_fromInjectionRequest )
// return true;
// clear error for this
g_errno = 0;
// . update the latest crawl delay for this domain
// . only add to the table if we had a crawl delay
// . -1 implies an invalid or unknown crawl delay
// . we have to store crawl delays of -1 now so we at least know we
// tried to download the robots.txt (todo: verify that!)
// and the webmaster did not have one. then we can
// crawl more vigorously...
//if ( srep->m_crawlDelayMS >= 0 ) {
bool update = false;
// use the domain hash for this guy! since its from robots.txt
int32_t *cdp = (int32_t *)m_cdTable.getValue32(srep->m_domHash32);
// update it only if better or empty
if ( ! cdp ) update = true;
// no update if injecting or from pagereindex (docid based spider request)
if ( srep->m_fromInjectionRequest )
update = false;
//else if (((*cdp)&0xffffffff)<(uint32_t)srep->m_spideredTime)
// update = true;
// update m_cdTable if we should
if ( update ) {
// . make new data for this key
// . we now store just the crawl delay (in ms); the old packed
//   spideredTime/crawldelay format is commented out below
int32_t nv = (int32_t)(srep->m_crawlDelayMS);
// shift up
//nv <<= 32;
// or in time
//nv |= (uint32_t)srep->m_spideredTime;
// just direct update if faster
if ( cdp ) *cdp = nv;
// store it anew otherwise
else if ( ! m_cdTable.addKey(&srep->m_domHash32,&nv)){
// return false with g_errno set on error
//return false;
log("spider: failed to add crawl delay for "
"firstip=%s",iptoa(srep->m_firstIp));
// just ignore
g_errno = 0;
}
}
// . anytime we add a reply then
// we must update this downloadTable with the replies
// SpiderReply::m_downloadEndTime so we can obey sameIpWait
// . that is the earliest that this url can be respidered, but we
// also have a sameIpWait constraint we have to consider...
// . we alone are responsible for adding doledb recs from this ip so
// this is easy to throttle...
// . and make sure to only add to this download time hash table if
// SpiderReply::m_downloadEndTime is non-zero, because zero means
// no download happened. (TODO: check this)
// . TODO: consult crawldelay table here too! use that value if is
// less than our sameIpWait
// . make m_lastDownloadTable an rdbcache ...
// . this is 0 for pagereindex docid-based replies
if ( srep->m_downloadEndTime )
m_lastDownloadCache.addLongLong ( m_collnum,
srep->m_firstIp ,
srep->m_downloadEndTime );
// log this for now
if ( g_conf.m_logDebugSpider )
log("spider: adding spider reply, download end time %"INT64" for "
"ip=%s(%"UINT32") uh48=%"UINT64" indexcode=\"%s\" coll=%"INT32" "
"k.n1=%"UINT64" k.n0=%"UINT64"",
//"to SpiderColl::m_lastDownloadCache",
srep->m_downloadEndTime,
iptoa(srep->m_firstIp),
(uint32_t)srep->m_firstIp,
srep->getUrlHash48(),
mstrerror(srep->m_errCode),
(int32_t)m_collnum,
srep->m_key.n1,
srep->m_key.n0);
// ignore errors from that, it's just a cache
g_errno = 0;
// sanity check - test cache
//if ( g_conf.m_logDebugSpider && srep->m_downloadEndTime ) {
// int64_t last = m_lastDownloadCache.getLongLong ( m_collnum ,
// srep->m_firstIp ,
// -1,// maxAge
// true );//pro
// if ( last != srep->m_downloadEndTime ) { char *xx=NULL;*xx=0;}
//}
// skip:
// . add to wait tree and let it populate doledb on its batch run
// . use a spiderTime of 0 which means unknown and that it needs to
// scan spiderdb to get that
// . returns false if did not add to waiting tree
// . returns false and sets g_errno on error
bool added = addToWaitingTree ( 0LL, srep->m_firstIp , true );
// ignore errors i guess
g_errno = 0;
// if added to waiting tree, bail now, needs to scan spiderdb
// in order to add to doledb, because it won't add to waiting tree
// if we already have spiderrequests in doledb for this firstip
if ( added ) return true;
// spider some urls that were doled to us
g_spiderLoop.spiderDoledUrls( );
return true;
}
void SpiderColl::removeFromDoledbTable ( int32_t firstIp ) {
// . decrement doledb table ip count for firstIp
// . update how many per ip we got doled
int32_t *score = (int32_t *)m_doleIpTable.getValue32 ( firstIp );
// wtf! how did this spider without being doled?
if ( ! score ) {
//if ( ! srep->m_fromInjectionRequest )
log("spider: corruption. received spider reply whose "
"ip has no entry in dole ip table. firstip=%s",
iptoa(firstIp));
return;
}
// reduce it
*score = *score - 1;
// now we log it too
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"spider: removed ip=%s from doleiptable "
"(newcount=%"INT32")", iptoa(firstIp),*score);
// all done?
if ( g_conf.m_logDebugSpider ) {
// log that too!
logf(LOG_DEBUG,"spider: discounting firstip=%s to %"INT32"",
iptoa(firstIp),*score);
}
// wtf! count should never go negative
if ( *score < 0 ) { char *xx=NULL;*xx=0; }
// . remove if zero
// . do this last since "score" points into the table's buffer and
//   is no longer safe to read once the slot has been removed
if ( *score == 0 ) {
// this can fail if writes are disabled on this hashtablex
// because it is saving
m_doleIpTable.removeKey ( &firstIp );
// sanity check
//if ( ! m_doleIpTable.m_isWritable ) { char *xx=NULL;*xx=0; }
}
}
bool SpiderColl::isInDupCache ( SpiderRequest *sreq , bool addToCache ) {
// init dup cache?
if ( ! m_dupCache.isInitialized() )
// use 50k i guess of 64bit numbers and linked list info
m_dupCache.init ( 90000,
4 , // fixeddatasize (don't really need this)
false, // list support?
5000, // maxcachenodes
false, // usehalfkeys?
"urldups", // dbname
false, // loadfromdisk
12, // cachekeysize
0, // datakeysize
-1 ); // numptrsmax
// don't add dups over and over again...
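// . the effective cache key is uh48 XORed with a distinct constant for
//   each request flag below, then XORed with min(hopCount,3) at add
//   time, so the same url can sit in the cache separately for different
//   flag combinations and hop counts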
int64_t dupKey64 = sreq->getUrlHash48();
// . these flags make a big difference in url filters
// . NOTE: if you see a url that should be getting spidered but is not,
//   it might be because we are not incorporating other flags here...
if ( sreq->m_fakeFirstIp ) dupKey64 ^= 12345;
if ( sreq->m_isAddUrl ) dupKey64 ^= 49387333;
if ( sreq->m_isInjecting ) dupKey64 ^= 3276404;
if ( sreq->m_isPageReindex) dupKey64 ^= 32999604;
if ( sreq->m_forceDelete ) dupKey64 ^= 29386239;
if ( sreq->m_hadReply ) dupKey64 ^= 293294099;
if ( sreq->m_sameDom ) dupKey64 ^= 963493311;
if ( sreq->m_sameHost ) dupKey64 ^= 288844772;
if ( sreq->m_sameSite ) dupKey64 ^= 58320355;
// . maxage=86400,promoteRec=yes. returns -1 if not in there
// . dupKey64 is for hopcount 0, so if this url is in the dupcache
// with a hopcount of zero, do not add it
if ( m_dupCache.getLong ( 0,dupKey64,86400,true ) != -1 ) {
dedup:
if ( g_conf.m_logDebugSpider )
log("spider: skipping dup request url=%s uh48=%"UINT64"",
sreq->m_url,sreq->getUrlHash48());
return true;
}
// if our hopcount is 2 and there is a hopcount 1 in there, do not add
if ( sreq->m_hopCount >= 2 &&
m_dupCache.getLong ( 0,dupKey64 ^ 0x01 ,86400,true ) != -1 )
goto dedup;
// likewise, if there's a hopcount 2 in there, do not add if we are 3+
if ( sreq->m_hopCount >= 3 &&
m_dupCache.getLong ( 0,dupKey64 ^ 0x02 ,86400,true ) != -1 )
goto dedup;
if ( ! addToCache ) return false;
int32_t hc = sreq->m_hopCount;
// limit hopcount to 3 for making cache key so we don't flood cache
if ( hc >= 3 ) hc = 3;
// mangle the key with hopcount before adding it to the cache
dupKey64 ^= hc;
// add it
m_dupCache.addLong(0,dupKey64 ,1);
return false;
}
// . Rdb.cpp calls SpiderColl::addSpiderRequest/Reply() for every positive
// spiderdb record it adds to spiderdb. that way our cache is kept
// up to date incrementally
// . returns false and sets g_errno on error
// . if the spiderTime appears to be AFTER m_nextReloadTime then we should
// not add this spider request to keep the cache trimmed!!! (MDW: TODO)
// . BUT! if we have 150,000 urls that is going to take a long time to
// spider, so it should have a high reload rate!
bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
int64_t nowGlobalMS ) {
// don't add negative keys or data less thangs
if ( sreq->m_dataSize <= 0 ) {
//if ( g_conf.m_logDebugSpider )
log("spider: add spider request is dataless for "
"uh48=%"UINT64"",sreq->getUrlHash48());
//char *xx=NULL;*xx=0;
return true;
}
// . are we already more or less in spiderdb? true = addToCache
// . put this above isAssignedToUs() so we *try* to keep twins in sync because
// Rdb.cpp won't add the spiderrequest if its in this dup cache, and we add
// it to the dupcache here...
if ( isInDupCache ( sreq , true ) ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping dup request url=%s uh48=%"UINT64"",
sreq->m_url,sreq->getUrlHash48());
return true;
}
// skip if not assigned to us for doling
if ( ! isAssignedToUs ( sreq->m_firstIp ) ) {
if ( g_conf.m_logDebugSpider )
log("spider: spider request not assigned to us. "
"skipping.");
return true;
}
// . get the url's length contained in this record
// . it should be NULL terminated
// . we set the ip here too
int32_t ulen = sreq->getUrlLen();
// watch out for corruption
if ( sreq->m_firstIp == 0 || sreq->m_firstIp == -1 || ulen <= 0 ) {
log("spider: Corrupt spider req with url length of "
"%"INT32" <= 0 u=%s. dataSize=%"INT32" firstip=%"INT32" uh48=%"UINT64". Skipping.",
ulen,sreq->m_url,
sreq->m_dataSize,sreq->m_firstIp,sreq->getUrlHash48());
return true;
}
// update crawlinfo stats here and not in xmldoc so that we count
// seeds and bulk urls added from add url and can use that to
// determine if the collection is empty of urls or not for printing
// out the colored bullets in printCollectionNavBar() in Pages.cpp.
CollectionRec *cr = g_collectiondb.m_recs[m_collnum];
if ( cr ) {
cr->m_localCrawlInfo .m_urlsHarvested++;
cr->m_globalCrawlInfo.m_urlsHarvested++;
cr->m_needsSave = true;
}
// . if already have a request in doledb for this firstIp, forget it!
// . TODO: make sure we remove from doledb first before adding this
// spider request
// . NOW: allow it in if different priority!!! so maybe hash the
// priority in with the firstIp???
// . we really just need to add it if it beats what is currently
// in doledb. so maybe store the best priority doledb in the
// data value part of the doleiptable...? therefore we should
// probably move this check down below after we get the priority
// of the spider request.
//char *val = (char *)m_doleIpTable.getValue ( &sreq->m_firstIp );
//if ( val && *val > 0 ) {
// if ( g_conf.m_logDebugSpider )
// log("spider: request IP already in dole table");
// return true;
//}
// . skip if already in wait tree
// . no, no. what if the current url for this firstip is not due to
// be spidered until 24 hrs and we are adding a url from this firstip
// that should be spidered now...
//if ( m_waitingTable.isInTable ( &sreq->m_firstIp ) ) {
// if ( g_conf.m_logDebugSpider )
// log("spider: request already in waiting table");
// return true;
//}
// . we can't do this because we do not have the spiderReply!!!???
// . MDW: no, we have to do it because tradesy.com has links to twitter
// on every page and twitter is not allowed so we continually
// re-scan a big spiderdblist for twitter's firstip. major performance
// degradation. so try to get ufn without reply. if we need
// a reply to get the ufn then this function should return -1 which
// means an unknown ufn and we'll add to waiting tree.
// get ufn/priority,because if filtered we do not want to add to doledb
int32_t ufn ;
// HACK: set isOutlink to true here since we don't have a SpiderReply
ufn = ::getUrlFilterNum(sreq,NULL,nowGlobalMS,false,MAX_NICENESS,m_cr,
true,//isoutlink? HACK!
NULL,// quota table quotatable
-1 ); // langid not valid
// sanity check
//if ( ufn < 0 ) {
// log("spider: failed to add spider request for %s because "
// "it matched no url filter",
// sreq->m_url);
// g_errno = EBADENGINEER;
// return false;
//}
// spiders disabled for this row in url filters?
if ( ufn >= 0 && m_cr->m_maxSpidersPerRule[ufn] == 0 ) {
if ( g_conf.m_logDebugSpider )
log("spider: request spidersoff ufn=%"INT32" url=%s",ufn,
sreq->m_url);
return true;
}
// set the priority (might be the same as old)
int32_t priority = -1;
if ( ufn >= 0 ) priority = m_cr->m_spiderPriorities[ufn];
// sanity checks
//if ( priority == -1 ) { char *xx=NULL;*xx=0; }
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
// do not add to doledb if bad
// (guard ufn >= 0 since getUrlFilterNum() can return -1 when it
// needs a reply we do not have)
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
if ( ufn >= 0 && m_cr->m_forceDelete[ufn] ) {
if ( g_conf.m_logDebugSpider )
log("spider: request %s is filtered ufn=%"INT32"",
sreq->m_url,ufn);
return true;
}
//if ( priority == SPIDER_PRIORITY_BANNED ) {
if ( ufn >= 0 && m_cr->m_forceDelete[ufn] ) {
if ( g_conf.m_logDebugSpider )
log("spider: request %s is banned ufn=%"INT32"",
sreq->m_url,ufn);
return true;
}
// set it for adding to doledb and computing spidertime
//sreq->m_ufn = ufn;
//sreq->m_priority = priority;
// get spider time -- i.e. earliest time when we can spider it
//uint64_t spiderTimeMS = getSpiderTimeMS (sreq,ufn,NULL,nowGlobalMS );
// sanity
//if ( (int64_t)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; }
// once in waiting tree, we will scan waiting tree and then lookup
// each firstIp in waiting tree in spiderdb to get the best
// SpiderRequest for that firstIp, then we can add it to doledb
// as int32_t as it can be spidered now
//bool status = addToWaitingTree ( spiderTimeMS,sreq->m_firstIp,true);
bool added = addToWaitingTree ( 0 , sreq->m_firstIp , true );
// if already doled and we beat the priority/spidertime of what
// was doled then we should probably delete the old doledb key
// and add the new one. hmm, the waitingtree scan code ...
// sanity check
//int64_t ttt=getEarliestSpiderTimeFromWaitingTree(sreq->m_firstIp);
//logf (LOG_DEBUG,"spider: earliestime=%"INT64" for firstip=%s",
// ttt,iptoa(sreq->m_firstIp));
//if ( ttt != (int64_t)spiderTimeMS ) { char *xx=NULL;*xx=0; }
// update the latest siteNumInlinks count for this "site"
if ( sreq->m_siteNumInlinksValid ) {
// updates m_siteNumInlinksTable
updateSiteNumInlinksTable ( sreq->m_siteHash32 ,
sreq->m_siteNumInlinks ,
(time_t)sreq->m_addedTime );
// clear error for this if there was any
g_errno = 0;
}
if ( ! g_conf.m_logDebugSpider ) return true;//status;
char *msg = "ADDED";
if ( ! added ) msg = "DIDNOTADD";
// log it
logf(LOG_DEBUG,
"spider: %s request to waiting tree %s "
"uh48=%"UINT64" "
"firstIp=%s "
"pageNumInlinks=%"UINT32" "
"parentdocid=%"UINT64" "
"isinjecting=%"INT32" "
"ispagereindex=%"INT32" "
"ufn=%"INT32" "
"priority=%"INT32" "
"addedtime=%"UINT32" "
//"spidertime=%"UINT64"",
,
msg,
sreq->m_url,
sreq->getUrlHash48(),
iptoa(sreq->m_firstIp),
(uint32_t)sreq->m_pageNumInlinks,//(uint32_t)sreq->m_parentFirstIp
sreq->getParentDocId(),
(int32_t)(bool)sreq->m_isInjecting,
(int32_t)(bool)sreq->m_isPageReindex,
(int32_t)sreq->m_ufn,
(int32_t)sreq->m_priority,
(uint32_t)sreq->m_addedTime
//spiderTimeMS);
);
return true;//status;
}
bool SpiderColl::printWaitingTree ( ) {
int32_t node = m_waitingTree.getFirstNode();
for ( ; node >= 0 ; node = m_waitingTree.getNextNode(node) ) {
key_t *wk = (key_t *)m_waitingTree.getKey (node);
// spider time is up top
uint64_t spiderTimeMS = (wk->n1);
spiderTimeMS <<= 32;
spiderTimeMS |= ((wk->n0) >> 32);
// then ip
int32_t firstIp = wk->n0 & 0xffffffff;
// show it
log("dump: time=%"INT64" firstip=%s",spiderTimeMS,iptoa(firstIp));
}
return true;
}
bool SpiderLoop::printLockTable ( ) {
// count locks
HashTableX *ht = &g_spiderLoop.m_lockTable;
// scan the slots
int32_t ns = ht->m_numSlots;
for ( int32_t i = 0 ; i < ns ; i++ ) {
// skip if empty
if ( ! ht->m_flags[i] ) continue;
// cast lock
UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
// get the key
int64_t lockKey = *(int64_t *)ht->getKeyFromSlot(i);
// show it
log("dump: lock. "
"lockkey=%"INT64" "
"spiderout=%"INT32" "
"confirmed=%"INT32" "
"firstip=%s "
"expires=%"INT32" "
"hostid=%"INT32" "
"timestamp=%"INT32" "
"sequence=%"INT32" "
"collnum=%"INT32" "
,lockKey
,(int32_t)(lock->m_spiderOutstanding)
,(int32_t)(lock->m_confirmed)
,iptoa(lock->m_firstIp)
,lock->m_expires
,lock->m_hostId
,lock->m_timestamp
,lock->m_lockSequence
,(int32_t)lock->m_collnum
);
}
return true;
}
//////
//
// . 1. called by addSpiderReply(). it should have the sameIpWait available
// or at least that will be in the crawldelay cache table.
// SpiderReply::m_crawlDelayMS. Unfortunately, no maxSpidersPerIP!!!
// we just add a "0" in the waiting tree which means evalIpLoop() will
// be called and can get the maxSpidersPerIP from the winning candidate
// and add to the waiting tree based on that.
// . 2. called by addSpiderRequest(). It SHOULD maybe just add a "0" as well
// to offload the logic. try that.
// . 3. called by populateWaitingTreeFromSpiderdb(). it just adds "0" as well,
// if not doled
// . 4. UPDATED in evalIpLoop() if the best SpiderRequest for a firstIp is
// in the future, this is the only time we will add a waiting tree key
// whose spider time is non-zero. that is where we also take
// sameIpWait and maxSpidersPerIP into consideration. evalIpLoop()
// will actually REMOVE the entry from the waiting tree if that IP
// already has the max spiders outstanding per IP. when a spiderReply
// is received it will populate the waiting tree again with a "0" entry
// and evalIpLoop() will re-do its check.
//
//////
// . returns true if we added to waiting tree, false if not
// . if one of these add fails consider increasing mem used by tree/table
// . if we lose an ip that sux because it won't be gotten again unless
// we somehow add another request/reply to spiderdb in the future
bool SpiderColl::addToWaitingTree ( uint64_t spiderTimeMS , int32_t firstIp ,
bool callForScan ) {
// skip if already in wait tree. no - might be an override with
// a sooner spiderTimeMS
//if ( m_waitingTable.isInTable ( &firstIp ) ) return true;
if ( g_conf.m_logDebugSpider )
log("spider: addtowaitingtree ip=%s",iptoa(firstIp));
// we are currently reading spiderdb for this ip and trying to find
// the best SpiderRequest or requests to add to doledb. so if this
// happens, let the scan know that more replies or requests came in
// while we were scanning; otherwise, if the scan finds nothing to add
// to doledb, it will delete the rec from the waiting tree and we'd
// lose this ip forever, or until the next waitingtree rebuild was
// triggered in time.
//
// Before i was only setting this in addSpiderRequest() so if a new
// reply came in it was not setting m_gotNewDataForScanningIp and
// we ended up losing the IP from the waiting tree forever (or until
// the next timed rebuild). putting it here seems to fix that.
if ( firstIp == m_scanningIp ) {
m_gotNewDataForScanningIp = m_scanningIp;
//log("spider: got new data for %s",iptoa(firstIp));
//return true;
}
// . this can now be only 0
// . only evalIpLoop() will add a waiting tree key with a non-zero
// value after it figures out the EARLIEST time that a
// SpiderRequest from this firstIp can be spidered.
if ( spiderTimeMS != 0 ) { char *xx=NULL;*xx=0; }
// waiting tree might be saving!!!
if ( ! m_waitingTree.m_isWritable ) {
log("spider: addtowaitingtree: failed. is not writable. "
"saving?");
return false;
}
// only if we are the responsible host in the shard
if ( ! isAssignedToUs ( firstIp ) )
return false;
// . do not add to waiting tree if already in doledb
// . an ip should not exist in both doledb and waiting tree.
// . waiting tree is meant to be a signal that we need to add
// a spiderrequest from that ip into doledb where it can be picked
// up for immediate spidering
if ( m_doleIpTable.isInTable ( &firstIp ) ) {
if ( g_conf.m_logDebugSpider )
log("spider: not adding to waiting tree, already in "
"doleip table");
return false;
}
// sanity check
// i think this triggered on gk209 during an auto-save!!! FIX!
if ( ! m_waitingTree.m_isWritable ) { char *xx=NULL; *xx=0; }
/*
///////
//
// compute the min time for this entry to satisfy sameIpWait
//
///////
int64_t spiderTimeMS = spiderTimeMSArg;
int64_t lastDownloadTimeMS = lastDownloadTime ( firstIp );
// how long to wait between downloads from same ip in milliseconds?
int32_t sameIpWaitTime = 250; // ms
if ( ufn >= 0 ) {
int32_t siwt = m_sc->m_cr->m_spiderIpWaits[ufn];
if ( siwt >= 0 ) sameIpWaitTime = siwt;
}
int64_t minDownloadTime = sameIpWaitTime + siwt;
// use that if it is more restrictive
if ( minDownloadTime > now && minDownloadTime > spiderTimeMS )
spiderTimeMS = minDownloadTime;
*/
// see if in tree already, so we can delete it and replace it below
int32_t ws = m_waitingTable.getSlot ( &firstIp ) ;
// . this is >= 0 if already in tree
// . if spiderTimeMS is a sooner time than what this firstIp already
// has as its earliest time, then we will override it and have to
// update both m_waitingTree and m_waitingTable, however
// IF the spiderTimeMS is a later time, then we bail without doing
// anything at this point.
if ( ws >= 0 ) {
// get timems from waiting table
int64_t sms = m_waitingTable.getScore64FromSlot(ws);
// get current time
//int64_t nowMS = gettimeofdayInMillisecondsGlobal();
// make the key then
key_t wk = makeWaitingTreeKey ( sms, firstIp );
// must be there
int32_t tn = m_waitingTree.getNode ( (collnum_t)0, (char *)&wk );
// sanity check. ensure waitingTable and waitingTree in sync
if ( tn < 0 ) { char *xx=NULL;*xx=0; }
// not only must we be a sooner time, but we must be 5 seconds
// sooner than the time currently in there to avoid thrashing
// when we had a ton of outlinks with this first ip within a
// 5-second interval.
//
// i'm not so sure what i was doing here before, but i don't
// want to starve the spiders, so make this 100ms not 5000ms
if ( (int64_t)spiderTimeMS > sms - 100 ) {
if ( g_conf.m_logDebugSpider )
log("spider: skip updating waiting tree");
return false;
}
// log the replacement
if ( g_conf.m_logDebugSpider )
log("spider: replacing waitingtree key "
"oldtime=%"UINT32" newtime=%"UINT32" firstip=%s",
(uint32_t)(sms/1000LL),
(uint32_t)(spiderTimeMS/1000LL),
iptoa(firstIp));
// remove from tree so we can add it below
m_waitingTree.deleteNode3 ( tn , false );
//log("spider: 4 del node %"INT32" for %s",tn,iptoa(firstIp));
}
else {
char *s="";
// time of 0 means we got the reply for something we spidered
// in doledb so we will need to recompute the best spider
// requests for this first ip
if ( spiderTimeMS==0 ) s = "(replyreset)";
// log the replacement
if ( g_conf.m_logDebugSpcache )
log("spider: adding new key to waitingtree "
"newtime=%"UINT32"%s firstip=%s",
(uint32_t)(spiderTimeMS/1000LL),s,
iptoa(firstIp));
}
// make the key
key_t wk = makeWaitingTreeKey ( spiderTimeMS, firstIp );
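// spiderTimeMS occupies the high bits of the key, so the waiting tree
// stays ordered by due time and getNextIpFromWaitingTree() can stop
// scanning at the first entry that is not yet due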
// what is this?
if ( firstIp == 0 || firstIp == -1 ) {
log("spider: got ip of %s. cn=%"INT32" "
"wtf? failed to add to "
"waiting tree, but return true anyway.",
iptoa(firstIp) ,
(int32_t)m_collnum);
// return true (not false) lest m_nextKey2 never gets updated
// and we end up in an infinite loop doing
// populateWaitingTreeFromSpiderdb()
return true;
//return false;
char *xx=NULL; *xx=0;
}
// grow the tree if too small!
int32_t used = m_waitingTree.getNumUsedNodes();
int32_t max = m_waitingTree.getNumTotalNodes();
if ( used + 1 > max ) {
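// grow by ~150% of the currently used node count, clamped to
// between 10 and 100000 extra nodes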
int32_t more = (((int64_t)used) * 15) / 10;
if ( more < 10 ) more = 10;
if ( more > 100000 ) more = 100000;
int32_t newNum = max + more;
log("spider: growing waiting tree to from %"INT32" to %"INT32" nodes "
"for collnum %"INT32"",
max , newNum , (int32_t)m_collnum );
if ( ! m_waitingTree.growTree ( newNum , MAX_NICENESS ) )
return log("spider: failed to grow waiting tree to "
"add firstip %s",iptoa(firstIp) );
if ( ! m_waitingTable.setTableSize ( newNum , NULL , 0 ) )
return log("spider: failed to grow waiting table to "
"add firstip %s",iptoa(firstIp) );
}
// add that
int32_t wn;
if ( ( wn = m_waitingTree.addKey ( &wk ) ) < 0 ) {
log("spider: waitingtree add failed ip=%s. increase max nodes "
"lest we lose this IP forever. err=%s",
iptoa(firstIp),mstrerror(g_errno));
//char *xx=NULL; *xx=0;
return false;
}
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: added time=%"INT64" ip=%s to waiting tree "
"scan=%"INT32" node=%"INT32"",
spiderTimeMS , iptoa(firstIp),(int32_t)callForScan,wn);
// add to table now since its in the tree
if ( ! m_waitingTable.addKey ( &firstIp , &spiderTimeMS ) ) {
// remove from tree then
m_waitingTree.deleteNode3 ( wn , false );
//log("spider: 5 del node %"INT32" for %s",wn,iptoa(firstIp));
return false;
}
// . kick off a scan, i don't care if this blocks or not!
// . the populatedoledb loop might already have a scan in progress
// but usually it won't, so rather than wait for its sleepwrapper
// to be called we force it here for speed.
// . re-entry is false because we are entering for the first time
// . calling this every time msg4 adds a spider request is super slow!!!
// SO TAKE THIS OUT FOR NOW
// . no that was not it. mdw. put it back.
if ( callForScan ) populateDoledbFromWaitingTree ( );
// tell caller there was no error
return true;
}
// . this scan is started anytime we call addSpiderRequest() or addSpiderReply
// . if nothing is in tree it quickly exits
// . otherwise it scans the entries in the tree
// . each entry is a key with spiderTime/firstIp
// . if spiderTime > now it stops the scan
// . if the firstIp is already in doledb (m_doleIpTable) then it removes
// it from the waitingtree and waitingtable. how did that happen?
// . otherwise, it looks up that firstIp in spiderdb to get a list of all
// the spiderdb recs from that firstIp
// . then it selects the "best" one and adds it to doledb. once added to
// doledb it adds it to doleIpTable, and remove from waitingtree and
// waitingtable
// . returns false if blocked, true otherwise
int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
// if nothing to scan, bail
if ( m_waitingTree.isEmpty() ) return 0;
// reset first key to get first rec in waiting tree
m_waitingTreeKey.setMin();
// current time on host #0
uint64_t nowMS = gettimeofdayInMillisecondsGlobal();
top:
// we might have deleted the only node below...
if ( m_waitingTree.isEmpty() ) return 0;
// advance to next
//m_waitingTreeKey += 1LL;
// assume none
int32_t firstIp = 0;
// set node from wait tree key. this way we can resume from a prev key
int32_t node = m_waitingTree.getNextNode ( 0, (char *)&m_waitingTreeKey );
// if empty, stop
if ( node < 0 ) return 0;
// breathe. take this out fix core because cr could be deleted...
//QUICKPOLL(MAX_NICENESS);
// get the key
key_t *k = (key_t *)m_waitingTree.getKey ( node );
// ok, we got one
firstIp = (k->n0) & 0xffffffff;
// sometimes we take over for a dead host, but if he's no longer
// dead then we can remove his keys. but first make sure we have had
// at least one ping from him so we do not remove at startup.
// if it is in doledb or in the middle of being added to doledb
// via msg4, nuke it as well!
if ( ! isAssignedToUs (firstIp) || m_doleIpTable.isInTable(&firstIp)) {
// only delete if this host is alive and has sent us a ping
// before so we know he was up at one time. this way we do not
// remove all his keys just because we restarted and think he
// is alive even though we have gotten no ping from him.
//if ( hp->m_numPingRequests > 0 )
// these operations should fail if writes have been disabled
// and because the trees/tables for spidercache are saving
// in Process.cpp's g_spiderCache::save() call
m_waitingTree.deleteNode3 ( node , true );
//log("spdr: 8 del node node %"INT32" for %s",node,iptoa(firstIp));
// note it
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"spider: removed1 ip=%s from waiting "
"tree. nn=%"INT32"",
iptoa(firstIp),m_waitingTree.m_numUsedNodes);
// log it
if ( g_conf.m_logDebugSpcache )
log("spider: erasing waitingtree key firstip=%s",
iptoa(firstIp) );
// remove from table too!
m_waitingTable.removeKey ( &firstIp );
goto top;
}
// spider time is up top
uint64_t spiderTimeMS = (k->n1);
spiderTimeMS <<= 32;
spiderTimeMS |= ((k->n0) >> 32);
// stop if need to wait for this one
if ( spiderTimeMS > nowMS ) return 0;
// sanity
if ( (int64_t)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; }
// save key for deleting when done
m_waitingTreeKey.n1 = k->n1;
m_waitingTreeKey.n0 = k->n0;
m_waitingTreeKeyValid = true;
m_scanningIp = firstIp;
// sanity
if ( firstIp == 0 || firstIp == -1 ) { char *xx=NULL;*xx=0; }
// we set this to true when done
//m_isReadDone = false;
// compute the best request from spiderdb list, not valid yet
//m_bestRequestValid = false;
m_lastReplyValid = false;
// start reading spiderdb here
m_nextKey = g_spiderdb.makeFirstKey(firstIp);
m_endKey = g_spiderdb.makeLastKey (firstIp);
// all done
return firstIp;
}
uint64_t SpiderColl::getNextSpiderTimeFromWaitingTree ( ) {
// if nothing to scan, bail
if ( m_waitingTree.isEmpty() ) return 0LL;
// the key
key_t mink; mink.setMin();
// set node from wait tree key. this way we can resume from a prev key
int32_t node = m_waitingTree.getNextNode (0,(char *)&mink );
// if empty, stop
if ( node < 0 ) return 0LL;
// get the key
key_t *wk = (key_t *)m_waitingTree.getKey ( node );
// time from that
uint64_t spiderTimeMS = (wk->n1);
spiderTimeMS <<= 32;
spiderTimeMS |= ((wk->n0) >> 32);
// stop if need to wait for this one
return spiderTimeMS;
}
static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
SpiderColl *THIS = (SpiderColl *)state;
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( THIS ,"2") ) return;
THIS->m_gettingList2 = false;
THIS->populateWaitingTreeFromSpiderdb ( true );
}
// lower from 1300 to 300
#define MAXUDPSLOTS 300
//////////////////
//////////////////
//
// THE BACKGROUND FUNCTION
//
// when the user changes the ufn table the waiting tree is flushed
// and repopulated from spiderdb with this. also used for repairs.
//
//////////////////
//////////////////
// . this stores an ip into the waiting tree with a spidertime of "0" so
// it will be evaluated properly by populateDoledbFromWaitingTree()
//
// . scan spiderdb to make sure each firstip represented in spiderdb is
// in the waiting tree. it seems they fall out over time. we need to fix
// that but in the meantime this should do a bg repair. and is nice to have
// . the waiting tree key is really just a spidertime and a firstip. so we will
// still need populatedoledbfromwaitingtree to periodically scan firstips
// that are already in doledb to see if it has a higher-priority request
// for that firstip. in which case it can add that to doledb too, but then
// we have to be sure to only grant one lock for a firstip to avoid hammering
// that firstip
// . this should be called from a sleepwrapper, the same sleep wrapper we
// call populateDoledbFromWaitingTree() from should be fine
void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
// skip if in repair mode
if ( g_repairMode ) return;
// sanity
if ( m_deleteMyself ) { char *xx=NULL;*xx=0; }
// skip if spiders off
if ( ! m_cr->m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
// if entering for the first time, we need to read list from spiderdb
if ( ! reentry ) {
// just return if we should not be doing this yet
if ( ! m_waitingTreeNeedsRebuild ) return;
// a double call? can happen if list read is slow...
if ( m_gettingList2 ) return;
// . borrow a msg5
// . if none available just return, we will be called again
// by the sleep/timer function
// . read in a replacement SpiderRequest to add to doledb from
// this ip
// . get the list of spiderdb records
// . do not include cache, those results are old and will mess
// us up
log(LOG_DEBUG,"spider: populateWaitingTree: "
"calling msg5: startKey=0x%"XINT64",0x%"XINT64" "
"firstip=%s",
m_nextKey2.n1,m_nextKey2.n0,
iptoa(g_spiderdb.getFirstIp(&m_nextKey2)));
// flag it
m_gettingList2 = true;
// make state
//int32_t state2 = (int32_t)m_cr->m_collnum;
// read the list from local disk
if ( ! m_msg5b.getList ( RDB_SPIDERDB ,
m_cr->m_collnum,
&m_list2 ,
&m_nextKey2 ,
&m_endKey2 ,
SR_READ_SIZE , // minRecSizes (512k)
true , // includeTree
false , // addToCache
0 , // max cache age
0 , // startFileNum
-1 , // numFiles (all)
this,//(void *)state2,//this//state
gotSpiderdbListWrapper2 ,
MAX_NICENESS , // niceness
true )) // do error correct?
// return if blocked
return;
}
// show list stats
if ( g_conf.m_logDebugSpider )
log("spider: populateWaitingTree: got list of size %"INT32"",
m_list2.m_listSize);
// unflag it
m_gettingList2 = false;
// stop if we are done
//if ( m_isReadDone2 ) return;
// if waitingtree is locked for writing because it is saving or
// writes were disabled then just bail and let the scan be re-called
// later
RdbTree *wt = &m_waitingTree;
if ( wt->m_isSaving || ! wt->m_isWritable ) return;
// shortcut
RdbList *list = &m_list2;
// ensure we point to the top of the list
list->resetListPtr();
// bail on error
if ( g_errno ) {
log("spider: Had error getting list of urls "
"from spiderdb2: %s.",mstrerror(g_errno));
//m_isReadDone2 = true;
return;
}
int32_t lastOne = 0;
// loop over all serialized spiderdb records in the list
for ( ; ! list->isExhausted() ; ) {
// breathe
QUICKPOLL ( MAX_NICENESS );
// get spiderdb rec in its serialized form
char *rec = list->getCurrentRec();
// skip to next guy
list->skipCurrentRecord();
// negative? wtf?
if ( (rec[0] & 0x01) == 0x00 ) {
//logf(LOG_DEBUG,"spider: got negative spider rec");
continue;
}
// if its a SpiderReply skip it
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec)) continue;
// cast it
SpiderRequest *sreq = (SpiderRequest *)rec;
// get first ip
int32_t firstIp = sreq->m_firstIp;
// corruption?
// if ( firstIp == 0 || firstIp == -1 )
// gotCorruption = true;
// if same as last, skip it
if ( firstIp == lastOne ) continue;
// set this lastOne for speed
lastOne = firstIp;
// check for dmoz. set up gdb on gk157/gk221 to break here
// so we can see what's going on
//if ( firstIp == -815809331 )
// log("got dmoz");
// if firstip already in waiting tree, skip it
if ( m_waitingTable.isInTable ( &firstIp ) ) continue;
// skip if only our twin should add it to waitingtree/doledb
if ( ! isAssignedToUs ( firstIp ) ) continue;
// skip if ip already represented in doledb i guess otherwise
// the populatedoledb scan will nuke it!!
if ( m_doleIpTable.isInTable ( &firstIp ) ) continue;
// not currently spidering either. when they got their
// lock they called confirmLockAcquisition() which will
// have added an entry to the waiting table. sometimes the
// lock still exists but the spider is done. because the
// lock persists for 5 seconds afterwards in case there was
// a lock request for that url in progress, so it will be
// denied.
// . this is starving other collections , should be
// added to waiting tree anyway! otherwise it won't get
// added!!!
// . so now i made this collection specific, not global
if ( g_spiderLoop.getNumSpidersOutPerIp (firstIp,m_collnum)>0)
continue;
// otherwise, we want to add it with 0 time so the doledb
// scan will evaluate it properly
// this will return false if we are saving the tree i guess
if ( ! addToWaitingTree ( 0 , firstIp , false ) ) {
log("spider: failed to add ip %s to waiting tree. "
"ip will not get spidered then and our "
"population of waiting tree will repeat until "
"this add happens."
, iptoa(firstIp) );
return;
}
// count it
m_numAdded++;
// ignore errors for this
g_errno = 0;
}
// are we the final list in the scan?
bool shortRead = ( list->getListSize() <= 0);//(int32_t)SR_READ_SIZE) ;
m_numBytesScanned += list->getListSize();
// reset? still left over from our first scan?
if ( m_lastPrintCount > m_numBytesScanned )
m_lastPrintCount = 0;
// announce every 10MB
if ( m_numBytesScanned - m_lastPrintCount > 10000000 ) {
log("spider: %"UINT64" spiderdb bytes scanned for waiting tree "
"re-population for cn=%"INT32"",m_numBytesScanned,
(int32_t)m_collnum);
m_lastPrintCount = m_numBytesScanned;
}
// debug info
log(LOG_DEBUG,"spider: Read2 %"INT32" spiderdb bytes.",list->getListSize());
// reset any errno cuz we're just a cache
g_errno = 0;
// if not done, keep going
if ( ! shortRead ) {
// . inc it here
// . it can also be reset on a collection rec update
key128_t lastKey = *(key128_t *)list->getLastKey();
if ( lastKey < m_nextKey2 ) {
log("spider: got corruption 9. spiderdb "
"keys out of order for "
"collnum=%"INT32, (int32_t)m_collnum);
g_corruptCount++;
// this should result in an empty list read for
// our next scan of spiderdb. unfortunately we could
// miss a lot of spider requests then
m_nextKey2 = m_endKey2;
}
else {
m_nextKey2 = lastKey;
m_nextKey2 += (uint32_t) 1;
}
// watch out for wrap around
if ( m_nextKey2 < lastKey ) shortRead = true;
// nah, advance the firstip, should be a lot faster when
// there are only a few firstips...
if ( lastOne && lastOne != -1 ) { // && ! gotCorruption ) {
key128_t cand = g_spiderdb.makeFirstKey(lastOne+1);
// corruption still seems to happen, so only
// do this part if it increases the key to avoid
// putting us into an infinite loop.
if ( cand > m_nextKey2 ) m_nextKey2 = cand;
}
}
if ( shortRead ) {
// mark when the scan completed so we can do another one
// like 24 hrs from that...
m_lastScanTime = getTimeLocal();
// log it
// if ( m_numAdded )
// log("spider: added %"INT32" recs to waiting tree from "
// "scan of %"INT64" bytes coll=%s",
// m_numAdded,m_numBytesScanned,
// m_cr->m_coll);
// note it
log("spider: rebuild complete for %s. Added %"INT32" "
"recs to waiting tree, scanned %"INT64" bytes of spiderdb.",
m_coll,m_numAdded,m_numBytesScanned);
// reset the count for next scan
m_numAdded = 0 ;
m_numBytesScanned = 0;
// reset for next scan
m_nextKey2.setMin();
// no longer need rebuild
m_waitingTreeNeedsRebuild = false;
// and re-send the crawlinfo in handleRequestc1 to each host
// so they know if we have urls ready to spider or not. because
// if we told them no before we completed this rebuild we might
// have found some urls.
// MDW: let's not do this unless we find it is a problem
//m_cr->localCrawlInfoUpdate();
}
// free list to save memory
list->freeList();
// wait for sleepwrapper to call us again with our updated m_nextKey2
return;
}
//static bool s_ufnTreeSet = false;
//static RdbTree s_ufnTree;
//static time_t s_lastUfnTreeFlushTime = 0;
//////////////////////////
//////////////////////////
//
// The first KEYSTONE function.
//
// CALL THIS ANYTIME to load up doledb from waiting tree entries
//
// This is a key function.
//
// It is called from two places:
//
// 1) sleep callback
//
// 2) addToWaitingTree()
// is called from addSpiderRequest() anytime a SpiderRequest
// is added to spiderdb (or from addSpiderReply())
//
// It can only be entered once so will just return if already scanning
// spiderdb.
//
//////////////////////////
//////////////////////////
// . for each IP in the waiting tree, scan all its SpiderRequests and determine
// which one should be the next to be spidered. and put that one in doledb.
// . we call this a lot, like if the admin changes the url filters table
// we have to re-scan all of spiderdb basically and re-do doledb
void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// only one loop can run at a time!
//if ( ! reentry && m_isPopulating ) return;
if ( m_isPopulatingDoledb ) return;
// skip if in repair mode
if ( g_repairMode ) return;
// if rebuilding the waiting tree, do that first
// MDW. re-allow us to populate doledb while waiting tree is being
// built so spiders can go right away. i had this in there to debug.
//if ( m_waitingTreeNeedsRebuild ) return;
// let's skip if spiders are off so we can inject/populate the index quickly
// since addSpiderRequest() calls addToWaitingTree() which then calls
// this.
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
// try skipping!!!!!!!!!!!
// yeah, this makes us scream. in addition to calling
// Doledb::m_rdb::addRecord() below
// WE NEED THIS TO REPOPULATE DOLEDB THOUGH!!!
//return;
//if ( g_conf.m_logDebugSpider )
// log("spider: in populatedoledbfromwaitingtree "
// "numUsedNodes=%"INT32"",
// m_waitingTree.m_numUsedNodes);
// set this flag so we are not re-entered
m_isPopulatingDoledb = true;
loop:
// if waiting tree is being saved, we can't write to it
// so in that case, bail and wait to be called another time
RdbTree *wt = &m_waitingTree;
if( wt->m_isSaving || ! wt->m_isWritable ) {
m_isPopulatingDoledb = false;
return;
}
// . get next IP that is due to be spidered from
// . also sets m_waitingTreeKey so we can delete it easily!
int32_t ip = getNextIpFromWaitingTree();
// . return if none. all done. unset populating flag.
// . it returns 0 if the next firstip has a spidertime in the future
if ( ip == 0 ) { m_isPopulatingDoledb = false; return; }
// set read range for scanning spiderdb
m_nextKey = g_spiderdb.makeFirstKey(ip);
m_endKey = g_spiderdb.makeLastKey (ip);
if ( g_conf.m_logDebugSpider )
log("spider: for cn=%i nextip=%s nextkey=%s",
(int)m_collnum,
iptoa(ip),
KEYSTR(&m_nextKey,sizeof(key128_t)));
//////
//
// do TWO PASSES, one to count pages, the other to get the best url!!
//
//////
// assume we don't have to do two passes
m_countingPagesIndexed = false;
// get the collectionrec
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
// but if we have quota based url filters we do have to count
if ( cr && cr->m_urlFiltersHavePageCounts ) {
// tell evalIpLoop() to count first
m_countingPagesIndexed = true;
// reset this stuff used for counting UNIQUE votes
m_lastReqUh48a = 0LL;
m_lastReqUh48b = 0LL;
m_lastRepUh48 = 0LL;
// and setup the LOCAL counting table if not initialized
if ( m_localTable.m_ks == 0 )
m_localTable.set (4,4,0,NULL,0,false,0,"ltpct" );
// otherwise, just reset it so we can repopulate it
else m_localTable.reset();
}
// debug output
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"spider: evalIpLoop: waitingtree nextip=%s "
"numUsedNodes=%"INT32"",iptoa(ip),m_waitingTree.m_numUsedNodes);
/*
// assume using tree
m_useTree = true;
// . flush the tree every 12 hours
// . i guess we could add incoming requests to the ufntree if
// they strictly beat the ufn tree tail node, HOWEVER, we
// still have the problem of that if a url we spidered is due
// to be respidered very soon we will miss it, as only the reply
// is added back into spiderdb, not a new request.
int32_t nowLocal = getTimeLocal();
// make it one hour so we don't cock-block a new high priority
// request that just got added... crap, what if its an addurl
// or something like that????
if ( nowLocal - s_lastUfnTreeFlushTime > 3600 ) {
s_ufnTree.clear();
s_lastUfnTreeFlushTime = nowLocal;
}
int64_t uh48;
//
// s_ufnTree tries to cache the top X spiderrequests for an IP
// that should be spidered next so we do not have to scan like
// a million spiderrequests in spiderdb to find the best one.
//
// if we have a specific uh48 targetted in s_ufnTree then that
// saves a ton of time!
// key format for s_ufnTree:
// iiiiiiii iiiiiiii iiiiiii iiiiiii i = firstip
// PPPPPPPP tttttttt ttttttt ttttttt P = priority
// tttttttt tttttttt hhhhhhh hhhhhhh t = spiderTimeMS (40 bits)
// hhhhhhhh hhhhhhhh hhhhhhh hhhhhhh h = urlhash48
key128_t key;
key.n1 = ip;
key.n1 <<= 32;
key.n0 = 0LL;
int32_t node = s_ufnTree.getNextNode(0,(char *)&key);
// cancel node if not from our ip
if ( node >= 0 ) {
key128_t *rk = (key128_t *)s_ufnTree.getKey ( node );
if ( (rk->n1 >> 32) != (uint32_t)ip ) node = -1;
}
if ( node >= 0 ) {
// get the key
key128_t *nk = (key128_t *)s_ufnTree.getKey ( node );
// parse out uh48
uh48 = nk->n0;
// mask out spidertimems
uh48 &= 0x0000ffffffffffffLL;
// use that to refine the key range immensley!
m_nextKey = g_spiderdb.makeFirstKey2 (ip, uh48);
m_endKey = g_spiderdb.makeLastKey2 (ip, uh48);
// do not add the recs to the tree!
m_useTree = false;
}
*/
// turn this off until we figure out why it sux
m_useTree = false;
// so we know if we are the first read or not...
m_firstKey = m_nextKey;
// . initialize this before scanning the spiderdb recs of an ip
// . it lets us know if we recvd new spider requests for m_scanningIp
// while we were doing the scan
m_gotNewDataForScanningIp = 0;
m_lastListSize = -1;
// let evalIpLoop() know it has not yet tried to read from spiderdb
m_didRead = false;
// reset this
int32_t maxWinners = (int32_t)MAX_WINNER_NODES;
//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
if ( m_winnerTree.m_numNodes == 0 &&
! m_winnerTree.set ( -1 , // fixeddatasize
maxWinners , // maxnumnodes
true , // balance?
maxWinners * MAX_BEST_REQUEST_SIZE, // memmax
true, // owndata?
"wintree", // allocname
false, // datainptrs?
NULL, // dbname
sizeof(key192_t), // keysize
false, // useprotection?
false, // allowdups?
-1 ) ) { // rdbid
m_isPopulatingDoledb = false;
log("spider: winntree set: %s",mstrerror(g_errno));
return;
}
if ( ! m_winnerTable.isInitialized() &&
! m_winnerTable.set ( 8 , // uh48 is key
sizeof(key192_t) , // winnertree key is data
64 , // 64 slots initially
NULL ,
0 ,
false , // allow dups?
MAX_NICENESS ,
"wtdedup" ) ) {
m_isPopulatingDoledb = false;
log("spider: wintable set: %s",mstrerror(g_errno));
return;
}
// clear it before evaluating this ip so it is empty
m_winnerTree.clear();
// and table as well now
m_winnerTable.clear();
// reset this as well
m_minFutureTimeMS = 0LL;
m_totalBytesScanned = 0LL;
m_totalNewSpiderRequests = 0LL;
m_lastOverflowFirstIp = 0;
// . look up in spiderdb otherwise and add best req to doledb from ip
// . if it blocks ultimately it calls gotSpiderdbListWrapper() which
// calls this function again with re-entry set to true
if ( ! evalIpLoop ( ) ) return ;
// oom error? i've seen this happen and we end up locking up!
if ( g_errno ) {
log("spider: evalIpLoop: %s",mstrerror(g_errno));
m_isPopulatingDoledb = false;
return;
}
// try more
goto loop;
}
/*
// replace this func with the one above...
static void doledWrapper ( void *state ) {
SpiderColl *THIS = (SpiderColl *)state;
// msg4 is available again
THIS->m_msg1Avail = true;
if ( g_errno )
log("spider: msg1 addlist had error: %s. need to "
"somehow reduce doleiptable score now...",
mstrerror(g_errno));
// no longer populating doledb. we also set to false in
// gotSpiderListWrapper
//THIS->m_isPopulating = false;
int64_t now = gettimeofdayInMilliseconds();
int64_t diff = now - THIS->m_msg4Start;
// we add recs to doledb using msg1 to keep things fast because
// msg4 has a delay of 500ms in it. but even then, msg1 can take
// 6ms or more just because of load issues.
if ( diff > 10 )
log("spider: adding to doledb took %"INT64"ms",diff);
// we are done!! that was the final step...
THIS->m_isPopulating = false;
// did collection get nuked while we were waiting for msg1 reply?
if ( tryToDeleteSpiderColl ( THIS ,"3") ) return;
// . we added a rec to doledb for the firstIp in m_waitingTreeKey, so
// now go to the next node in the wait tree.
// . it will get the next key after m_waitingTreeKey
// . re-entry is true because we just got the msg4 reply
THIS->populateDoledbFromWaitingTree ( );
}
*/
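// key layout produced by makeUfnTreeKey() below:
//   n1 = firstIp (32 bits) | 255-priority (8 bits) | spiderTimeMS bits 39..16 (24 bits)
//   n0 = spiderTimeMS bits 15..0 (16 bits) | uh48 (48 bits)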
key128_t makeUfnTreeKey ( int32_t firstIp ,
int32_t priority ,
int64_t spiderTimeMS ,
int64_t uh48 ) {
// sanity check, do not allow negative priorities for now
if ( priority < 0 ) { char *xx=NULL;*xx=0; }
if ( priority > 255 ) { char *xx=NULL;*xx=0; }
key128_t key;
key.n1 = (uint32_t)firstIp;
// all of priority (COMPLEMENTED!)
key.n1 <<= 8;
key.n1 |= (unsigned char)(255-priority);
// top 3 bytes of spiderTimeMS (5 bytes total)
key.n1 <<= 24;
key.n1 |= ((spiderTimeMS >> 16) & 0x00ffffff);
// remaining 2 bytes of spiderTimeMS goes in key.n0
key.n0 = (spiderTimeMS & 0xffff);
// 6 bytes uh48
key.n0 <<= 48;
key.n0 |= uh48;
return key;
}
////////
//
// winner tree key. holds the top/best spider requests for a firstIp
// for spidering purposes.
//
////////
// key bitmap (192 bits):
//
// ffffffff ffffffff ffffffff ffffffff f=firstIp
// pppppppp pppppppp HHHHHHHH HHHHHHHH p=255-priority H=hopcount
// tttttttt tttttttt tttttttt tttttttt t=spiderTimeMS
// tttttttt tttttttt tttttttt tttttttt h=urlHash48
// hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh
// hhhhhhhh hhhhhhhh 00000000 00000000
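// priority is complemented (255-p) so higher-priority requests produce
// smaller keys and come first (and, within a priority, lower hopcount
// and earlier spiderTimeMS come first) when the tree is walked in
// ascending key order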
key192_t makeWinnerTreeKey ( int32_t firstIp ,
int32_t priority ,
int32_t hopCount,
int64_t spiderTimeMS ,
int64_t uh48 ) {
key192_t k;
k.n2 = firstIp;
k.n2 <<= 16;
k.n2 |= (255-priority);
k.n2 <<= 16;
// query reindex is still using hopcount -1...
if ( hopCount == -1 ) hopCount = 0;
if ( hopCount < 0 ) { char *xx=NULL;*xx=0; }
if ( hopCount > 0xffff ) hopCount = 0xffff;
k.n2 |= hopCount;
k.n1 = spiderTimeMS;
k.n0 = uh48;
k.n0 <<= 16;
return k;
}
void parseWinnerTreeKey ( key192_t *k ,
int32_t *firstIp ,
int32_t *priority ,
int32_t *hopCount,
int64_t *spiderTimeMS ,
int64_t *uh48 ) {
*firstIp = (k->n2) >> 32;
*priority = 255 - ((k->n2 >> 16) & 0xffff);
*hopCount = (k->n2 & 0xffff);
*spiderTimeMS = k->n1;
*uh48 = (k->n0 >> 16);
}
void testWinnerTreeKey ( ) {
int32_t firstIp = 1234567;
int32_t priority = 123;
int64_t spiderTimeMS = 456789123LL;
int64_t uh48 = 987654321888LL;
int32_t hc = 4321;
key192_t k = makeWinnerTreeKey (firstIp,priority,hc,spiderTimeMS,uh48);
int32_t firstIp2;
int32_t priority2;
int64_t spiderTimeMS2;
int64_t uh482;
int32_t hc2;
parseWinnerTreeKey(&k,&firstIp2,&priority2,&hc2,&spiderTimeMS2,&uh482);
if ( firstIp != firstIp2 ) { char *xx=NULL;*xx=0; }
if ( priority != priority2 ) { char *xx=NULL;*xx=0; }
if ( spiderTimeMS != spiderTimeMS2 ) { char *xx=NULL;*xx=0; }
if ( uh48 != uh482 ) { char *xx=NULL;*xx=0; }
if ( hc != hc2 ) { char *xx=NULL;*xx=0; }
}
void removeExpiredLocks ( int32_t hostId );
static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
SpiderColl *THIS = (SpiderColl *)state;
// prevent a core
THIS->m_gettingList1 = false;
// are we trying to exit? some firstip lists can be quite long, so
// terminate here so all threads can return and we can exit properly
if ( g_process.m_mode == EXIT_MODE ) return;
// return if that blocked
if ( ! THIS->evalIpLoop() ) return;
// we are done; re-enter populatedoledb
THIS->m_isPopulatingDoledb = false;
// gotta set m_isPopulatingDoledb to false lest it won't work
THIS->populateDoledbFromWaitingTree ( );
}
///////////////////
//
// KEYSTONE FUNCTION
//
// . READ ALL spiderdb recs for IP of m_scanningIp
// . add winner to doledb
// . called ONLY by populateDoledbFromWaitingTree()
//
// . continually scan spiderdb requests for a particular ip, m_scanningIp
// . compute the best spider request to spider next
// . add it to doledb
// . getNextIpFromWaitingTree() must have been called to set m_scanningIp
// otherwise m_bestRequestValid might not have been reset to false
//
///////////////////
bool SpiderColl::evalIpLoop ( ) {
//testWinnerTreeKey ( );
// sanity
if ( m_scanningIp == 0 || m_scanningIp == -1 ) { char *xx=NULL;*xx=0;}
// if this ip is in the winnerlistcache use that. it saves
// us a lot of time.
key_t cacheKey;
cacheKey.n0 = m_scanningIp;
cacheKey.n1 = 0;
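// the winner-list cache is keyed per collnum by the firstIp alone;
// n1 stays zero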
char *doleBuf = NULL;
int32_t doleBufSize;
RdbCache *wc = &g_spiderLoop.m_winnerListCache;
time_t cachedTimestamp = 0;
bool inCache = false;
bool useCache = true;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
// if doing site or page quotas for the sitepages or domainpages
// url filter expressions, we can't muck with the cache because
// we end up skipping the counting part.
if ( ! cr )
useCache = false;
if ( cr && cr->m_urlFiltersHavePageCounts )
useCache = false;
if ( m_countingPagesIndexed )
useCache = false;
// assume not from cache
if ( useCache ) {
//wc->verify();
inCache = wc->getRecord ( m_collnum ,
(char *)&cacheKey ,
&doleBuf,
&doleBufSize ,
false, // doCopy?
// we raised MAX_WINNER_NODES so
// grow from 600 to 1200
// (10 mins to 20 mins) to make
// some crawls faster
1200, // maxAge in seconds (raised from 600)
true ,// incCounts
&cachedTimestamp , // rec timestamp
true ); // promote rec?
//wc->verify();
}
// if ( m_collnum == 18752 ) {
// int32_t coff = 0;
// if ( inCache && doleBufSize >= 4 ) coff = *(int32_t *)doleBuf;
// log("spider: usecache=%i incache=%i dbufsize=%i currentoff=%i "
// "ctime=%i ip=%s"
// ,(int)useCache
// ,(int)inCache
// ,(int)doleBufSize
// ,(int)coff
// ,(int)cachedTimestamp
// ,iptoa(m_scanningIp));
// }
// doleBuf could be NULL i guess...
if ( inCache ) { // && doleBufSize > 0 ) {
int32_t crc = hash32 ( doleBuf + 4 , doleBufSize - 4 );
if ( g_conf.m_logDebugSpider ) // || m_collnum == 18752 )
log("spider: GOT %"INT32" bytes of SpiderRequests "
"from winnerlistcache for ip %s ptr=0x%"PTRFMT
" crc=%"UINT32
,doleBufSize,
iptoa(m_scanningIp),
(PTRTYPE)doleBuf,
crc);
// set own to false so it doesn't get freed
// m_doleBuf.setBuf ( doleBuf ,
// doleBufSize ,
// doleBufSize ,
// false , // ownData?
// 0 ); // encoding. doesn't matter.
//m_doleBuf.reset();
// gotta copy it because we end up re-adding part of it
// to rdbcache below
//m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
// we no longer re-add to avoid churn. but do not free it
// so do not 'own' it.
SafeBuf sb;
sb.setBuf ( doleBuf, doleBufSize, doleBufSize, false );
// now add the first rec m_doleBuf into doledb's tree
// and re-add the rest back to the cache with the same key.
return addDoleBufIntoDoledb(&sb,true);//,cachedTimestamp)
}
top:
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( this ,"4") ) return false;
// if first time here, let's do a read first
if ( ! m_didRead ) {
// reset list size to 0
m_list.reset();
// assume we did a read now
m_didRead = true;
// reset some stuff
m_lastScanningIp = 0;
// reset these that need to keep track of requests for
// the same url that might span two spiderdb lists or more
m_lastSreqUh48 = 0LL;
// do a read. if it blocks it will recall this loop
if ( ! readListFromSpiderdb () ) return false;
}
loop:
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( this ,"5") )
// pretend to block since we got deleted!!!
return false;
// . did reading the list from spiderdb have an error?
// . i guess we don't add to doledb then
if ( g_errno ) {
log("spider: Had error getting list of urls "
"from spiderdb: %s.",mstrerror(g_errno));
// save mem
m_list.freeList();
//m_isReadDone = true;
return true;
}
// if we started reading, then assume we got a fresh list here
if ( g_conf.m_logDebugSpider )
log("spider: back from msg5 spiderdb read2 of %"INT32" bytes "
"(cn=%"INT32")",m_list.m_listSize,(int32_t)m_collnum);
// . set the winning request for all lists we read so far
// . if m_countingPagesIndexed is true this will just fill in
// quota info into m_localTable...
scanListForWinners();
// if list not empty, keep reading!
if ( ! m_list.isEmpty() ) {
// update m_nextKey for successive reads of spiderdb by
// calling readListFromSpiderdb()
key128_t lastKey = *(key128_t *)m_list.getLastKey();
// sanity
//if ( endKey != finalKey ) { char *xx=NULL;*xx=0; }
// crazy corruption?
if ( lastKey < m_nextKey ) {
log("spider: got corruption. spiderdb "
"keys out of order for "
"collnum=%"INT32" for evaluation of "
"firstip=%s so terminating evaluation of that "
"firstip." ,
(int32_t)m_collnum,
iptoa(m_scanningIp));
g_corruptCount++;
// this should result in an empty list read for
// m_scanningIp in spiderdb
m_nextKey = m_endKey;
}
else {
m_nextKey = lastKey;
m_nextKey += (uint32_t) 1;
}
// . watch out for wrap around
// . normally i would go by this to indicate that we are
// done reading, but there's some bugs... so we go
// by whether our list is empty or not for now
if ( m_nextKey < lastKey ) m_nextKey = lastKey;
// reset list to save mem
m_list.reset();
// read more! return if it blocked
if ( ! readListFromSpiderdb() ) return false;
// we got a list without blocking
goto loop;
}
// . we are all done if last list read was empty
// . if we were just counting pages for quota, do a 2nd pass!
if ( m_countingPagesIndexed ) {
// do not do again.
m_countingPagesIndexed = false;
// start at the top again
m_nextKey = g_spiderdb.makeFirstKey(m_scanningIp);
// this time m_localTable should have the quota info in it so
// getUrlFilterNum() can use that
m_didRead = false;
// do the 2nd pass. read list from the very top.
goto top;
}
// free list to save memory
m_list.freeList();
// . add all winners if we can in m_winnerTree into doledb
// . if list was empty, then reading is all done so take the winner we
// got from all the lists we did read for this IP and add him
// to doledb
// . if no winner exists, then remove m_scanningIp from m_waitingTree
// so we do not waste our time again. if url filters change then
// waiting tree will be rebuilt and we'll try again... or if
// a new spider request or reply for this ip comes in we'll try
// again as well...
// . this returns false if blocked adding to doledb using msg1
if ( ! addWinnersIntoDoledb() ) return false;
// . do more from tree
// . re-entry is true because we just got the msg5 reply
// . don't do this because populateDoledb calls us in a loop
// and we call it from all our callbacks if we blocked...
//populateDoledbFromWaitingTree ( true );
// we are done...
return true;
}
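// Illustrative sketch (not called anywhere): the paging idiom evalIpLoop()
// above uses when it advances m_nextKey for the next spiderdb read.
// "advanceKeyCursor" is a hypothetical helper name; the real logic lives
// inline in evalIpLoop(). It only relies on the key128_t operators the
// loop above already uses (=, += and <).
static void advanceKeyCursor ( key128_t *cursor ,
                               key128_t  lastKeyRead ,
                               key128_t  endKey ) {
	// corruption check: spiderdb keys must come back in ascending order
	if ( lastKeyRead < *cursor ) {
		// force the next read to be empty so we stop scanning
		*cursor = endKey;
		return;
	}
	// normally resume one past the last key we got
	key128_t next = lastKeyRead;
	next += (uint32_t) 1;
	// watch out for wrap around of the 128-bit key
	if ( next < lastKeyRead ) next = lastKeyRead;
	*cursor = next;
}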
// . this is ONLY CALLED from evalIpLoop() above
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool SpiderColl::readListFromSpiderdb ( ) {
if ( ! m_waitingTreeKeyValid ) { char *xx=NULL;*xx=0; }
if ( ! m_scanningIp ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) {
log("spider: lost collnum %"INT32"",(int32_t)m_collnum);
g_errno = ENOCOLLREC;
return true;
}
// i guess we are always restricted to an ip, because
// populateWaitingTreeFromSpiderdb calls its own msg5.
int32_t firstIp0 = g_spiderdb.getFirstIp(&m_nextKey);
// sanity
if ( m_scanningIp != firstIp0 ) { char *xx=NULL;*xx=0; }
// sometimes we already have this ip in doledb/doleiptable
// and somehow we try to scan spiderdb for it anyway
if ( m_doleIpTable.isInTable ( &firstIp0 ) ) { char *xx=NULL;*xx=0;}
// if it got zapped from the waiting tree by the time we read the list
if ( ! m_waitingTable.isInTable ( &m_scanningIp ) ) return true;
// sanity check
int32_t wn = m_waitingTree.getNode(0,(char *)&m_waitingTreeKey);
// it gets removed because addSpiderReply() calls addToWaitingTree()
// and replaces the node we are scanning with one that has a better
// (earlier) time, even though that time may already have arrived and
// we are scanning it now. perhaps addToWaitingTree() should ignore
// the ip if it equals m_scanningIp?
if ( wn < 0 ) {
log("spider: waiting tree key removed while reading list "
"for %s (%"INT32")",
cr->m_coll,(int32_t)m_collnum);
return true;
}
// sanity. if first time, this must be invalid
//if ( needList && m_nextKey == m_firstKey && m_bestRequestValid ) {
// char *xx=NULL; *xx=0 ; }
// . if the scanning ip has too many outstanding spiders
// . looks at UrlLock::m_firstIp and UrlLock::m_isSpiderOutstanding
// since the lock lives for 5 seconds after the spider reply
// comes back.
// . when the spiderReply comes back that will re-add a "0" entry
// to the waiting tree.
// . PROBLEM: some spiders don't seem to add a spiderReply!! wtf???
// they end up having their locks timeout after like 3 hrs?
// . maybe just do not add to waiting tree in confirmLockAcquisition()
// handler in such cases? YEAH.. try that
//int32_t numOutPerIp = getOustandingSpidersPerIp ( firstIp );
//if ( numOutPerIp > maxSpidersPerIp ) {
// // remove from the tree and table
// removeFromWaitingTree ( firstIp );
// return true;
//}
// readLoop:
// if we re-entered from the read wrapper, jump down
//if ( needList ) {
// sanity check
if ( m_gettingList1 ) { char *xx=NULL;*xx=0; }
// . read in a replacement SpiderRequest to add to doledb from
// this ip
// . get the list of spiderdb records
// . do not include cache, those results are old and will mess
// us up
if (g_conf.m_logDebugSpider ) {
// gotta print each one out individually because KEYSTR
// uses a static buffer to store the string
SafeBuf tmp;
tmp.safePrintf("spider: readListFromSpiderdb: "
"calling msg5: ");
tmp.safePrintf("firstKey=%s "
,KEYSTR(&m_firstKey,sizeof(key128_t)));
tmp.safePrintf("endKey=%s "
,KEYSTR(&m_endKey,sizeof(key128_t)));
tmp.safePrintf("nextKey=%s "
,KEYSTR(&m_nextKey,sizeof(key128_t)));
tmp.safePrintf("firstip=%s "
,iptoa(m_scanningIp));
tmp.safePrintf("(cn=%"INT32")",(int32_t)m_collnum);
log(LOG_DEBUG,"%s",tmp.getBufStart());
}
// log this better
if ( g_conf.m_logDebugSpider )
log("spider: readListFromSpiderdb: firstip=%s key=%s"
,iptoa(m_scanningIp)
,KEYSTR(&m_nextKey,sizeof(key128_t) ) );
// flag it
m_gettingList1 = true;
// make state
//int32_t state2 = (int32_t)m_cr->m_collnum;
// . read the list from local disk
// . if a niceness 0 intersect thread is taking a LONG time
// then this will not complete for a long time and we
// end up timing out the round. so try checking for
// m_gettingList in spiderDoledUrls() and setting
// m_lastSpiderCouldLaunch
if ( ! m_msg5.getList ( RDB_SPIDERDB ,
m_cr->m_collnum ,
&m_list ,
&m_nextKey ,
&m_endKey ,
SR_READ_SIZE , // minRecSizes (512k)
true , // includeTree
false , // addToCache
0 , // max cache age
0 , // startFileNum
-1 , // numFiles (all)
this,//(void *)state2,//this,//state
gotSpiderdbListWrapper ,
MAX_NICENESS , // niceness
true )) // do error correct?
// return false if blocked
return false ;
// note its return
if ( g_conf.m_logDebugSpider )
log("spider: back from msg5 spiderdb read of %"INT32" bytes",
m_list.m_listSize);
// no longer getting list
m_gettingList1 = false;
// got it without blocking. maybe all in tree or in cache
return true;
// unflag it
//m_gettingList = false;
// stop if we are done
//if ( m_isReadDone ) return true;
}
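// Illustrative sketch (never called): the record-walking idiom that
// scanListForWinners() below applies to the RdbList returned by the msg5
// read above. "countPositiveRequests" is a hypothetical helper name.
// Each rec starts with its key128_t; the low bit of the first key byte is
// clear for negative (delete) records, and g_spiderdb tells us whether a
// positive rec is a SpiderRequest or a SpiderReply.
static int32_t countPositiveRequests ( RdbList *list ) {
	int32_t count = 0;
	// ensure we point to the top of the list
	list->resetListPtr();
	for ( ; ! list->isExhausted() ; ) {
		// get spiderdb rec in its serialized form
		char *rec = list->getCurrentRec();
		// skip to next guy
		list->skipCurrentRecord();
		// negative (delete) key? skip it
		if ( (rec[0] & 0x01) == 0x00 ) continue;
		// skip replies, we only count requests here
		if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec ) )
			continue;
		count++;
	}
	return count;
}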
static int32_t s_lastIn = 0;
static int32_t s_lastOut = 0;
bool SpiderColl::isFirstIpInOverflowList ( int32_t firstIp ) {
if ( ! m_overflowList ) return false;
if ( firstIp == 0 || firstIp == -1 ) return false;
if ( firstIp == s_lastIn ) return true;
if ( firstIp == s_lastOut ) return false;
for ( int32_t oi = 0 ; ; oi++ ) {
// stop at end
if ( ! m_overflowList[oi] ) break;
// an ip of zero is end of the list
if ( m_overflowList[oi] == firstIp ) {
s_lastIn = firstIp;
return true;
}
}
s_lastOut = firstIp;
return false;
}
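// Illustrative sketch (never called): how a firstip would get inserted into
// the overflow list that isFirstIpInOverflowList() above scans. The list is
// a zero-terminated int32_t array of up to OVERFLOWLISTSIZE entries where a
// -1 marks a reusable empty slot. scanListForWinners() below does this
// inline; "addIpToOverflowList" is a hypothetical name.
static bool addIpToOverflowList ( int32_t *list , int32_t firstIp ) {
	if ( ! list ) return false;
	int32_t emptySlot = -1;
	int32_t oi;
	for ( oi = 0 ; ; oi++ ) {
		// an ip of zero is end of the list
		if ( ! list[oi] ) break;
		// already in there, we are done
		if ( list[oi] == firstIp ) return true;
		// -1 means an empty slot we can reuse
		if ( list[oi] == -1 ) emptySlot = oi;
	}
	// take the empty slot if there is one
	if ( emptySlot >= 0 ) { list[emptySlot] = firstIp; return true; }
	// otherwise append, keeping the zero terminator intact
	if ( oi + 1 >= OVERFLOWLISTSIZE ) return false;
	list[oi  ] = firstIp;
	list[oi+1] = 0;
	return true;
}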
// . ADDS top X winners to m_winnerTree
// . this is ONLY CALLED from evalIpLoop() above
// . scan m_list that we read from spiderdb for m_scanningIp IP
// . set m_bestRequest if a request in m_list is better than what is
// in m_bestRequest from previous lists for this IP
bool SpiderColl::scanListForWinners ( ) {
// if list is empty why are we here?
if ( m_list.isEmpty() ) return true;
// if waitingtree is locked for writing because it is saving or
// writes were disabled then just bail and let the scan be re-called
// later
//
// MDW: move this up in evalIpLoop() i think
RdbTree *wt = &m_waitingTree;
if ( wt->m_isSaving || ! wt->m_isWritable )
return true;
// shortcut
RdbList *list = &m_list;
// ensure we point to the top of the list
list->resetListPtr();
// get this
int64_t nowGlobalMS = gettimeofdayInMillisecondsGlobal();//Local();
uint32_t nowGlobal = nowGlobalMS / 1000;
//SpiderRequest *winReq = NULL;
//int32_t winPriority = -10;
//uint64_t winTimeMS = 0xffffffffffffffffLL;
//int32_t winMaxSpidersPerIp = 9999;
SpiderReply *srep = NULL;
int64_t srepUh48;
// for getting the top MAX_NODES nodes
//int32_t tailPriority = -10;
//uint64_t tailTimeMS = 0xffffffffffffffffLL;
// if we are continuing from another list...
if ( m_lastReplyValid ) {
srep = (SpiderReply *)m_lastReplyBuf;
srepUh48 = srep->getUrlHash48();
}
// sanity, if it was in ufntree it should be on disk then...
/*
if ( list->isEmpty() && m_nextKey == m_firstKey && ! m_useTree ) {
SafeBuf sb;
sb.safePrintf("startkey=%s,",
KEYSTR(&m_nextKey,sizeof(key128_t) ));
sb.safePrintf("endkey=%s",
KEYSTR(&m_endKey,sizeof(key128_t) ));
// get waiting key info
int32_t firstIp = m_waitingTreeKey.n0 & 0xffffffff;
log("spider: strange corruption #1. there was an entry "
"in the waiting tree, but spiderdb read was empty. "
"%s. deleting waitingtree key firstip=%s",
sb.getBufStart(),
iptoa(firstIp));
// delete the exact node #
m_waitingTree.deleteNode3 ( wn , false );
}
*/
//char *xx=NULL;*xx=0; }
/*
// use the ufntree?
bool useTree = m_useTree;
// if we are the first read and list is not full do not bother
// using the tree because its just as fast to scan the little list
// we got
if ( m_nextKey == m_firstKey && list->getListSize() < SR_READ_SIZE )
useTree = false;
// init ufn tree
if ( useTree && ! s_ufnTreeSet ) {
s_ufnTreeSet = true;
s_ufnTree.set ( 0 , // fixed data size (uh48)
1000000 , // max num nodes
true, // balance?
-1 , // maxmem, none
false , // own data?
"ufntree",
false, // data is ptr! (true?)
"ufntreedb",
sizeof(key128_t),
false,
false );
}
*/
// show list stats
if ( g_conf.m_logDebugSpider )
log("spider: readListFromSpiderdb: got list of size %"INT32" "
"for firstip=%s",
m_list.m_listSize,iptoa(m_scanningIp));
// if we don't read minRecSizes worth of data that MUST indicate
// there is no more data to read. put this theory to the test
// before we use it to indicate an end of list condition.
if ( list->getListSize() > 0 &&
m_lastScanningIp == m_scanningIp &&
m_lastListSize < (int32_t)SR_READ_SIZE &&
m_lastListSize >= 0 ) {
//char *xx=NULL;*xx=0; }
log("spider: shucks. spiderdb reads not full.");
}
m_lastListSize = list->getListSize();
m_lastScanningIp = m_scanningIp;
m_totalBytesScanned += list->getListSize();
if ( list->isEmpty() && g_conf.m_logDebugSpider )
log("spider: failed to get rec for ip=%s",iptoa(m_scanningIp));
int32_t firstIp = m_waitingTreeKey.n0 & 0xffffffff;
//int32_t numNodes = 0;
//int32_t tailNode = -1;
key128_t finalKey;
int32_t recCount = 0;
// how many spiders currently out for this ip?
//int32_toutNow=g_spiderLoop.getNumSpidersOutPerIp(m_scanningIp,m_collnum)
// loop over all serialized spiderdb records in the list
for ( ; ! list->isExhausted() ; ) {
// breathe
QUICKPOLL ( MAX_NICENESS );
// stop coring on empty lists
if ( list->isEmpty() ) break;
// get spiderdb rec in its serialized form
char *rec = list->getCurrentRec();
// count it
recCount++;
// sanity
gbmemcpy ( (char *)&finalKey , rec , sizeof(key128_t) );
// skip to next guy
list->skipCurrentRecord();
// negative? wtf?
if ( (rec[0] & 0x01) == 0x00 ) {
logf(LOG_DEBUG,"spider: got negative spider rec");
continue;
}
// if it's a SpiderReply, save it for upcoming requests
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ) {
// see if this is the most recent one
SpiderReply *tmp = (SpiderReply *)rec;
// . MDW: we have to detect corrupt replies up here so
// they do not become the winning reply because
// their date is in the future!!
// . this is -1 on corruption
// . i've seen -31757, 21... etc for bad http replies
// in the qatest123 doc cache... so turn off for that
if ( tmp->m_httpStatus >= 1000 ) {
if ( m_cr->m_spiderCorruptCount == 0 ) {
log("spider: got corrupt 3 "
"spiderReply in "
"scan "
"uh48=%"INT64" "
"httpstatus=%"INT32" "
"(cn=%"INT32")",
tmp->getUrlHash48(),
(int32_t)tmp->m_httpStatus,
(int32_t)m_collnum);
}
m_cr->m_spiderCorruptCount++;
// don't nuke it just for that...
//srep = NULL;
continue;
}
// bad langid?
if ( ! getLanguageAbbr (tmp->m_langId) ) {
log("spider: got corrupt 4 spiderReply in "
"scan uh48=%"INT64" "
"langid=%"INT32" (cn=%"INT32")",
tmp->getUrlHash48(),
(int32_t)tmp->m_langId,
(int32_t)m_collnum);
m_cr->m_spiderCorruptCount++;
//srep = NULL;
// if ( tmp->getUrlHash48() ==
// 271713196158770LL )
// log("hey");
continue;
}
// reset reply stats if beginning a new url
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// if ( srepUh48 != tmp->getUrlHash48() ) {
// m_numSuccessReplies = 0;
// m_numFailedReplies = 0;
// }
// inc stats
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
// else m_numFailedReplies ++;
// if we have a more recent reply already, skip this
if ( srep &&
srep->getUrlHash48() == tmp->getUrlHash48() &&
srep->m_spideredTime >= tmp->m_spideredTime )
continue;
// otherwise, assign it
srep = tmp;
srepUh48 = srep->getUrlHash48();
continue;
}
// cast it
SpiderRequest *sreq = (SpiderRequest *)rec;
// skip if our twin or another shard should handle it
if ( ! isAssignedToUs ( sreq->m_firstIp ) )
continue;
int64_t uh48 = sreq->getUrlHash48();
// reset reply stats if beginning a new url
// these don't work because we only store one reply
// which overwrites any older reply. that's how the key is.
// we can change the key to use the timestamp and not
// parent docid in makeKey() for spider replies later.
// if ( ! srep ) {
// m_numSuccessReplies = 0;
// m_numFailedReplies = 0;
// }
// . skip if our twin should add it to doledb
// . waiting tree only has firstIps assigned to us so
// this should not be necessary
//if ( ! isAssignedToUs ( sreq->m_firstIp ) ) continue;
// null out srep if no match
if ( srep && srepUh48 != uh48 ) srep = NULL;
// if we are doing parser test, ignore all but initially
// injected requests. NEVER DOLE OUT non-injected urls
// when doing parser test
if ( g_conf.m_testParserEnabled ) {
// skip if already did it
if ( srep ) continue;
// skip if not injected
if ( ! sreq->m_isInjecting ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping8 %s",
sreq->m_url);
continue;
}
}
// . ignore docid-based requests if spidered the url afterwards
// . these are one-hit wonders
// . once done they can be deleted
if ( sreq->m_isPageReindex && // urlIsDocId &&
srep &&
srep->m_spideredTime > sreq->m_addedTime ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping9 %s", sreq->m_url);
continue;
}
// if it's a reply-less new url spiderrequest, count it
if ( ! srep && m_lastSreqUh48 != uh48 &&
// avoid counting query reindex requests
! sreq->m_fakeFirstIp )
m_totalNewSpiderRequests++;
//int32_t ipdom ( int32_t ip ) { return ip & 0x00ffffff; };
int32_t cblock = ipdom ( sreq->m_firstIp );
bool countIt = true;
// reset page inlink count on url request change
if ( m_lastSreqUh48 != uh48 ) {
m_pageNumInlinks = 0;
m_lastCBlockIp = 0;
}
//if ( uh48 != m_lastSreqUh48 )
// countIt = false;
if ( cblock == m_lastCBlockIp )
countIt = false;
// do not count manually added spider requests
if ( (sreq->m_isAddUrl || sreq->m_isInjecting) )
countIt = false;
// 20 is good enough
if ( m_pageNumInlinks >= 20 )
countIt = false;
if ( countIt ) {
int32_t ca;
for ( ca = 0 ; ca < m_pageNumInlinks ; ca++ )
if ( m_cblocks[ca] == cblock ) break;
// if found in our list, do not count it, already did
if ( ca < m_pageNumInlinks )
countIt = false;
}
if ( countIt ) {
m_cblocks[m_pageNumInlinks] = cblock;
m_pageNumInlinks++;
if ( m_pageNumInlinks > 20 ) { char *xx=NULL;*xx=0;}
}
// set this now. it does increase with each request. so
// initial requests will not see the full # of inlinks.
sreq->m_pageNumInlinks = (uint8_t)m_pageNumInlinks;
// put these in the spiderequest in doledb so we can
// show in the json spider status docs in
// XmlDoc::getSpiderStatusDocMetaList2()
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// sreq->m_reservedc1 = m_numSuccessReplies;
// sreq->m_reservedc2 = m_numFailedReplies;
m_lastSreqUh48 = uh48;
m_lastCBlockIp = cblock;
// only add firstip if manually added and not fake
//
// just calculating page counts? if the url filters are based
// on the # of pages indexed per ip or subdomain/site then
// we have to maintain a page count table. sitepages.
//
if ( m_countingPagesIndexed ) { //&& sreq->m_fakeFirstIp ) {
// get request url hash48 (jez= 220459274533043 )
//int64_t uh48 = sreq->getUrlHash48();
// do not repeatedly page count if we just have
// a single fake firstip request. this just adds
// an entry to the table that will end up in
// m_pageCountTable so we avoid doing this count
// again over and over. also gives url filters
// table a zero-entry...
//m_localTable.addScore(&sreq->m_firstIp,0);
//m_localTable.addScore(&sreq->m_siteHash32,0);
//m_localTable.addScore(&sreq->m_domHash32,0);
// only add dom/site hash seeds if it is
// a fake firstIp to avoid double counting seeds
if ( sreq->m_fakeFirstIp ) continue;
// count the manual additions separately. mangle their
// hash with 0x123456 so they are separate.
if ( (sreq->m_isAddUrl || sreq->m_isInjecting) &&
// unique votes per seed
uh48 != m_lastReqUh48a ) {
// do not repeat count the same url
m_lastReqUh48a = uh48;
// sanity
if ( ! sreq->m_siteHash32){char*xx=NULL;*xx=0;}
if ( ! sreq->m_domHash32){char*xx=NULL;*xx=0;}
// do a little magic because we count
// seeds as "manual adds" as well as normal pg
int32_t h32;
h32 = sreq->m_siteHash32 ^ 0x123456;
m_localTable.addScore(&h32);
h32 = sreq->m_domHash32 ^ 0x123456;
m_localTable.addScore(&h32);
}
// unique votes per url for the quota counting below
if ( uh48 == m_lastReqUh48b ) continue;
// update this to ensure unique voting
m_lastReqUh48b = uh48;
// now count pages indexed below here
if ( ! srep ) continue;
if ( srepUh48 == m_lastRepUh48 ) continue;
m_lastRepUh48 = srepUh48;
//if ( ! srep ) continue;
// TODO: what if srep->m_isIndexedINValid is set????
if ( ! srep->m_isIndexed ) continue;
// keep count per site and firstip
m_localTable.addScore(&sreq->m_firstIp,1);
m_localTable.addScore(&sreq->m_siteHash32,1);
m_localTable.addScore(&sreq->m_domHash32,1);
// debug
if ( ! g_conf.m_logDebugSpider ) continue;
int32_t *valPtr ;
int32_t fip = sreq->m_firstIp;
int32_t sh32 = sreq->m_siteHash32;
valPtr = (int32_t *)m_localTable.getValue(&sh32);
log("spider: sitequota: "
"got %"INT32" indexed docs for site from "
"firstip of %s from url %s",
*valPtr,iptoa(fip),sreq->m_url);
continue;
}
// if the spiderrequest has a fake firstip that means it
// was injected without doing a proper ip lookup for speed.
// xmldoc.cpp will check for m_fakeFirstIp and if that is
// set in the spiderrequest it will simply add a new request
// with the correct firstip. it will be a completely different
// spiderrequest key then. so no need to keep the "fakes".
// it will log the EFAKEFIRSTIP error msg.
if ( sreq->m_fakeFirstIp &&
srep &&
srep->m_spideredTime > sreq->m_addedTime ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping6 %s", sreq->m_url);
continue;
}
// once we have a spiderreply, even i guess if its an error,
// for a url, then bail if respidering is disabled
if ( m_cr->m_isCustomCrawl &&
srep &&
// no! for bulk jobs and crawl jobs we ALWAYS retry
// on errors... tmp errors, but this is just a shortcut
// so only take this shortcut if there is no error
// and repeat is 0.0.
// ... CRAP we do not want error'ed urls to resuscitate
// a job that is not already in progress if it is not
// supposed to be a repeat crawl.
( srep->m_errCode &&
// BUT skip this url if the job is not in progress
// even if the errCode is NON-zero, THUS we prevent
// a job from flip flopping from in progress to
// not in progress and sending out alerts. so once
// it goes to NOT in progress, that's it...
m_cr->m_spiderStatus != SP_INPROGRESS ) &&
// this means "repeat" is set to 0, to not repeat but
// if we get a &roundStart=1 request we do a round anyway.
m_cr->m_collectiveRespiderFrequency <= 0.0 ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping0 %s",sreq->m_url);
continue;
}
// sanity check. check for http(s)://
if ( ( sreq->m_url[0] != 'h' ||
sreq->m_url[1] != 't' ||
sreq->m_url[2] != 't' ||
sreq->m_url[3] != 'p' ) &&
// might be a docid from a pagereindex.cpp
! is_digit(sreq->m_url[0]) ) {
if ( m_cr->m_spiderCorruptCount == 0 )
log("spider: got corrupt 1 spiderRequest in "
"scan because url is %s (cn=%"INT32")"
,sreq->m_url,(int32_t)m_collnum);
m_cr->m_spiderCorruptCount++;
continue;
}
int32_t delta = sreq->m_addedTime - nowGlobal;
if ( delta > 86400 ) {
static bool s_first = true;
if ( m_cr->m_spiderCorruptCount == 0 || s_first ) {
s_first = false;
log("spider: got corrupt 6 spiderRequest in "
"scan because added time is %"INT32" "
"(delta=%"INT32" "
"which is well into the future. url=%s "
"(cn=%i)"
,(int32_t)sreq->m_addedTime
,delta
,sreq->m_url
,(int)m_collnum);
}
m_cr->m_spiderCorruptCount++;
continue;
}
// update SpiderRequest::m_siteNumInlinks to most recent value
int32_t sni = sreq->m_siteNumInlinks;
// get the # of inlinks to the site from our table
uint64_t *val;
val = (uint64_t *)m_sniTable.getValue32(sreq->m_siteHash32);
// use the most recent sni from this table
if ( val )
sni = (int32_t)((*val)>>32);
// if SpiderRequest is forced then m_siteHash32 is 0!
else if ( srep && srep->m_spideredTime >= sreq->m_addedTime )
sni = srep->m_siteNumInlinks;
// assign
sreq->m_siteNumInlinks = sni;
// store error count in request so xmldoc knows what it is
// and can increment it and re-add it to its spiderreply if
// it gets another error
if ( srep ) {
sreq->m_errCount = srep->m_errCount;
// . assign this too from latest reply - smart compress
// . this WAS SpiderReply::m_pubdate so it might be
// set to a non-zero value that is wrong now... but
// not a big deal!
sreq->m_contentHash32 = srep->m_contentHash32;
// if we tried it before
sreq->m_hadReply = true;
}
// . get the url filter we match
// . if this is slow see the TODO below in dedupSpiderdbList()
// which can pre-store these values assuming url filters do
// not change and siteNumInlinks is about the same.
int32_t ufn = ::getUrlFilterNum(sreq,
srep,
nowGlobal,
false,
MAX_NICENESS,
m_cr,
false, // isOutlink?
// provide the page quota table
&m_localTable,
-1);
// sanity check
if ( ufn == -1 ) {
log("spider: failed to match url filter for "
"url = %s coll=%s", sreq->m_url,m_cr->m_coll);
g_errno = EBADENGINEER;
return true;
}
// set the priority (might be the same as old)
int32_t priority = m_cr->m_spiderPriorities[ufn];
// now get rid of negative priorities since we added a
// separate force delete checkbox in the url filters
if ( priority < 0 ) priority = 0;
// sanity checks
//if ( priority == -1 ) { char *xx=NULL;*xx=0; }
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
if ( g_conf.m_logDebugSpider )
log("spider: got ufn=%"INT32" for %s (%"INT64"",
ufn,sreq->m_url,sreq->getUrlHash48());
if ( g_conf.m_logDebugSpider && srep )
log("spider: lastspidered=%"UINT32"",
srep->m_spideredTime);
// spiders disabled for this row in url filters?
//if ( ! m_cr->m_spidersEnabled[ufn] ) continue;
if ( m_cr->m_maxSpidersPerRule[ufn] <= 0 ) continue;
// skip if banned (unless need to delete from index)
bool skip = false;
// if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true;
// if ( priority == SPIDER_PRIORITY_BANNED ) skip = true;
if ( m_cr->m_forceDelete[ufn] ) skip = true;
// but if it is currently indexed we have to delete it
if ( skip && srep && srep->m_isIndexed ) skip = false;
if ( skip ) continue;
// temp debug
//char *xx=NULL;*xx=0;
if ( m_cr->m_forceDelete[ufn] )
// force it to a delete
sreq->m_forceDelete = true;
int64_t spiderTimeMS;
spiderTimeMS = getSpiderTimeMS ( sreq,ufn,srep,nowGlobalMS );
// how many outstanding spiders on a single IP?
//int32_t maxSpidersPerIp = m_cr->m_spiderIpMaxSpiders[ufn];
// sanity
if ( (int64_t)spiderTimeMS < 0 ) {
log("spider: got corrupt 2 spiderRequest in "
"scan (cn=%"INT32")",
(int32_t)m_collnum);
continue;
}
// more corruption detection
if ( sreq->m_hopCount < -1 ) {
log("spider: got corrupt 5 spiderRequest in "
"scan (cn=%"INT32")",
(int32_t)m_collnum);
continue;
}
// 100 days out is corruption. ppl sometimes put
// 3000 days to re-spider... so take this out
/*
int64_t delta2 = spiderTimeMS - nowGlobalMS;
if ( delta2 > 86400LL*1000LL*100LL ) {
log("spider: got corrupt 7 spiderRequest in "
"scan (cn=%"INT32") (delta=%"INT64") url=%s",
(int32_t)m_collnum,
delta2,
sreq->m_url);
SafeBuf sb; g_spiderdb.print ( (char *)sreq , &sb );
if ( srep ) g_spiderdb.print ( (char *)srep , &sb );
log("spider: %s",sb.getBufStart());
continue;
}
*/
// save this shit for storing in doledb
sreq->m_ufn = ufn;
sreq->m_priority = priority;
// if it is in the future, skip it and just set m_minFutureTimeMS
// and we will update the waiting tree
// with an entry based on that future time if the winnerTree
// turns out to be empty after we've completed our scan
if ( spiderTimeMS > nowGlobalMS ) {
// if futuretime is zero set it to this time
if ( ! m_minFutureTimeMS )
m_minFutureTimeMS = spiderTimeMS;
// otherwise we get the MIN of all future times
else if ( spiderTimeMS < m_minFutureTimeMS )
m_minFutureTimeMS = spiderTimeMS;
if ( g_conf.m_logDebugSpider )
log("spider: skippingx %s",sreq->m_url);
continue;
}
// debug point
// if ( ((long long)srep->m_spideredTime)*1000LL >
// nowGlobalMS - 86400LL*1000LL*30LL )
// log("spider: should not be spidering this!");
//////
//
// MDW: no, take this out now that we allow multiple urls
// in doledb from this firstip for speeding up the spiders.
// the crawldelay will be used by Msg13.cpp when it tries
// to download the url.
//
//////
/*
// how many "ready" urls for this IP? urls in doledb
// can be spidered right now
int32_t *score ;
score = (int32_t *)m_doleIpTable.getValue32 ( sreq->m_firstIp );
// how many spiders are current outstanding
int32_t out2 = outNow;
// add in any requests in doledb
if ( score ) out2 += *score;
// . do not add any more to doledb if we could violate ourquota
// . shit we have to add it our it never gets in
// . try regulating in msg13.cpp download code. just queue
// up requests to avoid hammering there.
if ( out2 >= maxSpidersPerIp ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping1 %s",sreq->m_url);
continue;
}
// by ensuring only one spider out at a time when there
// is a positive crawl-delay, we ensure that m_lastDownloadTime
// is the last time we downloaded from this ip so that we
// can accurately set the time in getSpiderTimeMS() for
// when the next url from this firstip should be spidered.
if ( out2 >= 1 ) {
// get the crawldelay for this domain
int32_t *cdp ;
cdp = (int32_t *)m_cdTable.getValue (&sreq->m_domHash32);
// if crawl delay is NULL, we need to download
// robots.txt. most of the time it will be -1
// which indicates not specified in robots.txt
if ( ! cdp ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping2 %s",
sreq->m_url);
continue;
}
// if we had a positive crawldelay and there is
// already >= 1 outstanding spider on this ip,
// then skip this url
if ( cdp && *cdp > 0 ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping3 %s",
sreq->m_url);
continue;
}
}
*/
// debug. show candidates due to be spidered now.
//if(g_conf.m_logDebugSpider ) //&& spiderTimeMS< nowGlobalMS )
// log("spider: considering ip=%s sreq spiderTimeMS=%"INT64" "
// "pri=%"INT32" uh48=%"INT64"",
// iptoa(sreq->m_firstIp),
// spiderTimeMS,
// priority,
// sreq->getUrlHash48());
// we can't have negative priorities at this point because
// the s_ufnTree uses priority as part of the key so it
// can get the top 100 or so urls for a firstip to avoid
// having to hit spiderdb for every one!
if ( priority < 0 ) { char *xx=NULL;*xx=0; }
//
// NO! then just a single root url can prevent all his
// kids from getting spidered. because this logic was
// priority based over time. so while the high priority url
// would be sitting in the waiting tree, the kids whose
// time it was to be spidered would be starving for attention.
// only use priority if the high priority url can be spidered
// now, so he doesn't lock the others out of the waiting tree.
//
// now pick the SpiderRequest with the best priority, then
// break ties with the "spiderTime".
//if ( priority < winPriority )
// continue;
// if tied, use times
//if ( priority == winPriority && spiderTimeMS > winTimeMS )
// continue;
// bail if it is locked! we now call
// msg12::confirmLockAcquisition() after we get the lock,
// which deletes the doledb record from doledb and doleiptable
// right away and adds a "0" entry into the waiting tree so
// that evalIpLoop() repopulates doledb again with that
// "firstIp". this way we can spider multiple urls from the
// same ip at the same time.
int64_t key = makeLockTableKey ( sreq );
if ( g_conf.m_logDebugSpider )
log("spider: checking uh48=%"INT64" lockkey=%"INT64" "
"used=%"INT32"",
uh48,key,
g_spiderLoop.m_lockTable.getNumUsedSlots());
// MDW
if ( g_spiderLoop.m_lockTable.isInTable ( &key ) ) {
// get it
//CrawlInfo *ci = &m_cr->m_localCrawlInfo;
// do not think the round is over!
//ci->m_lastSpiderCouldLaunch = nowGlobal;
// there are urls ready to spider, just locked up
//ci->m_hasUrlsReadyToSpider = true;
// debug note
if ( g_conf.m_logDebugSpider )
log("spider: skipping url lockkey=%"INT64" in "
"lock table sreq.url=%s",key,
sreq->m_url);
continue;
}
//int64_t uh48 = sreq->getUrlHash48();
// make key
key192_t wk = makeWinnerTreeKey( firstIp ,
priority ,
sreq->m_hopCount,
spiderTimeMS ,
uh48 );
// assume our added time is the first time this url was added
sreq->m_discoveryTime = sreq->m_addedTime;
// if ( uh48 == 110582802025376LL )
// log("hey");
// if this url is already in the winnerTree then either we
// replace it or we skip ourselves.
//
// watch out for dups in winner tree, the same url can have
// multiple spiderTimeMses somehow... i guess it could have
// different hop counts
// as well, resulting in different priorities...
// actually the dedup table could map to a priority and a node
// so we can kick out a lower priority version of the same url.
int32_t winSlot = m_winnerTable.getSlot ( &uh48 );
if ( winSlot >= 0 ) {
key192_t *oldwk = (key192_t *)m_winnerTable.
getDataFromSlot ( winSlot );
// get the min hopcount
SpiderRequest *wsreq ;
wsreq =(SpiderRequest *)m_winnerTree.
getData(0,(char *)oldwk);
if ( wsreq ) {
if ( sreq->m_hopCount < wsreq->m_hopCount )
wsreq->m_hopCount = sreq->m_hopCount;
if ( wsreq->m_hopCount < sreq->m_hopCount )
sreq->m_hopCount = wsreq->m_hopCount;
// and the min added time as well!
// get the oldest timestamp so
// gbssDiscoveryTime will be accurate.
if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
wsreq->m_discoveryTime =
sreq->m_discoveryTime;
if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
sreq->m_discoveryTime =
wsreq->m_discoveryTime;
}
// are we lower priority? (or equal)
// smaller keys are HIGHER priority.
if(KEYCMP((char *)&wk,(char *)oldwk,
sizeof(key192_t))>=0)
continue;
// from table too. no it's a dup uh48!
//m_winnerTable.deleteKey ( &uh48 );
// otherwise we supplant it. remove old key from tree.
m_winnerTree.deleteNode ( 0 , (char *)oldwk , false);
// supplant in table and tree... just add below...
}
// get the top 100 spider requests by priority/time/etc.
int32_t maxWinners = (int32_t)MAX_WINNER_NODES;
//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
// if less than 10MB of spiderdb requests limit to 400
if ( m_totalBytesScanned < 10000000 ) maxWinners = 400;
// only put one doledb record into winner tree if
// the list is pretty short. otherwise, we end up caching
// too much. granted, we only cache for about 2 mins.
// mdw: for testing take this out!
if ( m_totalBytesScanned < 25000 ) maxWinners = 1;
// sanity. make sure read is somewhat hefty for our
// maxWinners=1 thing
if ( (int32_t)SR_READ_SIZE < 500000 ) { char *xx=NULL;*xx=0; }
// only compare to min winner in tree if tree is full
if ( m_winnerTree.getNumUsedNodes() >= maxWinners ) {
// get that key
int64_t tm1 = spiderTimeMS;
// get the spider time of lowest scoring req in tree
int64_t tm2 = m_tailTimeMS;
// if they are both overdue, make them the same
if ( tm1 < nowGlobalMS ) tm1 = 1;
if ( tm2 < nowGlobalMS ) tm2 = 1;
// skip spider request if its time is past winner's
if ( tm1 > tm2 )
continue;
if ( tm1 < tm2 )
goto gotNewWinner;
// if tied, use priority
if ( priority < m_tailPriority )
continue;
if ( priority > m_tailPriority )
goto gotNewWinner;
// if tied use hop counts so we are breadth first
if ( sreq->m_hopCount > m_tailHopCount )
continue;
if ( sreq->m_hopCount < m_tailHopCount )
goto gotNewWinner;
// if hopcounts tied prefer the unindexed doc
// i don't think we need this b/c spidertimems
// for new docs should be less than old docs...
// TODO: verify that
//if ( sreq->m_isIndexed && ! m_tailIsIndexed )
// continue;
//if ( ! sreq->m_isIndexed && m_tailIsIndexed )
// goto gotNewWinner;
// if tied, use actual times. assuming both<nowGlobalMS
if ( spiderTimeMS > m_tailTimeMS )
continue;
if ( spiderTimeMS < m_tailTimeMS )
goto gotNewWinner;
// all tied, keep it the same i guess
continue;
// otherwise, add the new winner in and remove the old
gotNewWinner:
// get lowest scoring node in tree
int32_t tailNode = m_winnerTree.getLastNode();
// from table too
m_winnerTable.removeKey ( &m_tailUh48 );
// delete the tail so new spiderrequest can enter
m_winnerTree.deleteNode3 ( tailNode , true );
}
// sometimes the firstip in its key does not match the
// firstip in the record!
if ( sreq->m_firstIp != firstIp ) {
log("spider: request %s firstip does not match "
"firstip in key collnum=%i",sreq->m_url,
(int)m_collnum);
log("spider: ip1=%s",iptoa(sreq->m_firstIp));
log("spider: ip2=%s",iptoa(firstIp));
continue;
}
////
//
// add spider request to winner tree
//
////
/*
// make the key
//if ( useTree ) {
int64_t uh48 = sreq->getUrlHash48();
key128_t k = makeUfnTreeKey ( firstIp ,priority,
spiderTimeMS , uh48 );
//int32_t nn =;
m_winnerTree.addNode(0,(char *)&k,NULL,8);
//log("adding node #%"INT32" firstip=%s uh48=%"UINT64" "
// "ufntree.k.n1=0x%"XINT64" "
// "spiderdb.k.n1=0x%"XINT64" "
// "spiderdb.k.n0=0x%"XINT64" "
// ,
// nn,iptoa(firstIp),uh48,k.n1,
// *(int64_t *)rec,
// *(int64_t *)(rec+8)
// );
//numNodes++;
//}
*/
// . add to table which allows us to ensure same url not
// repeated in tree
// . just skip if fail to add...
if ( m_winnerTable.addKey ( &uh48 , &wk ) < 0 ) continue;
// use an individually allocated buffer for each spiderrequest
// so if it gets removed from tree the memory can be freed by
// the tree which "owns" the data because m_winnerTree.set()
// above set ownsData to true.
int32_t need = sreq->getRecSize();
char *newMem = (char *)mdup ( sreq , need , "sreqbuf" );
if ( ! newMem ) continue;
// add it to the tree of the top urls to spider
m_winnerTree.addNode( 0,
(char *)&wk ,
(char *)newMem ,
need );
// log("adding wk uh48=%llu #usednodes=%i",
// uh48,m_winnerTree.m_numUsedNodes);
// sanity
//SpiderRequest *sreq2 = (SpiderRequest *)m_winnerTree.
//getData ( nn );
// //////////////////////
// // MDW dedup test
// HashTableX dedup;
// int32_t ntn = m_winnerTree.getNumNodes();
// char dbuf[3*MAX_WINNER_NODES*(8+1)];
// dedup.set ( 8,
// 0,
// (int32_t)2*ntn, // # slots to initialize to
// dbuf,
// (int32_t)(3*MAX_WINNER_NODES*(8+1)),
// false,
// MAX_NICENESS,
// "windt");
// for ( int32_t node = m_winnerTree.getFirstNode() ;
// node >= 0 ;
// node = m_winnerTree.getNextNode ( node ) ) {
// // get data for that
// SpiderRequest *sreq2;
// sreq2 = (SpiderRequest *)m_winnerTree.getData ( node );
// // parse it up
// int32_t winIp;
// int32_t winPriority;
// int32_t winHopCount;
// int64_t winSpiderTimeMS;
// int64_t winUh48;
// key192_t *winKey = (key192_t *)m_winnerTree.getKey ( node );
// parseWinnerTreeKey ( winKey ,
// &winIp ,
// &winPriority,
// &winHopCount,
// &winSpiderTimeMS ,
// &winUh48 );
// // sanity
//if(winUh48 != sreq2->getUrlHash48() ) { char *xx=NULL;*xx=0;}
// // make the doledb key
// key_t doleKey = g_doledb.makeKey ( winPriority,
// // convert to secs from ms
// winSpiderTimeMS / 1000 ,
// winUh48 ,
// false );
// // dedup. if we add dups the problem is is that they
// // overwrite the key in doledb yet the doleiptable count
// // remains undecremented and doledb is empty and never
// // replenished because the firstip can not be added to
// // waitingTree because doleiptable count is > 0. this was
// // causing spiders to hang for collections. i am not sure
// // why we should be getting dups in winnertree because they
// // have the same uh48 and that is the key in the tree.
// if ( dedup.isInTable ( &winUh48 ) ) {
// log("spider: got dup uh48=%"UINT64" dammit", winUh48);
// char *xx=NULL;*xx=0;
// continue;
// }
// // do not allow dups
// dedup.addKey ( &winUh48 );
// }
// // end dedup test
//////////////////////////
// set new tail priority and time for next compare
if ( m_winnerTree.getNumUsedNodes() >= maxWinners ) {
// for the worst node in the tree...
int32_t tailNode = m_winnerTree.getLastNode();
if ( tailNode < 0 ) { char *xx=NULL;*xx=0; }
// set new tail parms
key192_t *tailKey;
tailKey = (key192_t *)m_winnerTree.getKey ( tailNode );
// convert to char first then to signed int32_t
parseWinnerTreeKey ( tailKey ,
&m_tailIp ,
&m_tailPriority,
&m_tailHopCount,
&m_tailTimeMS ,
&m_tailUh48 );
// sanity
if ( m_tailIp != firstIp ) { char *xx=NULL;*xx=0;}
}
/*
// ok, we got a new winner
winPriority = priority;
winTimeMS = spiderTimeMS;
winMaxSpidersPerIp = maxSpidersPerIp;
winReq = sreq;
// set these for doledb
winReq->m_priority = priority;
winReq->m_ufn = ufn;
//winReq->m_spiderTime = spiderTime;
*/
}
/*
MDW: now that we have winnerTree obsolete this stuff:
// if its ready to spider now, that trumps one in the future always!
if ( winReq &&
m_bestRequestValid &&
m_bestSpiderTimeMS <= nowGlobalMS &&
winTimeMS > nowGlobal )
winReq = NULL;
// if this is a successive call we have to beat the global because
// the firstIp has a *ton* of spider requests and we can't read them
// all in one list, then see if we beat our global winner!
if ( winReq &&
m_bestRequestValid &&
m_bestSpiderTimeMS <= nowGlobalMS &&
m_bestRequest->m_priority > winPriority )
winReq = NULL;
// or if both in future. use time.
if ( winReq &&
m_bestRequestValid &&
m_bestSpiderTimeMS > nowGlobalMS &&
winTimeMS > nowGlobal &&
m_bestSpiderTimeMS < winTimeMS )
winReq = NULL;
// if both recs are overdue for spidering and priorities tied, use
// the hopcount. should make us breadth-first, all else being equal.
if ( winReq &&
m_bestRequestValid &&
m_bestRequest->m_priority == winPriority &&
m_bestSpiderTimeMS <= nowGlobalMS &&
winTimeMS <= nowGlobal &&
m_bestRequest->m_hopCount < winReq->m_hopCount )
winReq = NULL;
// use times if hops are equal and both are overdue from same priority.
if ( winReq &&
m_bestRequestValid &&
m_bestRequest->m_priority == winPriority &&
m_bestSpiderTimeMS <= nowGlobalMS &&
winTimeMS <= nowGlobal &&
m_bestRequest->m_hopCount == winReq->m_hopCount &&
m_bestSpiderTimeMS <= winTimeMS )
winReq = NULL;
// if nothing, we are done!
if ( winReq ) {
// store this
int32_t rsize = winReq->getRecSize();
// sanity check
if ( rsize > (int32_t)MAX_BEST_REQUEST_SIZE){char *xx=NULL;*xx=0;}
// now store this SpiderRequest for adding to doledb
gbmemcpy ( m_bestRequestBuf , winReq, rsize );
// point to that
m_bestRequest = (SpiderRequest *)m_bestRequestBuf;
// set this
m_bestRequestValid = true;
// this too
m_bestSpiderTimeMS = winTimeMS;
m_bestMaxSpidersPerIp = winMaxSpidersPerIp;
// sanity
if ( (int64_t)winTimeMS < 0 ) { char *xx=NULL;*xx=0; }
// note it
if ( g_conf.m_logDebugSpider ) {
log("spider: found winning request IN THIS LIST ip=%s "
"spiderTimeMS=%"INT64" "
"ufn=%"INT32" "
"hadreply=%"INT32" "
"pri=%"INT32" uh48=%"INT64" url=%s",
iptoa(m_bestRequest->m_firstIp),
m_bestSpiderTimeMS,
(int32_t)m_bestRequest->m_ufn,
(int32_t)m_bestRequest->m_hadReply,
(int32_t)m_bestRequest->m_priority,
m_bestRequest->getUrlHash48(),
m_bestRequest->m_url);
// debug why not sticking to our site!
//if ( strstr(m_bestRequest->m_url,"//www.jezebelgallery.com") == NULL ) { char *xx=NULL;*xx=0; }
}
}
else if ( g_conf.m_logDebugSpider ) {
log("spider: did not find winning request for %s but "
"bestReqValid=%"INT32"",
iptoa(m_scanningIp),(int32_t)m_bestRequestValid);
}
*/
// are we the final list in the scan?
//m_isReadDone = ( list->getListSize() < (int32_t)SR_READ_SIZE ) ;
//
// . try to fix the bug of reading like only 150k when we asked for 512
// . that bug was because of dedupList() function
//
//if ( list->isEmpty() )
// m_isReadDone = true;
// if no spiderreply for the current url, invalidate this
m_lastReplyValid = false;
// if read is not yet done, save the reply in case next list needs it
if ( srep ) { // && ! m_isReadDone ) {
int32_t rsize = srep->getRecSize();
if ( rsize > (int32_t)MAX_SP_REPLY_SIZE){char *xx=NULL;*xx=0; }
gbmemcpy ( m_lastReplyBuf, srep, rsize );
m_lastReplyValid = true;
}
// debug info
if ( g_conf.m_logDebugSpider )
log("spider: Checked list of %"INT32" spiderdb "
"bytes (%"INT32" recs) "
"for winners "
"for firstip=%s. winnerTreeUsedNodes=%"INT32" #newreqs=%"
INT64
,list->getListSize(),recCount,iptoa(m_scanningIp),
m_winnerTree.getNumUsedNodes(),
m_totalNewSpiderRequests);
// reset any errno cuz we're just a cache
g_errno = 0;
/////
//
// BEGIN maintain firstip overflow list
//
/////
bool overflow = false;
// don't add any more outlinks to this firstip after we
// have 10M spider requests for it.
// lower for testing
//if ( m_totalNewSpiderRequests > 1 )
if ( m_totalNewSpiderRequests > 10000000 )
overflow = true;
// need space
if ( overflow && ! m_overflowList ) {
int32_t need = OVERFLOWLISTSIZE*4;
m_overflowList = (int32_t *)mmalloc(need,"list");
// mmalloc can fail; the scans below already tolerate a NULL
// overflow list, so only set the terminator if the alloc worked
if ( m_overflowList ) m_overflowList[0] = 0;
}
//
// ensure firstip is in the overflow list if we overflowed
int32_t emptySlot = -1;
bool found = false;
int32_t oi;
// if we dealt with this last round, we're done
if ( m_lastOverflowFirstIp == firstIp )
return true;
m_lastOverflowFirstIp = firstIp;
if ( overflow && g_conf.m_logDebugSpider )
log("spider: firstip %s overflowing with %"INT32" new reqs",
iptoa(firstIp),(int32_t)m_totalNewSpiderRequests);
for ( oi = 0 ; ; oi++ ) {
// sanity
if ( ! m_overflowList ) break;
// an ip of zero is end of the list
if ( ! m_overflowList[oi] ) break;
// if already in there, we are done
if ( m_overflowList[oi] == firstIp ) {
found = true;
break;
}
// -1 means empty slot
if ( m_overflowList[oi] == -1 ) emptySlot = oi;
}
// if we need to add it...
if ( overflow && ! found && m_overflowList ) {
log("spider: adding %s to overflow list",iptoa(firstIp));
// reset this little cache thingy
s_lastOut = 0;
// take the empty slot if there is one
if ( emptySlot >= 0 )
m_overflowList[emptySlot] = firstIp;
// or add to new slot. this is #defined to 200 last check
else if ( oi+1 < OVERFLOWLISTSIZE ) {
m_overflowList[oi] = firstIp;
m_overflowList[oi+1] = 0;
}
else
log("spider: could not add firstip %s to "
"overflow list, full.", iptoa(firstIp));
}
// ensure firstip is NOT in the overflow list if we are ok
for ( int32_t oi2 = 0 ; ! overflow ; oi2++ ) {
// sanity
if ( ! m_overflowList ) break;
// an ip of zero is end of the list
if ( ! m_overflowList[oi2] ) break;
// skip if not a match
if ( m_overflowList[oi2] != firstIp ) continue;
// take it out of list
m_overflowList[oi2] = -1;
log("spider: removing %s from overflow list",iptoa(firstIp));
// reset this little cache thingy
s_lastIn = 0;
break;
}
/////
//
// END maintain firstip overflow list
//
/////
// ok, we've updated m_winnerTree (and the overflow list)!!!
return true;
}
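// Illustrative sketch (never called): the tie-break order scanListForWinners()
// above uses when the winner tree is full and a new SpiderRequest challenges
// the current tail. Returns true if the candidate should replace the tail.
// "challengerBeatsTail" is a hypothetical name; the real logic is the
// goto-based chain above.
static bool challengerBeatsTail ( int64_t candTimeMS ,
                                  int32_t candPriority ,
                                  int32_t candHopCount ,
                                  int64_t tailTimeMS ,
                                  int32_t tailPriority ,
                                  int32_t tailHopCount ,
                                  int64_t nowGlobalMS ) {
	// if both are overdue, treat their spider times as equal
	int64_t t1 = candTimeMS; if ( t1 < nowGlobalMS ) t1 = 1;
	int64_t t2 = tailTimeMS; if ( t2 < nowGlobalMS ) t2 = 1;
	// the earlier (collapsed) spider time wins
	if ( t1 > t2 ) return false;
	if ( t1 < t2 ) return true;
	// if tied, the higher priority wins
	if ( candPriority < tailPriority ) return false;
	if ( candPriority > tailPriority ) return true;
	// if tied, the lower hop count wins so we stay breadth first
	if ( candHopCount > tailHopCount ) return false;
	if ( candHopCount < tailHopCount ) return true;
	// if tied, fall back to the raw spider times
	if ( candTimeMS < tailTimeMS ) return true;
	// all tied, keep the tail
	return false;
}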
// . this is ONLY CALLED from evalIpLoop() above
// . add another 0 entry into waiting tree, unless we had no winner
// . add winner in here into doledb
// . returns false if blocked and doledWrapper() will be called
// . returns true and sets g_errno on error
bool SpiderColl::addWinnersIntoDoledb ( ) {
if ( g_errno ) {
log("spider: got error when trying to add winner to doledb: "
"%s",mstrerror(g_errno));
return true;
}
// gotta check this again since we might have done a QUICKPOLL() above
// to call g_process.shutdown() so now tree might be unwritable
RdbTree *wt = &m_waitingTree;
if ( wt->m_isSaving || ! wt->m_isWritable )
return true;
/*
MDW: let's print out recs we add to doledb
//if ( g_conf.m_logDebugSpider && m_bestRequestValid ) {
if ( g_conf.m_logDebugSpider && m_bestRequestValid ) {
log("spider: got best ip=%s sreq spiderTimeMS=%"INT64" "
"pri=%"INT32" uh48=%"INT64"",
iptoa(m_bestRequest->m_firstIp),
m_bestSpiderTimeMS,
(int32_t)m_bestRequest->m_priority,
m_bestRequest->getUrlHash48());
}
else if ( g_conf.m_logDebugSpider ) {
log("spider: no best request for ip=%s",iptoa(m_scanningIp));
}
*/
// ok, all done if nothing to add to doledb. i guess we were misled
// that firstIp had something ready for us. maybe the url filters
// table changed to filter/ban them all. if a new request/reply comes
// in for this firstIp then it will re-add an entry to waitingtree and
// we will re-scan spiderdb. if we had something to spider but it was
// in the future the m_minFutureTimeMS will be non-zero, and we deal
// with that below...
if ( m_winnerTree.isEmpty() && ! m_minFutureTimeMS ) {
// if we received new incoming requests while we were
// scanning, which is happening for some crawls, then do
// not nuke! just repeat later in populateDoledbFromWaitingTree
if ( m_gotNewDataForScanningIp ) {
if ( g_conf.m_logDebugSpider )
log("spider: received new requests, not "
"nuking misleading key");
return true;
}
// note it - this can happen if no more to spider right now!
if ( g_conf.m_logDebugSpider )
log("spider: nuking misleading waitingtree key "
"firstIp=%s", iptoa(m_scanningIp));
m_waitingTree.deleteNode ( 0,(char *)&m_waitingTreeKey,true);
//log("spider: 7 del node for %s",iptoa(m_scanningIp));
m_waitingTreeKeyValid = false;
// note it
uint64_t timestamp64 = m_waitingTreeKey.n1;
timestamp64 <<= 32;
timestamp64 |= m_waitingTreeKey.n0 >> 32;
int32_t firstIp = m_waitingTreeKey.n0 & 0xffffffff;
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"spider: removed2 time=%"INT64" ip=%s from "
"waiting tree. nn=%"INT32".",
timestamp64, iptoa(firstIp),
m_waitingTree.m_numUsedNodes);
m_waitingTable.removeKey ( &firstIp );
// sanity check
if ( ! m_waitingTable.m_isWritable ) { char *xx=NULL;*xx=0;}
return true;
}
// i've seen this happen, wtf?
if ( m_winnerTree.isEmpty() && m_minFutureTimeMS ) {
// this will update the waiting tree key with minFutureTimeMS
addDoleBufIntoDoledb ( NULL , false );
return true;
}
// i am seeing dup uh48's in the m_winnerTree
int32_t firstIp = m_waitingTreeKey.n0 & 0xffffffff;
//char dbuf[147456];//3*MAX_WINNER_NODES*(8+1)];
HashTableX dedup;
int32_t ntn = m_winnerTree.getNumNodes();
dedup.set ( 8,
0,
(int32_t)2*ntn, // # slots to initialize to
NULL,//dbuf,
0,//147456,//(int32_t)(3*MAX_WINNER_NODES*(8+1)),
false,
MAX_NICENESS,
"windt");
///////////
//
// make winner tree into doledb list to add
//
///////////
//m_doleBuf.reset();
//m_doleBuf.setLabel("dolbuf");
// the first 4 bytes are the offset of the next doledb record to add to
// doledb, so we do not have to re-add the dole buf to the cache and make
// it churn. re-adding it every time is really inefficient.
SafeBuf doleBuf;
doleBuf.pushLong(4);
int32_t added = 0;
for ( int32_t node = m_winnerTree.getFirstNode() ;
node >= 0 ;
node = m_winnerTree.getNextNode ( node ) ) {
// breathe
QUICKPOLL ( MAX_NICENESS );
// get data for that
SpiderRequest *sreq2;
sreq2 = (SpiderRequest *)m_winnerTree.getData ( node );
// sanity
if ( sreq2->m_firstIp != firstIp ) { char *xx=NULL;*xx=0; }
//if ( sreq2->m_spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; }
if ( sreq2->m_ufn < 0 ) { char *xx=NULL;*xx=0; }
if ( sreq2->m_priority == -1 ) { char *xx=NULL;*xx=0; }
// check for errors
bool hadError = false;
// parse it up
int32_t winIp;
int32_t winPriority;
int32_t winHopCount;
int64_t winSpiderTimeMS;
int64_t winUh48;
key192_t *winKey = (key192_t *)m_winnerTree.getKey ( node );
parseWinnerTreeKey ( winKey ,
&winIp ,
&winPriority,
&winHopCount,
&winSpiderTimeMS ,
&winUh48 );
// sanity
if ( winIp != firstIp ) { char *xx=NULL;*xx=0;}
if ( winUh48 != sreq2->getUrlHash48() ) { char *xx=NULL;*xx=0;}
// make the doledb key
key_t doleKey = g_doledb.makeKey ( winPriority,
// convert to secs from ms
winSpiderTimeMS / 1000 ,
winUh48 ,
false );
// dedup. if we add dups the problem is that they
// overwrite the key in doledb yet the doleiptable count
// remains undecremented and doledb is empty and never
// replenished because the firstip can not be added to
// waitingTree because doleiptable count is > 0. this was
// causing spiders to hang for collections. i am not sure
// why we should be getting dups in winnertree because they
// have the same uh48 and that is the key in the tree.
if ( dedup.isInTable ( &winUh48 ) ) {
log("spider: got dup uh48=%"UINT64" dammit", winUh48);
continue;
}
// count it
added++;
// do not allow dups
dedup.addKey ( &winUh48 );
// store doledb key first
if ( ! doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) )
hadError = true;
// then size of spiderrequest
if ( ! doleBuf.pushLong ( sreq2->getRecSize() ) )
hadError = true;
// then the spiderrequest encapsulated
if ( ! doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() ))
hadError=true;
// note any error
if ( hadError ) {
log("spider: error making doledb list: %s",
mstrerror(g_errno));
return true;
}
}
// log("spider: added %"INT32" doledb recs to cache for cn=%i "
// "dolebufsize=%i",
// added,
// (int)m_collnum,
// (int)doleBuf.length());
return addDoleBufIntoDoledb ( &doleBuf , false );//, 0 );
}
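// Illustrative sketch (never called): how the waiting tree key is unpacked
// into its spider time (in ms) and firstIp, mirroring the inline bit
// twiddling in addWinnersIntoDoledb() above and addDoleBufIntoDoledb()
// below. makeWaitingTreeKey() is assumed to do the exact inverse packing.
// "parseWaitingTreeKeySketch" is a hypothetical name.
static void parseWaitingTreeKeySketch ( key_t    *wk ,
                                        uint64_t *spiderTimeMS ,
                                        int32_t  *firstIp ) {
	// n1 holds the high 32 bits of the time
	uint64_t t = wk->n1;
	t <<= 32;
	// the top half of n0 holds the low 32 bits of the time
	t |= wk->n0 >> 32;
	*spiderTimeMS = t;
	// the firstIp lives in the low 32 bits of n0
	*firstIp = (int32_t)(wk->n0 & 0xffffffff);
}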
bool SpiderColl::validateDoleBuf ( SafeBuf *doleBuf ) {
char *doleBufEnd = doleBuf->getBuf();
// get offset
char *pstart = doleBuf->getBufStart();
char *p = pstart;
int32_t jump = *(int32_t *)p;
p += 4;
// sanity
if ( jump < 4 || jump > doleBuf->getLength() ) {
char *xx=NULL;*xx=0; }
bool gotIt = false;
for ( ; p < doleBuf->getBuf() ; ) {
if ( p == pstart + jump )
gotIt = true;
// first is doledbkey
p += sizeof(key_t);
// then size of spider request
int32_t recSize = *(int32_t *)p;
p += 4;
// the spider request encapsulated
SpiderRequest *sreq3;
sreq3 = (SpiderRequest *)p;
// point "p" to next spiderrequest
if ( recSize != sreq3->getRecSize() ) { char *xx=NULL;*xx=0;}
p += recSize;//sreq3->getRecSize();
// sanity
if ( p > doleBufEnd ) { char *xx=NULL;*xx=0; }
if ( p < pstart ) { char *xx=NULL;*xx=0; }
}
if ( ! gotIt ) { char *xx=NULL;*xx=0; }
return true;
}
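// Illustrative sketch (never called): the serialized layout that
// addWinnersIntoDoledb() above builds and validateDoleBuf() walks. A dole
// buf is a 4-byte offset to the next record to dole out, followed by
// repeated [doledb key_t][int32_t recSize][SpiderRequest] entries.
// "appendDoleRecSketch" is a hypothetical name.
static bool appendDoleRecSketch ( SafeBuf *doleBuf ,
                                  key_t doleKey ,
                                  SpiderRequest *sreq ) {
	// a fresh buf starts with the offset of its first record, which
	// sits right after the 4-byte offset itself
	if ( doleBuf->getLength() == 0 && ! doleBuf->pushLong(4) )
		return false;
	// store doledb key first
	if ( ! doleBuf->safeMemcpy ( &doleKey , sizeof(key_t) ) )
		return false;
	// then size of spiderrequest
	if ( ! doleBuf->pushLong ( sreq->getRecSize() ) )
		return false;
	// then the spiderrequest encapsulated
	if ( ! doleBuf->safeMemcpy ( sreq , sreq->getRecSize() ) )
		return false;
	return true;
}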
bool SpiderColl::addDoleBufIntoDoledb ( SafeBuf *doleBuf, bool isFromCache ) {
// uint32_t cachedTimestamp ) {
//validateDoleBuf ( doleBuf );
////////////////////
//
// UPDATE WAITING TREE ENTRY
//
// Normally the "spidertime" is 0 for a firstIp. This will make it
// a future time if it is not yet due for spidering.
//
////////////////////
int32_t firstIp = m_waitingTreeKey.n0 & 0xffffffff;
// sanity check. how did this happen? it messes up our crawl!
// maybe a doledb add went through? so we should add again?
int32_t wn = m_waitingTree.getNode(0,(char *)&m_waitingTreeKey);
if ( wn < 0 ) {
log("spider: waiting tree key removed while reading list for "
"%s (%"INT32")",m_coll,(int32_t)m_collnum);
// play it safe and add it back for now...
// when i try to break here in gdb it never happens because
// of timing issues. heisenbug...
// false = callForScan
//if ( ! addToWaitingTree ( 0 , m_scanningIp , false ) )
// log("spider: failed to add wk2 to waiting tree: %s"
// ,mstrerror(g_errno));
return true;
}
/*
MDW: now we store multiple winners in doledb, so this logic
won't be used. maybe just rely on the crawldelay type logic
in Msg13.cpp to space things out. we could also deny locks if
too many urls being spidered for this ip.
// how many spiders currently out for this ip?
int32_t outNow=g_spiderLoop.getNumSpidersOutPerIp(m_scanningIp,m_collnum);
// even if hadn't gotten list we can bail early if too many
// spiders from this ip are out!
//int32_t out = g_spiderLoop.getNumSpidersOutPerIp ( m_scanningIp );
if ( outNow >= m_bestMaxSpidersPerIp ) {
// note it
if ( g_conf.m_logDebugSpider )
log("spider: already got %"INT32" from this ip out. ip=%s",
m_bestMaxSpidersPerIp,
iptoa(m_scanningIp)
);
// when his SpiderReply comes back it will call
// addWaitingTree with a "0" time so he'll get back in there
//if ( wn < 0 ) { char *xx=NULL; *xx=0; }
if ( wn >= 0 ) {
m_waitingTree.deleteNode (wn,false );
// note that
//log("spdr: 1 del node %"INT32" for %s",wn,iptoa(firstIp));
}
// keep the table in sync now with the time
m_waitingTable.removeKey( &m_bestRequest->m_firstIp );
return true;
}
*/
//int64_t nowGlobalMS = gettimeofdayInMillisecondsGlobal();//Local();
// if best request has a future spiderTime, at least update
// the wait tree with that since we will not be doling this request
// right now.
if ( m_winnerTree.isEmpty() && m_minFutureTimeMS && ! isFromCache ) {
// save memory
m_winnerTree.reset();
m_winnerTable.reset();
// if in the process of being added to doledb or in doledb...
if ( m_doleIpTable.isInTable ( &firstIp ) ) {
// sanity i guess. remove this line if it hits this!
log("spider: wtf????");
//char *xx=NULL;*xx=0;
return true;
}
// before you set a time too far into the future, if we
// did receive new spider requests, entertain those
if ( m_gotNewDataForScanningIp &&
// we had twitter.com with a future spider date
// on the pe2 cluster but we kept hitting this, so
// don't do this anymore if we scanned a ton of bytes
// like we did for twitter.com because it uses all the
// resources when we scan like 150MB of spider requests
// for a single firstip
m_totalBytesScanned < 30000 ) {
if ( g_conf.m_logDebugSpider )
log("spider: received new requests, not "
"updating waiting tree with future time");
return true;
}
// get old time
uint64_t oldSpiderTimeMS = m_waitingTreeKey.n1;
oldSpiderTimeMS <<= 32;
oldSpiderTimeMS |= (m_waitingTreeKey.n0 >> 32);
// delete old node
//int32_t wn = m_waitingTree.getNode(0,(char *)&m_waitingTreeKey);
//if ( wn < 0 ) { char *xx=NULL;*xx=0; }
if ( wn >= 0 ) {
m_waitingTree.deleteNode3 (wn,false );
//log("spdr: 2 del node %"INT32" for %s",wn,iptoa(firstIp));
}
// invalidate
m_waitingTreeKeyValid = false;
//int32_t fip = m_bestRequest->m_firstIp;
key_t wk2 = makeWaitingTreeKey ( m_minFutureTimeMS , firstIp );
// log the replacement
if ( g_conf.m_logDebugSpider )
log("spider: scan replacing waitingtree key "
"oldtime=%"UINT32" newtime=%"UINT32" firstip=%s",
// bestpri=%"INT32" "
//"besturl=%s",
(uint32_t)(oldSpiderTimeMS/1000LL),
(uint32_t)(m_minFutureTimeMS/1000LL),
iptoa(firstIp)
//(int32_t)m_bestRequest->m_priority,
// m_bestRequest->m_url);
);
// this should never fail since we deleted one above
int32_t dn = m_waitingTree.addKey ( &wk2 );
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: RE-added time=%"INT64" ip=%s to "
"waiting tree node %"INT32"",
m_minFutureTimeMS , iptoa(firstIp),dn);
// keep the table in sync now with the time
m_waitingTable.addKey( &firstIp, &m_minFutureTimeMS );
// sanity check
if ( ! m_waitingTable.m_isWritable ) { char *xx=NULL;*xx=0;}
return true;
}
// we are coring here. i guess the best request or a copy of it
// somehow started spidering since our last spider read, so i would
// say we should bail on this spider scan! really i'm not exactly
// sure what happened...
// MDW: now we add a bunch of urls to doledb, so i guess we can't
// check locks...
/*
int64_t key = makeLockTableKey ( m_bestRequest );
if ( g_spiderLoop.m_lockTable.isInTable ( &key ) ) {
log("spider: best request got doled out from under us");
return true;
char *xx=NULL;*xx=0;
}
// make the doledb key first for this so we can add it
key_t doleKey = g_doledb.makeKey ( m_bestRequest->m_priority ,
// convert to seconds from ms
m_bestSpiderTimeMS / 1000 ,
m_bestRequest->getUrlHash48() ,
false );
if ( g_conf.m_logDebugSpider )
log("spider: got winner pdocid=%"INT64" url=%s",
m_bestRequest->m_probDocId,
m_bestRequest->m_url);
// make it into a doledb record
char *p = m_doleBuf;
*(key_t *)p = doleKey;
p += sizeof(key_t);
int32_t recSize = m_bestRequest->getRecSize();
*(int32_t *)p = recSize;
p += 4;
gbmemcpy ( p , m_bestRequest , recSize );
p += recSize;
// sanity check
if ( p - m_doleBuf > (int32_t)MAX_DOLEREC_SIZE ) { char *xx=NULL;*xx=0; }
*/
// how did this happen?
//if ( ! m_msg1Avail ) { char *xx=NULL;*xx=0; }
char *doleBufEnd = doleBuf->getBuf();
// add it to doledb ip table now so that waiting tree does not
// immediately get another spider request from this same ip added
// to it while the msg4 is out. but if the add fails we totally bail
// with g_errno set
//
// crap, i think this could be slowing us down when spidering
// a single ip address. maybe use msg1 here not msg4?
//if ( ! addToDoleTable ( m_bestRequest ) ) return true;
// . MDW: now we have a list of doledb records in a SafeBuf:
// . scan the requests in safebuf
// get offset
char *p = doleBuf->getBufStart();
int32_t jump = *(int32_t *)p;
// sanity
if ( jump < 4 || jump > doleBuf->getLength() ) {
char *xx=NULL;*xx=0; }
// the jump includes itself
p += jump;
//for ( ; p < m_doleBuf.getBuf() ; ) {
// save it
char *doledbRec = p;
// first is doledbkey
p += sizeof(key_t);
// then size of spider request
p += 4;
// the spider request encapsulated
SpiderRequest *sreq3;
sreq3 = (SpiderRequest *)p;
// point "p" to next spiderrequest
p += sreq3->getRecSize();
// sanity
if ( p > doleBufEnd ) { char *xx=NULL;*xx=0; }
// for caching logic below, set this
int32_t doledbRecSize = sizeof(key_t) + 4 + sreq3->getRecSize();
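// rough sketch (illustrative only) of the dole buf layout consumed here:
//
//   [int32 jump]  offset from the buf start to the next unconsumed
//                 doledb record; >= 4 since it includes itself
//   [key_t doledbKey][int32 recSize][SpiderRequest]   <- record #0
//   [key_t doledbKey][int32 recSize][SpiderRequest]   <- record #1
//   ...
//
// each pass consumes one record and (below) bumps the jump so the cached
// buffer can be re-used without re-scanning spiderdb for this firstip.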
// process sreq3 by incrementing the firstip count in
// m_doleIpTable
if ( ! addToDoleTable ( sreq3 ) ) return true;
// only add the top key for now!
//break;
// // this logic is now in addToDoleTable()
// // . if it was empty it is no longer
// // . we have this flag here to avoid scanning empty doledb
// // priorities because it saves us a msg5 call to doledb in
// // the scanning loop
// //int32_t bp = sreq3->m_priority;//m_bestRequest->m_priority;
// //if ( bp < 0 ) { char *xx=NULL;*xx=0; }
// //if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
// //m_isDoledbEmpty [ bp ] = 0;
// }
// now cache the REST of the spider requests to speed up scanning.
// better than adding 400 recs per firstip to doledb because
// msg5's call to RdbTree::getList() is way faster.
// even if m_doleBuf is from the cache, re-add it to lose the
// top rec.
// allow this to add a 0 length record otherwise we keep the same
// old url in here and keep spidering it over and over again!
//bool addToCache = false;
//if( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
// if winnertree was empty, then we might have scanned like 10M
// twitter.com urls and not wanted any of them, so we don't want to
// have to keep redoing that!
//if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
RdbCache *wc = &g_spiderLoop.m_winnerListCache;
// remove from cache? if we added the last spider request in the
// cached dolebuf to doledb then remove it from cache so it's not
// a cached empty dolebuf and we end up recomputing it without the cache.
if ( isFromCache && p >= doleBufEnd ) {
//if ( addToCache ) { char *xx=NULL;*xx=0; }
// debug note
// if ( m_collnum == 18752 )
// log("spider: rdbcache: adding single byte. skipsize=%i"
// ,doledbRecSize);
// let's get this working right...
//wc->removeKey ( collnum , k , start );
//wc->markDeletedRecord(start);
// i don't think we can remove keys from cache so add
// a rec with a byte size of 1 to indicate for us to ignore.
// set the timestamp to 12345 so the getRecord above will
// not get it and promote it in the linked list.
char byte = 0;
key_t cacheKey;
cacheKey.n0 = firstIp;
cacheKey.n1 = 0;
//wc->verify();
wc->addRecord ( m_collnum,
(char *)&cacheKey,
&byte ,
1 ,
12345 );//cachedTimestamp );
//wc->verify();
}
// otherwise, if there are still records left in the dole buf, update
// the cache. (if it wasn't from the cache and there was only one
// record, there is obviously nothing left to cache.)
else if ( p < doleBufEnd ) { // if ( addToCache ) {
key_t cacheKey;
cacheKey.n0 = firstIp;
cacheKey.n1 = 0;
char *x = doleBuf->getBufStart();
// the new offset is the next record after the one we
// just added to doledb
int32_t newJump = (int32_t)(p - x);
int32_t oldJump = *(int32_t *)x;
// NO! we do a copy in rdbcache and copy the thing over
// since we promote it. so this won't work...
*(int32_t *)x = newJump;
if ( newJump >= doleBuf->getLength() ) { char *xx=NULL;*xx=0;}
if ( newJump < 4 ) { char *xx=NULL;*xx=0;}
if ( g_conf.m_logDebugSpider ) // || m_collnum == 18752 )
log("spider: rdbcache: updating "
"%"INT32" bytes of SpiderRequests "
"to winnerlistcache for ip %s oldjump=%"INT32
" newJump=%"INT32" ptr=0x%"PTRFMT,
doleBuf->length(),iptoa(firstIp),oldJump,
newJump,
(PTRTYPE)x);
//validateDoleBuf ( doleBuf );
//wc->verify();
// inherit timestamp. if 0, RdbCache will set to current time
// don't re-add just use the same modified buffer so we
// don't churn the cache.
// but do add it to cache if not already in there yet.
if ( ! isFromCache ) {
// if ( m_collnum == 18752 )
// log("spider: rdbcache: adding record a new "
// "dbufsize=%i",(int)doleBuf->length());
wc->addRecord ( m_collnum,
(char *)&cacheKey,
doleBuf->getBufStart(),//+ skipSize ,
doleBuf->length() ,//- skipSize ,
0);//cachedTimestamp );
}
//validateDoleBuf( doleBuf );
/*
// test it
char *testPtr;
int32_t testLen;
bool inCache2 = wc->getRecord ( m_collnum ,
(char *)&cacheKey ,
&testPtr,
&testLen,
false, // doCopy?
600, // maxAge,600 secs
true ,// incCounts
NULL , // rec timestamp
true ); // promote?
if ( ! inCache2 ) { char *xx=NULL;*xx=0; }
if ( testLen != m_doleBuf.length() ) {char *xx=NULL;*xx=0; }
if ( *(int32_t *)testPtr != newJump ){char *xx=NULL;*xx=0; }
SafeBuf tmp;
tmp.setBuf ( testPtr , testLen , testLen , false );
validateDoleBuf ( &tmp );
*/
//wc->verify();
}
// and the whole thing is no longer empty
//m_allDoledbPrioritiesEmpty = 0;//false;
//m_lastEmptyCheck = 0;
//
// delete the winner from ufntree as well
//
/*
int64_t buh48 = m_bestRequest->getUrlHash48();
key128_t bkey = makeUfnTreeKey ( m_bestRequest->m_firstIp ,
m_bestRequest->m_priority ,
m_bestSpiderTimeMS ,
buh48 );
// must be in tree!
int32_t node = s_ufnTree.getNextNode ( 0, (char *)&bkey );
// if this firstip had too few requests to make it into the
// tree then node will be < 0!
//if ( node < 0 ) { char *xx=NULL;*xx=0; }
if ( node >= 0 ) {
//log("deleting node #%"INT32" firstip=%s uh48=%"UINT64"",
// node,iptoa(firstIp),uh48);
s_ufnTree.deleteNode ( node , true );
}
*/
m_msg4Start = gettimeofdayInMilliseconds();
//RdbList *tmpList = &m_msg1.m_tmpList;
// keep it on stack now that doledb is tree-only
RdbList tmpList;
// only add one doledb record at a time now since we
// have the winnerListCache
//m_doleBuf.setLength ( skipSize );
//tmpList.setFromSafeBuf ( &m_doleBuf , RDB_DOLEDB );
tmpList.setFromPtr ( doledbRec , doledbRecSize , RDB_DOLEDB );
// now that doledb is tree-only and never dumps to disk, just
// add it directly
g_doledb.m_rdb.addList ( m_collnum , &tmpList , MAX_NICENESS );
if ( g_conf.m_logDebugSpider )
log("spider: adding doledb tree node size=%"INT32"",
doledbRecSize);
// and it happens right away. just add it locally.
bool status = true;
// . use msg4 to transmit our guys into the rdb, RDB_DOLEDB
// . no, use msg1 for speed, so we get it right away!!
// . we've already incremented doleiptable counts... so we need to
// make sure this happens!!!
/*
bool status = m_msg1.addList ( tmpList ,
RDB_DOLEDB ,
m_collnum ,
this ,
doledWrapper ,
false , // forcelocal?
// Rdb.cpp can't call dumpTree()
// if niceness is 0!
MAX_NICENESS);
// if it blocked set this to true so we do not reuse it
if ( ! status ) m_msg1Avail = false;
*/
int32_t storedFirstIp = (m_waitingTreeKey.n0) & 0xffffffff;
// log it
if ( g_conf.m_logDebugSpcache ) {
uint64_t spiderTimeMS = m_waitingTreeKey.n1;
spiderTimeMS <<= 32;
spiderTimeMS |= (m_waitingTreeKey.n0 >> 32);
logf(LOG_DEBUG,"spider: removing doled waitingtree key"
" spidertime=%"UINT64" firstIp=%s "
//"pri=%"INT32" "
//"url=%s"
,spiderTimeMS,
iptoa(storedFirstIp)
//(int32_t)m_bestRequest->m_priority,
//m_bestRequest->m_url);
);
}
// remove from waiting tree now so we do not try
// to re-add this firstIp to doledb...
m_waitingTree.deleteNode ( 0, (char *)&m_waitingTreeKey , true);
m_waitingTable.removeKey ( &storedFirstIp );
//log("spider: 3 del node for %s",iptoa(storedFirstIp));
// invalidate
m_waitingTreeKeyValid = false;
// sanity check
if ( ! m_waitingTable.m_isWritable ) { char *xx=NULL;*xx=0;}
// note that ip as being in dole table
if ( g_conf.m_logDebugSpider )
log("spider: added best sreq for ip=%s to doletable AND "
"removed from waiting table",
iptoa(firstIp));
// save memory
m_winnerTree.reset();
m_winnerTable.reset();
//validateDoleBuf( doleBuf );
// add did not block
return status;
}
uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
int32_t ufn,
SpiderReply *srep,
uint64_t nowGlobalMS ) {
// . get the scheduled spiderTime for it
// . assume this SpiderRequest never been successfully spidered
int64_t spiderTimeMS = ((uint64_t)sreq->m_addedTime) * 1000LL;
// how can added time be in the future? did admin set clock back?
//if ( spiderTimeMS > nowGlobalMS ) spiderTimeMS = nowGlobalMS;
// if injecting for first time, use that!
if ( ! srep && sreq->m_isInjecting ) return spiderTimeMS;
if ( ! srep && sreq->m_isPageReindex ) return spiderTimeMS;
//log("spider: getting spider time %"INT64, spiderTimeMS);
// to avoid hammering an ip, get last time we spidered it...
int64_t lastMS ;
lastMS = m_lastDownloadCache.getLongLong ( m_collnum ,
sreq->m_firstIp ,
-1 , // maxAge
true );// promote
// -1 means not found
if ( (int64_t)lastMS == -1 ) lastMS = 0;
// sanity
if ( (int64_t)lastMS < -1 ) {
log("spider: corrupt last time in download cache. nuking.");
lastMS = 0;
}
// min time we can spider it
int64_t minSpiderTimeMS1 = lastMS + m_cr->m_spiderIpWaits[ufn];
// if not found in cache. (note: lastMS was already normalized from -1
// to 0 above, so this check no longer fires; kept for safety.)
if ( lastMS == -1 ) minSpiderTimeMS1 = 0LL;
/////////////////////////////////////////////////
/////////////////////////////////////////////////
// crawldelay table check!!!!
/////////////////////////////////////////////////
/////////////////////////////////////////////////
int32_t *cdp = (int32_t *)m_cdTable.getValue ( &sreq->m_domHash32 );
int64_t minSpiderTimeMS2 = 0;
// limit to 60 seconds crawl delay.
// help fight SpiderReply corruption too
if ( cdp && *cdp > 60000 ) *cdp = 60000;
if ( cdp && *cdp >= 0 ) minSpiderTimeMS2 = lastMS + *cdp;
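// worked example with hypothetical numbers: if the last download for
// this firstIp finished at lastMS = 1,000,000,000,000 and the
// crawl-delay entry for the url's domain is *cdp = 5000 (ms), then
// minSpiderTimeMS2 = 1,000,000,005,000, i.e. we will not schedule
// another url from this domain sooner than 5 seconds after that download.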
// wait 5 seconds for all outlinks in order for them to have a
// chance to get any link info that might have been added
// from the page that supplied this outlink
// CRAP! this slows down same ip spidering i think... yeah, without
// this it seems the spiders are always at 10 (sometimes 8 or 9)
// when i spider techcrunch.com.
//spiderTimeMS += 5000;
// ensure min
if ( spiderTimeMS < minSpiderTimeMS1 ) spiderTimeMS = minSpiderTimeMS1;
if ( spiderTimeMS < minSpiderTimeMS2 ) spiderTimeMS = minSpiderTimeMS2;
// if no reply, use that
if ( ! srep ) return spiderTimeMS;
// if this is not the first try, then re-compute the spiderTime
// based on that last time
// sanity check
if ( srep->m_spideredTime <= 0 ) {
// a lot of times these are corrupt! wtf???
//spiderTimeMS = minSpiderTimeMS;
return spiderTimeMS;
//{ char*xx=NULL;*xx=0;}
}
// compute new spiderTime for this guy, in seconds
int64_t waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
// do not spider more than once per 15 seconds ever!
// no! might be a query reindex!!
if ( waitInSecs < 15 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
static bool s_printed = false;
if ( ! s_printed ) {
s_printed = true;
log("spider: min spider wait is 15 seconds, "
"not %"UINT64" (ufn=%"INT32")",waitInSecs,ufn);
}
waitInSecs = 15;//900; this was 15 minutes
}
// in fact, force docid based guys to be zero!
//if ( sreq->m_urlIsDocId ) waitInSecs = 0;
if ( sreq->m_isPageReindex ) waitInSecs = 0;
// when it was spidered
int64_t lastSpideredMS = ((uint64_t)srep->m_spideredTime) * 1000;
// . when we last attempted to spider it... (base time)
// . use a lastAttempt of 0 to indicate never!
// (first time)
int64_t minSpiderTimeMS3 = lastSpideredMS + (waitInSecs * 1000LL);
// ensure min
if ( spiderTimeMS < minSpiderTimeMS3 ) spiderTimeMS = minSpiderTimeMS3;
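// at this point spiderTimeMS is effectively
//   max ( sreq->m_addedTime * 1000 ,
//         lastDownloadMS + m_spiderIpWaits[ufn] ,
//         lastDownloadMS + crawlDelayMS ,
//         lastSpideredMS + m_spiderFreqs[ufn] converted to ms )
// i.e. the earliest time that satisfies the add time, the per-ip wait,
// the domain crawl delay and the respider frequency of the url filter
// (with the 15 second floor applied above).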
// sanity
if ( (int64_t)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; }
return spiderTimeMS;
}
// . returns false with g_errno set on error
// . Rdb.cpp should call this when it receives a doledb key
// . when trying to add a SpiderRequest to the waiting tree we first check
// the doledb table to see if doledb already has an sreq from this firstIp
// . therefore, we should add the ip to the dole table before we launch the
// Msg4 request to add it to doledb, that way we don't add a bunch from the
// same firstIP to doledb
bool SpiderColl::addToDoleTable ( SpiderRequest *sreq ) {
// update how many per ip we got doled
int32_t *score = (int32_t *)m_doleIpTable.getValue32 ( sreq->m_firstIp );
// debug point
if ( g_conf.m_logDebugSpider ){//&&1==2 ) { // disable for now, spammy
int64_t uh48 = sreq->getUrlHash48();
int64_t pdocid = sreq->getParentDocId();
int32_t ss = 1;
if ( score ) ss = *score + 1;
// if for some reason this collides with another key
// already in doledb then our counts are off
log("spider: added to doletbl uh48=%"UINT64" parentdocid=%"UINT64" "
"ipdolecount=%"INT32" ufn=%"INT32" priority=%"INT32" firstip=%s",
uh48,pdocid,ss,(int32_t)sreq->m_ufn,(int32_t)sreq->m_priority,
iptoa(sreq->m_firstIp));
}
// we had a score there already, so inc it
if ( score ) {
// inc it
*score = *score + 1;
// sanity check
if ( *score <= 0 ) { char *xx=NULL;*xx=0; }
// only one per ip!
// not any more! we allow MAX_WINNER_NODES per ip!
if ( *score > MAX_WINNER_NODES )
log("spider: crap. had %"INT32" recs in doledb for %s "
"from %s."
"how did this happen?",
(int32_t)*score,m_coll,iptoa(sreq->m_firstIp));
// now we log it too
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"spider: added ip=%s to doleiptable "
"(score=%"INT32")",
iptoa(sreq->m_firstIp),*score);
}
else {
// ok, add new slot
int32_t val = 1;
if ( ! m_doleIpTable.addKey ( &sreq->m_firstIp , &val ) ) {
// log it, this is bad
log("spider: failed to add ip %s to dole ip tbl",
iptoa(sreq->m_firstIp));
// return false with g_errno set on error
return false;
}
// now we log it too
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"spider: added ip=%s to doleiptable "
"(score=1)",iptoa(sreq->m_firstIp));
// sanity check
//if ( ! m_doleIpTable.m_isWritable ) { char *xx=NULL;*xx=0;}
}
// . these priority slots in doledb are not empty
// . unmark individual priority buckets
// . do not skip them when scanning for urls to spider
int32_t pri = sreq->m_priority;
m_isDoledbEmpty[pri] = 0;
// reset scan for this priority in doledb
m_nextKeys [pri] =g_doledb.makeFirstKey2 ( pri );
return true;
}
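// usage note: the count stored in m_doleIpTable for a firstIp is simply
// how many of its SpiderRequests currently sit in doledb; a count above
// MAX_WINNER_NODES is logged above as a likely accounting bug. the
// matching decrement happens when the corresponding doledb record is
// removed again (that code path is not shown here).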
/////////////////////////
///////////////////////// UTILITY FUNCTIONS
/////////////////////////
// . map a spiderdb rec to the shard # that should spider it
// . "sr" can be a SpiderRequest or SpiderReply
// . shouldn't this use Hostdb::getShardNum()?
/*
uint32_t getShardToSpider ( char *sr ) {
// use the url hash
int64_t uh48 = g_spiderdb.getUrlHash48 ( (key128_t *)sr );
// host to dole it based on ip
int32_t hostId = uh48 % g_hostdb.m_numHosts ;
// get it
Host *h = g_hostdb.getHost ( hostId ) ;
// and return groupid
return h->m_groupId;
}
*/
// does this firstIp belong to this host (i.e. in our spider cache)?
bool isAssignedToUs ( int32_t firstIp ) {
// sanity check... must be in our group.. we assume this much
//if ( g_spiderdb.getGroupId(firstIp) != g_hostdb.m_myHost->m_groupId){
// char *xx=NULL;*xx=0; }
// . host to dole it based on ip
// . ignore lower 8 bits of ip since one guy often owns a whole block!
//int32_t hostId=(((uint32_t)firstIp) >> 8) % g_hostdb.getNumHosts();
if( !g_hostdb.getMyHost()->m_spiderEnabled ) return false;
// get our group
//Host *group = g_hostdb.getMyGroup();
Host *shard = g_hostdb.getMyShard();
// pick a host in our group
// if not dead return it
//if ( ! g_hostdb.isDead(hostId) ) return hostId;
// get that host
//Host *h = g_hostdb.getHost(hostId);
// get the group
//Host *group = g_hostdb.getGroup ( h->m_groupId );
// and number of hosts in the group
int32_t hpg = g_hostdb.getNumHostsPerShard();
// let's mix it up since spider shard was selected using this
// same mod on the firstIp method!!
uint64_t h64 = firstIp;
unsigned char c = firstIp & 0xff;
h64 ^= g_hashtab[c][0];
// select the next host number to try
//int32_t next = (((uint32_t)firstIp) >> 16) % hpg ;
// hash to a host
int32_t i = ((uint32_t)h64) % hpg;
Host *h = &shard[i];
// return that if alive
if ( ! g_hostdb.isDead(h) && h->m_spiderEnabled) {
return (h->m_hostId == g_hostdb.m_hostId);
}
// . select another otherwise
// . put all alive in an array now
Host *alive[64];
int32_t upc = 0;
for ( int32_t j = 0 ; j < hpg ; j++ ) {
Host *h = &shard[j];
if ( g_hostdb.isDead(h) ) continue;
if( ! h->m_spiderEnabled ) continue;
alive[upc++] = h;
}
// if none are alive, that is bad! log it and say it is not assigned to us
if ( upc == 0 ) {
log("spider: no hosts can handle spider request for ip=%s", iptoa(firstIp));
return false;
//return (h->m_hostId == g_hostdb.m_hostId);
}
// select from the good ones now
i = ((uint32_t)firstIp) % upc;
// get that
h = alive[i]; //&shard[i];
// guaranteed to be alive... kinda
return (h->m_hostId == g_hostdb.m_hostId);
}
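// worked example with hypothetical numbers: with 2 hosts per shard
// (hpg = 2) and some firstIp, h64 is the firstIp xor'd with a hash table
// value keyed off its low byte; if ((uint32_t)h64) % 2 == 1 then host #1
// of our shard owns this firstIp and isAssignedToUs() returns true only
// on that host, falling back to the surviving hosts of the shard if it
// is dead or has spidering disabled.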
/////////////////////////
///////////////////////// SPIDERLOOP
/////////////////////////
static void indexedDocWrapper ( void *state ) ;
static void doneSleepingWrapperSL ( int fd , void *state ) ;
// a global class extern'd in .h file
SpiderLoop g_spiderLoop;
SpiderLoop::SpiderLoop ( ) {
m_crx = NULL;
// clear array of ptrs to Doc's
memset ( m_docs , 0 , sizeof(XmlDoc *) * MAX_SPIDERS );
}
SpiderLoop::~SpiderLoop ( ) { reset(); }
// free all doc's
void SpiderLoop::reset() {
// delete all doc's in use
for ( int32_t i = 0 ; i < MAX_SPIDERS ; i++ ) {
if ( m_docs[i] ) {
mdelete ( m_docs[i] , sizeof(XmlDoc) , "Doc" );
delete (m_docs[i]);
}
m_docs[i] = NULL;
//m_lists[i].freeList();
}
m_list.freeList();
m_lockTable.reset();
m_lockCache.reset();
m_winnerListCache.reset();
}
void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) ;
void SpiderLoop::startLoop ( ) {
//m_cri = 0;
m_crx = NULL;
m_activeListValid = false;
m_activeListModified = false;
m_activeList = NULL;
m_recalcTime = 0;
m_recalcTimeValid = false;
// falsify this flag
m_outstanding1 = false;
// not flushing
m_msg12.m_gettingLocks = false;
// we aren't in the middle of waiting to get a list of SpiderRequests
m_gettingDoledbList = false;
// we haven't registered for sleeping yet
m_isRegistered = false;
// clear array of ptrs to Doc's
memset ( m_docs , 0 , sizeof(XmlDoc *) * MAX_SPIDERS );
// . m_maxUsed is the largest i such that m_docs[i] is in use
// . -1 means there are no used m_docs's
m_maxUsed = -1;
m_numSpidersOut = 0;
m_processed = 0;
// for locking. key size is 8 for easier debugging
m_lockTable.set ( 8,sizeof(UrlLock),0,NULL,0,false,MAX_NICENESS,
"splocks", true ); // useKeyMagic? yes.
if ( ! m_lockCache.init ( 20000 , // maxcachemem
4 , // fixedatasize
false , // supportlists?
1000 , // maxcachenodes
false , // use half keys
"lockcache", // dbname
false ) )
log("spider: failed to init lock cache. performance hit." );
if ( ! m_winnerListCache.init ( 20000000 , // maxcachemem, 20MB
-1 , // fixedatasize
false , // supportlists?
10000 , // maxcachenodes
false , // use half keys
"winnerspidercache", // dbname
false ) )
log("spider: failed to init winnerlist cache. slows down.");
// dole some out
//g_spiderLoop.doleUrls1();
// spider some urls that were doled to us
//g_spiderLoop.spiderDoledUrls( );
// sleep for .1 seconds = 100ms
if (!g_loop.registerSleepCallback(50,this,doneSleepingWrapperSL))
log("build: Failed to register timer callback. Spidering "
"is permanently disabled. Restart to fix.");
// crawlinfo updating
// save bandwidth: for now make this every 4 seconds, not 1 second,
// then try not to send crawlinfo the host should already have.
// each collrec can have a checksum for each host of the last
// info we sent it. but we should resend all every 100 secs anyway
// in case a host went dead.
// now that we only send the info on startup and when it changes,
// let's move back down to 1 second.
// . make it 20 seconds because handlerequestc1 is always on the
// profiler when we have thousands of collections
if ( !g_loop.registerSleepCallback(20000,
this,
updateAllCrawlInfosSleepWrapper))
log("build: failed to register updatecrawlinfowrapper");
}
// called every 50ms or so to try to spider urls and populate doledb
// from the waiting tree
void doneSleepingWrapperSL ( int fd , void *state ) {
//SpiderLoop *THIS = (SpiderLoop *)state;
// dole some out
//g_spiderLoop.doleUrls1();
// if spidering disabled then do not do this crap
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
// wait for clock to sync with host #0
if ( ! isClockInSync() ) {
// let admin know why we are not spidering
static char s_printed = false;
if ( ! s_printed ) {
logf(LOG_DEBUG,"spider: NOT SPIDERING until clock "
"is in sync with host #0.");
s_printed = true;
}
return;
}
//if ( g_hostdb.hasDeadHost() ) return;
static int32_t s_count = -1;
// count these calls
s_count++;
int32_t now = getTimeLocal();
// reset SpiderColl::m_didRound and m_nextDoledbKey if it is maxed
// because we might have had a lock collision
//int32_t nc = g_collectiondb.m_numRecs;
redo:
// point to head of active linked list of collection recs
CollectionRec *nextActive = g_spiderLoop.getActiveList();
collnum_t nextActiveCollnum ;
if ( nextActive ) nextActiveCollnum = nextActive->m_collnum;
//for ( int32_t i = 0 ; i < nc ; i++ ) {
for ( ; nextActive ; ) {
// breathe
QUICKPOLL(MAX_NICENESS);
// before we assign crp to nextActive, ensure that it did
// not get deleted on us.
// if the next collrec got deleted, tr will be NULL
CollectionRec *tr = g_collectiondb.getRec( nextActiveCollnum );
// if it got deleted or restarted then it will not
// match most likely
if ( tr != nextActive ) {
// this shouldn't happen much so log it
log("spider: collnum %"INT32" got deleted. "
"rebuilding active list",
(int32_t)nextActiveCollnum);
// rebuild the active list now
goto redo;
}
// now we become him
CollectionRec *crp = nextActive;
// update these two vars for next iteration
nextActive = crp->m_nextActive;
nextActiveCollnum = -1;
if ( nextActive ) nextActiveCollnum = nextActive->m_collnum;
// if list was modified a collection was deleted/added
//if ( g_spiderLoop.m_activeListModified ) goto top;
// // get collectionrec
// CollectionRec *cr = g_collectiondb.getRec(i);
// if ( ! cr ) continue;
// skip if not enabled
if ( ! crp->m_spideringEnabled ) continue;
// get it
//SpiderColl *sc = cr->m_spiderColl;
SpiderColl *sc = g_spiderCache.getSpiderColl(crp->m_collnum);
// skip if none
if ( ! sc ) continue;
// also scan spiderdb to populate waiting tree now but
// only one read per 100ms!!
// MDW: try taking this out
//if ( (s_count % 10) == 0 ) {
// always do a scan at startup & every 24 hrs
// AND at process startup!!!
if ( ! sc->m_waitingTreeNeedsRebuild &&
now - sc->m_lastScanTime > 24*3600) {
// if a scan is ongoing, this will re-set it
sc->m_nextKey2.setMin();
sc->m_waitingTreeNeedsRebuild = true;
log(LOG_INFO,
"spider: hit spider queue "
"rebuild timeout for %s (%"INT32")",
crp->m_coll,(int32_t)crp->m_collnum);
// flush the ufn table
//clearUfnTable();
}
// try this then. it just returns if
// sc->m_waitingTreeNeedsRebuild is false so it
// should be fast in those cases
sc->populateWaitingTreeFromSpiderdb ( false );
//}
// if list was modified a collection was deleted/added
//if ( g_spiderLoop.m_activeListModified ) goto top;
// re-entry is false because we are entering for the first time
sc->populateDoledbFromWaitingTree ( );
// . skip if still loading doledb lists from disk this round
// . we use m_didRound to share spiders across all collections
//if ( ! sc->m_didRound ) continue;
// ensure at the top!
//if( sc->m_pri2!=MAX_SPIDER_PRIORITIES-1){char*xx=NULL;*xx=0;}
// ok, reset it so it can start a new doledb scan
//sc->m_didRound = false;
// reset this as well. if there are no spiderRequests
// available on any priority level for this collection,
// then it will remain true. but if we attempt to spider
// a url, or can't spider a url b/c of a max outstanding
// constraint, we set this to false. this is used to
// send notifications when a crawl is basically in hiatus.
//sc->m_encounteredDoledbRecs = false;
//sc->m_nextDoledbKey.setMin();
// if list was modified a collection was deleted/added
//if ( g_spiderLoop.m_activeListModified ) goto top;
}
// set initial priority to the highest to start spidering there
//g_spiderLoop.m_pri = MAX_SPIDER_PRIORITIES - 1;
// if recently called, do not call again from the sleep wrapper
int64_t nowms = gettimeofdayInMillisecondsLocal();
if ( nowms - g_spiderLoop.m_lastCallTime < 50 )
return;
// if we have a ton of collections, reduce cpu load from calling
// spiderDoledUrls()
static uint64_t s_skipCount = 0;
s_skipCount++;
// so instead of every 50ms make it every 200ms if we got
// 100+ collections in use.
//CollectionRec *tmp = g_spiderLoop.getActiveList();
g_spiderLoop.getActiveList();
int32_t activeListCount = g_spiderLoop.m_activeListCount;
if ( ! g_spiderLoop.m_activeListValid )
activeListCount = 0;
int32_t skip = 1;
if ( activeListCount >= 50 )
skip = 2;
if ( activeListCount >= 100 )
skip = 4;
if ( activeListCount >= 200 )
skip = 8;
if ( ( s_skipCount % skip ) != 0 )
return;
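// worked example: with 150 active collections skip is 4, so combined
// with the 50ms sleep callback above spiderDoledUrls() ends up being
// called roughly every 200ms instead of every 50ms.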
// spider some urls that were doled to us
g_spiderLoop.spiderDoledUrls( );
}
void doneSendingNotification ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
collnum_t collnum = ei->m_collnum;
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( cr != ei->m_collRec ) cr = NULL;
char *coll = "lostcoll";
if ( cr ) coll = cr->m_coll;
log(LOG_INFO,"spider: done sending notifications for coll=%s (%i)",
coll,(int)ei->m_collnum);
// all done if collection was deleted from under us
if ( ! cr ) return;
// do not re-call this stuff
cr->m_sendingAlertInProgress = false;
// we can re-use the EmailInfo class now
// pingserver.cpp sets this
//ei->m_inUse = false;
// so we do not send for this status again, mark it as sent for
// but we reset sentCrawlDoneAlert to 0 on round increment below
//log("spider: setting sentCrawlDoneAlert status to %"INT32"",
// (int32_t)cr->m_spiderStatus);
// mark it as sent. anytime a new url is spidered will mark this
// as false again! use LOCAL crawlInfo, since global is reset often.
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 1;//cr->m_spiderStatus;//1;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
// sanity
if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }
// i guess each host advances its own round... so take this out
// sanity check
//if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
//float respiderFreq = -1.0;
float respiderFreq = cr->m_collectiveRespiderFrequency;
// if not REcrawling, set this to 0 so we at least update our
// round # and round start time...
if ( respiderFreq < 0.0 ) //== -1.0 )
respiderFreq = 0.0;
//if ( respiderFreq < 0.0 ) {
// log("spider: bad respiderFreq of %f. making 0.",
// respiderFreq);
// respiderFreq = 0.0;
//}
// advance round if that round has completed, or there are no
// more urls to spider. if we hit maxToProcess/maxToCrawl then
// do not increment the round #. otherwise we should increment it.
// do allow maxtocrawl guys through if they repeat, however!
//if(cr->m_spiderStatus == SP_MAXTOCRAWL && respiderFreq <= 0.0)return;
//if(cr->m_spiderStatus == SP_MAXTOPROCESS && respiderFreq<=0.0)return;
////////
//
// . we are here because hasUrlsReadyToSpider is false
// . we just got done sending an email alert
// . now increment the round only if doing rounds!
//
///////
// if not doing rounds, keep the round 0. they might want to up
// their maxToCrawl limit or something.
if ( respiderFreq <= 0.0 ) return;
// if we hit the max to crawl rounds, then stop!!! do not
// increment the round...
if ( cr->m_spiderRoundNum >= cr->m_maxCrawlRounds &&
// there was a bug when maxCrawlRounds was 0, which should
// mean NO max, so fix that here:
cr->m_maxCrawlRounds > 0 ) return;
// this should have been set below
//if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
// find the "respider frequency" from the first line in the url
// filters table whose expressions contains "{roundstart}" i guess
//for ( int32_t i = 0 ; i < cr->m_numRegExs ; i++ ) {
// // get it
// char *ex = cr->m_regExs[i].getBufStart();
// // compare
// if ( ! strstr ( ex , "roundstart" ) ) continue;
// // that's good enough
// respiderFreq = cr->m_spiderFreqs[i];
// break;
//}
int32_t seconds = (int32_t)(respiderFreq * 24*3600);
// add 1 for lastspidertime round off errors so we can be assured
// all spiders have a lastspidertime LESS than the new
// m_spiderRoundStartTime we set below.
if ( seconds <= 0 ) seconds = 1;
// now update this round start time. all the other hosts should
// sync with us using the parm sync code, msg3e, every 13.5 seconds.
//cr->m_spiderRoundStartTime += respiderFreq;
char roundTime[128];
sprintf(roundTime,"%"UINT32"", (uint32_t)(getTimeGlobal() + seconds));
// roundNum++ round++
char roundStr[128];
sprintf(roundStr,"%"INT32"", cr->m_spiderRoundNum + 1);
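// worked example: with respiderFreq = 1.0 (days) seconds is 86400, so
// roundTime becomes now + 86400 and roundStr becomes the current round
// number plus one; both are broadcast below as parm updates to the
// whole cluster.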
// waiting tree will usually be empty for this coll since no
// spider requests had a valid spider priority, so let's rebuild!
// this is not necessary because PF_REBUILD is set for the
// "spiderRoundStart" parm in Parms.cpp so it will rebuild if that parm
// changes already.
//if ( cr->m_spiderColl )
// cr->m_spiderColl->m_waitingTreeNeedsRebuild = true;
// we have to send these two parms to all in cluster now INCLUDING
// ourselves, when we get it in Parms.cpp there's special
// code to set this *ThisRound counts to 0!!!
SafeBuf parmList;
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundStr,-1 ,
"spiderRoundNum");
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundTime, -1 ,
"spiderRoundStart");
//g_parms.addParmToList1 ( &parmList , cr , "spiderRoundNum" );
//g_parms.addParmToList1 ( &parmList , cr , "spiderRoundStart" );
// this uses msg4 so parm ordering is guaranteed
g_parms.broadcastParmList ( &parmList , NULL , NULL );
// log it
log("spider: new round #%"INT32" starttime = %"UINT32" for %s"
, cr->m_spiderRoundNum
, (uint32_t)cr->m_spiderRoundStartTime
, cr->m_coll
);
}
bool sendNotificationForCollRec ( CollectionRec *cr ) {
// do not send email for maxrounds hit, it will send a round done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and increment the round twice
//if ( cr->m_spiderStatus == SP_MAXROUNDS ) {
// log("spider: not sending email for max rounds limit "
// "since already sent for round done.");
// return true;
//}
// wtf? caller must set this
if ( ! cr->m_spiderStatus ) { char *xx=NULL; *xx=0; }
log(LOG_INFO,
"spider: sending notification for crawl status %"INT32" in "
"coll %s. "
//"current sent state is %"INT32""
,(int32_t)cr->m_spiderStatus
,cr->m_coll
//cr->m_spiderStatusMsg,
//,(int32_t)cr->m_localCrawlInfo.m_sentCrawlDoneAlert);
);
// if we already sent it return now. we set this to false every time
// we spider a url, which resets it. use local crawlinfo for this
// since we reset global.
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) return true;
if ( cr->m_sendingAlertInProgress ) return true;
// ok, send it
//EmailInfo *ei = &cr->m_emailInfo;
EmailInfo *ei = (EmailInfo *)mcalloc ( sizeof(EmailInfo),"eialrt");
if ( ! ei ) {
log("spider: could not send email alert: %s",
mstrerror(g_errno));
return true;
}
// in use already?
//if ( ei->m_inUse ) return true;
// pingserver.cpp sets this
//ei->m_inUse = true;
// set it up
ei->m_finalCallback = doneSendingNotification;
ei->m_finalState = ei;
ei->m_collnum = cr->m_collnum;
// collnums can be recycled, so store the ptr too to ensure it is the same collection
ei->m_collRec = cr;
SafeBuf *buf = &ei->m_spiderStatusMsg;
// stop it from accumulating status msgs
buf->reset();
int32_t status = -1;
getSpiderStatusMsg ( cr , buf , &status );
// if no email address or webhook provided this will not block!
// DISABLE THIS UNTIL FIXED
//log("spider: SENDING EMAIL NOT");
// do not re-call this stuff
cr->m_sendingAlertInProgress = true;
// ok, put it back...
if ( ! sendNotification ( ei ) ) return false;
// it did not block, so call the done callback ourselves in that case:
doneSendingNotification ( ei );
return true;
}
// we need to update crawl info for collections that
// have urls ready to spider
SpiderColl *getNextSpiderColl ( int32_t *cri ) ;
void gotDoledbListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 ) ;
//////////////////////////
//////////////////////////
//
// The second KEYSTONE function.
//
// Scans doledb and spiders the doledb records.
//
// Doledb records contain SpiderRequests ready for spidering NOW.
//
// 1. gets all locks from all hosts in the shard
// 2. sends confirm msg to all hosts if lock acquired:
// - each host will remove from doledb then
// - assigned host will also add new "0" entry to waiting tree if need be
// - calling addToWaitingTree() will trigger populateDoledbFromWaitingTree()
// to add a new entry into waiting tree, not the one just locked.
// 3. makes a new xmldoc class for that url and calls indexDoc() on it
//
//////////////////////////
//////////////////////////
// now check our RDB_DOLEDB for SpiderRequests to spider!
void SpiderLoop::spiderDoledUrls ( ) {
//char *reb = g_rebalance.getNeedsRebalance();
//if ( ! reb || *reb ) {return;
//if ( g_conf.m_logDebugSpider )
// log("spider: trying to get a doledb rec to spider. "
// "currentnumout=%"INT32"",m_numSpidersOut);
m_lastCallTime = gettimeofdayInMillisecondsLocal();
// when getting a lock we keep a ptr to the SpiderRequest in the
// doledb list, so do not try to read more just yet until we know
// if we got the lock or not
if ( m_msg12.m_gettingLocks ) {
// this should no longer happen
char *xx=NULL; *xx=0;
// make a note, maybe this is why spiders are deficient?
if ( g_conf.m_logDebugSpider )
log("spider: failed to get doledb rec to spider: "
"msg12 is getting locks");
return;
}
// turn on for now
//g_conf.m_logDebugSpider = 1;
collLoop:
// start again at head if this is NULL
if ( ! m_crx ) m_crx = getActiveList();
bool firstTime = true;
// detect overlap
//CollectionRec *start = m_crx;
m_bookmark = m_crx;
// log this now
//logf(LOG_DEBUG,"spider: getting collnum to dole from");
// get this
m_sc = NULL;
// avoid infinite loops
//int32_t count = g_collectiondb.m_numRecs;
// set this in the loop
CollectionRec *cr = NULL;
uint32_t nowGlobal = 0;
// debug
//log("spider: m_cri=%"INT32"",(int32_t)m_cri);
// . get the next collection to spider
// . just alternate them
//for ( ; count > 0 ; m_cri++ , count-- ) {
m_launches = 0;
subloop:
QUICKPOLL(MAX_NICENESS);
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
// one host is probably down and was never up to begin with
if ( ! s_countsAreValid ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// if we do not overlap ourselves
if ( m_gettingDoledbList ) return;
// bail instantly if in read-only mode (no RdbTrees!)
if ( g_conf.m_readOnlyMode ) return;
// or if doing a daily merge
if ( g_dailyMerge.m_mergeMode ) return;
// skip if too many udp slots being used
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
// stop if too many out. this is now 50 down from 500.
if ( m_numSpidersOut >= MAX_SPIDERS ) return;
// a new global conf rule
if ( m_numSpidersOut >= g_conf.m_maxTotalSpiders ) return;
// bail if no collections
if ( g_collectiondb.m_numRecs <= 0 ) return;
// not while repairing
if ( g_repairMode ) return;
// do not spider until collections/parms in sync with host #0
if ( ! g_parms.m_inSyncWithHost0 ) return;
// don't spider if not all hosts are up, or they do not all
// have the same hosts.conf.
if ( ! g_pingServer.m_hostsConfInAgreement ) return;
// if nothin in the active list then return as well
if ( ! m_activeList ) return;
// if we hit the end of the list, wrap it around
if ( ! m_crx ) m_crx = m_activeList;
// we use m_bookmark to determine when we've done a round over all
// the collections. but it will be set to null sometimes when we
// are in this loop because the active list gets recomputed. so
// if we lost it because our bookmarked collection is no longer
// 'active' then just set it to the list head i guess
if ( ! m_bookmark || ! m_bookmark->m_isActive )
m_bookmark = m_activeList;
// i guess return at the end of the linked list if no collection
// launched a spider... otherwise do another cycle to launch another
// spider. i could see a single collection dominating all the spider
// slots in some scenarios with this approach unfortunately.
if ( m_crx == m_bookmark && ! firstTime && m_launches == 0 )
return;
// reset # launches after doing a round and having launched > 0
if ( m_crx == m_bookmark && ! firstTime )
m_launches = 0;
firstTime = false;
// if a collection got deleted re-calc the active list so
// we don't core trying to access a deleted collectionrec.
// i'm not sure if this can happen here but i put this in as a
// precaution.
if ( ! m_activeListValid ) { m_crx = NULL; goto collLoop; }
// return now if list is just empty
if ( ! m_activeList ) return;
cr = m_crx;
// advance for next time we call goto subloop;
m_crx = m_crx->m_nextActive;
// get the spider collection for this collnum
m_sc = g_spiderCache.getSpiderColl(cr->m_collnum);//m_cri);
// skip if none
if ( ! m_sc ) goto subloop;
// always reset priority to max at start
m_sc->setPriority ( MAX_SPIDER_PRIORITIES - 1 );
subloopNextPriority:
QUICKPOLL(MAX_NICENESS);
// wrap it if we should
//if ( m_cri >= g_collectiondb.m_numRecs ) m_cri = 0;
// get rec
//cr = g_collectiondb.m_recs[m_cri];
// skip if gone
if ( ! cr ) goto subloop;
// stop if not enabled
if ( ! cr->m_spideringEnabled ) goto subloop;
// hit crawl round max?
if ( cr->m_maxCrawlRounds > 0 &&
cr->m_isCustomCrawl &&
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
// should we resend our local crawl info to all hosts?
if ( cr->m_localCrawlInfo.m_hasUrlsReadyToSpider )
cr->localCrawlInfoUpdate();
// mark it as false
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false;
// prevent having to save all the time
if ( cr->m_spiderStatus != SP_MAXROUNDS ) {
cr->m_needsSave = true;
cr->m_spiderStatus = SP_MAXROUNDS;
}
goto subloop;
}
// hit pages to crawl max?
if ( cr->m_maxToCrawl > 0 &&
cr->m_isCustomCrawl == 1 && // bulk jobs exempt!
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound >=
cr->m_maxToCrawl ) {
// should we resend our local crawl info to all hosts?
if ( cr->m_localCrawlInfo.m_hasUrlsReadyToSpider )
cr->localCrawlInfoUpdate();
// now once all hosts have no urls ready to spider
// then the send email code will be called.
// do it this way for code simplicity.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false;
// prevent having to save all the time
if ( cr->m_spiderStatus != SP_MAXTOCRAWL ) {
cr->m_needsSave = true;
cr->m_spiderStatus = SP_MAXTOCRAWL;
}
goto subloop;
}
// hit pages to process max?
if ( cr->m_maxToProcess > 0 &&
cr->m_isCustomCrawl == 1 && // bulk jobs exempt!
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound >=
cr->m_maxToProcess ) {
// should we resend our local crawl info to all hosts?
if ( cr->m_localCrawlInfo.m_hasUrlsReadyToSpider )
cr->localCrawlInfoUpdate();
// now we can update it
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false;
// prevent having to save all the time
if ( cr->m_spiderStatus != SP_MAXTOPROCESS ) {
cr->m_needsSave = true;
cr->m_spiderStatus = SP_MAXTOPROCESS;
}
goto subloop;
}
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// . if nothing left to spider...
// . this is what makes us fast again! but the problem
// is that if they change the url filters so that
// something becomes ready to spider again we won't know
// because we do not set this flag back to true in
// Parms.cpp "doRebuild" because we don't want to get
// another email alert if there is nothing ready to spider
// after they change the parms. perhaps we should set
// the # of urls spidered to the "sentEmail" flag so we
// know if that changes to send another...
// . bug in this...
//if ( cr->m_isCustomCrawl && ! ci->m_hasUrlsReadyToSpider )
// continue;
// get the spider collection for this collnum
//m_sc = g_spiderCache.getSpiderColl(cr->m_collnum);//m_cri);
// skip if none
//if ( ! m_sc ) goto subloop;
// skip if we completed the doledb scan for every spider
// priority in this collection
// MDW: this was the only thing based on the value of
// m_didRound, but it was preventing addSpiderReply() from
// calling g_spiderLoop.spiderDoledUrls( ) to launch another
// spider. but now since we use m_crx logic and not
// m_didRound logic to spread spiders equally over all
// collections, let's try without this
//if ( m_sc->m_didRound ) goto subloop;
// set current time, synced with host #0
nowGlobal = (uint32_t)getTimeGlobal();
// the last time we attempted to spider a url for this coll
//m_sc->m_lastSpiderAttempt = nowGlobal;
// now we save this so when we restart these two times
// are from where we left off so we do not end up setting
// hasUrlsReadyToSpider to true which in turn sets
// the sentEmailAlert flag to false, which makes us
// send ANOTHER email alert!!
ci->m_lastSpiderAttempt = nowGlobal;
// sometimes our read of spiderdb to populate the waiting
// tree using evalIpLoop() takes a LONG time because
// a niceness 0 thread is taking a LONG time! so do not
// set hasUrlsReadyToSpider to false because of that!!
if ( m_sc->m_gettingList1 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( ci->m_lastSpiderCouldLaunch == 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
//
// . if doing respider with roundstarttime....
// . roundstarttime is > 0 if m_collectiveRespiderFrequency
// is > 0, unless it has not been set to current time yet
// . if m_collectiveRespiderFrequency was set to 0.0 then
// PageCrawlBot.cpp also sets m_roundStartTime to 0.
//
if ( nowGlobal < cr->m_spiderRoundStartTime ) goto subloop;
// if populating this collection's waitingtree assume
// we would have found something to launch as well. it might
// mean the waitingtree-saved.dat file was deleted from disk
// so we need to rebuild it at startup.
if ( m_sc->m_waitingTreeNeedsRebuild )
ci->m_lastSpiderCouldLaunch = nowGlobal;
// get max spiders
int32_t maxSpiders = cr->m_maxNumSpiders;
if ( m_sc->m_isTestColl ) {
// parser does one at a time for consistency
if ( g_conf.m_testParserEnabled ) maxSpiders = 1;
// need to make it 6 since some priorities essentially
// lock the ips up that have urls in higher
// priorities. i.e. once we dole a url out for ip X,
// then if later we add a high priority url for IP X it
// can't get spidered until the one that is doled does.
if ( g_conf.m_testSpiderEnabled ) maxSpiders = 6;
}
// if some spiders are currently outstanding
if ( m_sc->m_spidersOut )
// do not end the crawl until empty of urls because
// that url might end up adding more links to spider
// when it finally completes
ci->m_lastSpiderCouldLaunch = nowGlobal;
// debug log
//if ( g_conf.m_logDebugSpider )
// log("spider: has %"INT32" spiders out",m_sc->m_spidersOut);
// obey max spiders per collection too
if ( m_sc->m_spidersOut >= maxSpiders )
goto subloop;
// shortcut
SpiderColl *sc = cr->m_spiderColl;
if ( sc && sc->m_doleIpTable.isEmpty() )
goto subloop;
/*
// . HACK.
// . TODO: we set spidercoll->m_gotDoledbRec to false above,
// then make Rdb.cpp set spidercoll->m_gotDoledbRec to
// true if it receives a rec. then if we see that it
// got set to true do not update m_nextKeys[i] or
// m_isDoledbEmpty[i] etc. because we might have missed
// the new doledb rec coming in.
// . reset our doledb empty timer every 3 minutes and also
// . reset our doledb empty status
int32_t wait = SPIDER_DONE_TIMER - 10;
if ( wait < 10 ) { char *xx=NULL;*xx=0; }
if ( sc && nowGlobal - sc->m_lastEmptyCheck >= wait ) {
// assume doledb not empty
sc->m_allDoledbPrioritiesEmpty = 0;
// reset the timer
sc->m_lastEmptyCheck = nowGlobal;
// reset all empty flags for each priority
for ( int32_t i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) {
sc->m_nextKeys[i] = g_doledb.makeFirstKey2(i);
sc->m_isDoledbEmpty[i] = 0;
}
}
// . if all doledb priorities are empty, skip it quickly
// . do this only after we update lastSpiderAttempt above
// . this is broken!! why??
if ( sc && sc->m_allDoledbPrioritiesEmpty >= 3 )
continue;
*/
// ok, we are good to launch a spider for coll m_cri
//break;
//}
// if none, bail, wait for sleep wrapper to re-call us later on
//if ( count == 0 ) return;
// sanity check
if ( nowGlobal == 0 ) { char *xx=NULL;*xx=0; }
// sanity check
//if ( m_cri >= g_collectiondb.m_numRecs ) { char *xx=NULL;*xx=0; }
// grab this
//collnum_t collnum = m_cri;
//CollectionRec *cr = g_collectiondb.m_recs[collnum];
// update the crawlinfo for this collection if it has been a while.
// should never block since callback is NULL.
//if ( ! updateCrawlInfo(cr,NULL,NULL,true) ) { char *xx=NULL;*xx=0; }
// get this
//char *coll = cr->m_coll;
// need this for msg5 call
key_t endKey; endKey.setMax();
// start at the top each time now
//m_sc->m_nextDoledbKey.setMin();
// set the key to this at start (break Spider.cpp:4683
//m_sc->m_nextDoledbKey = g_doledb.makeFirstKey2 ( m_sc->m_pri );
// init the m_priorityToUfn map array?
if ( ! m_sc->m_ufnMapValid ) {
// reset all priorities to map to a ufn of -1
for ( int32_t i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ )
m_sc->m_priorityToUfn[i] = -1;
// initialize the map that maps priority to first ufn that uses
// that priority. map to -1 if no ufn uses it.
for ( int32_t i = 0 ; i < cr->m_numRegExs ; i++ ) {
// breathe
QUICKPOLL ( MAX_NICENESS );
// get the ith rule priority
int32_t sp = cr->m_spiderPriorities[i];
// must not be filtered or banned
if ( sp < 0 ) continue;
// sanity
if ( sp >= MAX_SPIDER_PRIORITIES){char *xx=NULL;*xx=0;}
// skip if already mapped
if ( m_sc->m_priorityToUfn[sp] != -1 ) continue;
// map that
m_sc->m_priorityToUfn[sp] = i;
}
// all done
m_sc->m_ufnMapValid = true;
}
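// e.g. if url filter rows 0 and 3 both map to priority 50 then
// m_priorityToUfn[50] stays 0, the first row that uses that priority.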
loop:
QUICKPOLL(MAX_NICENESS);
// shortcut
//CrawlInfo *ci = &cr->m_localCrawlInfo;
ci = &cr->m_localCrawlInfo;
// bail if waiting for lock reply, no point in reading more
if ( m_msg12.m_gettingLocks ) {
// assume we would have launched a spider for this coll
ci->m_lastSpiderCouldLaunch = nowGlobal;
// wait for sleep callback to re-call us in 10ms
return;
}
// reset priority when it goes bogus
if ( m_sc->m_pri2 < 0 ) {
// i guess the scan is complete for this guy
//m_sc->m_didRound = true;
// count # of priority scan rounds done
//m_sc->m_numRoundsDone++;
// reset for next coll
m_sc->setPriority ( MAX_SPIDER_PRIORITIES - 1 );
//m_sc->m_pri2 = MAX_SPIDER_PRIORITIES - 1;
// reset key now too since this coll was exhausted
//m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri );
// we can't keep starting over because there are often tons
// of annihilations between positive and negative keys
// and causes massive disk slow down because we have to do
// like 300 re-reads or more of about 2k each on coeus
//m_sc->m_nextDoledbKey = m_sc->m_nextKeys [ m_sc->m_pri2 ];
// and this
//m_sc->m_msg5StartKey = m_sc->m_nextDoledbKey;
// was it all empty? if did not encounter ANY doledb recs
// after scanning all priorities, set empty to true.
//if ( ! m_sc->m_encounteredDoledbRecs &&
// // if waiting tree is rebuilding... could be empty...
// ! m_sc->m_waitingTreeNeedsRebuild )
// m_sc->m_lastDoledbReadEmpty = true;
// and go up top
//goto collLoop;
goto subloop;
}
// . skip priority if we knows its empty in doledb
// . this will save us a call to msg5 below
//if ( m_sc->m_isDoledbEmpty [ m_sc->m_pri2 ] ) {
// // decrease the priority
// m_sc->devancePriority();
// // and try the one below
// goto loop;
//}
// shortcut
//CollectionRec *cr = m_sc->m_cr;
// sanity
if ( cr != m_sc->getCollectionRec() ) { char *xx=NULL;*xx=0; }
// skip the priority if we already have enough spiders on it
int32_t out = m_sc->m_outstandingSpiders[m_sc->m_pri2];
// how many spiders can we have out?
int32_t max = 0;
for ( int32_t i =0 ; i < cr->m_numRegExs ; i++ ) {
if ( cr->m_spiderPriorities[i] != m_sc->m_pri2 ) continue;
//if ( ! cr->m_spidersEnabled[i] ) continue;
if ( cr->m_maxSpidersPerRule[i] > max )
max = cr->m_maxSpidersPerRule[i];
}
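// e.g. if two url-filter rows map to this priority with
// maxSpidersPerRule of 5 and 8, max ends up 8 and we allow up to 8
// outstanding spiders on this priority for this collection.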
// get the max # of spiders over all ufns that use this priority!
//int32_t max = getMaxAllowableSpidersOut ( m_sc->m_pri2 );
//int32_t ufn = m_sc->m_priorityToUfn[m_sc->m_pri2];
// how many can we have? crap, this is based on ufn, not priority
// so we need to map the priority to a ufn that uses that priority
//int32_t max = 0;
// see if it has a maxSpiders, if no ufn uses this priority then
// "max" will remain set to 0
//if ( ufn >= 0 ) max = m_sc->m_cr->m_maxSpidersPerRule[ufn];
// turned off?
//if ( ufn >= 0 && ! m_sc->m_cr->m_spidersEnabled[ufn] ) max = 0;
// if we have one out, do not end the round!
if ( out > 0 ) {
// assume we could have launched a spider
ci->m_lastSpiderCouldLaunch = nowGlobal;
}
// always allow at least 1, they can disable spidering otherwise
// no, we use this to disabled spiders... if ( max <= 0 ) max = 1;
// skip?
if ( out >= max ) {
// count as non-empty then!
//m_sc->m_encounteredDoledbRecs = true;
// try the priority below us
m_sc->devancePriority();
//m_sc->m_pri--;
// set the new key for this priority if valid
//if ( m_sc->m_pri >= 0 )
// //m_sc->m_nextDoledbKey =
// // g_doledb.makeFirstKey2(m_sc->m_pri);
// m_sc->m_nextDoledbKey = m_sc->m_nextKeys[m_sc->m_pri2];
// and try again
goto loop;
}
// we only launch one spider at a time... so lock it up
m_gettingDoledbList = true;
// log this now
if ( g_conf.m_logDebugSpider )
m_doleStart = gettimeofdayInMillisecondsLocal();
// debug
if ( g_conf.m_logDebugSpider &&
m_sc->m_msg5StartKey != m_sc->m_nextDoledbKey )
log("spider: msg5startKey differs from nextdoledbkey");
// seems like we need this reset here... strange
m_list.reset();
// get a spider rec for us to spider from doledb (mdw)
if ( ! m_msg5.getList ( RDB_DOLEDB ,
cr->m_collnum, // coll ,
&m_list ,
m_sc->m_msg5StartKey,//m_sc->m_nextDoledbKey,
endKey ,
// need to make this big because we don't
// want to end up getting just a negative key
//1 , // minRecSizes (~ 7000)
// we need to read in a lot because we call
// "goto listLoop" below if the url we want
// to dole is locked.
// seems like a ton of negative recs
2000 , // minRecSizes
true , // includeTree
false , // addToCache
0 , // max cache age
0 , // startFileNum
-1 , // numFiles (all)
this , // state
gotDoledbListWrapper2 ,
MAX_NICENESS , // niceness
true ))// do error correction?
// return if it blocked
return ;
// debug
//log(LOG_DEBUG,"spider: read list of %"INT32" bytes from spiderdb for "
// "pri=%"INT32"+",m_list.m_listSize,(int32_t)m_sc->m_pri);
// breathe
QUICKPOLL ( MAX_NICENESS );
int32_t saved = m_launches;
// . add urls in list to cache
// . returns true if we should read another list
// . will set startKey to next key to start at
bool status = gotDoledbList2 ( );
// if we did not launch anything, then decrement priority and
// try again. but if priority hits -1 then subloop2 will just go to
// the next collection.
if ( saved == m_launches ) {
m_sc->devancePriority();
goto subloopNextPriority;
}
if ( status ) {
// . if priority is -1 that means try next priority
// . DO NOT reset the whole scan. that was what was happening
// when we just had "goto loop;" here
// . this means a reset above!!!
//if ( m_sc->m_pri2 == -1 ) return;
// bail if waiting for lock reply, no point in reading more
// mdw- i moved this check up to loop: jump point.
//if ( m_msg12.m_gettingLocks ) return;
// gotDoledbList2() always advances m_nextDoledbKey so
// try another read
// now advance to next coll, launch one spider per coll
goto subloop;//loop;
}
// wait for the msg12 get lock request to return...
// or maybe spiders are off
//return;
// now advance to next coll, launch one spider per coll
goto subloop;//loop;
}
// . decrement priority
// . will also set m_sc->m_nextDoledbKey
// . will also set m_sc->m_msg5StartKey
void SpiderColl::devancePriority() {
// try next
m_pri2 = m_pri2 - 1;
// how can this happen?
if ( m_pri2 < -1 ) m_pri2 = -1;
// bogus?
if ( m_pri2 < 0 ) return;
// set to next priority otherwise
//m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri );
m_nextDoledbKey = m_nextKeys [m_pri2];
// and the read key
m_msg5StartKey = m_nextDoledbKey;
}
void SpiderColl::setPriority(int32_t pri) {
m_pri2 = pri;
m_nextDoledbKey = m_nextKeys [ m_pri2 ];
m_msg5StartKey = m_nextDoledbKey;
}
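// usage note: setPriority(MAX_SPIDER_PRIORITIES-1) restarts a
// collection's doledb scan at the top priority, while devancePriority()
// steps down one priority at a time; both resume from the per-priority
// m_nextKeys[] cursor (seeded with g_doledb.makeFirstKey2(pri)) rather
// than from the very first key, so a scan picks up where it left off.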
void gotDoledbListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 ) {
// process the doledb list and try to launch a spider
g_spiderLoop.gotDoledbList2();
// regardless of whether that blocked or not try to launch another
// and try to get the next SpiderRequest from doledb
g_spiderLoop.spiderDoledUrls();
}
// . this is in seconds
// . had to make this 4 hours since one url was taking more than an hour
// to lookup over 80,000 places in placedb. after an hour it had only
// reached about 30,000
// http://pitchfork.com/news/tours/833-julianna-barwick-announces-european-and-north-american-dates/
// . the problem with this now is that it will lock an entire IP until it
// expires if we have maxSpidersPerIp set to 1. so now we at least try to
// add a SpiderReply for local errors, like when XmlDoc::indexDoc() sets
// g_errno.
#define MAX_LOCK_AGE (3600*4)
// spider the spider rec in this list from doledb
// returns false if it would block indexing a doc, returns true if not,
// and returns true and sets g_errno on error
bool SpiderLoop::gotDoledbList2 ( ) {
// unlock
m_gettingDoledbList = false;
// shortcuts
CollectionRec *cr = m_sc->getCollectionRec();
CrawlInfo *ci = &cr->m_localCrawlInfo;
// update m_msg5StartKey for next read
if ( m_list.getListSize() > 0 ) {
// what is m_list.m_ks ?
m_list.getLastKey((char *)&m_sc->m_msg5StartKey);
m_sc->m_msg5StartKey += 1;
// i guess we had something? wait for nothing to be there
//m_sc->m_encounteredDoledbRecs = true;
}
// log this now
if ( g_conf.m_logDebugSpider ) {
int64_t now = gettimeofdayInMillisecondsLocal();
int64_t took = now - m_doleStart;
if ( took > 2 )
logf(LOG_DEBUG,"spider: GOT list from doledb in "
"%"INT64"ms "
"size=%"INT32" bytes",
took,m_list.getListSize());
}
bool bail = false;
// bail instantly if in read-only mode (no RdbTrees!)
if ( g_conf.m_readOnlyMode ) bail = true;
// or if doing a daily merge
if ( g_dailyMerge.m_mergeMode ) bail = true;
// skip if too many udp slots being used
if(g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) bail =true;
// stop if too many out
if ( m_numSpidersOut >= MAX_SPIDERS ) bail = true;
if ( bail ) {
// assume we could have launched a spider
ci->m_lastSpiderCouldLaunch = getTimeGlobal();
// return false to indicate to try another
return false;
}
// bail if list is empty
if ( m_list.getListSize() <= 0 ) {
// don't bother with this priority again until a key is
// added to it! addToDoleIpTable() will be called
// when that happens and it might unset this then.
m_sc->m_isDoledbEmpty [ m_sc->m_pri2 ] = 1;
/*
// if all priorities now empty set another flag
m_sc->m_allDoledbPrioritiesEmpty++;
for ( int32_t i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) {
if ( m_sc->m_isDoledbEmpty[m_sc->m_pri2] ) continue;
// must get empties 3 times in a row to ignore it
// in case something was added to doledb while
// we were reading from doledb.
m_sc->m_allDoledbPrioritiesEmpty--;
break;
}
*/
// if no spiders...
//if ( g_conf.m_logDebugSpider ) {
// log("spider: empty doledblist collnum=%"INT32" "
// "inwaitingtree=%"INT32"",
// (int32_t)cr->m_collnum,
// m_sc->m_waitingTree.m_numUsedNodes);
//}
//if ( g_conf.m_logDebugSpider )
// log("spider: resetting doledb priority pri=%"INT32"",
// m_sc->m_pri);
// trigger a reset
//m_sc->m_pri = -1;
// . let the sleep timer init the loop again!
// . no, just continue the loop
//return true;
// . this priority is EMPTY, try next
// . will also set m_sc->m_nextDoledbKey
// . will also set m_sc->m_msg5StartKey
//m_sc->devancePriority();
// this priority is EMPTY, try next
//m_sc->m_pri = m_sc->m_pri - 1;
// how can this happen?
//if ( m_sc->m_pri < -1 ) m_sc->m_pri = -1;
// all done if priority is negative, it will start over
// at the top most priority, we've completed a round
//if ( m_sc->m_pri < 0 ) return true;
// set to next priority otherwise
//m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri );
//m_sc->m_nextDoledbKey = m_sc->m_nextKeys [m_sc->m_pri];
// and load that list from doledb for that priority
return true;
}
// if debugging the spider flow show the start key if list non-empty
/*if ( g_conf.m_logDebugSpider ) {
// 12 byte doledb keys
int32_t pri = g_doledb.getPriority(&m_sc->m_nextDoledbKey);
int32_t stm = g_doledb.getSpiderTime(&m_sc->m_nextDoledbKey);
int64_t uh48 = g_doledb.getUrlHash48(&m_sc->m_nextDoledbKey);
logf(LOG_DEBUG,"spider: loading list from doledb startkey=%s"
" pri=%"INT32" time=%"UINT32" uh48=%"UINT64"",
KEYSTR(&m_sc->m_nextDoledbKey,12),
pri,
stm,
uh48);
}*/
time_t nowGlobal = getTimeGlobal();
// double check
//if ( ! m_list.checkList_r( true , false, RDB_DOLEDB) ) {
// char *xx=NULL;*xx=0; }
// debug parm
//int32_t lastpri = -2;
//int32_t lockCount = 0;
// reset ptr to point to first rec in list
m_list.resetListPtr();
listLoop:
// all done if empty
//if ( m_list.isExhausted() ) {
// // copied from above
// m_sc->m_didRound = true;
// // and try next colleciton immediately
// return true;
//}
// breathe
QUICKPOLL(MAX_NICENESS);
// get the current rec from list ptr
char *rec = (char *)m_list.getListPtr();
// the doledbkey
key_t *doledbKey = (key_t *)rec;
// get record after it next time
m_sc->m_nextDoledbKey = *doledbKey ;
// sanity check -- wrap watch -- how can this really happen?
if ( m_sc->m_nextDoledbKey.n1 == 0xffffffff &&
m_sc->m_nextDoledbKey.n0 == 0xffffffffffffffffLL ) {
char *xx=NULL;*xx=0; }
// only inc it if its positive! because we do have negative
// doledb keys in here now
//if ( (m_sc->m_nextDoledbKey & 0x01) == 0x01 )
// m_sc->m_nextDoledbKey += 1;
// if its negative inc by two then! this fixes the bug where the
// list consisted only of one negative key and was spinning forever
//else
// m_sc->m_nextDoledbKey += 2;
// if it's negative, inc by two! this fixes the bug where the
// list consisted only of one negative key and was spinning forever
if ( (m_sc->m_nextDoledbKey & 0x01) == 0x00 )
m_sc->m_nextDoledbKey += 2;
// did it hit zero? that means it wrapped around!
if ( m_sc->m_nextDoledbKey.n1 == 0x0 &&
m_sc->m_nextDoledbKey.n0 == 0x0 ) {
// TODO: work this out
char *xx=NULL;*xx=0; }
// get priority from doledb key
int32_t pri = g_doledb.getPriority ( doledbKey );
// if the key went out of its priority because its priority had no
// spider requests then it will bleed over into another priority so
// in that case reset it to the top of its priority for next time
int32_t pri3 = g_doledb.getPriority ( &m_sc->m_nextDoledbKey );
if ( pri3 != m_sc->m_pri2 ) {
m_sc->m_nextDoledbKey = g_doledb.makeFirstKey2 ( m_sc->m_pri2);
// the key must match the priority queue its in as nextKey
//if ( pri3 != m_sc->m_pri2 ) { char *xx=NULL;*xx=0; }
}
if ( g_conf.m_logDebugSpider ) {
int32_t pri4 = g_doledb.getPriority ( &m_sc->m_nextDoledbKey );
log("spider: setting pri2=%"INT32" queue doledb nextkey to "
"%s (pri=%"INT32")",
m_sc->m_pri2,KEYSTR(&m_sc->m_nextDoledbKey,12),pri4);
//if ( pri4 != m_sc->m_pri2 ) { char *xx=NULL;*xx=0; }
}
// update next doledbkey for this priority to avoid having to
// process excessive positive/negative key annihilations (mdw)
m_sc->m_nextKeys [ m_sc->m_pri2 ] = m_sc->m_nextDoledbKey;
// sanity
if ( pri < 0 || pri >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
// skip the priority if we already have enough spiders on it
int32_t out = m_sc->m_outstandingSpiders[pri];
// get the first ufn that uses this priority
//int32_t max = getMaxAllowableSpidersOut ( pri );
// how many spiders can we have out?
int32_t max = 0;
// in milliseconds. how long to wait between downloads from the same IP.
// only for parent urls, not including child docs like robots.txt,
// iframe contents, etc.
int32_t sameIpWaitTime = 5000; // 250; // ms
int32_t maxSpidersOutPerIp = 1;
for ( int32_t i = 0 ; i < cr->m_numRegExs ; i++ ) {
if ( cr->m_spiderPriorities[i] != pri ) continue;
//if ( ! cr->m_spidersEnabled[i] ) continue;
if ( cr->m_maxSpidersPerRule[i] > max )
max = cr->m_maxSpidersPerRule[i];
if ( cr->m_spiderIpWaits[i] < sameIpWaitTime )
sameIpWaitTime = cr->m_spiderIpWaits[i];
if ( cr->m_spiderIpMaxSpiders[i] > maxSpidersOutPerIp )
maxSpidersOutPerIp = cr->m_spiderIpMaxSpiders[i];
}
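// at this point "max", "sameIpWaitTime" and "maxSpidersOutPerIp" hold
// the most permissive settings over all url filter rows that map to
// this doledb priority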
//int32_t ufn = m_sc->m_priorityToUfn[pri];
// how many can we have? crap, this is based on ufn, not priority
// so we need to map the priority to a ufn that uses that priority
//int32_t max = 0;
// see if it has a maxSpiders, if no ufn uses this priority then
// "max" will remain set to 0
//if ( ufn >= 0 ) max = m_sc->m_cr->m_maxSpidersPerRule[ufn];
// turned off?
//if ( ufn >= 0 && ! m_sc->m_cr->m_spidersEnabled[ufn] ) max = 0;
// if we skipped over the priority we wanted, update that
//m_pri = pri;
// then do the next one after that for next round
//m_pri--;
// always allow at least 1, they can disable spidering otherwise
//if ( max <= 0 ) max = 1;
// skip? and re-get another doledb list from next priority...
if ( out >= max ) {
// come here if we hit our ip limit too
hitMax:
// assume we could have launched a spider
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// this priority is maxed out, try next
//m_sc->devancePriority();
// assume not an empty read
//m_sc->m_encounteredDoledbRecs = true;
//m_sc->m_pri = pri - 1;
// all done if priority is negative
//if ( m_sc->m_pri < 0 ) return true;
// set to next priority otherwise
//m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri );
//m_sc->m_nextDoledbKey = m_sc->m_nextKeys [m_sc->m_pri];
// and load that list
return true;
}
// no negatives - wtf?
// if only the tree has doledb recs, Msg5.cpp does not remove
// the negative recs... it doesn't bother to merge.
if ( (doledbKey->n0 & 0x01) == 0 ) {
// just increment then i guess
m_list.skipCurrentRecord();
// if exhausted -- try another load with m_nextKey set
if ( m_list.isExhausted() ) return true;
// otherwise, try the next doledb rec in this list
goto listLoop;
}
// what is this? a dataless positive key?
if ( m_list.getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0; }
// get the "spider rec" (SpiderRequest) (embedded in the doledb rec)
SpiderRequest *sreq = (SpiderRequest *)(rec + sizeof(key_t)+4);
// sanity check. check for http(s)://
if ( sreq->m_url[0] != 'h' &&
// might be a docid from a pagereindex.cpp
! is_digit(sreq->m_url[0]) ) {
// note it
if ( (g_corruptCount % 1000) == 0 )
log("spider: got corrupt doledb record. ignoring. "
"pls fix!!!");
g_corruptCount++;
// skip for now....!! what is causing this???
m_list.skipCurrentRecord();
// if exhausted -- try another load with m_nextKey set
if ( m_list.isExhausted() ) return true;
// otherwise, try the next doledb rec in this list
goto listLoop;
}
// . how many spiders out for this ip now?
// . TODO: count locks in case twin is spidering... but it did not seem
// to work right for some reason
int32_t ipOut = 0;
int32_t globalOut = 0;
for ( int32_t i = 0 ; i <= m_maxUsed ; i++ ) {
// get it
XmlDoc *xd = m_docs[i];
if ( ! xd ) continue;
if ( ! xd->m_sreqValid ) continue;
// to prevent one collection from hogging all the urls for
// particular IP and starving other collections, let's make
// this a per collection count.
// then allow msg13.cpp to handle the throttling on its end.
// also do a global count over all collections now
if ( xd->m_sreq.m_firstIp == sreq->m_firstIp ) globalOut++;
// only count for our same collection otherwise another
// collection can starve us out
if ( xd->m_collnum != cr->m_collnum ) continue;
if ( xd->m_sreq.m_firstIp == sreq->m_firstIp ) ipOut++;
}
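// so ipOut is how many spiders this collection has out on this firstIp
// and globalOut is the same count over all collections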
if ( ipOut >= maxSpidersOutPerIp ) goto hitMax;
// but if the global count is high, only allow one out per coll so at
// least we don't starve and we don't make a huge waiting line of
// queued results just sitting there taking up mem and spider slots
// so the crawlbot hourly can't pass.
if ( globalOut >= maxSpidersOutPerIp && ipOut >= 1 ) goto hitMax;
if ( g_conf.m_logDebugSpider )
log("spider: %"INT32" spiders out for %s for %s",
ipOut,iptoa(sreq->m_firstIp),
sreq->m_url);
// sometimes we have it locked, but it is still in doledb i guess.
// seems like we might have given the lock to someone else and
// their confirmation has not come through yet, so it's still
// in doledb.
HashTableX *ht = &g_spiderLoop.m_lockTable;
// shortcut
int64_t lockKey = makeLockTableKey ( sreq );
// get the lock... only avoid if confirmed!
int32_t slot = ht->getSlot ( &lockKey );
UrlLock *lock = NULL;
if ( slot >= 0 )
// get the corresponding lock then if there
lock = (UrlLock *)ht->getValueFromSlot ( slot );
// if there and confirmed, why still in doledb?
if ( lock && lock->m_confirmed ) {
// fight log spam
static int32_t s_lastTime = 0;
if ( nowGlobal - s_lastTime >= 2 ) {
// why is it not getting unlocked!?!?!
log("spider: spider request locked but still in "
"doledb. uh48=%"INT64" firstip=%s %s",
sreq->getUrlHash48(),
iptoa(sreq->m_firstIp),sreq->m_url );
s_lastTime = nowGlobal;
}
// just increment then i guess
m_list.skipCurrentRecord();
// let's return false here to avoid an infinite loop
// since we are not advancing nextkey and m_pri is not
// being changed, that is what happens!
if ( m_list.isExhausted() ) {
// crap. but then we never make it to lower priorities.
// since we are returning false. so let's try the
// next priority in line.
//m_sc->m_pri--;
//m_sc->devancePriority();
// try returning true now that we skipped to
// the next priority level to avoid the infinite
// loop as described above.
return true;
//return false;//true;
}
// try the next record in this list
goto listLoop;
}
// . no no! the SpiderRequests in doledb are in our group because
// doledb is split based on ... firstIp i guess...
// BUT now the lock is done based on probable docid since we do not
// know the firstIp of injected spider requests, though we do know
// their probable docids since that is basically a function of
// the url itself. THUS we now must realize this by trying to
// get the lock for it and failing!
/*
// . likewise, if this request is already being spidered, if it
// is in the lock table, skip it...
// . if this is currently locked for spidering by us or another
// host (or by us) then return true here
HashTableX *ht = &g_spiderLoop.m_lockTable;
// shortcut
//int64_t uh48 = sreq->getUrlHash48();
// get the lock key
uint64_t lockKey ;
lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId);
// check tree
int32_t slot = ht->getSlot ( &lockKey );
// if more than an hour old nuke it and clear it
if ( slot >= 0 ) {
// get the corresponding lock then if there
UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot );
// if 1hr+ old, nuke it and disregard
if ( nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) {
// unlock it
ht->removeSlot ( slot );
// it is gone
slot = -1;
}
}
// if there say no no -- will try next spiderrequest in doledb then
if ( slot >= 0 ) {
// just increment then i guess
m_list.skipCurrentRecord();
// count locks
//if ( pri == lastpri ) lockCount++;
//else lockCount = 1;
//lastpri = pri;
// how is it we can have 2 locked but only 1 outstanding
// for the same priority?
// basically this url is done being spidered, but we have
// not yet processed the negative doledb key in Rdb.cpp which
// will remove the lock from the lock table... so this
// situation is perfectly fine i guess.. assuming that is
// what is going on
//if ( lockCount >= max ) { char *xx=NULL;*xx=0; }
// this is not good
static bool s_flag = false;
if ( ! s_flag ) {
s_flag = true;
log("spider: got url %s that is locked but in dole "
"table... skipping",sreq->m_url);
}
// if exhausted -- try another load with m_nextKey set
if ( m_list.isExhausted() ) return true;
// otherwise, try the next doledb rec in this list
goto listLoop;
}
*/
// force this set i guess... why isn't it set already? i guess when
// we added the spider request to doledb it was not set at that time
//sreq->m_doled = 1;
//
// sanity check. verify the spiderrequest also exists in our
// spidercache. we no longer store doled out spider requests in our
// cache!! they are separate now.
//
//if ( g_conf.m_logDebugSpider ) {
// // scan for it since we may have dup requests
// int64_t uh48 = sreq->getUrlHash48();
// int64_t pdocid = sreq->getParentDocId();
// // get any request from our urlhash table
// SpiderRequest *sreq2 = m_sc->getSpiderRequest2 (&uh48,pdocid);
// // must be there. i guess it could be missing if there is
// // corruption and we lost it in spiderdb but not in doledb...
// if ( ! sreq2 ) { char *xx=NULL;*xx=0; }
//}
// log this now
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: trying to spider url %s",sreq->m_url);
/*
if ( ufn >= 0 ) {
int32_t siwt = m_sc->m_cr->m_spiderIpWaits[ufn];
if ( siwt >= 0 ) sameIpWaitTime = siwt;
}
if ( ufn >= 0 ) {
maxSpidersOutPerIp = m_sc->m_cr->m_spiderIpMaxSpiders[ufn];
if ( maxSpidersOutPerIp < 0 ) maxSpidersOutPerIp = 999;
}
*/
// assume we launch the spider below. really this timestamp indicates
// the last time we COULD HAVE LAUNCHED *OR* did actually launch
// a spider
ci->m_lastSpiderCouldLaunch = nowGlobal;
// set crawl done email sent flag so another email can be sent again
// in case the user upped the maxToCrawl limit, for instance,
// so that the crawl could continue.
//ci->m_sentCrawlDoneAlert = 0;
// if we thought we were done, note it if something comes back up
if ( ! ci->m_hasUrlsReadyToSpider )
log("spider: got a reviving url for coll %s (%"INT32") to crawl %s",
cr->m_coll,(int32_t)cr->m_collnum,sreq->m_url);
// if changing status, resend our local crawl info to all hosts?
if ( ! ci->m_hasUrlsReadyToSpider )
cr->localCrawlInfoUpdate();
// there are urls ready to spider
ci->m_hasUrlsReadyToSpider = true;
// newly created crawls usually have this set to false so set it
// to true so getSpiderStatus() does not return that "the job
// is completed and no repeat is scheduled"...
if ( cr->m_spiderStatus == SP_INITIALIZING ) {
// this is the GLOBAL crawl info, not the LOCAL, which
// is what "ci" represents...
// MDW: is this causing the bug?
// the others have already reported that there are no urls
// to spider, so they do not re-report. we already
// had 'hasurlsreadytospider' set to true so we didn't get
// the reviving log msg.
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
// set this right i guess...?
ci->m_lastSpiderAttempt = nowGlobal;
}
// reset reason why crawl is not running, because we basically are now
cr->m_spiderStatus = SP_INPROGRESS; // this is 7
//cr->m_spiderStatusMsg = NULL;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
// assume not an empty read
//m_sc->m_encounteredDoledbRecs = true;
// shortcut
//char *coll = m_sc->m_cr->m_coll;
// sometimes the spider coll is reset/deleted while we are
// trying to get the lock in spiderUrl9() so let's use collnum
collnum_t collnum = m_sc->getCollectionRec()->m_collnum;
// . spider that. we don't care whether it blocks or not
// . crap, it will need to block to get the locks!
// . so at least wait for that!!!
// . but if we end up launching the spider then this should NOT
// return false! only return false if we should hold up the doledb
// scan
// . this returns true right away if it failed to get the lock...
// which means the url is already locked by someone else...
// . it might also return true if we are already spidering the url
bool status = spiderUrl9 ( sreq , doledbKey , collnum, sameIpWaitTime ,
maxSpidersOutPerIp ) ;
// just increment then i guess
m_list.skipCurrentRecord();
// if it blocked, wait for it to return to resume the doledb list
// processing because the msg12 is out and we gotta wait for it to
// come back. when lock reply comes back it tries to spider the url
// then it tries to call spiderDoledUrls() to keep the spider queue
// spidering fully.
if ( ! status ) return false;
// if exhausted -- try another load with m_nextKey set
if ( m_list.isExhausted() ) {
// if no more in list, fix the next doledbkey,
// m_sc->m_nextDoledbKey
log ( LOG_DEBUG,"spider: list exhausted.");
return true;
}
// otherwise, it might have been in the lock cache and quickly
// rejected, or rejected for some other reason, so try the next
// doledb rec in this list
goto listLoop;
//
// otherwise, it blocked, trying to get the lock across the network.
// so reset the doledb scan assuming it will go through. if it does
// NOT get the lock, then it will be in the lock cache for quick
// "return true" from spiderUrl() above next time we try it.
//
// once we get a url from doledb to spider, reset our doledb scan.
// that way if a new url gets added to doledb that is high priority
// then we get it right away.
//
// NO! because the lock request can block then fail!! and we end
// up resetting and in an infinite loop!
//
//m_sc->m_pri = -1;
//return false;
}
// . spider the next url that needs it the most
// . returns false if blocked on a spider launch, otherwise true.
// . returns false if your callback will be called
// . returns true and sets g_errno on error
bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
key_t *doledbKey ,
//char *coll ,
collnum_t collnum ,
int32_t sameIpWaitTime ,
int32_t maxSpidersOutPerIp ) {
// sanity check
//if ( ! sreq->m_doled ) { char *xx=NULL;*xx=0; }
// if waiting on a lock, wait
if ( m_msg12.m_gettingLocks ) { char *xx=NULL;*xx=0; }
// sanity
if ( ! m_sc ) { char *xx=NULL;*xx=0; }
// sanity check
// core dump? just re-run gb and restart the parser test...
if ( //g_test.m_isRunning &&
//! g_test.m_spiderLinks &&
g_conf.m_testParserEnabled &&
! sreq->m_isInjecting ) {
char *xx=NULL;*xx=0; }
// wait until our clock is synced with host #0 before spidering since
// we store time stamps in the domain and ip wait tables in
// SpiderCache.cpp. We don't want to freeze a domain for a long time
// because we think we have to wait until tomorrow before we can
// spider it.
if ( ! isClockInSync() ) {
// let admin know why we are not spidering
static char s_printed = false;
if ( ! s_printed ) {
logf(LOG_DEBUG,"spider: NOT SPIDERING until clock "
"is in sync with host #0.");
s_printed = true;
}
return true;
}
// turned off?
if ( ( (! g_conf.m_spideringEnabled ||
// or if trying to exit
g_process.m_mode == EXIT_MODE
) && // ! g_conf.m_webSpideringEnabled ) &&
! sreq->m_isInjecting ) ||
// repairing the collection's rdbs?
g_repairMode ||
// power went off?
! g_process.m_powerIsOn ) {
// try to cancel outstanding spiders, ignore injects
for ( int32_t i = 0 ; i <= m_maxUsed ; i++ ) {
// get it
XmlDoc *xd = m_docs[i];
if ( ! xd ) continue;
//if ( xd->m_sreq.m_isInjecting ) continue;
// let everyone know, TcpServer::cancel() uses this in
// destroySocket()
g_errno = ECANCELLED;
// cancel the socket trans who has "xd" as its state.
// this will cause XmlDoc::gotDocWrapper() to be called
// now, on this call stack with g_errno set to
// ECANCELLED. But if Msg16 was not in the middle of
// HttpServer::getDoc() then this will have no effect.
g_httpServer.cancel ( xd );//, g_msg13RobotsWrapper );
// cancel any Msg13 that xd might have been waiting for
g_udpServer.cancel ( &xd->m_msg13 , 0x13 );
}
return true;
}
// do not launch any new spiders if in repair mode
if ( g_repairMode ) {
g_conf.m_spideringEnabled = false;
//g_conf.m_injectionEnabled = false;
return true;
}
// do not launch another spider if less than 25MB of memory is available.
// this causes us to deadlock when spiders use up all the mem, the
// file merge operation can not get any, and spiders need to add to
// titledb but can not until the merge completes!!
if ( g_conf.m_maxMem - g_mem.m_used < 25*1024*1024 ) {
static int32_t s_lastTime = 0;
static int32_t s_missed = 0;
s_missed++;
int32_t now = getTime();
// don't spam the log, but let people know about it
if ( now - s_lastTime > 10 ) {
log("spider: Need 25MB of free mem to launch spider, "
"only have %"INT64". Failed to launch %"INT32" times so "
"far.", g_conf.m_maxMem - g_mem.m_used , s_missed );
s_lastTime = now;
}
}
// we store this in msg12 for making a fakedb key
//collnum_t collnum = g_collectiondb.getCollnum ( coll );
// shortcut
int64_t lockKeyUh48 = makeLockTableKey ( sreq );
//uint64_t lockKey ;
//lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId);
//lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId);
// . now that we have to use msg12 to see if the thing is locked
// to avoid spidering it.. (see comment in above function)
// we often try to spider something we are already spidering. that
// is why we have an rdbcache, m_lockCache, to make these lock
// lookups quick, now that the locking group is usually different
// than our own!
// . we have to check this now because removeAllLocks() below will
// remove a lock that one of our spiders might have. it is only
// sensitive to our hostid, not "spider id"
// sometimes we exhaust the doledb and m_nextDoledbKey gets reset
// to zero, we do a re-scan and get a doledbkey that is currently
// being spidered or is waiting for its negative doledb key to
// get into our doledb tree
for ( int32_t i = 0 ; i <= m_maxUsed ; i++ ) {
// get it
XmlDoc *xd = m_docs[i];
if ( ! xd ) continue;
// jenkins was coring spidering the same url in different
// collections at the same time
if ( ! xd->m_collnumValid ) continue;
if ( xd->m_collnum != collnum ) continue;
// . problem if it has our doledb key!
// . this happens if we removed the lock above before the
// spider returned!! that's why you need to set
// MAX_LOCK_AGE to like an hour or so
// . i've also seen this happen because we got stuck looking
// up like 80,000 places and it was taking more than an
// hour. it had only reached about 30,000 after an hour.
// so at this point just set the lock timeout to
// 4 hours i guess.
// . i am seeing this again and we are trying over and over
// again to spider the same url and hogging the cpu so
// we need to keep this sanity check in here for times
// like this
if ( xd->m_doledbKey == *doledbKey ) {
// just note it for now
log("spider: spidering same url %s twice. "
"different firstips?",
xd->m_firstUrl.m_url);
//char *xx=NULL;*xx=0; }
}
// keep chugging
continue;
//if ( xd->m_doledbKey != *doledbKey ) continue;
// count it as processed
m_processed++;
// log it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: we are already spidering %s "
"lockkey=%"UINT64"",sreq->m_url,lockKeyUh48);
// all done, no lock granted...
return true;
}
// reset g_errno
g_errno = 0;
// breathe
QUICKPOLL(MAX_NICENESS);
// sanity. ensure m_sreq doesn't change from under us i guess
if ( m_msg12.m_gettingLocks ) { char *xx=NULL;*xx=0; }
// get rid of this crap for now
//g_spiderCache.meterBandwidth();
// save these in case getLocks() blocks
m_sreq = sreq;
m_doledbKey = doledbKey;
//m_coll = coll;
m_collnum = collnum;
// if we already have the lock then forget it. this can happen
// if spidering was turned off then back on.
// MDW: TODO: we can't do this anymore since we no longer have
// the lockTable check above because we do not control our own
// lock now necessarily. it often is in another group's lockTable.
//if ( g_spiderLoop.m_lockTable.isInTable(&lockKey) ) {
// log("spider: already have lock for lockKey=%"UINT64"",lockKey);
// // proceed
// return spiderUrl2();
//}
// count it
m_processed++;
// now we just take it out of doledb instantly
int32_t node = g_doledb.m_rdb.m_tree.deleteNode(m_collnum,
(char *)m_doledbKey,
true);
if ( g_conf.m_logDebugSpider )
log("spider: deleting doledb tree node %"INT32,node);
// if url filters rebuilt then doledb gets reset and i've seen us hit
// this node == -1 condition here... so maybe ignore it... just log
// what happened? i think we did a quickpoll somewhere between here
// and the call to spiderDoledUrls() and the url filters changed
// so it reset doledb's tree. so in that case we should bail on this
// url.
if ( node == -1 ) {
g_errno = EADMININTERFERENCE;
log("spider: lost url about to spider from url filters "
"and doledb tree reset. %s",mstrerror(g_errno));
return true;
}
// now remove from doleiptable since we removed from doledb
m_sc->removeFromDoledbTable ( sreq->m_firstIp );
// DO NOT add back to waiting tree if max spiders
// out per ip was 1 OR there was a crawldelay. but better
// yet, take care of that in the winReq code above.
// . now add to waiting tree so we add another spiderdb
// record for this firstip to doledb
// . true = callForScan
// . do not add to waiting tree if we have enough outstanding
// spiders for this ip. we will add to waiting tree when
// we receive a SpiderReply in addSpiderReply()
if ( //sc && //out < cq->m_maxSpidersOutPerIp &&
// this will just return true if we are not the
// responsible host for this firstip
// DO NOT populate from this!!! say "false" here...
! m_sc->addToWaitingTree ( 0 , sreq->m_firstIp, false ) &&
// must be an error...
g_errno ) {
char *msg = "FAILED TO ADD TO WAITING TREE";
log("spider: %s %s",msg,mstrerror(g_errno));
//us->sendErrorReply ( udpSlot , g_errno );
//return;
}
// . add it to lock table to avoid respider, removing from doledb
// is not enough because we re-add to doledb right away
// . return true on error here
HashTableX *ht = &g_spiderLoop.m_lockTable;
UrlLock tmp;
tmp.m_hostId = g_hostdb.m_myHost->m_hostId;
tmp.m_timestamp = 0;
tmp.m_expires = 0;
tmp.m_firstIp = m_sreq->m_firstIp;
tmp.m_spiderOutstanding = 0;
tmp.m_confirmed = 1;
tmp.m_collnum = m_collnum;
if ( g_conf.m_logDebugSpider )
log("spider: adding lock uh48=%"INT64" lockkey=%"INT64"",
m_sreq->getUrlHash48(),lockKeyUh48);
if ( ! ht->addKey ( &lockKeyUh48 , &tmp ) )
return true;
// now do it. this returns false if it would block, returns true if it
// would not block. sets g_errno on error. it spiders m_sreq.
return spiderUrl2 ( );
// flag it so m_sreq does not "disappear"
m_msg12.m_gettingLocks = true;
// count it
m_processed++;
//if ( g_conf.m_logDebugSpider )
// logf(LOG_DEBUG,"spider: getting lock for %s",m_sreq->m_url);
//
// . try to get the lock. assume it always blocks
// . it will call spiderUrl2 with sr when it gets a reply
// . if injecting, no need for lock! will return true for that!
//
if ( ! m_msg12.getLocks ( m_sreq->getUrlHash48() ,
//m_sreq->m_probDocId,//UrlHash48(),
m_sreq->m_url ,
m_doledbKey ,
collnum,
sameIpWaitTime,
maxSpidersOutPerIp,
m_sreq->m_firstIp,
NULL , // state
NULL ) ) // callback
return false;
// no go
m_msg12.m_gettingLocks = false;
// it will not block if the lock was found in our m_lockCache!
return true;
// should always block now!
//char *xx=NULL;*xx=0;
// i guess we got it
//return spiderUrl2 ( );
//return true;
}
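// . called once we have the doledb rec (and the lock if one was needed)
// . finds a free slot in m_docs[], news an XmlDoc for m_sreq and kicks
// off XmlDoc::indexDoc()
// . returns false if indexDoc() blocked, true otherwise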
bool SpiderLoop::spiderUrl2 ( ) {
// sanity check
//if ( ! m_sreq->m_doled ) { char *xx=NULL;*xx=0; }
// . find an available doc slot
// . we can have up to MAX_SPIDERS spiders (300)
int32_t i;
for ( i=0 ; i<MAX_SPIDERS ; i++ ) if (! m_docs[i]) break;
// breathe
QUICKPOLL(MAX_NICENESS);
// come back later if we're full
if ( i >= MAX_SPIDERS ) {
log(LOG_DEBUG,"build: Already have %"INT32" outstanding spiders.",
(int32_t)MAX_SPIDERS);
char *xx = NULL; *xx = 0;
}
// breathe
QUICKPOLL(MAX_NICENESS);
XmlDoc *xd;
// otherwise, make a new one if we have to
try { xd = new (XmlDoc); }
// bail on failure, sleep and try again
catch ( ... ) {
g_errno = ENOMEM;
log("build: Could not allocate %"INT32" bytes to spider "
"the url %s. Will retry later.",
(int32_t)sizeof(XmlDoc), m_sreq->m_url );
return true;
}
// register its mem usage with the Mem.cpp class
mnew ( xd , sizeof(XmlDoc) , "XmlDoc" );
// add to the array
m_docs [ i ] = xd;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
char *coll = "collnumwasinvalid";
if ( cr ) coll = cr->m_coll;
// . pass in a pbuf if this is the "qatest123" collection
// . we will dump the SafeBuf output into a file in the
// test subdir for comparison with previous versions of gb
// in order to see what changed
SafeBuf *pbuf = NULL;
if ( !strcmp( coll,"qatest123") && g_conf.m_testParserEnabled )
pbuf = &xd->m_sbuf;
//
// sanity checks
//
//int64_t uh48;
//int64_t pdocid;
//if ( g_conf.m_logDebugSpider ) {
// // scan for it since we may have dup requests
// uh48 = m_sreq->getUrlHash48();
// pdocid = m_sreq->getParentDocId();
// // get any request from our urlhash table
// SpiderRequest *sreq2 = m_sc->getSpiderRequest2 (&uh48,pdocid);
// // must be valid parent
// if ( ! sreq2 && pdocid == 0LL ) { char *xx=NULL;*xx=0; }
// // for now core on this
// if ( ! sreq2 ) { char *xx=NULL;*xx=0; }
// // log it
// logf(LOG_DEBUG,"spider: spidering uh48=%"UINT64" pdocid=%"UINT64"",
// uh48,pdocid);
//}
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: spidering firstip9=%s(%"UINT32") "
"uh48=%"UINT64" prntdocid=%"UINT64" k.n1=%"UINT64" k.n0=%"UINT64"",
iptoa(m_sreq->m_firstIp),
(uint32_t)m_sreq->m_firstIp,
m_sreq->getUrlHash48(),
m_sreq->getParentDocId() ,
m_sreq->m_key.n1,
m_sreq->m_key.n0);
// this returns false and sets g_errno on error
if ( ! xd->set4 ( m_sreq ,
m_doledbKey ,
coll ,
pbuf ,
MAX_NICENESS ) ) {
// i guess m_coll is no longer valid?
mdelete ( m_docs[i] , sizeof(XmlDoc) , "Doc" );
delete (m_docs[i]);
m_docs[i] = NULL;
// error, g_errno should be set!
return true;
}
// . launch count increment
// . this is only used locally on this host to set its
// m_hasUrlsReadyToSpider to false.
//cr->m_localCrawlInfo.m_numUrlsLaunched++;
// call this after doc gets indexed
xd->setCallback ( xd , indexedDocWrapper );
/*
// set it from provided parms if we are injecting via Msg7
if ( m_sreq->m_isInjecting ) {
// now fill these in if provided too!
if ( m_content ) {
if ( m_sreq->m_firstIp ) {
xd->m_ip = m_sreq->m_firstIp;
xd->m_ipValid = true;
}
xd->m_isContentTruncated = false;
xd->m_isContentTruncatedValid = true;
xd->m_httpReplyValid = true;
xd->m_httpReply = m_content;
xd->m_httpReplySize = m_contentLen + 1;
if ( ! m_contentHasMime ) xd->m_useFakeMime = true;
}
// a special callback for injected docs
//xd->m_injectionCallback = m_callback;
//xd->m_injectionState = m_state;
}
*/
// increase m_maxUsed if we have to
if ( i > m_maxUsed ) m_maxUsed = i;
// count it
m_numSpidersOut++;
// count this
m_sc->m_spidersOut++;
g_spiderLoop.m_launches++;
// count it as a hit
//g_stats.m_spiderUrlsHit++;
// sanity check
if (m_sreq->m_priority <= -1 ) {
log("spider: fixing bogus spider req priority of %i for "
"url %s",
(int)m_sreq->m_priority,m_sreq->m_url);
m_sreq->m_priority = 0;
//char *xx=NULL;*xx=0;
}
//if(m_sreq->m_priority >= MAX_SPIDER_PRIORITIES){char *xx=NULL;*xx=0;}
// update this
m_sc->m_outstandingSpiders[(unsigned char)m_sreq->m_priority]++;
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"spider: sc_out=%"INT32" waiting=%"INT32" url=%s",
m_sc->m_spidersOut,
m_sc->m_waitingTree.m_numUsedNodes,
m_sreq->m_url);
// debug log
//log("XXX: incremented count to %"INT32" for %s",
// m_sc->m_spidersOut,m_sreq->m_url);
//if ( m_sc->m_spidersOut != m_numSpidersOut ) { char *xx=NULL;*xx=0; }
// . return if this blocked
// . no, launch another spider!
bool status = xd->indexDoc();
// . reset the next doledbkey to start over!
// . when spiderDoledUrls() see this negative priority it will
// reset the doledb scan to the top priority.
// . MDW, no, 2/25/2015. then the 'goto loop;' statement
// basically thinks a round was completed and exits
//m_sc->m_pri2 = -1;
// maybe then try setting to 127
//m_sc->setPriority ( MAX_SPIDER_PRIORITIES - 1 );
// if we were injecting and it blocked... return false
if ( ! status ) return false;
// deal with this error
indexedDoc ( xd );
// "callback" will not be called cuz it should be NULL
return true;
}
// . the one that was just indexed
// . Msg7.cpp uses this to see what docid the injected doc got so it
// can forward it to external program
//static int64_t s_lastDocId = -1;
//int64_t SpiderLoop::getLastDocId ( ) { return s_lastDocId; }
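// callback we register on each XmlDoc in spiderUrl2(). it tears down
// the doc slot via indexedDoc() and then tries to spider more doled urls.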
void indexedDocWrapper ( void *state ) {
// . process the results
// . return if this blocks
if ( ! g_spiderLoop.indexedDoc ( (XmlDoc *)state ) ) return;
//a hack to fix injecting urls, because they can
//run at niceness 0 but most of the spider pipeline
//cannot. we should really just make injection run at
//MAX_NICENESS. OK, done! mdw
//if ( g_loop.m_inQuickPoll ) return;
// . continue gettings Spider recs to spider
// . if it's already waiting for a list it'll just return
// . mdw: keep your eye on this, it was commented out
// . this won't execute if we're already getting a list now
//g_spiderLoop.spiderUrl ( );
// spider some urls that were doled to us
g_spiderLoop.spiderDoledUrls( );
}
// . this will delete m_docs[i]
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool SpiderLoop::indexedDoc ( XmlDoc *xd ) {
// save the error in case a call changes it below
//int32_t saved = g_errno;
// get our doc #, i
//int32_t i = doc - m_docs[0];
int32_t i = 0;
for ( ; i < MAX_SPIDERS ; i++ ) if ( m_docs[i] == xd) break;
// sanity check
if ( i >= MAX_SPIDERS ) { char *xx=NULL;*xx=0; }
// set to -1 to indicate inject
//if ( i < 0 || i >= MAX_SPIDERS ) i = -1;
//char injecting = false;
//if ( xd->m_sreq.m_isInjecting ) injecting = true;
// save it for Msg7.cpp to pass docid of injected doc back
//s_lastDocId = xd->m_docId;
// . decrease m_maxUsed if we need to
// . we can decrease all the way to -1, which means no spiders going on
if ( m_maxUsed == i ) {
m_maxUsed--;
while ( m_maxUsed >= 0 && ! m_docs[m_maxUsed] ) m_maxUsed--;
}
// count it
m_numSpidersOut--;
// get coll
collnum_t collnum = xd->m_collnum;//tiondb.getCollnum ( xd->m_coll );
// if coll was deleted while spidering, sc will be NULL
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
// decrement this
if ( sc ) sc->m_spidersOut--;
// get the original request from xmldoc
SpiderRequest *sreq = &xd->m_sreq;
// update this.
if ( sc ) sc->m_outstandingSpiders[(unsigned char)sreq->m_priority]--;
// debug log
//log("XXX: decremented count to %"INT32" for %s",
// sc->m_spidersOut,sreq->m_url);
//if ( sc->m_spidersOut != m_numSpidersOut ) { char *xx=NULL;*xx=0; }
// breathe
QUICKPOLL ( xd->m_niceness );
// are we a re-spider?
bool respider = false;
if ( xd->m_oldDocValid && xd->m_oldDoc ) respider = true;
// . dump it out to a file in the "qatest123" subdir
// . but only the first time we spider it...
/*
if ( ! strcmp(xd->m_coll,"qatest123") && ! respider &&
// no longer need this when qa testing spider, not parser
g_conf.m_testParserEnabled ) {
// save the buffers
//saveTestBuf();
// get it
//SafeBuf *pbuf = xd->m_pbuf;
SafeBuf sb;
// get it
xd->printDoc ( &sb );
// get the first url
Url *u = xd->getFirstUrl();
// . get its hash
// . should be same hash we use to store doc.%"UINT64".html in
// XmlDoc.cpp/Msg13.cpp stuff (getTestDoc())
int64_t h = hash64 ( u->getUrl() , u->getUrlLen() );
char *testDir = g_test.getTestDir();
// make filename to dump out to
char fn[1024];
sprintf(fn,"%s/%s/parse.%"UINT64".%"UINT32".html",
g_hostdb.m_dir,testDir,h,g_test.m_runId);
// . dump it out to a file
// . WATCH OUT. g_errno is set on internal errors, like OOM
// or whatever, so don't save in those cases...???????
sb.dumpToFile ( fn );
// just dump the <div class=shotdisplay> tags into this file
sprintf(fn,"%s/%s/parse-int16_tdisplay.%"UINT64".%"UINT32".html",
g_hostdb.m_dir,testDir,h,g_test.m_runId);
// output to a special file
SafeBuf tmp;
// insert this
tmp.safeStrcpy("<meta http-equiv=\"Content-Type\" "
"content=\"text/html; "
"charset=utf-8\">\n");
// header stuff
tmp.safePrintf("<html><body>\n");
// put the onclick script in there
tmp.safeStrcpy ( xd->getCheckboxScript() );
// concatenate just these sections in "sb" to "tmp"
tmp.cat2 ( sb ,
"<div class=int16_tdisplay>" ,
"</div class=int16_tdisplay>" );
// header stuff
tmp.safePrintf("\n</body></html>\n");
// then dump
tmp.dumpToFile ( fn );
// if it had critical errors from XmlDoc::validateOutput()
// then create that file!
//if ( xd->m_validateMisses > 0 || xd->m_validateFlagged ) {
// make the critical file filename
char cf[1024];
sprintf (cf,"%s/%s/critical.%"UINT64".%"UINT32".txt",
g_hostdb.m_dir,testDir,h,g_test.m_runId);
// save to that
ttt.dumpToFile ( cf );
//char cmd[256];
//sprintf(cmd,"touch %s/test/critical.%"UINT64".%"UINT32".txt",
// g_hostdb.m_dir,h,g_test.m_runId);
//system(cmd);
// note it
//log("crazyin: %s",u->m_url );
// note it
//g_test.m_urlsAdded--;
g_test.m_urlsIndexed++;
// now in PingServer.cpp for hostid 0 it checks
// the urlsindexed from each host if g_conf.m_testParserEnabled
// is true to see if we should call g_test.stopIt()
// if that is zero we are done
//if ( g_test.m_urlsAdded == 0 && ! g_test.m_isAdding &&
// // only stop if not spidering links
// //! g_test.m_spiderLinks )
// g_conf.m_testParserEnabled )
// // wrap things up
// g_test.stopIt();
}
*/
// note it
// this should not happen any more since indexDoc() will take
// care of g_errno now by clearing it and adding an error spider
// reply to release the lock!!
if ( g_errno ) {
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
// log("spider: ------ *** LOCAL ERROR *** ------");
// log("spider: ------ *** LOCAL ERROR *** ------");
// log("spider: ------ *** LOCAL ERROR *** ------");
log("spider: spidering %s has error: %s. uh48=%"INT64". "
//"Respidering "
//"in %"INT32" seconds. MAX_LOCK_AGE when lock expires. "
"cn=%"INT32"",
xd->m_firstUrl.m_url,
mstrerror(g_errno),
xd->getFirstUrlHash48(),
//(int32_t)MAX_LOCK_AGE,
(int32_t)collnum);
// log("spider: ------ *** LOCAL ERROR *** ------");
// log("spider: ------ *** LOCAL ERROR *** ------");
// log("spider: ------ *** LOCAL ERROR *** ------");
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
// don't release the lock on it right now. just let the
// lock expire on it after MAX_LOCK_AGE seconds. then it will
// be retried. we need to debug gb so these things never
// happen...
}
// breathe
QUICKPOLL ( xd->m_niceness );
// . call the final callback used for injecting urls
// . this may send a reply back so the caller knows the url
// was fully injected into the index
// . Msg7.cpp uses a callback that returns a void, so use m_callback1!
//if ( xd->m_injectionCallback && injecting ) {
// g_errno = saved;
// // use the index code as the error for PageInject.cpp
// if ( ! g_errno && xd->m_indexCode ) g_errno = xd->m_indexCode;
// xd->m_injectionCallback ( xd->m_injectionState );
//}
// we don't need this g_errno past this point
g_errno = 0;
// breathe
QUICKPOLL ( xd->m_niceness );
// did this doc get a chance to add its meta list to msg4 bufs?
//bool addedMetaList = m_docs[i]->m_listAdded;
// set this in case we need to call removeAllLocks
//m_uh48 = 0LL;
//if ( xd->m_sreqValid ) m_uh48 = xd->m_sreq.getUrlHash48();
// we are responsible for deleting doc now
mdelete ( m_docs[i] , sizeof(XmlDoc) , "Doc" );
delete (m_docs[i]);
m_docs[i] = NULL;
// we remove the spider lock from g_spiderLoop.m_lockTable in Rdb.cpp
// when it receives the negative doledb key. but if this does not
// happen, we have a problem then!
//if ( addedMetaList ) return true;
// sanity
//if ( ! m_uh48 ) { char *xx=NULL; *xx=0; }
// the lock we had in g_spiderLoop.m_lockTable for the doleKey
// is now remove in Rdb.cpp when it receives a negative dole key to
// add to doledb... assuming we added that meta list!!
// m_uh48 should be set from above
//if ( ! removeAllLocks () ) return false;
// we did not block, so return true
return true;
}
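// udp callback for the 0x12 lock requests sent out by Msg12 below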
void gotLockReplyWrapper ( void *state , UdpSlot *slot ) {
// cast it
Msg12 *msg12 = (Msg12 *)state;
// . call handler
// . returns false if waiting for more replies to come in
if ( ! msg12->gotLockReply ( slot ) ) return;
// if had callback, maybe from PageReindex.cpp
if ( msg12->m_callback ) msg12->m_callback ( msg12->m_state );
// ok, try to get another url to spider
else g_spiderLoop.spiderDoledUrls();
}
Msg12::Msg12 () {
m_numRequests = 0;
m_numReplies = 0;
}
// . returns false if blocked, true otherwise.
// . returns true and sets g_errno on error
// . before we can spider for a SpiderRequest we must be granted the lock
// . each group shares the same doledb and each host in the group competes
// for spidering all those urls.
// . that way if a host goes down its load is taken over
bool Msg12::getLocks ( int64_t uh48, // probDocId ,
char *url ,
DOLEDBKEY *doledbKey,
collnum_t collnum,
int32_t sameIpWaitTime,
int32_t maxSpidersOutPerIp,
int32_t firstIp,
void *state ,
void (* callback)(void *state) ) {
// ensure not in use. no msg12 replies outstanding.
if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }
// no longer use this
char *xx=NULL;*xx=0;
// do not use locks for injections
//if ( m_sreq->m_isInjecting ) return true;
// get # of hosts in each mirror group
int32_t hpg = g_hostdb.getNumHostsPerShard();
// reset
m_numRequests = 0;
m_numReplies = 0;
m_grants = 0;
m_removing = false;
m_confirming = false;
// make sure is really docid
//if ( probDocId & ~DOCID_MASK ) { char *xx=NULL;*xx=0; }
// . mask out the lower bits that may change if there is a collision
// . in this way a url has the same m_probDocId as the same url
// in the index. i.e. if we add a new spider request for url X and
// url X is already indexed, then they will share the same lock
// even though the indexed url X may have a different actual docid
// than its probable docid.
// . we now use probable docids instead of uh48 because query reindex
// in PageReindex.cpp adds docid based spider requests and we
// only know the docid, not the uh48 because it is creating
// SpiderRequests from docid-only search results. having to look
// up the msg20 summary for like 1M search results is too painful!
//m_lockKey = g_titledb.getFirstProbableDocId(probDocId);
// . use this for locking now, and let the docid-only requests just use
// the docid
m_lockKeyUh48 = makeLockTableKey ( uh48 , firstIp );
m_url = url;
m_callback = callback;
m_state = state;
m_hasLock = false;
m_origUh48 = uh48;
// support ability to spider multiple urls from same ip
m_doledbKey = *doledbKey;
m_collnum = collnum;
m_sameIpWaitTime = sameIpWaitTime;
m_maxSpidersOutPerIp = maxSpidersOutPerIp;
m_firstIp = firstIp;
// sanity check, just 6 bytes! (48 bits)
if ( uh48 & 0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
if ( m_lockKeyUh48 & 0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
// cache time
int32_t ct = 120;
// if docid based assume it was a query reindex and keep it short!
// otherwise we end up waiting 120 seconds for a query reindex to
// go through on a docid we just spidered. TODO: use m_urlIsDocId
// MDW: check this out
if ( url && is_digit(url[0]) ) ct = 2;
// . this seems to be messing us up and preventing us from adding new
// requests into doledb when only spidering a few IPs.
// . make it random in the case of twin contention
ct = rand() % 10;
// . check our cache to avoid repetitive asking
// . use -1 for maxAge to indicate no max age
// . returns -1 if not in cache
// . use maxage of two minutes, 120 seconds
int32_t lockTime ;
lockTime = g_spiderLoop.m_lockCache.getLong(0,m_lockKeyUh48,ct,true);
// if it was in the cache and less than 2 minutes old then return
// true now with m_hasLock set to false.
if ( lockTime >= 0 ) {
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: cached missed lock for %s "
"lockkey=%"UINT64"", m_url,m_lockKeyUh48);
return true;
}
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: sending lock request for %s "
"lockkey=%"UINT64"", m_url,m_lockKeyUh48);
// now the locking group is based on the probable docid
//m_lockGroupId = g_hostdb.getGroupIdFromDocId(m_lockKey);
// ptr to list of hosts in the group
//Host *hosts = g_hostdb.getGroup ( m_lockGroupId );
// the same group (shard) that has the spiderRequest/Reply is
// the one responsible for locking.
Host *hosts = g_hostdb.getMyShard();
// shortcut
UdpServer *us = &g_udpServer;
static int32_t s_lockSequence = 0;
// remember the lock sequence # in case we have to call remove locks
m_lockSequence = s_lockSequence++;
LockRequest *lr = &m_lockRequest;
lr->m_lockKeyUh48 = m_lockKeyUh48;
lr->m_firstIp = m_firstIp;
lr->m_removeLock = 0;
lr->m_lockSequence = m_lockSequence;
lr->m_collnum = collnum;
// reset counts
m_numRequests = 0;
m_numReplies = 0;
// point to start of the 12 byte request buffer
char *request = (char *)lr;//m_lockKey;
int32_t requestSize = sizeof(LockRequest);//12;
// loop over hosts in that shard
for ( int32_t i = 0 ; i < hpg ; i++ ) {
// get a host
Host *h = &hosts[i];
// skip if dead! no need to get a reply from dead guys
if ( g_hostdb.isDead (h) ) continue;
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: sent lock "
"request #%"INT32" for lockkey=%"UINT64" %s to "
"hid=%"INT32"",m_numRequests,m_lockKeyUh48,
m_url,h->m_hostId);
// send request to him
if ( ! us->sendRequest ( request ,
requestSize ,
0x12 , // msgType
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL , // retSlotPtrPtr
this , // state data
gotLockReplyWrapper ,
60*60*24*365 ) )
// udpserver returns false and sets g_errno on error
return true;
// count them
m_numRequests++;
}
// block?
if ( m_numRequests > 0 ) return false;
// i guess nothing... hmmm... all dead?
//char *xx=NULL; *xx=0;
// m_hasLock should be false... all lock hosts seem dead... wait
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: all lock hosts seem dead for %s "
"lockkey=%"UINT64"", m_url,m_lockKeyUh48);
return true;
}
// after adding the negative doledb recs to remove the url we are spidering
// from doledb, and adding the fake titledb rec to add a new entry into
// waiting tree so that our ip can have more than one outstanding spider,
// call the callback. usually msg4::addMetaList() will not block i'd guess.
void rejuvenateIPWrapper ( void *state ) {
Msg12 *THIS = (Msg12 *)state;
THIS->m_callback ( THIS->m_state );
}
// returns true if all done, false if waiting for more replies
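// a reply can belong to one of three passes: the initial lock grab,
// a removeAllLocks() pass or a confirmLockAcquisition() pass. the
// m_removing and m_confirming flags tell us which.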
bool Msg12::gotLockReply ( UdpSlot *slot ) {
// no longer use this
char *xx=NULL;*xx=0;
// got reply
m_numReplies++;
// don't let udpserver free the request, it's our m_request[]
slot->m_sendBufAlloc = NULL;
// check for a hammer reply
char *reply = slot->m_readBuf;
int32_t replySize = slot->m_readBufSize;
// if error, treat as a not grant
if ( g_errno ) {
bool logIt = true;
// note it
if ( g_conf.m_logDebugSpider )
log("spider: got msg12 reply error = %s",
mstrerror(g_errno));
// if we got an ETRYAGAIN when trying to confirm our lock
// that means doledb was saving/dumping to disk and we
// could not remove the record from doledb and add an
// entry to the waiting tree, so we need to keep trying
if ( g_errno == ETRYAGAIN && m_confirming ) {
// count it again
m_numRequests++;
// use what we were using
char *request = (char *)&m_confirmRequest;
int32_t requestSize = sizeof(ConfirmRequest);
Host *h = g_hostdb.getHost(slot->m_hostId);
// send request to him
UdpServer *us = &g_udpServer;
if ( ! us->sendRequest ( request ,
requestSize ,
0x12 , // msgType
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL , // retSlotPtrPt
this , // state data
gotLockReplyWrapper ,
60*60*24*365 ) )
return false;
// error?
// don't spam the log!
static int32_t s_last = 0;
int32_t now = getTimeLocal();
if ( now - s_last >= 1 ) {
s_last = now;
log("spider: error re-sending confirm "
"request: %s", mstrerror(g_errno));
}
}
// only log ETRYAGAIN errors once every few seconds
if ( g_errno == ETRYAGAIN ) {
static time_t s_lastTime = 0;
time_t now = getTimeLocal();
logIt = false;
if ( now - s_lastTime >= 3 ) {
logIt = true;
s_lastTime = now;
}
}
if ( logIt )
log ( "sploop: host had error getting lock url=%s"
": %s" ,
m_url,mstrerror(g_errno) );
}
// grant or not
if ( replySize == 1 && ! g_errno && *reply == 1 ) m_grants++;
// wait for all to get back
if ( m_numReplies < m_numRequests ) return false;
// all done if we were removing
if ( m_removing ) {
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: done removing all locks "
"(replies=%"INT32") for %s",
m_numReplies,m_url);//m_sreq->m_url);
// we are done
m_gettingLocks = false;
return true;
}
// all done if we were confirming
if ( m_confirming ) {
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: done confirming all locks "
"for %s uh48=%"INT64"",m_url,m_origUh48);//m_sreq->m_url);
// we are done
m_gettingLocks = false;
// . keep processing
// . if the collection was nuked from under us the spiderUrl2
// will return true and set g_errno
if ( ! m_callback ) return g_spiderLoop.spiderUrl2();
// if we had a callback let our parent call it
return true;
}
// if got ALL locks, spider it
if ( m_grants == m_numReplies ) {
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: got lock for docid=lockkey=%"UINT64"",
m_lockKeyUh48);
// flag this
m_hasLock = true;
// we are done
//m_gettingLocks = false;
///////
//
// now tell our group (shard) to remove from doledb
// and re-add to waiting tree. the evalIpLoop() function
// should skip this probable docid because it is in the
// LOCK TABLE!
//
// This logic should allow us to spider multiple urls
// from the same IP at the same time.
//
///////
// returns false if would block
if ( ! confirmLockAcquisition ( ) ) return false;
// . we did it without blocking, maybe cuz we are a single node
// . ok, they are all back, resume loop
// . if the collection was nuked from under us the spiderUrl2
// will return true and set g_errno
if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
// all done
return true;
}
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: missed lock for %s lockkey=%"UINT64" "
"(grants=%"INT32")", m_url,m_lockKeyUh48,m_grants);
// . if it was locked by another then add to our lock cache so we do
// not try to lock it again
// . if grants is not 0 then one host granted us the lock, but not
// all hosts, so we should probably keep trying on it until it is
// locked up by one host
if ( m_grants == 0 ) {
int32_t now = getTimeGlobal();
g_spiderLoop.m_lockCache.addLong(0,m_lockKeyUh48,now,NULL);
}
// reset again
m_numRequests = 0;
m_numReplies = 0;
// no need to remove them if none were granted because another
// host in our group might have it 100% locked.
if ( m_grants == 0 ) {
// no longer in locks operation mode
m_gettingLocks = false;
// ok, they are all back, resume loop
//if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
// all done
return true;
}
// note that
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: sending request to all in shard to "
"remove lock uh48=%"UINT64". grants=%"INT32"",
m_lockKeyUh48,(int32_t)m_grants);
// remove all locks we tried to get, BUT only if from our hostid!
// no no! that doesn't quite work right... we might be the ones
// locking it! i.e. another one of our spiders has it locked...
if ( ! removeAllLocks ( ) ) return false; // true;
// if did not block, how'd that happen?
log("sploop: did not block in removeAllLocks: %s",mstrerror(g_errno));
return true;
}
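// . tell every live host in our shard to drop the lock we tried to get
// (lr->m_removeLock = 1)
// . returns false if we sent at least one request and must wait for the
// replies, true otherwise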
bool Msg12::removeAllLocks ( ) {
// ensure not in use. no msg12 replies outstanding.
if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }
// no longer use this
char *xx=NULL;*xx=0;
// skip if injecting
//if ( m_sreq->m_isInjecting ) return true;
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: removing all locks for %s %"UINT64"",
m_url,m_lockKeyUh48);
// we are now removing
m_removing = true;
LockRequest *lr = &m_lockRequest;
lr->m_lockKeyUh48 = m_lockKeyUh48;
lr->m_lockSequence = m_lockSequence;
lr->m_firstIp = m_firstIp;
lr->m_removeLock = 1;
// reset counts
m_numRequests = 0;
m_numReplies = 0;
// make that the request
// . point to start of the 12 byte request buffer
// . m_lockSequence should still be valid
char *request = (char *)lr;//m_lockKey;
int32_t requestSize = sizeof(LockRequest);//12;
// now the locking group is based on the probable docid
//uint32_t groupId = g_hostdb.getGroupIdFromDocId(m_lockKeyUh48);
// ptr to list of hosts in the group
//Host *hosts = g_hostdb.getGroup ( groupId );
Host *hosts = g_hostdb.getMyShard();
// this must select the same group that is going to spider it!
// i.e. our group! because we check our local lock table to see
// if a doled url is locked before spidering it ourselves.
//Host *hosts = g_hostdb.getMyGroup();
// shortcut
UdpServer *us = &g_udpServer;
// set the hi bit though for this one
//m_lockKey |= 0x8000000000000000LL;
// get # of hosts in each mirror group
int32_t hpg = g_hostdb.getNumHostsPerShard();
// loop over hosts in that shard
for ( int32_t i = 0 ; i < hpg ; i++ ) {
// get a host
Host *h = &hosts[i];
// skip if dead! no need to get a reply from dead guys
if ( g_hostdb.isDead ( h ) ) continue;
// send request to him
if ( ! us->sendRequest ( request ,
requestSize ,
0x12 , // msgType
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL , // retSlotPtrPtr
this , // state data
gotLockReplyWrapper ,
60*60*24*365 ) )
// udpserver returns false and sets g_errno on error
return true;
// count them
m_numRequests++;
}
// block?
if ( m_numRequests > 0 ) return false;
// did not block
return true;
}
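// . once every host granted the lock, send a ConfirmRequest to our shard
// so each host deletes the doledb rec and re-adds the firstip to its
// waiting tree (see handleRequest12() below)
// . returns false if we must wait for the confirm replies, true otherwise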
bool Msg12::confirmLockAcquisition ( ) {
// ensure not in use. no msg12 replies outstanding.
if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }
// no longer use this
char *xx=NULL;*xx=0;
// we are now removing
m_confirming = true;
// make that the request
// . point to start of the 12 byte request buffer
// . m_lockSequence should still be valid
ConfirmRequest *cq = &m_confirmRequest;
char *request = (char *)cq;
int32_t requestSize = sizeof(ConfirmRequest);
// sanity
if ( requestSize == sizeof(LockRequest)){ char *xx=NULL;*xx=0; }
// set it
cq->m_collnum = m_collnum;
cq->m_doledbKey = m_doledbKey;
cq->m_firstIp = m_firstIp;
cq->m_lockKeyUh48 = m_lockKeyUh48;
cq->m_maxSpidersOutPerIp = m_maxSpidersOutPerIp;
// . use the locking group from when we sent the lock request
// . get ptr to list of hosts in the group
//Host *hosts = g_hostdb.getGroup ( m_lockGroupId );
// the same group (shard) that has the spiderRequest/Reply is
// the one responsible for locking.
Host *hosts = g_hostdb.getMyShard();
// this must select the same shard that is going to spider it!
// i.e. our shard! because we check our local lock table to see
// if a doled url is locked before spidering it ourselves.
//Host *hosts = g_hostdb.getMyShard();
// shortcut
UdpServer *us = &g_udpServer;
// get # of hosts in each mirror group
int32_t hpg = g_hostdb.getNumHostsPerShard();
// reset counts
m_numRequests = 0;
m_numReplies = 0;
// note it
if ( g_conf.m_logDebugSpider )
log("spider: confirming lock for uh48=%"UINT64" firstip=%s",
m_lockKeyUh48,iptoa(m_firstIp));
// loop over hosts in that shard
for ( int32_t i = 0 ; i < hpg ; i++ ) {
// get a host
Host *h = &hosts[i];
// skip if dead! no need to get a reply from dead guys
if ( g_hostdb.isDead ( h ) ) continue;
// send request to him
if ( ! us->sendRequest ( request ,
// a request of sizeof(ConfirmRequest) means confirm
requestSize ,
0x12 , // msgType
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL , // retSlotPtrPtr
this , // state data
gotLockReplyWrapper ,
60*60*24*365 ) )
// udpserver returns false and sets g_errno on error
return true;
// count them
m_numRequests++;
}
// block?
if ( m_numRequests > 0 ) return false;
// did not block
return true;
}
// use -1 for any collnum
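// counts confirmed, outstanding locks in m_lockTable that are on this
// firstIp (and on this collnum if collnum is not -1)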
int32_t SpiderLoop::getNumSpidersOutPerIp ( int32_t firstIp , collnum_t collnum ) {
int32_t count = 0;
// count locks
HashTableX *ht = &g_spiderLoop.m_lockTable;
// scan the slots
int32_t ns = ht->m_numSlots;
for ( int32_t i = 0 ; i < ns ; i++ ) {
// breathe
//QUICKPOLL(niceness);
// skip if empty
if ( ! ht->m_flags[i] ) continue;
// cast lock
UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
// skip if the spider is no longer outstanding; the lock is just
// lingering in its 5-second expiration window after the spiderReply
// returned, so a concurrent lock request for the same url gets denied
if ( ! lock->m_spiderOutstanding ) continue;
// must be confirmed too
if ( ! lock->m_confirmed ) continue;
// correct collnum?
if ( lock->m_collnum != collnum && collnum != -1 ) continue;
// count it if the firstIp matches
if ( lock->m_firstIp == firstIp ) count++;
}
/*
for ( int32_t i = 0 ; i <= m_maxUsed ; i++ ) {
// get it
XmlDoc *xd = m_docs[i];
// skip if empty
if ( ! xd ) continue;
// check it
if ( xd->m_firstIp == firstIp ) count++;
}
*/
return count;
}
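// . overview of the msg12 (0x12) wire protocol handled below
// . the request type is distinguished purely by its size:
//   sizeof(LockRequest)    = acquire a lock, or release it if
//                            m_removeLock is set
//   sizeof(ConfirmRequest) = confirm a previously granted lock and
//                            delete the doledb record it covers
// . the reply is a single byte: 1 = granted/ok, 0 = refused
// . a rough sketch of the dispatch in handleRequest12(), for
//   orientation only:
//
//     if      ( reqSize == sizeof(ConfirmRequest) ) { /* confirm     */ }
//     else if ( reqSize == sizeof(LockRequest)    ) { /* lock/unlock */ }
//     else    us->sendErrorReply ( udpSlot , EBADREQUEST );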
void handleRequest12 ( UdpSlot *udpSlot , int32_t niceness ) {
// get request
char *request = udpSlot->m_readBuf;
int32_t reqSize = udpSlot->m_readBufSize;
// shortcut
UdpServer *us = &g_udpServer;
// breathe
QUICKPOLL ( niceness );
// shortcut
char *reply = udpSlot->m_tmpBuf;
//
// . is it confirming that he got all the locks?
// . if so, remove the doledb record and dock the doleiptable count
// before adding a waiting tree entry to re-pop the doledb record
//
if ( reqSize == sizeof(ConfirmRequest) ) {
char *msg = NULL;
ConfirmRequest *cq = (ConfirmRequest *)request;
// confirm the lock
HashTableX *ht = &g_spiderLoop.m_lockTable;
int32_t slot = ht->getSlot ( &cq->m_lockKeyUh48 );
if ( slot < 0 ) {
log("spider: got a confirm request for a key not "
"in the table! coll must have been deleted "
" or reset "
"while lock request was outstanding.");
g_errno = EBADENGINEER;
us->sendErrorReply ( udpSlot , g_errno );
return;
//char *xx=NULL;*xx=0; }
}
UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot );
lock->m_confirmed = true;
// note that
if ( g_conf.m_logDebugSpider ) // Wait )
log("spider: got confirm lock request for ip=%s",
iptoa(lock->m_firstIp));
// get it
SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum);
// make it negative
cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL;
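// (clearing the low bit of n0 makes this a "negative" key, i.e. a
//  delete record; adding it to doledb below wipes out the positive
//  record for this url)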
// and add the negative rec to doledb (deletion operation)
Rdb *rdb = &g_doledb.m_rdb;
if ( ! rdb->addRecord ( cq->m_collnum,
(char *)&cq->m_doledbKey,
NULL , // data
0 , //dataSize
1 )){ // niceness
// tree is dumping or something, probably ETRYAGAIN
if ( g_errno != ETRYAGAIN ) {
msg = "error adding neg rec to doledb";
log("spider: %s %s",msg,mstrerror(g_errno));
}
//char *xx=NULL;*xx=0;
us->sendErrorReply ( udpSlot , g_errno );
return;
}
// now remove from doleiptable since we removed from doledb
if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp );
// how many spiders outstanding for this coll and IP?
//int32_t out=g_spiderLoop.getNumSpidersOutPerIp ( cq->m_firstIp);
// DO NOT add back to waiting tree if max spiders
// out per ip was 1 OR there was a crawldelay. but better
// yet, take care of that in the winReq code above.
// . now add to waiting tree so we add another spiderdb
// record for this firstip to doledb
// . true = callForScan
// . do not add to waiting tree if we have enough outstanding
// spiders for this ip. we will add to waiting tree when
// we receive a SpiderReply in addSpiderReply()
if ( sc && //out < cq->m_maxSpidersOutPerIp &&
// this will just return true if we are not the
// responsible host for this firstip
// DO NOT populate from this!!! say "false" here...
! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) &&
// must be an error...
g_errno ) {
msg = "FAILED TO ADD TO WAITING TREE";
log("spider: %s %s",msg,mstrerror(g_errno));
us->sendErrorReply ( udpSlot , g_errno );
return;
}
// success!!
reply[0] = 1;
us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
return;
}
// sanity check
if ( reqSize != sizeof(LockRequest) ) {
log("spider: bad msg12 request size of %"INT32"",reqSize);
us->sendErrorReply ( udpSlot , EBADREQUEST );
return;
}
// deny it if we are not synced yet! otherwise we core in
// getTimeGlobal() below
if ( ! isClockInSync() ) {
// log it so we can debug it
//log("spider: clock not in sync with host #0. so "
// "returning etryagain for lock reply");
// let admin know why we are not spidering
us->sendErrorReply ( udpSlot , ETRYAGAIN );
return;
}
LockRequest *lr = (LockRequest *)request;
//uint64_t lockKey = *(int64_t *)request;
//int32_t lockSequence = *(int32_t *)(request+8);
// is this a remove operation? assume not
//bool remove = false;
// get top bit
//if ( lockKey & 0x8000000000000000LL ) remove = true;
// mask it out
//lockKey &= 0x7fffffffffffffffLL;
// sanity check, just 6 bytes! (48 bits)
if ( lr->m_lockKeyUh48 &0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
// note it
if ( g_conf.m_logDebugSpider )
log("spider: got msg12 request uh48=%"INT64" remove=%"INT32"",
lr->m_lockKeyUh48, (int32_t)lr->m_removeLock);
// get time
int32_t nowGlobal = getTimeGlobal();
// shortcut
HashTableX *ht = &g_spiderLoop.m_lockTable;
int32_t hostId = g_hostdb.getHostId ( udpSlot->m_ip , udpSlot->m_port );
// this must be legit - sanity check
if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
// remove expired locks from locktable
removeExpiredLocks ( hostId );
int64_t lockKey = lr->m_lockKeyUh48;
// check tree
int32_t slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 );
// put it here
UrlLock *lock = NULL;
// if it's already in there we will refuse the new lock below
if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot );
// if doing a remove operation and that was our hostid then unlock it
if ( lr->m_removeLock &&
lock &&
lock->m_hostId == hostId &&
lock->m_lockSequence == lr->m_lockSequence ) {
// note it for now
if ( g_conf.m_logDebugSpider )
log("spider: removing lock for lockkey=%"UINT64" hid=%"INT32"",
lr->m_lockKeyUh48,hostId);
// unlock it
ht->removeSlot ( slot );
// it is gone
lock = NULL;
}
// ok, at this point all remove ops return
if ( lr->m_removeLock ) {
reply[0] = 1;
us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
return;
}
/////////
//
// add new lock
//
/////////
// if lock > 1 hour old then remove it automatically!!
if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) {
// note it for now
log("spider: removing lock after %"INT32" seconds "
"for lockKey=%"UINT64" hid=%"INT32"",
(nowGlobal - lock->m_timestamp),
lr->m_lockKeyUh48,hostId);
// unlock it
ht->removeSlot ( slot );
// it is gone
lock = NULL;
}
// if lock still there, do not grant another lock
if ( lock ) {
// note it for now
if ( g_conf.m_logDebugSpider )
log("spider: refusing lock for lockkey=%"UINT64" hid=%"INT32"",
lr->m_lockKeyUh48,hostId);
reply[0] = 0;
us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
return;
}
// make the new lock
UrlLock tmp;
tmp.m_hostId = hostId;
tmp.m_lockSequence = lr->m_lockSequence;
tmp.m_timestamp = nowGlobal;
tmp.m_expires = 0;
tmp.m_firstIp = lr->m_firstIp;
tmp.m_collnum = lr->m_collnum;
// when the spider returns we remove its lock on reception of the
// spiderReply. actually, we just set the m_expires time to 5 seconds
// into the future in case a request to get a lock for that url is
// currently in progress. m_spiderOutstanding indicates whether the
// spider is still in progress, so that addToWaitingTree() does not
// count lingering, already-completed locks towards a "max spiders
// per IP" quota when deciding if it should add a new entry for this
// IP. here we are granting a fresh lock, so the spider is outstanding.
tmp.m_spiderOutstanding = true;
// this is set when all hosts in the group (shard) have granted the
// lock and the host sends out a confirmLockAcquisition() request.
// until then we do not know if the lock will be granted by all hosts
// in the group (shard)
tmp.m_confirmed = false;
// put it into the table
if ( ! ht->addKey ( &lockKey , &tmp ) ) {
// return error if that failed!
us->sendErrorReply ( udpSlot , g_errno );
return;
}
// note it for now
if ( g_conf.m_logDebugSpider )
log("spider: granting lock for lockKey=%"UINT64" hid=%"INT32"",
lr->m_lockKeyUh48,hostId);
// grant the lock
reply[0] = 1;
us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
return;
}
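// . rough lifecycle of a UrlLock in g_spiderLoop.m_lockTable:
//   1. granted above with m_confirmed = false, m_spiderOutstanding = true
//   2. once the whole shard grants it, the requester sends a
//      ConfirmRequest; the confirm branch above sets m_confirmed = true,
//      adds the negative doledb key and re-adds the firstip to the
//      waiting tree
//   3. when the spider completes, the lock is not removed immediately;
//      m_expires is pushed a few seconds into the future so a lock
//      request for the same url that is already in flight still gets
//      refused
//   4. removeExpiredLocks() below reaps it once m_expires passes, and
//      handleRequest12() also drops any lock older than MAX_LOCK_AGE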
// hostId is the remote hostid sending us the lock request
void removeExpiredLocks ( int32_t hostId ) {
// when we last cleaned them out
static time_t s_lastTime = 0;
int32_t nowGlobal = getTimeGlobalNoCore();
int32_t niceness = MAX_NICENESS;
// only do this once per second at the most
if ( nowGlobal <= s_lastTime ) return;
// shortcut
HashTableX *ht = &g_spiderLoop.m_lockTable;
restart:
// scan the slots
int32_t ns = ht->m_numSlots;
// . clean out expired locks...
// . if lock was there and m_expires is up, then nuke it!
// . when Rdb.cpp receives the "fake" title rec it removes the
// lock, only it just sets the m_expires to a few seconds in the
// future to give the negative doledb key time to be absorbed.
// that way we don't repeat the same url we just got done spidering.
// . this happens when we launch our lock request on a url that we
// or a twin is spidering or has just finished spidering, and
// we get the lock, but we avoided the negative doledb key.
for ( int32_t i = 0 ; i < ns ; i++ ) {
// breathe
QUICKPOLL(niceness);
// skip if empty
if ( ! ht->m_flags[i] ) continue;
// cast lock
UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
int64_t lockKey = *(int64_t *)ht->getKeyFromSlot(i);
// if collnum got deleted or reset
collnum_t collnum = lock->m_collnum;
if ( collnum >= g_collectiondb.m_numRecs ||
! g_collectiondb.m_recs[collnum] ) {
log("spider: removing lock from missing collnum "
"%"INT32"",(int32_t)collnum);
goto nuke;
}
// skip if not yet expired
if ( lock->m_expires == 0 ) continue;
if ( lock->m_expires >= nowGlobal ) continue;
// note it for now
if ( g_conf.m_logDebugSpider )
log("spider: removing lock after waiting. elapsed=%"INT32"."
" lockKey=%"UINT64" hid=%"INT32" expires=%"UINT32" "
"nowGlobal=%"UINT32"",
(nowGlobal - lock->m_timestamp),
lockKey,hostId,
(uint32_t)lock->m_expires,
(uint32_t)nowGlobal);
nuke:
// nuke the slot and possibly re-chain
ht->removeSlot ( i );
// gotta restart from the top since table may have shrunk
goto restart;
}
// store it
s_lastTime = nowGlobal;
}
/////////////////////////
///////////////////////// PAGESPIDER
/////////////////////////
// don't change name to "State" cuz that might conflict with another
class State11 {
public:
int32_t m_numRecs;
Msg5 m_msg5;
RdbList m_list;
TcpSocket *m_socket;
HttpRequest m_r;
collnum_t m_collnum;
char *m_coll;
int32_t m_count;
key_t m_startKey;
key_t m_endKey;
int32_t m_minRecSizes;
bool m_done;
SafeBuf m_safeBuf;
int32_t m_priority;
};
static bool loadLoop ( class State11 *st ) ;
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the urls we got in doledb
// . doledb is sorted by priority complement then spider time
// . do not show urls in doledb whose spider time has not yet been reached,
// so only show the urls spiderable now
// . call g_httpServer.sendDynamicPage() to send it
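// . the scan below walks doledb one priority at a time, highest first:
//   printList() decrements st->m_priority and rebuilds the key range
//   until it drops below zero. a synchronous sketch of the same idea:
//
//     for ( int32_t p = MAX_SPIDER_PRIORITIES - 1 ; p >= 0 ; p-- ) {
//             key_t sk = g_doledb.makeFirstKey2 ( p );
//             key_t ek = g_doledb.makeLastKey2  ( p );
//             // ... msg5 scan of [sk,ek] for spiderable urls ...
//     }
//
//   the real code is asynchronous and resumes from st->m_startKey as
//   each list comes back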
bool sendPageSpiderdb ( TcpSocket *s , HttpRequest *r ) {
// set up a msg5 and RdbLists to get the urls from spider queue
State11 *st ;
try { st = new (State11); }
catch ( ... ) {
g_errno = ENOMEM;
log("PageSpiderdb: new(%i): %s",
(int)sizeof(State11),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
mnew ( st , sizeof(State11) , "PageSpiderdb" );
// get the priority/#ofRecs from the cgi vars
st->m_numRecs = r->getLong ("n", 20 );
st->m_r.copy ( r );
// get collection name
char *coll = st->m_r.getString ( "c" , NULL , NULL );
// get the collection record to see if they have permission
//CollectionRec *cr = g_collectiondb.getRec ( coll );
// the socket read buffer will remain until the socket is destroyed
// and "coll" points into that
st->m_coll = coll;
CollectionRec *cr = g_collectiondb.getRec(coll);
if ( cr ) st->m_collnum = cr->m_collnum;
else st->m_collnum = -1;
// set socket for replying in case we block
st->m_socket = s;
st->m_count = 0;
st->m_priority = MAX_SPIDER_PRIORITIES - 1;
// get startKeys/endKeys/minRecSizes
st->m_startKey = g_doledb.makeFirstKey2 (st->m_priority);
st->m_endKey = g_doledb.makeLastKey2 (st->m_priority);
st->m_minRecSizes = 20000;
st->m_done = false;
// returns false if blocked, true otherwise
return loadLoop ( st ) ;
}
static void gotListWrapper3 ( void *state , RdbList *list , Msg5 *msg5 ) ;
static bool sendPage ( State11 *st );
static bool printList ( State11 *st );
bool loadLoop ( State11 *st ) {
loop:
// let's get the local list for THIS machine (use msg5)
if ( ! st->m_msg5.getList ( RDB_DOLEDB ,
st->m_collnum ,
&st->m_list ,
st->m_startKey ,
st->m_endKey ,
st->m_minRecSizes ,
true , // include tree
false , // add to cache
0 , // max age
0 , // start file #
-1 , // # files
st , // callback state
gotListWrapper3 ,
0 , // niceness
true )) // do err correction
return false;
// print it. returns false on error
if ( ! printList ( st ) ) st->m_done = true;
// check if done
if ( st->m_done ) {
// send the page back
sendPage ( st );
// bail
return true;
}
// otherwise, load more
goto loop;
}
void gotListWrapper3 ( void *state , RdbList *list , Msg5 *msg5 ) {
// cast it
State11 *st = (State11 *)state;
// print it. returns false on error
if ( ! printList ( st ) ) st->m_done = true;
// check if done
if ( st->m_done ) {
// send the page back
sendPage ( st );
// bail
return;
}
// otherwise, load more
loadLoop( (State11 *)state );
}
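// . loadLoop() and gotListWrapper3() implement the usual non-blocking
//   scan pattern: Msg5::getList() returns false when it blocks and later
//   calls the wrapper, which prints the list it got and either sends the
//   page (when done) or re-enters loadLoop() for the next chunk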
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool printList ( State11 *st ) {
// useful
time_t nowGlobal ;
if ( isClockInSync() ) nowGlobal = getTimeGlobal();
else nowGlobal = getTimeLocal();
// print the spider recs we got
SafeBuf *sbTable = &st->m_safeBuf;
// shortcut
RdbList *list = &st->m_list;
// row count
int32_t j = 0;
// put it in there
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// stop if we got enough
if ( st->m_count >= st->m_numRecs ) break;
// get the doledb key
key_t dk = list->getCurrentKey();
// update to that
st->m_startKey = dk;
// inc by one
st->m_startKey += 1;
// get spider time from that
int32_t spiderTime = g_doledb.getSpiderTime ( &dk );
// skip if in future
if ( spiderTime > nowGlobal ) continue;
// point to the spider request *RECORD*
char *rec = list->getCurrentData();
// skip negatives
if ( (dk.n0 & 0x01) == 0 ) continue;
// count it
st->m_count++;
// sanity: a doledb rec this small can not hold a SpiderRequest
// (12-byte key + 4-byte dataSize alone is 16 bytes)
if ( list->getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0;}
// sanity check. requests ONLY in doledb
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec )) {
char*xx=NULL;*xx=0;}
// get the spider rec, encapsed in the data of the doledb rec
SpiderRequest *sreq = (SpiderRequest *)rec;
// print it into sbTable
if ( ! sreq->printToTable ( sbTable,"ready",NULL,j))
return false;
// count row
j++;
}
// need to load more?
if ( st->m_count >= st->m_numRecs ||
// if the list was partial, then this priority is exhausted
list->getListSize() < st->m_minRecSizes ) {
// . try next priority
// . if below 0 we are done
if ( --st->m_priority < 0 ) st->m_done = true;
// get startKeys/endKeys/minRecSizes
st->m_startKey = g_doledb.makeFirstKey2 (st->m_priority);
st->m_endKey = g_doledb.makeLastKey2 (st->m_priority);
// if we printed something, print a blank line after it
if ( st->m_count > 0 )
sbTable->safePrintf("<tr><td colspan=30>..."
"</td></tr>\n");
// reset for each priority
st->m_count = 0;
}
return true;
}
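// . note on resumption: printList() advances st->m_startKey to one past
//   the last doledb key it processed, so the next Msg5::getList() in
//   loadLoop() picks up exactly where this chunk left off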
bool sendPage ( State11 *st ) {
// sanity check
//if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//SafeBuf sb; sb.safePrintf("Error = %s",mstrerror(g_errno));
// shortcut
SafeBuf *sbTable = &st->m_safeBuf;
// generate a query string to pass to host bar
char qs[64]; sprintf ( qs , "&n=%"INT32"", st->m_numRecs );
// store the page in here!
SafeBuf sb;
sb.reserve ( 64*1024 );
g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r , qs );
// get spider coll
collnum_t collnum = g_collectiondb.getCollnum ( st->m_coll );
// and coll rec
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
// get the socket
TcpSocket *s = st->m_socket;
// then we can nuke the state
mdelete ( st , sizeof(State11) , "PageSpiderdb" );
delete (st);
// erase g_errno for sending
g_errno = 0;
// now encapsulate it in html head/tail and send it off
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),
sb.length() );
}
// print reason why spiders are not active for this collection
int32_t tmp2;
SafeBuf mb;
if ( cr ) getSpiderStatusMsg ( cr , &mb , &tmp2 );
if ( mb.length() && tmp2 != SP_INITIALIZING )
sb.safePrintf(//"<center>"
"<table cellpadding=5 "
//"style=\""
//"border:2px solid black;"
"max-width:600px\" "
"border=0"
">"
"<tr>"
//"<td bgcolor=#ff6666>"
"<td>"
"For collection <i>%s</i>: "
"<b><font color=red>%s</font></b>"
"</td>"
"</tr>"
"</table>\n"
, cr->m_coll
, mb.getBufStart() );
// begin the table
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
//"<center>"
"<b>Currently Spidering on This Host</b>"
" (%"INT32" spiders)"
//" (%"INT32" locks)"
//"</center>"
"</td></tr>\n"
, TABLE_STYLE
, (int32_t)g_spiderLoop.m_numSpidersOut
//, g_spiderLoop.m_lockTable.m_numSlotsUsed
);
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest::printTableHeader ( &sb , true ) ) return false;
// shortcut
XmlDoc **docs = g_spiderLoop.m_docs;
// count # of spiders out
int32_t j = 0;
// first print the spider recs we are spidering
for ( int32_t i = 0 ; i < (int32_t)MAX_SPIDERS ; i++ ) {
// get it
XmlDoc *xd = docs[i];
// skip if empty
if ( ! xd ) continue;
// sanity check
if ( ! xd->m_sreqValid ) { char *xx=NULL;*xx=0; }
// grab it
SpiderRequest *oldsr = &xd->m_sreq;
// get status
char *status = xd->m_statusMsg;
// show that
if ( ! oldsr->printToTable ( &sb , status,xd,j) ) return false;
// inc count
j++;
}
// now print the injections as well!
XmlDoc *xd = getInjectHead ( ) ;
for ( ; xd ; xd = xd->m_nextInject ) {
// how does this happen?
if ( ! xd->m_sreqValid ) continue;
// grab it
SpiderRequest *oldsr = &xd->m_sreq;
// get status
SafeBuf xb;
xb.safePrintf("[<font color=red><b>injecting</b></font>] %s",
xd->m_statusMsg);
char *status = xb.getBufStart();
// show that
if ( ! oldsr->printToTable ( &sb , status,xd,j) ) return false;
// inc count
j++;
}
// end the table
sb.safePrintf ( "</table>\n" );
sb.safePrintf ( "<br>\n" );
/*
if ( g_spiderCache.m_numMsgSamples > 0 ) {
sb.safePrintf (
"<table width=100%% bgcolor=#%s "
"cellpadding=4 border=1 >"
"<tr>"
"<td colspan=3 bgcolor=#%s>"
"<b>Proportion of Spider Time Spent in "
"Section.</b>"
"</td>"
"</tr>\n",
LIGHT_BLUE ,
DARK_BLUE );
HashTableT<uint32_t,float>* m = &g_spiderCache.m_spiderMsgs;
for(int32_t i = 0; i < m->getNumSlots();i++) {
if(m->getKey(i) == 0) continue;
sb.safePrintf (
"<tr>"
"<td>%.2f%%</td>"
"<td>%.0f</td>"
"<td>%s</td>"
"</tr>\n",
100*m->getValueFromSlot(i)/
g_spiderCache.m_numMsgSamples,
m->getValueFromSlot(i),
(char*)m->getKey(i));
}
sb.safePrintf ("</table>\n");
}
*/
/*
// try to put these in tool tips
// describe the various parms
sb.safePrintf (
"<table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>"
"<tr>"
"<td colspan=2 bgcolor=#%s>"
"<b>Status descriptions</b>"
"</td>"
"</tr>\n"
"<tr>"
//"<td>getting link info</td><td>performing "
"<td>getting site title buf</td><td>getting "
"the title and all inlinker text of the root page."
"</td></tr>"
"<tr>"
"<td>getting outlink ip vector</td><td>getting "
"ips of the outlinks. Gets from tagdb firstip "
"tag if it exists."
"</td></tr>"
"<tr>"
"<td>getting robots.txt</td><td>downloading the "
"robots.txt file for this url."
"</td></tr>"
"<tr>"
"<td>checking for dup</td><td>looking up the url's "
"docid in checksumdb to see if its content checksum "
"is in use by another indexed document from the same "
"site. Will index even if it is a dup if it has a "
"higher quality."
"</td></tr>"
"<tr>"
"<td>getting web page</td><td>downloading the web "
"page."
"</td></tr>"
"<tr>"
"<td><nobr>getting cached web page</nobr></td><td>"
"looking up the "
"old record for this url in titledb to see how the "
"content changed."
"</td></tr>"
"<tr>"
"<td>adding links</td><td>adding links from the page "
"to spiderdb. Links are distributed to the host that "
"stores them based on the hash of the link. Make sure "
"&lt;tfndbMaxPageCacheMem&gt; is high enough to keep "
"tfndb disk seeks down. A tfndb access is done for "
"every link added."
"</td></tr>"
"</table><br><br>\n\n",
LIGHT_BLUE ,
DARK_BLUE );
*/
// then spider collection
//SpiderColl *sc = g_spiderCache.m_spiderColls[collnum];
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
//
// spiderdb rec stats, from scanning spiderdb
//
// if not there, forget about it
if ( sc ) sc->printStats ( sb );
//
// Spiders Table
//
int64_t totalPoints = g_stats.m_totalSpiderSuccessNew +
g_stats.m_totalSpiderErrorsNew +
g_stats.m_totalSpiderSuccessOld +
g_stats.m_totalSpiderErrorsOld;
int64_t totalNew = g_stats.m_totalSpiderSuccessNew +
g_stats.m_totalSpiderErrorsNew;
int64_t totalOld = g_stats.m_totalSpiderSuccessOld +
g_stats.m_totalSpiderErrorsOld;
double tsr = 100.00;
double nsr = 100.00;
double osr = 100.00;
if ( totalPoints > 0 ) {
tsr = 100.00*
(double)(g_stats.m_totalSpiderSuccessNew +
g_stats.m_totalSpiderSuccessOld) /
(double)totalPoints;
if ( totalNew > 0 )
nsr= 100.00*(double)(g_stats.m_totalSpiderSuccessNew) /
(double)(totalNew);
if ( totalOld > 0 )
osr= 100.00*(double)(g_stats.m_totalSpiderSuccessOld) /
(double)(totalOld);
}
int32_t points = g_stats.m_spiderSample;
if ( points > 1000 ) points = 1000;
int32_t sampleNew = g_stats.m_spiderNew;
int32_t sampleOld = points - g_stats.m_spiderNew;
double tssr = 100.00;
double nssr = 100.00;
double ossr = 100.00;
if ( points > 0 ) {
tssr = 100.00*
(double)(points -
g_stats.m_spiderErrors) / (double)points ;
if ( sampleNew > 0 )
nssr = 100.00*(double)(sampleNew -
g_stats.m_spiderErrorsNew) /
(double)(sampleNew);
if ( sampleOld > 0 )
ossr = 100.00*(double)(sampleOld -
(g_stats.m_spiderErrors -
g_stats.m_spiderErrorsNew)) /
(double)(sampleOld);
}
sb.safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
sb.safePrintf (
"<table %s>"
"<tr>"
"<td colspan=7>"
"<center><b>Spider Stats</b></td></tr>\n"
"<tr bgcolor=#%s><td>"
"</td><td><b>Total</b></td>"
"<td><b>Total New</b></td>"
"<td><b>Total Old</b></td>"
"<td><b>Sample</b></td>"
"<td><b>Sample New</b></td>"
"<td><b>Sample Old</b></b>"
"</td></tr>"
"<tr class=poo><td><b>Total Spiders</n>"
"</td><td>%"INT64"</td><td>%"INT64"</td><td>%"INT64"</td>\n"
"</td><td>%"INT32"</td><td>%"INT32"</td><td>%"INT32"</td></tr>\n"
//"<tr class=poo><td><b>Successful Spiders</n>"
//"</td><td>%"INT64"</td><td>%"INT64"</td><td>%"INT64"</td>\n"
//"</td><td>%"INT32"</td><td>%"INT32"</td><td>%"INT32"</td></tr>\n"
//"<tr class=poo><td><b>Failed Spiders</n>"
//"</td><td>%"INT64"</td><td>%"INT64"</td><td>%"INT64"</td>\n"
//"</td><td>%"INT32"</td><td>%"INT32"</td><td>%"INT32"</td></tr>\n"
"<tr class=poo><td><b>Success Rate</b>"
"</td><td>%.02f%%</td><td>%.02f%%</td>"
"</td><td>%.02f%%</td><td>%.02f%%</td>"
"</td><td>%.02f%%</td><td>%.02f%%</td></tr>",
TABLE_STYLE,
DARK_BLUE,
totalPoints,
totalNew,
totalOld,
points,
sampleNew,
sampleOld,
//g_stats.m_totalSpiderSuccessNew +
//g_stats.m_totalSpiderSuccessOld,
//g_stats.m_totalSpiderSuccessNew,
//g_stats.m_totalSpiderSuccessOld,
//g_stats.m_spiderSuccessNew +
//g_stats.m_spiderSuccessOld,
//g_stats.m_spiderSuccessNew,
//g_stats.m_spiderSuccessOld,
//g_stats.m_totalSpiderErrorsNew +
//g_stats.m_totalSpiderErrorsOld,
//g_stats.m_totalSpiderErrorsNew,
//g_stats.m_totalSpiderErrorsOld,
//g_stats.m_spiderErrorsNew +
//g_stats.m_spiderErrorsOld,
//g_stats.m_spiderErrorsNew,
//g_stats.m_spiderErrorsOld,
tsr, nsr, osr, tssr, nssr, ossr );
int32_t bucketsNew[65536];
int32_t bucketsOld[65536];
memset ( bucketsNew , 0 , 65536*4 );
memset ( bucketsOld , 0 , 65536*4 );
for ( int32_t i = 0 ; i < points; i++ ) {
int32_t n = g_stats.m_errCodes[i];
if ( n < 0 || n > 65535 ) {
log("admin: Bad spider error code.");
continue;
}
if ( g_stats.m_isSampleNew[i] )
bucketsNew[n]++;
else
bucketsOld[n]++;
}
for ( int32_t i = 0 ; i < 65536 ; i++ ) {
if ( g_stats.m_allErrorsNew[i] == 0 &&
g_stats.m_allErrorsOld[i] == 0 &&
bucketsNew[i] == 0 && bucketsOld[i] == 0 ) continue;
sb.safePrintf (
"<tr bgcolor=#%s>"
"<td><b><a href=/search?c=%s&q=gbstatusmsg%%3A"
"%%22"
,
LIGHT_BLUE , cr->m_coll );
sb.urlEncode(mstrerror(i));
sb.safePrintf ("%%22>"
"%s"
"</a>"
"</b></td>"
"<td>%"INT64"</td>"
"<td>%"INT64"</td>"
"<td>%"INT64"</td>"
"<td>%"INT32"</td>"
"<td>%"INT32"</td>"
"<td>%"INT32"</td>"
"</tr>\n" ,
mstrerror(i),
g_stats.m_allErrorsNew[i] +
g_stats.m_allErrorsOld[i],
g_stats.m_allErrorsNew[i],
g_stats.m_allErrorsOld[i],
bucketsNew[i] + bucketsOld[i] ,
bucketsNew[i] ,
bucketsOld[i] );
}
sb.safePrintf ( "</table><br>\n" );
// describe the various parms
/*
sb.safePrintf (
"<table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>"
"<tr class=poo>"
"<td colspan=2 bgcolor=#%s>"
"<b>Field descriptions</b>"
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>hits</td><td>The number of attempts that were "
"made by the spider to read a url from the spider "
"queue cache.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>misses</td><td>The number of those attempts that "
"failed to get a url to spider.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>cached</td><td>The number of urls that are "
"currently in the spider queue cache.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>water</td><td>The number of urls that were in the "
"spider queue cache at any one time, since the start "
"of the last disk scan.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>kicked</td><td>The number of urls that were "
"replaced in the spider queue cache with urls loaded "
"from disk, since the start of the last disk scan.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>added</td><td>The number of urls that were added "
"to the spider queue cache since the start of the last "
"disk scan. After a document is spidered its url "
"if often added again to the spider queue cache.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>attempted</td><td>The number of urls that "
"Gigablast attempted to add to the spider queue cache "
"since the start of the last disk scan. In "
"a distributed environment, urls are distributed "
"between twins so not all urls read will "
"make it into the spider queue cache. Also includes "
"spider recs attempted to be re-added to spiderdb "
"after being spidering, but usually with a different "
"spider time.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>nl</td><td>This is 1 iff Gigablast currently "
"needs to reload the spider queue cache from disk.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>rnl</td><td>This is 1 iff Gigablast currently "
"really needs to reload the spider queue cache from "
"disk.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>more</td><td>This is 1 iff there are urls on "
"the disk that are not in the spider queue cache.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>loading</td><td>This is 1 iff Gigablast is "
"currently loading this spider cache queue from "
"disk.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>scanned</td><td>The number of bytes that were "
"read from disk since the start of the last disk "
"scan.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>reads</td><td>The number of disk read "
"operations since the start of the last disk "
"scan.</td>"
"</tr>\n"
"<tr class=poo>"
"<td>elapsed</td><td>The time in seconds that has "
"elapsed since the start or end of the last disk "
"scan, depending on if a scan is currently in "
"progress.</td>"
"</tr>\n"
"</table>\n",
LIGHT_BLUE ,
DARK_BLUE );
*/
// done if no sc
if ( ! sc ) {
// get the socket
TcpSocket *s = st->m_socket;
// then we can nuke the state
mdelete ( st , sizeof(State11) , "PageSpiderdb" );
delete (st);
// erase g_errno for sending
g_errno = 0;
// now encapsulate it in html head/tail and send it off
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),
sb.length() );
}
/////
//
// READY TO SPIDER table
//
/////
int32_t ns = 0;
if ( sc ) ns = sc->m_doleIpTable.getNumSlotsUsed();
// begin the table
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<b>URLs Ready to Spider for collection "
"<font color=red><b>%s</b>"
"</font>"
" (%"INT32" ips in doleiptable)"
,
TABLE_STYLE,
st->m_coll ,
ns );
// print time format: 7/23/1971 10:45:32
time_t nowUTC = getTimeGlobal();
struct tm *timeStruct ;
char time[256];
timeStruct = gmtime ( &nowUTC );
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
sb.safePrintf("</b>" // (current time = %s = %"UINT32") "
"</td></tr>\n"
//,time,nowUTC
);
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest::printTableHeader ( &sb ,false ) ) return false;
// then the doledb spider recs
char *bs = sbTable->getBufStart();
if ( bs && ! sb.safePrintf("%s",bs) ) return false;
// end the table
sb.safePrintf ( "</table>\n" );
sb.safePrintf ( "<br>\n" );
/////////////////
//
// PRINT WAITING TREE
//
// each row is an ip. print the next url to spider for that ip.
//
/////////////////
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<b>IPs Waiting for Selection Scan for collection "
"<font color=red><b>%s</b>"
"</font>"
,
TABLE_STYLE,
st->m_coll );
// print time format: 7/23/1971 10:45:32
int64_t timems = gettimeofdayInMillisecondsGlobal();
sb.safePrintf("</b> (current time = %"UINT64")(totalcount=%"INT32")"
"(waittablecount=%"INT32")",
timems,
sc->m_waitingTree.getNumUsedNodes(),
sc->m_waitingTable.getNumUsedSlots());
double a = (double)g_spiderdb.getUrlHash48 ( &sc->m_firstKey );
double b = (double)g_spiderdb.getUrlHash48 ( &sc->m_endKey );
double c = (double)g_spiderdb.getUrlHash48 ( &sc->m_nextKey );
double percent = (100.0 * (c-a)) ;
if ( b-a > 0 ) percent /= (b-a);
if ( percent > 100.0 ) percent = 100.0;
if ( percent < 0.0 ) percent = 0.0;
sb.safePrintf("(spiderdb scan for ip %s is %.2f%% complete)",
iptoa(sc->m_scanningIp),
(float)percent );
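// e.g. with a=0, b=1000 and c=250 the scan is reported as 25.00%
// complete; the clamps above guard against c falling outside [a,b]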
sb.safePrintf("</td></tr>\n");
sb.safePrintf("<tr bgcolor=#%s>",DARK_BLUE);
sb.safePrintf("<td><b>spidertime (MS)</b></td>\n");
sb.safePrintf("<td><b>firstip</b></td>\n");
sb.safePrintf("</tr>\n");
// then the waiting tree
int32_t node = sc->m_waitingTree.getFirstNode();
int32_t count = 0;
//uint64_t nowMS = gettimeofdayInMillisecondsGlobal();
for ( ; node >= 0 ; node = sc->m_waitingTree.getNextNode(node) ) {
// breathe
QUICKPOLL(MAX_NICENESS);
// get key
key_t *key = (key_t *)sc->m_waitingTree.getKey(node);
// get ip from that
int32_t firstIp = (key->n0) & 0xffffffff;
// get the time
uint64_t spiderTimeMS = key->n1;
// shift up
spiderTimeMS <<= 32;
// or in
spiderTimeMS |= (key->n0 >> 32);
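// (waiting tree key layout: key->n1 holds the top 32 bits of
//  spiderTimeMS, the top 32 bits of key->n0 hold its low 32 bits, and
//  the low 32 bits of key->n0 hold the firstIp -- which is exactly what
//  the unpacking above does)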
char *note = "";
// if a day more in the future -- complain
// no! we set the repeat crawl to 3000 days for crawl jobs that
// do not repeat...
// if ( spiderTimeMS > nowMS + 1000 * 86400 )
// note = " (<b><font color=red>This should not be "
// "this far into the future. Probably a corrupt "
// "SpiderRequest?</font></b>)";
// get the rest of the data
sb.safePrintf("<tr bgcolor=#%s>"
"<td>%"INT64"%s</td>"
"<td>%s</td>"
"</tr>\n",
LIGHT_BLUE,
(int64_t)spiderTimeMS,
note,
iptoa(firstIp));
// stop after 20
if ( ++count == 20 ) break;
}
// ...
if ( count )
sb.safePrintf("<tr bgcolor=#%s>"
"<td colspan=10>...</td></tr>\n",
LIGHT_BLUE);
// end the table
sb.safePrintf ( "</table>\n" );
sb.safePrintf ( "<br>\n" );
// get the socket
TcpSocket *s = st->m_socket;
// then we can nuke the state
mdelete ( st , sizeof(State11) , "PageSpiderdb" );
delete (st);
// erase g_errno for sending
g_errno = 0;
// now encapsulate it in html head/tail and send it off
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),sb.length() );
}
///////////////////////////////////
//
// URLFILTERS
//
///////////////////////////////////
/*
// assign these a value of 1 in s_table hashtable
static char *s_ypSites[] = {
"www.yellow.com",
"www.yellowpages.com",
"www.dexknows.com",
"yellowpages.aol.com",
"www.superpages.com",
"citysearch.com",
"www.yellowbook.com",
"www.magicyellow.com",
"home.digitalcity.com",
"www.switchboard.com",
"cityguide.aol.com",
"www.bizrate.com",
"www.restaurantica.com",
"www.insiderpages.com",
"local.yahoo.com"
};
// . assign these a value of 2 in s_table hashtable
// . mwells@g0:/y$ cat gobyout | awk '{print $4}' | grep -v goby.com | grep -vi goby | grep -v google.com | grep -v mappoint | urlinfo | grep "host: " | awk '{print $2}' | sort | uniq > foo
// . then take the top linked to sites on goby and print out for direct
// insertion into this file:
// then get the popular domains from THAT list:
// mwells@g0:/y$ cat foo | awk '{print $2}' | urlinfo | grep "dom: " | awk '{print $2}' | sort | uniq -c | sort > foodom
static char *s_aggSites[] = {
"isuwmsrugby.tripod.com",
"meyerlemon.eventbrite.com",
"miami.tourcorp.com",
"valentinesdaydatenightcoupleschi.eventbrite.com",
"volcano.si.edu",
"webpages.csus.edu",
"weddingextravaganza.eventbrite.com",
"www.alliancerugby.org",
"www.asuwrfc.com",
"www.btpd.org",
"www.chicagodragons.org",
"www.chsgeorgia.org",
"www.derugbyfoundation.org",
"www.foxborosportscenter.com",
"www.lynn.edu",
"www.owensboroparks.org",
"www.scitrek.org",
"www.southcarolinaparks.com",
"www.usbr.gov",
"dummil.eventbrite.com",
"jacksonvilleantiqueshow.eventbrite.com",
"kidsfest.eventbrite.com",
"piuvalentine.eventbrite.com",
"www.anytimefitness.com",
"www.dumbartonhouse.org",
"www.lsurugby.com",
"www.maliburugby.com",
"www.pitsrugby.com",
"www.renegaderugby.org",
"www.rotor.com",
"www.rugbyrats.com",
"www.sanjoserugby.com",
"www.seattleartists.com",
"www.sixflags.com",
"www.vacavillesports.com",
"atlcomedyfest.eventbrite.com",
"easyweekdaycooking.eventbrite.com",
"hartford.citysearch.com",
"healthythaicooking.eventbrite.com",
"hicaregiversconference.eventbrite.com",
"skiing.alpinezone.com",
"spirit.lib.uconn.edu",
"springfield.ettractions.com",
"tomatofest2011.eventbrite.com",
"www.abc-of-meditation.com",
"www.amf.com",
"www.atlantaharlequins.com",
"www.chicagoparkdistrict.com",
"www.denverwildfirerfc.org",
"www.gowaterfalling.com",
"www.harlequins.org",
"www.ignatius.org",
"www.masmacon.com",
"www.palmbeachrugby.org",
"www.riversiderugby.com",
"www.rmne.org",
"www.thehilliard.org",
"www.woodsmenrugby.com",
"devildoll.eventbrite.com",
"iexpectcrabfeedfundraiser.eventbrite.com",
"sports.groups.yahoo.com",
"valentinesdaycookingwithlove.eventbrite.com",
"www.agisamazing.com",
"www.ascendinglotus.com",
"www.auduboninstitute.org",
"www.azrugbyref.com",
"www.blackicerugby.com",
"www.bluegrassmuseum.org",
"www.krewerugby.com",
"www.lamorugby.com",
"www.lsue.edu",
"www.norwichrink.com",
"www.ombac.org",
"www.sdarmada.org",
"www.sirensrugby.com",
"www.tampabarbarians.org",
"www.travellanecounty.org",
"www.visit-newhampshire.com",
"hawaii.tourcorp.com",
"tasteofkorea.eventbrite.com",
"www.ballyfitness.com",
"www.calpolyrugby.com",
"www.destateparks.com",
"www.eaa.org",
"www.goldsgym.com",
"www.gonzagarugby.com",
"www.greatexplorations.org",
"www.heparks.org",
"www.imagisphere.org",
"www.jeffdavis.org",
"www.park.granitecity.com",
"www.poets.org",
"www.regis.edu",
"www.verizoncenter.com",
"mybridalsale.eventbrite.com",
"pigandsausagetoo.eventbrite.com",
"www.gaelrugby.com",
"www.independent.com",
"www.kohlchildrensmuseum.org",
"www.operaamerica.org",
"www.recration.du.edu",
"www.symmetricalskatingschool.org",
"www.telcomhistory.org",
"www.texasoutside.com",
"reagan.eureka.edu",
"stampede2011.eventbrite.com",
"synergy2011.eventbrite.com",
"theexperience2011.eventbrite.com",
"www.24hourfitness.com",
"www.dematha.org",
"www.facebook.com",
"www.iaapa.org",
"www.icelandrestoration.com",
"www.louisvillewomensrugby.com",
"www.manchesterrunningcompany.com",
"www.moaonline.org",
"www.pvicechalet.com",
"www.rendlake.com",
"attinuptown.eventbrite.com",
"chocolateanddessertfantasy.eventbrite.com",
"colorado.ettractions.com",
"longbeachstaterugby.webs.com",
"volcano.oregonstate.edu",
"www.columbiaspacescience.org",
"www.eventful.com",
"eventful.com",
"www.newmexico.org",
"www.rmparks.org",
"www.sbyouthrugby.org",
"www.venturacountyrugbyclub.com",
"www.wheatonicearena.com",
"faithorigins.eventbrite.com",
"jerseyshore.metromix.com",
"stlouis.citysearch.com",
"valentinesdaydatenightcooking.eventbrite.com",
"www.floridarugbyunion.com",
"www.rugbyatucf.com",
"www.stingrayrugby.com",
"www.usfbullsrugby.com",
"atlanta.going.com",
"klsnzwineday.eventbrite.com",
"losangeles.citysearch.com",
"sourdough.eventbrite.com",
"valentinesdaygourmetdating.eventbrite.com",
"web.mit.edu",
"www.airmuseum.org",
"www.eparugby.org",
"www.navicache.com",
"www.siliconvalleyrugby.org",
"www.yale.edu",
"rhodeisland.ettractions.com",
"studentorgs.vanderbilt.edu",
"www.jaxrugby.org",
"www.orlandomagazine.com",
"www.plnurugby.com",
"www.recreation.du.edu",
"www.riversideraptors.com",
"www.usarchery.org",
"cacspringfling.eventbrite.com",
"dallas.going.com",
"groups.northwestern.edu",
"hpualumniiphonelaunchparty.eventbrite.com",
"juliachild.eventbrite.com",
"southbaysciencesymposium2011.eventbrite.com",
"www.curugby.com",
"www.everyoneruns.net",
"www.glendalerugby.com",
"www.phantomsyouthrugby.org",
"www.usdrugby.com",
"10000expo-sponsoship-nec.eventbrite.com",
"greenville.metromix.com",
"spssan.eventbrite.com",
"www.cmaathletics.org",
"www.csulb.edu",
"www.doralrugby.com",
"www.neworleansrugbyclub.com",
"www.sos.louisiana.gov",
"www.southbayrugby.org",
"www.travelnevada.com",
"www.uicrugbyclub.org",
"www.atlantabucksrugby.org",
"www.dinodatabase.com",
"www.fest21.com",
"www.georgiatechrugby.com",
"www.gsuwomensrugby.com",
"www.siuwomensrugby.com",
"www.snowtracks.com",
"www.trainweb.com",
"www.visitnebraska.gov",
"www.visitsanantonio.com",
"hometown.aol.com",
"next2normal.eventbrite.com",
"sixmonthpassatlanta2011.eventbrite.com",
"winejazz2.eventbrite.com",
"www.amityrugby.org",
"www.meetandplay.com",
"www.miami.edu",
"www.miamirugby.com",
"www.phillipscollection.org",
"www.tridentsrugby.com",
"wwwbloggybootcampsandiego.eventbrite.com",
"whale-watching.gordonsguide.com",
"www.culturemob.com",
"www.denver-rugby.com",
"www.hillwoodmuseum.org",
"www.peabody.yale.edu",
"www.yoursciencecenter.com",
"newyorkcity.ettractions.com",
"rawfoodcert.eventbrite.com",
"www.discoverydepot.org",
"www.dukecityrugbyclub.com",
"www.jazztimes.com",
"www.kissimmeeairmuseum.com",
"www.southstreetseaportmuseum.org",
"www.wsbarbariansrugby.com",
"beerunch2011.eventbrite.com",
"milwaukee.ettractions.com",
"seminoletampa.casinocity.com",
"silveroak.eventbrite.com",
"tsunamifitclub.eventbrite.com",
"walking-tours.gordonsguide.com",
"www.alamedarugby.com",
"www.atshelicopters.com",
"www.camelbackrugby.com",
"www.dlshs.org",
"www.eteamz.com",
"newyork.ettractions.com",
"www.allaboutrivers.com",
"www.childrensmuseumatl.org",
"www.hartfordroses.org",
"www.nationalparks.org",
"www.seahawkyouthrugby.com",
"www.skiingthebackcountry.com",
"epcontinental.eventbrite.com",
"healthandwellnessshow.eventbrite.com",
"www.apopkamuseum.org",
"www.condorsrugby.com",
"www.dcr.virginia.gov",
"www.diabloyouthrugby.org",
"www.rockandice.com",
"honolulu.metromix.com",
"mowcrabfeed2011.eventbrite.com",
"ptt-superbowl.eventbrite.com",
"whitewater-rafting.gordonsguide.com",
"winearomatraining.eventbrite.com",
"www.broadway.com",
"www.usc.edu",
"www.gatorrugby.com",
"www.iumudsharks.net",
"www.scrrs.net",
"www.sfggrugby.com",
"www.unco.edu",
"hctmspring2011conference.eventbrite.com",
"sandiego.going.com",
"www.crt.state.la.us",
"www.foodhistorynews.com",
"www.lancerrugbyclub.org",
"www.littlerockrugby.com",
"www.sharksrugbyclub.com",
"www.channelislandsice.com",
"www.idealist.org",
"www.mbtykesrugby.com",
"katahdicon.eventbrite.com",
"foodwineloversfestival.eventbrite.com",
"maristeveningseries2011.eventbrite.com",
"philadelphia.ettractions.com",
"sugarrushla.eventbrite.com",
"www.chicagolions.com",
"www.skatingsafe.com",
"www.themeparkinsider.com",
"fremdcraftfairspring2011.eventbrite.com",
"gorptravel.away.com",
"minnesota.ettractions.com",
"www.chicagohopeacademy.org",
"www.fmcicesports.com",
"www.kitebeaches.com",
"www.mixedmartialarts.com",
"www.slatermill.org",
"www.sunnysideoflouisville.org",
"www.visitrochester.com",
"careshow.eventbrite.com",
"massachusetts.ettractions.com",
"edwardianla2011.eventbrite.com",
"indianapolis.metromix.com",
"www.pasadenamarathon.org",
"washington.going.com",
"www.sjquiltmuseum.org",
"www.wannakitesurf.com",
"fauwomensrugby.sports.officelive.com",
"newhampshire.ettractions.com",
"www.vcmha.org",
"milwaukee.going.com",
"phoenix.going.com",
"www.anrdoezrs.net",
"www.temperugby.com",
"pampermefabulous2011.eventbrite.com",
"www.napavalleyvineyards.org",
"r4k11.eventbrite.com",
"ramonamusicfest.eventbrite.com",
"www.abc-of-rockclimbing.com",
"www.geocities.com",
"jackson.metromix.com",
"www.santamonicarugby.com",
"cleveland.metromix.com",
"lancaster.ettractions.com",
"www.fortnet.org",
"www.horseandtravel.com",
"www.pubcrawler.com",
"kdwp.state.ks.us",
"www.berkeleyallblues.com",
"www.liferugby.com",
"www.socalmedicalmuseum.org",
"www.dcsm.org",
"www.sutler.net",
"desmoines.metromix.com",
"www.cavern.com",
"www.dotoledo.org",
"www.fws.gov",
"www.ghosttowngallery.com",
"www.museumamericas.org",
"www.museumsofboston.org",
"www.northshorerugby.com",
"geocaching.gpsgames.org",
"www.americaeast.com",
"www.cwrfc.org",
"www.jewelryshowguide.com",
"www.livelytimes.com",
"www.pascorugbyclub.com",
"www.westminsterice.com",
"www.claremontrugby.org",
"www.jugglingdb.com",
"www.metalblade.com",
"www.preservationnation.org",
"sofla2011.eventbrite.com",
"www.belmonticeland.com",
"www.dropzone.com",
"www.smecc.org",
"www.studentgroups.ucla.edu",
"www.visitdetroit.com",
"honolulu.going.com",
"sippingandsaving5.eventbrite.com",
"www.connecticutsar.org",
"www.guestranches.com",
"www.nvtrailmaps.com",
"www.visitnh.gov",
"illinois.ettractions.com",
"www.spymuseum.org",
"www.ci.riverside.ca.us",
"www.hbnews.us",
"www.santaclarayouthrugby.com",
"www.thestranger.com",
"www.freewebs.com",
"www.miamirugbykids.com",
"www.mtwashingtonvalley.org",
"www.ocbucksrugby.com",
"bridalpaloozala.eventbrite.com",
"maps.yahoo.com",
"www.azstateparks.com",
"www.paywindowpro.com",
"www.rowadventures.com",
"parksandrecreation.idaho.gov",
"www.artsmemphis.org",
"www.lasvegasweekly.com",
"www.redmountainrugby.org",
"san-francisco.tourcorp.com",
"www.khsice.com",
"www.vansenusauto.com",
"quinceanerasmagazineoc.eventbrite.com",
"www.mvc-sports.com",
"www.tbsa.com",
"www.travelportland.com",
"rtnpilgrim.eventbrite.com",
"www.bigfishtackle.com",
"www.centralmass.org",
"cpca2011.eventbrite.com",
"www.matadorrecords.com",
"www.sebabluegrass.org",
"prescott.showup.com",
"vintagevoltage2011.eventbrite.com",
"www.seattleperforms.com",
"www.valleyskating.com",
"resetbootcamp.eventbrite.com",
"www.abc-of-mountaineering.com",
"www.snocountry.com",
"events.nytimes.com",
"www.icecenter.net",
"www.livefrommemphis.com",
"www.pasadenarfc.com",
"www.ucsdrugby.com",
"uclaccim.eventbrite.com",
"www.visitchesapeake.com",
"www.natureali.org",
"www.nordicskiracer.com",
"www.nowplayingva.org",
"www.sbcounty.gov",
"www.seedesmoines.com",
"www.world-waterfalls.com",
"denver.going.com",
"hearstmuseum.berkeley.edu",
"www.lmurugby.com",
"www.ftlrugby.com",
"www.pelicanrugby.com",
"rtnharthighschool.eventbrite.com",
"www.visitri.com",
"www.aba.org",
"www.americaonice.us",
"www.thecontemporary.org",
"www.wherigo.com",
"www.drtopo.com",
"www.visitseattle.org",
"calendar.dancemedia.com",
"trips.outdoors.org",
"www.chs.org",
"www.myneworleans.com",
"www.oaklandice.com",
"nashville.metromix.com",
"www.americangolf.com",
"www.fossilmuseum.net",
"www.oakparkparks.com",
"www.visit-maine.com",
"www.oregonlive.com",
"www.allwashingtondctours.com",
"www.wannadive.net",
"www.sportsheritage.org",
"hudsonvalley.metromix.com",
"www.scificonventions.com",
"www.wildernessvolunteers.org",
"essencemusicfestival.eventbrite.com",
"www.kitesurfatlas.com",
"www.ndtourism.com",
"valentinesgourmetdatingchicago.eventbrite.com",
"www.fingerlakeswinecountry.com",
"www.dmnh.org",
"www.ticketnetwork.com",
"partystroll.eventbrite.com",
"www.bedandbreakfastnetwork.com",
"www.sternmass.org",
"www.visitnh.com",
"www.places2ride.com",
"www.hawaiieventsonline.com",
"www.ucirugby.com",
"www.gohawaii.com",
"www.writersforum.org",
"www.roadracingworld.com",
"www.bigisland.org",
"www.boatbookings.com",
"www.lhs.berkeley.edu",
"www.dnr.state.mn.us",
"www.mostateparks.com",
"www.historicnewengland.org",
"www.waza.org",
"www.backbayrfc.com",
"newyork.metromix.com",
"www.larebellion.org",
"teetimes.golfhub.com",
"10000expo-sponsoship-ceg.eventbrite.com",
"10000expo-sponsor-bjm.eventbrite.com",
"parks.ky.gov",
"www.bostonusa.com",
"www.visitbuffaloniagara.com",
"www.sharksice.com",
"2011burbankapprentice.eventbrite.com",
"kansascity.ettractions.com",
"www.bicycling.com",
"www.cityofchino.org",
"www.ridingworld.com",
"www.whittierrugby.com",
"10000bestjobsam.eventbrite.com",
"www.adventurecentral.com",
"www.earlymusic.org",
"www.upcomingevents.com",
"www.sleddogcentral.com",
"www.capecodkidz.com",
"www.collectorsguide.com",
"www.cougarrugby.org",
"www.sfvrugby.com",
"strivetothrivepabcconf.eventbrite.com",
"www.visithoustontexas.com",
"www.authorstrack.com",
"www.aboutgolfschools.org",
"www.huntingspotz.com",
"www.lib.az.us",
"members.aol.com",
"www.fs.fed.us",
"www.ncarts.org",
"www.vermonttravelplanner.org",
//"www.scubadiving.com",
"www.waterfallsnorthwest.com",
"www.philadelphiausa.travel",
"www.usgolfschoolguide.com",
"njgin.state.nj.us",
"www.artcards.cc",
"www.rimonthly.com",
"www.atlanta.net",
"www.glacialgardens.com",
"2011superbowlcruise.eventbrite.com",
"swimming-with-dolphins.gordonsguide.com",
"www.trackpedia.com",
// why was this in there?
//"www.dailyherald.com",
"www.nhm.org",
"boston.ettractions.com",
"www.geneseefun.com",
"www.travelsd.com",
"www.golfbuzz.com",
"www.in.gov",
"cincinnati.metromix.com",
"www.sanjose.com",
"brevard.metromix.com",
"www.dogsledrides.com",
"www.orvis.com",
"philadelphia.going.com",
"twincities.metromix.com",
"www.orlandorugby.com",
"www.csufrugby.com",
"www.larugby.com",
"www.washingtonwine.org",
"calendar.gardenweb.com",
"gulfcoast.metromix.com",
"florida.ettractions.com",
"www.northeastwaterfalls.com",
"www.computerhistory.org",
"www.ct.gov",
"www.hosteltraveler.com",
"www.thinkrentals.com",
"www.4x4trailhunters.com",
"www.cityweekly.net",
"www.yourrunning.com",
"www.spasofamerica.com",
"www.indoorclimbing.com",
"www.utah.com",
"boston.going.com",
"minneapolisstpaul.ettractions.com",
"www.coolrunning.com",
"www.greensboronc.org",
"www.michigan.org",
"www.artfestival.com",
"www.divespots.com",
"www.oregonstateparks.org",
"www.virginiawine.org",
"www.morebeach.com",
"www.minnesotamonthly.com",
"www.texasescapes.com",
"www.usatf.org",
"www.findrentals.com",
"www.hachettebookgroup.com",
"www.racesonline.com",
"www.usace.army.mil",
"web.georgia.org",
"detroit.metromix.com",
"www.homebrewersassociation.org",
"www.baltimore.org",
"www.gastateparks.org",
"www.arkansasstateparks.com",
"www.visitlasvegas.com",
"www.whenwerv.com",
"www.chilicookoff.com",
"www.bikeride.com",
"www.eaglerockrugby.com",
"www.pickwickgardens.com",
"flagstaff.showup.com",
"miami.going.com",
"www.anchorage.net",
"www.wlra.us",
"www.thetrustees.org",
"www.artnet.com",
"www.mthoodterritory.com",
"www.hihostels.com",
"www.bfa.net",
"167.102.232.26",
"www.flyins.com",
"www.stepintohistory.com",
"www.festing.com",
"www.pursuetheoutdoors.com",
"newyork.going.com",
"www.fishingguidenetwork.com",
"www.visit-massachusetts.com",
"www.visitindy.com",
"www.washingtonpost.com",
"www.greatamericandays.com",
"www.washingtonian.com",
"national.citysearch.com",
"www.infohub.com",
"www.productionhub.com",
"www.events.org",
"www.traveliowa.com",
"www.findmyadventure.com",
"delaware.metromix.com",
"www.marinmagazine.com",
"us.penguingroup.com",
"www.bicycletour.com",
"www.travelok.com",
"www.scububble.com",
"www.childrensmuseums.org",
"www.conventionscene.com",
"www.scubaspots.com",
"www.tnvacation.com",
"stlouis.ettractions.com",
"www.mxparks.com",
"florida.greatestdivesites.com",
"www.nowplayingaustin.com",
"www.skinnyski.com",
"www.sportoften.com",
"www.zvents.com",
"www.visitphoenix.com",
"palmsprings.metromix.com",
"upcoming.yahoo.com",
"www.washington.org",
"www.balloonridesacrossamerica.com",
"www.playbill.com",
"palmbeach.ettractions.com",
"louisville.metromix.com",
"www.animecons.com",
"www.findanartshow.com",
"www.usef.org",
"www.villagevoice.com",
"www.discovergold.org",
"www.georgiaoffroad.com",
"www.memphistravel.com",
"dc.metromix.com",
"www.aplf-planetariums.info",
"www.skateisi.com",
"www.usacycling.org",
"www.wine-compass.com",
"www.visitdelaware.com",
"tucson.metromix.com",
"www.happycow.net",
"www.indiecraftshows.com",
"www.gethep.net",
"www.agritourismworld.com",
"stlouis.metromix.com",
"phoenix.metromix.com",
"stream-flow.allaboutrivers.com",
"www.festivalsandevents.com",
"www.winemcgee.com",
"www.aurcade.com",
"www.visitjacksonville.com",
"www.nashvillescene.com",
"www.4x4trails.net",
"www.americancraftmag.org",
"blog.danceruniverse.com",
"www.vacationrealty.com",
"www.californiasciencecenter.org",
"www.rollerhome.com",
"www.atvsource.com",
"www.hotairballooning.com",
"www.freeskateparks.com",
"www.ruralbounty.com",
"connecticut.ettractions.com",
"www.localattractions.com",
"www.skategroove.com",
"www.hawaiitours.com",
"www.visitrhodeisland.com",
"www.swac.org",
"www.swimmingholes.org",
"www.roadfood.com",
"www.gotriadscene.com",
"www.runnersworld.com",
"www.outerquest.com",
"www.seattleweekly.com",
"www.onlyinsanfrancisco.com",
"www.bikereg.com",
"www.artslant.com",
"www.louisianatravel.com",
"www.operabase.com",
"www.stepintoplaces.com",
"www.vinarium-usa.com",
"www.visitconnecticut.com",
"www.abc-of-mountainbiking.com",
"www.wannask8.com",
"www.xcski.org",
"www.active-days.org",
"www.hawaiiactivities.com",
"www.massvacation.com",
"www.uspa.org",
"miami.ettractions.com",
"www.abc-of-hiking.com",
"www.bestofneworleans.com",
"www.phillyfunguide.com",
"www.beermonthclub.com",
"www.newenglandwaterfalls.com",
"www.lake-link.com",
"www.festivalfinder.com",
"www.visitmississippi.org",
"www.lanierbb.com",
"www.thepmga.com",
"www.skitown.com",
"www.fairsandfestivals.net",
"sanfrancisco.going.com",
"www.koa.com",
"www.wildlifeviewingareas.com",
"www.boatrenting.com",
"www.nowplayingutah.com",
"www.ultimaterollercoaster.com",
"www.findacraftfair.com",
"www.ababmx.com",
"www.abc-of-skiing.com",
"www.pw.org",
"tampabay.metromix.com",
"www.onthesnow.com",
"www.sunny.org",
"www.visitnewengland.com",
"atlanta.metromix.com",
"www.allaboutapples.com",
"www.monsterjam.com",
"www.bnbfinder.com",
"www.sandiego.org",
"www.worldcasinodirectory.com",
"www.yoga.com",
"www.1-800-volunteer.org",
"www.visitkc.com",
"www.theskichannel.com",
"www.thephoenix.com",
"www.virginia.org",
"www.avclub.com",
"www.orlandoinfo.com",
"www.trustedtours.com",
"www.peakradar.com",
"web.minorleaguebaseball.com",
"www.artshound.com",
"www.daytonabeach.com",
"chicago.going.com",
"www.cetaceanwatching.com",
"www.citypages.com",
"www.nowplayingnashville.com",
"www.discoverlosangeles.com",
"www.ratebeer.com",
"www.harpercollins.com",
"www.seenewengland.com",
"www.visitmt.com",
"www.goldstar.com",
"www.caverbob.com",
"www.sanjose.org",
"www.backcountrysecrets.com",
"authors.simonandschuster.com",
"rafting.allaboutrivers.com",
"chicago.ettractions.com",
"iweb.aam-us.org",
"www.theputtingpenguin.com",
"www.festivals.com",
"www.artsboston.org",
"www.aboutskischools.com",
"tucson.showup.com",
"www.thiswaytothe.net",
"www.rei.com",
"www.magicseaweed.com",
"www.waterfallswest.com",
"fortlauderdale.ettractions.com",
"www.foodreference.com",
"www.californiawineryadvisor.com",
"www.teamap.com",
"www.neworleanscvb.com",
"www.skatetheory.com",
"www.visitmaine.com",
"www.rollerskating.org",
"www.culturecapital.com",
"www.delawarescene.com",
"www.nyc-arts.org",
"www.huntingoutfitters.net",
"www.showcaves.com",
"www.soccerbars.com",
"www.visitnewportbeach.com",
"www.beerme.com",
"www.pitch.com",
"www.museum.com",
"www.hauntworld.com",
"www.forestcamping.com",
"www.dogpark.com",
"www.critterplaces.com",
"www.visitnj.org",
"www.findagrave.com",
"www.arcadefly.com",
"www.winerybound.com",
"www.usms.org",
"www.zipscene.com",
"www.horsetraildirectory.com",
"www.coaster-net.com",
"www.anaheimoc.org",
"www.visitpa.com",
"www.antiquetrader.com",
"www.dallasobserver.com",
"www.eventsetter.com",
"www.goingoutside.com",
"www.sightseeingworld.com",
"www.artlog.com",
"www.bnbstar.com",
"www.hostels.com",
"www.theartnewspaper.com",
"consumer.discoverohio.com",
"www.nssio.org",
"www.wingshootingusa.org",
"www.shootata.com",
"www.randomhouse.com",
"www.artforum.com",
"www.bachtrack.com",
"www.wayspa.com",
"www.visitidaho.org",
"www.exploreminnesota.com",
"chicago.metromix.com",
"www.worldgolf.com",
"nysparks.state.ny.us",
"www.meetup.com",
"www.skateboardparks.com",
"www.downtownjacksonville.org",
"www.lighthousefriends.com",
"www.strikespots.com",
"ww2.americancanoe.org",
"www.inlandarts.com",
"www.horseshowcentral.com",
"www.ridingresource.com",
"www.experiencewa.com",
"database.thrillnetwork.com",
"denver.metromix.com",
"www.bostoncentral.com",
"www.segwayguidedtours.com",
"www.colorado.com",
"www.artandseek.org",
"www.floridastateparks.org",
"www.sparkoc.com",
"losangeles.going.com",
"www.motorcycleevents.com",
"www.destination-store.com",
"www.scubadviser.com",
"www.booktour.com",
"www.cloud9living.com",
"www.allaboutjazz.com",
"www.sacramento365.com",
"www.discoversouthcarolina.com",
"www.riverfronttimes.com",
"www.hauntedhouses.com",
"www.arenamaps.com",
"www.artsnwct.org",
"www.eventbrite.com",
"animal.discovery.com",
"www.eatfeats.com",
"www.1001seafoods.com",
"www.malletin.com",
"www.yelp.com",
"www.wannasurf.com",
"www.clubplanet.com",
"www.dupagecvb.com",
"www.smartdestinations.com",
"www.artfaircalendar.com",
"www.excitations.com",
"www.balloonrideus.com",
"www.extravagift.com",
"www.skisite.com",
"www.orlandoweekly.com",
"www.iloveny.com",
"www.sandiegoreader.com",
"web.usarugby.org",
"www.artscalendar.com",
"www.sfweekly.com",
"store-locator.barnesandnoble.com",
"www.realhaunts.com",
"trails.mtbr.com",
"www.bbonline.com",
"www.pickyourownchristmastree.org",
"events.myspace.com",
"www.alabama.travel",
"www.ctvisit.com",
"freepages.history.rootsweb.com",
"www.waterparks.com",
"www.flavorpill.com",
"www.marinasdirectory.org",
"www.publicgardens.org",
"www.alwaysonvacation.com",
"www.infosports.com",
"www.summitpost.org",
"www.exploregeorgia.org",
"www.brewerysearch.com",
"www.phoenixnewtimes.com",
"www.marinas.com",
"www.arestravel.com",
"www.gamebirdhunts.com",
"www.cbssports.com",
"tutsan.forest.net",
"www.azcentral.com",
"www.tennispulse.org",
"www.westword.com",
"www.factorytoursusa.com",
"www.americanwhitewater.org",
"www.spamagazine.com",
"www.dogparkusa.com",
"tps.cr.nps.gov",
"www.sfstation.com",
"www.abc-of-yoga.com",
"www.worldeventsguide.com",
"www.active.com",
"www.beerexpedition.com",
"www.iloveinns.com",
"www.warpig.com",
"www.artsopolis.com",
"www.skatepark.com",
"www.offroadnorthamerica.com",
"www.visitflorida.com",
"www.last.fm",
"www.pbplanet.com",
"www.traveltex.com",
"phoenix.showup.com",
"www.travelandleisure.com",
"www.kentuckytourism.com",
"www.gospelgigs.com",
"www.whenwegetthere.com",
"www.surfline.com",
"www.stubhub.com",
"www.centerstagechicago.com",
"www.sunshineartist.com",
"www.reserveamerica.com",
"www.clubzone.com",
"www.paddling.net",
"www.xperiencedays.com",
"www.razorgator.com",
"www.dalejtravis.com",
"www.pickyourown.org",
"www.localhikes.com",
"www.parks.ca.gov",
"www.casinocity.com",
"www.nofouls.com",
"www.laweekly.com",
"www.denver.org",
"www.enjoyillinois.com",
"www.livenation.com",
"www.viator.com",
"members.bikeleague.org",
"www.skatespotter.com",
"family.go.com",
"www.myspace.com",
"www.takemefishing.org",
"www.localwineevents.com",
"www.rinkdirectory.com",
"www.walkjogrun.net",
"www.nps.gov",
"www.ghosttowns.com",
"www.theatermania.com",
"www.skateboardpark.com",
"www.miaminewtimes.com",
"www.explorechicago.org",
"www.ocweekly.com",
"www.ustasearch.com",
"www.rateclubs.com",
"www.tennismetro.com",
"www.motorcyclemonster.com",
"www.hauntedhouse.com",
"www.pumpkinpatchesandmore.org",
"www.courtsoftheworld.com",
"www.ecoanimal.com",
"www.yogafinder.com",
"www.traillink.com",
"www.equinenow.com",
"www.jambase.com",
"www.spaemergency.com",
//"www.vacationhomerentals.com",
"www.ava.org",
"affiliate.isango.com",
"www.museumland.net",
"www.dirtworld.com",
"www.rockclimbing.com",
"www.kijubi.com",
"www.outdoortrips.info",
"www.visitcalifornia.com",
"www.heritagesites.com",
"www.bedandbreakfast.com",
"www.discoveramerica.com",
"www.singletracks.com",
"www.museumstuff.com",
"www.opentable.com",
"www.homeaway.com",
"www.thegolfcourses.net",
"www.golflink.com",
"www.trekaroo.com",
"gocitykids.parentsconnect.com",
"www.wildernet.com",
"www.10best.com",
"swim.isport.com",
"www.wheretoshoot.org",
"www.hostelworld.com",
"www.landbigfish.com",
"www.recreation.gov",
"www.healthclubdirectory.com",
"www.spafinder.com",
"www.nationalregisterofhistoricplaces.com",
"www.americantowns.com",
"www.hmdb.org",
"www.golfnow.com",
"www.grandparents.com",
"www.swimmersguide.com",
"www.luxergy.com",
"activities.wildernet.com",
"events.mapchannels.com",
"www.museumsusa.org",
"www.rinktime.com",
"www.rentandorbuy.com",
"www.mytravelguide.com",
"playspacefinder.kaboom.org",
"www.famplosion.com",
"www.eviesays.com",
"www.anglerweb.com",
"www.trails.com",
"www.waymarking.com",
"www.priceline.com",
"local.yahoo.com",
"ticketmaster.com",
// rss feeds
"trumba.com",
// movie times:
"cinemark.com",
// domains (hand selected from above list filtered with urlinfo)
"patch.com",
"gordonsguide.com",
"tourcorp.com",
"americangolf.com",
"casinocity.com",
"going.com",
"metromix.com",
"ettractions.com",
"citysearch.com",
"eventbrite.com"
};
*/
/*
static HashTableX s_table;
static bool s_init = false;
static char s_buf[25000];
static int32_t s_craigsList;
bool initAggregatorTable ( ) {
// this hashtable is used for "isyellowpages" and "iseventaggregator"
if ( s_init ) return true;
// use niceness 0
s_table.set(4,1,4096,s_buf,25000,false,0,"spsitetbl");
// now stock it with yellow pages sites
int32_t n = (int32_t)sizeof(s_ypSites)/ sizeof(char *);
for ( int32_t i = 0 ; i < n ; i++ ) {
char *s = s_ypSites[i];
int32_t slen = gbstrlen ( s );
int32_t h32 = hash32 ( s , slen );
char val = 1;
if ( ! s_table.addKey(&h32,&val)) {char*xx=NULL;*xx=0;}
}
// then stock with event aggregator sites
n = (int32_t)sizeof(s_aggSites)/ sizeof(char *);
for ( int32_t i = 0 ; i < n ; i++ ) {
char *s = s_aggSites[i];
int32_t slen = gbstrlen ( s );
int32_t h32 = hash32 ( s , slen );
char val = 2;
if ( ! s_table.addKey(&h32,&val)) {char*xx=NULL;*xx=0;}
}
// do not repeat this
s_init = true;
s_craigsList = hash32n("craigslist.org");
return true;
}
bool isAggregator ( int32_t siteHash32,int32_t domHash32,char *url,int32_t urlLen ) {
// make sure its stocked
initAggregatorTable();
// is site a hit?
char *v = (char *)s_table.getValue ( &siteHash32 );
// hit?
if ( v && *v ) return true;
// try domain?
v = (char *)s_table.getValue ( &domHash32 );
// hit?
if ( v && *v ) return true;
// these guys mirror eventful.com's db so let's grab it...
// abcd.com
if ( urlLen>30 &&
url[11]=='t' &&
url[18]=='o' &&
strncmp(url,"http://www.thingstodoin",23) == 0 )
return true;
// craigslist
if ( domHash32 == s_craigsList && strstr(url,".com/cal/") )
return true;
// otherwise, no
return false;
}
*/
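// comparison operators for the numeric url-filter constraints evaluated
// below, parsed from "==", "!=", ">", "<", ">=" and "<=" in the rule text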
#define SIGN_EQ 1
#define SIGN_NE 2
#define SIGN_GT 3
#define SIGN_LT 4
#define SIGN_GE 5
#define SIGN_LE 6
// from PageBasic.cpp
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq, char *tag);
// . this is called by SpiderCache.cpp for every url it scans in spiderdb
// . we must skip certain rules in getUrlFilterNum() when doing so for Msg20
// because things like "parentIsRSS" can be both true or false since a url
// can have multiple spider recs associated with it!
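// . returns the index of the first matching row in the url filters table,
//   or -1 if no row matches (the caller should then fall back to a default)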
int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
SpiderReply *srep ,
int32_t nowGlobal ,
bool isForMsg20 ,
int32_t niceness ,
CollectionRec *cr ,
bool isOutlink ,
HashTableX *quotaTable ,
int32_t langIdArg ) {
if ( ! sreq ) {
log("spider: sreq is NULL!");
}
int32_t langId = langIdArg;
if ( srep ) langId = srep->m_langId;
// convert lang to string
char *lang = NULL;
int32_t langLen = 0;
if ( langId >= 0 ) { // if ( srep ) {
// this is NULL on corruption
lang = getLanguageAbbr ( langId );//srep->m_langId );
if ( lang ) langLen = gbstrlen(lang);
}
// . get parent language in the request
// . primary language of the parent page that linked to this url
char *plang = NULL;
int32_t plangLen = 0;
plang = getLanguageAbbr(sreq->m_parentLangId);
if ( plang ) plangLen = gbstrlen(plang);
char *tld = (char *)-1;
int32_t tldLen;
int32_t urlLen = sreq->getUrlLen();
char *url = sreq->m_url;
char *row;
bool checkedRow = false;
//SpiderColl *sc = cr->m_spiderColl;
SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);
if ( ! quotaTable ) quotaTable = &sc->m_localTable;
//if ( strstr(url,"http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2" ))
// log("hey");
//initAggregatorTable();
//int32_t tldlen2;
//char *tld2 = getTLDFast ( sreq->m_url , &tldlen2);
//bool bad = true;
//if ( tld2[0] == 'c' && tld2[1] == 'o' && tld2[2]=='m' ) bad = false;
//if ( tld2[0] == 'o' && tld2[1] == 'r' && tld2[2]=='g' ) bad = false;
//if ( tld2[0] == 'u' && tld2[1] == 's' ) bad = false;
//if ( tld2[0] == 'g' && tld2[1] == 'o' && tld2[2]=='v' ) bad = false;
//if ( tld2[0] == 'e' && tld2[1] == 'd' && tld2[2]=='u' ) bad = false;
//if ( tld2[0] == 'i' && tld2[1] == 'n' && tld2[2]=='f' ) bad = false;
//if ( bad )
// log("hey");
// shortcut
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( ucp && ! ucp[0] ) ucp = NULL;
// get the compiled regular expressions
regex_t *ucr = &cr->m_ucr;
regex_t *upr = &cr->m_upr;
if ( ! cr->m_hasucr ) ucr = NULL;
if ( ! cr->m_hasupr ) upr = NULL;
char *ext;
//char *special;
// CONSIDER COMPILING FOR SPEED:
// 1) each command can be combined into a bitmask on the spiderRequest
// bits, or an access to m_siteNumInlinks, or a substring match
// 2) put all the strings we got into the list of Needles
// 3) then generate the list of needles the SpiderRequest/url matches
// 4) then reduce each line to a list of needles to have, a
// min/max/equal siteNumInlinks, min/max/equal hopCount,
// and a bitMask to match the bit flags in the SpiderRequest
// stop at first regular expression it matches
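// some illustrative rule strings this loop can evaluate (the real rules
// come from the collection's url filters page, so these are only assumed
// examples of typical usage):
//   isaddurl && hopcount<=1
//   lang==en,es && sitenuminlinks>=300
//   tld!=cn,ru && !isindexed
//   ismedia
//   default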
for ( int32_t i = 0 ; i < cr->m_numRegExs ; i++ ) {
// breathe
QUICKPOLL ( niceness );
// get the ith rule
SafeBuf *sb = &cr->m_regExs[i];
//char *p = cr->m_regExs[i];
char *p = sb->getBufStart();
checkNextRule:
// skip leading whitespace
while ( *p && isspace(*p) ) p++;
// do we have a leading '!'
bool val = 0;
if ( *p == '!' ) { val = 1; p++; }
// skip whitespace after the '!'
while ( *p && isspace(*p) ) p++;
// new rules for when to download (diffbot) page
if ( *p == 'm' &&
p[1]== 'a' &&
p[2]== 't' &&
p[3]== 'c' &&
p[4]== 'h' &&
p[5]== 'e' &&
p[6]== 's' &&
p[7]== 'u' &&
p[8]== 'c' &&
p[9]== 'p' ) {
// . skip this expression row if does not match
// . url must match one of the patterns in there.
// . inline this for speed
// . "ucp" is a ||-separated list of substrings
// . "ucr" is a regex
// . regexec returns 0 for a match
if ( ucr && regexec(ucr,url,0,NULL,0) &&
// seed or other manual addition always matches
! sreq->m_isAddUrl &&
! sreq->m_isPageReindex &&
! sreq->m_isInjecting )
continue;
// do not require a match on ucp if ucr is given
if ( ucp && ! ucr &&
! doesStringContainPattern(url,ucp) &&
// seed or other manual addition always matches
! sreq->m_isAddUrl &&
! sreq->m_isPageReindex &&
! sreq->m_isInjecting )
continue;
p += 10;
p = strstr(p,"&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// new rules for when to "process" (diffbot) page
if ( *p == 'm' &&
p[1]== 'a' &&
p[2]== 't' &&
p[3]== 'c' &&
p[4]== 'h' &&
p[5]== 'e' &&
p[6]== 's' &&
p[7]== 'u' &&
p[8]== 'p' &&
p[9]== 'p' ) {
// . skip this expression row if does not match
// . url must match one of the patterns in there.
// . inline this for speed
// . "upp" is a ||-separated list of substrings
// . "upr" is a regex
// . regexec returns 0 for a match
if ( upr && regexec(upr,url,0,NULL,0) )
continue;
if ( upp && !upr &&!doesStringContainPattern(url,upp))
continue;
p += 10;
p = strstr(p,"&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasauthorityinlink",18) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! sreq->m_hasAuthorityInlinkValid ) continue;
// if no match continue
if ( (bool)sreq->m_hasAuthorityInlink==val)continue;
// skip
p += 18;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hascontactinfo",14) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! sreq->m_hasContactInfoValid ) continue;
// if no match continue
if ( (bool)sreq->m_hasContactInfo==val ) continue;
// skip
p += 14;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasaddress",10) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! srep->m_hasAddressValid ) continue;
// if no match continue
if ( (bool)srep->m_hasAddress==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hastod",6) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! srep->m_hasTODValid ) continue;
// if no match continue
if ( (bool)srep->m_hasTOD==val ) continue;
// skip
p += 6;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// if we got a reply, we are not new!!
//if ( (bool)srep == (bool)val ) continue;
if ( (bool)(sreq->m_hadReply) == (bool)val ) continue;
// skip it for speed
p += 8;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// hastmperror: true if, while spidering, the last reply was an
// error like EDNSTIMEDOUT or ETCPTIMEDOUT, i.e. some kind of
// usually temporary condition that warrants a retry
if ( *p=='h' && strncmp(p,"hastmperror",11) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// get our error code
int32_t errCode = srep->m_errCode;
// . make it zero if not tmp error
// . now have EDOCUNCHANGED and EDOCNOGOODDATE from
// Msg13.cpp, so don't count those here...
if ( errCode != EDNSTIMEDOUT &&
errCode != ETCPTIMEDOUT &&
errCode != EDNSDEAD &&
// add this here too now because we had some
// seeds that failed one time and the crawl
// never repeated after that!
errCode != EBADIP &&
// assume diffbot is temporarily experiencing errs
// but the crawl, if recurring, should retry these
// at a later point
errCode != EDIFFBOTUNABLETOAPPLYRULES &&
errCode != EDIFFBOTCOULDNOTPARSE &&
errCode != EDIFFBOTCOULDNOTDOWNLOAD &&
errCode != EDIFFBOTINVALIDAPI &&
errCode != EDIFFBOTVERSIONREQ &&
errCode != EDIFFBOTURLPROCESSERROR &&
errCode != EDIFFBOTTOKENEXPIRED &&
errCode != EDIFFBOTUNKNOWNERROR &&
errCode != EDIFFBOTINTERNALERROR &&
// if diffbot received empty content when d'lding
errCode != EDIFFBOTEMPTYCONTENT &&
// or diffbot tcp timed out when d'lding the url
errCode != EDIFFBOTREQUESTTIMEDOUT &&
// if diffbot closed the socket on us...
errCode != EDIFFBOTMIMEERROR &&
// or the diffbot reply itself was not 200 (OK)
errCode != EDIFFBOTBADHTTPSTATUS &&
// out of memory while crawling?
errCode != ENOMEM &&
errCode != ENETUNREACH &&
errCode != EHOSTUNREACH )
errCode = 0;
// if no match continue
if ( (bool)errCode == val ) continue;
// skip
p += 11;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
/*
if ( *p=='h' && strncmp(p,"hassitevenue",12) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! sreq->m_hasSiteVenueValid ) continue;
// if no match continue
if ( (bool)sreq->m_hasSiteVenue==val ) continue;
// allow "!isindexed" if no SpiderReply at all
//if ( ! srep && val == 0 ) continue;
// skip
p += 12;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
*/
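// every check in this next section is for a keyword starting with 'i'
// (isinjected, isdocidbased, isindexed, iswww, ...), so jump past the
// whole section quickly if the first char is not 'i'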
if ( *p != 'i' ) goto skipi;
if ( strncmp(p,"isinjected",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_isInjecting==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"isdocidbased",12) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
//if ( (bool)sreq->m_urlIsDocId==val ) continue;
if ( (bool)sreq->m_isPageReindex==val ) continue;
// skip
p += 12;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"isreindex",9) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
//if ( (bool)sreq->m_urlIsDocId==val ) continue;
if ( (bool)sreq->m_isPageReindex==val ) continue;
// skip
p += 9;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"iscontacty",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid
if ( ! sreq->m_isContactyValid ) continue;
// if no match continue
if ( (bool)sreq->m_isContacty==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// is it in the big list of sites?
if ( strncmp(p,"insitelist",10) == 0 ) {
// skip for msg20
//if ( isForMsg20 ) continue;
// if there is no domain or url explicitly listed in the
// site list (only seeds) then assume the user is spidering
// the whole internet and we basically ignore "insitelist"
if ( sc->m_siteListIsEmpty &&
sc->m_siteListIsEmptyValid ) {
// use a dummy row match
row = (char *)1;
}
else if ( ! checkedRow ) {
// only do once for speed
checkedRow = true;
// this function is in PageBasic.cpp
row = getMatchingUrlPattern ( sc, sreq ,NULL);
}
// skip this rule if the url does not match the site list
// (or, with '!', if it does)
if ( (bool)row == val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// . was it submitted from PageAddUrl.cpp?
// . replaces the "add url priority" parm
if ( strncmp(p,"isaddurl",8) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if we are not submitted from the add url api, skip
if ( (bool)sreq->m_isAddUrl == val ) continue;
// skip
p += 8;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( p[0]=='i' && strncmp(p,"ismanualadd",11) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// . if we are not submitted from the add url api, skip
// . if we have '!' then val is 1
if ( sreq->m_isAddUrl ||
sreq->m_isInjecting ||
sreq->m_isPageReindex ||
sreq->m_isPageParser ) {
if ( val ) continue;
}
else {
if ( ! val ) continue;
}
// skip
p += 11;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// does it have an rss inlink? we want to expedite indexing
// of such pages, i.e. pages we gather from an rss feed that
// we got from a pingserver...
if ( strncmp(p,"isparentrss",11) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if we have no such inlink
if ( (bool)sreq->m_parentIsRSS == val ) continue;
// skip
p += 11;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"isparentsitemap",15) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_parentIsSiteMap == val) continue;
// skip
p += 15;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// isroot: is this url the root page of its subdomain, i.e.
// nothing follows the hostname (e.g. http://www.xyz.com/)?
if ( strncmp(p,"isroot",6) == 0 ) {
// skip for msg20
//if ( isForMsg20 ) continue;
// this is a docid only url, no actual url, so skip
if ( sreq->m_isPageReindex ) continue;
// a fast check
char *u = sreq->m_url;
// skip http
u += 4;
// then optional s for https
if ( *u == 's' ) u++;
// then ://
u += 3;
// scan until \0 or /
for ( ; *u && *u !='/' ; u++ );
// if \0 we are root
bool isRoot = true;
if ( *u == '/' ) {
u++;
if ( *u ) isRoot = false;
}
// if we are not root
if ( isRoot == val ) continue;
// skip
p += 6;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
/*
if ( strncmp(p,"isparentindexed",16) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if we have no such inlink
if ( (bool)sreq->m_wasParentIndexed == val ) continue;
// skip
p += 16;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
*/
// we can now handle this guy since we have the latest
// SpiderReply, pretty much guaranteed
if ( strncmp(p,"isindexed",9) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if reply does not KNOW because of an error
// since XmlDoc::indexDoc() called
// XmlDoc::getNewSpiderReply() and did not have this
// info...
if ( srep && (bool)srep->m_isIndexedINValid ) continue;
// if no match continue
if ( srep && (bool)srep->m_isIndexed==val ) continue;
// allow "!isindexed" if no SpiderReply at all
if ( ! srep && val == 0 ) continue;
// skip
p += 9;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"ingoogle",8) == 0 ) {
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! sreq->m_inGoogleValid ) continue;
// if no match continue
if ( (bool)sreq->m_inGoogle == val ) continue;
// allow "!isindexed" if no SpiderReply at all
if ( ! srep && val == 0 ) continue;
// skip
p += 8;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// . check to see if a page is linked to by
// www.weblogs.com/shortChanges.xml and if it is we put
// it into a queue that has a respider rate no faster than
// 30 days, because we don't need to spider it quick since
// it is in the ping server!
if ( strncmp(p,"isparentpingserver",18) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_parentIsPingServer == val) continue;
// skip
p += 18;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"ispingserver",12) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_isPingServer == val ) continue;
// skip
p += 12;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp ( p , "isonsamesubdomain",17 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
if ( val == 0 &&
sreq->m_parentHostHash32 != sreq->m_hostHash32 )
continue;
if ( val == 1 &&
sreq->m_parentHostHash32 == sreq->m_hostHash32 )
continue;
p += 6;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp ( p , "isfakeip",8 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_fakeFirstIp == val ) continue;
p += 8;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
if ( val == 0 &&
sreq->m_parentDomHash32 != sreq->m_domHash32 )
continue;
if ( val == 1 &&
sreq->m_parentDomHash32 == sreq->m_domHash32 )
continue;
p += 6;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// jpg JPG gif GIF wmv mpg css etc.
if ( strncmp ( p , "ismedia",7 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// the new way is much faster, but support the
// old way below for a while since this bit is new
if ( sreq->m_hasMediaExtension )
goto gotOne;
// if that bit is valid, and zero, then we do not match
if ( sreq->m_hasMediaExtensionValid )
continue;
// check the extension
if ( urlLen<=5 ) continue;
ext = url + urlLen - 4;
if ( ext[0] == '.' ) {
if ( to_lower_a(ext[1]) == 'c' &&
to_lower_a(ext[2]) == 's' &&
to_lower_a(ext[3]) == 's' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'm' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'p' &&
to_lower_a(ext[2]) == 'n' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'w' &&
to_lower_a(ext[2]) == 'm' &&
to_lower_a(ext[3]) == 'v' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'w' &&
to_lower_a(ext[2]) == 'a' &&
to_lower_a(ext[3]) == 'v' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'j' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'g' &&
to_lower_a(ext[2]) == 'i' &&
to_lower_a(ext[3]) == 'f' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'i' &&
to_lower_a(ext[2]) == 'c' &&
to_lower_a(ext[3]) == 'o' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'm' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == '3' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'm' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == '4' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'm' &&
to_lower_a(ext[2]) == 'o' &&
to_lower_a(ext[3]) == 'v' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'a' &&
to_lower_a(ext[2]) == 'v' &&
to_lower_a(ext[3]) == 'i' )
goto gotOne;
}
else if ( ext[-1] == '.' ) {
if ( to_lower_a(ext[0]) == 'm' &&
to_lower_a(ext[1]) == 'p' &&
to_lower_a(ext[2]) == 'e' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
if ( to_lower_a(ext[0]) == 'j' &&
to_lower_a(ext[1]) == 'p' &&
to_lower_a(ext[2]) == 'e' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
}
// two letter extensions
// .warc.gz and .arc.gz is ok
// take this out for now
// else if ( ext[1] == '.' ) {
// if ( to_lower_a(ext[2]) == 'g' &&
// to_lower_a(ext[3]) == 'z' )
// goto gotOne;
// }
// check for ".css?" substring
// these two suck up a lot of time:
// take them out for now. MDW 2/21/2015
//special = strstr(url,".css?");
//if ( special ) goto gotOne;
//special = strstr(url,"/print/");
// try to make detecting .css? super fast
if ( ext[0] != '.' &&
ext[1] != '.' &&
urlLen > 10 ) {
for(register int32_t k=urlLen-10;k<urlLen;k++){
if ( url[k] != '.' ) continue;
if ( url[k+1] == 'c' &&
url[k+2] == 's' &&
url[k+3] == 's' &&
url[k+4] == '?' )
goto gotOne;
}
}
//if ( special ) goto gotOne;
// no match, try the next rule
continue;
gotOne:
p += 7;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// check for "isrss" aka "rss"
if ( strncmp(p,"isrss",5) == 0 ) {
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// if we are not rss, we do not match this rule
if ( (bool)srep->m_isRSS == val ) continue;
// skip it
p += 5;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// check for "isrss" aka "rss"
if ( strncmp(p,"isrssext",8) == 0 ) {
// if we are not rss, we do not match this rule
if ( (bool)sreq->m_isRSSExt == val ) continue;
// skip it
p += 8;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// check for permalinks. for new outlinks we *guess* if its
// a permalink by calling isPermalink() function.
if (!strncmp(p,"ispermalink",11) ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// if we are not rss, we do not match this rule
if ( (bool)srep->m_isPermalink == val ) continue;
// skip it
p += 11;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// supports LF_ISPERMALINK bit for outlinks that *seem* to
// be permalinks but might not
if (!strncmp(p,"ispermalinkformat",17) ) {
// if we are not rss, we do not match this rule
if ( (bool)sreq->m_isUrlPermalinkFormat ==val)continue;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// check for this
if ( strncmp(p,"isnewoutlink",12) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// skip if we do not match this rule
if ( (bool)sreq->m_isNewOutlink == val ) continue;
// skip it
p += 10;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// check for this
if ( strncmp(p,"isnewrequest",12) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if we are a new request and val is 1 (has '!')
if ( ! srep && val ) continue;
// skip if we are a new request and val is 1 (has '!')
if(srep&&sreq->m_addedTime>srep->m_spideredTime &&val)
continue;
// skip if we are old and val is 0 (does not have '!')
if(srep&&sreq->m_addedTime<=srep->m_spideredTime&&!val)
continue;
// skip it for speed
p += 12;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// kinda like isnewrequest, but has no reply. use hasreply?
if ( strncmp(p,"isnew",5) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// if we got a reply, we are not new!!
if ( (bool)sreq->m_hadReply != (bool)val ) continue;
// skip it for speed
p += 5;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// iswww, means url is like www.xyz.com/...
if ( strncmp(p,"iswww", 5) == 0 ) {
// now this is a bit - doesn't seem to be working yet
//if ( (bool)sreq->m_isWWWSubdomain == (bool)val )
// continue;
// skip "iswww"
p += 5;
// skip over http:// or https://
char *u = sreq->m_url;
if ( u[4] == ':' ) u += 7;
else if ( u[5] == ':' ) u += 8;
// url MUST be a www url
char isWWW = 0;
if( u[0] == 'w' &&
u[1] == 'w' &&
u[2] == 'w' ) isWWW = 1;
// skip if no match
if ( isWWW == val ) continue;
// TODO: fix www.knightstown.skepter.com
// maybe just have a bit in the spider request
// another rule?
p = strstr(p,"&&");
if ( ! p ) return i;
// skip the '&&'
p += 2;
goto checkNextRule;
}
// non-boolean junk
skipi:
// . we always match the "default" reg ex
// . this line must ALWAYS exist!
if ( *p=='d' && ! strcmp(p,"default" ) )
return i;
// is it in the big list of sites?
if ( *p == 't' && strncmp(p,"tag:",4) == 0 ) {
// skip for msg20
//if ( isForMsg20 ) continue;
// if there is no domain or url explicitly listed in the
// site list (only seeds) then assume the user is spidering
// the whole internet, so there is no site-list row to match
if ( sc->m_siteListIsEmpty &&
sc->m_siteListIsEmptyValid ) {
row = NULL;// no row
}
else if ( ! checkedRow ) {
// only do once for speed
checkedRow = true;
// this function is in PageBasic.cpp
// . it also has to match "tag" at (p+4)
row = getMatchingUrlPattern ( sc, sreq ,p+4);
}
// skip this rule if no site-list row with this tag matches
// (or, with '!', if one does)
if ( (bool)row == val ) continue;
// skip tag:
p += 4;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// set the sign
char *s = p;
// skip s to after
while ( *s && is_alpha_a(*s) ) s++;
// skip white space before the operator
//char *saved = s;
while ( *s && is_wspace_a(*s) ) s++;
char sign = 0;
if ( *s == '=' ) {
s++;
if ( *s == '=' ) s++;
sign = SIGN_EQ;
}
else if ( *s == '!' && s[1] == '=' ) {
s += 2;
sign = SIGN_NE;
}
else if ( *s == '<' ) {
s++;
if ( *s == '=' ) { sign = SIGN_LE; s++; }
else sign = SIGN_LT;
}
else if ( *s == '>' ) {
s++;
if ( *s == '=' ) { sign = SIGN_GE; s++; }
else sign = SIGN_GT;
}
// skip whitespace after the operator
while ( *s && is_wspace_a(*s) ) s++;
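// at this point "p" still points at the attribute name and "s" points
// just past the comparison operator; e.g. for "sitenuminlinks>=300" we
// have sign == SIGN_GE and s -> "300"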
// seed counts. how many seeds this subdomain has. 'siteadds'
if ( *p == 's' &&
p[1] == 'i' &&
p[2] == 't' &&
p[3] == 'e' &&
p[4] == 'a' &&
p[5] == 'd' &&
p[6] == 'd' &&
p[7] == 's' ) {
// need a quota table for this
if ( ! quotaTable ) continue;
// a special hack so it is seeds so we can use same tbl
int32_t h32 = sreq->m_siteHash32 ^ 0x123456;
int32_t *valPtr =(int32_t *)quotaTable->getValue(&h32);
int32_t a;
// if no count in table, that is strange, i guess
// skip for now???
// this happens if INJECTING a url from the
// "add url" function on homepage
if ( ! valPtr ) a=0;//continue;//{char *xx=NULL;*xx=0;}
// shortcut
else a = *valPtr;
//log("siteadds=%"INT32" for %s",a,sreq->m_url);
// what is the provided value in the url filter rule?
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// domain seeds. 'domainadds'
if ( *p == 'd' &&
p[1] == 'o' &&
p[2] == 'm' &&
p[3] == 'a' &&
p[4] == 'i' &&
p[5] == 'n' &&
p[6] == 'a' &&
p[7] == 'd' &&
p[8] == 'd' &&
p[9] == 's' ) {
// need a quota table for this
if ( ! quotaTable ) continue;
// a special hack so it is seeds so we can use same tbl
int32_t h32 = sreq->m_domHash32 ^ 0x123456;
int32_t *valPtr ;
valPtr = (int32_t *)quotaTable->getValue(&h32);
// if no count in table, that is strange, i guess
// skip for now???
int32_t a;
if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; }
else a = *valPtr;
// what is the provided value in the url filter rule?
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// new quotas. 'sitepages' = pages from site.
// 'sitepages > 20 && seedcount <= 1 --> FILTERED'
if ( *p == 's' &&
p[1] == 'i' &&
p[2] == 't' &&
p[3] == 'e' &&
p[4] == 'p' &&
p[5] == 'a' &&
p[6] == 'g' &&
p[7] == 'e' &&
p[8] == 's' ) {
// need a quota table for this
if ( ! quotaTable ) continue;
int32_t *valPtr ;
valPtr=(int32_t*)quotaTable->getValue(&sreq->m_siteHash32);
// if no count in table, that is strange, i guess
// skip for now???
int32_t a;
if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; }
else a = *valPtr;
// shortcut
//log("sitepgs=%"INT32" for %s",a,sreq->m_url);
// what is the provided value in the url filter rule?
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// domain quotas. 'domainpages > 10 && hopcount >= 1 --> FILTERED'
if ( *p == 'd' &&
p[1] == 'o' &&
p[2] == 'm' &&
p[3] == 'a' &&
p[4] == 'i' &&
p[5] == 'n' &&
p[6] == 'p' &&
p[7] == 'a' &&
p[8] == 'g' &&
p[9] == 'e' &&
p[10] == 's' ) {
// need a quota table for this. this only happens
// when trying to shortcut things to avoid adding
// urls to spiderdb... like XmlDoc.cpp calls
// getUrlFilterNum() to see if doc is banned or
// if it should harvest links.
if ( ! quotaTable )
return -1;
int32_t *valPtr;
valPtr=(int32_t*)quotaTable->getValue(&sreq->m_domHash32);
// if no count in table, that is strange, i guess
// skip for now???
int32_t a;
if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; }
else a = *valPtr;
// what is the provided value in the url filter rule?
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// tld:cn
if ( *p=='t' && strncmp(p,"tld",3)==0){
// set it on demand
if ( tld == (char *)-1 )
tld = getTLDFast ( sreq->m_url , &tldLen );
// no match if we have no tld. might be an IP only url,
// or not in our list in Domains.cpp::isTLD()
if ( ! tld || tldLen == 0 ) continue;
// set these up
//char *a = tld;
//int32_t alen = tldLen;
char *b = s;
// loop for the comma-separated list of tlds
// like tld:us,uk,fr,it,de
subloop1:
// get length of it in the regular expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
int32_t blen = b - start;
//char sm;
// if we had tld==com,org,...
if ( sign == SIGN_EQ &&
blen == tldLen &&
strncasecmp(start,tld,tldLen)==0 )
// if we matched any, that's great
goto matched1;
// if its tld!=com,org,...
// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == tldLen &&
strncasecmp(start,tld,tldLen)==0 )
// we do not match this rule if we matched
// any of the tlds in the != list
continue;
// might have another tld in a comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched1;
// otherwise, bad sign?
continue;
}
// advance to next tld if there was a comma after us
b++;
// and try again
goto subloop1;
// otherwise
// do we match, if not, try next regex
//sm = strncasecmp(a,b,blen);
//if ( sm != 0 && sign == SIGN_EQ ) goto miss1;
//if ( sm == 0 && sign == SIGN_NE ) goto miss1;
// come here on a match
matched1:
// we matched, now look for &&
p = strstr ( b , "&&" );
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
// come here if we did not match the tld
}
// lang:en,zh_cn
if ( *p=='l' && strncmp(p,"lang",4)==0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( langId == -1 ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
char *b = s;
// loop for the comma-separated list of langids
// like lang==en,es,...
subloop2:
// get length of it in the regular expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
int32_t blen = b - start;
//char sm;
// if we had lang==en,es,...
if ( sign == SIGN_EQ &&
blen == langLen &&
strncasecmp(start,lang,langLen)==0 )
// if we matched any, that's great
goto matched2;
// if its lang!=en,es,...
// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == langLen &&
strncasecmp(start,lang,langLen)==0 )
// we do not match this rule if we matched
// any of the langs in the != list
continue;
// might have another in the comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched2;
// otherwise, bad sign?
continue;
}
// advance to next list item if was a comma after us
b++;
// and try again
goto subloop2;
// come here on a match
matched2:
// we matched, now look for &&
p = strstr ( b , "&&" );
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
// come here if we did not match the lang
}
// parentlang=en,zh_cn
if ( *p=='p' && strncmp(p,"parentlang",10)==0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
//if ( ! srep ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
char *b = s;
// loop for the comma-separated list of langids
// like parentlang==en,es,...
subloop2b:
// get length of it in the expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
int32_t blen = b - start;
//char sm;
// if we had parentlang==en,es,...
if ( sign == SIGN_EQ &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// if we matched any, that's great
goto matched2b;
// if its parentlang!=en,es,...
// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// we do not match this rule if we matched
// any of the langs in the != list
continue;
// might have another in the comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched2b;
// otherwise, bad sign?
continue;
}
// advance to next list item if was a comma after us
b++;
// and try again
goto subloop2b;
// come here on a match
matched2b:
// we matched, now look for &&
p = strstr ( b , "&&" );
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
// come here if we did not match the parentlang
}
// hopcount == 20 [&&]
if ( *p=='h' && strncmp(p, "hopcount", 8) == 0){
// skip if not valid
if ( ! sreq->m_hopCountValid ) continue;
// shortcut
int32_t a = sreq->m_hopCount;
// make it point to the priority
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// the last time it was spidered
if ( *p=='l' && strncmp(p,"lastspidertime",14) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
int32_t a = 0;
// if no spider reply we can't match this rule!
if ( ! srep ) continue;
// shortcut
if ( srep ) a = srep->m_spideredTime;
// make it point to the retry count
int32_t b ;
// now "s" can be "{roundstart}"
if ( s[0]=='{' && strncmp(s,"{roundstart}",12)==0)
b = cr->m_spiderRoundStartTime;//Num;
else
b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// selector using the first time it was added to the Spiderdb
// added by Sam, May 5th 2015
if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
// skip for msg20
if ( isForMsg20 ) {
//log("was for message 20");
continue;
}
// get the age of the spider_request.
// (subtraction of uint with int; hope
// everything goes well there)
int32_t sreq_age = 0;
// if m_discoveryTime is available, we use it. Otherwise we use m_addedTime
if ( sreq && sreq->m_discoveryTime!=0) sreq_age = nowGlobal-sreq->m_discoveryTime;
if ( sreq && sreq->m_discoveryTime==0) sreq_age = nowGlobal-sreq->m_addedTime;
//log("spiderage=%d",sreq_age);
// the argument entered by user
int32_t argument_age=atoi(s) ;
if ( sign == SIGN_EQ && sreq_age != argument_age ) continue;
if ( sign == SIGN_NE && sreq_age == argument_age ) continue;
if ( sign == SIGN_GT && sreq_age <= argument_age ) continue;
if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
if ( *p=='e' && strncmp(p,"errorcount",10) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// shortcut
int32_t a = srep->m_errCount;
// make it point to the retry count
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
if ( *p == 'n' && strncmp(p,"numinlinks",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// these are -1 if they are NOT valid
int32_t a = sreq->m_pageNumInlinks;
// make it point to the priority
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 10;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// siteNumInlinks >= 300 [&&]
if ( *p=='s' && strncmp(p, "sitenuminlinks", 14) == 0){
// these are -1 if they are NOT valid
int32_t a1 = sreq->m_siteNumInlinks;
// only assign if valid
int32_t a2 = -1;
if ( srep ) a2 = srep->m_siteNumInlinks;
// assume a1 is the best
int32_t a = -1; // stays -1 if neither count is valid
// assign to the first valid one
if ( a1 != -1 ) a = a1;
else if ( a2 != -1 ) a = a2;
// swap if both are valid, but srep is more recent
if ( a1 != -1 && a2 != -1 &&
srep->m_spideredTime > sreq->m_addedTime )
a = a2;
// skip if nothing valid
if ( a == -1 ) continue;
// make it point to the priority
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 14;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
/*
// retryNum >= 2 [&&] ...
if ( *p=='r' && strncmp(p, "retrynum", 8) == 0){
// int16_tcut
int32_t a = sr->m_retryNum;
// make it point to the priority
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
*/
// how many seconds have passed since it was last attempted
// to be spidered? used in conjunction with percentchanged
// to assign when to re-spider it next
if ( *p=='s' && strncmp(p, "spiderwaited", 12) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// skip for msg20
if ( isForMsg20 ) continue;
// do not match rule if never attempted
// if ( srep->m_spideredTime == 0 ) {
// char*xx=NULL;*xx=0;}
// if ( srep->m_spideredTime == (uint32_t)-1){
// char*xx=NULL;*xx=0;}
// shortcut
int32_t a = nowGlobal - srep->m_spideredTime;
// make into days
//af /= (3600.0*24.0);
// back to a int32_t, round it
//int32_t a = (int32_t)(af + 0.5);
// make it point to the priority
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// percentchanged >= 50 [&&] ...
if ( *p=='p' && strncmp(p, "percentchangedperday", 20) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// skip for msg20
if ( isForMsg20 ) continue;
// shortcut
float a = srep->m_percentChangedPerDay;
// make it point to the priority
float b = atof(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// httpStatus == 400
if ( *p=='h' && strncmp(p, "httpstatus", 10) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// shortcut (errCode doubles as g_errno)
int32_t a = srep->m_errCode;
// make it point to the priority
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// how old is the doc in seconds? age is the pubDate age
if ( *p =='a' && strncmp(p, "age", 3) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// shortcut
int32_t age;
if ( srep->m_pubDate <= 0 ) age = -1;
else age = nowGlobal - srep->m_pubDate;
// we can not match if invalid
if ( age <= 0 ) continue;
// make it point to the priority
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && age != b ) continue;
if ( sign == SIGN_NE && age == b ) continue;
if ( sign == SIGN_GT && age <= b ) continue;
if ( sign == SIGN_LT && age >= b ) continue;
if ( sign == SIGN_GE && age < b ) continue;
if ( sign == SIGN_LE && age > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
/*
MDW: i replaced this with
m_contentHash32 to make spiders faster/smarter so let's
take this out for now
// how many new inlinkers we got since last spidered time?
if ( *p =='n' && strncmp(p, "newinlinks", 10) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// . make it point to the newinlinks.
// . # of new SpiderRequests added since
// srep->m_spideredTime
// . m_dupCache insures that the same ip/hostHash
// does not add more than 1 SpiderRequest for the
// same url/outlink
int32_t a = srep->m_newRequests;
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// quick
p += 10;
// look for more
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
*/
// our own regex thing (match front of url)
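// e.g. a rule like "^https://" or "^http://www." matches any url
// starting with that prefix (illustrative examples only)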
if ( *p=='^' ) {
// advance over caret
p++;
// now pstart pts to the string we will match
char *pstart = p;
// make "p" point to one past the last char in string
while ( *p && ! is_wspace_a(*p) ) p++;
// how long is the string to match?
int32_t plen = p - pstart;
// empty? that's kinda an error
if ( plen == 0 )
continue;
int32_t m = 1;
// check to see if we matched if url was long enough
if ( urlLen >= plen )
m = strncmp(pstart,url,plen);
if ( ( m == 0 && val == 0 ) ||
// if they used the '!' operator and we
// did not match the string, that's a
// row match
( m && val == 1 ) ) {
// another expression follows?
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// no match
continue;
}
// our own regex thing (match end of url)
if ( *p=='$' ) {
// advance over dollar sign
p++;
// a hack for $\.css, skip over the backslash too
if ( *p=='\\' && *(p+1)=='.' ) p++;
// now pstart pts to the string we will match
char *pstart = p;
// make "p" point to one past the last char in string
while ( *p && ! is_wspace_a(*p) ) p++;
// how long is the string to match?
int32_t plen = p - pstart;
// empty? that's kinda an error
if ( plen == 0 )
continue;
// . do we match it?
// . url has to be at least as big
// . match our tail
int32_t m = 1;
// check to see if we matched if url was long enough
if ( urlLen >= plen )
m = strncmp(pstart,url+urlLen-plen,plen);
if ( ( m == 0 && val == 0 ) ||
// if they used the '!' operator and we
// did not match the string, that's a
// row match
( m && val == 1 ) ) {
// another expression follows?
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// no match
continue;
}
// . by default a substring match
// . action=edit
// . action=history
// now pstart pts to the string we will match
char *pstart = p;
// make "p" point to one past the last char in string
while ( *p && ! is_wspace_a(*p) ) p++;
// how long is the string to match?
int32_t plen = p - pstart;
// need something...
if ( plen <= 0 ) continue;
// must be at least as big
//if ( urlLen < plen ) continue;
// nullify it temporarily
char c = *p;
*p = '\0';
// does url contain it? haystack=u needle=p
char *found = strstr ( url , pstart );
// put char back
*p = c;
// kind of a hack fix. if they inject a filtered url
// into test coll, do not filter it! fixes the fact that
// we filtered facebook, but still add it in our test
// collection injection in urls.txt
if ( found &&
sreq->m_isInjecting &&
cr->m_coll[0]=='t' &&
cr->m_coll[1]=='e' &&
cr->m_coll[2]=='s' &&
cr->m_coll[3]=='t' &&
cr->m_coll[4]=='\0' &&
cr->m_spiderPriorities[i] < 0 )
continue;
// support "!company" meaning if it does NOT match
// then do this ...
if ( ( found && val == 0 ) ||
// if they used the '!' operator and we
// did not match the string, that's a
// row match
( ! found && val == 1 ) ) {
// another expression follows?
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
}
// sanity check ... must be a default rule!
//char *xx=NULL;*xx=0;
// return -1 if no match, caller should use a default
return -1;
}
//static bool s_ufnInit = false;
//static HashTableX s_ufnTable;
//void clearUfnTable ( ) {
// s_ufnTable.clear();
// s_ufnTree.clear();
//}
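// . thin wrapper around getUrlFilterNum2(); callers typically map the
//   returned row index to that rule's action, e.g. cr->m_spiderPriorities[i],
//   and -1 means no row matched so a default should be used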
int32_t getUrlFilterNum ( SpiderRequest *sreq ,
SpiderReply *srep ,
int32_t nowGlobal ,
bool isForMsg20 ,
int32_t niceness ,
CollectionRec *cr ,
bool isOutlink ,
HashTableX *quotaTable ,
int32_t langId ) {
/*
turn this off for now to save memory on the g0 cluster.
we should nuke this anyway with rankdb
// init table?
if ( ! s_ufnInit ) {
s_ufnInit = true;
if ( ! s_ufnTable.set(8,
1,
1024*1024*5,
NULL,0,
false,
MAX_NICENESS,
"ufntab") ) { char *xx=NULL;*xx=0; }
}
// check in cache using date of request and reply and uh48 as the key
int64_t key64 = sreq->getUrlHash48();
key64 ^= (int64_t)sreq->m_addedTime;
if ( srep ) key64 ^= ((int64_t)srep->m_spideredTime)<<32;
char *uv = (char *)s_ufnTable.getValue(&key64);
if ( uv )
return *uv;
*/
char ufn = getUrlFilterNum2 ( sreq,
srep,
nowGlobal,
isForMsg20,
niceness,
cr,
isOutlink,
quotaTable ,
langId );
/*
// is table full? clear it if so
if ( s_ufnTable.getNumSlotsUsed() > 2000000 ) {
log("spider: resetting ufn table");
s_ufnTable.clear();
}
// cache it
s_ufnTable.addKey ( &key64 , &ufn );
*/
return (int32_t)ufn;
}
bool SpiderColl::printStats ( SafeBuf &sb ) {
return true;
}
// . dedup for spiderdb
// . TODO: we can still have spider request dups in this if they are
// sandwiched together just right because we only compare to the previous
// SpiderRequest we added when looking for dups. just need to hash the
// relevant input bits and use that for deduping.
// . TODO: we can store ufn/priority/spiderTime in the SpiderRequest along
// with the date now, so if url filters do not change then
// gotSpiderdbList() can assume those to be valid and save time. BUT it does
// have siteNumInlinks...
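// . the list is rewritten in place: "dst" chases the read cursor and only
//   keeps surviving records, while "restorePoint"/"prevLastKey" let us back
//   out the previously kept record when a newer duplicate shows up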
void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs ) {
//int32_t need = list->m_listSize;
char *newList = list->m_list;//(char *)mmalloc (need,"dslist");
//if ( ! newList ) {
// log("spider: could not dedup spiderdb list: %s",
// mstrerror(g_errno));
// return;
//}
char *dst = newList;
char *restorePoint = newList;
int64_t reqUh48 = 0LL;
int64_t repUh48 = 0LL;
SpiderReply *oldRep = NULL;
SpiderRequest *oldReq = NULL;
char *lastKey = NULL;
char *prevLastKey = NULL;
// save list ptr in case of re-read?
//char *saved = list->m_listPtr;
// reset it
list->resetListPtr();
for ( ; ! list->isExhausted() ; ) {
// breathe. NO! assume in thread!!
//QUICKPOLL(niceness);
// get rec
char *rec = list->getCurrentRec();
// pre skip it
list->skipCurrentRec();
// skip if negative, just copy over
if ( ( rec[0] & 0x01 ) == 0x00 ) {
// should not be in here if this was true...
if ( removeNegRecs ) {
log("spider: filter got negative key");
char *xx=NULL;*xx=0;
}
// save this
prevLastKey = lastKey;
lastKey = dst;
// otherwise, keep it
memmove ( dst , rec , sizeof(key128_t) );
dst += sizeof(key128_t);
continue;
}
// is it a reply?
if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
// cast it
SpiderReply *srep = (SpiderReply *)rec;
// shortcut
int64_t uh48 = srep->getUrlHash48();
// crazy?
if ( ! uh48 ) {
//uh48 = hash64b ( srep->m_url );
uh48 = 12345678;
log("spider: got uh48 of zero for spider req. "
"computing now.");
}
// does match last reply?
if ( repUh48 == uh48 ) {
// if he's a later date than us, skip us!
if ( oldRep->m_spideredTime >=
srep->m_spideredTime )
// skip us!
continue;
// otherwise, erase him
dst = restorePoint;
lastKey = prevLastKey;
}
// save in case we get erased
restorePoint = dst;
prevLastKey = lastKey;
lastKey = dst;
// get our size
int32_t recSize = srep->getRecSize();
// and add us
memmove ( dst , rec , recSize );
// advance
dst += recSize;
// update this crap for comparing to next reply
repUh48 = uh48;
oldRep = srep;
// get next spiderdb record
continue;
}
// shortcut
SpiderRequest *sreq = (SpiderRequest *)rec;
// shortcut
int64_t uh48 = sreq->getUrlHash48();
// crazy?
if ( ! uh48 ) {
//uh48 = hash64b ( sreq->m_url );
uh48 = 12345678;
log("spider: got uh48 of zero for spider req. "
"computing now.");
}
// update request with SpiderReply if newer, because ultimately
// ::getUrlFilterNum() will just look at SpiderRequest's
// version of these bits!
if ( oldRep && repUh48 == uh48 &&
oldRep->m_spideredTime > sreq->m_addedTime ) {
// if request was a page reindex docid based request
// and url has since been spidered, nuke it!
//if ( sreq->m_urlIsDocId ) continue;
if ( sreq->m_isPageReindex ) continue;
// same if indexcode was EFAKEFIRSTIP which XmlDoc.cpp
// re-adds to spiderdb with the right firstip. once
// those guys have a reply we can ignore them.
// TODO: what about diffbotxyz spider requests? those
// have a fakefirstip... they should not have requests
// though, since their parent url has that.
if ( sreq->m_fakeFirstIp ) continue;
SpiderReply *old = oldRep;
sreq->m_inGoogle = old->m_inGoogle;
sreq->m_hasAuthorityInlink = old->m_hasAuthorityInlink;
sreq->m_hasContactInfo = old->m_hasContactInfo;
//sreq->m_hasSiteVenue = old->m_hasSiteVenue;
}
// if we are not the same url as last request, add it
if ( uh48 != reqUh48 ) {
// a nice hook in
addIt:
// save in case we get erased
restorePoint = dst;
prevLastKey = lastKey;
// get our size
int32_t recSize = sreq->getRecSize();
// save this
lastKey = dst;
// and add us
memmove ( dst , rec , recSize );
// advance
dst += recSize;
// update this crap for comparing to next reply
reqUh48 = uh48;
oldReq = sreq;
// get next spiderdb record
continue;
}
// try to kinda grab the min hop count as well
// do not alter spiderdb!
// if ( sreq->m_hopCountValid && oldReq->m_hopCountValid ) {
// if ( oldReq->m_hopCount < sreq->m_hopCount )
// sreq->m_hopCount = oldReq->m_hopCount;
// else
// oldReq->m_hopCount = sreq->m_hopCount;
// }
// if he's essentially different input parms but for the
// same url, we want to keep him because he might map the
// url to a different url priority!
if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
// use hopcount now too!
oldReq->m_hopCount != sreq->m_hopCount ||
		     // makes a difference as far as m_minPubDate goes, because
// we want to make sure not to delete that request that
// has m_parentPrevSpiderTime
// no no, we prefer the most recent spider request
		     // from this site in the logic above, so this is not
// necessary. mdw commented out.
//oldReq->m_wasParentIndexed != sreq->m_wasParentIndexed||
oldReq->m_isInjecting != sreq->m_isInjecting ||
oldReq->m_hasContent != sreq->m_hasContent ||
oldReq->m_isAddUrl != sreq->m_isAddUrl ||
oldReq->m_isPageReindex != sreq->m_isPageReindex ||
oldReq->m_forceDelete != sreq->m_forceDelete )
// we are different enough to coexist
goto addIt;
// . if the same check who has the most recent added time
// . if we are not the most recent, just do not add us
// . no, now i want the oldest so we can do gbssDiscoveryTime
// and set sreq->m_discoveryTime accurately, above
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
// otherwise, erase over him
dst = restorePoint;
lastKey = prevLastKey;
// and add us over top of him
goto addIt;
}
// free the old list
//char *oldbuf = list->m_alloc;
//int32_t oldSize = list->m_allocSize;
// sanity check
if ( dst < list->m_list || dst > list->m_list + list->m_listSize ) {
char *xx=NULL;*xx=0; }
// and stick our newly filtered list in there
//list->m_list = newList;
list->m_listSize = dst - newList;
// set to end i guess
list->m_listPtr = dst;
//list->m_allocSize = need;
//list->m_alloc = newList;
list->m_listEnd = list->m_list + list->m_listSize;
list->m_listPtrHi = NULL;
//KEYSET(list->m_lastKey,lastKey,list->m_ks);
if ( lastKey ) KEYSET(list->m_lastKey,lastKey,list->m_ks);
//mfree ( oldbuf , oldSize, "oldspbuf");
}
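// A minimal sketch (not compiled into the build) of the in-place
// compaction idiom dedupSpiderdbList() uses above: the write cursor never
// passes the read cursor, so surviving records can be memmove'd down over
// the gaps left by dropped duplicates without a second buffer, and a
// saved "previous" pointer (like restorePoint above) lets a later, better
// record overwrite the one just kept. Fixed-size records and all names
// here are hypothetical.
/*
static int32_t compactFixedRecs ( char *buf , int32_t size , int32_t recSize ) {
	char *dst  = buf;
	char *prev = NULL;
	for ( char *src = buf ; src + recSize <= buf + size ; src += recSize ) {
		// drop a record identical to the one we just kept
		if ( prev && memcmp ( prev , src , recSize ) == 0 ) continue;
		// remember where we wrote it in case a later record
		// needs to overwrite it
		prev = dst;
		// dst never passes src, so copying down is safe
		memmove ( dst , src , recSize );
		dst += recSize;
	}
	// the new, possibly smaller, list size
	return dst - buf;
}
*/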
///////
//
// diffbot uses these for limiting crawls in a collection
//
///////
void gotCrawlInfoReply ( void *state , UdpSlot *slot);
static int32_t s_requests = 0;
static int32_t s_replies = 0;
static int32_t s_validReplies = 0;
static bool s_inUse = false;
// we initialize CollectionRec::m_updateRoundNum to 0 so make this 1
static int32_t s_updateRoundNum = 1;
// . just call this once per second for all collections
// . figure out how to backoff on collections that don't need it so much
// . ask every host for their crawl infos for each collection rec
void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) {
// debug test
//int32_t mr = g_collectiondb.m_recs[0]->m_maxCrawlRounds;
//log("mcr: %"INT32"",mr);
// i don't know why we have locks in the lock table that are not
	// getting removed... so log when we remove an expired lock and see.
// piggyback on this sleep wrapper call i guess...
// perhaps the collection was deleted or reset before the spider
// reply could be generated. in that case we'd have a dangling lock.
removeExpiredLocks ( -1 );
if ( s_inUse ) return;
// "i" means to get incremental updates since last round
// "f" means to get all stats
char *request = "i";
int32_t requestSize = 1;
static bool s_firstCall = true;
if ( s_firstCall ) {
s_firstCall = false;
request = "f";
}
s_inUse = true;
// reset tmp crawlinfo classes to hold the ones returned to us
//for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
// CollectionRec *cr = g_collectiondb.m_recs[i];
// if ( ! cr ) continue;
// cr->m_tmpCrawlInfo.reset();
//}
// send out the msg request
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
// skip if dead. no! we need replies from all hosts
		// otherwise our counts could be short and we might end up
// re-spidering stuff even though we've really hit maxToCrawl
//if ( g_hostdb.isDead(i) ) {
// if ( g_conf.m_logDebugSpider )
// log("spider: skipping dead host #%"INT32" "
// "when getting "
// "crawl info",i);
// continue;
//}
// count it as launched
s_requests++;
// launch it
if ( ! g_udpServer.sendRequest ( request,
requestSize,
0xc1 , // msgtype
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL, // retslot
NULL, // state
gotCrawlInfoReply ) ) {
log("spider: error sending c1 request: %s",
mstrerror(g_errno));
s_replies++;
}
}
// return false if we blocked awaiting replies
if ( s_replies < s_requests )
return;
// how did this happen?
log("spider: got bogus crawl info replies!");
s_inUse = false;
return;
// somehow we did not block... hmmmm...
//char *xx=NULL;*xx=0;
//gotCrawlInfoReply( cr , NULL );
// we did not block...
//return true;
}
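// A minimal sketch (not compiled into the build) of the request/reply
// accounting the sleep wrapper above relies on: one 0xc1 request per host
// bumps s_requests, every reply (error or not) bumps s_replies, and only
// non-error replies count toward s_validReplies. The final reply triggers
// the aggregation pass and resets the counters for the next round. The
// function name here is hypothetical.
/*
static void noteCrawlInfoReply ( bool gotError ) {
	s_replies++;
	if ( ! gotError ) s_validReplies++;
	// still waiting on other hosts?
	if ( s_replies < s_requests ) return;
	// last reply is in: counts are only trusted if every host answered
	s_countsAreValid = ( s_validReplies == s_replies );
	s_requests = s_replies = s_validReplies = 0;
	s_inUse = false;
}
*/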
// . Parms.cpp calls this when it receives our "spiderRoundNum" increment above
// . all hosts should get it at *about* the same time
void spiderRoundIncremented ( CollectionRec *cr ) {
log("spider: incrementing spider round for coll %s to %"INT32" (%"UINT32")",
cr->m_coll,cr->m_spiderRoundNum,
(uint32_t)cr->m_spiderRoundStartTime);
// . need to send a notification for this round
// . we are only here because the round was incremented and
// Parms.cpp just called us... and that only happens in
// doneSending... so do not send again!!!
//cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// . if we set sentCrawlDoneALert to 0 it will immediately
// trigger another round increment !! so we have to set these
// to true to prevent that.
// . if we learnt that there really are no more urls ready to spider
// then we'll go to the next round. but that can take like
// SPIDER_DONE_TIMER seconds of getting nothing.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = true;
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound = 0;
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound = 0;
cr->localCrawlInfoUpdate();
cr->m_needsSave = true;
}
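// . called once per host's 0xc1 reply (see the send loop above)
// . stores that host's per-collection CrawlInfo into
//   CollectionRec::m_crawlInfoBuf and, once the last reply arrives,
//   sums the per-host stats into each collection's m_globalCrawlInfo,
//   flags crawls whose round has ended and, on host #0, sends the
//   crawl-done notification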
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// loop over each LOCAL crawlinfo we received from this host
CrawlInfo *ptr = (CrawlInfo *)(slot->m_readBuf);
CrawlInfo *end = (CrawlInfo *)(slot->m_readBuf+ slot->m_readBufSize);
//int32_t allocSize = slot->m_readBufMaxSize;
// host sending us this reply
Host *h = slot->m_host;
// assume it is a valid reply, not an error, like a udptimedout
s_validReplies++;
// reply is error? then use the last known good reply we had from him
// assuming udp reply timed out. empty buf just means no update now!
if ( ! slot->m_readBuf && g_errno ) {
log("spider: got crawlinfo reply error from host %"INT32": %s. "
"spidering will be paused.",
h->m_hostId,mstrerror(g_errno));
// just clear it
g_errno = 0;
// if never had any reply... can't be valid then
if ( ! ptr ) s_validReplies--;
/*
// just use his last known good reply
ptr = (CrawlInfo *)h->m_lastKnownGoodCrawlInfoReply;
end = (CrawlInfo *)h->m_lastKnownGoodCrawlInfoReplyEnd;
*/
}
// otherwise, if reply was good it is the last known good now!
/*
else {
// free the old good one and replace it with the new one
if ( h->m_lastKnownGoodCrawlInfoReply ) {
//log("spider: skiipping possible bad free!!!! until we fix");
mfree ( h->m_lastKnownGoodCrawlInfoReply ,
h->m_replyAllocSize ,
"lknown" );
}
// add in the new good in case he goes down in the future
h->m_lastKnownGoodCrawlInfoReply = (char *)ptr;
h->m_lastKnownGoodCrawlInfoReplyEnd = (char *)end;
// set new alloc size
h->m_replyAllocSize = allocSize;
// if valid, don't let him free it now!
slot->m_readBuf = NULL;
}
*/
// inc it
s_replies++;
if ( s_replies > s_requests ) { char *xx=NULL;*xx=0; }
	// crap, if any host is dead and not reporting its number then
// that seriously fucks us up because our global count will drop
// and something that had hit a max limit, like maxToCrawl, will
// now be under the limit and the crawl will resume.
// what's the best way to fix this?
//
// perhaps, let's just keep the dead host's counts the same
// as the last time we got them. or maybe the simplest way is to
// just not allow spidering if a host is dead
	// the sendbuf should never be freed! it points to the static
	// 'i' or 'f' request string
slot->m_sendBufAlloc = NULL;
/////
// SCAN the list of CrawlInfos we received from this host,
// one for each non-null collection
/////
// . add the LOCAL stats we got from the remote into the GLOBAL stats
// . readBuf is null on an error, so check for that...
// . TODO: do not update on error???
for ( ; ptr < end ; ptr++ ) {
QUICKPOLL ( slot->m_niceness );
// get collnum
collnum_t collnum = (collnum_t)(ptr->m_collnum);
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
log("spider: updatecrawlinfo collnum %"INT32" "
"not found",(int32_t)collnum);
continue;
}
//CrawlInfo *stats = ptr;
// just copy into the stats buf
if ( ! cr->m_crawlInfoBuf.getBufStart() ) {
int32_t need = sizeof(CrawlInfo) * g_hostdb.m_numHosts;
cr->m_crawlInfoBuf.setLabel("cibuf");
cr->m_crawlInfoBuf.reserve(need);
			// in case one host's udp reply timed out or something
cr->m_crawlInfoBuf.zeroOut();
}
CrawlInfo *cia = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
if ( cia )
gbmemcpy ( &cia[h->m_hostId] , ptr , sizeof(CrawlInfo));
// debug
// log("spd: got ci from host %"INT32" downloads=%"INT64", replies=%"INT32"",
// h->m_hostId,
// ptr->m_pageDownloadSuccessesThisRound,
// s_replies
// );
// mark it for computation once we got all replies
cr->m_updateRoundNum = s_updateRoundNum;
}
// keep going until we get all replies
if ( s_replies < s_requests ) return;
// if it's the last reply we are to receive, and 1 or more
// hosts did not have a valid reply, and not even a
// "last known good reply" then then we can't do
// much, so do not spider then because our counts could be
// way off and cause us to start spidering again even though
// we hit a maxtocrawl limit!!!!!
if ( s_validReplies < s_replies ) {
// this will tell us to halt all spidering
// because a host is essentially down!
s_countsAreValid = false;
// might as well stop the loop here since we are
// not updating our crawlinfo states.
//break;
}
else {
if ( ! s_countsAreValid )
log("spider: got all crawlinfo replies. all shards "
"up. spidering back on.");
s_countsAreValid = true;
}
// loop over
for ( int32_t x = 0 ; x < g_collectiondb.m_numRecs ; x++ ) {
QUICKPOLL ( slot->m_niceness );
// a niceness 0 routine could have nuked it?
if ( x >= g_collectiondb.m_numRecs )
break;
CollectionRec *cr = g_collectiondb.m_recs[x];
if ( ! cr ) continue;
// must be in need of computation
if ( cr->m_updateRoundNum != s_updateRoundNum ) continue;
//log("spider: processing c=%s",cr->m_coll);
CrawlInfo *gi = &cr->m_globalCrawlInfo;
int32_t hadUrlsReady = gi->m_hasUrlsReadyToSpider;
// clear it out
gi->reset();
// retrieve stats for this collection and scan all hosts
CrawlInfo *cia = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
// if empty for all hosts, i guess no stats...
if ( ! cia ) continue;
for ( int32_t k = 0 ; k < g_hostdb.m_numHosts; k++ ) {
QUICKPOLL ( slot->m_niceness );
			// get the CrawlInfo for the kth host
CrawlInfo *stats = &cia[k];
// point to the stats for that host
int64_t *ss = (int64_t *)stats;
int64_t *gs = (int64_t *)gi;
// add each hosts counts into the global accumulators
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
*gs = *gs + *ss;
// crazy stat?
if ( *ss > 1000000000LL ||
*ss < -1000000000LL )
log("spider: crazy stats %"INT64" "
"from host #%"INT32" coll=%s",
*ss,k,cr->m_coll);
gs++;
ss++;
}
// . special counts
// . assume round #'s match!
//if ( ss->m_spiderRoundNum ==
// cr->m_localCrawlInfo.m_spiderRoundNum ) {
gi->m_pageDownloadSuccessesThisRound +=
stats->m_pageDownloadSuccessesThisRound;
gi->m_pageProcessSuccessesThisRound +=
stats->m_pageProcessSuccessesThisRound;
//}
if ( ! stats->m_hasUrlsReadyToSpider ) continue;
// inc the count otherwise
gi->m_hasUrlsReadyToSpider++;
// . no longer initializing?
// . sometimes other shards get the spider
// requests and not us!!!
if ( cr->m_spiderStatus == SP_INITIALIZING )
cr->m_spiderStatus = SP_INPROGRESS;
// i guess we are back in business even if
// m_spiderStatus was SP_MAXTOCRAWL or
// SP_ROUNDDONE...
cr->m_spiderStatus = SP_INPROGRESS;
// unflag the sent flag if we had sent an alert
// but only if it was a crawl round done alert,
// not a maxToCrawl or maxToProcess or
// maxRounds alert.
// we can't do this because on startup we end
// up setting hasUrlsReadyToSpider to true and
// we may have already sent an email, and it
// gets RESET here when it shouldn't be
//if(cr->m_localCrawlInfo.m_sentCrawlDoneAlert
//== SP_ROUNDDONE )
//cr->m_localCrawlInfo.m_sentCrawlDoneAlert=0;
// revival?
if ( ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
log("spider: reviving crawl %s "
"from host %"INT32"", cr->m_coll,k);
} // end loop over hosts
// log("spider: %"INT64" (%"INT64") total downloads for c=%s",
// gi->m_pageDownloadSuccessesThisRound,
// gi->m_pageDownloadSuccesses,
// cr->m_coll);
// revival?
//if ( cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider &&
// ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
// log("spider: reviving crawl %s (%"INT32")",cr->m_coll,
// cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider);
//}
//bool has = cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider;
if ( hadUrlsReady &&
// and it no longer does now...
! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
log(LOG_INFO,
"spider: all %"INT32" hosts report "
"%s (%"INT32") has no "
"more urls ready to spider",
s_replies,cr->m_coll,(int32_t)cr->m_collnum);
// set crawl end time
cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
}
// now copy over to global crawl info so things are not
		// half-updated should we try to read globalcrawlinfo
// in between packets received.
//gbmemcpy ( &cr->m_globalCrawlInfo ,
// &cr->m_tmpCrawlInfo ,
// sizeof(CrawlInfo) );
		// do not assume we are out of urls just yet if a host
// in the network has not reported...
//if ( g_hostdb.hasDeadHost() && has )
// cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
// should we reset our "sent email" flag?
bool reset = false;
// can't reset if we've never sent an email out yet
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) reset = true;
// must have some urls ready to spider now so we can send
// another email after another round of spidering
if (!cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider) reset=false;
// . if we have urls ready to be spidered then prepare to send
// another email/webhook notification.
// . do not reset this flag if SP_MAXTOCRAWL etc otherwise we
// end up sending multiple notifications, so this logic here
// is only for when we are done spidering a round, which
// happens when hasUrlsReadyToSpider goes false for all
// shards.
if ( reset ) {
log("spider: resetting sent crawl done alert to 0 "
"for coll %s",cr->m_coll);
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
// update cache time
cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();
// make it save to disk i guess
cr->m_needsSave = true;
// if spidering disabled in master controls then send no
// notifications
// crap, but then we can not update the round start time
// because that is done in doneSendingNotification().
// but why does it say all 32 report done, but then
// it has urls ready to spider?
if ( ! g_conf.m_spideringEnabled )
continue;
// and we've examined at least one url. to prevent us from
// sending a notification if we haven't spidered anything
// because no seed urls have been added/injected.
//if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return;
if ( cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 &&
// if we don't have this here we may not get the
// pageDownloadAttempts in time from the host that
// did the spidering.
! hadUrlsReady )
continue;
// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
//if ( cr->m_spiderRoundStartTime == 0 )
// // all hosts in the network should sync with host #0
// // on this
// cr->m_spiderRoundStartTime = getTimeGlobal();
// but of course if it has urls ready to spider, do not send
// alert... or if this is -1, indicating "unknown".
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
continue;
		// update status if not already SP_MAXTOCRAWL, etc. we might
// just be flat out of urls
if ( ! cr->m_spiderStatus ||
cr->m_spiderStatus == SP_INPROGRESS ||
cr->m_spiderStatus == SP_INITIALIZING )
cr->m_spiderStatus = SP_ROUNDDONE;
//
// TODO: set the spiderstatus outright here...
// maxtocrawl, maxtoprocess, etc. based on the counts.
//
// only host #0 sends emails
if ( g_hostdb.m_myHost->m_hostId != 0 )
continue;
// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
// we'd email SP_MAXROUNDS to indicate we've hit the maximum
// round count.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) {
// debug
logf(LOG_DEBUG,"spider: already sent alert for %s"
, cr->m_coll);
continue;
}
// do email and web hook...
sendNotificationForCollRec ( cr );
// deal with next collection rec
}
// wait for more replies to come in
//if ( s_replies < s_requests ) return;
// initialize
s_replies = 0;
s_requests = 0;
s_validReplies = 0;
s_inUse = false;
s_updateRoundNum++;
}
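// A minimal sketch (not compiled into the build) of the summing pass
// above: the leading NUMCRAWLSTATS members of CrawlInfo are treated as a
// flat array of int64_t counters, so folding one host's stats into the
// global stats is just an element-wise add. The function name is
// hypothetical.
/*
static void addCrawlStats ( CrawlInfo *global , CrawlInfo *hostStats ) {
	int64_t *gs = (int64_t *)global;
	int64_t *ss = (int64_t *)hostStats;
	for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ )
		// accumulate this host's j-th counter into the global total
		gs[j] += ss[j];
}
*/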
void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
//char *request = slot->m_readBuf;
// just a single collnum
if ( slot->m_readBufSize != 1 ) { char *xx=NULL;*xx=0;}
char *req = slot->m_readBuf;
if ( ! slot->m_host ) {
log("handc1: no slot->m_host from ip=%s udpport=%i",
iptoa(slot->m_ip),(int)slot->m_port);
g_errno = ENOHOSTS;
g_udpServer.sendErrorReply ( slot , g_errno );
return;
}
//if ( ! isClockSynced() ) {
//}
//collnum_t collnum = *(collnum_t *)request;
//CollectionRec *cr = g_collectiondb.getRec(collnum);
// deleted from under us? i've seen this happen
//if ( ! cr ) {
// log("spider: c1: coll deleted returning empty reply");
// g_udpServer.sendReply_ass ( "", // reply
// 0,
// 0 , // alloc
// 0 , //alloc size
// slot );
// return;
//}
// while we are here update CrawlInfo::m_nextSpiderTime
// to the time of the next spider request to spider.
// if doledb is empty and the next rec in the waiting tree
// does not have a time of zero, but rather, in the future, then
// return that future time. so if a crawl is enabled we should
	// actively call updateCrawlInfo for a collection every minute or
// so.
//cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
//int64_t nowGlobalMS = gettimeofdayInMillisecondsGlobal();
//int64_t nextSpiderTimeMS;
// this will be 0 for ip's which have not had their SpiderRequests
// in spiderdb scanned yet to get the best SpiderRequest, so we
// just have to wait for that.
/*
nextSpiderTimeMS = sc->getEarliestSpiderTimeFromWaitingTree(0);
if ( ! sc->m_waitingTreeNeedsRebuild &&
sc->m_lastDoledbReadEmpty &&
cr->m_spideringEnabled &&
g_conf.m_spideringEnabled &&
nextSpiderTimeMS > nowGlobalMS +10*60*1000 )
// turn off this flag, "ready queue" is empty
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 0;
// but send back a -1 if we do not know yet because we haven't
// read the doledblists from disk from all priorities for this coll
if ( sc->m_numRoundsDone == 0 )
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = -1;
*/
//int32_t now = getTimeGlobal();
SafeBuf replyBuf;
uint32_t now = (uint32_t)getTimeGlobalNoCore();
uint64_t nowMS = gettimeofdayInMillisecondsGlobalNoCore();
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
QUICKPOLL(slot->m_niceness);
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
		// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// this is now needed for alignment by the receiver
ci->m_collnum = i;
SpiderColl *sc = cr->m_spiderColl;
/////////
//
// ARE WE DONE SPIDERING?????
//
/////////
// speed up qa pipeline
uint32_t spiderDoneTimer = (uint32_t)SPIDER_DONE_TIMER;
if ( cr->m_coll[0] == 'q' &&
cr->m_coll[1] == 'a' &&
strcmp(cr->m_coll,"qatest123")==0)
spiderDoneTimer = 10;
// if we haven't spidered anything in 1 min assume the
// queue is basically empty...
if ( ci->m_lastSpiderAttempt &&
ci->m_lastSpiderCouldLaunch &&
ci->m_hasUrlsReadyToSpider &&
// the next round we are waiting for, if any, must
// have had some time to get urls! otherwise we
// will increment the round # and wait just
// SPIDER_DONE_TIMER seconds and end up setting
// hasUrlsReadyToSpider to false!
now > cr->m_spiderRoundStartTime + spiderDoneTimer &&
// no spiders currently out. i've seen a couple out
// waiting for a diffbot reply. wait for them to
// return before ending the round...
sc && sc->m_spidersOut == 0 &&
// it must have launched at least one url! this should
// prevent us from incrementing the round # at the gb
// process startup
//ci->m_numUrlsLaunched > 0 &&
//cr->m_spideringEnabled &&
//g_conf.m_spideringEnabled &&
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch >
spiderDoneTimer ) {
// break it here for our collnum to see if
// doledb was just lagging or not.
bool printIt = true;
if ( now < sc->m_lastPrinted ) printIt = false;
if ( printIt ) sc->m_lastPrinted = now + 5;
// doledb must be empty
if ( ! sc->m_doleIpTable.isEmpty() ) {
if ( printIt )
log("spider: not ending crawl because "
"doledb not empty for coll=%s",cr->m_coll);
goto doNotEnd;
}
uint64_t nextTimeMS ;
nextTimeMS = sc->getNextSpiderTimeFromWaitingTree ( );
// and no ips awaiting scans to get into doledb
// except for ips needing scans 60+ seconds from now
if ( nextTimeMS && nextTimeMS < nowMS + 60000 ) {
if ( printIt )
log("spider: not ending crawl because "
"waiting tree key is ready for scan "
"%"INT64" ms from now for coll=%s",
nextTimeMS - nowMS,cr->m_coll );
goto doNotEnd;
}
// maybe wait for waiting tree population to finish
if ( sc->m_waitingTreeNeedsRebuild ) {
if ( printIt )
log("spider: not ending crawl because "
"waiting tree is building for coll=%s",
cr->m_coll );
goto doNotEnd;
}
// this is the MOST IMPORTANT variable so note it
log(LOG_INFO,
"spider: coll %s has no more urls to spider",
cr->m_coll);
// assume our crawl on this host is completed i guess
ci->m_hasUrlsReadyToSpider = 0;
// if changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
// save that!
cr->m_needsSave = true;
}
doNotEnd:
int32_t hostId = slot->m_host->m_hostId;
bool sendIt = false;
// . if not sent to host yet, send
// . this will be true when WE startup, not them...
// . but once we send it we set flag to false
// . and if we update anything we send we set flag to true
// again for all hosts
if ( cr->shouldSendLocalCrawlInfoToHost(hostId) )
sendIt = true;
// they can override. if host crashed and came back up
// it might not have saved the global crawl info for a coll
// perhaps, at the very least it does not have
// the correct CollectionRec::m_crawlInfoBuf because we do
// not save the array of crawlinfos for each host for
// all collections.
if ( req && req[0] == 'f' )
sendIt = true;
if ( ! sendIt ) continue;
// note it
// log("spider: sending ci for coll %s to host %"INT32"",
// cr->m_coll,hostId);
// save it
replyBuf.safeMemcpy ( ci , sizeof(CrawlInfo) );
		// do not re-do it unless it gets updated here or in XmlDoc.cpp
cr->sentLocalCrawlInfoToHost ( hostId );
}
g_udpServer.sendReply_ass ( replyBuf.getBufStart() ,
replyBuf.length() ,
replyBuf.getBufStart() , // alloc
replyBuf.getCapacity() , //alloc size
slot );
// udp server will free this
replyBuf.detachBuf();
}
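// A minimal sketch (not compiled into the build) of what the 0xc1 reply
// built above looks like on the wire: a packed array of CrawlInfo
// structs, one per collection this host decided to send, each with
// m_collnum filled in so the receiver (gotCrawlInfoReply) can map it
// back to a CollectionRec. The function name is hypothetical.
/*
static void scanCrawlInfoReply ( char *buf , int32_t bufSize ) {
	CrawlInfo *ci  = (CrawlInfo *)buf;
	CrawlInfo *end = (CrawlInfo *)(buf + bufSize);
	for ( ; ci < end ; ci++ )
		// each entry identifies its collection via m_collnum
		log("spider: got stats for collnum %"INT32"",
		    (int32_t)ci->m_collnum);
}
*/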
bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
if ( ! g_conf.m_spideringEnabled && ! cx->m_isCustomCrawl ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("Spidering disabled in "
"master controls. You can turn it "
"back on there.");
}
if ( g_conf.m_readOnlyMode ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("In read-only mode. Spidering off.");
}
if ( g_dailyMerge.m_mergeMode ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("Daily merge engaged, spidering "
"paused.");
}
// if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) {
// *status = SP_ADMIN_PAUSED;
// return msg->safePrintf("Too many UDP slots in use, "
// "spidering paused.");
// }
if ( g_repairMode ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("In repair mode, spidering paused.");
}
// do not spider until collections/parms in sync with host #0
if ( ! g_parms.m_inSyncWithHost0 ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("Parms not in sync with host #0, "
"spidering paused");
}
// don't spider if not all hosts are up, or they do not all
// have the same hosts.conf.
if ( g_pingServer.m_hostsConfInDisagreement ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("Hosts.conf discrepancy, "
"spidering paused.");
}
uint32_t now = (uint32_t)getTimeGlobal();
// try to fix crawlbot nightly test complaining about job status
// for TestRepeatCrawlWithMaxToCrawl
if ( (cx->m_spiderStatus == SP_MAXTOCRAWL ||
cx->m_spiderStatus == SP_MAXTOPROCESS ) &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime &&
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Job has reached maxRounds "
"limit." );
}
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_spiderStatus == SP_MAXTOCRAWL &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Jobs has reached maxToCrawl limit. "
"Next crawl round to start "
"in %"INT32" seconds.",
(int32_t)(cx->m_spiderRoundStartTime-
now));
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Jobs has reached maxToProcess limit. "
"Next crawl round to start "
"in %"INT32" seconds.",
(int32_t)(cx->m_spiderRoundStartTime-
now));
}
if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Job has reached maxToCrawl "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Job has reached maxToProcess "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Job has reached maxRounds "
"limit." );
}
if ( ! cx->m_spideringEnabled ) {
*status = SP_PAUSED;
if ( cx->m_isCustomCrawl )
return msg->safePrintf("Job paused.");
else
return msg->safePrintf("Spidering disabled "
"in spider controls.");
}
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Next crawl round to start "
"in %"INT32" seconds.",
(int32_t)(cx->m_spiderRoundStartTime-
now) );
}
// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
//CrawlInfo *cg = &cx->m_globalCrawlInfo;
//if ( cg->m_pageDownloadAttempts == 0 ) {
// *status = SP_NOURLS;
// return msg->safePrintf("Crawl is waiting for urls.");
//}
if ( cx->m_spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Job is initializing.");
}
// if we had seeds and none were successfully crawled, do not just
// print that the crawl completed.
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
cx->m_isCustomCrawl &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
cx->m_globalCrawlInfo.m_pageDownloadAttempts > 0 &&
cx->m_globalCrawlInfo.m_pageDownloadSuccesses == 0 ) {
*status = SP_SEEDSERROR;
return msg->safePrintf("Failed to crawl any seed.");
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
cx->m_isCustomCrawl &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
*status = SP_COMPLETED;
return msg->safePrintf("Job has completed and no "
"repeat is scheduled.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Nothing currently "
"available to spider. "
"Change your url filters, try "
"adding new urls, or wait for "
"existing urls to be respidered.");
}
// let's pass the qareindex() test in qa.cpp... it wasn't updating
// the status to done. it kept saying in progress.
if ( ! cx->m_isCustomCrawl &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
//*status = SP_COMPLETED;
*status = SP_INPROGRESS;
return msg->safePrintf ( "Nothing currently "
"available to spider. "
"Change your url filters, try "
"adding new urls, or wait for "
"existing urls to be respidered.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Job round completed.");
}
if ( ! g_conf.m_spideringEnabled ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "
"by root administrator for "
"maintenance.");
}
	// our CollectionRec::m_globalCrawlInfo counts do not have a dead
	// host's counts tallied into it, which could make a difference on
	// whether we have exceeded a maxtocrawl limit or some such, so wait...
if ( ! s_countsAreValid && g_hostdb.hasDeadHost() ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "
"because a shard is down.");
}
// otherwise in progress?
*status = SP_INPROGRESS;
if ( cx->m_isCustomCrawl )
return msg->safePrintf("Job is in progress.");
else
return msg->safePrintf("Spider is in progress.");
}
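// A minimal sketch (not compiled into the build) of how a caller might
// use getSpiderStatusMsg(): pass a SafeBuf for the human-readable message
// and an int32_t for the status code. The function name is hypothetical.
/*
static void logSpiderStatus ( CollectionRec *cr ) {
	SafeBuf msg;
	int32_t status = 0;
	getSpiderStatusMsg ( cr , &msg , &status );
	log("spider: coll=%s status=%"INT32" msg=%s",
	    cr->m_coll , status , msg.getBufStart() );
}
*/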
bool hasPositivePattern ( char *pattern ) {
char *p = pattern;
// scan the " || " separated substrings
for ( ; *p ; ) {
// get beginning of this string
char *start = p;
// skip white space
while ( *start && is_wspace_a(*start) ) start++;
// done?
if ( ! *start ) break;
// find end of it
char *end = start;
while ( *end && end[0] != '|' )
end++;
// advance p for next guy
p = end;
// should be two |'s
if ( *p ) p++;
if ( *p ) p++;
// skip if negative pattern
if ( start[0] == '!' && start[1] && start[1]!='|' )
continue;
// otherwise it's a positive pattern
return true;
}
return false;
}
// pattern is a ||-separated list of substrings
bool doesStringContainPattern ( char *content , char *pattern ) {
//bool checkForNegatives ) {
char *p = pattern;
int32_t matchedOne = 0;
bool hadPositive = false;
int32_t count = 0;
// scan the " || " separated substrings
for ( ; *p ; ) {
// get beginning of this string
char *start = p;
// skip white space
while ( *start && is_wspace_a(*start) ) start++;
// done?
if ( ! *start ) break;
// find end of it
char *end = start;
while ( *end && end[0] != '|' )
end++;
// advance p for next guy
p = end;
// should be two |'s
if ( *p ) p++;
if ( *p ) p++;
// temp null this
char c = *end;
*end = '\0';
// count it as an attempt
count++;
bool matchFront = false;
if ( start[0] == '^' ) { start++; matchFront = true; }
// if pattern is NOT/NEGATIVE...
bool negative = false;
if ( start[0] == '!' && start[1] && start[1]!='|' ) {
start++;
negative = true;
}
else
hadPositive = true;
// . is this substring anywhere in the document
// . check the rawest content before converting to utf8 i guess
		// . support the ^ operator
char *foundPtr = NULL;
if ( matchFront ) {
// if we match the front, set to bogus 0x01
if ( strncmp(content,start,end-start)==0 )
foundPtr =(char *)0x01;
}
else {
foundPtr = strstr ( content , start ) ;
}
// debug log statement
//if ( foundPtr )
// log("build: page %s matches ppp of \"%s\"",
// m_firstUrl.m_url,start);
// revert \0
*end = c;
		// negative means we should NOT match it
if ( negative ) {
// so if its matched, that is bad
if ( foundPtr ) return false;
continue;
}
// skip if not found
if ( ! foundPtr ) continue;
// did we find it?
matchedOne++;
// if no negatives, done
//if ( ! checkForNegatives )
//return true;
}
// if we had no attempts, it is ok
if ( count == 0 ) return true;
// must have matched one at least
if ( matchedOne ) return true;
// if all negative? i.e. !category||!author
if ( ! hadPositive ) return true;
// if we had an unfound substring...
return false;
}
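// A few illustrative calls (not compiled into the build) showing the
// pattern syntax handled above: substrings are separated by "||", a
// leading '!' negates a substring, and a leading '^' anchors the match to
// the front of the content. Note that only leading whitespace is skipped
// here, so a pattern written with spaces around the "||" keeps the
// trailing space as part of the substring.
/*
	// true: content contains "press-release"
	doesStringContainPattern ( "http://x.com/press-release/1" ,
				   "press-release||/blog/" );
	// false: the negative substring "category" matched
	doesStringContainPattern ( "http://x.com/category/news" ,
				   "!category" );
	// true: content starts with "http://x.com/"
	doesStringContainPattern ( "http://x.com/a" , "^http://x.com/" );
*/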
int32_t getFakeIpForUrl1 ( char *url1 ) {
// make the probable docid
int64_t probDocId = g_titledb.getProbableDocId ( url1 );
// make one up, like we do in PageReindex.cpp
int32_t firstIp = (probDocId & 0xffffffff);
return firstIp;
}
int32_t getFakeIpForUrl2 ( Url *url2 ) {
// make the probable docid
int64_t probDocId = g_titledb.getProbableDocId ( url2 );
// make one up, like we do in PageReindex.cpp
int32_t firstIp = (probDocId & 0xffffffff);
return firstIp;
}
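// A minimal sketch (not compiled into the build) of what the helpers
// above produce: the "fake" first ip is just the low 32 bits of the url's
// probable docid, so the same url always maps to the same value even
// before any DNS lookup has been done. Callers such as setFromAddUrl()
// below remap 0 and -1, which are reserved, to 1. The variable name is
// hypothetical.
/*
	int32_t fakeIp = getFakeIpForUrl1 ( "http://example.com/page" );
	if ( fakeIp == 0 || fakeIp == -1 ) fakeIp = 1;
*/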
// returns false and sets g_errno on error
bool SpiderRequest::setFromAddUrl ( char *url ) {
// reset it
reset();
// make the probable docid
int64_t probDocId = g_titledb.getProbableDocId ( url );
// make one up, like we do in PageReindex.cpp
int32_t firstIp = (probDocId & 0xffffffff);
//int32_t firstIp = getFakeIpForUrl1 ( url );
// ensure not crazy
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
// . now fill it up
// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
// m_siteNumInlinks,...)
m_isNewOutlink = 1;
m_isAddUrl = 1;
m_addedTime = (uint32_t)getTimeGlobal();//now;
m_fakeFirstIp = 1;
//m_probDocId = probDocId;
m_firstIp = firstIp;
m_hopCount = 0;
// new: validate it?
m_hopCountValid = 1;
	// it's valid if root
Url uu; uu.set ( url );
if ( uu.isRoot() ) m_hopCountValid = true;
// too big?
if ( gbstrlen(url) > MAX_URL_LEN ) {
g_errno = EURLTOOLONG;
return false;
}
// the url! includes \0
strcpy ( m_url , url );
// call this to set m_dataSize now
setDataSize();
// make the key dude -- after setting url
setKey ( firstIp , 0LL, false );
// need a fake first ip lest we core!
//m_firstIp = (pdocId & 0xffffffff);
// how to set m_firstIp? i guess addurl can be throttled independently
// of the other urls??? use the hash of the domain for it!
int32_t dlen;
char *dom = getDomFast ( url , &dlen );
// fake it for this...
//m_firstIp = hash32 ( dom , dlen );
// sanity
if ( ! dom ) {
g_errno = EBADURL;
return false;
//return sendReply ( st1 , true );
}
m_domHash32 = hash32 ( dom , dlen );
// and "site"
int32_t hlen = 0;
char *host = getHostFast ( url , &hlen );
m_siteHash32 = hash32 ( host , hlen );
m_hostHash32 = m_siteHash32;
return true;
}
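// A minimal sketch (not compiled into the build) of how an add-url
// request is typically built: setFromAddUrl() fills in the fake first ip,
// the domain/site hashes, hop count and key, so the caller just supplies
// the url and checks g_errno on failure. The variable name is
// hypothetical.
/*
	SpiderRequest sreq;
	if ( ! sreq.setFromAddUrl ( "http://example.com/" ) )
		// EURLTOOLONG or EBADURL
		log("spider: add url failed: %s",mstrerror(g_errno));
*/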
bool SpiderRequest::setFromInject ( char *url ) {
// just like add url
if ( ! setFromAddUrl ( url ) ) return false;
// but fix this
m_isAddUrl = 0;
m_isInjecting = 1;
return true;
}
bool SpiderRequest::isCorrupt ( ) {
// more corruption detection
if ( m_hopCount < -1 ) {
log("spider: got corrupt 5 spiderRequest");
return true;
}
// sanity check. check for http(s)://
if ( m_url[0] != 'h' &&
// might be a docid from a pagereindex.cpp
! is_digit(m_url[0]) ) {
log("spider: got corrupt 1 spiderRequest");
return true;
}
return false;
}
CollectionRec *SpiderLoop::getActiveList() {
uint32_t nowGlobal = (uint32_t)getTimeGlobal();
if ( nowGlobal >= m_recalcTime && m_recalcTimeValid )
m_activeListValid = false;
// we set m_activeListValid to false when enabling/disabling spiders,
// when rebuilding url filters in Collectiondb.cpp rebuildUrlFilters()
// and when updating the site list in updateSiteList(). all of these
	// could possibly make an inactive collection active again, or vice
// versa. also when deleting a collection in Collectiondb.cpp. this
// keeps the below loop fast when we have thousands of collections
// and most are inactive or empty/deleted.
if ( ! m_activeListValid || m_activeListModified ) {
buildActiveList();
//m_crx = m_activeList;
// recompute every 3 seconds, it seems kinda buggy!!
m_recalcTime = nowGlobal + 3;
m_recalcTimeValid = true;
m_activeListModified = false;
}
return m_activeList;
}
void SpiderLoop::buildActiveList ( ) {
//log("spider: rebuilding active list");
//m_bookmark = NULL;
// set current time, synced with host #0
uint32_t nowGlobal = (uint32_t)getTimeGlobal();
// when do we need to rebuild the active list again?
m_recalcTimeValid = false;
m_activeListValid = true;
m_activeListCount = 0;
// reset the linked list of active collections
m_activeList = NULL;
bool found = false;
CollectionRec *tail = NULL;
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
// get rec
CollectionRec *cr = g_collectiondb.m_recs[i];
// skip if gone
if ( ! cr ) continue;
// stop if not enabled
bool active = true;
if ( ! cr->m_spideringEnabled ) active = false;
if ( cr->m_spiderStatus == SP_MAXROUNDS ) active = false;
if ( cr->m_spiderStatus == SP_MAXTOCRAWL ) active = false;
if ( cr->m_spiderStatus == SP_MAXTOPROCESS ) active = false;
//
// . if doing respider with roundstarttime....
// . roundstarttime is > 0 if m_collectiveRespiderFrequency
// is > 0, unless it has not been set to current time yet
// . if m_collectiveRespiderFrequency was set to 0.0 then
// PageCrawlBot.cpp also sets m_roundStartTime to 0.
//
if ( nowGlobal < cr->m_spiderRoundStartTime ) {
active = false;
// no need to do this now since we recalc every
// 3 seconds anyway...
// if ( cr->m_spiderRoundStartTime < m_recalcTime ) {
// m_recalcTime = cr->m_spiderRoundStartTime;
// m_recalcTimeValid = true;
// }
}
// MDW: let's not do this either unless it proves to be a prob
//if ( sc->m_needsRebuild ) active = true;
// we are at the tail of the linked list OR not in the list
cr->m_nextActive = NULL;
cr->m_isActive = false;
if ( ! active ) continue;
cr->m_isActive = true;
m_activeListCount++;
if ( cr == m_crx ) found = true;
// if first one, set it to head
if ( ! tail ) {
m_activeList = cr;
tail = cr;
continue;
}
// if not first one, add it to end of tail
tail->m_nextActive = cr;
tail = cr;
}
// we use m_bookmark so we do not get into an infinite loop
// in spider urls logic above
if ( ! found ) {
m_bookmark = NULL;
m_crx = NULL;
}
}
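// A minimal sketch (not compiled into the build) of walking the active
// list built above: active CollectionRecs are chained through
// m_nextActive, so the spider loop only touches collections that can
// actually spider right now. Assumes the usual g_spiderLoop global.
/*
	for ( CollectionRec *cr = g_spiderLoop.getActiveList() ;
	      cr ;
	      cr = cr->m_nextActive ) {
		// cr is spider-enabled and not waiting on a round or limit
	}
*/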