// TODO: add m_downloadTimeTable to measure download speed of an IP // TODO: consider adding m_titleWeight/m_bodyWeight/ etc. to url filters table. // like maybe make the wikipedia page titles really heavy.. // TODO: consider a "latestpubdateage" in url filters for pages that are // adding new dates (not clocks) all the time #include "gb-include.h" #include "Spider.h" #include "Msg5.h" #include "Collectiondb.h" #include "XmlDoc.h" // score8to32() #include "Stats.h" #include "SafeBuf.h" #include "Repair.h" #include "CountryCode.h" #include "DailyMerge.h" #include "Process.h" #include "Test.h" // g_test #include "Threads.h" #include "XmlDoc.h" #include "HttpServer.h" #include "Pages.h" #include "Parms.h" #include "Rebalance.h" void testWinnerTreeKey ( ) ; // . this was 10 but cpu is getting pegged, so i set to 45 // . we consider the collection done spidering when no urls to spider // for this many seconds // . i'd like to set back to 10 for speed... maybe even 5 or less // . back to 30 from 20 to try to fix crawls thinking they are done // maybe because of the empty doledb logic taking too long? #define SPIDER_DONE_TIMER 30 // seems like timecity.com as gigabytes of spiderdb data so up from 40 to 400 #define MAX_WINNER_NODES 400 Doledb g_doledb; RdbTree *g_tree = NULL; SpiderRequest *g_sreq = NULL; long g_corruptCount = 0; static char s_countsAreValid = 1; ///////////////////////// ///////////////////////// SPIDEREC ///////////////////////// void SpiderRequest::setKey (long firstIp, long long parentDocId, long long uh48, bool isDel) { // sanity if ( firstIp == 0 || firstIp == -1 ) { char *xx=NULL;*xx=0; } m_key = g_spiderdb.makeKey ( firstIp,uh48,true,parentDocId , isDel ); // set dataSize too! setDataSize(); } void SpiderRequest::setDataSize ( ) { m_dataSize = (m_url - (char *)this) + gbstrlen(m_url) + 1 // subtract m_key and m_dataSize - sizeof(key128_t) - 4 ; } long SpiderRequest::print ( SafeBuf *sbarg ) { SafeBuf *sb = sbarg; SafeBuf tmp; if ( ! sb ) sb = &tmp; //sb->safePrintf("k.n1=0x%llx ",m_key.n1); //sb->safePrintf("k.n0=0x%llx ",m_key.n0); sb->safePrintf("k=%s ",KEYSTR(this, getKeySizeFromRdbId(RDB_SPIDERDB))); sb->safePrintf("uh48=%llu ",getUrlHash48()); // if negtaive bail early now if ( (m_key.n0 & 0x01) == 0x00 ) { sb->safePrintf("[DELETE]"); if ( ! 
sbarg ) printf("%s",sb->getBufStart() ); return sb->length(); } sb->safePrintf("recsize=%li ",getRecSize()); sb->safePrintf("parentDocId=%llu ",getParentDocId()); sb->safePrintf("firstip=%s ",iptoa(m_firstIp) ); sb->safePrintf("hostHash32=0x%lx ",m_hostHash32 ); sb->safePrintf("domHash32=0x%lx ",m_domHash32 ); sb->safePrintf("siteHash32=0x%lx ",m_siteHash32 ); sb->safePrintf("siteNumInlinks=%li ",m_siteNumInlinks ); // print time format: 7/23/1971 10:45:32 struct tm *timeStruct ; char time[256]; timeStruct = gmtime ( &m_addedTime ); strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct ); sb->safePrintf("addedTime=%s(%lu) ",time,m_addedTime ); sb->safePrintf("parentFirstIp=%s ",iptoa(m_parentFirstIp) ); sb->safePrintf("parentHostHash32=0x%lx ",m_parentHostHash32 ); sb->safePrintf("parentDomHash32=0x%lx ",m_parentDomHash32 ); sb->safePrintf("parentSiteHash32=0x%lx ",m_parentSiteHash32 ); sb->safePrintf("hopCount=%li ",m_hopCount ); //timeStruct = gmtime ( &m_spiderTime ); //time[0] = 0; //if ( m_spiderTime ) strftime (time,256,"%b %e %T %Y UTC",timeStruct); //sb->safePrintf("spiderTime=%s(%lu) ",time,m_spiderTime); //timeStruct = gmtime ( &m_pubDate ); //time[0] = 0; //if ( m_pubDate ) strftime (time,256,"%b %e %T %Y UTC",timeStruct); //sb->safePrintf("pubDate=%s(%lu) ",time,m_pubDate ); sb->safePrintf("ufn=%li ", (long)m_ufn); // why was this unsigned? sb->safePrintf("priority=%li ", (long)m_priority); //sb->safePrintf("errCode=%s(%lu) ",mstrerror(m_errCode),m_errCode ); //sb->safePrintf("crawlDelay=%lims ",m_crawlDelay ); //sb->safePrintf("httpStatus=%li ",(long)m_httpStatus ); //sb->safePrintf("retryNum=%li ",(long)m_retryNum ); //sb->safePrintf("langId=%s(%li) ", // getLanguageString(m_langId),(long)m_langId ); //sb->safePrintf("percentChanged=%li%% ",(long)m_percentChanged ); if ( m_isNewOutlink ) sb->safePrintf("ISNEWOUTLINK "); if ( m_isAddUrl ) sb->safePrintf("ISADDURL "); if ( m_isPageReindex ) sb->safePrintf("ISPAGEREINDEX "); if ( m_isPageParser ) sb->safePrintf("ISPAGEPARSER "); if ( m_urlIsDocId ) sb->safePrintf("URLISDOCID "); if ( m_isRSSExt ) sb->safePrintf("ISRSSEXT "); if ( m_isUrlPermalinkFormat ) sb->safePrintf("ISURLPERMALINKFORMAT "); if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER "); if ( m_fakeFirstIp ) sb->safePrintf("ISFAKEFIRSTIP "); if ( m_isInjecting ) sb->safePrintf("ISINJECTING "); if ( m_forceDelete ) sb->safePrintf("FORCEDELETE "); if ( m_sameDom ) sb->safePrintf("SAMEDOM "); if ( m_sameHost ) sb->safePrintf("SAMEHOST "); if ( m_sameSite ) sb->safePrintf("SAMESITE "); if ( m_wasParentIndexed ) sb->safePrintf("WASPARENTINDEXED "); if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS "); if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK "); if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER "); if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK "); if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS "); //if ( m_fromSections ) sb->safePrintf("FROMSECTIONS "); if ( m_isScraping ) sb->safePrintf("ISSCRAPING "); if ( m_hasContent ) sb->safePrintf("HASCONTENT "); if ( m_inGoogle ) sb->safePrintf("INGOOGLE "); if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK "); if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO "); //if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE "); if ( m_isContacty ) sb->safePrintf("CONTACTY "); if ( m_isWWWSubdomain ) sb->safePrintf("WWWSUBDOMAIN "); if ( m_avoidSpiderLinks ) sb->safePrintf("AVOIDSPIDERLINKS "); //if ( m_inOrderTree ) sb->safePrintf("INORDERTREE "); //if ( m_doled ) 
sb->safePrintf("DOLED "); //unsigned long gid = g_spiderdb.getGroupId(m_firstIp); long shardNum = g_hostdb.getShardNum(RDB_SPIDERDB,this); sb->safePrintf("shardnum=%lu ",shardNum); sb->safePrintf("url=%s",m_url); if ( ! sbarg ) printf("%s",sb->getBufStart() ); return sb->length(); } void SpiderReply::setKey (long firstIp, long long parentDocId, long long uh48, bool isDel) { m_key = g_spiderdb.makeKey ( firstIp,uh48,false,parentDocId , isDel ); // set dataSize too! m_dataSize = sizeof(SpiderReply) - sizeof(key128_t) - 4; } long SpiderReply::print ( SafeBuf *sbarg ) { SafeBuf *sb = sbarg; SafeBuf tmp; if ( ! sb ) sb = &tmp; sb->safePrintf("k.n1=0x%llx ",m_key.n1); sb->safePrintf("k.n0=0x%llx ",m_key.n0); sb->safePrintf("uh48=%llu ",getUrlHash48()); sb->safePrintf("parentDocId=%llu ",getParentDocId()); // if negtaive bail early now if ( (m_key.n0 & 0x01) == 0x00 ) { sb->safePrintf("[DELETE]"); if ( ! sbarg ) printf("%s",sb->getBufStart() ); return sb->length(); } sb->safePrintf("firstip=%s ",iptoa(m_firstIp) ); sb->safePrintf("percentChangedPerDay=%.02f%% ",m_percentChangedPerDay); // print time format: 7/23/1971 10:45:32 struct tm *timeStruct ; char time[256]; timeStruct = gmtime ( &m_spideredTime ); time[0] = 0; if ( m_spideredTime ) strftime (time,256,"%b %e %T %Y UTC",timeStruct); sb->safePrintf("spideredTime=%s(%lu) ",time,m_spideredTime); sb->safePrintf("siteNumInlinks=%li ",m_siteNumInlinks ); timeStruct = gmtime ( &m_pubDate ); time[0] = 0; if ( m_pubDate != 0 && m_pubDate != -1 ) strftime (time,256,"%b %e %T %Y UTC",timeStruct); sb->safePrintf("pubDate=%s(%li) ",time,m_pubDate ); //sb->safePrintf("newRequests=%li ",m_newRequests ); sb->safePrintf("ch32=%lu ",(long)m_contentHash32); sb->safePrintf("crawldelayms=%lims ",m_crawlDelayMS ); sb->safePrintf("httpStatus=%li ",(long)m_httpStatus ); sb->safePrintf("langId=%s(%li) ", getLanguageString(m_langId),(long)m_langId ); if ( m_errCount ) sb->safePrintf("errCount=%li ",(long)m_errCount); sb->safePrintf("errCode=%s(%lu) ",mstrerror(m_errCode),m_errCode ); //if ( m_isSpam ) sb->safePrintf("ISSPAM "); if ( m_isRSS ) sb->safePrintf("ISRSS "); if ( m_isPermalink ) sb->safePrintf("ISPERMALINK "); if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER "); //if ( m_deleted ) sb->safePrintf("DELETED "); if ( m_isIndexed && ! m_isIndexedINValid) sb->safePrintf("ISINDEXED "); if ( m_hasAddress ) sb->safePrintf("HASADDRESS "); if ( m_hasTOD ) sb->safePrintf("HASTOD "); //if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE "); if ( m_isContacty ) sb->safePrintf("CONTACTY "); //sb->safePrintf("url=%s",m_url); if ( ! 
sbarg ) printf("%s",sb->getBufStart() ); return sb->length(); } long SpiderRequest::printToTable ( SafeBuf *sb , char *status , XmlDoc *xd , long row ) { sb->safePrintf("\n",LIGHT_BLUE); // show elapsed time if ( xd ) { long long now = gettimeofdayInMilliseconds(); long long elapsed = now - xd->m_startTime; sb->safePrintf(" %li\n",row); sb->safePrintf(" %llims\n",elapsed); collnum_t collnum = xd->m_collnum; CollectionRec *cr = g_collectiondb.getRec(collnum); char *cs = ""; if ( cr ) cs = cr->m_coll; sb->safePrintf(" %li\n", cs,(long)collnum); } sb->safePrintf(" "); sb->safeTruncateEllipsis ( m_url , 64 ); sb->safePrintf("\n"); sb->safePrintf(" %s\n",status ); sb->safePrintf(" %li\n",(long)m_priority); sb->safePrintf(" %li\n",(long)m_ufn); sb->safePrintf(" %s\n",iptoa(m_firstIp) ); sb->safePrintf(" %li\n",(long)m_errCount ); sb->safePrintf(" %llu\n",getUrlHash48()); //sb->safePrintf(" 0x%lx\n",m_hostHash32 ); //sb->safePrintf(" 0x%lx\n",m_domHash32 ); //sb->safePrintf(" 0x%lx\n",m_siteHash32 ); sb->safePrintf(" %li\n",m_siteNumInlinks ); //sb->safePrintf(" %li\n",m_pageNumInlinks ); sb->safePrintf(" %li\n",m_hopCount ); // print time format: 7/23/1971 10:45:32 struct tm *timeStruct ; char time[256]; timeStruct = gmtime ( &m_addedTime ); strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct ); sb->safePrintf(" %s(%lu)\n",time,m_addedTime); //timeStruct = gmtime ( &m_pubDate ); //time[0] = 0; //if ( m_pubDate ) strftime (time,256,"%b %e %T %Y UTC",timeStruct); //sb->safePrintf(" %s(%lu)\n",time,m_pubDate ); //sb->safePrintf(" %s(%lu)\n",mstrerror(m_errCode),m_errCode); //sb->safePrintf(" %lims\n",m_crawlDelay ); sb->safePrintf(" %s\n",iptoa(m_parentFirstIp) ); sb->safePrintf(" %llu\n",getParentDocId() ); //sb->safePrintf(" 0x%lx\n",m_parentHostHash32); //sb->safePrintf(" 0x%lx\n",m_parentDomHash32 ); //sb->safePrintf(" 0x%lx\n",m_parentSiteHash32 ); //sb->safePrintf(" %li\n",(long)m_httpStatus ); //sb->safePrintf(" %li\n",(long)m_retryNum ); //sb->safePrintf(" %s(%li)\n", // getLanguageString(m_langId),(long)m_langId ); //sb->safePrintf(" %li%%\n",(long)m_percentChanged ); sb->safePrintf(" "); if ( m_isNewOutlink ) sb->safePrintf("ISNEWOUTLINK "); if ( m_isAddUrl ) sb->safePrintf("ISADDURL "); if ( m_isPageReindex ) sb->safePrintf("ISPAGEREINDEX "); if ( m_isPageParser ) sb->safePrintf("ISPAGEPARSER "); if ( m_urlIsDocId ) sb->safePrintf("URLISDOCID "); if ( m_isRSSExt ) sb->safePrintf("ISRSSEXT "); if ( m_isUrlPermalinkFormat ) sb->safePrintf("ISURLPERMALINKFORMAT "); if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER "); if ( m_isInjecting ) sb->safePrintf("ISINJECTING "); if ( m_forceDelete ) sb->safePrintf("FORCEDELETE "); if ( m_sameDom ) sb->safePrintf("SAMEDOM "); if ( m_sameHost ) sb->safePrintf("SAMEHOST "); if ( m_sameSite ) sb->safePrintf("SAMESITE "); if ( m_wasParentIndexed ) sb->safePrintf("WASPARENTINDEXED "); if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS "); if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK "); if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER "); if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK "); if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS "); //if ( m_fromSections ) sb->safePrintf("FROMSECTIONS "); if ( m_isScraping ) sb->safePrintf("ISSCRAPING "); if ( m_hasContent ) sb->safePrintf("HASCONTENT "); if ( m_inGoogle ) sb->safePrintf("INGOOGLE "); if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK "); if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO "); //if ( m_hasSiteVenue ) 
sb->safePrintf("HASSITEVENUE "); if ( m_isContacty ) sb->safePrintf("CONTACTY "); //if ( m_inOrderTree ) sb->safePrintf("INORDERTREE "); //if ( m_doled ) sb->safePrintf("DOLED "); sb->safePrintf("\n"); sb->safePrintf("\n"); return sb->length(); } long SpiderRequest::printTableHeaderSimple ( SafeBuf *sb , bool currentlySpidering) { sb->safePrintf("\n",DARK_BLUE); // how long its been being spidered if ( currentlySpidering ) { sb->safePrintf(" #\n"); sb->safePrintf(" elapsed\n"); sb->safePrintf(" coll\n"); } sb->safePrintf(" url\n"); sb->safePrintf(" status\n"); sb->safePrintf(" first IP\n"); sb->safePrintf(" crawlDelay\n"); sb->safePrintf(" pri\n"); sb->safePrintf(" errCount\n"); sb->safePrintf(" hops\n"); sb->safePrintf(" addedTime\n"); //sb->safePrintf(" flags\n"); sb->safePrintf("\n"); return sb->length(); } long SpiderRequest::printToTableSimple ( SafeBuf *sb , char *status , XmlDoc *xd , long row ) { sb->safePrintf("\n",LIGHT_BLUE); // show elapsed time if ( xd ) { long long now = gettimeofdayInMilliseconds(); long long elapsed = now - xd->m_startTime; sb->safePrintf(" %li\n",row); sb->safePrintf(" %llims\n",elapsed); // print collection CollectionRec *cr = g_collectiondb.getRec ( xd->m_collnum ); char *coll = ""; if ( cr ) coll = cr->m_coll; sb->safePrintf("%s",coll); } sb->safePrintf(" "); sb->safeTruncateEllipsis ( m_url , 64 ); sb->safePrintf("\n"); sb->safePrintf(" %s\n",status ); sb->safePrintf(" %s\n",iptoa(m_firstIp)); if ( xd->m_crawlDelayValid && xd->m_crawlDelay >= 0 ) sb->safePrintf(" %li ms\n",xd->m_crawlDelay); else sb->safePrintf(" --\n"); sb->safePrintf(" %li\n",(long)m_priority); sb->safePrintf(" %li\n",(long)m_errCount ); sb->safePrintf(" %li\n",m_hopCount ); // print time format: 7/23/1971 10:45:32 struct tm *timeStruct ; char time[256]; timeStruct = gmtime ( &m_addedTime ); strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct ); sb->safePrintf(" %s(%lu)\n",time,m_addedTime); /* sb->safePrintf(" "); if ( m_isNewOutlink ) sb->safePrintf("ISNEWOUTLINK "); if ( m_isAddUrl ) sb->safePrintf("ISADDURL "); if ( m_isPageReindex ) sb->safePrintf("ISPAGEREINDEX "); if ( m_isPageParser ) sb->safePrintf("ISPAGEPARSER "); if ( m_urlIsDocId ) sb->safePrintf("URLISDOCID "); if ( m_isRSSExt ) sb->safePrintf("ISRSSEXT "); if ( m_isUrlPermalinkFormat ) sb->safePrintf("ISURLPERMALINKFORMAT "); if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER "); if ( m_isInjecting ) sb->safePrintf("ISINJECTING "); if ( m_forceDelete ) sb->safePrintf("FORCEDELETE "); if ( m_sameDom ) sb->safePrintf("SAMEDOM "); if ( m_sameHost ) sb->safePrintf("SAMEHOST "); if ( m_sameSite ) sb->safePrintf("SAMESITE "); if ( m_wasParentIndexed ) sb->safePrintf("WASPARENTINDEXED "); if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS "); if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK "); if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER "); if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK "); if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS "); //if ( m_fromSections ) sb->safePrintf("FROMSECTIONS "); if ( m_isScraping ) sb->safePrintf("ISSCRAPING "); if ( m_hasContent ) sb->safePrintf("HASCONTENT "); if ( m_inGoogle ) sb->safePrintf("INGOOGLE "); if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK "); if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO "); if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE "); if ( m_isContacty ) sb->safePrintf("CONTACTY "); sb->safePrintf("\n"); */ sb->safePrintf("\n"); return sb->length(); } long SpiderRequest::printTableHeader ( 
SafeBuf *sb , bool currentlySpidering) { sb->safePrintf("\n",DARK_BLUE); // how long its been being spidered if ( currentlySpidering ) { sb->safePrintf(" #\n"); sb->safePrintf(" elapsed\n"); sb->safePrintf(" coll\n"); } sb->safePrintf(" url\n"); sb->safePrintf(" status\n"); sb->safePrintf(" pri\n"); sb->safePrintf(" ufn\n"); sb->safePrintf(" firstIp\n"); sb->safePrintf(" errCount\n"); sb->safePrintf(" urlHash48\n"); //sb->safePrintf(" hostHash32\n"); //sb->safePrintf(" domHash32\n"); //sb->safePrintf(" siteHash32\n"); sb->safePrintf(" siteInlinks\n"); //sb->safePrintf(" pageNumInlinks\n"); sb->safePrintf(" hops\n"); sb->safePrintf(" addedTime\n"); //sb->safePrintf(" lastAttempt\n"); //sb->safePrintf(" pubDate\n"); //sb->safePrintf(" errCode\n"); //sb->safePrintf(" crawlDelay\n"); sb->safePrintf(" parentIp\n"); sb->safePrintf(" parentDocId\n"); //sb->safePrintf(" parentHostHash32\n"); //sb->safePrintf(" parentDomHash32\n"); //sb->safePrintf(" parentSiteHash32\n"); //sb->safePrintf(" httpStatus\n"); //sb->safePrintf(" retryNum\n"); //sb->safePrintf(" langId\n"); //sb->safePrintf(" percentChanged\n"); sb->safePrintf(" flags\n"); sb->safePrintf("\n"); return sb->length(); } ///////////////////////// ///////////////////////// SPIDERDB ///////////////////////// // a global class extern'd in .h file Spiderdb g_spiderdb; Spiderdb g_spiderdb2; // reset rdb void Spiderdb::reset() { m_rdb.reset(); } // print the spider rec long Spiderdb::print( char *srec ) { // get if request or reply and print it if ( isSpiderRequest ( (key128_t *)srec ) ) ((SpiderRequest *)srec)->print(NULL); else ((SpiderReply *)srec)->print(NULL); return 0; } bool Spiderdb::init ( ) { long maxMem = 200000000; // . what's max # of tree nodes? // . assume avg spider rec size (url) is about 45 // . 45 + 33 bytes overhead in tree is 78 long maxTreeNodes = maxMem / 78; // . really we just cache the first 64k of each priority list // . used only by SpiderLoop //long maxCacheNodes = 32; // we use the same disk page size as indexdb (for rdbmap.cpp) long pageSize = GB_INDEXDB_PAGE_SIZE; // disk page cache mem, 100MB on gk0 now long pcmem = 20000000;//g_conf.m_spiderdbMaxDiskPageCacheMem; // keep this low if we are the tmp cluster if ( g_hostdb.m_useTmpCluster ) pcmem = 0; // turn off to prevent blocking up cpu pcmem = 0; // key parser checks //long ip = 0x1234; char priority = 12; long spiderTime = 0x3fe96610; long long urlHash48 = 0x1234567887654321LL & 0x0000ffffffffffffLL; //long long pdocid = 0x567834222LL; //key192_t k = makeOrderKey ( ip,priority,spiderTime,urlHash48,pdocid); //if (getOrderKeyUrlHash48 (&k)!=urlHash48 ){char*xx=NULL;*xx=0;} //if (getOrderKeySpiderTime (&k)!=spiderTime){char*xx=NULL;*xx=0;} //if (getOrderKeyPriority (&k)!=priority ){char*xx=NULL;*xx=0;} //if (getOrderKeyIp (&k)!=ip ){char*xx=NULL;*xx=0;} //if (getOrderKeyParentDocId(&k)!=pdocid ){char*xx=NULL;*xx=0;} // doledb key test key_t dk = g_doledb.makeKey(priority,spiderTime,urlHash48,false); if(g_doledb.getPriority(&dk)!=priority){char*xx=NULL;*xx=0;} if(g_doledb.getSpiderTime(&dk)!=spiderTime){char*xx=NULL;*xx=0;} if(g_doledb.getUrlHash48(&dk)!=urlHash48){char*xx=NULL;*xx=0;} if(g_doledb.getIsDel(&dk)!= 0){char*xx=NULL;*xx=0;} // spiderdb key test long long docId = 123456789; long firstIp = 0x23991688; key128_t sk = g_spiderdb.makeKey ( firstIp,urlHash48,1,docId,false); if ( ! 
g_spiderdb.isSpiderRequest (&sk) ) { char *xx=NULL;*xx=0; } if ( g_spiderdb.getUrlHash48(&sk) != urlHash48){char *xx=NULL;*xx=0;} if ( g_spiderdb.getFirstIp(&sk) != firstIp) {char *xx=NULL;*xx=0;} testWinnerTreeKey(); // we now use a page cache if ( ! m_pc.init ( "spiderdb", RDB_SPIDERDB , pcmem , pageSize , false , // use shared mem? false )) // minimizeDiskSeeks? return log(LOG_INIT,"spiderdb: Init failed."); // initialize our own internal rdb return m_rdb.init ( g_hostdb.m_dir , "spiderdb" , true , // dedup -1 , // fixedDataSize // now that we have MAX_WINNER_NODES allowed in doledb // we don't have to keep spiderdb so tightly merged i guess.. // MDW: it seems to slow performance when not tightly merged // so put this back to "2"... 2,//g_conf.m_spiderdbMinFilesToMerge , mintomerge maxMem,//g_conf.m_spiderdbMaxTreeMem , maxTreeNodes , true , // balance tree? 0,//g_conf.m_spiderdbMaxCacheMem, 0,//maxCacheNodes , false , // half keys? false , // save cache? &m_pc , false , false , sizeof(key128_t) ); } // init the rebuild/secondary rdb, used by PageRepair.cpp bool Spiderdb::init2 ( long treeMem ) { // . what's max # of tree nodes? // . assume avg spider rec size (url) is about 45 // . 45 + 33 bytes overhead in tree is 78 long maxTreeNodes = treeMem / 78; // initialize our own internal rdb return m_rdb.init ( g_hostdb.m_dir , "spiderdbRebuild" , true , // dedup -1 , // fixedDataSize 200 , // g_conf.m_spiderdbMinFilesToMerge treeMem , // g_conf.m_spiderdbMaxTreeMem , maxTreeNodes , true , // balance tree? 0 , // m_spiderdbMaxCacheMem, 0 , // maxCacheNodes , false , // half keys? false , // save cache? NULL );// &m_pc } /* bool Spiderdb::addColl ( char *coll, bool doVerify ) { if ( ! m_rdb.addColl ( coll ) ) return false; if ( ! doVerify ) return true; // verify if ( verify(coll) ) return true; // if not allowing scale, return false if ( ! g_conf.m_allowScale ) return false; // otherwise let it go log ( "db: Verify failed, but scaling is allowed, passing." ); return true; } */ bool Spiderdb::verify ( char *coll ) { //return true; log ( LOG_DEBUG, "db: Verifying Spiderdb for coll %s...", coll ); g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; RdbList list; key128_t startKey; key128_t endKey; startKey.setMin(); endKey.setMax(); //long minRecSizes = 64000; CollectionRec *cr = g_collectiondb.getRec(coll); if ( ! msg5.getList ( RDB_SPIDERDB , cr->m_collnum , &list , (char *)&startKey , (char *)&endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , &msg5b , true )) { g_threads.enableThreads(); return log("db: HEY! it did not block"); } long count = 0; long got = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { char *k = list.getCurrentRec(); //key_t k = list.getCurrentKey(); count++; // what group's spiderdb should hold this rec //uint32_t groupId = g_hostdb.getGroupId ( RDB_SPIDERDB , k ); //if ( groupId == g_hostdb.m_groupId ) got++; long shardNum = g_hostdb.getShardNum(RDB_SPIDERDB,k); if ( shardNum == g_hostdb.getMyShardNum() ) got++; } if ( got != count ) { // tally it up g_rebalance.m_numForeignRecs += count - got; log ("db: Out of first %li records in spiderdb, " "only %li belong to our shard.",count,got); // exit if NONE, we probably got the wrong data if ( got == 0 ) log("db: Are you sure you have the " "right " "data in the right directory? 
" "Exiting."); log ( "db: Exiting due to Spiderdb inconsistency." ); g_threads.enableThreads(); return g_conf.m_bypassValidation; } log (LOG_DEBUG,"db: Spiderdb passed verification successfully for %li " "recs.", count ); // DONE g_threads.enableThreads(); return true; } key128_t Spiderdb::makeKey ( long firstIp , long long urlHash48 , bool isRequest , long long parentDocId , bool isDel ) { key128_t k; k.n1 = (unsigned long)firstIp; // push ip to top 32 bits k.n1 <<= 32; // . top 32 bits of url hash are in the lower 32 bits of k.n1 // . often the urlhash48 has top bits set that shouldn't be so mask // it to 48 bits k.n1 |= (urlHash48 >> 16) & 0xffffffff; // remaining 16 bits k.n0 = urlHash48 & 0xffff; // room for isRequest k.n0 <<= 1; if ( isRequest ) k.n0 |= 0x01; // parent docid k.n0 <<= 38; k.n0 |= parentDocId & DOCID_MASK; // reserved (padding) k.n0 <<= 8; // del bit k.n0 <<= 1; if ( ! isDel ) k.n0 |= 0x01; return k; } ///////////////////////// ///////////////////////// DOLEDB ///////////////////////// // reset rdb void Doledb::reset() { m_rdb.reset(); } bool Doledb::init ( ) { // . what's max # of tree nodes? // . assume avg spider rec size (url) is about 45 // . 45 + 33 bytes overhead in tree is 78 // . use 5MB for the tree long maxTreeMem = 150000000; // 150MB long maxTreeNodes = maxTreeMem / 78; // we use the same disk page size as indexdb (for rdbmap.cpp) long pageSize = GB_INDEXDB_PAGE_SIZE; // disk page cache mem, hard code to 5MB long pcmem = 5000000; // g_conf.m_spiderdbMaxDiskPageCacheMem; // keep this low if we are the tmp cluster if ( g_hostdb.m_useTmpCluster ) pcmem = 0; // we now use a page cache if ( ! m_pc.init ( "doledb" , RDB_DOLEDB , pcmem , pageSize , true , // use shared mem? false )) // minimizeDiskSeeks? return log(LOG_INIT,"doledb: Init failed."); // initialize our own internal rdb if ( ! m_rdb.init ( g_hostdb.m_dir , "doledb" , true , // dedup -1 , // fixedDataSize 2 , // MinFilesToMerge maxTreeMem , maxTreeNodes , true , // balance tree? 0 , // spiderdbMaxCacheMe 0 , // maxCacheNodes false , // half keys? false , // save cache? &m_pc )) return false; return true; } /* bool Doledb::addColl ( char *coll, bool doVerify ) { if ( ! m_rdb.addColl ( coll ) ) return false; //if ( ! doVerify ) return true; // verify //if ( verify(coll) ) return true; // if not allowing scale, return false //if ( ! g_conf.m_allowScale ) return false; // otherwise let it go //log ( "db: Verify failed, but scaling is allowed, passing." ); return true; } */ ///////////////////////// ///////////////////////// SpiderCache ///////////////////////// // . reload everything this many seconds // . this was originally done to as a lazy compensation for a bug but // now i do not add too many of the same domain if the same domain wait // is ample and we know we'll be refreshed in X seconds anyway //#define DEFAULT_SPIDER_RELOAD_RATE (3*60*60) // . size of spiderecs to load in one call to readList // . 
i increased it to 1MB to speed everything up, seems like cache is // getting loaded up way too slow #define SR_READ_SIZE (512*1024) // for caching in s_ufnTree //#define MAX_NODES (30) // a global class extern'd in .h file SpiderCache g_spiderCache; SpiderCache::SpiderCache ( ) { //m_numSpiderColls = 0; //m_isSaving = false; } // returns false and set g_errno on error bool SpiderCache::init ( ) { //for ( long i = 0 ; i < MAX_COLL_RECS ; i++ ) // m_spiderColls[i] = NULL; // success return true; } /* static void doneSavingWrapper ( void *state ) { SpiderCache *THIS = (SpiderCache *)state; log("spcache: done saving something"); //THIS->doneSaving(); // . call the callback if any // . this let's PageMaster.cpp know when we're closed //if (THIS->m_closeCallback) THIS->m_closeCallback(THIS->m_closeState); } void SpiderCache::doneSaving ( ) { // bail if g_errno was set if ( g_errno ) { log("spider: Had error saving waitingtree.dat or doleiptable: " "%s.", mstrerror(g_errno)); g_errno = 0; } else { // display any error, if any, otherwise prints "Success" logf(LOG_INFO,"db: Successfully saved waitingtree and " "doleiptable"); } // if still more need to save, not done yet if ( needsSave ( ) ) return; // ok, call callback that initiaed the save if ( m_callback ) m_callback ( THIS->m_state ); // ok, we are done! //m_isSaving = false; } */ // return false if any tree save blocked void SpiderCache::save ( bool useThread ) { // bail if already saving //if ( m_isSaving ) return true; // assume saving //m_isSaving = true; // loop over all SpiderColls and get the best for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { SpiderColl *sc = getSpiderCollIffNonNull(i);//m_spiderColls[i]; if ( ! sc ) continue; RdbTree *tree = &sc->m_waitingTree; if ( ! tree->m_needsSave ) continue; // if already saving from a thread if ( tree->m_isSaving ) continue; char *filename = "waitingtree"; char dir[1024]; sprintf(dir,"%scoll.%s.%li",g_hostdb.m_dir, sc->m_coll,(long)sc->m_collnum); // log it for now log("spider: saving waiting tree for cn=%li",(long)i); // returns false if it blocked, callback will be called tree->fastSave ( dir, // g_hostdb.m_dir , filename , useThread , NULL,//this , NULL);//doneSavingWrapper ); // also the doleIpTable /* filename = "doleiptable.dat"; sc->m_doleIpTable.fastSave(useThread, dir, filename, NULL, 0, NULL,//this, NULL);//doneSavingWrapper ); */ // . crap, this is made at startup from waitintree! /* // waiting table filename = "waitingtable.dat"; if ( sc->m_waitingTable.m_needsSave ) logf(LOG_INFO,"db: Saving %s/%s",dir, filename); sc->m_waitingTable.fastSave(useThread, dir, filename, NULL, 0, NULL,//this, NULL );//doneSavingWrapper ); */ } // if still needs save, not done yet, return false to indicate blocked //if ( blocked ) return false; // all done //m_isSaving = false; // did not block //return true; } bool SpiderCache::needsSave ( ) { for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { SpiderColl *sc = getSpiderCollIffNonNull(i);//m_spiderColls[i]; if ( ! sc ) continue; if ( sc->m_waitingTree.m_needsSave ) return true; // also the doleIpTable //if ( sc->m_doleIpTable.m_needsSave ) return true; } return false; } void SpiderCache::reset ( ) { log(LOG_DEBUG,"spider: resetting spidercache"); // loop over all SpiderColls and get the best for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { SpiderColl *sc = getSpiderCollIffNonNull(i); if ( ! 
	     sc ) continue;
		sc->reset();
		mdelete ( sc , sizeof(SpiderColl) , "SpiderCache" );
		delete ( sc );
		//m_spiderColls[i] = NULL;
		CollectionRec *cr = g_collectiondb.getRec(i);
		cr->m_spiderColl = NULL;
	}
	//m_numSpiderColls = 0;
}

SpiderColl *SpiderCache::getSpiderCollIffNonNull ( collnum_t collnum ) {
	// return NULL if "collnum" is invalid
	if ( collnum < 0 ) return NULL;
	if ( collnum >= g_collectiondb.m_numRecs ) return NULL;
	// shortcut
	CollectionRec *cr = g_collectiondb.m_recs[collnum];
	// empty?
	if ( ! cr ) return NULL;
	// return it if non-NULL
	return cr->m_spiderColl;
}

bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
	// if not being deleted return false
	if ( ! sc->m_deleteMyself ) return false;
	// otherwise always return true
	if ( sc->m_msg5b.m_waitingForList ) {
		log("spider: deleting sc=0x%lx for collnum=%li waiting1",
		    (long)sc,(long)sc->m_collnum);
		return true;
	}
	if ( sc->m_msg1.m_mcast.m_inUse ) {
		log("spider: deleting sc=0x%lx for collnum=%li waiting2",
		    (long)sc,(long)sc->m_collnum);
		return true;
	}
	if ( sc->m_isLoading ) {
		log("spider: deleting sc=0x%lx for collnum=%li waiting3",
		    (long)sc,(long)sc->m_collnum);
		return true;
	}
	// this means msg5 is out
	if ( sc->m_msg5.m_waitingForList ) {
		log("spider: deleting sc=0x%lx for collnum=%li waiting4",
		    (long)sc,(long)sc->m_collnum);
		return true;
	}
	// there's still a core (crash) from someone trying to write to
	// something in "sc" so we have to try to fix that. somewhere in
	// xmldoc.cpp or spider.cpp. everyone should get sc from cr
	// every time i'd think
	log("spider: deleting sc=0x%lx for collnum=%li",
	    (long)sc,(long)sc->m_collnum);
	// . make sure nobody has it
	// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
	//   have nuked it
	CollectionRec *cr = sc->m_cr;
	if ( cr ) cr->m_spiderColl = NULL;
	mdelete ( sc , sizeof(SpiderColl),"postdel1");
	delete ( sc );
	return true;
}

// . get SpiderColl for a collection
// . if it is NULL for that collection then make a new one
SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
	// return NULL if "collnum" is invalid
	if ( collnum < 0 ) return NULL;
	// return it if non-NULL
	//if ( m_spiderColls [ collnum ] ) return m_spiderColls [ collnum ];
	// if spidering disabled, do not bother creating this!
	//if ( ! g_conf.m_spideringEnabled ) return NULL;
	// shortcut
	CollectionRec *cr = g_collectiondb.m_recs[collnum];
	// collection might have been reset in which case collnum changes
	if ( ! cr ) return NULL;
	// return it if non-NULL
	SpiderColl *sc = cr->m_spiderColl;
	if ( sc ) return sc;
	// if spidering disabled, do not bother creating this!
	//if ( ! cr->m_spideringEnabled ) return NULL;
	// cast it
	//SpiderColl *sc;
	// make it
	try { sc = new(SpiderColl); }
	catch ( ... ) {
		log("spider: failed to make SpiderColl for collnum=%li",
		    (long)collnum);
		return NULL;
	}
	// register it
	mnew ( sc , sizeof(SpiderColl), "spcoll" );
	// store it
	//m_spiderColls [ collnum ] = sc;
	cr->m_spiderColl = sc;
	// note it
	log(LOG_DEBUG,"spider: made spidercoll=%lx for cr=%lx",
	    (long)sc,(long)cr);
	// update this
	//if ( m_numSpiderColls < collnum + 1 )
	//	m_numSpiderColls = collnum + 1;
	// set this
	sc->m_collnum = collnum;
	// save this
	strcpy ( sc->m_coll , cr->m_coll );
	// set this
	if ( ! strcmp ( cr->m_coll,"qatest123" ) ) sc->m_isTestColl = true;
	else                                       sc->m_isTestColl = false;
	// set first doledb scan key
	sc->m_nextDoledbKey.setMin();
	// mark it as loading so it can't be deleted while loading
	sc->m_isLoading = true;
	// . load its tables from disk
	// . crap i think this might call quickpoll and we get a parm
	//   update to delete this spider coll!
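	// . note: since m_isLoading was just set to true above, if that
	//   parm update does come in, tryToDeleteSpiderColl() should only
	//   log "waiting3" and defer the delete rather than freeing "sc"
	//   out from under us; we re-check tryToDeleteSpiderColl() right
	//   after load() returns below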
sc->load(); // mark it as loading sc->m_isLoading = false; // set this sc->m_cr = cr; // did crawlbottesting delete it right away? if ( tryToDeleteSpiderColl( sc ) ) return NULL; // sanity check if ( ! cr ) { char *xx=NULL;*xx=0; } // note it! log(LOG_DEBUG,"spider: adding new spider collection for %s", cr->m_coll); // that was it return sc; } ///////////////////////// ///////////////////////// SpiderColl ///////////////////////// SpiderColl::SpiderColl () { m_deleteMyself = false; m_isLoading = false; m_gettingList1 = false; m_gettingList2 = false; m_lastScanTime = 0; m_isPopulating = false; m_numAdded = 0; m_numBytesScanned = 0; m_lastPrintCount = 0; //m_lastSpiderAttempt = 0; //m_lastSpiderCouldLaunch = 0; //m_numRoundsDone = 0; //m_lastDoledbReadEmpty = false; // over all priorities in this coll // re-set this to min and set m_needsWaitingTreeRebuild to true // when the admin updates the url filters page m_waitingTreeNeedsRebuild = false; m_nextKey2.setMin(); m_endKey2.setMax(); m_spidersOut = 0; m_coll[0] = '\0';// = NULL; reset(); // reset this memset ( m_outstandingSpiders , 0 , 4 * MAX_SPIDER_PRIORITIES ); } long SpiderColl::getTotalOutstandingSpiders ( ) { long sum = 0; for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) sum += m_outstandingSpiders[i]; return sum; } // load the tables that we set when m_doInitialScan is true bool SpiderColl::load ( ) { // error? long err = 0; // make the dir char *coll = g_collectiondb.getColl(m_collnum); // sanity check if ( ! coll || coll[0]=='\0' ) { log("spider: bad collnum of %li",(long)m_collnum); g_errno = ENOCOLLREC; return false; //char *xx=NULL;*xx=0; } } // reset this once m_msg1Avail = true; m_isPopulating = false; if ( ! m_lastDownloadCache.init ( 35000 , // maxcachemem, 8 , // fixed data size (MS) false , // support lists? 500 , // max nodes false , // use half keys? "downcache", // dbname false , // load from disk? 12 , // key size (firstip) 12 , // data key size? -1 ))// numPtrsMax return log("spider: dcache init failed"); // this has a quickpoll in it, so that quickpoll processes // a restart request from crawlbottesting for this collnum which // calls Collectiondb::resetColl2() which calls deleteSpiderColl() // on THIS spidercoll, but our m_loading flag is set if (!m_sniTable.set ( 4,8,5000,NULL,0,false,MAX_NICENESS,"snitbl") ) return false; if (!m_cdTable.set (4,4,3000,NULL,0,false,MAX_NICENESS,"cdtbl")) return false; // doledb seems to have like 32000 entries in it long numSlots = 0; // was 128000 if(!m_doleIpTable.set(4,4,numSlots,NULL,0,false,MAX_NICENESS,"doleip")) return false; // this should grow dynamically... if (!m_waitingTable.set (4,8,3000,NULL,0,false,MAX_NICENESS,"waittbl")) return false; // . a tree of keys, key is earliestSpiderTime|ip (key=12 bytes) // . earliestSpiderTime is 0 if unknown // . max nodes is 1M but we should grow dynamically! TODO // . let's up this to 5M because we are hitting the limit in some // test runs... // . try going to 20M now since we hit it again... // . start off at just 10 nodes since we grow dynamically now if (!m_waitingTree.set(0,10,true,-1,true,"waittree2", false,"waitingtree",sizeof(key_t)))return false; m_waitingTreeKeyValid = false; m_scanningIp = 0; // prevent core with this //m_waitingTree.m_rdbId = RDB_NONE; // make dir char dir[500]; sprintf(dir,"%scoll.%s.%li",g_hostdb.m_dir,coll,(long)m_collnum); // load up all the tables if ( ! m_cdTable .load(dir,"crawldelay.dat" ) ) err = g_errno; if ( ! 
m_sniTable.load(dir,"siteinlinks.dat" ) ) err = g_errno; // and its doledb data //if ( ! initializeDoleTables( ) ) err = g_errno; // our table that has how many of each firstIP are in doledb //if ( ! m_doleIpTable.load(dir,"doleiptable.dat") ) err = g_errno; // load in the waiting tree, IPs waiting to get into doledb BigFile file; file.set ( dir , "waitingtree-saved.dat" , NULL ); bool treeExists = file.doesExist() > 0; // load the table with file named "THISDIR/saved" if ( treeExists && ! m_waitingTree.fastLoad(&file,&m_waitingMem) ) err = g_errno; // init wait table. scan wait tree and add the ips into table. if ( ! makeWaitingTable() ) err = g_errno; // save it g_errno = err; // return false on error if ( g_errno ) // note it return log("spider: had error loading initial table: %s", mstrerror(g_errno)); // . do this now just to keep everything somewhat in sync // . we lost dmoz.org and could not get it back in because it was // in the doleip table but NOT in doledb!!! if ( ! makeDoleIPTable() ) return false; // otherwise true return true; } // . scan all spiderRequests in doledb at startup and add them to our tables // . then, when we scan spiderdb and add to orderTree/urlhashtable it will // see that the request is in doledb and set m_doled... // . initialize the dole table for that then // quickly scan doledb and add the doledb records to our trees and // tables. that way if we receive a SpiderReply() then addSpiderReply() // will be able to find the associated SpiderRequest. // MAKE SURE to put each spiderrequest into m_doleTable... and into // maybe m_urlHashTable too??? // this should block since we are at startup... bool SpiderColl::makeDoleIPTable ( ) { log(LOG_DEBUG,"spider: making dole ip table for %s",m_coll); key_t startKey ; startKey.setMin(); key_t endKey ; endKey.setMax(); key_t lastKey ; lastKey.setMin(); // turn off threads for this so it blocks bool enabled = g_threads.areThreadsEnabled(); // turn off regardless g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; Msg5 msg5b; RdbList list; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_DOLEDB , m_collnum , &list , startKey , endKey , minRecSizes , true , // includeTree? false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0,//MAX_NICENESS , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )){ log(LOG_LOGIC,"spider: getList did not block."); return false; } // shortcut long minSize=(long)(sizeof(SpiderRequest)+sizeof(key_t)+4-MAX_URL_LEN); // all done if empty if ( list.isEmpty() ) goto done; // loop over entries in list for (list.resetListPtr();!list.isExhausted();list.skipCurrentRecord()){ // get rec char *rec = list.getCurrentRec(); // get key key_t k = list.getCurrentKey(); // skip deletes -- how did this happen? if ( (k.n0 & 0x01) == 0) continue; // check this out long recSize = list.getCurrentRecSize(); // zero? if ( recSize <= 0 ) { char *xx=NULL;*xx=0; } // 16 is bad too... wtf is this? if ( recSize <= 16 ) continue; // crazy? if ( recSize<=minSize) {char *xx=NULL;*xx=0;} // . doledb key is 12 bytes, followed by a 4 byte datasize // . so skip that key and dataSize to point to spider request SpiderRequest *sreq = (SpiderRequest *)(rec+sizeof(key_t)+4); // add to dole tables if ( ! 
addToDoleTable ( sreq ) ) // return false with g_errno set on error return false; } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey >= *(key_t *)list.getLastKey() ) goto loop; done: log(LOG_DEBUG,"spider: making dole ip table done."); // re-enable threads if ( enabled ) g_threads.enableThreads(); // we wrapped, all done return true; } key_t makeWaitingTreeKey ( uint64_t spiderTimeMS , long firstIp ) { // sanity if ( ((long long)spiderTimeMS) < 0 ) { char *xx=NULL;*xx=0; } // make the wait tree key key_t wk; wk.n1 = (spiderTimeMS>>32); wk.n0 = (spiderTimeMS&0xffffffff); wk.n0 <<= 32; wk.n0 |= (unsigned long)firstIp; // sanity if ( wk.n1 & 0x8000000000000000LL ) { char *xx=NULL;*xx=0; } return wk; } CollectionRec *SpiderColl::getCollRec() { CollectionRec *cr = g_collectiondb.m_recs[m_collnum]; if ( ! cr ) log("spider: lost coll rec"); return cr; } char *SpiderColl::getCollName() { CollectionRec *cr = getCollRec(); if ( ! cr ) return "lostcollection"; return cr->m_coll; } void nukeDoledb ( collnum_t collnum ) ; // // remove all recs from doledb for the given collection // static void nukeDoledbWrapper ( int fd , void *state ) { g_loop.unregisterSleepCallback ( state , nukeDoledbWrapper ); collnum_t collnum = *(collnum_t *)state; nukeDoledb ( collnum ); } void nukeDoledb ( collnum_t collnum ) { //WaitEntry *we = (WaitEntry *)state; //if ( we->m_registered ) // g_loop.unregisterSleepCallback ( we , doDoledbNuke ); // . nuke doledb for this collnum // . it will unlink the files and maps for doledb for this collnum // . it will remove all recs of this collnum from its tree too if ( g_doledb.getRdb()->isSavingTree () ) { g_loop.registerSleepCallback(100,&collnum,nukeDoledbWrapper); //we->m_registered = true; return; } // . ok, tree is not saving, it should complete entirely from this call g_doledb.getRdb()->deleteAllRecs ( collnum ); // re-add it back so the RdbBase is new'd //g_doledb.getRdb()->addColl2 ( we->m_collnum ); SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( collnum ); if ( sc ) { sc->m_lastUrlFiltersUpdate = getTimeLocal();//GlobalNoCore(); // . make sure to nuke m_doleIpTable as well sc->m_doleIpTable.clear(); // need to recompute this! //sc->m_ufnMapValid = false; // reset this cache //clearUfnTable(); // activate a scan if not already activated sc->m_waitingTreeNeedsRebuild = true; // if a scan is ongoing, this will re-set it sc->m_nextKey2.setMin(); // clear it? sc->m_waitingTree.clear(); sc->m_waitingTable.clear(); // kick off the spiderdb scan to rep waiting tree and doledb sc->populateWaitingTreeFromSpiderdb(false); } // nuke this state //mfree ( we , sizeof(WaitEntry) , "waitet" ); // note it log("spider: finished nuking doledb for coll (%li)", (long)collnum); } // . call this when changing the url filters // . will make all entries in waiting tree have zero time basically // . and makes us repopulate doledb from these waiting tree entries /* void SpiderColl::urlFiltersChanged ( ) { // log it log("spider: rebuilding doledb/waitingtree for coll=%s",getCollName()); WaitEntry *we = (WaitEntry *)mmalloc ( sizeof(WaitEntry) , "waite2" ); if ( ! 
we ) { log("spider: wait entry alloc: %s",mstrerror(g_errno)); g_errno = 0; return; } // prepare our state in case the purge operation would block we->m_registered = false; we->m_cr = m_cr; we->m_collnum = m_cr->m_collnum; //we->m_callback = doDoledbNuke2; //we->m_state = NULL; // remove all recs from doledb for the given collection doDoledbNuke ( 0 , we ); } */ // this one has to scan all of spiderdb bool SpiderColl::makeWaitingTree ( ) { log(LOG_DEBUG,"spider: making waiting tree for %s",m_coll); key128_t startKey ; startKey.setMin(); key128_t endKey ; endKey.setMax(); key128_t lastKey ; lastKey.setMin(); // turn off threads for this so it blocks bool enabled = g_threads.areThreadsEnabled(); // turn off regardless g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; Msg5 msg5b; RdbList list; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_SPIDERDB , m_collnum , &list , &startKey , &endKey , minRecSizes , true , // includeTree? false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback MAX_NICENESS , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )){ log(LOG_LOGIC,"spider: getList did not block."); return false; } // all done if empty if ( list.isEmpty() ) goto done; // loop over entries in list for (list.resetListPtr();!list.isExhausted();list.skipCurrentRecord()){ // get rec char *rec = list.getCurrentRec(); // get key key128_t k; list.getCurrentKey(&k); // skip deletes -- how did this happen? if ( (k.n0 & 0x01) == 0) continue; // check this out long recSize = list.getCurrentRecSize(); // zero? if ( recSize <= 0 ) { char *xx=NULL;*xx=0; } // 16 is bad too... wtf is this? if ( recSize <= 16 ) continue; // skip replies if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) continue; // get request SpiderRequest *sreq = (SpiderRequest *)rec; // get first ip long firstIp = sreq->m_firstIp; // skip if in dole ip table if ( m_doleIpTable.isInTable ( &firstIp ) ) continue; // make the key. use 1 for spiderTimeMS. this tells the // spider loop that it is temporary and should be updated key_t wk = makeWaitingTreeKey ( 1 , firstIp ); // ok, add to waiting tree long wn = m_waitingTree.addKey ( &wk ); if ( wn < 0 ) { log("spider: makeWaitTree: %s",mstrerror(g_errno)); return false; } // note it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: added time=1 ip=%s to waiting " "tree (node#=%li)", iptoa(firstIp),wn); // a tmp var long long fakeone = 1LL; // add to table now since its in the tree if ( ! m_waitingTable.addKey ( &firstIp , &fakeone ) ) { log("spider: makeWaitTree2: %s",mstrerror(g_errno)); m_waitingTree.deleteNode ( wn , true ); //log("sper: 6 del node %li for %s",wn,iptoa(firstIp)); return false; } } startKey = *(key128_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey >= *(key128_t *)list.getLastKey() ) goto loop; done: log(LOG_DEBUG,"spider: making waiting tree done."); // re-enable threads if ( enabled ) g_threads.enableThreads(); // we wrapped, all done return true; } // for debugging query reindex i guess long long SpiderColl::getEarliestSpiderTimeFromWaitingTree ( long firstIp ) { // make the key. use 0 as the time... key_t wk = makeWaitingTreeKey ( 0, firstIp ); // set node from wait tree key. 
	//   this way we can resume from a prev key
	long node = m_waitingTree.getNextNode ( 0, (char *)&wk );
	// if empty, stop
	if ( node < 0 ) return -1;
	// breathe
	QUICKPOLL(MAX_NICENESS);
	// get the key
	key_t *k = (key_t *)m_waitingTree.getKey ( node );
	// ok, we got one
	long storedFirstIp = (k->n0) & 0xffffffff;
	// match? we call this with a firstIp of 0 below to indicate
	// any IP, we just want to get the next spider time.
	if ( firstIp != 0 && storedFirstIp != firstIp ) return -1;
	// get the time
	unsigned long long spiderTimeMS = k->n1;
	// shift up
	spiderTimeMS <<= 32;
	// or in
	spiderTimeMS |= (k->n0 >> 32);
	// return it in milliseconds
	return spiderTimeMS;
}

bool SpiderColl::makeWaitingTable ( ) {
	log(LOG_DEBUG,"spider: making waiting table for %s.",m_coll);
	long node = m_waitingTree.getFirstNode();
	for ( ; node >= 0 ; node = m_waitingTree.getNextNode(node) ) {
		// breathe
		QUICKPOLL(MAX_NICENESS);
		// get key
		key_t *key = (key_t *)m_waitingTree.getKey(node);
		// get ip from that
		long ip = (key->n0) & 0xffffffff;
		// spider time is up top
		uint64_t spiderTimeMS = (key->n1);
		spiderTimeMS <<= 32;
		spiderTimeMS |= ((key->n0) >> 32);
		// store in waiting table
		if ( ! m_waitingTable.addKey(&ip,&spiderTimeMS) )
			return false;
	}
	log(LOG_DEBUG,"spider: making waiting table done.");
	return true;
}

SpiderColl::~SpiderColl () {
	reset();
}

// we call this now instead of reset when Collectiondb::resetColl() is used
void SpiderColl::clearLocks ( ) {
	// remove locks from the lock table for all outstanding spiders
	// of this collection
	HashTableX *ht = &g_spiderLoop.m_lockTable;
 top:
	// scan the slots
	long ns = ht->m_numSlots;
	for ( long i = 0 ; i < ns ; i++ ) {
		// skip if empty
		if ( ! ht->m_flags[i] ) continue;
		// cast lock
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
		// skip if not our collnum
		if ( lock->m_collnum != m_collnum ) continue;
		// nuke it!
ht->removeSlot(i); // restart since cells may have shifted goto top; } /* // reset these for SpiderLoop; m_nextDoledbKey.setMin(); m_didRound = false; // set this to -1 here, when we enter spiderDoledUrls() it will // see that its -1 and set the m_msg5StartKey m_pri2 = -1; // MAX_SPIDER_PRIORITIES - 1; m_twinDied = false; m_lastUrlFiltersUpdate = 0; char *coll = "unknown"; if ( m_coll[0] ) coll = m_coll; logf(LOG_DEBUG,"spider: CLEARING spider cache coll=%s",coll); m_ufnMapValid = false; m_doleIpTable .clear(); m_cdTable .clear(); m_sniTable .clear(); m_waitingTable.clear(); m_waitingTree .clear(); m_waitingMem .clear(); //m_lastDownloadCache.clear ( m_collnum ); // copied from reset() below for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) { m_nextKeys[i] = g_doledb.makeFirstKey2 ( i ); m_isDoledbEmpty[i] = 0; } // assume the whole thing is not empty m_allDoledbPrioritiesEmpty = 0;//false; m_lastEmptyCheck = 0; */ } void SpiderColl::reset ( ) { // reset these for SpiderLoop; m_nextDoledbKey.setMin(); m_didRound = false; // set this to -1 here, when we enter spiderDoledUrls() it will // see that its -1 and set the m_msg5StartKey m_pri2 = -1; // MAX_SPIDER_PRIORITIES - 1; m_twinDied = false; m_lastUrlFiltersUpdate = 0; m_isPopulating = false; char *coll = "unknown"; if ( m_coll[0] ) coll = m_coll; log(LOG_DEBUG,"spider: resetting spider cache coll=%s",coll); m_ufnMapValid = false; m_doleIpTable .reset(); m_cdTable .reset(); m_sniTable .reset(); m_waitingTable.reset(); m_waitingTree .reset(); m_waitingMem .reset(); m_winnerTree .reset(); m_winnerTable .reset(); m_dupCache .reset(); // each spider priority in the collection has essentially a cursor // that references the next spider rec in doledb to spider. it is // used as a performance hack to avoid the massive positive/negative // key annihilations related to starting at the top of the priority // queue every time we scan it, which causes us to do upwards of // 300 re-reads! for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) { m_nextKeys[i] = g_doledb.makeFirstKey2 ( i ); m_isDoledbEmpty[i] = 0; } // assume the whole thing is not empty //m_allDoledbPrioritiesEmpty = 0;//false; //m_lastEmptyCheck = 0; } bool SpiderColl::updateSiteNumInlinksTable ( long siteHash32, long sni, time_t timestamp ) { // do not update if invalid if ( sni == -1 ) return true; // . get entry for siteNumInlinks table // . use 32-bit key specialized lookup for speed uint64_t *val = (uint64_t *)m_sniTable.getValue32(siteHash32); // bail? if ( val && ((*val)&0xffffffff) > (uint32_t)timestamp ) return true; // . make new data for this key // . lower 32 bits is the addedTime // . upper 32 bits is the siteNumInlinks uint64_t nv = (uint32_t)sni; // shift up nv <<= 32; // or in time nv |= (uint32_t)timestamp;//sreq->m_addedTime; // just direct update if faster if ( val ) *val = nv; // store it anew otherwise else if ( ! m_sniTable.addKey(&siteHash32,&nv) ) // return false with g_errno set on error return false; // success return true; } ///////// // // we now include the firstip in the case where the same url // has 2 spiderrequests where one is a fake firstip. in that scenario // we will miss the spider request to spider, the waiting tree // node will be removed, and the spider round will complete, // which triggers a waiting tree recompute and we end up spidering // the dup spider request right away and double increment the round. 
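//
// (makeLockTableKey() below just XORs the unsigned 32-bit firstIp into
// the 48-bit url hash to build the lock table key)
//
// for illustration only -- not part of the original source: a tiny
// sanity-check sketch of that composition, assuming makeLockTableKey()
// keeps its uh48 ^ (unsigned long)firstIp form; the function name and the
// values below are made up. same url hash, different firstIps, must give
// different lock keys.
void testLockTableKeySketch ( ) {
	long long uh48   = 0x0000123456789abcLL; // made-up 48-bit url hash
	long      realIp = 0x23991688;           // made-up "real" firstIp
	long      fakeIp = 0x7f000001;           // made-up "fake" firstIp
	long long k1 = uh48 ^ (unsigned long)realIp;
	long long k2 = uh48 ^ (unsigned long)fakeIp;
	// the whole point of folding in firstIp: the two keys must differ
	if ( k1 == k2 ) { char *xx=NULL;*xx=0; }
}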
// ///////// inline long long makeLockTableKey ( long long uh48 , long firstIp ) { return uh48 ^ (unsigned long)firstIp; } inline long long makeLockTableKey ( SpiderRequest *sreq ) { return makeLockTableKey(sreq->getUrlHash48(),sreq->m_firstIp); } inline long long makeLockTableKey ( SpiderReply *srep ) { return makeLockTableKey(srep->getUrlHash48(),srep->m_firstIp); } // . we call this when we receive a spider reply in Rdb.cpp // . returns false and sets g_errno on error // . xmldoc.cpp adds reply AFTER the negative doledb rec since we decement // the count in m_doleIpTable here bool SpiderColl::addSpiderReply ( SpiderReply *srep ) { ///////// // // remove the lock here // ////// long long lockKey = makeLockTableKey ( srep ); // shortcut HashTableX *ht = &g_spiderLoop.m_lockTable; UrlLock *lock = (UrlLock *)ht->getValue ( &lockKey ); time_t nowGlobal = getTimeGlobal(); if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: scheduled lock removal in 5 secs for " "lockKey=%llu", lockKey ); // test it //if ( m_nowGlobal == 0 && lock ) // m_nowGlobal = getTimeGlobal(); // we do it this way rather than remove it ourselves // because a lock request for this guy // might be currently outstanding, and it will end up // being granted the lock even though we have by now removed // it from doledb, because it read doledb before we removed // it! so wait 5 seconds for the doledb negative key to // be absorbed to prevent a url we just spidered from being // re-spidered right away because of this sync issue. // . if we wait too long then the round end time, SPIDER_DONE_TIMER, // will kick in before us and end the round, then we end up // spidering a previously locked url right after and DOUBLE // increment the round! if ( lock ) lock->m_expires = nowGlobal + 2; ///// // // but do note that its spider has returned for populating the // waiting tree. addToWaitingTree should not add an entry if // a spiderReply is still pending according to the lock table, // UNLESS, maxSpidersPerIP is more than what the lock table says // is currently being spidered. // ///// if ( lock ) lock->m_spiderOutstanding = false; // bitch if not in there if ( !lock ) // &&g_conf.m_logDebugSpider)//ht->isInTable(&lockKey)) logf(LOG_DEBUG,"spider: rdb: lockKey=%llu " "was not in lock table",lockKey); //// // // skip if not assigned to us for doling // //// if ( ! isAssignedToUs ( srep->m_firstIp ) ) return true; // update the latest siteNumInlinks count for this "site" (repeatbelow) updateSiteNumInlinksTable ( srep->m_siteHash32, srep->m_siteNumInlinks, srep->m_spideredTime ); // . skip the rest if injecting // . otherwise it triggers a lookup for this firstip in spiderdb to // get a new spider request to add to doledb // . no, because there might be more on disk from the same firstip // so comment this out again //if ( srep->m_fromInjectionRequest ) // return true; // clear error for this g_errno = 0; // . update the latest crawl delay for this domain // . only add to the table if we had a crawl delay // . -1 implies an invalid or unknown crawl delay // . we have to store crawl delays of -1 now so we at least know we // tried to download the robots.txt (todo: verify that!) // and the webmaster did not have one. then we can // crawl more vigorously... //if ( srep->m_crawlDelayMS >= 0 ) { bool update = false; // use the domain hash for this guy! since its from robots.txt long *cdp = (long *)m_cdTable.getValue32(srep->m_domHash32); // update it only if better or empty if ( ! 
cdp ) update = true; // no update if injecting or from pagereindex (docid based spider request) if ( srep->m_fromInjectionRequest ) update = false; //else if (((*cdp)&0xffffffff)<(uint32_t)srep->m_spideredTime) // update = true; // update m_sniTable if we should if ( update ) { // . make new data for this key // . lower 32 bits is the spideredTime // . upper 32 bits is the crawldelay long nv = (long)(srep->m_crawlDelayMS); // shift up //nv <<= 32; // or in time //nv |= (uint32_t)srep->m_spideredTime; // just direct update if faster if ( cdp ) *cdp = nv; // store it anew otherwise else if ( ! m_cdTable.addKey(&srep->m_domHash32,&nv)){ // return false with g_errno set on error //return false; log("spider: failed to add crawl delay for " "firstip=%s",iptoa(srep->m_firstIp)); // just ignore g_errno = 0; } } // . anytime we add a reply then // we must update this downloadTable with the replies // SpiderReply::m_downloadEndTime so we can obey sameIpWait // . that is the earliest that this url can be respidered, but we // also have a sameIpWait constraint we have to consider... // . we alone our responsible for adding doledb recs from this ip so // this is easy to throttle... // . and make sure to only add to this download time hash table if // SpiderReply::m_downloadEndTime is non-zero, because zero means // no download happened. (TODO: check this) // . TODO: consult crawldelay table here too! use that value if is // less than our sameIpWait // . make m_lastDownloadTable an rdbcache ... // . this is 0 for pagereindex docid-based replies if ( srep->m_downloadEndTime ) m_lastDownloadCache.addLongLong ( m_collnum, srep->m_firstIp , srep->m_downloadEndTime ); // log this for now if ( g_conf.m_logDebugSpider ) log("spider: adding spider reply, download end time %lli for " "ip=%s(%lu) uh48=%llu indexcode=\"%s\" coll=%li " "k.n1=%llu k.n0=%llu", //"to SpiderColl::m_lastDownloadCache", srep->m_downloadEndTime, iptoa(srep->m_firstIp), srep->m_firstIp, srep->getUrlHash48(), mstrerror(srep->m_errCode), (long)m_collnum, srep->m_key.n1, srep->m_key.n0); // ignore errors from that, it's just a cache g_errno = 0; // sanity check - test cache //if ( g_conf.m_logDebugSpider && srep->m_downloadEndTime ) { // long long last = m_lastDownloadCache.getLongLong ( m_collnum , // srep->m_firstIp , // -1,// maxAge // true );//pro // if ( last != srep->m_downloadEndTime ) { char *xx=NULL;*xx=0;} //} // skip: // . add to wait tree and let it populate doledb on its batch run // . use a spiderTime of 0 which means unknown and that it needs to // scan spiderdb to get that // . returns false if did not add to waiting tree // . returns false sets g_errno on error bool added = addToWaitingTree ( 0LL, srep->m_firstIp , true ); // ignore errors i guess g_errno = 0; // if added to waiting tree, bail now, needs to scan spiderdb // in order to add to doledb, because it won't add to waiting tree // if we already have spiderrequests in doledb for this firstip if ( added ) return true; // spider some urls that were doled to us g_spiderLoop.spiderDoledUrls( ); return true; } void SpiderColl::removeFromDoledbTable ( long firstIp ) { // . decrement doledb table ip count for firstIp // . update how many per ip we got doled long *score = (long *)m_doleIpTable.getValue32 ( firstIp ); // wtf! how did this spider without being doled? if ( ! score ) { //if ( ! srep->m_fromInjectionRequest ) log("spider: corruption. received spider reply whose " "ip has no entry in dole ip table. 
		    firstip=%s",
		    iptoa(firstIp));
		return;
	}
	// reduce it
	*score = *score - 1;
	// now we log it too
	if ( g_conf.m_logDebugSpider )
		log(LOG_DEBUG,"spider: removed ip=%s from doleiptable "
		    "(newcount=%li)",
		    iptoa(firstIp),*score);
	// remove if zero
	if ( *score == 0 ) {
		// this can fail if writes are disabled on this hashtablex
		// because it is saving
		m_doleIpTable.removeKey ( &firstIp );
		// sanity check
		//if ( ! m_doleIpTable.m_isWritable ) { char *xx=NULL;*xx=0; }
	}
	// wtf!
	if ( *score < 0 ) { char *xx=NULL;*xx=0; }
	// all done?
	if ( g_conf.m_logDebugSpider ) {
		// log that too!
		logf(LOG_DEBUG,"spider: discounting firstip=%s to %li",
		     iptoa(firstIp),*score);
	}
}

bool SpiderColl::isInDupCache ( SpiderRequest *sreq , bool addToCache ) {
	// init dup cache?
	if ( ! m_dupCache.isInitialized() )
		// use 50k i guess of 64bit numbers and linked list info
		m_dupCache.init ( 50000,
				  4 , // fixeddatasize (don't really need this)
				  false, // list support?
				  5000, // maxcachenodes
				  false, // usehalfkeys?
				  "urldups", // dbname
				  false, // loadfromdisk
				  12, // cachekeysize
				  0, // datakeysize
				  -1 ); // numptrsmax
	// don't add the same dups over and over again...
	long long dupKey64 = sreq->getUrlHash48();
	// . these flags make a big difference in the url filters
	// . NOTE: if you see a url that should be getting spidered but is
	//   not, it might be because we are not incorporating other flags
	//   here...
	if ( sreq->m_fakeFirstIp ) dupKey64 ^= 12345;
	if ( sreq->m_isAddUrl    ) dupKey64 ^= 49387333;
	if ( sreq->m_isInjecting ) dupKey64 ^= 3276404;
	if ( sreq->m_forceDelete ) dupKey64 ^= 29386239;
	if ( sreq->m_hadReply    ) dupKey64 ^= 293294099;
	if ( sreq->m_sameDom     ) dupKey64 ^= 963493311;
	if ( sreq->m_sameHost    ) dupKey64 ^= 288844772;
	if ( sreq->m_sameSite    ) dupKey64 ^= 58320355;
	// . maxage=86400,promoteRec=yes. returns -1 if not in there
	// . dupKey64 is for hopcount 0, so if this url is in the dupcache
	//   with a hopcount of zero, do not add it
	if ( m_dupCache.getLong ( 0,dupKey64,86400,true ) != -1 ) {
	dedup:
		if ( g_conf.m_logDebugSpider )
			log("spider: skipping dup request url=%s uh48=%llu",
			    sreq->m_url,sreq->getUrlHash48());
		return true;
	}
	// if our hopcount is 2 and there is a hopcount 1 in there, do not add
	if ( sreq->m_hopCount >= 2 &&
	     m_dupCache.getLong ( 0,dupKey64 ^ 0x01 ,86400,true ) != -1 )
		goto dedup;
	// likewise, if there's a hopcount 2 in there, do not add if we are 3+
	if ( sreq->m_hopCount >= 3 &&
	     m_dupCache.getLong ( 0,dupKey64 ^ 0x02 ,86400,true ) != -1 )
		goto dedup;
	if ( ! addToCache ) return false;
	long hc = sreq->m_hopCount;
	// limit hopcount to 3 for making cache key so we don't flood cache
	if ( hc >= 3 ) hc = 3;
	// mangle the key with hopcount before adding it to the cache
	dupKey64 ^= hc;
	// add it
	m_dupCache.addLong(0,dupKey64 ,1);
	return false;
}

// . Rdb.cpp calls SpiderColl::addSpiderRequest/Reply() for every positive
//   spiderdb record it adds to spiderdb. that way our cache is kept
//   up to date incrementally
// . returns false and sets g_errno on error
// . if the spiderTime appears to be AFTER m_nextReloadTime then we should
//   not add this spider request to keep the cache trimmed!!! (MDW: TODO)
// . BUT! if we have 150,000 urls that is going to take a long time to
//   spider, so it should have a high reload rate!
bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
				    long long nowGlobalMS ) {
	// don't add negative keys or dataless things
	if ( sreq->m_dataSize <= 0 ) {
		if ( g_conf.m_logDebugSpider )
			log("spider: add spider request is dataless for "
			    "uh48=%llu",sreq->getUrlHash48());
		char *xx=NULL;*xx=0;
		return true;
	}
	// .
are we already more or less in spiderdb? true = addToCache // . put this above isAssignedToUs() so we *try* to keep twins in sync because // Rdb.cpp won't add the spiderrequest if its in this dup cache, and we add // it to the dupcache here... if ( isInDupCache ( sreq , true ) ) { if ( g_conf.m_logDebugSpider ) log("spider: skipping dup request url=%s uh48=%llu", sreq->m_url,sreq->getUrlHash48()); return true; } // skip if not assigned to us for doling if ( ! isAssignedToUs ( sreq->m_firstIp ) ) { if ( g_conf.m_logDebugSpider ) log("spider: spider request not assigned to us. " "skipping."); return true; } // . get the url's length contained in this record // . it should be NULL terminated // . we set the ip here too long ulen = sreq->getUrlLen(); // watch out for corruption if ( sreq->m_firstIp == 0 || sreq->m_firstIp == -1 || ulen <= 0 ) { log("spider: Corrupt spider req with url length of " "%li <= 0. dataSize=%li uh48=%llu. Skipping.", ulen,sreq->m_dataSize,sreq->getUrlHash48()); return true; } // update crawlinfo stats here and not in xmldoc so that we count // seeds and bulk urls added from add url and can use that to // determine if the collection is empty of urls or not for printing // out the colored bullets in printCollectionNavBar() in Pages.cpp. CollectionRec *cr = g_collectiondb.m_recs[m_collnum]; if ( cr ) { cr->m_localCrawlInfo .m_urlsHarvested++; cr->m_globalCrawlInfo.m_urlsHarvested++; cr->m_needsSave = true; } // . if already have a request in doledb for this firstIp, forget it! // . TODO: make sure we remove from doledb first before adding this // spider request // . NOW: allow it in if different priority!!! so maybe hash the // priority in with the firstIp??? // . we really just need to add it if it beats what is currently // in doledb. so maybe store the best priority doledb in the // data value part of the doleiptable...? therefore we should // probably move this check down below after we get the priority // of the spider request. //char *val = (char *)m_doleIpTable.getValue ( &sreq->m_firstIp ); //if ( val && *val > 0 ) { // if ( g_conf.m_logDebugSpider ) // log("spider: request IP already in dole table"); // return true; //} // . skip if already in wait tree // . no, no. what if the current url for this firstip is not due to // be spidered until 24 hrs and we are adding a url from this firstip // that should be spidered now... //if ( m_waitingTable.isInTable ( &sreq->m_firstIp ) ) { // if ( g_conf.m_logDebugSpider ) // log("spider: request already in waiting table"); // return true; //} // . we can't do this because we do not have the spiderReply!!!??? // . MDW: no, we have to do it because tradesy.com has links to twitter // on every page and twitter is not allowed so we continually // re-scan a big spiderdblist for twitter's firstip. major performace // degradation. so try to get ufn without reply. if we need // a reply to get the ufn then this function should return -1 which // means an unknown ufn and we'll add to waiting tree. // get ufn/priority,because if filtered we do not want to add to doledb long ufn ; // HACK: set isOutlink to true here since we don't know if we have sre ufn = ::getUrlFilterNum(sreq,NULL,nowGlobalMS,false,MAX_NICENESS,m_cr, true,//isoutlink? HACK! NULL); // quota table // sanity check //if ( ufn < 0 ) { // log("spider: failed to add spider request for %s because " // "it matched no url filter", // sreq->m_url); // g_errno = EBADENGINEER; // return false; //} // spiders disabled for this row in url filters? 
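	// summary sketch (comments only, not compiled code) of the gate
	// below, using only fields this function already references:
	//
	//   if ( ufn >= 0 && m_cr->m_maxSpidersPerRule[ufn] == 0 ) skip; // off
	//   long priority = ( ufn >= 0 ) ? m_cr->m_spiderPriorities[ufn] : -1;
	//   if ( priority == SPIDER_PRIORITY_FILTERED ) skip;
	//   if ( priority == SPIDER_PRIORITY_BANNED   ) skip;
	//   addToWaitingTree ( 0 , sreq->m_firstIp , true ); // 0 = needs scan
	//
	// i.e. the request itself is not written to doledb here; we just flag
	// its firstIp in the waiting tree and let the waiting-tree scan pick
	// the actual winner from spiderdb later. first, the spiders-off case: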
if ( ufn >= 0 && m_cr->m_maxSpidersPerRule[ufn] == 0 ) { if ( g_conf.m_logDebugSpider ) log("spider: request spidersoff ufn=%li url=%s",ufn, sreq->m_url); return true; } // set the priority (might be the same as old) long priority = -1; if ( ufn >= 0 ) priority = m_cr->m_spiderPriorities[ufn]; // sanity checks //if ( priority == -1 ) { char *xx=NULL;*xx=0; } if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;} // do not add to doledb if bad if ( priority == SPIDER_PRIORITY_FILTERED ) { if ( g_conf.m_logDebugSpider ) log("spider: request %s is filtered ufn=%li", sreq->m_url,ufn); return true; } if ( priority == SPIDER_PRIORITY_BANNED ) { if ( g_conf.m_logDebugSpider ) log("spider: request %s is banned ufn=%li", sreq->m_url,ufn); return true; } // set it for adding to doledb and computing spidertime //sreq->m_ufn = ufn; //sreq->m_priority = priority; // get spider time -- i.e. earliest time when we can spider it //uint64_t spiderTimeMS = getSpiderTimeMS (sreq,ufn,NULL,nowGlobalMS ); // sanity //if ( (long long)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; } // once in waiting tree, we will scan waiting tree and then lookup // each firstIp in waiting tree in spiderdb to get the best // SpiderRequest for that firstIp, then we can add it to doledb // as long as it can be spidered now //bool status = addToWaitingTree ( spiderTimeMS,sreq->m_firstIp,true); bool added = addToWaitingTree ( 0 , sreq->m_firstIp , true ); // if already doled and we beat the priority/spidertime of what // was doled then we should probably delete the old doledb key // and add the new one. hmm, the waitingtree scan code ... // sanity check //long long ttt=getEarliestSpiderTimeFromWaitingTree(sreq->m_firstIp); //logf (LOG_DEBUG,"spider: earliestime=%lli for firstip=%s", // ttt,iptoa(sreq->m_firstIp)); //if ( ttt != (long long)spiderTimeMS ) { char *xx=NULL;*xx=0; } // update the latest siteNumInlinks count for this "site" if ( sreq->m_siteNumInlinksValid ) { // updates m_siteNumInlinksTable updateSiteNumInlinksTable ( sreq->m_siteHash32 , sreq->m_siteNumInlinks , sreq->m_addedTime ); // clear error for this if there was any g_errno = 0; } if ( ! g_conf.m_logDebugSpider ) return true;//status; char *msg = "ADDED"; if ( ! added ) msg = "DIDNOTADD"; // log it logf(LOG_DEBUG, "spider: %s request to waiting tree %s " "uh48=%llu " "firstIp=%s " "parentFirstIp=%lu " "parentdocid=%llu " "isinjecting=%li " "ispagereindex=%li " "ufn=%li " "priority=%li " "addedtime=%lu " //"spidertime=%llu", , msg, sreq->m_url, sreq->getUrlHash48(), iptoa(sreq->m_firstIp), sreq->m_parentFirstIp, sreq->getParentDocId(), (long)(bool)sreq->m_isInjecting, (long)(bool)sreq->m_isPageReindex, (long)sreq->m_ufn, (long)sreq->m_priority, sreq->m_addedTime //spiderTimeMS); ); return true;//status; } bool SpiderColl::printWaitingTree ( ) { long node = m_waitingTree.getFirstNode(); for ( ; node >= 0 ; node = m_waitingTree.getNextNode(node) ) { key_t *wk = (key_t *)m_waitingTree.getKey (node); // spider time is up top uint64_t spiderTimeMS = (wk->n1); spiderTimeMS <<= 32; spiderTimeMS |= ((wk->n0) >> 32); // then ip long firstIp = wk->n0 & 0xffffffff; // show it log("dump: time=%lli firstip=%s",spiderTimeMS,iptoa(firstIp)); } return true; } bool SpiderLoop::printLockTable ( ) { // count locks HashTableX *ht = &g_spiderLoop.m_lockTable; // scan the slots long ns = ht->m_numSlots; for ( long i = 0 ; i < ns ; i++ ) { // skip if empty if ( ! 
ht->m_flags[i] ) continue; // cast lock UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i); // get the key long long lockKey = *(long long *)ht->getKeyFromSlot(i); // show it log("dump: lock. " "lockkey=%lli " "spiderout=%li " "confirmed=%li " "firstip=%s " "expires=%li " "hostid=%li " "timestamp=%li " "sequence=%li " "collnum=%li " ,lockKey ,(long)(lock->m_spiderOutstanding) ,(long)(lock->m_confirmed) ,iptoa(lock->m_firstIp) ,lock->m_expires ,lock->m_hostId ,lock->m_timestamp ,lock->m_lockSequence ,(long)lock->m_collnum ); } return true; } ////// // // . 1. called by addSpiderReply(). it should have the sameIpWait available // or at least that will be in the crawldelay cache table. // SpiderReply::m_crawlDelayMS. Unfortunately, no maxSpidersPerIP!!! // we just add a "0" in the waiting tree which means evalIpLoop() will // be called and can get the maxSpidersPerIP from the winning candidate // and add to the waiting tree based on that. // . 2. called by addSpiderRequests(). It SHOULD maybe just add a "0" as well // to offload the logic. try that. // . 3. called by populateWaitingTreeFromSpiderdb(). it just adds "0" as well, // if not doled // . 4. UPDATED in evalIpLoop() if the best SpiderRequest for a firstIp is // in the future, this is the only time we will add a waiting tree key // whose spider time is non-zero. that is where we also take // sameIpWait and maxSpidersPerIP into consideration. evalIpLoop() // will actually REMOVE the entry from the waiting tree if that IP // already has the max spiders outstanding per IP. when a spiderReply // is received it will populate the waiting tree again with a "0" entry // and evalIpLoop() will re-do its check. // ////// // . returns true if we added to waiting tree, false if not // . if one of these add fails consider increasing mem used by tree/table // . if we lose an ip that sux because it won't be gotten again unless // we somehow add another request/reply to spiderdb in the future bool SpiderColl::addToWaitingTree ( uint64_t spiderTimeMS , long firstIp , bool callForScan ) { // skip if already in wait tree. no - might be an override with // a sooner spiderTimeMS //if ( m_waitingTable.isInTable ( &firstIp ) ) return true; if ( g_conf.m_logDebugSpider ) log("spider: addtowaitingtree ip=%s",iptoa(firstIp)); // we are currently reading spiderdb for this ip and trying to find // a best SpiderRequest or requests to add to doledb. so if this // happens, let the scan know that more replies or requests came in // while we were scanning so that it should not delete the rec from // waiting tree and not add to doledb, then we'd lose it forever or // until the next waitingtree rebuild was triggered in time. // // Before i was only setting this in addSpiderRequest() so if a new // reply came in it was not setting m_gotNewDataForScanninIp and // we ended up losing the IP from the waiting tree forever (or until // the next timed rebuild). putting it here seems to fix that. if ( firstIp == m_scanningIp ) { m_gotNewDataForScanningIp = m_scanningIp; //log("spider: got new data for %s",iptoa(firstIp)); //return true; } // . this can now be only 0 // . only evalIpLoop() will add a waiting tree key with a non-zero // value after it figures out the EARLIEST time that a // SpiderRequest from this firstIp can be spidered. if ( spiderTimeMS != 0 ) { char *xx=NULL;*xx=0; } // waiting tree might be saving!!! if ( ! m_waitingTree.m_isWritable ) { log("spider: addtowaitingtree: failed. is not writable. 
" "saving?"); return false; } // only if we are the responsible host in the shard if ( ! isAssignedToUs ( firstIp ) ) return false; // . do not add to waiting tree if already in doledb // . an ip should not exist in both doledb and waiting tree. // . waiting tree is meant to be a signal that we need to add // a spiderrequest from that ip into doledb where it can be picked // up for immediate spidering if ( m_doleIpTable.isInTable ( &firstIp ) ) { if ( g_conf.m_logDebugSpider ) log("spider: not adding to waiting tree, already in " "doleip table"); return false; } // sanity check // i think this trigged on gk209 during an auto-save!!! FIX! if ( ! m_waitingTree.m_isWritable ) { char *xx=NULL; *xx=0; } /* /////// // // compute the min time for this entry to satisfy sameIpWait // /////// long long spiderTimeMS = spiderTimeMSArg; long long lastDownloadTimeMS = lastDownloadTime ( firstIp ); // how long to wait between downloads from same ip in milliseconds? long sameIpWaitTime = 250; // ms if ( ufn >= 0 ) { long siwt = m_sc->m_cr->m_spiderIpWaits[ufn]; if ( siwt >= 0 ) sameIpWaitTime = siwt; } long long minDownloadTime = sameIpWaitTime + siwt; // use that if it is more restrictive if ( minDownloadTime > now && minDownloadTime > spiderTimeMS ) spiderTimeMS = minDownloadTime; */ // see if in tree already, so we can delete it and replace it below long ws = m_waitingTable.getSlot ( &firstIp ) ; // . this is >= 0 if already in tree // . if spiderTimeMS is a sooner time than what this firstIp already // has as its earliest time, then we will override it and have to // update both m_waitingTree and m_waitingTable, however // IF the spiderTimeMS is a later time, then we bail without doing // anything at this point. if ( ws >= 0 ) { // get timems from waiting table long long sms = m_waitingTable.getScore64FromSlot(ws); // get current time //long long nowMS = gettimeofdayInMillisecondsGlobal(); // make the key then key_t wk = makeWaitingTreeKey ( sms, firstIp ); // must be there long tn = m_waitingTree.getNode ( (collnum_t)0, (char *)&wk ); // sanity check. ensure waitingTable and waitingTree in sync if ( tn < 0 ) { char *xx=NULL;*xx=0; } // not only must we be a sooner time, but we must be 5-seconds // sooner than the time currently in there to avoid thrashing // when we had a ton of outlinks with this first ip within an // 5-second interval. // // i'm not so sure what i was doing here before, but i don't // want to starve the spiders, so make this 100ms not 5000ms if ( (long long)spiderTimeMS > sms - 100 ) { if ( g_conf.m_logDebugSpider ) log("spider: skip updating waiting tree"); return false; } // log the replacement if ( g_conf.m_logDebugSpider ) log("spider: replacing waitingtree key " "oldtime=%lu newtime=%lu firstip=%s", (unsigned long)(sms/1000LL), (unsigned long)(spiderTimeMS/1000LL), iptoa(firstIp)); // remove from tree so we can add it below m_waitingTree.deleteNode ( tn , false ); //log("spider: 4 del node %li for %s",tn,iptoa(firstIp)); } else { char *s=""; // time of 0 means we got the reply for something we spidered // in doledb so we will need to recompute the best spider // requests for this first ip if ( spiderTimeMS==0 ) s = "(replyreset)"; // log the replacement if ( g_conf.m_logDebugSpcache ) log("spider: adding new key to waitingtree " "newtime=%lu%s firstip=%s", (unsigned long)(spiderTimeMS/1000LL),s, iptoa(firstIp)); } // make the key key_t wk = makeWaitingTreeKey ( spiderTimeMS, firstIp ); // what is this? if ( firstIp == 0 || firstIp == -1 ) { log("spider: got ip of %s. wtf? 
failed to add to " "waiting tree, but return true anyway.",iptoa(firstIp) ); // don't return true lest m_nextKey2 never gets updated // and we end up in an infinite loop doing // populateWaitingTreeFromSpiderdb() return true; //return false; char *xx=NULL; *xx=0; } // grow the tree if too small! long used = m_waitingTree.getNumUsedNodes(); long max = m_waitingTree.getNumTotalNodes(); if ( used + 1 > max ) { long more = (((long long)used) * 15) / 10; if ( more < 10 ) more = 10; if ( more > 100000 ) more = 100000; long newNum = max + more; log("spider: growing waiting tree to from %li to %li nodes " "for collnum %li", max , newNum , (long)m_collnum ); if ( ! m_waitingTree.growTree ( newNum , MAX_NICENESS ) ) return log("spider: failed to grow waiting tree to " "add firstip %s",iptoa(firstIp) ); if ( ! m_waitingTable.setTableSize ( newNum , NULL , 0 ) ) return log("spider: failed to grow waiting table to " "add firstip %s",iptoa(firstIp) ); } // add that long wn; if ( ( wn = m_waitingTree.addKey ( &wk ) ) < 0 ) { log("spider: waitingtree add failed ip=%s. increase max nodes " "lest we lose this IP forever. err=%s", iptoa(firstIp),mstrerror(g_errno)); //char *xx=NULL; *xx=0; return false; } // note it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: added time=%lli ip=%s to waiting tree " "scan=%li node=%li", spiderTimeMS , iptoa(firstIp),(long)callForScan,wn); // add to table now since its in the tree if ( ! m_waitingTable.addKey ( &firstIp , &spiderTimeMS ) ) { // remove from tree then m_waitingTree.deleteNode ( wn , false ); //log("spider: 5 del node %li for %s",wn,iptoa(firstIp)); return false; } // . kick off a scan, i don't care if this blocks or not! // . the populatedoledb loop might already have a scan in progress // but usually it won't, so rather than wait for its sleepwrapper // to be called we force it here for speed. // . re-entry is false because we are entering for the first time // . calling this everytime msg4 adds a spider request is super slow!!! // SO TAKE THIS OUT FOR NOW // . no that was not it. mdw. put it back. if ( callForScan ) populateDoledbFromWaitingTree ( ); // tell caller there was no error return true; } // . this scan is started anytime we call addSpiderRequest() or addSpiderReply // . if nothing is in tree it quickly exits // . otherwise it scan the entries in the tree // . each entry is a key with spiderTime/firstIp // . if spiderTime > now it stops the scan // . if the firstIp is already in doledb (m_doleIpTable) then it removes // it from the waitingtree and waitingtable. how did that happen? // . otherwise, it looks up that firstIp in spiderdb to get a list of all // the spiderdb recs from that firstIp // . then it selects the "best" one and adds it to doledb. once added to // doledb it adds it to doleIpTable, and remove from waitingtree and // waitingtable // . returns false if blocked, true otherwise long SpiderColl::getNextIpFromWaitingTree ( ) { // if nothing to scan, bail if ( m_waitingTree.isEmpty() ) return 0; // reset first key to get first rec in waiting tree m_waitingTreeKey.setMin(); // current time on host #0 uint64_t nowMS = gettimeofdayInMillisecondsGlobal(); top: // we might have deleted the only node below... if ( m_waitingTree.isEmpty() ) return 0; // advance to next //m_waitingTreeKey += 1LL; // assume none long firstIp = 0; // set node from wait tree key. 
this way we can resume from a prev key long node = m_waitingTree.getNextNode ( 0, (char *)&m_waitingTreeKey ); // if empty, stop if ( node < 0 ) return 0; // breathe QUICKPOLL(MAX_NICENESS); // get the key key_t *k = (key_t *)m_waitingTree.getKey ( node ); // ok, we got one firstIp = (k->n0) & 0xffffffff; // sometimes we take over for a dead host, but if he's no longer // dead then we can remove his keys. but first make sure we have had // at least one ping from him so we do not remove at startup. // if it is in doledb or in the middle of being added to doledb // via msg4, nuke it as well! if ( ! isAssignedToUs (firstIp) || m_doleIpTable.isInTable(&firstIp)) { // only delete if this host is alive and has sent us a ping // before so we know he was up at one time. this way we do not // remove all his keys just because we restarted and think he // is alive even though we have gotten no ping from him. //if ( hp->m_numPingRequests > 0 ) // these operations should fail if writes have been disabled // and becase the trees/tables for spidercache are saving // in Process.cpp's g_spiderCache::save() call m_waitingTree.deleteNode ( node , true ); //log("spdr: 8 del node node %li for %s",node,iptoa(firstIp)); // note it if ( g_conf.m_logDebugSpider ) log(LOG_DEBUG,"spider: removed1 ip=%s from waiting " "tree. nn=%li", iptoa(firstIp),m_waitingTree.m_numUsedNodes); // log it if ( g_conf.m_logDebugSpcache ) log("spider: erasing waitingtree key firstip=%s", iptoa(firstIp) ); // remove from table too! m_waitingTable.removeKey ( &firstIp ); goto top; } // spider time is up top uint64_t spiderTimeMS = (k->n1); spiderTimeMS <<= 32; spiderTimeMS |= ((k->n0) >> 32); // stop if need to wait for this one if ( spiderTimeMS > nowMS ) return 0; // sanity if ( (long long)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; } // save key for deleting when done m_waitingTreeKey.n1 = k->n1; m_waitingTreeKey.n0 = k->n0; m_waitingTreeKeyValid = true; m_scanningIp = firstIp; // sanity if ( firstIp == 0 || firstIp == -1 ) { char *xx=NULL;*xx=0; } // we set this to true when done //m_isReadDone = false; // compute the best request from spiderdb list, not valid yet //m_bestRequestValid = false; m_lastReplyValid = false; // start reading spiderdb here m_nextKey = g_spiderdb.makeFirstKey(firstIp); m_endKey = g_spiderdb.makeLastKey (firstIp); // all done return firstIp; } static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) { SpiderColl *THIS = (SpiderColl *)state; // did our collection rec get deleted? since we were doing a read // the SpiderColl will have been preserved in that case but its // m_deleteMyself flag will have been set. if ( tryToDeleteSpiderColl ( THIS ) ) return; THIS->populateWaitingTreeFromSpiderdb ( true ); } ////////////////// ////////////////// // // THE BACKGROUND FUNCTION // // when the user changes the ufn table the waiting tree is flushed // and repopulated from spiderdb with this. also used for repairs. // ////////////////// ////////////////// // . this stores an ip into the waiting tree with a spidertime of "0" so // it will be evaluate properly by populateDoledbFromWaitingTree() // // . scan spiderdb to make sure each firstip represented in spiderdb is // in the waiting tree. it seems they fall out over time. we need to fix // that but in the meantime this should do a bg repair. and is nice to have // . the waiting tree key is reall just a spidertime and a firstip. 
so we will // still need populatedoledbfromwaitingtree to periodically scan firstips // that are already in doledb to see if it has a higher-priority request // for that firstip. in which case it can add that to doledb too, but then // we have to be sure to only grant one lock for a firstip to avoid hammering // that firstip // . this should be called from a sleepwrapper, the same sleep wrapper we // call populateDoledbFromWaitingTree() from should be fine void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) { // skip if in repair mode if ( g_repairMode ) return; // skip if spiders off if ( ! m_cr->m_spideringEnabled ) return; // if entering for the first time, we need to read list from spiderdb if ( ! reentry ) { // just return if we should not be doing this yet if ( ! m_waitingTreeNeedsRebuild ) return; // a double call? can happen if list read is slow... if ( m_gettingList2 ) return; // . borrow a msg5 // . if none available just return, we will be called again // by the sleep/timer function // . read in a replacement SpiderRequest to add to doledb from // this ip // . get the list of spiderdb records // . do not include cache, those results are old and will mess // us up log(LOG_DEBUG,"spider: populateWaitingTree: " "calling msg5: startKey=0x%llx,0x%llx " "firstip=%s", m_nextKey2.n1,m_nextKey2.n0, iptoa(g_spiderdb.getFirstIp(&m_nextKey2))); // flag it m_gettingList2 = true; // make state //long state2 = (long)m_cr->m_collnum; // read the list from local disk if ( ! m_msg5b.getList ( RDB_SPIDERDB , m_cr->m_collnum, &m_list2 , &m_nextKey2 , &m_endKey2 , SR_READ_SIZE , // minRecSizes (512k) true , // includeTree false , // addToCache 0 , // max cache age 0 , // startFileNum -1 , // numFiles (all) this,//(void *)state2,//this//state gotSpiderdbListWrapper2 , MAX_NICENESS , // niceness true )) // do error correct? // return if blocked return; } // show list stats if ( g_conf.m_logDebugSpider ) log("spider: populateWaitingTree: got list of size %li", m_list2.m_listSize); // unflag it m_gettingList2 = false; // stop if we are done //if ( m_isReadDone2 ) return; // if waitingtree is locked for writing because it is saving or // writes were disabled then just bail and let the scan be re-called // later RdbTree *wt = &m_waitingTree; if ( wt->m_isSaving || ! wt->m_isWritable ) return; // shortcut RdbList *list = &m_list2; // ensure we point to the top of the list list->resetListPtr(); // bail on error if ( g_errno ) { log("spider: Had error getting list of urls " "from spiderdb2: %s.",mstrerror(g_errno)); //m_isReadDone2 = true; return; } long lastOne = 0; // loop over all serialized spiderdb records in the list for ( ; ! list->isExhausted() ; ) { // breathe QUICKPOLL ( MAX_NICENESS ); // get spiderdb rec in its serialized form char *rec = list->getCurrentRec(); // skip to next guy list->skipCurrentRecord(); // negative? wtf? if ( (rec[0] & 0x01) == 0x00 ) { //logf(LOG_DEBUG,"spider: got negative spider rec"); continue; } // if its a SpiderReply skip it if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec)) continue; // cast it SpiderRequest *sreq = (SpiderRequest *)rec; // get first ip long firstIp = sreq->m_firstIp; // if same as last, skip it if ( firstIp == lastOne ) continue; // set this lastOne for speed lastOne = firstIp; // check for dmoz. 
set up gdb on gk157/gk221 to break here // so we can see what's going on //if ( firstIp == -815809331 ) // log("got dmoz"); // if firstip already in waiting tree, skip it if ( m_waitingTable.isInTable ( &firstIp ) ) continue; // skip if only our twin should add it to waitingtree/doledb if ( ! isAssignedToUs ( firstIp ) ) continue; // skip if ip already represented in doledb i guess otehrwise // the populatedoledb scan will nuke it!! if ( m_doleIpTable.isInTable ( &firstIp ) ) continue; // not currently spidering either. when they got their // lock they called confirmLockAcquisition() which will // have added an entry to the waiting table. sometimes the // lock still exists but the spider is done. because the // lock persists for 5 seconds afterwards in case there was // a lock request for that url in progress, so it will be // denied. // . this is starving other collections , should be // added to waiting tree anyway! otherwise it won't get // added!!! // . so now i made this collection specific, not global if ( g_spiderLoop.getNumSpidersOutPerIp (firstIp,m_collnum)>0) continue; // otherwise, we want to add it with 0 time so the doledb // scan will evaluate it properly // this will return false if we are saving the tree i guess if ( ! addToWaitingTree ( 0 , firstIp , false ) ) { log("spider: failed to add ip %s to waiting tree. " "ip will not get spidered then and our " "population of waiting tree will repeat until " "this add happens." , iptoa(firstIp) ); return; } // count it m_numAdded++; // ignore errors for this g_errno = 0; } // are we the final list in the scan? bool shortRead = ( list->getListSize() <= 0);// (long)SR_READ_SIZE ) ; m_numBytesScanned += list->getListSize(); // reset? still left over from our first scan? if ( m_lastPrintCount > m_numBytesScanned ) m_lastPrintCount = 0; // announce every 100MB maybe if ( m_numBytesScanned - m_lastPrintCount > 100000000 ) { log("spider: %llu spiderdb bytes scanned for waiting tree " "re-population",m_numBytesScanned); m_lastPrintCount = m_numBytesScanned; } // debug info log(LOG_DEBUG,"spider: Read2 %li spiderdb bytes.",list->getListSize()); // reset any errno cuz we're just a cache g_errno = 0; // if not done, keep going if ( ! shortRead ) { // . inc it here // . it can also be reset on a collection rec update key128_t endKey = *(key128_t *)list->getLastKey(); m_nextKey2 = endKey; m_nextKey2 += (unsigned long) 1; // watch out for wrap around if ( m_nextKey2 < endKey ) shortRead = true; } if ( shortRead ) { // mark when the scan completed so we can do another one // like 24 hrs from that... m_lastScanTime = getTimeLocal(); // log it if ( m_numAdded ) log("spider: added %li recs to waiting tree from " "scan of %lli bytes coll=%s", m_numAdded,m_numBytesScanned, m_cr->m_coll); // reset the count for next scan m_numAdded = 0 ; m_numBytesScanned = 0; // reset for next scan m_nextKey2.setMin(); // no longer need rebuild m_waitingTreeNeedsRebuild = false; // note it log("spider: rebuild complete for %s",m_coll); } // free list to save memory list->freeList(); // wait for sleepwrapper to call us again with our updated m_nextKey2 return; } //static bool s_ufnTreeSet = false; //static RdbTree s_ufnTree; //static time_t s_lastUfnTreeFlushTime = 0; ////////////////////////// ////////////////////////// // // The first KEYSTONE function. // // CALL THIS ANYTIME to load up doledb from waiting tree entries // // This is a key function. 
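//
// rough pseudocode of the loop (comments only; it omits the saving/repair
// checks and the page-count pass the real code handles):
//
//   if ( m_isPopulating ) return;            // only one scan at a time
//   m_isPopulating = true;
//   loop:
//     long ip = getNextIpFromWaitingTree();  // 0 = nothing due yet
//     if ( ip == 0 ) { m_isPopulating = false; return; }
//     m_nextKey = g_spiderdb.makeFirstKey ( ip );  // clamp the spiderdb
//     m_endKey  = g_spiderdb.makeLastKey  ( ip );  // scan to this firstIp
//     if ( ! evalIpLoop() ) return;          // blocked on a disk read
//     goto loop;
//
// evalIpLoop() reads the spiderdb recs for that ip, scanListForWinners()
// picks the best request(s) and addWinnersIntoDoledb() moves them into
// doledb.
//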
// // It is called from two places: // // 1) sleep callback // // 2) addToWaitingTree() // is called from addSpiderRequest() anytime a SpiderRequest // is added to spiderdb (or from addSpiderReply()) // // It can only be entered once so will just return if already scanning // spiderdb. // ////////////////////////// ////////////////////////// // . for each IP in the waiting tree, scan all its SpiderRequests and determine // which one should be the next to be spidered. and put that one in doledb. // . we call this a lot, like if the admin changes the url filters table // we have to re-scan all of spiderdb basically and re-do doledb void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) { // only one loop can run at a time! //if ( ! reentry && m_isPopulating ) return; if ( m_isPopulating ) return; // skip if in repair mode if ( g_repairMode ) return; // try skipping!!!!!!!!!!! // yeah, this makes us scream. in addition to calling // Doledb::m_rdb::addRecord() below // WE NEED THIS TO REPOPULATE DOLEDB THOUGH!!! //return; //if ( g_conf.m_logDebugSpider ) // log("spider: in populatedoledbfromwaitingtree " // "numUsedNodes=%li", // m_waitingTree.m_numUsedNodes); // set this flag so we are not re-entered m_isPopulating = true; loop: // if waiting tree is being saved, we can't write to it // so in that case, bail and wait to be called another time RdbTree *wt = &m_waitingTree; if( wt->m_isSaving || ! wt->m_isWritable ) { m_isPopulating = false; return; } // . get next IP that is due to be spidered from // . also sets m_waitingTreeKey so we can delete it easily! long ip = getNextIpFromWaitingTree(); // . return if none. all done. unset populating flag. // . it returns 0 if the next firstip has a spidertime in the future if ( ip == 0 ) { m_isPopulating = false; return; } // set read range for scanning spiderdb m_nextKey = g_spiderdb.makeFirstKey(ip); m_endKey = g_spiderdb.makeLastKey (ip); ////// // // do TWO PASSES, one to count pages, the other to get the best url!! // ////// // assume we don't have to do two passes m_countingPagesIndexed = false; // get the collectionrec CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); // but if we have quota based url filters we do have to count if ( cr && cr->m_urlFiltersHavePageCounts ) { // tell evalIpLoop() to count first m_countingPagesIndexed = true; // reset this stuff used for counting UNIQUE votes m_lastReqUh48a = 0LL; m_lastReqUh48b = 0LL; m_lastRepUh48 = 0LL; // and setup the LOCAL counting table if not initialized if ( m_localTable.m_ks == 0 ) m_localTable.set (4,4,0,NULL,0,false,0,"ltpct" ); // otherwise, just reset it so we can repopulate it else m_localTable.reset(); } // debug output if ( g_conf.m_logDebugSpider ) log(LOG_DEBUG,"spider: evalIpLoop: waitingtree nextip=%s " "numUsedNodes=%li",iptoa(ip),m_waitingTree.m_numUsedNodes); /* // assume using tree m_useTree = true; // . flush the tree every 12 hours // . i guess we could add incoming requests to the ufntree if // they strictly beat the ufn tree tail node, HOWEVER, we // still have the problem of that if a url we spidered is due // to be respidered very soon we will miss it, as only the reply // is added back into spiderdb, not a new request. long nowLocal = getTimeLocal(); // make it one hour so we don't cock-block a new high priority // request that just got added... crap, what if its an addurl // or something like that???? 
if ( nowLocal - s_lastUfnTreeFlushTime > 3600 ) { s_ufnTree.clear(); s_lastUfnTreeFlushTime = nowLocal; } long long uh48; // // s_ufnTree tries to cache the top X spiderrequests for an IP // that should be spidered next so we do not have to scan like // a million spiderrequests in spiderdb to find the best one. // // if we have a specific uh48 targetted in s_ufnTree then that // saves a ton of time! // key format for s_ufnTree: // iiiiiiii iiiiiiii iiiiiii iiiiiii i = firstip // PPPPPPPP tttttttt ttttttt ttttttt P = priority // tttttttt tttttttt hhhhhhh hhhhhhh t = spiderTimeMS (40 bits) // hhhhhhhh hhhhhhhh hhhhhhh hhhhhhh h = urlhash48 key128_t key; key.n1 = ip; key.n1 <<= 32; key.n0 = 0LL; long node = s_ufnTree.getNextNode(0,(char *)&key); // cancel node if not from our ip if ( node >= 0 ) { key128_t *rk = (key128_t *)s_ufnTree.getKey ( node ); if ( (rk->n1 >> 32) != (unsigned long)ip ) node = -1; } if ( node >= 0 ) { // get the key key128_t *nk = (key128_t *)s_ufnTree.getKey ( node ); // parse out uh48 uh48 = nk->n0; // mask out spidertimems uh48 &= 0x0000ffffffffffffLL; // use that to refine the key range immensley! m_nextKey = g_spiderdb.makeFirstKey2 (ip, uh48); m_endKey = g_spiderdb.makeLastKey2 (ip, uh48); // do not add the recs to the tree! m_useTree = false; } */ // turn this off until we figure out why it sux m_useTree = false; // so we know if we are the first read or not... m_firstKey = m_nextKey; // . initialize this before scanning the spiderdb recs of an ip // . it lets us know if we recvd new spider requests for m_scanningIp // while we were doing the scan m_gotNewDataForScanningIp = 0; m_lastListSize = -1; // let evalIpLoop() know it has not yet tried to read from spiderdb m_didRead = false; // reset this long maxWinners = (long)MAX_WINNER_NODES; if ( m_winnerTree.m_numNodes == 0 && ! m_winnerTree.set ( -1 , // fixeddatasize maxWinners , // maxnumnodes true , // balance? maxWinners * MAX_BEST_REQUEST_SIZE, // memmax true, // owndata? "wintree", // allocname false, // datainptrs? NULL, // dbname sizeof(key192_t), // keysize false, // useprotection? false, // allowdups? -1 ) ) { // rdbid m_isPopulating = false; log("spider: winntree set: %s",mstrerror(g_errno)); return; } if ( ! m_winnerTable.isInitialized() && ! m_winnerTable.set ( 8 , // uh48 is key sizeof(key192_t) , // winnertree key is data 64 , // 64 slots initially NULL , 0 , false , // allow dups? MAX_NICENESS , "wtdedup" ) ) { m_isPopulating = false; log("spider: wintable set: %s",mstrerror(g_errno)); return; } // clear it before evaluating this ip so it is empty m_winnerTree.clear(); // and table as well now m_winnerTable.clear(); // reset this as well m_minFutureTimeMS = 0LL; m_totalBytesScanned = 0LL; // . look up in spiderdb otherwise and add best req to doledb from ip // . if it blocks ultimately it calls gotSpiderdbListWrapper() which // calls this function again with re-entry set to true if ( ! evalIpLoop ( ) ) return ; // oom error? i've seen this happen and we end up locking up! if ( g_errno ) { log("spider: evalIpLoop: %s",mstrerror(g_errno)); m_isPopulating = false; return; } // try more goto loop; } // replace this func with the one above... static void doledWrapper ( void *state ) { SpiderColl *THIS = (SpiderColl *)state; // msg4 is available again THIS->m_msg1Avail = true; if ( g_errno ) log("spider: msg1 addlist had error: %s. need to " "somehow reduce doleiptable score now...", mstrerror(g_errno)); // no longer populating doledb. 
	// we also set to false in gotSpiderListWrapper
	//THIS->m_isPopulating = false;
	long long now = gettimeofdayInMilliseconds();
	long long diff = now - THIS->m_msg4Start;
	// we add recs to doledb using msg1 to keep things fast because
	// msg4 has a delay of 500ms in it. but even then, msg1 can take
	// 6ms or more just because of load issues.
	if ( diff > 10 )
		log("spider: adding to doledb took %llims",diff);
	// we are done!! that was the final step...
	THIS->m_isPopulating = false;
	// did collection get nuked while we were waiting for msg1 reply?
	if ( tryToDeleteSpiderColl ( THIS ) ) return;
	// . we added a rec to doledb for the firstIp in m_waitingTreeKey, so
	//   now go to the next node in the wait tree.
	// . it will get the next key after m_waitingTreeKey
	// . re-entry is true because we just got the msg4 reply
	THIS->populateDoledbFromWaitingTree ( );
}

key128_t makeUfnTreeKey ( long firstIp ,
			  long priority ,
			  long long spiderTimeMS ,
			  long long uh48 ) {
	// sanity check, do not allow negative priorities for now
	if ( priority < 0   ) { char *xx=NULL;*xx=0; }
	if ( priority > 255 ) { char *xx=NULL;*xx=0; }
	key128_t key;
	key.n1 = (unsigned long)firstIp;
	// all of priority (COMPLEMENTED!)
	key.n1 <<= 8;
	key.n1 |= (unsigned char)(255-priority);
	// top 3 bytes of spiderTimeMS (5 bytes total)
	key.n1 <<= 24;
	key.n1 |= ((spiderTimeMS >> 16) & 0x00ffffff);
	// remaining 2 bytes of spiderTimeMS goes in key.n0
	key.n0 = (spiderTimeMS & 0xffff);
	// 6 bytes uh48
	key.n0 <<= 48;
	key.n0 |= uh48;
	return key;
}

////////
//
// winner tree key. holds the top/best spider requests for a firstIp
// for spidering purposes.
//
////////
// key bitmap (192 bits):
//
// ffffffff ffffffff ffffffff ffffffff  f=firstIp
// pppppppp pppppppp HHHHHHHH HHHHHHHH  p=priority H=hopcount
// tttttttt tttttttt tttttttt tttttttt  t=spiderTimeMS
// tttttttt tttttttt tttttttt tttttttt  h=urlHash48
// hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh
// hhhhhhhh hhhhhhhh 00000000 00000000
key192_t makeWinnerTreeKey ( long firstIp ,
			     long priority ,
			     long hopCount,
			     long long spiderTimeMS ,
			     long long uh48 ) {
	key192_t k;
	k.n2 = firstIp;
	k.n2 <<= 16;
	k.n2 |= (255-priority);
	k.n2 <<= 16;
	// query reindex is still using hopcount -1...
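	// note on the (255-priority) above: like makeUfnTreeKey(), the
	// priority is stored complemented, presumably so that a walk of the
	// winner tree in ascending key order yields higher-priority requests
	// first for the same firstIp. e.g. priority 80 is stored as 175 and
	// priority 10 as 245, so the priority-80 request sorts ahead of the
	// priority-10 one. the lines below then normalize hopCount before it
	// is OR'd into the same word.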
if ( hopCount == -1 ) hopCount = 0; if ( hopCount < 0 ) { char *xx=NULL;*xx=0; } if ( hopCount > 0xffff ) hopCount = 0xffff; k.n2 |= hopCount; k.n1 = spiderTimeMS; k.n0 = uh48; k.n0 <<= 16; return k; } void parseWinnerTreeKey ( key192_t *k , long *firstIp , long *priority , long *hopCount, long long *spiderTimeMS , long long *uh48 ) { *firstIp = (k->n2) >> 32; *priority = 255 - ((k->n2 >> 16) & 0xffff); *hopCount = (k->n2 & 0xffff); *spiderTimeMS = k->n1; *uh48 = (k->n0 >> 16); } void testWinnerTreeKey ( ) { long firstIp = 1234567; long priority = 123; long long spiderTimeMS = 456789123LL; long long uh48 = 987654321888LL; long hc = 4321; key192_t k = makeWinnerTreeKey (firstIp,priority,hc,spiderTimeMS,uh48); long firstIp2; long priority2; long long spiderTimeMS2; long long uh482; long hc2; parseWinnerTreeKey(&k,&firstIp2,&priority2,&hc2,&spiderTimeMS2,&uh482); if ( firstIp != firstIp2 ) { char *xx=NULL;*xx=0; } if ( priority != priority2 ) { char *xx=NULL;*xx=0; } if ( spiderTimeMS != spiderTimeMS2 ) { char *xx=NULL;*xx=0; } if ( uh48 != uh482 ) { char *xx=NULL;*xx=0; } if ( hc != hc2 ) { char *xx=NULL;*xx=0; } } void removeExpiredLocks ( long hostId ); static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){ SpiderColl *THIS = (SpiderColl *)state; // prevent a core THIS->m_gettingList1 = false; // return if that blocked if ( ! THIS->evalIpLoop() ) return; // we are done, re-entry popuatedoledb THIS->m_isPopulating = false; // gotta set m_isPopulating to false lest it won't work THIS->populateDoledbFromWaitingTree ( ); } /////////////////// // // KEYSTONE FUNCTION // // . READ ALL spiderdb recs for IP of m_scanningIp // . add winner to doledb // . called ONLY by populateDoledbFromWaitingTree() // // . continually scan spiderdb requests for a particular ip, m_scanningIp // . compute the best spider request to spider next // . add it to doledb // . getNextIpFromWaitingTree() must have been called to set m_scanningIp // otherwise m_bestRequestValid might not have been reset to false // /////////////////// bool SpiderColl::evalIpLoop ( ) { //testWinnerTreeKey ( ); // sanity if ( m_scanningIp == 0 || m_scanningIp == -1 ) { char *xx=NULL;*xx=0;} top: // did our collection rec get deleted? since we were doing a read // the SpiderColl will have been preserved in that case but its // m_deleteMyself flag will have been set. if ( tryToDeleteSpiderColl ( this ) ) return false; // if first time here, let's do a read first if ( ! m_didRead ) { // reset list size to 0 m_list.reset(); // assume we did a read now m_didRead = true; // reset some stuff m_lastScanningIp = 0; // do a read. if it blocks it will recall this loop if ( ! readListFromSpiderdb () ) return false; } loop: // did our collection rec get deleted? since we were doing a read // the SpiderColl will have been preserved in that case but its // m_deleteMyself flag will have been set. if ( tryToDeleteSpiderColl ( this ) ) // pretend to block since we got deleted!!! return false; // . did reading the list from spiderdb have an error? // . i guess we don't add to doledb then if ( g_errno ) { log("spider: Had error getting list of urls " "from spiderdb: %s.",mstrerror(g_errno)); // save mem m_list.freeList(); //m_isReadDone = true; return true; } // if we started reading, then assume we got a fresh list here if ( g_conf.m_logDebugSpider ) log("spider: back from msg5 spiderdb read2"); // . set the winning request for all lists we read so far // . 
if m_countingPagesIndexed is true this will just fill in // quota info into m_localTable... scanListForWinners(); // if list not empty, keep reading! if ( ! m_list.isEmpty() ) { // update m_nextKey for successive reads of spiderdb by // calling readListFromSpiderdb() key128_t endKey = *(key128_t *)m_list.getLastKey(); // sanity //if ( endKey != finalKey ) { char *xx=NULL;*xx=0; } m_nextKey = endKey; m_nextKey += (unsigned long) 1; // . watch out for wrap around // . normally i would go by this to indicate that we are // done reading, but there's some bugs... so we go // by whether our list is empty or not for now if ( m_nextKey < endKey ) m_nextKey = endKey; // reset list to save mem m_list.reset(); // read more! return if it blocked if ( ! readListFromSpiderdb() ) return false; // we got a list without blocking goto loop; } // . we are all done if last list read was empty // . if we were just counting pages for quota, do a 2nd pass! if ( m_countingPagesIndexed ) { // do not do again. m_countingPagesIndexed = false; // start at the top again m_nextKey = g_spiderdb.makeFirstKey(m_scanningIp); // this time m_localTable should have the quota info in it so // getUrlFilterNum() can use that m_didRead = false; // do the 2nd pass. read list from the very top. goto top; } // free list to save memory m_list.freeList(); // . add all winners if we can in m_winnerTree into doledb // . if list was empty, then reading is all done so take the winner we // got from all the lists we did read for this IP and add him // to doledb // . if no winner exists, then remove m_scanningIp from m_waitingTree // so we do not waste our time again. if url filters change then // waiting tree will be rebuilt and we'll try again... or if // a new spider request or reply for this ip comes in we'll try // again as well... // . this returns false if blocked adding to doledb using msg1 if ( ! addWinnersIntoDoledb() ) return false; // . do more from tree // . re-entry is true because we just got the msg5 reply // . don't do this because populateDoledb calls us in a loop // and we call it from all our callbacks if we blocked... //populateDoledbFromWaitingTree ( true ); // we are done... return true; } // . this is ONLY CALLED from evalIpLoop() above // . returns false if blocked, true otherwise // . returns true and sets g_errno on error bool SpiderColl::readListFromSpiderdb ( ) { if ( ! m_waitingTreeKeyValid ) { char *xx=NULL;*xx=0; } if ( ! m_scanningIp ) { char *xx=NULL;*xx=0; } CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); if ( ! cr ) { log("spider: lost collnum %li",(long)m_collnum); g_errno = ENOCOLLREC; return true; } // i guess we are always restricted to an ip, because // populateWaitingTreeFromSpiderdb calls its own msg5. long firstIp0 = g_spiderdb.getFirstIp(&m_nextKey); // sanity if ( m_scanningIp != firstIp0 ) { char *xx=NULL;*xx=0; } // sometimes we already have this ip in doledb/doleiptable // already and somehow we try to scan spiderdb for it anyway if ( m_doleIpTable.isInTable ( &firstIp0 ) ) { char *xx=NULL;*xx=0;} // if it got zapped from the waiting tree by the time we read the list if ( ! m_waitingTable.isInTable ( &m_scanningIp ) ) return true; // sanity check long wn = m_waitingTree.getNode(0,(char *)&m_waitingTreeKey); // it gets removed because addSpiderReply() calls addToWaitingTree // and replaces the node we are scanning with one that has a better // time, an earlier time, even though that time may have come and // we are scanning it now. 
perhaps addToWaitingTree() should ignore // the ip if it equals m_scanningIp? if ( wn < 0 ) { log("spider: waiting tree key removed while reading list " "for %s (%li)", cr->m_coll,(long)m_collnum); return true; } // sanity. if first time, this must be invalid //if ( needList && m_nextKey == m_firstKey && m_bestRequestValid ) { // char *xx=NULL; *xx=0 ; } // . if the scanning ip has too many outstanding spiders // . looks a UrlLock::m_firstIp and UrlLock::m_isSpiderOutstanding // since the lock lives for 5 seconds after the spider reply // comes back. // . when the spiderReply comes back that will re-add a "0" entry // to the waiting tree. // . PROBLEM: some spiders don't seem to add a spiderReply!! wtf??? // they end up having their locks timeout after like 3 hrs? // . maybe just do not add to waiting tree in confirmLockAcquisition() // handler in such cases? YEAH.. try that //long numOutPerIp = getOustandingSpidersPerIp ( firstIp ); //if ( numOutPerIp > maxSpidersPerIp ) { // // remove from the tree and table // removeFromWaitingTree ( firstIp ); // return true; //} // readLoop: // if we re-entered from the read wrapper, jump down //if ( needList ) { // sanity check if ( m_gettingList1 ) { char *xx=NULL;*xx=0; } // . read in a replacement SpiderRequest to add to doledb from // this ip // . get the list of spiderdb records // . do not include cache, those results are old and will mess // us up if (g_conf.m_logDebugSpider ) { // got print each out individually because KEYSTR // uses a static buffer to store the string SafeBuf tmp; tmp.safePrintf("spider: readListFromSpiderdb: " "calling msg5: "); tmp.safePrintf("firstKey=%s " ,KEYSTR(&m_firstKey,sizeof(key128_t))); tmp.safePrintf("endKey=%s " ,KEYSTR(&m_endKey,sizeof(key128_t))); tmp.safePrintf("nextKey=%s " ,KEYSTR(&m_nextKey,sizeof(key128_t))); tmp.safePrintf("firstip=%s" ,iptoa(m_scanningIp)); log(LOG_DEBUG,"%s",tmp.getBufStart()); } // log this better if ( g_conf.m_logDebugSpider ) log("spider: readListFromSpiderdb: firstip=%s key=%s" ,iptoa(m_scanningIp) ,KEYSTR(&m_nextKey,sizeof(key128_t) ) ); // flag it m_gettingList1 = true; // make state //long state2 = (long)m_cr->m_collnum; // . read the list from local disk // . if a niceness 0 intersect thread is taking a LONG time // then this will not complete in a long time and we // end up timing out the round. so try checking for // m_gettingList in spiderDoledUrls() and setting // m_lastSpiderCouldLaunch if ( ! m_msg5.getList ( RDB_SPIDERDB , m_cr->m_collnum , &m_list , &m_nextKey , &m_endKey , SR_READ_SIZE , // minRecSizes (512k) true , // includeTree false , // addToCache 0 , // max cache age 0 , // startFileNum -1 , // numFiles (all) this,//(void *)state2,//this,//state gotSpiderdbListWrapper , MAX_NICENESS , // niceness true )) // do error correct? // return false if blocked return false ; // note its return if ( g_conf.m_logDebugSpider ) log("spider: back from msg5 spiderdb read"); // no longer getting list m_gettingList1 = false; // got it without blocking. maybe all in tree or in cache return true; // unflag it //m_gettingList = false; // stop if we are done //if ( m_isReadDone ) return true; } // . ADDS top X winners to m_winnerTree // . this is ONLY CALLED from evalIpLoop() above // . scan m_list that we read from spiderdb for m_scanningIp IP // . set m_bestRequest if an request in m_list is better than what is // in m_bestRequest from previous lists for this IP bool SpiderColl::scanListForWinners ( ) { // if list is empty why are we here? 
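	// summary sketch (comments only) of the pairing logic in the big
	// loop below: spiderdb recs for one firstIp come back sorted by
	// uh48, so a url's most recent SpiderReply appears to show up just
	// before the SpiderRequest(s) that share its uh48. roughly:
	//
	//   SpiderReply *srep = NULL;
	//   for each rec in m_list :
	//     if rec is a SpiderReply  : keep the most recent one as srep
	//     else (it is a SpiderRequest) :
	//       if ( srep && its uh48 != the request's uh48 ) srep = NULL;
	//       use srep (m_errCount, m_contentHash32, m_spideredTime...) to
	//       filter/score the request, then keep the best in m_winnerTree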
if ( m_list.isEmpty() ) return true; // if waitingtree is locked for writing because it is saving or // writes were disabled then just bail and let the scan be re-called // later // // MDW: move this up in evalIpLoop() i think RdbTree *wt = &m_waitingTree; if ( wt->m_isSaving || ! wt->m_isWritable ) return true; // shortcut RdbList *list = &m_list; // ensure we point to the top of the list list->resetListPtr(); // get this long long nowGlobalMS = gettimeofdayInMillisecondsGlobal();//Local(); uint32_t nowGlobal = nowGlobalMS / 1000; //SpiderRequest *winReq = NULL; //long winPriority = -10; //uint64_t winTimeMS = 0xffffffffffffffffLL; //long winMaxSpidersPerIp = 9999; SpiderReply *srep = NULL; long long srepUh48; // for getting the top MAX_NODES nodes //long tailPriority = -10; //uint64_t tailTimeMS = 0xffffffffffffffffLL; // if we are continuing from another list... if ( m_lastReplyValid ) { srep = (SpiderReply *)m_lastReplyBuf; srepUh48 = srep->getUrlHash48(); } // sanity, if it was in ufntree it should be on disk then... /* if ( list->isEmpty() && m_nextKey == m_firstKey && ! m_useTree ) { SafeBuf sb; sb.safePrintf("startkey=%s,", KEYSTR(&m_nextKey,sizeof(key128_t) )); sb.safePrintf("endkey=%s", KEYSTR(&m_endKey,sizeof(key128_t) )); // get waiting key info long firstIp = m_waitingTreeKey.n0 & 0xffffffff; log("spider: strange corruption #1. there was an entry " "in the waiting tree, but spiderdb read was empty. " "%s. deleting waitingtree key firstip=%s", sb.getBufStart(), iptoa(firstIp)); // delete the exact node # m_waitingTree.deleteNode ( wn , false ); } */ //char *xx=NULL;*xx=0; } /* // use the ufntree? bool useTree = m_useTree; // if we are the first read and list is not full do not bother // using the tree because its just as fast to scan the little list // we got if ( m_nextKey == m_firstKey && list->getListSize() < SR_READ_SIZE ) useTree = false; // init ufn tree if ( useTree && ! s_ufnTreeSet ) { s_ufnTreeSet = true; s_ufnTree.set ( 0 , // fixed data size (uh48) 1000000 , // max num nodes true, // balance? -1 , // maxmem, none false , // own data? "ufntree", false, // data is ptr! (true?) "ufntreedb", sizeof(key128_t), false, false ); } */ // show list stats if ( g_conf.m_logDebugSpider ) log("spider: readListFromSpiderdb: got list of size %li " "for firstip=%s", m_list.m_listSize,iptoa(m_scanningIp)); // if we don't read minRecSizes worth of data that MUST indicate // there is no more data to read. put this theory to the test // before we use it to indcate an end of list condition. if ( list->getListSize() > 0 && m_lastScanningIp == m_scanningIp && m_lastListSize < (long)SR_READ_SIZE && m_lastListSize >= 0 ) { //char *xx=NULL;*xx=0; } log("spider: shucks. spiderdb reads not full."); } m_lastListSize = list->getListSize(); m_lastScanningIp = m_scanningIp; m_totalBytesScanned += list->getListSize(); if ( list->isEmpty() && g_conf.m_logDebugSpider ) log("spider: failed to get rec for ip=%s",iptoa(m_scanningIp)); long firstIp = m_waitingTreeKey.n0 & 0xffffffff; //long numNodes = 0; //long tailNode = -1; key128_t finalKey; // how many spiders currently out for this ip? //longoutNow=g_spiderLoop.getNumSpidersOutPerIp(m_scanningIp,m_collnum) // loop over all serialized spiderdb records in the list for ( ; ! 
list->isExhausted() ; ) { // breathe QUICKPOLL ( MAX_NICENESS ); // stop coring on empty lists if ( list->isEmpty() ) break; // get spiderdb rec in its serialized form char *rec = list->getCurrentRec(); // sanity memcpy ( (char *)&finalKey , rec , sizeof(key128_t) ); // skip to next guy list->skipCurrentRecord(); // negative? wtf? if ( (rec[0] & 0x01) == 0x00 ) { logf(LOG_DEBUG,"spider: got negative spider rec"); continue; } // if its a SpiderReply set it for an upcoming requests if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ) { // see if this is the most recent one SpiderReply *tmp = (SpiderReply *)rec; // if we have a more recent reply already, skip this if ( srep && srep->getUrlHash48() == tmp->getUrlHash48() && srep->m_spideredTime >= tmp->m_spideredTime ) continue; // otherwise, assign it srep = tmp; srepUh48 = srep->getUrlHash48(); continue; } // cast it SpiderRequest *sreq = (SpiderRequest *)rec; // . skip if our twin should add it to doledb // . waiting tree only has firstIps assigned to us so // this should not be necessary //if ( ! isAssignedToUs ( sreq->m_firstIp ) ) continue; // null out srep if no match if ( srep && srepUh48 != sreq->getUrlHash48() ) srep = NULL; // if we are doing parser test, ignore all but initially // injected requests. NEVER DOLE OUT non-injected urls // when doing parser test if ( g_conf.m_testParserEnabled ) { // skip if already did it if ( srep ) continue; // skip if not injected if ( ! sreq->m_isInjecting ) { if ( g_conf.m_logDebugSpider ) log("spider: skipping8 %s", sreq->m_url); continue; } } // . ignore docid-based requests if spidered the url afterwards // . these are one-hit wonders // . once done they can be deleted if ( sreq->m_urlIsDocId && srep && srep->m_spideredTime > sreq->m_addedTime ) { if ( g_conf.m_logDebugSpider ) log("spider: skipping9 %s", sreq->m_url); continue; } // only add firstip if manually added and not fake // // just calculating page counts? if the url filters are based // on the # of pages indexed per ip or subdomain/site then // we have to maintain a page count table. // if ( m_countingPagesIndexed ) { //&& sreq->m_fakeFirstIp ) { // get request url hash48 (jez= 220459274533043 ) long long uh48 = sreq->getUrlHash48(); // do not repeatedly page count if we just have // a single fake firstip request. this just adds // an entry to the table that will end up in // m_pageCountTable so we avoid doing this count // again over and over. also gives url filters // table a zero-entry... //m_localTable.addScore(&sreq->m_firstIp,0); //m_localTable.addScore(&sreq->m_siteHash32,0); //m_localTable.addScore(&sreq->m_domHash32,0); // only add dom/site hash seeds if it is // a fake firstIp to avoid double counting seeds if ( sreq->m_fakeFirstIp ) continue; // count the manual additions separately. mangle their // hash with 0x123456 so they are separate. if ( (sreq->m_isAddUrl || sreq->m_isInjecting) && // unique votes per seed uh48 != m_lastReqUh48a ) { // do not repeat count the same url m_lastReqUh48a = uh48; // sanity if ( ! sreq->m_siteHash32){char*xx=NULL;*xx=0;} if ( ! sreq->m_domHash32){char*xx=NULL;*xx=0;} // do a little magic because we count // seeds as "manual adds" as well as normal pg long h32; h32 = sreq->m_siteHash32 ^ 0x123456; m_localTable.addScore(&h32); h32 = sreq->m_domHash32 ^ 0x123456; m_localTable.addScore(&h32); } // unique votes per other for quota if ( uh48 == m_lastReqUh48b ) continue; // update this to ensure unique voting m_lastReqUh48b = uh48; // now count pages indexed below here if ( ! 
srep ) continue; if ( srepUh48 == m_lastRepUh48 ) continue; m_lastRepUh48 = srepUh48; //if ( ! srep ) continue; // TODO: what if srep->m_isIndexedINValid is set? if ( ! srep->m_isIndexed ) continue; // keep count per site and firstip m_localTable.addScore(&sreq->m_firstIp,1); m_localTable.addScore(&sreq->m_siteHash32,1); m_localTable.addScore(&sreq->m_domHash32,1); continue; } // if the spiderrequest has a fake firstip that means it // was injected without doing a proper ip lookup for speed. // xmldoc.cpp will check for m_fakeFirstIp and if that is // set in the spiderrequest it will simply add a new request // with the correct firstip. it will be a completely different // spiderrequest key then. so no need to keep the "fakes". // it will log the EFAKEFIRSTIP error msg. if ( sreq->m_fakeFirstIp && srep && srep->m_spideredTime > sreq->m_addedTime ) { if ( g_conf.m_logDebugSpider ) log("spider: skipping6 %s", sreq->m_url); continue; } // once we have a spiderreply, even i guess if its an error, // for a url, then bail if respidering is disabled if ( m_cr->m_isCustomCrawl && srep && m_cr->m_collectiveRespiderFrequency <= 0.0 ) { if ( g_conf.m_logDebugSpider ) log("spider: skipping0 %s",sreq->m_url); continue; } // sanity check. check for http(s):// if ( sreq->m_url[0] != 'h' && // might be a docid from a pagereindex.cpp ! is_digit(sreq->m_url[0]) ) { log("spider: got corrupt 1 spiderRequest in scan " "because url is %s",sreq->m_url); continue; } // update SpiderRequest::m_siteNumInlinks to most recent value long sni = sreq->m_siteNumInlinks; // get the # of inlinks to the site from our table uint64_t *val; val = (uint64_t *)m_sniTable.getValue32(sreq->m_siteHash32); // use the most recent sni from this table if ( val ) sni = (long)((*val)>>32); // if SpiderRequest is forced then m_siteHash32 is 0! else if ( srep && srep->m_spideredTime >= sreq->m_addedTime ) sni = srep->m_siteNumInlinks; // assign sreq->m_siteNumInlinks = sni; // store error count in request so xmldoc knows what it is // and can increment it and re-add it to its spiderreply if // it gets another error if ( srep ) { sreq->m_errCount = srep->m_errCount; // . assign this too from latest reply - smart compress // . this WAS SpiderReply::m_pubdate so it might be // set to a non-zero value that is wrong now... but // not a big deal! sreq->m_contentHash32 = srep->m_contentHash32; // if we tried it before sreq->m_hadReply = true; } // this is -1 on corruption if ( srep && srep->m_httpStatus >= 1000 ) { log("spider: got corrupt 3 spiderReply in scan"); srep = NULL; } // bad langid? if ( srep && ! getLanguageAbbr (srep->m_langId) ) { log("spider: got corrupt 4 spiderReply in scan"); srep = NULL; } // . get the url filter we match // . if this is slow see the TODO below in dedupSpiderdbList() // which can pre-store these values assuming url filters do // not change and siteNumInlinks is about the same. long ufn = ::getUrlFilterNum(sreq, srep, nowGlobal, false, MAX_NICENESS, m_cr, false, // isOutlink? // provide the page quota table &m_localTable); // sanity check if ( ufn == -1 ) { log("spider: failed to match url filter for " "url = %s coll=%s", sreq->m_url,m_cr->m_coll); g_errno = EBADENGINEER; return true; } // set the priority (might be the same as old) long priority = m_cr->m_spiderPriorities[ufn]; // sanity checks if ( priority == -1 ) { char *xx=NULL;*xx=0; } if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;} // spiders disabled for this row in url filters? //if ( !
m_cr->m_spidersEnabled[ufn] ) continue; if ( m_cr->m_maxSpidersPerRule[ufn] <= 0 ) continue; // skip if banned if ( priority == SPIDER_PRIORITY_FILTERED ) continue; if ( priority == SPIDER_PRIORITY_BANNED ) continue; long long spiderTimeMS; spiderTimeMS = getSpiderTimeMS ( sreq,ufn,srep,nowGlobalMS ); // how many outstanding spiders on a single IP? //long maxSpidersPerIp = m_cr->m_spiderIpMaxSpiders[ufn]; // sanity if ( (long long)spiderTimeMS < 0 ) { log("spider: got corrupt 2 spiderRequest in scan"); continue; } // save this shit for storing in doledb sreq->m_ufn = ufn; sreq->m_priority = priority; // if it is in future, skip it and just set m_futureTime and // and we will update the waiting tree // with an entry based on that future time if the winnerTree turns // out to be empty after we've completed our scan if ( spiderTimeMS > nowGlobalMS ) { // if futuretime is zero set it to this time if ( ! m_minFutureTimeMS ) m_minFutureTimeMS = spiderTimeMS; // otherwise we get the MIN of all future times else if ( spiderTimeMS < m_minFutureTimeMS ) m_minFutureTimeMS = spiderTimeMS; if ( g_conf.m_logDebugSpider ) log("spider: skippingx %s",sreq->m_url); continue; } ////// // // MDW: no, take this out now that we allow multiple urls // in doledb from this firstip for speeding up the spiders. // the crawldelay will be used by Msg13.cpp when it tries // to download the url. // ////// /* // how many "ready" urls for this IP? urls in doledb // can be spidered right now long *score ; score = (long *)m_doleIpTable.getValue32 ( sreq->m_firstIp ); // how many spiders are current outstanding long out2 = outNow; // add in any requests in doledb if ( score ) out2 += *score; // . do not add any more to doledb if we could violate ourquota // . shit we have to add it our it never gets in // . try regulating in msg13.cpp download code. just queue // up requests to avoid hammering there. if ( out2 >= maxSpidersPerIp ) { if ( g_conf.m_logDebugSpider ) log("spider: skipping1 %s",sreq->m_url); continue; } // by ensuring only one spider out at a time when there // is a positive crawl-delay, we ensure that m_lastDownloadTime // is the last time we downloaded from this ip so that we // can accurately set the time in getSpiderTimeMS() for // when the next url from this firstip should be spidered. if ( out2 >= 1 ) { // get the crawldelay for this domain long *cdp ; cdp = (long *)m_cdTable.getValue (&sreq->m_domHash32); // if crawl delay is NULL, we need to download // robots.txt. most of the time it will be -1 // which indicates not specified in robots.txt if ( ! cdp ) { if ( g_conf.m_logDebugSpider ) log("spider: skipping2 %s", sreq->m_url); continue; } // if we had a positive crawldelay and there is // already >= 1 outstanding spider on this ip, // then skip this url if ( cdp && *cdp > 0 ) { if ( g_conf.m_logDebugSpider ) log("spider: skipping3 %s", sreq->m_url); continue; } } */ // debug. show candidates due to be spidered now. //if(g_conf.m_logDebugSpider ) //&& spiderTimeMS< nowGlobalMS ) // log("spider: considering ip=%s sreq spiderTimeMS=%lli " // "pri=%li uh48=%lli", // iptoa(sreq->m_firstIp), // spiderTimeMS, // priority, // sreq->getUrlHash48()); // we can't have negative priorities at this point because // the s_ufnTree uses priority as part of the key so it // can get the top 100 or so urls for a firstip to avoid // having to hit spiderdb for every one! if ( priority < 0 ) { char *xx=NULL;*xx=0; } // // NO! then just a single root url can prevent all his // kids from getting spidered. 
because this logic was // priority based over time. so while the high priority url // would be sitting in the waiting tree, the kids whose // time it was to be spidered would be starving for attention. // only use priority if the high priority url can be spidered // now, so he doesn't lock the others out of the waiting tree. // // now pick the SpiderRequest with the best priority, then // break ties with the "spiderTime". //if ( priority < winPriority ) // continue; // if tied, use times //if ( priority == winPriority && spiderTimeMS > winTimeMS ) // continue; // bail if it is locked! we now call // msg12::confirmLockAcquisition() after we get the lock, // which deletes the doledb record from doledb and doleiptable // rightaway and adds a "0" entry into the waiting tree so // that evalIpLoop() repopulates doledb again with that // "firstIp". this way we can spider multiple urls from the // same ip at the same time. long long key = makeLockTableKey ( sreq ); if ( g_spiderLoop.m_lockTable.isInTable ( &key ) ) { // get it //CrawlInfo *ci = &m_cr->m_localCrawlInfo; // do not think the round is over! //ci->m_lastSpiderCouldLaunch = nowGlobal; // there are urls ready to spider, just locked up //ci->m_hasUrlsReadyToSpider = true; // debug note if ( g_conf.m_logDebugSpider ) log("spider: skipping url lockkey=%lli in " "lock table",key); continue; } long long uh48 = sreq->getUrlHash48(); // make key key192_t wk = makeWinnerTreeKey( firstIp , priority , sreq->m_hopCount, spiderTimeMS , uh48 ); // if this url is already in the winnerTree then either we // replace it or we skip ourselves. // // watch out for dups in winner tree, the same url can have // multiple spiderTimeMses somehow... i guess it could have // different hop counts // as well, resulting in different priorities... // actually the dedup table could map to a priority and a node // so we can kick out a lower priority version of the same url. long winSlot = m_winnerTable.getSlot ( &uh48 ); if ( winSlot >= 0 ) { key192_t *oldwk ; oldwk = (key192_t *)m_winnerTable. getDataFromSlot ( winSlot ); // are we lower priority? (or equal) if(KEYCMP((char *)&wk,(char *)oldwk, sizeof(key192_t))<=0) continue; // from table too. no it's a dup uh48! //m_winnerTable.deleteKey ( &uh48 ); // otherwise we supplant it. remove old key from tree. m_winnerTree.deleteNode ( 0 , oldwk ); // supplant in table and tree... just add below... } // get the top 100 spider requests by priority/time/etc. long maxWinners = (long)MAX_WINNER_NODES; // 40 // only put 40 urls from the same firstIp into doledb if // we have a lot of urls in our spiderdb already. if ( m_totalBytesScanned < 200000 ) maxWinners = 1; // sanity. 
make sure read is somewhat hefty for our // maxWinners=1 thing if ( (long)SR_READ_SIZE < 500000 ) { char *xx=NULL;*xx=0; } // only compare to min winner in tree if tree is full if ( m_winnerTree.getNumUsedNodes() >= maxWinners ) { // get that key long long tm1 = spiderTimeMS; // get the spider time of lowest scoring req in tree long long tm2 = m_tailTimeMS; // if they are both overdue, make them the same if ( tm1 < nowGlobalMS ) tm1 = 1; if ( tm2 < nowGlobalMS ) tm2 = 1; // skip spider request if its time is past winner's if ( tm1 > tm2 ) continue; if ( tm1 < tm2 ) goto gotNewWinner; // if tied, use priority if ( priority < m_tailPriority ) continue; if ( priority > m_tailPriority ) goto gotNewWinner; // if tied use hop counts so we are breadth first if ( sreq->m_hopCount > m_tailHopCount ) continue; if ( sreq->m_hopCount < m_tailHopCount ) goto gotNewWinner; // if hopcounts tied prefer the unindexed doc // i don't think we need this b/c spidertimems // for new docs should be less than old docs... // TODO: verify that //if ( sreq->m_isIndexed && ! m_tailIsIndexed ) // continue; //if ( ! sreq->m_isIndexed && m_tailIsIndexed ) // goto gotNewWinner; // if tied, use actual times (assuming both are overdue) if ( spiderTimeMS > m_tailTimeMS ) continue; if ( spiderTimeMS < m_tailTimeMS ) goto gotNewWinner; // all tied, keep it the same i guess continue; // otherwise, add the new winner in and remove the old gotNewWinner: // get lowest scoring node in tree long tailNode = m_winnerTree.getLastNode(); // from table too m_winnerTable.removeKey ( &m_tailUh48 ); // delete the tail so new spiderrequest can enter m_winnerTree.deleteNode ( tailNode , true ); } // sometimes the firstip in its key does not match the // firstip in the record! if ( sreq->m_firstIp != firstIp ) { log("spider: request %s firstip does not match " "firstip in key",sreq->m_url); log("spider: ip1=%s",iptoa(sreq->m_firstIp)); log("spider: ip2=%s",iptoa(firstIp)); continue; } //// // // add spider request to winner tree // //// /* // make the key //if ( useTree ) { long long uh48 = sreq->getUrlHash48(); key128_t k = makeUfnTreeKey ( firstIp ,priority, spiderTimeMS , uh48 ); //long nn =; m_winnerTree.addNode(0,(char *)&k,NULL,8); //log("adding node #%li firstip=%s uh48=%llu " // "ufntree.k.n1=0x%llx " // "spiderdb.k.n1=0x%llx " // "spiderdb.k.n0=0x%llx " // , // nn,iptoa(firstIp),uh48,k.n1, // *(long long *)rec, // *(long long *)(rec+8) // ); //numNodes++; //} */ // . add to table which allows us to ensure same url not // repeated in tree // . just skip if fail to add... if ( m_winnerTable.addKey ( &uh48 , &wk ) < 0 ) continue; // use an individually allocated buffer for each spiderrequest // so if it gets removed from tree the memory can be freed by // the tree which "owns" the data because m_winnerTree.set() // above set ownsData to true. long need = sreq->getRecSize(); char *newMem = (char *)mdup ( sreq , need , "sreqbuf" ); if ( ! newMem ) continue; // add it to the tree of the top urls to spider m_winnerTree.addNode( 0, (char *)&wk , (char *)newMem , need ); // sanity //SpiderRequest *sreq2 = (SpiderRequest *)m_winnerTree. //getData ( nn ); // set new tail priority and time for next compare if ( m_winnerTree.getNumUsedNodes() >= maxWinners ) { // for the worst node in the tree...
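// The m_tail* fields refreshed just below are what each new candidate is
// measured against in the if/goto cascade above. As a reading aid only, here
// is that comparison re-stated as a standalone helper. Nothing below is
// compiled and the names candidateBeatsTail/overdueClamp are hypothetical;
// it is just a sketch of the ordering used above: effective spider time
// first (all overdue urls tie), then priority, then hop count (breadth
// first), then raw spider time.
//
//	static long long overdueClamp ( long long tMS , long long nowMS ) {
//		// every overdue url compares equal on time
//		if ( tMS < nowMS ) return 1;
//		return tMS;
//	}
//	// returns true if the candidate should replace the current tail
//	static bool candidateBeatsTail ( long long candTimeMS, long candPri,
//					 long candHops,
//					 long long tailTimeMS, long tailPri,
//					 long tailHops, long long nowMS ) {
//		long long t1 = overdueClamp ( candTimeMS , nowMS );
//		long long t2 = overdueClamp ( tailTimeMS , nowMS );
//		if ( t1         != t2         ) return ( t1 < t2 );
//		if ( candPri    != tailPri    ) return ( candPri  > tailPri  );
//		if ( candHops   != tailHops   ) return ( candHops < tailHops );
//		if ( candTimeMS != tailTimeMS ) return (candTimeMS<tailTimeMS);
//		// complete tie: keep the tail we already have
//		return false;
//	}
//
// When this would return true we fall through to gotNewWinner above, which
// evicts the tail from both m_winnerTable and m_winnerTree before the new
// request is added.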
long tailNode = m_winnerTree.getLastNode(); if ( tailNode < 0 ) { char *xx=NULL;*xx=0; } // set new tail parms key192_t *tailKey; tailKey = (key192_t *)m_winnerTree.getKey ( tailNode ); // convert to char first then to signed long parseWinnerTreeKey ( tailKey , &m_tailIp , &m_tailPriority, &m_tailHopCount, &m_tailTimeMS , &m_tailUh48 ); // sanity if ( m_tailIp != firstIp ) { char *xx=NULL;*xx=0;} } /* // ok, we got a new winner winPriority = priority; winTimeMS = spiderTimeMS; winMaxSpidersPerIp = maxSpidersPerIp; winReq = sreq; // set these for doledb winReq->m_priority = priority; winReq->m_ufn = ufn; //winReq->m_spiderTime = spiderTime; */ } /* MDW: now that we have winnerTree obsolete this stuff: // if its ready to spider now, that trumps one in the future always! if ( winReq && m_bestRequestValid && m_bestSpiderTimeMS <= nowGlobalMS && winTimeMS > nowGlobal ) winReq = NULL; // if this is a successive call we have to beat the global because // the firstIp has a *ton* of spider requests and we can't read them // all in one list, then see if we beat our global winner! if ( winReq && m_bestRequestValid && m_bestSpiderTimeMS <= nowGlobalMS && m_bestRequest->m_priority > winPriority ) winReq = NULL; // or if both in future. use time. if ( winReq && m_bestRequestValid && m_bestSpiderTimeMS > nowGlobalMS && winTimeMS > nowGlobal && m_bestSpiderTimeMS < winTimeMS ) winReq = NULL; // if both recs are overdue for spidering and priorities tied, use // the hopcount. should make us breadth-first, all else being equal. if ( winReq && m_bestRequestValid && m_bestRequest->m_priority == winPriority && m_bestSpiderTimeMS <= nowGlobalMS && winTimeMS <= nowGlobal && m_bestRequest->m_hopCount < winReq->m_hopCount ) winReq = NULL; // use times if hops are equal and both are overdue from same priority. if ( winReq && m_bestRequestValid && m_bestRequest->m_priority == winPriority && m_bestSpiderTimeMS <= nowGlobalMS && winTimeMS <= nowGlobal && m_bestRequest->m_hopCount == winReq->m_hopCount && m_bestSpiderTimeMS <= winTimeMS ) winReq = NULL; // if nothing, we are done! if ( winReq ) { // store this long rsize = winReq->getRecSize(); // sanity check if ( rsize > (long)MAX_BEST_REQUEST_SIZE){char *xx=NULL;*xx=0;} // now store this SpiderRequest for adding to doledb memcpy ( m_bestRequestBuf , winReq, rsize ); // point to that m_bestRequest = (SpiderRequest *)m_bestRequestBuf; // set this m_bestRequestValid = true; // this too m_bestSpiderTimeMS = winTimeMS; m_bestMaxSpidersPerIp = winMaxSpidersPerIp; // sanity if ( (long long)winTimeMS < 0 ) { char *xx=NULL;*xx=0; } // note it if ( g_conf.m_logDebugSpider ) { log("spider: found winning request IN THIS LIST ip=%s " "spiderTimeMS=%lli " "ufn=%li " "hadreply=%li " "pri=%li uh48=%lli url=%s", iptoa(m_bestRequest->m_firstIp), m_bestSpiderTimeMS, (long)m_bestRequest->m_ufn, (long)m_bestRequest->m_hadReply, (long)m_bestRequest->m_priority, m_bestRequest->getUrlHash48(), m_bestRequest->m_url); // debug why not sticking to our site! //if ( strstr(m_bestRequest->m_url,"//www.jezebelgallery.com") == NULL ) { char *xx=NULL;*xx=0; } } } else if ( g_conf.m_logDebugSpider ) { log("spider: did not find winning request for %s but " "bestReqValid=%li", iptoa(m_scanningIp),(long)m_bestRequestValid); } */ // are we the final list in the scan? //m_isReadDone = ( list->getListSize() < (long)SR_READ_SIZE ) ; // // . try to fix the bug of reading like only 150k when we asked for 512 // . 
that bug was because of dedupList() function // //if ( list->isEmpty() ) // m_isReadDone = true; // if no spiderreply for the current url, invalidate this m_lastReplyValid = false; // if read is not yet done, save the reply in case next list needs it if ( srep ) { // && ! m_isReadDone ) { long rsize = srep->getRecSize(); if ( rsize > (long)MAX_SP_REPLY_SIZE ) { char *xx=NULL;*xx=0; } memcpy ( m_lastReplyBuf, srep, rsize ); m_lastReplyValid = true; } // debug info if ( g_conf.m_logDebugSpider ) log("spider: Checked list of %li spiderdb bytes for winners " "for firstip=%s. winnerTreeUsedNodes=%li", list->getListSize(),iptoa(m_scanningIp), m_winnerTree.getNumUsedNodes()); // reset any errno cuz we're just a cache g_errno = 0; // ok we've updated m_bestRequest!!! return true; } // . this is ONLY CALLED from evalIpLoop() above // . add another 0 entry into waiting tree, unless we had no winner // . add winner in here into doledb // . returns false if blocked and doledWrapper() will be called // . returns true and sets g_errno on error bool SpiderColl::addWinnersIntoDoledb ( ) { if ( g_errno ) { log("spider: got error when trying to add winner to doledb: " "%s",mstrerror(g_errno)); return true; } // gotta check this again since we might have done a QUICKPOLL() above // to call g_process.shutdown() so now tree might be unwritable RdbTree *wt = &m_waitingTree; if ( wt->m_isSaving || ! wt->m_isWritable ) return true; /* MDW: let's print out recs we add to doledb //if ( g_conf.m_logDebugSpider && m_bestRequestValid ) { if ( g_conf.m_logDebugSpider && m_bestRequestValid ) { log("spider: got best ip=%s sreq spiderTimeMS=%lli " "pri=%li uh48=%lli", iptoa(m_bestRequest->m_firstIp), m_bestSpiderTimeMS, (long)m_bestRequest->m_priority, m_bestRequest->getUrlHash48()); } else if ( g_conf.m_logDebugSpider ) { log("spider: no best request for ip=%s",iptoa(m_scanningIp)); } */ // ok, all done if nothing to add to doledb. i guess we were misled // that firstIp had something ready for us. maybe the url filters // table changed to filter/ban them all. if a new request/reply comes // in for this firstIp then it will re-add an entry to waitingtree and // we will re-scan spiderdb. if we had something to spider but it was // in the future the m_minFutureTimeMS will be non-zero, and we deal // with that below... if ( m_winnerTree.isEmpty() && ! m_minFutureTimeMS ) { // if we received new incoming requests while we were // scanning, which is happening for some crawls, then do // not nuke! just repeat later in populateDoledbFromWaitingTree if ( m_gotNewDataForScanningIp ) { if ( g_conf.m_logDebugSpider ) log("spider: received new requests, not " "nuking misleading key"); return true; } // note it - this can happen if no more to spider right now! if ( g_conf.m_logDebugSpider ) log("spider: nuking misleading waitingtree key " "firstIp=%s", iptoa(m_scanningIp)); m_waitingTree.deleteNode ( 0,(char *)&m_waitingTreeKey,true); //log("spider: 7 del node for %s",iptoa(m_scanningIp)); m_waitingTreeKeyValid = false; // note it unsigned long long timestamp64 = m_waitingTreeKey.n1; timestamp64 <<= 32; timestamp64 |= m_waitingTreeKey.n0 >> 32; long firstIp = m_waitingTreeKey.n0 &= 0xffffffff; if ( g_conf.m_logDebugSpider ) log(LOG_DEBUG,"spider: removed2 time=%lli ip=%s from " "waiting tree. nn=%li.", timestamp64, iptoa(firstIp), m_waitingTree.m_numUsedNodes); m_waitingTable.removeKey ( &firstIp ); // sanity check if ( ! 
m_waitingTable.m_isWritable ) { char *xx=NULL;*xx=0;} return true; } // i am seeing dup uh48's in the m_winnerTree long firstIp = m_waitingTreeKey.n0 & 0xffffffff; char dbuf[3*MAX_WINNER_NODES*(8+1)]; HashTableX dedup; long ntn = m_winnerTree.getNumNodes(); dedup.set ( 8, 0, (long)2*ntn, // # slots to initialize to dbuf, (long)(3*MAX_WINNER_NODES*(8+1)), false, MAX_NICENESS, "windt"); /////////// // // make winner tree into doledb list to add // /////////// m_doleBuf.reset(); for ( long node = m_winnerTree.getFirstNode() ; node >= 0 ; node = m_winnerTree.getNextNode ( node ) ) { // breathe QUICKPOLL ( MAX_NICENESS ); // get data for that SpiderRequest *sreq2; sreq2 = (SpiderRequest *)m_winnerTree.getData ( node ); // sanity if ( sreq2->m_firstIp != firstIp ) { char *xx=NULL;*xx=0; } //if ( sreq2->m_spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; } if ( sreq2->m_ufn < 0 ) { char *xx=NULL;*xx=0; } if ( sreq2->m_priority == -1 ) { char *xx=NULL;*xx=0; } // check for errors bool hadError = false; // parse it up long winIp; long winPriority; long winHopCount; long long winSpiderTimeMS; long long winUh48; key192_t *winKey = (key192_t *)m_winnerTree.getKey ( node ); parseWinnerTreeKey ( winKey , &winIp , &winPriority, &winHopCount, &winSpiderTimeMS , &winUh48 ); // sanity if ( winIp != firstIp ) { char *xx=NULL;*xx=0;} if ( winUh48 != sreq2->getUrlHash48() ) { char *xx=NULL;*xx=0;} // make the doledb key key_t doleKey = g_doledb.makeKey ( winPriority, // convert to secs from ms winSpiderTimeMS / 1000 , winUh48 , false ); // dedup. if we add dups the problem is is that they // overwrite the key in doledb yet the doleiptable count // remains undecremented and doledb is empty and never // replenished because the firstip can not be added to // waitingTree because doleiptable count is > 0. this was // causing spiders to hang for collections. i am not sure // why we should be getting dups in winnertree because they // have the same uh48 and that is the key in the tree. if ( dedup.isInTable ( &winUh48 ) ) { log("spider: got dup uh48=%llu dammit", winUh48); continue; } // do not allow dups dedup.addKey ( &winUh48 ); // store doledb key first if ( ! m_doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) ) hadError = true; // then size of spiderrequest if ( ! m_doleBuf.pushLong ( sreq2->getRecSize() ) ) hadError = true; // then the spiderrequest encapsulated if ( ! m_doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() )) hadError=true; // note and error if ( hadError ) { log("spider: error making doledb list: %s", mstrerror(g_errno)); return true; } } //////////////////// // // UPDATE WAITING TREE ENTRY // // Normally the "spidertime" is 0 for a firstIp. This will make it // a future time if it is not yet due for spidering. // //////////////////// // sanity check. how did this happen? it messes up our crawl! // maybe a doledb add went through? so we should add again? long wn = m_waitingTree.getNode(0,(char *)&m_waitingTreeKey); if ( wn < 0 ) { log("spider: waiting tree key removed while reading list for " "%s (%li)",m_coll,(long)m_collnum); // play it safe and add it back for now... // when i try to break here in gdb it never happens because // of timing issues. heisenbug... // false = callForScan //if ( ! addToWaitingTree ( 0 , m_scanningIp , false ) ) // log("spider: failed to add wk2 to waiting tree: %s" // ,mstrerror(g_errno)); return true; } /* MDW: now we store multiple winners in doledb, so this logic won't be used. maybe just rely on the crawldelay type logic in Msg13.cpp to space things out. 
we could also deny locks if too many urls being spidered for this ip. // how many spiders currently out for this ip? long outNow=g_spiderLoop.getNumSpidersOutPerIp(m_scanningIp,m_collnum); // even if hadn't gotten list we can bail early if too many // spiders from this ip are out! //long out = g_spiderLoop.getNumSpidersOutPerIp ( m_scanningIp ); if ( outNow >= m_bestMaxSpidersPerIp ) { // note it if ( g_conf.m_logDebugSpider ) log("spider: already got %li from this ip out. ip=%s", m_bestMaxSpidersPerIp, iptoa(m_scanningIp) ); // when his SpiderReply comes back it will call // addWaitingTree with a "0" time so he'll get back in there //if ( wn < 0 ) { char *xx=NULL; *xx=0; } if ( wn >= 0 ) { m_waitingTree.deleteNode (wn,false ); // note that //log("spdr: 1 del node %li for %s",wn,iptoa(firstIp)); } // keep the table in sync now with the time m_waitingTable.removeKey( &m_bestRequest->m_firstIp ); return true; } */ //long long nowGlobalMS = gettimeofdayInMillisecondsGlobal();//Local(); // if best request has a future spiderTime, at least update // the wait tree with that since we will not be doling this request // right now. if ( m_winnerTree.isEmpty() && m_minFutureTimeMS ) { // if in the process of being added to doledb or in doledb... if ( m_doleIpTable.isInTable ( &firstIp ) ) { // sanity i guess. remove this line if it hits this! log("spider: wtf????"); //char *xx=NULL;*xx=0; return true; } // before you set a time too far into the future, if we // did receive new spider requests, entertain those if ( m_gotNewDataForScanningIp ) { if ( g_conf.m_logDebugSpider ) log("spider: received new requests, not " "updating waiting tree with future time"); return true; } // get old time unsigned long long oldSpiderTimeMS = m_waitingTreeKey.n1; oldSpiderTimeMS <<= 32; oldSpiderTimeMS |= (m_waitingTreeKey.n0 >> 32); // delete old node //long wn = m_waitingTree.getNode(0,(char *)&m_waitingTreeKey); //if ( wn < 0 ) { char *xx=NULL;*xx=0; } if ( wn >= 0 ) { m_waitingTree.deleteNode (wn,false ); //log("spdr: 2 del node %li for %s",wn,iptoa(firstIp)); } // invalidate m_waitingTreeKeyValid = false; //long fip = m_bestRequest->m_firstIp; key_t wk2 = makeWaitingTreeKey ( m_minFutureTimeMS , firstIp ); // log the replacement if ( g_conf.m_logDebugSpider ) log("spider: scan replacing waitingtree key " "oldtime=%lu newtime=%lu firstip=%s",// bestpri=%li " //"besturl=%s", (unsigned long)(oldSpiderTimeMS/1000LL), (unsigned long)(m_minFutureTimeMS/1000LL), iptoa(firstIp) //(long)m_bestRequest->m_priority, // m_bestRequest->m_url); ); // this should never fail since we deleted one above long dn = m_waitingTree.addKey ( &wk2 ); // note it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: RE-added time=%lli ip=%s to " "waiting tree node %li", m_minFutureTimeMS , iptoa(firstIp),dn); // keep the table in sync now with the time m_waitingTable.addKey( &firstIp, &m_minFutureTimeMS ); // sanity check if ( ! m_waitingTable.m_isWritable ) { char *xx=NULL;*xx=0;} return true; } // we are coring here. i guess the best request or a copy of it // somehow started spidering since our last spider read, so i would // say we should bail on this spider scan! really i'm not exactly // sure what happened... // MDW: now we add a bunch of urls to doledb, so i guess we can't // check locks... 
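// For reference, each record appended to m_doleBuf by the winnerTree loop
// above (and walked again further below) is laid out back to back as:
//
//	key_t doleKey        - made by g_doledb.makeKey(priority,
//	                       spiderTimeMS/1000, uh48, false)
//	long  recSize        - 4 bytes, SpiderRequest::getRecSize()
//	char  sreq[recSize]  - the serialized SpiderRequest itself
//
// so a scan steps sizeof(key_t), then 4, then recSize bytes per record. A
// minimal sketch of that walk (this mirrors the real loop below, it is not
// an extra code path):
//
//	for ( char *p = m_doleBuf.getBufStart(); p < m_doleBuf.getBuf() ; ) {
//		key_t *doleKey = (key_t *)p;  p += sizeof(key_t);
//		long   recSize = *(long *)p;  p += 4;
//		SpiderRequest *sreq = (SpiderRequest *)p;
//		p += recSize; // same as sreq->getRecSize()
//	}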
/* long long key = makeLockTableKey ( m_bestRequest ); if ( g_spiderLoop.m_lockTable.isInTable ( &key ) ) { log("spider: best request got doled out from under us"); return true; char *xx=NULL;*xx=0; } // make the doledb key first for this so we can add it key_t doleKey = g_doledb.makeKey ( m_bestRequest->m_priority , // convert to seconds from ms m_bestSpiderTimeMS / 1000 , m_bestRequest->getUrlHash48() , false ); if ( g_conf.m_logDebugSpider ) log("spider: got winner pdocid=%lli url=%s", m_bestRequest->m_probDocId, m_bestRequest->m_url); // make it into a doledb record char *p = m_doleBuf; *(key_t *)p = doleKey; p += sizeof(key_t); long recSize = m_bestRequest->getRecSize(); *(long *)p = recSize; p += 4; memcpy ( p , m_bestRequest , recSize ); p += recSize; // sanity check if ( p - m_doleBuf > (long)MAX_DOLEREC_SIZE ) { char *xx=NULL;*xx=0; } */ // how did this happen? if ( ! m_msg1Avail ) { char *xx=NULL;*xx=0; } // add it to doledb ip table now so that waiting tree does not // immediately get another spider request from this same ip added // to it while the msg4 is out. but if add failes we totally bail // with g_errno set // // crap, i think this could be slowing us down when spidering // a single ip address. maybe use msg1 here not msg4? //if ( ! addToDoleTable ( m_bestRequest ) ) return true; // . MDW: now we have a list of doledb records in a SafeBuf: // . scan the requests in safebuf for ( char *p = m_doleBuf.getBufStart() ; p < m_doleBuf.getBuf() ; ) { // first is doledbkey p += sizeof(key_t); // then size of spider request p += 4; // the spider request encapsulated SpiderRequest *sreq3; sreq3 = (SpiderRequest *)p; // point "p" to next spiderrequest p += sreq3->getRecSize(); // process sreq3 my incrementing the firstip count in // m_doleIpTable if ( ! addToDoleTable ( sreq3 ) ) return true; // this logic is now in addToDoleTable() // . if it was empty it is no longer // . we have this flag here to avoid scanning empty doledb // priorities because it saves us a msg5 call to doledb in // the scanning loop //long bp = sreq3->m_priority;//m_bestRequest->m_priority; //if ( bp < 0 ) { char *xx=NULL;*xx=0; } //if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; } //m_isDoledbEmpty [ bp ] = 0; } // and the whole thing is no longer empty //m_allDoledbPrioritiesEmpty = 0;//false; //m_lastEmptyCheck = 0; // // delete the winner from ufntree as well // /* long long buh48 = m_bestRequest->getUrlHash48(); key128_t bkey = makeUfnTreeKey ( m_bestRequest->m_firstIp , m_bestRequest->m_priority , m_bestSpiderTimeMS , buh48 ); // must be in tree! long node = s_ufnTree.getNextNode ( 0, (char *)&bkey ); // if this firstip had too few requests to make it into the // tree then node will be < 0! //if ( node < 0 ) { char *xx=NULL;*xx=0; } if ( node >= 0 ) { //log("deleting node #%li firstip=%s uh48=%llu", // node,iptoa(firstIp),uh48); s_ufnTree.deleteNode ( node , true ); } */ m_msg4Start = gettimeofdayInMilliseconds(); RdbList *tmpList = &m_msg1.m_tmpList; tmpList->setFromSafeBuf ( &m_doleBuf , RDB_DOLEDB ); // . use msg4 to transmit our guys into the rdb, RDB_DOLEDB // . no, use msg1 for speed, so we get it right away!! // . we' already incremented doleiptable counts... so we need to // make sure this happens!!! bool status = m_msg1.addList ( tmpList , RDB_DOLEDB , m_collnum , this , doledWrapper , false , // forcelocal? // Rdb.cpp can't call dumpTree() // if niceness is 0! MAX_NICENESS); // if it blocked set this to true so we do not reuse it if ( ! 
status ) m_msg1Avail = false; long storedFirstIp = (m_waitingTreeKey.n0) & 0xffffffff; // log it if ( g_conf.m_logDebugSpcache ) { unsigned long long spiderTimeMS = m_waitingTreeKey.n1; spiderTimeMS <<= 32; spiderTimeMS |= (m_waitingTreeKey.n0 >> 32); logf(LOG_DEBUG,"spider: removing doled waitingtree key" " spidertime=%llu firstIp=%s " //"pri=%li " //"url=%s" ,spiderTimeMS, iptoa(storedFirstIp) //(long)m_bestRequest->m_priority, //m_bestRequest->m_url); ); } // before adding to doledb remove from waiting tree so we do not try // to readd to doledb... m_waitingTree.deleteNode ( 0, (char *)&m_waitingTreeKey , true); m_waitingTable.removeKey ( &storedFirstIp ); //log("spider: 3 del node for %s",iptoa(storedFirstIp)); // invalidate m_waitingTreeKeyValid = false; // sanity check if ( ! m_waitingTable.m_isWritable ) { char *xx=NULL;*xx=0;} // note that ip as being in dole table if ( g_conf.m_logDebugSpider ) log("spider: added best sreq for ip=%s to doletable AND " "removed from waiting table", iptoa(firstIp)); // add did not block return status; } uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq, long ufn, SpiderReply *srep, uint64_t nowGlobalMS ) { // . get the scheduled spiderTime for it // . assume this SpiderRequest never been successfully spidered long long spiderTimeMS = ((uint64_t)sreq->m_addedTime) * 1000LL; // if injecting for first time, use that! if ( ! srep && sreq->m_isInjecting ) return spiderTimeMS; // to avoid hammering an ip, get last time we spidered it... long long lastMS ; lastMS = m_lastDownloadCache.getLongLong ( m_collnum , sreq->m_firstIp , -1 , // maxAge true );// promote // -1 means not found if ( (long long)lastMS == -1 ) lastMS = 0; // sanity if ( (long long)lastMS < -1 ) { log("spider: corrupt last time in download cache. nuking."); lastMS = 0; } // min time we can spider it long long minSpiderTimeMS1 = lastMS + m_cr->m_spiderIpWaits[ufn]; // if not found in cache if ( lastMS == -1 ) minSpiderTimeMS1 = 0LL; ///////////////////////////////////////////////// ///////////////////////////////////////////////// // crawldelay table check!!!! ///////////////////////////////////////////////// ///////////////////////////////////////////////// long *cdp = (long *)m_cdTable.getValue ( &sreq->m_domHash32 ); long long minSpiderTimeMS2 = 0; if ( cdp && *cdp >= 0 ) minSpiderTimeMS2 = lastMS + *cdp; // wait 5 seconds for all outlinks in order for them to have a // chance to get any link info that might have been added // from the page that supplied this outlink // CRAP! this slows down same ip spidering i think... yeah, without // this it seems the spiders are always at 10 (sometimes 8 or 9) // when i spider techcrunch.com. //spiderTimeMS += 5000; // ensure min if ( spiderTimeMS < minSpiderTimeMS1 ) spiderTimeMS = minSpiderTimeMS1; if ( spiderTimeMS < minSpiderTimeMS2 ) spiderTimeMS = minSpiderTimeMS2; // if no reply, use that if ( ! srep ) return spiderTimeMS; // if this is not the first try, then re-compute the spiderTime // based on that last time // sanity check if ( srep->m_spideredTime <= 0 ) { // a lot of times these are corrupt! wtf??? //spiderTimeMS = minSpiderTimeMS; return spiderTimeMS; //{ char*xx=NULL;*xx=0;} } // compute new spiderTime for this guy, in seconds long long waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0); // do not spider more than once per 15 seconds ever! // no! might be a query reindex!! if ( waitInSecs < 15 && ! sreq->m_urlIsDocId ) { static bool s_printed = false; if ( ! 
s_printed ) { s_printed = true; log("spider: min spider wait is 15 seconds, " "not %llu (ufn=%li)",waitInSecs,ufn); } waitInSecs = 15;//900; this was 15 minutes } // in fact, force docid based guys to be zero! if ( sreq->m_urlIsDocId ) waitInSecs = 0; // when it was spidered long long lastSpideredMS = ((uint64_t)srep->m_spideredTime) * 1000; // . when we last attempted to spider it... (base time) // . use a lastAttempt of 0 to indicate never! // (first time) long long minSpiderTimeMS3 = lastSpideredMS + (waitInSecs * 1000LL); // ensure min if ( spiderTimeMS < minSpiderTimeMS3 ) spiderTimeMS = minSpiderTimeMS3; // sanity if ( (long long)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; } return spiderTimeMS; } // . returns false with g_errno set on error // . Rdb.cpp should call this when it receives a doledb key // . when trying to add a SpiderRequest to the waiting tree we first check // the doledb table to see if doledb already has an sreq from this firstIp // . therefore, we should add the ip to the dole table before we launch the // Msg4 request to add it to doledb, that way we don't add a bunch from the // same firstIP to doledb bool SpiderColl::addToDoleTable ( SpiderRequest *sreq ) { // update how many per ip we got doled long *score = (long *)m_doleIpTable.getValue32 ( sreq->m_firstIp ); // debug point if ( g_conf.m_logDebugSpider ){//&&1==2 ) { // disable for now, spammy long long uh48 = sreq->getUrlHash48(); long long pdocid = sreq->getParentDocId(); long ss = 1; if ( score ) ss = *score + 1; // if for some reason this collides with another key // already in doledb then our counts are off log("spider: added to doletbl uh48=%llu parentdocid=%llu " "ipdolecount=%li ufn=%li priority=%li firstip=%s", uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority, iptoa(sreq->m_firstIp)); } // we had a score there already, so inc it if ( score ) { // inc it *score = *score + 1; // sanity check if ( *score <= 0 ) { char *xx=NULL;*xx=0; } // only one per ip! // not any more! we allow MAX_WINNER_NODES per ip! if ( *score > MAX_WINNER_NODES ) log("spider: crap. had %li recs in doledb for %s " "from %s." "how did this happen?", (long)*score,m_coll,iptoa(sreq->m_firstIp)); // now we log it too if ( g_conf.m_logDebugSpider ) log(LOG_DEBUG,"spider: added ip=%s to doleiptable " "(score=%li)", iptoa(sreq->m_firstIp),*score); } else { // ok, add new slot long val = 1; if ( ! m_doleIpTable.addKey ( &sreq->m_firstIp , &val ) ) { // log it, this is bad log("spider: failed to add ip %s to dole ip tbl", iptoa(sreq->m_firstIp)); // return false with g_errno set on error return false; } // now we log it too if ( g_conf.m_logDebugSpider ) log(LOG_DEBUG,"spider: added ip=%s to doleiptable " "(score=1)",iptoa(sreq->m_firstIp)); // sanity check //if ( ! m_doleIpTable.m_isWritable ) { char *xx=NULL;*xx=0;} } // . these priority slots in doledb are not empty // . unmark individual priority buckets // . do not skip them when scanning for urls to spider long pri = sreq->m_priority; m_isDoledbEmpty[pri] = 0; // reset scan for this priority in doledb m_nextKeys [pri] =g_doledb.makeFirstKey2 ( pri ); return true; } ///////////////////////// ///////////////////////// UTILITY FUNCTIONS ///////////////////////// // . map a spiderdb rec to the shard # that should spider it // . "sr" can be a SpiderRequest or SpiderReply // . shouldn't this use Hostdb::getShardNum()?
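// In short, isAssignedToUs() below decides whether this host should handle a
// given firstIp by hashing the ip onto one host of our own shard. A rough
// sketch of the happy path only (the real function also falls back to
// another alive host in the shard if the chosen one is dead):
//
//	Host *shard = g_hostdb.getMyShard();
//	long  hpg   = g_hostdb.getNumHostsPerShard();
//	unsigned long long h64 = firstIp;
//	h64 ^= g_hashtab[(unsigned char)(firstIp & 0xff)][0]; // mix low byte
//	Host *h = &shard[ ((uint32_t)h64) % hpg ];
//	return ( h->m_hostId == g_hostdb.m_hostId );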
/* unsigned long getShardToSpider ( char *sr ) { // use the url hash long long uh48 = g_spiderdb.getUrlHash48 ( (key128_t *)sr ); // host to dole it based on ip long hostId = uh48 % g_hostdb.m_numHosts ; // get it Host *h = g_hostdb.getHost ( hostId ) ; // and return groupid return h->m_groupId; } */ // does this belong in our spider cache? bool isAssignedToUs ( long firstIp ) { // sanity check... must be in our group.. we assume this much //if ( g_spiderdb.getGroupId(firstIp) != g_hostdb.m_myHost->m_groupId){ // char *xx=NULL;*xx=0; } // . host to dole it based on ip // . ignore lower 8 bits of ip since one guy often owns a whole block! //long hostId=(((unsigned long)firstIp) >> 8) % g_hostdb.getNumHosts(); // get our group //Host *group = g_hostdb.getMyGroup(); Host *shard = g_hostdb.getMyShard(); // pick a host in our group // if not dead return it //if ( ! g_hostdb.isDead(hostId) ) return hostId; // get that host //Host *h = g_hostdb.getHost(hostId); // get the group //Host *group = g_hostdb.getGroup ( h->m_groupId ); // and number of hosts in the group long hpg = g_hostdb.getNumHostsPerShard(); // let's mix it up since spider shard was selected using this // same mod on the firstIp method!! unsigned long long h64 = firstIp; unsigned char c = firstIp & 0xff; h64 ^= g_hashtab[c][0]; // select the next host number to try //long next = (((unsigned long)firstIp) >> 16) % hpg ; // hash to a host long i = ((uint32_t)h64) % hpg; Host *h = &shard[i]; // return that if alive if ( ! g_hostdb.isDead(h) ) return (h->m_hostId == g_hostdb.m_hostId); // . select another otherwise // . put all alive in an array now Host *alive[64]; long upc = 0; for ( long j = 0 ; j < hpg ; j++ ) { Host *h = &shard[j]; if ( g_hostdb.isDead(h) ) continue; alive[upc++] = h; } // if none, that is bad! return the first one that we wanted to if ( upc == 0 ) return (h->m_hostId == g_hostdb.m_hostId); // select from the good ones now i = ((uint32_t)firstIp) % upc; // get that h = alive[i]; // guaranteed to be alive... kinda return (h->m_hostId == g_hostdb.m_hostId); } ///////////////////////// ///////////////////////// SPIDERLOOP ///////////////////////// static void indexedDocWrapper ( void *state ) ; static void doneSleepingWrapperSL ( int fd , void *state ) ; // a global class extern'd in .h file SpiderLoop g_spiderLoop; SpiderLoop::SpiderLoop ( ) { // clear array of ptrs to Doc's memset ( m_docs , 0 , sizeof(XmlDoc *) * MAX_SPIDERS ); } SpiderLoop::~SpiderLoop ( ) { reset(); } // free all doc's void SpiderLoop::reset() { // delete all doc's in use for ( long i = 0 ; i < MAX_SPIDERS ; i++ ) { if ( m_docs[i] ) { mdelete ( m_docs[i] , sizeof(XmlDoc) , "Doc" ); delete (m_docs[i]); } m_docs[i] = NULL; //m_lists[i].freeList(); } m_list.freeList(); m_lockTable.reset(); m_lockCache.reset(); } void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) ; void SpiderLoop::startLoop ( ) { m_cri = 0; // falsify this flag m_outstanding1 = false; // not flushing m_msg12.m_gettingLocks = false; // we aren't in the middle of waiting to get a list of SpiderRequests m_gettingDoledbList = false; // we haven't registered for sleeping yet m_isRegistered = false; // clear array of ptrs to Doc's memset ( m_docs , 0 , sizeof(XmlDoc *) * MAX_SPIDERS ); // . m_maxUsed is the largest i such that m_docs[i] is in use // . -1 means there are no used m_docs's m_maxUsed = -1; m_numSpidersOut = 0; m_processed = 0; // for locking.
key size is 8 for easier debugging m_lockTable.set ( 8,sizeof(UrlLock),0,NULL,0,false,MAX_NICENESS, "splocks", true ); // useKeyMagic? yes. if ( ! m_lockCache.init ( 10000 , // maxcachemem 4 , // fixedatasize false , // supportlists? 1000 , // maxcachenodes false , // use half keys "lockcache", // dbname false ) ) log("spider: failed to init lock cache. performance hit." ); // dole some out //g_spiderLoop.doleUrls1(); // spider some urls that were doled to us //g_spiderLoop.spiderDoledUrls( ); // wake up every 50ms if (!g_loop.registerSleepCallback(50,this,doneSleepingWrapperSL)) log("build: Failed to register timer callback. Spidering " "is permanently disabled. Restart to fix."); // crawlinfo updating // save bandwidth for now make this every 4 seconds not 1 second // then try not to send crawlinfo the host should already have. // each collrec can have a checksum for each host of the last // info we sent it. but we should resend all every 100 secs anyway // in case a host went dead if ( !g_loop.registerSleepCallback(4000, this, updateAllCrawlInfosSleepWrapper)) log("build: failed to register updatecrawlinfowrapper"); } void doneSleepingWrapperSL ( int fd , void *state ) { //SpiderLoop *THIS = (SpiderLoop *)state; // dole some out //g_spiderLoop.doleUrls1(); // if spidering disabled then do not do this crap if ( ! g_conf.m_spideringEnabled ) return; //if ( ! g_conf.m_webSpideringEnabled ) return; // wait for clock to sync with host #0 if ( ! isClockInSync() ) { // let admin know why we are not spidering static char s_printed = false; if ( ! s_printed ) { logf(LOG_DEBUG,"spider: NOT SPIDERING until clock " "is in sync with host #0."); s_printed = true; } return; } static long s_count = -1; // count these calls s_count++; // reset SpiderColl::m_didRound and m_nextDoledbKey if it is maxed // because we might have had a lock collision long nc = g_collectiondb.m_numRecs; for ( long i = 0 ; i < nc ; i++ ) { // get collectionrec CollectionRec *cr = g_collectiondb.getRec(i); if ( ! cr ) continue; // skip if not enabled if ( ! cr->m_spideringEnabled ) continue; // get it //SpiderColl *sc = cr->m_spiderColl; SpiderColl *sc = g_spiderCache.getSpiderColl(i); // skip if none if ( ! sc ) continue; // also scan spiderdb to populate waiting tree now but // only one read every 10 wakeups!! if ( (s_count % 10) == 0 ) { // always do a scan at startup & every 24 hrs // AND at process startup!!! if ( ! sc->m_waitingTreeNeedsRebuild && getTimeLocal() - sc->m_lastScanTime > 24*3600) { // if a scan is ongoing, this will re-set it sc->m_nextKey2.setMin(); sc->m_waitingTreeNeedsRebuild = true; log(LOG_INFO, "spider: hit spider queue " "rebuild timeout for %s (%li)", cr->m_coll,(long)cr->m_collnum); // flush the ufn table //clearUfnTable(); } // try this then. it just returns if // sc->m_waitingTreeNeedsRebuild is false sc->populateWaitingTreeFromSpiderdb ( false ); } // re-entry is false because we are entering for the first time sc->populateDoledbFromWaitingTree ( ); // skip if still loading doledb lists from disk this round if ( ! sc->m_didRound ) continue; // ensure at the top! if ( sc->m_pri2!=MAX_SPIDER_PRIORITIES-1){char*xx=NULL;*xx=0;} // ok, reset it so it can start a new doledb scan sc->m_didRound = false; // reset this as well. if there are no spiderRequests // available on any priority level for this collection, // then it will remain true. but if we attempt to spider // a url, or can't spider a url b/c of a max outstanding // constraint, we set this to false.
this is used to // send notifications when a crawl is basically in hiatus. //sc->m_encounteredDoledbRecs = false; //sc->m_nextDoledbKey.setMin(); } // set initial priority to the highest to start spidering there //g_spiderLoop.m_pri = MAX_SPIDER_PRIORITIES - 1; // spider some urls that were doled to us g_spiderLoop.spiderDoledUrls( ); } void doneSendingNotification ( void *state ) { EmailInfo *ei = (EmailInfo *)state; collnum_t collnum = ei->m_collnum; CollectionRec *cr = g_collectiondb.m_recs[collnum]; char *coll = "lostcoll"; if ( cr ) coll = cr->m_coll; log("spider: done sending notifications for coll=%s", coll); // all done if collection was deleted from under us if ( ! cr ) return; // we can re-use the EmailInfo class now // pingserver.cpp sets this //ei->m_inUse = false; // so we do not send for this status again, mark it as sent for // but we reset sentCrawlDoneAlert to 0 on round increment below //log("spider: setting sentCrawlDoneAlert status to %li", // (long)cr->m_spiderStatus); // mark it as sent. anytime a new url is spidered will mark this // as false again! use LOCAL crawlInfo, since global is reset often. cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 1;//cr->m_spiderStatus;//1; // be sure to save state so we do not re-send emails cr->m_needsSave = 1; // sanity if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; } // i guess each host advances its own round... so take this out // sanity check //if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; } //float respiderFreq = -1.0; float respiderFreq = cr->m_collectiveRespiderFrequency; // if not REcrawling, set this to 0 so we at least update our // round # and round start time... if ( respiderFreq < 0.0 ) //== -1.0 ) respiderFreq = 0.0; //if ( respiderFreq < 0.0 ) { // log("spider: bad respiderFreq of %f. making 0.", // respiderFreq); // respiderFreq = 0.0; //} // advance round if that round has completed, or there are no // more urls to spider. if we hit maxToProcess/maxToCrawl then // do not increment the round #. otherwise we should increment it. // do allow maxtocrawl guys through if they repeat, however! //if(cr->m_spiderStatus == SP_MAXTOCRAWL && respiderFreq <= 0.0)return; //if(cr->m_spiderStatus == SP_MAXTOPROCESS && respiderFreq<=0.0)return; //////// // // . we are here because hasUrlsReadyToSpider is false // . we just got done sending an email alert // . now increment the round only if doing rounds! // /////// // if not doing rounds, keep the round 0. they might want to up // their maxToCrawl limit or something. if ( respiderFreq <= 0.0 ) return; // if we hit the max to crawl rounds, then stop!!! do not // increment the round... if ( cr->m_spiderRoundNum >= cr->m_maxCrawlRounds && // there was a bug when maxCrawlRounds was 0, which should // mean NO max, so fix that here: cr->m_maxCrawlRounds > 0 ) return; // this should have been set below //if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; } // find the "respider frequency" from the first line in the url // filters table whose expressions contains "{roundstart}" i guess //for ( long i = 0 ; i < cr->m_numRegExs ; i++ ) { // // get it // char *ex = cr->m_regExs[i].getBufStart(); // // compare // if ( ! strstr ( ex , "roundstart" ) ) continue; // // that's good enough // respiderFreq = cr->m_spiderFreqs[i]; // break; //} long seconds = (long)(respiderFreq * 24*3600); // add 1 for lastspidertime round off errors so we can be assured // all spiders have a lastspidertime LESS than the new // m_spiderRoundStartTime we set below. 
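// A concrete example of the computation just below, to make the units
// explicit (the numbers are made up): with respiderFreq = 0.5 days,
// seconds = (long)(0.5 * 24*3600) = 43200. If getTimeGlobal() returned
// 1400000000 we would broadcast
//
//	spiderRoundStart = "1400043200"   (now + 43200, a unix timestamp)
//	spiderRoundNum   = current round + 1
//
// via g_parms.broadcastParmList() so the other hosts pick up the same round
// boundary.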
if ( seconds <= 0 ) seconds = 1; // now update this round start time. all the other hosts should // sync with us using the parm sync code, msg3e, every 13.5 seconds. //cr->m_spiderRoundStartTime += respiderFreq; char roundTime[128]; sprintf(roundTime,"%lu", (long)(getTimeGlobal() + seconds)); // roundNum++ round++ char roundStr[128]; sprintf(roundStr,"%li", cr->m_spiderRoundNum + 1); // waiting tree will usually be empty for this coll since no // spider requests had a valid spider priority, so let's rebuild! // this is not necessary because PF_REBUILD is set for the // "spiderRoundStart" parm in Parms.cpp so it will rebuild if that parm // changes already. //if ( cr->m_spiderColl ) // cr->m_spiderColl->m_waitingTreeNeedsRebuild = true; // we have to send these two parms to all in cluster now INCLUDING // ourselves, when we get it in Parms.cpp there's special // code to set this *ThisRound counts to 0!!! SafeBuf parmList; g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundStr,-1 , "spiderRoundNum"); g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundTime, -1 , "spiderRoundStart"); //g_parms.addParmToList1 ( &parmList , cr , "spiderRoundNum" ); //g_parms.addParmToList1 ( &parmList , cr , "spiderRoundStart" ); // this uses msg4 so parm ordering is guaranteed g_parms.broadcastParmList ( &parmList , NULL , NULL ); // log it log("spider: new round #%li starttime = %lu for %s" , cr->m_spiderRoundNum , cr->m_spiderRoundStartTime , cr->m_coll ); } bool sendNotificationForCollRec ( CollectionRec *cr ) { // do not send email for maxrounds hit, it will send a round done // email for that. otherwise we end up calling doneSendingEmail() // twice and increment the round twice //if ( cr->m_spiderStatus == SP_MAXROUNDS ) { // log("spider: not sending email for max rounds limit " // "since already sent for round done."); // return true; //} // wtf? caller must set this if ( ! cr->m_spiderStatus ) { char *xx=NULL; *xx=0; } log("spider: sending notification for crawl status %li in coll %s. " //"current sent state is %li" ,(long)cr->m_spiderStatus ,cr->m_coll //cr->m_spiderStatusMsg, //,(long)cr->m_localCrawlInfo.m_sentCrawlDoneAlert); ); // if we already sent it return now. we set this to false everytime // we spider a url, which resets it. use local crawlinfo for this // since we reset global. //if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) return true; // ok, send it EmailInfo *ei = &cr->m_emailInfo; // in use already? if ( ei->m_inUse ) return true; // pingserver.cpp sets this //ei->m_inUse = true; // set it up ei->m_finalCallback = doneSendingNotification; ei->m_finalState = ei; ei->m_collnum = cr->m_collnum; SafeBuf *buf = &ei->m_spiderStatusMsg; // stop it from accumulating status msgs buf->reset(); long status = -1; getSpiderStatusMsg ( cr , buf , &status ); // if no email address or webhook provided this will not block! // DISABLE THIS UNTIL FIXED //log("spider: SENDING EMAIL NOT"); // ok, put it back... if ( ! sendNotification ( ei ) ) return false; // so handle this ourselves in that case: doneSendingNotification ( ei ); return true; } // we need to update crawl info for collections that // have urls ready to spider SpiderColl *getNextSpiderColl ( long *cri ) ; void gotDoledbListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 ) ; ////////////////////////// ////////////////////////// // // The second KEYSTONE function. // // Scans doledb and spiders the doledb records. // // Doledb records contain SpiderRequests ready for spidering NOW. // // 1. 
gets all locks from all hosts in the shard // 2. sends confirm msg to all hosts if lock acquired: // - each host will remove from doledb then // - assigned host will also add new "0" entry to waiting tree if need be // - calling addToWaitingTree() will trigger populateDoledbFromWaitingTree() // to add a new entry into waiting tree, not the one just locked. // 3. makes a new xmldoc class for that url and calls indexDoc() on it // ////////////////////////// ////////////////////////// // now check our RDB_DOLEDB for SpiderRequests to spider! void SpiderLoop::spiderDoledUrls ( ) { // must be spidering to dole out if ( ! g_conf.m_spideringEnabled ) return; // if we don't have all the url counts from all hosts, then wait. // one host is probably down and was never up to begin with if ( ! s_countsAreValid ) return; //if ( ! g_conf.m_webSpideringEnabled ) return; // if we do not overlap ourselves if ( m_gettingDoledbList ) return; // bail instantly if in read-only mode (no RdbTrees!) if ( g_conf.m_readOnlyMode ) return; // or if doing a daily merge if ( g_dailyMerge.m_mergeMode ) return; // skip if too many udp slots being used if ( g_udpServer.getNumUsedSlots() >= 1300 ) return; // stop if too many out. this is now 50 down from 500. if ( m_numSpidersOut >= MAX_SPIDERS ) return; // a new global conf rule if ( m_numSpidersOut >= g_conf.m_maxTotalSpiders ) return; // bail if no collections if ( g_collectiondb.m_numRecs <= 0 ) return; // not while repairing if ( g_repairMode ) return; // do not spider until collections/parms in sync with host #0 if ( ! g_parms.m_inSyncWithHost0 ) return; // don't spider if not all hosts are up, or they do not all // have the same hosts.conf. if ( ! g_pingServer.m_hostsConfInAgreement ) return; //char *reb = g_rebalance.getNeedsRebalance(); //if ( ! reb || *reb ) {return; //if ( g_conf.m_logDebugSpider ) // log("spider: trying to get a doledb rec to spider. " // "currentnumout=%li",m_numSpidersOut); // when getting a lock we keep a ptr to the SpiderRequest in the // doledb list, so do not try to read more just yet until we know // if we got the lock or not if ( m_msg12.m_gettingLocks ) { // make a note, maybe this is why spiders are deficient? if ( g_conf.m_logDebugSpider ) log("spider: failed to get doledb rec to spider: " "msg12 is getting locks"); return; } // turn on for now //g_conf.m_logDebugSpider = 1; collLoop: // log this now //logf(LOG_DEBUG,"spider: getting collnum to dole from"); // get this m_sc = NULL; // avoid infinite loops long count = g_collectiondb.m_numRecs; // set this in the loop CollectionRec *cr = NULL; long nowGlobal = 0; // debug //log("spider: m_cri=%li",(long)m_cri); // . get the next collection to spider // . just alternate them for ( ; count > 0 ; m_cri++ , count-- ) { // wrap it if we should if ( m_cri >= g_collectiondb.m_numRecs ) m_cri = 0; // get rec cr = g_collectiondb.m_recs[m_cri]; // skip if gone if ( ! cr ) continue; // stop if not enabled if ( ! cr->m_spideringEnabled ) continue; // hit crawl round max? if ( cr->m_maxCrawlRounds > 0 && cr->m_isCustomCrawl && cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) { cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false; // prevent having to save all the time if ( cr->m_spiderStatus != SP_MAXROUNDS ) { cr->m_needsSave = true; cr->m_spiderStatus = SP_MAXROUNDS; } continue; } // hit pages to crawl max? if ( cr->m_maxToCrawl > 0 && cr->m_isCustomCrawl == 1 && // bulk jobs exempt! 
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound >= cr->m_maxToCrawl ) { // now once all hosts have no urls ready to spider // then the send email code will be called. // do it this way for code simplicity. cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false; // prevent having to save all the time if ( cr->m_spiderStatus != SP_MAXTOCRAWL ) { cr->m_needsSave = true; cr->m_spiderStatus = SP_MAXTOCRAWL; } continue; } // hit pages to process max? if ( cr->m_maxToProcess > 0 && cr->m_isCustomCrawl == 1 && // bulk jobs exempt! cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound >= cr->m_maxToProcess ) { cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false; // prevent having to save all the time if ( cr->m_spiderStatus != SP_MAXTOPROCESS ) { cr->m_needsSave = true; cr->m_spiderStatus = SP_MAXTOPROCESS; } continue; } // shortcut CrawlInfo *ci = &cr->m_localCrawlInfo; // . if nothing left to spider... // . this is what makes us fast again! but the problem // is is that if they change the url filters so that // something becomes ready to spider again we won't know // because we do not set this flag back to true in // Parms.cpp "doRebuild" because we don't want to get // another email alert if there is nothing ready to spider // after they change the parms. perhaps we should set // the # of urls spidered to the "sentEmail" flag so we // know if that changes to send another... // . bug in this... //if ( cr->m_isCustomCrawl && ! ci->m_hasUrlsReadyToSpider ) // continue; // get the spider collection for this collnum m_sc = g_spiderCache.getSpiderColl(m_cri); // skip if none if ( ! m_sc ) continue; // skip if we completed the doledb scan for every spider // priority in this collection if ( m_sc->m_didRound ) continue; // set current time, synced with host #0 nowGlobal = getTimeGlobal(); // the last time we attempted to spider a url for this coll //m_sc->m_lastSpiderAttempt = nowGlobal; // now we save this so when we restart these two times // are from where we left off so we do not end up setting // hasUrlsReadyToSpider to true which in turn sets // the sentEmailAlert flag to false, which makes us // send ANOTHER email alert!! ci->m_lastSpiderAttempt = nowGlobal; // sometimes our read of spiderdb to populate the waiting // tree using evalIpLoop() takes a LONG time because // a niceness 0 thread is taking a LONG time! so do not // set hasUrlsReadyToSpider to false because of that!! if ( m_sc->m_gettingList1 ) ci->m_lastSpiderCouldLaunch = nowGlobal; // update this for the first time in case it is never updated. // then after 60 seconds we assume the crawl is done and // we send out notifications. see below. if ( ci->m_lastSpiderCouldLaunch == 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal; // // . if doing respider with roundstarttime.... // . roundstarttime is > 0 if m_collectiveRespiderFrequency // is > 0, unless it has not been set to current time yet // . if m_collectiveRespiderFrequency was set to 0.0 then // PageCrawlBot.cpp also sets m_roundStartTime to 0. // if ( nowGlobal < cr->m_spiderRoundStartTime ) continue; // if populating this collection's waitingtree assume // we would have found something to launch as well. it might // mean the waitingtree-saved.dat file was deleted from disk // so we need to rebuild it at startup. 
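// Note on the bookkeeping in this loop: every branch that *could* have
// launched a spider (locks outstanding, spiderdb list reads in flight,
// waiting tree rebuilding, spiders already out, etc.) refreshes
// ci->m_lastSpiderCouldLaunch, while ci->m_lastSpiderAttempt is bumped once
// per pass. Elsewhere the crawl is treated as finished, and the crawl-done
// notification fires, once m_lastSpiderCouldLaunch has gone stale relative
// to m_lastSpiderAttempt for long enough (presumably SPIDER_DONE_TIMER
// seconds). Roughly, as a sketch only:
//
//	bool crawlLooksDone =
//		( ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch
//		  > SPIDER_DONE_TIMER );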
if ( m_sc->m_waitingTreeNeedsRebuild ) ci->m_lastSpiderCouldLaunch = nowGlobal; // get max spiders long maxSpiders = cr->m_maxNumSpiders; if ( m_sc->m_isTestColl ) { // parser does one at a time for consistency if ( g_conf.m_testParserEnabled ) maxSpiders = 1; // need to make it 6 since some priorities essentially // lock the ips up that have urls in higher // priorities. i.e. once we dole a url out for ip X, // then if later we add a high priority url for IP X it // can't get spidered until the one that is doled does. if ( g_conf.m_testSpiderEnabled ) maxSpiders = 6; } // if some spiders are currently outstanding if ( m_sc->m_spidersOut ) // do not end the crawl until empty of urls because // that url might end up adding more links to spider // when it finally completes ci->m_lastSpiderCouldLaunch = nowGlobal; // debug log //if ( g_conf.m_logDebugSpider ) // log("spider: has %li spiders out",m_sc->m_spidersOut); // obey max spiders per collection too if ( m_sc->m_spidersOut >= maxSpiders ) continue; // shortcut SpiderColl *sc = cr->m_spiderColl; if ( sc && sc->m_doleIpTable.isEmpty() ) continue; /* // . HACK. // . TODO: we set spidercoll->m_gotDoledbRec to false above, // then make Rdb.cpp set spidercoll->m_gotDoledbRec to // true if it receives a rec. then if we see that it // got set to true do not update m_nextKeys[i] or // m_isDoledbEmpty[i] etc. because we might have missed // the new doledb rec coming in. // . reset our doledb empty timer every 3 minutes and also // . reset our doledb empty status long wait = SPIDER_DONE_TIMER - 10; if ( wait < 10 ) { char *xx=NULL;*xx=0; } if ( sc && nowGlobal - sc->m_lastEmptyCheck >= wait ) { // assume doledb not empty sc->m_allDoledbPrioritiesEmpty = 0; // reset the timer sc->m_lastEmptyCheck = nowGlobal; // reset all empty flags for each priority for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) { sc->m_nextKeys[i] = g_doledb.makeFirstKey2(i); sc->m_isDoledbEmpty[i] = 0; } } // . if all doledb priorities are empty, skip it quickly // . do this only after we update lastSpiderAttempt above // . this is broken!! why?? if ( sc && sc->m_allDoledbPrioritiesEmpty >= 3 ) continue; */ // ok, we are good to launch a spider for coll m_cri break; } // if none, bail, wait for sleep wrapper to re-call us later on if ( count == 0 ) return; // sanity check if ( nowGlobal == 0 ) { char *xx=NULL;*xx=0; } // sanity check if ( m_cri >= g_collectiondb.m_numRecs ) { char *xx=NULL;*xx=0; } // grab this //collnum_t collnum = m_cri; //CollectionRec *cr = g_collectiondb.m_recs[collnum]; // update the crawlinfo for this collection if it has been a while. // should never block since callback is NULL. //if ( ! updateCrawlInfo(cr,NULL,NULL,true) ) { char *xx=NULL;*xx=0; } // get this //char *coll = cr->m_coll; // need this for msg5 call key_t endKey; endKey.setMax(); // start at the top each time now //m_sc->m_nextDoledbKey.setMin(); // set the key to this at start (break Spider.cpp:4683 //m_sc->m_nextDoledbKey = g_doledb.makeFirstKey2 ( m_sc->m_pri ); // init the m_priorityToUfn map array? if ( ! m_sc->m_ufnMapValid ) { // reset all priorities to map to a ufn of -1 for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) m_sc->m_priorityToUfn[i] = -1; // initialize the map that maps priority to first ufn that uses // that priority. map to -1 if no ufn uses it. 
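// . worked example (hypothetical url filter table) of the map built below:
//     rule 0: priority 50, rule 1: priority 50, rule 2: priority 20,
//     rule 3: priority -3 (filtered)
//   that yields m_priorityToUfn[50] = 0 (the first rule using a priority
//   wins), m_priorityToUfn[20] = 2, and every other slot stays -1 because
//   no url filter rule uses that priority.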
for ( long i = 0 ; i < cr->m_numRegExs ; i++ ) { // breathe QUICKPOLL ( MAX_NICENESS ); // get the ith rule priority long sp = cr->m_spiderPriorities[i]; // must not be filtered or banned if ( sp < 0 ) continue; // sanity if ( sp >= MAX_SPIDER_PRIORITIES){char *xx=NULL;*xx=0;} // skip if already mapped if ( m_sc->m_priorityToUfn[sp] != -1 ) continue; // map that m_sc->m_priorityToUfn[sp] = i; } // all done m_sc->m_ufnMapValid = true; } loop: // shortcut CrawlInfo *ci = &cr->m_localCrawlInfo; // bail if waiting for lock reply, no point in reading more if ( m_msg12.m_gettingLocks ) { // assume we would have launched a spider for this coll ci->m_lastSpiderCouldLaunch = nowGlobal; // wait for sleep callback to re-call us in 10ms return; } // reset priority when it goes bogus if ( m_sc->m_pri2 < 0 ) { // i guess the scan is complete for this guy m_sc->m_didRound = true; // count # of priority scan rounds done //m_sc->m_numRoundsDone++; // reset for next coll m_sc->m_pri2 = MAX_SPIDER_PRIORITIES - 1; // reset key now too since this coll was exhausted //m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri ); // we can't keep starting over because there are often tons // of annihilations between positive and negative keys // and causes massive disk slow down because we have to do // like 300 re-reads or more of about 2k each on coeus m_sc->m_nextDoledbKey = m_sc->m_nextKeys [ m_sc->m_pri2 ]; // and this m_sc->m_msg5StartKey = m_sc->m_nextDoledbKey; // was it all empty? if did not encounter ANY doledb recs // after scanning all priorities, set empty to true. //if ( ! m_sc->m_encounteredDoledbRecs && // // if waiting tree is rebuilding... could be empty... // ! m_sc->m_waitingTreeNeedsRebuild ) // m_sc->m_lastDoledbReadEmpty = true; // and go up top goto collLoop; } // . skip priority if we knows its empty in doledb // . this will save us a call to msg5 below //if ( m_sc->m_isDoledbEmpty [ m_sc->m_pri2 ] ) { // // decrease the priority // m_sc->devancePriority(); // // and try the one below // goto loop; //} // shortcut //CollectionRec *cr = m_sc->m_cr; // sanity if ( cr != m_sc->m_cr ) { char *xx=NULL;*xx=0; } // skip the priority if we already have enough spiders on it long out = m_sc->m_outstandingSpiders[m_sc->m_pri2]; // how many spiders can we have out? long max = 0; for ( long i =0 ; i < cr->m_numRegExs ; i++ ) { if ( cr->m_spiderPriorities[i] != m_sc->m_pri2 ) continue; //if ( ! cr->m_spidersEnabled[i] ) continue; if ( cr->m_maxSpidersPerRule[i] > max ) max = cr->m_maxSpidersPerRule[i]; } // get the max # of spiders over all ufns that use this priority! //long max = getMaxAllowableSpidersOut ( m_sc->m_pri2 ); //long ufn = m_sc->m_priorityToUfn[m_sc->m_pri2]; // how many can we have? crap, this is based on ufn, not priority // so we need to map the priority to a ufn that uses that priority //long max = 0; // see if it has a maxSpiders, if no ufn uses this priority then // "max" will remain set to 0 //if ( ufn >= 0 ) max = m_sc->m_cr->m_maxSpidersPerRule[ufn]; // turned off? //if ( ufn >= 0 && ! m_sc->m_cr->m_spidersEnabled[ufn] ) max = 0; // if we have one out, do not end the round! if ( out > 0 ) { // assume we could have launched a spider ci->m_lastSpiderCouldLaunch = nowGlobal; } // always allow at least 1, they can disable spidering otherwise // no, we use this to disabled spiders... if ( max <= 0 ) max = 1; // skip? if ( out >= max ) { // count as non-empty then! 
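// . devancePriority() (defined after this function) just steps m_pri2 down
//   one priority and restores that priority's saved doledb cursor, roughly:
//     m_pri2--; m_nextDoledbKey = m_nextKeys[m_pri2];
//     m_msg5StartKey = m_nextDoledbKey;
//   so each priority resumes where its last doledb read left off instead of
//   rescanning that priority from its first key.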
//m_sc->m_encounteredDoledbRecs = true; // try the priority below us m_sc->devancePriority(); //m_sc->m_pri--; // set the new key for this priority if valid //if ( m_sc->m_pri >= 0 ) // //m_sc->m_nextDoledbKey = // // g_doledb.makeFirstKey2(m_sc->m_pri); // m_sc->m_nextDoledbKey = m_sc->m_nextKeys[m_sc->m_pri2]; // and try again goto loop; } // we only launch one spider at a time... so lock it up m_gettingDoledbList = true; // log this now if ( g_conf.m_logDebugSpider ) m_doleStart = gettimeofdayInMillisecondsLocal(); // debug if ( g_conf.m_logDebugSpider && m_sc->m_msg5StartKey != m_sc->m_nextDoledbKey ) log("spider: msg5startKey differs from nextdoledbkey"); // get a spider rec for us to spider from doledb (mdw) if ( ! m_msg5.getList ( RDB_DOLEDB , cr->m_collnum, // coll , &m_list , m_sc->m_msg5StartKey,//m_sc->m_nextDoledbKey, endKey , // need to make this big because we don't // want to end up getting just a negative key //1 , // minRecSizes (~ 7000) // we need to read in a lot because we call // "goto listLoop" below if the url we want // to dole is locked. // seems like a ton of negative recs 2000 , // minRecSizes true , // includeTree false , // addToCache 0 , // max cache age 0 , // startFileNum -1 , // numFiles (all) this , // state gotDoledbListWrapper2 , MAX_NICENESS , // niceness true ))// do error correction? // return if it blocked return ; // debug //log(LOG_DEBUG,"spider: read list of %li bytes from spiderdb for " // "pri=%li+",m_list.m_listSize,(long)m_sc->m_pri); // breathe QUICKPOLL ( MAX_NICENESS ); // . add urls in list to cache // . returns true if we should read another list // . will set startKey to next key to start at if ( gotDoledbList2 ( ) ) { // . if priority is -1 that means try next priority // . DO NOT reset the whole scan. that was what was happening // when we just had "goto loop;" here // . this means a reset above!!! //if ( m_sc->m_pri2 == -1 ) return; // bail if waiting for lock reply, no point in reading more // mdw- i moved this check up to loop: jump point. //if ( m_msg12.m_gettingLocks ) return; // gotDoledbList2() always advances m_nextDoledbKey so // try another read goto loop; } // wait for the msg12 get lock request to return... // or maybe spiders are off return; } // . decrement priority // . will also set m_sc->m_nextDoledbKey // . will also set m_sc->m_msg5StartKey void SpiderColl::devancePriority() { // try next m_pri2 = m_pri2 - 1; // how can this happen? if ( m_pri2 < -1 ) m_pri2 = -1; // bogus? if ( m_pri2 < 0 ) return; // set to next priority otherwise //m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri ); m_nextDoledbKey = m_nextKeys [m_pri2]; // and the read key m_msg5StartKey = m_nextDoledbKey; } void gotDoledbListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 ) { // process the doledb list and try to launch a spider g_spiderLoop.gotDoledbList2(); // regardless of whether that blocked or not try to launch another // and try to get the next SpiderRequest from doledb g_spiderLoop.spiderDoledUrls(); } // . this is in seconds // . had to make this 4 hours since one url was taking more than an hour // to lookup over 80,000 places in placedb. after an hour it had only // reached about 30,000 // http://pitchfork.com/news/tours/833-julianna-barwick-announces-european-and-north-american-dates/ // . this problem with this now is that it will lock an entire IP until it // expires if we have maxSpidersPerIp set to 1. 
so now we try to add // a SpiderReply for local errors like when XmlDoc::indexDoc() sets g_errno, // we try to add a SpiderReply at least. #define MAX_LOCK_AGE (3600*4) // spider the spider rec in this list from doledb bool SpiderLoop::gotDoledbList2 ( ) { // unlock m_gettingDoledbList = false; // shortcuts CollectionRec *cr = m_sc->m_cr; CrawlInfo *ci = &cr->m_localCrawlInfo; // update m_msg5StartKey for next read if ( m_list.getListSize() > 0 ) { m_list.getLastKey((char *)&m_sc->m_msg5StartKey); m_sc->m_msg5StartKey += 1; // i guess we had something? wait for nothing to be there //m_sc->m_encounteredDoledbRecs = true; } // log this now if ( g_conf.m_logDebugSpider ) { long long now = gettimeofdayInMillisecondsLocal(); long long took = now - m_doleStart; if ( took > 2 ) logf(LOG_DEBUG,"spider: GOT list from doledb in " "%llims " "size=%li bytes", took,m_list.getListSize()); } bool bail = false; // bail instantly if in read-only mode (no RdbTrees!) if ( g_conf.m_readOnlyMode ) bail = true; // or if doing a daily merge if ( g_dailyMerge.m_mergeMode ) bail = true; // skip if too many udp slots being used if ( g_udpServer.getNumUsedSlots() >= 1300 ) bail = true; // stop if too many out if ( m_numSpidersOut >= MAX_SPIDERS ) bail = true; if ( bail ) { // assume we could have launched a spider ci->m_lastSpiderCouldLaunch = getTimeGlobal(); // return false to indicate to try another return false; } // bail if list is empty if ( m_list.getListSize() <= 0 ) { // don't bother with this priority again until a key is // added to it! addToDoleIpTable() will be called // when that happens and it might unset this then. m_sc->m_isDoledbEmpty [ m_sc->m_pri2 ] = 1; /* // if all priorities now empty set another flag m_sc->m_allDoledbPrioritiesEmpty++; for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) { if ( m_sc->m_isDoledbEmpty[m_sc->m_pri2] ) continue; // must get empties 3 times in a row to ignore it // in case something was added to doledb while // we were reading from doledb. m_sc->m_allDoledbPrioritiesEmpty--; break; } */ // if no spiders... //if ( g_conf.m_logDebugSpider ) { // log("spider: empty doledblist collnum=%li " // "inwaitingtree=%li", // (long)cr->m_collnum, // m_sc->m_waitingTree.m_numUsedNodes); //} //if ( g_conf.m_logDebugSpider ) // log("spider: resetting doledb priority pri=%li", // m_sc->m_pri); // trigger a reset //m_sc->m_pri = -1; // . let the sleep timer init the loop again! // . no, just continue the loop //return true; // . this priority is EMPTY, try next // . will also set m_sc->m_nextDoledbKey // . will also set m_sc->m_msg5StartKey m_sc->devancePriority(); // this priority is EMPTY, try next //m_sc->m_pri = m_sc->m_pri - 1; // how can this happen? 
//if ( m_sc->m_pri < -1 ) m_sc->m_pri = -1; // all done if priority is negative, it will start over // at the top most priority, we've completed a round //if ( m_sc->m_pri < 0 ) return true; // set to next priority otherwise //m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri ); //m_sc->m_nextDoledbKey = m_sc->m_nextKeys [m_sc->m_pri]; // and load that list from doledb for that priority return true; } // if debugging the spider flow show the start key if list non-empty /*if ( g_conf.m_logDebugSpider ) { // 12 byte doledb keys long pri = g_doledb.getPriority(&m_sc->m_nextDoledbKey); long stm = g_doledb.getSpiderTime(&m_sc->m_nextDoledbKey); long long uh48 = g_doledb.getUrlHash48(&m_sc->m_nextDoledbKey); logf(LOG_DEBUG,"spider: loading list from doledb startkey=%s" " pri=%li time=%lu uh48=%llu", KEYSTR(&m_sc->m_nextDoledbKey,12), pri, stm, uh48); }*/ time_t nowGlobal = getTimeGlobal(); // double check //if ( ! m_list.checkList_r( true , false, RDB_DOLEDB) ) { // char *xx=NULL;*xx=0; } // debug parm //long lastpri = -2; //long lockCount = 0; // reset ptr to point to first rec in list m_list.resetListPtr(); listLoop: // all done if empty //if ( m_list.isExhausted() ) { // // copied from above // m_sc->m_didRound = true; // // and try next collection immediately // return true; //} // breathe QUICKPOLL(MAX_NICENESS); // get the current rec from list ptr char *rec = (char *)m_list.getListPtr(); // the doledbkey key_t *doledbKey = (key_t *)rec; // get record after it next time m_sc->m_nextDoledbKey = *doledbKey ; // sanity check -- wrap watch -- how can this really happen? if ( m_sc->m_nextDoledbKey.n1 == 0xffffffff && m_sc->m_nextDoledbKey.n0 == 0xffffffffffffffffLL ) { char *xx=NULL;*xx=0; } // only inc it if its positive! because we do have negative // doledb keys in here now //if ( (m_sc->m_nextDoledbKey & 0x01) == 0x01 ) // m_sc->m_nextDoledbKey += 1; // if its negative inc by two then! this fixes the bug where the // list consisted only of one negative key and was spinning forever //else // m_sc->m_nextDoledbKey += 2; // if its negative inc by two then! this fixes the bug where the // list consisted only of one negative key and was spinning forever if ( (m_sc->m_nextDoledbKey & 0x01) == 0x00 ) m_sc->m_nextDoledbKey += 2; // did it hit zero? that means it wrapped around! if ( m_sc->m_nextDoledbKey.n1 == 0x0 && m_sc->m_nextDoledbKey.n0 == 0x0 ) { // TODO: work this out char *xx=NULL;*xx=0; } // get priority from doledb key long pri = g_doledb.getPriority ( doledbKey ); if ( g_conf.m_logDebugSpider ) log("spider: setting pri2=%li nextkey to %s", m_sc->m_pri2,KEYSTR(&m_sc->m_nextDoledbKey,12)); // update next doledbkey for this priority to avoid having to // process excessive positive/negative key annihilations (mdw) m_sc->m_nextKeys [ m_sc->m_pri2 ] = m_sc->m_nextDoledbKey; // sanity if ( pri < 0 || pri >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; } // skip the priority if we already have enough spiders on it long out = m_sc->m_outstandingSpiders[pri]; // get the first ufn that uses this priority //long max = getMaxAllowableSpidersOut ( pri ); // how many spiders can we have out? long max = 0; // in milliseconds. how long to wait between downloads from same IP. // only for parent urls, not including child docs like robots.txt, // iframe contents, etc. long sameIpWaitTime = 5000; // 250; // ms long maxSpidersOutPerIp = 1; for ( long i = 0 ; i < cr->m_numRegExs ; i++ ) { if ( cr->m_spiderPriorities[i] != m_sc->m_pri2 ) continue; //if ( !
cr->m_spidersEnabled[i] ) continue; if ( cr->m_maxSpidersPerRule[i] > max ) max = cr->m_maxSpidersPerRule[i]; if ( cr->m_spiderIpWaits[i] < sameIpWaitTime ) sameIpWaitTime = cr->m_spiderIpWaits[i]; if ( cr->m_spiderIpMaxSpiders[i] > maxSpidersOutPerIp ) maxSpidersOutPerIp = cr->m_spiderIpMaxSpiders[i]; } //long ufn = m_sc->m_priorityToUfn[pri]; // how many can we have? crap, this is based on ufn, not priority // so we need to map the priority to a ufn that uses that priority //long max = 0; // see if it has a maxSpiders, if no ufn uses this priority then // "max" will remain set to 0 //if ( ufn >= 0 ) max = m_sc->m_cr->m_maxSpidersPerRule[ufn]; // turned off? //if ( ufn >= 0 && ! m_sc->m_cr->m_spidersEnabled[ufn] ) max = 0; // if we skipped over the priority we wanted, update that //m_pri = pri; // then do the next one after that for next round //m_pri--; // always allow at least 1, they can disable spidering otherwise //if ( max <= 0 ) max = 1; // skip? and re-get another doledb list from next priority... if ( out >= max ) { // come here if we hit our ip limit too hitMax: // assume we could have launched a spider if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal; // this priority is maxed out, try next m_sc->devancePriority(); // assume not an empty read //m_sc->m_encounteredDoledbRecs = true; //m_sc->m_pri = pri - 1; // all done if priority is negative //if ( m_sc->m_pri < 0 ) return true; // set to next priority otherwise //m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri ); //m_sc->m_nextDoledbKey = m_sc->m_nextKeys [m_sc->m_pri]; // and load that list return true; } // no negatives - wtf? // if only the tree has doledb recs, Msg5.cpp does not remove // the negative recs... it doesn't bother to merge. if ( (doledbKey->n0 & 0x01) == 0 ) { // just increment then i guess m_list.skipCurrentRecord(); // if exhausted -- try another load with m_nextKey set if ( m_list.isExhausted() ) return true; // otherwise, try the next doledb rec in this list goto listLoop; } // what is this? a dataless positive key? if ( m_list.getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0; } // get the "spider rec" (SpiderRequest) (embedded in the doledb rec) SpiderRequest *sreq = (SpiderRequest *)(rec + sizeof(key_t)+4); // sanity check. check for http(s):// if ( sreq->m_url[0] != 'h' && // might be a docid from a pagereindex.cpp ! is_digit(sreq->m_url[0]) ) { // note it if ( (g_corruptCount % 1000) == 0 ) log("spider: got corrupt doledb record. ignoring. " "pls fix!!!"); g_corruptCount++; // skip for now....!! what is causing this??? m_list.skipCurrentRecord(); // if exhausted -- try another load with m_nextKey set if ( m_list.isExhausted() ) return true; // otherwise, try the next doledb rec in this list goto listLoop; } // . how many spiders out for this ip now? // . TODO: count locks in case twin is spidering... but it did not seem // to work right for some reason long ipOut = 0; for ( long i = 0 ; i <= m_maxUsed ; i++ ) { // get it XmlDoc *xd = m_docs[i]; if ( ! xd ) continue; if ( ! xd->m_sreqValid ) continue; if ( xd->m_sreq.m_firstIp == sreq->m_firstIp ) ipOut++; } if ( ipOut >= maxSpidersOutPerIp ) goto hitMax; if ( g_conf.m_logDebugSpider ) log("spider: %li spiders out for %s for %s",ipOut,iptoa(sreq->m_firstIp), sreq->m_url); // sometimes we have it locked, but is still in doledb i guess. // seems like we might have give the lock to someone else and // there confirmation has not come through yet, so it's still // in doledb. 
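// . the lock key tested below appears to be built from the url hash plus
//   the firstIp (Msg12::getLocks() further down does
//   makeLockTableKey ( uh48 , firstIp ) for the same purpose), so requests
//   for the same url/firstIp share one UrlLock entry. a lock that is
//   present AND confirmed means a spider (ours or a twin's) already owns
//   the url, so this doledb rec gets skipped rather than doled out twice.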
HashTableX *ht = &g_spiderLoop.m_lockTable; // shortcut long long lockKey = makeLockTableKey ( sreq ); // get the lock... only avoid if confirmed! long slot = ht->getSlot ( &lockKey ); UrlLock *lock = NULL; if ( slot >= 0 ) // get the corresponding lock then if there lock = (UrlLock *)ht->getValueFromSlot ( slot ); // if there and confirmed, why still in doledb? if ( lock && lock->m_confirmed ) { // why is it not getting unlocked!?!?! log("spider: spider request locked but still in doledb. " "uh48=%lli firstip=%s %s", sreq->getUrlHash48(), iptoa(sreq->m_firstIp),sreq->m_url ); // just increment then i guess m_list.skipCurrentRecord(); // let's return false here to avoid an infinite loop // since we are not advancing nextkey and m_pri is not // being changed, that is what happens! if ( m_list.isExhausted() ) { // crap. but then we never make it to lower priorities. // since we are returning false. so let's try the // next priority in line. //m_sc->m_pri--; m_sc->devancePriority(); // try returning true now that we skipped to // the next priority level to avoid the infinite // loop as described above. return true; //return false;//true; } // try the next record in this list goto listLoop; } // . no no! the SpiderRequests in doledb are in our group because // doledb is split based on ... firstIp i guess... // BUT now lock is done based on probable docid since we do not // know the firstIp if injected spider requests but we do know // their probable docids since that is basically a function of // the url itself. THUS we now must realize this by trying to // get the lock for it and failing! /* // . likewise, if this request is already being spidered, if it // is in the lock table, skip it... // . if this is currently locked for spidering by us or another // host (or by us) then return true here HashTableX *ht = &g_spiderLoop.m_lockTable; // shortcut //long long uh48 = sreq->getUrlHash48(); // get the lock key unsigned long long lockKey ; lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId); // check tree long slot = ht->getSlot ( &lockKey ); // if more than an hour old nuke it and clear it if ( slot >= 0 ) { // get the corresponding lock then if there UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot ); // if 1hr+ old, nuke it and disregard if ( nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) { // unlock it ht->removeSlot ( slot ); // it is gone slot = -1; } } // if there say no no -- will try next spiderrequest in doledb then if ( slot >= 0 ) { // just increment then i guess m_list.skipCurrentRecord(); // count locks //if ( pri == lastpri ) lockCount++; //else lockCount = 1; //lastpri = pri; // how is it we can have 2 locked but only 1 outstanding // for the same priority? // basically this url is done being spidered, but we have // not yet processed the negative doledb key in Rdb.cpp which // will remove the lock from the lock table... so this // situation is perfectly fine i guess.. assuming that is // what is going on //if ( lockCount >= max ) { char *xx=NULL;*xx=0; } // this is not good static bool s_flag = false; if ( ! s_flag ) { s_flag = true; log("spider: got url %s that is locked but in dole " "table... skipping",sreq->m_url); } // if exhausted -- try another load with m_nextKey set if ( m_list.isExhausted() ) return true; // otherwise, try the next doledb rec in this list goto listLoop; } */ // force this set i guess... why isn't it set already? i guess when // we added the spider request to doledb it was not set at that time //sreq->m_doled = 1; // // sanity check. 
verify the spiderrequest also exists in our // spidercache. we no longer store doled out spider requests in our // cache!! they are separate now. // //if ( g_conf.m_logDebugSpider ) { // // scan for it since we may have dup requests // long long uh48 = sreq->getUrlHash48(); // long long pdocid = sreq->getParentDocId(); // // get any request from our urlhash table // SpiderRequest *sreq2 = m_sc->getSpiderRequest2 (&uh48,pdocid); // // must be there. i guess it could be missing if there is // // corruption and we lost it in spiderdb but not in doledb... // if ( ! sreq2 ) { char *xx=NULL;*xx=0; } //} // log this now if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: trying to spider url %s",sreq->m_url); /* if ( ufn >= 0 ) { long siwt = m_sc->m_cr->m_spiderIpWaits[ufn]; if ( siwt >= 0 ) sameIpWaitTime = siwt; } if ( ufn >= 0 ) { maxSpidersOutPerIp = m_sc->m_cr->m_spiderIpMaxSpiders[ufn]; if ( maxSpidersOutPerIp < 0 ) maxSpidersOutPerIp = 999; } */ // assume we launch the spider below. really this timestamp indicates // the last time we COULD HAVE LAUNCHED *OR* did actually launch // a spider ci->m_lastSpiderCouldLaunch = nowGlobal; // set crawl done email sent flag so another email can be sent again // in case the user upped the maxToCrawl limit, for instance, // so that the crawl could continue. //ci->m_sentCrawlDoneAlert = 0; // if we thought we were done, note it if something comes back up if ( ! ci->m_hasUrlsReadyToSpider ) log("spider: got a reviving url for coll %s (%li) to crawl %s", cr->m_coll,(long)cr->m_collnum,sreq->m_url); // there are urls ready to spider ci->m_hasUrlsReadyToSpider = true; // newly created crawls usually have this set to false so set it // to true so getSpiderStatus() does not return that "the job // is completed and no repeat is scheduled"... if ( cr->m_spiderStatus == SP_INITIALIZING ) { // this is the GLOBAL crawl info, not the LOCAL, which // is what "ci" represents... cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true; // set this right i guess...? ci->m_lastSpiderAttempt = nowGlobal; } // reset reason why crawl is not running, because we basically are now cr->m_spiderStatus = SP_INPROGRESS; // this is 7 //cr->m_spiderStatusMsg = NULL; // be sure to save state so we do not re-send emails cr->m_needsSave = 1; // assume not an empty read //m_sc->m_encounteredDoledbRecs = true; // shortcut //char *coll = m_sc->m_cr->m_coll; // sometimes the spider coll is reset/deleted while we are // trying to get the lock in spiderUrl9() so let's use collnum collnum_t collnum = m_sc->m_cr->m_collnum; // . spider that. we don't care wheter it blocks or not // . crap, it will need to block to get the locks! // . so at least wait for that!!! // . but if we end up launching the spider then this should NOT // return false! only return false if we should hold up the doledb // scan // . this returns true right away if it failed to get the lock... // which means the url is already locked by someone else... // . it might also return true if we are already spidering the url bool status = spiderUrl9 ( sreq , doledbKey , collnum, sameIpWaitTime , maxSpidersOutPerIp ) ; // just increment then i guess m_list.skipCurrentRecord(); // if it blocked, wait for it to return to resume the doledb list // processing because the msg12 is out and we gotta wait for it to // come back. when lock reply comes back it tries to spider the url // then it tries to call spiderDoledUrls() to keep the spider queue // spidering fully. if ( ! 
status ) return false; // if exhausted -- try another load with m_nextKey set if ( m_list.isExhausted() ) { // if no more in list, fix the next doledbkey, // m_sc->m_nextDoledbKey log ( LOG_DEBUG,"spider: list exhausted."); return true; } // otherwise, it might have been in the lock cache and quickly // rejected, or rejected for some other reason, so try the next // doledb rec in this list goto listLoop; // // otherwise, it blocked, trying to get the lock across the network. // so reset the doledb scan assuming it will go through. if it does // NOT get the lock, then it will be in the lock cache for quick // "return true" from spiderUrl() above next time we try it. // // once we get a url from doledb to spider, reset our doledb scan. // that way if a new url gets added to doledb that is high priority // then we get it right away. // // NO! because the lock request can block then fail!! and we end // up resetting and in an infinite loop! // //m_sc->m_pri = -1; //return false; } // . spider the next url that needs it the most // . returns false if blocked on a spider launch, otherwise true. // . returns false if your callback will be called // . returns true and sets g_errno on error bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq , key_t *doledbKey , //char *coll , collnum_t collnum , long sameIpWaitTime , long maxSpidersOutPerIp ) { // sanity check //if ( ! sreq->m_doled ) { char *xx=NULL;*xx=0; } // if waiting on a lock, wait if ( m_msg12.m_gettingLocks ) { char *xx=NULL;*xx=0; } // sanity if ( ! m_sc ) { char *xx=NULL;*xx=0; } // sanity check // core dump? just re-run gb and restart the parser test... if ( //g_test.m_isRunning && //! g_test.m_spiderLinks && g_conf.m_testParserEnabled && ! sreq->m_isInjecting ) { char *xx=NULL;*xx=0; } // wait until our clock is synced with host #0 before spidering since // we store time stamps in the domain and ip wait tables in // SpiderCache.cpp. We don't want to freeze domain for a long time // because we think we have to wait until tomorrow before we can // spider it. if ( ! isClockInSync() ) { // let admin know why we are not spidering static char s_printed = false; if ( ! s_printed ) { logf(LOG_DEBUG,"spider: NOT SPIDERING until clock " "is in sync with host #0."); s_printed = true; } return true; } // turned off? if ( ( (! g_conf.m_spideringEnabled ) && // ! g_conf.m_webSpideringEnabled ) && ! sreq->m_isInjecting ) || // repairing the collection's rdbs? g_repairMode || // power went off? ! g_process.m_powerIsOn ) { // try to cancel outstanding spiders, ignore injects for ( long i = 0 ; i <= m_maxUsed ; i++ ) { // get it XmlDoc *xd = m_docs[i]; if ( ! xd ) continue; //if ( xd->m_sreq.m_isInjecting ) continue; // let everyone know, TcpServer::cancel() uses this in // destroySocket() g_errno = ECANCELLED; // cancel the socket trans who has "xd" as its state. // this will cause XmlDoc::gotDocWrapper() to be called // now, on this call stack with g_errno set to // ECANCELLED. But if Msg16 was not in the middle of // HttpServer::getDoc() then this will have no effect. g_httpServer.cancel ( xd );//, g_msg13RobotsWrapper ); // cancel any Msg13 that xd might have been waiting for g_udpServer.cancel ( &xd->m_msg13 , 0x13 ); } return true; } // do not launch any new spiders if in repair mode if ( g_repairMode ) { g_conf.m_spideringEnabled = false; //g_conf.m_injectionEnabled = false; return true; } // do not launch another spider if less than 25MB of memory available. 
// this causes us to dead lock when spiders use up all the mem, and // file merge operation can not get any, and spiders need to add to // titledb but can not until the merge completes!! if ( g_mem.m_maxMem - g_mem.m_used < 25*1024*1024 ) { static long s_lastTime = 0; static long s_missed = 0; s_missed++; long now = getTime(); // don't spam the log, bug let people know about it if ( now - s_lastTime > 10 ) { log("spider: Need 25MB of free mem to launch spider, " "only have %lli. Failed to launch %li times so " "far.", g_mem.m_maxMem - g_mem.m_used , s_missed ); s_lastTime = now; } } // we store this in msg12 for making a fakedb key //collnum_t collnum = g_collectiondb.getCollnum ( coll ); // shortcut long long lockKeyUh48 = makeLockTableKey ( sreq ); //unsigned long long lockKey ; //lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId); //lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId); // . now that we have to use msg12 to see if the thing is locked // to avoid spidering it.. (see comment in above function) // we often try to spider something we are already spidering. that // is why we have an rdbcache, m_lockCache, to make these lock // lookups quick, now that the locking group is usually different // than our own! // . we have to check this now because removeAllLocks() below will // remove a lock that one of our spiders might have. it is only // sensitive to our hostid, not "spider id" // sometimes we exhaust the doledb and m_nextDoledbKey gets reset // to zero, we do a re-scan and get a doledbkey that is currently // being spidered or is waiting for its negative doledb key to // get into our doledb tree for ( long i = 0 ; i <= m_maxUsed ; i++ ) { // get it XmlDoc *xd = m_docs[i]; if ( ! xd ) continue; // jenkins was coring spidering the same url in different // collections at the same time if ( ! xd->m_collnumValid ) continue; if ( xd->m_collnum != collnum ) continue; // . problem if it has our doledb key! // . this happens if we removed the lock above before the // spider returned!! that's why you need to set // MAX_LOCK_AGE to like an hour or so // . i've also seen this happen because we got stuck looking // up like 80,000 places and it was taking more than an // hour. it had only reach about 30,000 after an hour. // so at this point just set the lock timeout to // 4 hours i guess. // . i am seeing this again and we are trying over and over // again to spider the same url and hogging the cpu so // we need to keep this sanity check in here for times // like this if ( xd->m_doledbKey == *doledbKey ) { // just note it for now log("spider: spidering same url %s twice. " "different firstips?", xd->m_firstUrl.m_url); //char *xx=NULL;*xx=0; } } // keep chugging continue; //if ( xd->m_doledbKey != *doledbKey ) continue; // count it as processed m_processed++; // log it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: we are already spidering %s " "lockkey=%llu",sreq->m_url,lockKeyUh48); // all done, no lock granted... return true; } // reset g_errno g_errno = 0; // breathe QUICKPOLL(MAX_NICENESS); // sanity. ensure m_sreq doesn't change from under us i guess if ( m_msg12.m_gettingLocks ) { char *xx=NULL;*xx=0; } // get rid of this crap for now //g_spiderCache.meterBandwidth(); // save these in case getLocks() blocks m_sreq = sreq; m_doledbKey = doledbKey; //m_coll = coll; m_collnum = collnum; // if we already have the lock then forget it. this can happen // if spidering was turned off then back on. 
// MDW: TODO: we can't do this anymore since we no longer have // the lockTable check above because we do not control our own // lock now necessarily. it often is in another group's lockTable. //if ( g_spiderLoop.m_lockTable.isInTable(&lockKey) ) { // log("spider: already have lock for lockKey=%llu",lockKey); // // proceed // return spiderUrl2(); //} // flag it so m_sreq does not "disappear" m_msg12.m_gettingLocks = true; // count it m_processed++; //if ( g_conf.m_logDebugSpider ) // logf(LOG_DEBUG,"spider: getting lock for %s",m_sreq->m_url); // // . try to get the lock. assume it always blocks // . it will call spiderUrl2 with sr when it gets a reply // . if injecting, no need for lock! will return true for that! // if ( ! m_msg12.getLocks ( m_sreq->getUrlHash48() , //m_sreq->m_probDocId,//UrlHash48(), m_sreq->m_url , m_doledbKey , collnum, sameIpWaitTime, maxSpidersOutPerIp, m_sreq->m_firstIp, NULL , // state NULL ) ) // callback return false; // no go m_msg12.m_gettingLocks = false; // it will not block if the lock was found in our m_lockCache! return true; // should always block now! //char *xx=NULL;*xx=0; // i guess we got it //return spiderUrl2 ( ); //return true; } bool SpiderLoop::spiderUrl2 ( ) { // sanity check //if ( ! m_sreq->m_doled ) { char *xx=NULL;*xx=0; } // . find an available doc slot // . we can have up to MAX_SPIDERS spiders (300) long i; for ( i = 0 ; i < MAX_SPIDERS ; i++ ) if ( ! m_docs[i] ) break; if ( i >= MAX_SPIDERS ) { log(LOG_DEBUG,"build: Already have %li outstanding spiders.", (long)MAX_SPIDERS); char *xx = NULL; *xx = 0; } // breathe QUICKPOLL(MAX_NICENESS); XmlDoc *xd; // otherwise, make a new one if we have to try { xd = new (XmlDoc); } // bail on failure, sleep and try again catch ( ... ) { g_errno = ENOMEM; log("build: Could not allocate %li bytes to spider " "the url %s. Will retry later.", (long)sizeof(XmlDoc), m_sreq->m_url ); return true; } // register its mem usage with Mem.cpp class mnew ( xd , sizeof(XmlDoc) , "XmlDoc" ); // add to the array m_docs [ i ] = xd; CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); char *coll = "collnumwasinvalid"; if ( cr ) coll = cr->m_coll; // . pass in a pbuf if this is the "qatest123" collection // . we will dump the SafeBuf output into a file in the // test subdir for comparison with previous versions of gb // in order to see what changed SafeBuf *pbuf = NULL; if ( !strcmp( coll,"qatest123") && g_conf.m_testParserEnabled ) pbuf = &xd->m_sbuf; // // sanity checks // //long long uh48; //long long pdocid; //if ( g_conf.m_logDebugSpider ) { // // scan for it since we may have dup requests // uh48 = m_sreq->getUrlHash48(); // pdocid = m_sreq->getParentDocId(); // // get any request from our urlhash table // SpiderRequest *sreq2 = m_sc->getSpiderRequest2 (&uh48,pdocid); // // must be valid parent // if ( ! sreq2 && pdocid == 0LL ) { char *xx=NULL;*xx=0; } // // for now core on this // if ( ! sreq2 ) { char *xx=NULL;*xx=0; } // // log it // logf(LOG_DEBUG,"spider: spidering uh48=%llu pdocid=%llu", // uh48,pdocid); //} if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: spidering firstip9=%s(%lu) " "uh48=%llu prntdocid=%llu k.n1=%llu k.n0=%llu", iptoa(m_sreq->m_firstIp), m_sreq->m_firstIp, m_sreq->getUrlHash48(), m_sreq->getParentDocId() , m_sreq->m_key.n1, m_sreq->m_key.n0); // this returns false and sets g_errno on error if ( ! xd->set4 ( m_sreq , m_doledbKey , coll , pbuf , MAX_NICENESS ) ) { // i guess m_coll is no longer valid? mdelete ( m_docs[i] , sizeof(XmlDoc) , "Doc" ); delete (m_docs[i]); m_docs[i] = NULL; // error, g_errno should be set!
return true; } // . launch count increment // . this is only used locally on this host to set its // m_hasUrlsReadyToSpider to false. //cr->m_localCrawlInfo.m_numUrlsLaunched++; // call this after doc gets indexed xd->setCallback ( xd , indexedDocWrapper ); /* // set it from provided parms if we are injecting via Msg7 if ( m_sreq->m_isInjecting ) { // now fill these in if provided too! if ( m_content ) { if ( m_sreq->m_firstIp ) { xd->m_ip = m_sreq->m_firstIp; xd->m_ipValid = true; } xd->m_isContentTruncated = false; xd->m_isContentTruncatedValid = true; xd->m_httpReplyValid = true; xd->m_httpReply = m_content; xd->m_httpReplySize = m_contentLen + 1; if ( ! m_contentHasMime ) xd->m_useFakeMime = true; } // a special callback for injected docs //xd->m_injectionCallback = m_callback; //xd->m_injectionState = m_state; } */ // increase m_maxUsed if we have to if ( i > m_maxUsed ) m_maxUsed = i; // count it m_numSpidersOut++; // count this m_sc->m_spidersOut++; // count it as a hit //g_stats.m_spiderUrlsHit++; // sanity check if (m_sreq->m_priority <= -1 ) { char *xx=NULL;*xx=0; } //if(m_sreq->m_priority >= MAX_SPIDER_PRIORITIES){char *xx=NULL;*xx=0;} // update this m_sc->m_outstandingSpiders[(unsigned char)m_sreq->m_priority]++; if ( g_conf.m_logDebugSpider ) log(LOG_DEBUG,"spider: sc_out=%li waiting=%li url=%s", m_sc->m_spidersOut, m_sc->m_waitingTree.m_numUsedNodes, m_sreq->m_url); // debug log //log("XXX: incremented count to %li for %s", // m_sc->m_spidersOut,m_sreq->m_url); //if ( m_sc->m_spidersOut != m_numSpidersOut ) { char *xx=NULL;*xx=0; } // . return if this blocked // . no, launch another spider! bool status = xd->indexDoc(); // . reset the next doledbkey to start over! // . when spiderDoledUrls() see this negative priority it will // reset the doledb scan to the top priority. m_sc->m_pri2 = -1; // if we were injecting and it blocked... return false if ( ! status ) return false; // deal with this error indexedDoc ( xd ); // "callback" will not be called cuz it should be NULL return true; } // . the one that was just indexed // . Msg7.cpp uses this to see what docid the injected doc got so it // can forward it to external program //static long long s_lastDocId = -1; //long long SpiderLoop::getLastDocId ( ) { return s_lastDocId; } void indexedDocWrapper ( void *state ) { // . process the results // . return if this blocks if ( ! g_spiderLoop.indexedDoc ( (XmlDoc *)state ) ) return; //a hack to fix injecting urls, because they can //run at niceness 0 but most of the spider pipeline //cannot. we should really just make injection run at //MAX_NICENESS. OK, done! mdw //if ( g_loop.m_inQuickPoll ) return; // . continue gettings Spider recs to spider // . if it's already waiting for a list it'll just return // . mdw: keep your eye on this, it was commented out // . this won't execute if we're already getting a list now //g_spiderLoop.spiderUrl ( ); // spider some urls that were doled to us g_spiderLoop.spiderDoledUrls( ); } // . this will delete m_docs[i] // . returns false if blocked, true otherwise // . 
sets g_errno on error bool SpiderLoop::indexedDoc ( XmlDoc *xd ) { // save the error in case a call changes it below //long saved = g_errno; // get our doc #, i //long i = doc - m_docs[0]; long i = 0; for ( ; i < MAX_SPIDERS ; i++ ) if ( m_docs[i] == xd) break; // sanity check if ( i >= MAX_SPIDERS ) { char *xx=NULL;*xx=0; } // set to -1 to indicate inject //if ( i < 0 || i >= MAX_SPIDERS ) i = -1; //char injecting = false; //if ( xd->m_sreq.m_isInjecting ) injecting = true; // save it for Msg7.cpp to pass docid of injected doc back //s_lastDocId = xd->m_docId; // . decrease m_maxUsed if we need to // . we can decrease all the way to -1, which means no spiders going on if ( m_maxUsed == i ) { m_maxUsed--; while ( m_maxUsed >= 0 && ! m_docs[m_maxUsed] ) m_maxUsed--; } // count it m_numSpidersOut--; // get coll collnum_t collnum = xd->m_collnum;//tiondb.getCollnum ( xd->m_coll ); // if coll was deleted while spidering, sc will be NULL SpiderColl *sc = g_spiderCache.getSpiderColl(collnum); // decrement this if ( sc ) sc->m_spidersOut--; // get the original request from xmldoc SpiderRequest *sreq = &xd->m_sreq; // update this. if ( sc ) sc->m_outstandingSpiders[(unsigned char)sreq->m_priority]--; // debug log //log("XXX: decremented count to %li for %s", // sc->m_spidersOut,sreq->m_url); //if ( sc->m_spidersOut != m_numSpidersOut ) { char *xx=NULL;*xx=0; } // breathe QUICKPOLL ( xd->m_niceness ); // are we a re-spider? bool respider = false; if ( xd->m_oldDocValid && xd->m_oldDoc ) respider = true; // . dump it out to a file in the "qatest123" subdir // . but only the first time we spider it... /* if ( ! strcmp(xd->m_coll,"qatest123") && ! respider && // no longer need this when qa testing spider, not parser g_conf.m_testParserEnabled ) { // save the buffers //saveTestBuf(); // get it //SafeBuf *pbuf = xd->m_pbuf; SafeBuf sb; // get it xd->printDoc ( &sb ); // get the first url Url *u = xd->getFirstUrl(); // . get its hash // . should be same hash we use to store doc.%llu.html in // XmlDoc.cpp/Msg13.cpp stuff (getTestDoc()) long long h = hash64 ( u->getUrl() , u->getUrlLen() ); char *testDir = g_test.getTestDir(); // make filename to dump out to char fn[1024]; sprintf(fn,"%s/%s/parse.%llu.%lu.html", g_hostdb.m_dir,testDir,h,g_test.m_runId); // . dump it out to a file // . WATCH OUT. g_errno is set on internal errors, like OOM // or whatever, so don't save in those cases...??????? sb.dumpToFile ( fn ); // just dump the
tags into this file sprintf(fn,"%s/%s/parse-shortdisplay.%llu.%lu.html", g_hostdb.m_dir,testDir,h,g_test.m_runId); // output to a special file SafeBuf tmp; // insert this tmp.safeStrcpy("\n"); // header stuff tmp.safePrintf("\n"); // put the onclick script in there tmp.safeStrcpy ( xd->getCheckboxScript() ); // concatenate just these sections in "sb" to "tmp" tmp.cat2 ( sb , "
" , "
" ); // header stuff tmp.safePrintf("\n\n"); // then dump tmp.dumpToFile ( fn ); // if it had critical errors from XmlDoc::validateOutput() // then create that file! //if ( xd->m_validateMisses > 0 || xd->m_validateFlagged ) { // make the critical file filename char cf[1024]; sprintf (cf,"%s/%s/critical.%llu.%lu.txt", g_hostdb.m_dir,testDir,h,g_test.m_runId); // save to that ttt.dumpToFile ( cf ); //char cmd[256]; //sprintf(cmd,"touch %s/test/critical.%llu.%lu.txt", // g_hostdb.m_dir,h,g_test.m_runId); //system(cmd); // note it //log("crazyin: %s",u->m_url ); // note it //g_test.m_urlsAdded--; g_test.m_urlsIndexed++; // now in PingServer.cpp for hostid 0 it checks // the urlsindexed from each host if g_conf.m_testParserEnabled // is true to see if we should call g_test.stopIt() // if that is zero we are done //if ( g_test.m_urlsAdded == 0 && ! g_test.m_isAdding && // // only stop if not spidering links // //! g_test.m_spiderLinks ) // g_conf.m_testParserEnabled ) // // wrap things up // g_test.stopIt(); } */ // note it // this should not happen any more since indexDoc() will take // care of g_errno now by clearing it and adding an error spider // reply to release the lock!! if ( g_errno ) { log("spider: ----CRITICAL CRITICAL CRITICAL----"); log("spider: ----CRITICAL CRITICAL CRITICAL----"); log("spider: ------ *** LOCAL ERROR *** ------"); log("spider: ------ *** LOCAL ERROR *** ------"); log("spider: ------ *** LOCAL ERROR *** ------"); log("spider: spidering %s has error: %s. uh48=%lli. " "Respidering " "in %li seconds. MAX_LOCK_AGE when lock expires.", xd->m_firstUrl.m_url, mstrerror(g_errno), xd->getFirstUrlHash48(), (long)MAX_LOCK_AGE); log("spider: ------ *** LOCAL ERROR *** ------"); log("spider: ------ *** LOCAL ERROR *** ------"); log("spider: ------ *** LOCAL ERROR *** ------"); log("spider: ----CRITICAL CRITICAL CRITICAL----"); log("spider: ----CRITICAL CRITICAL CRITICAL----"); // don't release the lock on it right now. just let the // lock expire on it after MAX_LOCK_AGE seconds. then it will // be retried. we need to debug gb so these things never // hapeen... } // breathe QUICKPOLL ( xd->m_niceness ); // . call the final callback used for injecting urls // . this may send a reply back so the caller knows the url // was fully injected into the index // . Msg7.cpp uses a callback that returns a void, so use m_callback1! //if ( xd->m_injectionCallback && injecting ) { // g_errno = saved; // // use the index code as the error for PageInject.cpp // if ( ! g_errno && xd->m_indexCode ) g_errno = xd->m_indexCode; // xd->m_injectionCallback ( xd->m_injectionState ); //} // we don't need this g_errno passed this point g_errno = 0; // breathe QUICKPOLL ( xd->m_niceness ); // did this doc get a chance to add its meta list to msg4 bufs? //bool addedMetaList = m_docs[i]->m_listAdded; // set this in case we need to call removeAllLocks //m_uh48 = 0LL; //if ( xd->m_sreqValid ) m_uh48 = xd->m_sreq.getUrlHash48(); // we are responsible for deleting doc now mdelete ( m_docs[i] , sizeof(XmlDoc) , "Doc" ); delete (m_docs[i]); m_docs[i] = NULL; // we remove the spider lock from g_spiderLoop.m_lockTable in Rdb.cpp // when it receives the negative doledb key. but if the this does not // happen, we have a problem then! //if ( addedMetaList ) return true; // sanity //if ( ! m_uh48 ) { char *xx=NULL; *xx=0; } // the lock we had in g_spiderLoop.m_lockTable for the doleKey // is now remove in Rdb.cpp when it receives a negative dole key to // add to doledb... assuming we added that meta list!! 
// m_uh48 should be set from above //if ( ! removeAllLocks () ) return false; // we did not block, so return true return true; } void gotLockReplyWrapper ( void *state , UdpSlot *slot ) { // cast it Msg12 *msg12 = (Msg12 *)state; // . call handler // . returns false if waiting for more replies to come in if ( ! msg12->gotLockReply ( slot ) ) return; // if had callback, maybe from PageReindex.cpp if ( msg12->m_callback ) msg12->m_callback ( msg12->m_state ); // ok, try to get another url to spider else g_spiderLoop.spiderDoledUrls(); } Msg12::Msg12 () { m_numRequests = 0; m_numReplies = 0; } // . returns false if blocked, true otherwise. // . returns true and sets g_errno on error // . before we can spider for a SpiderRequest we must be granted the lock // . each group shares the same doledb and each host in the group competes // for spidering all those urls. // . that way if a host goes down is load is taken over bool Msg12::getLocks ( long long uh48, // probDocId , char *url , DOLEDBKEY *doledbKey, collnum_t collnum, long sameIpWaitTime, long maxSpidersOutPerIp, long firstIp, void *state , void (* callback)(void *state) ) { // ensure not in use. not msg12 replies outstanding. if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; } // do not use locks for injections //if ( m_sreq->m_isInjecting ) return true; // get # of hosts in each mirror group long hpg = g_hostdb.getNumHostsPerShard(); // reset m_numRequests = 0; m_numReplies = 0; m_grants = 0; m_removing = false; m_confirming = false; // make sure is really docid //if ( probDocId & ~DOCID_MASK ) { char *xx=NULL;*xx=0; } // . mask out the lower bits that may change if there is a collision // . in this way a url has the same m_probDocId as the same url // in the index. i.e. if we add a new spider request for url X and // url X is already indexed, then they will share the same lock // even though the indexed url X may have a different actual docid // than its probable docid. // . we now use probable docids instead of uh48 because query reindex // in PageReindex.cpp adds docid based spider requests and we // only know the docid, not the uh48 because it is creating // SpiderRequests from docid-only search results. having to look // up the msg20 summary for like 1M search results is too painful! //m_lockKey = g_titledb.getFirstProbableDocId(probDocId); // . use this for locking now, and let the docid-only requests just use // the docid m_lockKeyUh48 = makeLockTableKey ( uh48 , firstIp ); m_url = url; m_callback = callback; m_state = state; m_hasLock = false; m_origUh48 = uh48; // support ability to spider multiple urls from same ip m_doledbKey = *doledbKey; m_collnum = collnum; m_sameIpWaitTime = sameIpWaitTime; m_maxSpidersOutPerIp = maxSpidersOutPerIp; m_firstIp = firstIp; // sanity check, just 6 bytes! (48 bits) if ( uh48 & 0xffff000000000000LL ) { char *xx=NULL;*xx=0; } if ( m_lockKeyUh48 & 0xffff000000000000LL ) { char *xx=NULL;*xx=0; } // cache time long ct = 120; // if docid based assume it was a query reindex and keep it short! // otherwise we end up waiting 120 seconds for a query reindex to // go through on a docid we just spidered. TODO: use m_urlIsDocId if ( url && is_digit(url[0]) ) ct = 2; // . this seems to be messing us up and preventing us from adding new // requests into doledb when only spidering a few IPs. // . make it random in the case of twin contention ct = rand() % 10; // . check our cache to avoid repetitive asking // . use -1 for maxAge to indicate no max age // . returns -1 if not in cache // . 
use maxage of two minutes, 120 seconds long lockTime ; lockTime = g_spiderLoop.m_lockCache.getLong(0,m_lockKeyUh48,ct,true); // if it was in the cache and less than 2 minutes old then return // true now with m_hasLock set to false. if ( lockTime >= 0 ) { if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: cached missed lock for %s " "lockkey=%llu", m_url,m_lockKeyUh48); return true; } if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: sending lock request for %s " "lockkey=%llu", m_url,m_lockKeyUh48); // now the locking group is based on the probable docid //m_lockGroupId = g_hostdb.getGroupIdFromDocId(m_lockKey); // ptr to list of hosts in the group //Host *hosts = g_hostdb.getGroup ( m_lockGroupId ); // the same group (shard) that has the spiderRequest/Reply is // the one responsible for locking. Host *hosts = g_hostdb.getMyShard(); // short cut UdpServer *us = &g_udpServer; static long s_lockSequence = 0; // remember the lock sequence # in case we have to call remove locks m_lockSequence = s_lockSequence++; LockRequest *lr = &m_lockRequest; lr->m_lockKeyUh48 = m_lockKeyUh48; lr->m_firstIp = m_firstIp; lr->m_removeLock = 0; lr->m_lockSequence = m_lockSequence; lr->m_collnum = collnum; // reset counts m_numRequests = 0; m_numReplies = 0; // point to start of the 12 byte request buffer char *request = (char *)lr;//m_lockKey; long requestSize = sizeof(LockRequest);//12; // loop over hosts in that shard for ( long i = 0 ; i < hpg ; i++ ) { // get a host Host *h = &hosts[i]; // skip if dead! no need to get a reply from dead guys if ( g_hostdb.isDead (h) ) continue; // note it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: sent lock " "request #%li for lockkey=%llu %s to " "hid=%li",m_numRequests,m_lockKeyUh48, m_url,h->m_hostId); // send request to him if ( ! us->sendRequest ( request , requestSize , 0x12 , // msgType h->m_ip , h->m_port , h->m_hostId , NULL , // retSlotPtrPtr this , // state data gotLockReplyWrapper , 60*60*24*365 ) ) // udpserver returns false and sets g_errno on error return true; // count them m_numRequests++; } // block? if ( m_numRequests > 0 ) return false; // i guess nothing... hmmm... all dead? //char *xx=NULL; *xx=0; // m_hasLock should be false... all lock hosts seem dead... wait if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: all lock hosts seem dead for %s " "lockkey=%llu", m_url,m_lockKeyUh48); return true; } // after adding the negative doledb recs to remove the url we are spidering // from doledb, and adding the fake titledb rec to add a new entry into // waiting tree so that our ip can have more than one outstanding spider, // call the callback. usually msg4::addMetaList() will not block i'd guess. 
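// . the confirm round trip described above, in sketch form (fields as set
//   in Msg12::confirmLockAcquisition() below):
//     ConfirmRequest cq;
//     cq.m_collnum            = m_collnum;
//     cq.m_doledbKey          = m_doledbKey;  // the positive key we were doled
//     cq.m_firstIp            = m_firstIp;
//     cq.m_lockKeyUh48        = m_lockKeyUh48;
//     cq.m_maxSpidersOutPerIp = m_maxSpidersOutPerIp;
//   each host in the shard (handleRequest12()) flips m_doledbKey negative,
//   adds it to doledb to erase the doled rec, removes the ip from its
//   doleiptable and re-adds firstIp to its waiting tree.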
void rejuvenateIPWrapper ( void *state ) { Msg12 *THIS = (Msg12 *)state; THIS->m_callback ( THIS->m_state ); } // returns true if all done, false if waiting for more replies bool Msg12::gotLockReply ( UdpSlot *slot ) { // got reply m_numReplies++; // don't let udpserver free the request, it's our m_request[] slot->m_sendBufAlloc = NULL; // check for a hammer reply char *reply = slot->m_readBuf; long replySize = slot->m_readBufSize; // if error, treat as a not grant if ( g_errno ) { bool logIt = true; // note it if ( g_conf.m_logDebugSpider ) log("spider: got msg12 reply error = %s", mstrerror(g_errno)); // if we got an ETRYAGAIN when trying to confirm our lock // that means doledb was saving/dumping to disk and we // could not remove the record from doledb and add an // entry to the waiting tree, so we need to keep trying if ( g_errno == ETRYAGAIN && m_confirming ) { // c ount it again m_numRequests++; // use what we were using char *request = (char *)&m_confirmRequest; long requestSize = sizeof(ConfirmRequest); Host *h = g_hostdb.getHost(slot->m_hostId); // send request to him UdpServer *us = &g_udpServer; if ( ! us->sendRequest ( request , requestSize , 0x12 , // msgType h->m_ip , h->m_port , h->m_hostId , NULL , // retSlotPtrPt this , // state data gotLockReplyWrapper , 60*60*24*365 ) ) return false; // error? // don't spam the log! static long s_last = 0; long now = getTimeLocal(); if ( now - s_last >= 1 ) { s_last = now; log("spider: error re-sending confirm " "request: %s", mstrerror(g_errno)); } } // only log every 10 seconds for ETRYAGAIN if ( g_errno == ETRYAGAIN ) { static time_t s_lastTime = 0; time_t now = getTimeLocal(); logIt = false; if ( now - s_lastTime >= 3 ) { logIt = true; s_lastTime = now; } } if ( logIt ) log ( "sploop: host had error getting lock url=%s" ": %s" , m_url,mstrerror(g_errno) ); } // grant or not if ( replySize == 1 && ! g_errno && *reply == 1 ) m_grants++; // wait for all to get back if ( m_numReplies < m_numRequests ) return false; // all done if we were removing if ( m_removing ) { // note it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: done removing all locks " "(replies=%li) for %s", m_numReplies,m_url);//m_sreq->m_url); // we are done m_gettingLocks = false; return true; } // all done if we were confirming if ( m_confirming ) { // note it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: done confirming all locks " "for %s uh48=%lli",m_url,m_origUh48);//m_sreq->m_url); // we are done m_gettingLocks = false; // . keep processing // . if the collection was nuked from under us the spiderUrl2 // will return true and set g_errno if ( ! m_callback ) return g_spiderLoop.spiderUrl2(); // if we had a callback let our parent call it return true; } // if got ALL locks, spider it if ( m_grants == m_numReplies ) { // note it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: got lock for docid=lockkey=%llu", m_lockKeyUh48); // flag this m_hasLock = true; // we are done //m_gettingLocks = false; /////// // // now tell our group (shard) to remove from doledb // and re-add to waiting tree. the evalIpLoop() function // should skip this probable docid because it is in the // LOCK TABLE! // // This logic should allow us to spider multiple urls // from the same IP at the same time. // /////// // returns false if would block if ( ! confirmLockAcquisition ( ) ) return false; // . we did it without blocking, maybe cuz we are a single node // . ok, they are all back, resume loop // . 
if the collection was nuked from under us the spiderUrl2 // will return true and set g_errno if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( ); // all done return true; } // note it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: missed lock for %s lockkey=%llu " "(grants=%li)", m_url,m_lockKeyUh48,m_grants); // . if it was locked by another then add to our lock cache so we do // not try to lock it again // . if grants is not 0 then one host granted us the lock, but not // all hosts, so we should probably keep trying on it until it is // locked up by one host if ( m_grants == 0 ) { long now = getTimeGlobal(); g_spiderLoop.m_lockCache.addLong(0,m_lockKeyUh48,now,NULL); } // reset again m_numRequests = 0; m_numReplies = 0; // no need to remove them if none were granted because another // host in our group might have it 100% locked. if ( m_grants == 0 ) { // no longer in locks operation mode m_gettingLocks = false; // ok, they are all back, resume loop //if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( ); // all done return true; } // note that if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: sending request to all in shard to " "remove lock uh48=%llu. grants=%li", m_lockKeyUh48,(long)m_grants); // remove all locks we tried to get, BUT only if from our hostid! // no no! that doesn't quite work right... we might be the ones // locking it! i.e. another one of our spiders has it locked... if ( ! removeAllLocks ( ) ) return false; // true; // if did not block, how'd that happen? log("sploop: did not block in removeAllLocks: %s",mstrerror(g_errno)); return true; } bool Msg12::removeAllLocks ( ) { // ensure not in use. not msg12 replies outstanding. if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; } // skip if injecting //if ( m_sreq->m_isInjecting ) return true; if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: removing all locks for %s %llu", m_url,m_lockKeyUh48); // we are now removing m_removing = true; LockRequest *lr = &m_lockRequest; lr->m_lockKeyUh48 = m_lockKeyUh48; lr->m_lockSequence = m_lockSequence; lr->m_firstIp = m_firstIp; lr->m_removeLock = 1; // reset counts m_numRequests = 0; m_numReplies = 0; // make that the request // . point to start of the 12 byte request buffer // . m_lockSequence should still be valid char *request = (char *)lr;//m_lockKey; long requestSize = sizeof(LockRequest);//12; // now the locking group is based on the probable docid //unsigned long groupId = g_hostdb.getGroupIdFromDocId(m_lockKeyUh48); // ptr to list of hosts in the group //Host *hosts = g_hostdb.getGroup ( groupId ); Host *hosts = g_hostdb.getMyShard(); // this must select the same group that is going to spider it! // i.e. our group! because we check our local lock table to see // if a doled url is locked before spidering it ourselves. //Host *hosts = g_hostdb.getMyGroup(); // short cut UdpServer *us = &g_udpServer; // set the hi bit though for this one //m_lockKey |= 0x8000000000000000LL; // get # of hosts in each mirror group long hpg = g_hostdb.getNumHostsPerShard(); // loop over hosts in that shard for ( long i = 0 ; i < hpg ; i++ ) { // get a host Host *h = &hosts[i]; // skip if dead! no need to get a reply from dead guys if ( g_hostdb.isDead ( h ) ) continue; // send request to him if ( ! 
us->sendRequest ( request , requestSize , 0x12 , // msgType h->m_ip , h->m_port , h->m_hostId , NULL , // retSlotPtrPtr this , // state data gotLockReplyWrapper , 60*60*24*365 ) ) // udpserver returns false and sets g_errno on error return true; // count them m_numRequests++; } // block? if ( m_numRequests > 0 ) return false; // did not block return true; } bool Msg12::confirmLockAcquisition ( ) { // ensure not in use. not msg12 replies outstanding. if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; } // we are now removing m_confirming = true; // make that the request // . point to start of the 12 byte request buffer // . m_lockSequence should still be valid ConfirmRequest *cq = &m_confirmRequest; char *request = (char *)cq; long requestSize = sizeof(ConfirmRequest); // sanity if ( requestSize == sizeof(LockRequest)){ char *xx=NULL;*xx=0; } // set it cq->m_collnum = m_collnum; cq->m_doledbKey = m_doledbKey; cq->m_firstIp = m_firstIp; cq->m_lockKeyUh48 = m_lockKeyUh48; cq->m_maxSpidersOutPerIp = m_maxSpidersOutPerIp; // . use the locking group from when we sent the lock request // . get ptr to list of hosts in the group //Host *hosts = g_hostdb.getGroup ( m_lockGroupId ); // the same group (shard) that has the spiderRequest/Reply is // the one responsible for locking. Host *hosts = g_hostdb.getMyShard(); // this must select the same shard that is going to spider it! // i.e. our shard! because we check our local lock table to see // if a doled url is locked before spidering it ourselves. //Host *hosts = g_hostdb.getMyShard(); // short cut UdpServer *us = &g_udpServer; // get # of hosts in each mirror group long hpg = g_hostdb.getNumHostsPerShard(); // reset counts m_numRequests = 0; m_numReplies = 0; // note it if ( g_conf.m_logDebugSpider ) log("spider: confirming lock for uh48=%llu firstip=%s", m_lockKeyUh48,iptoa(m_firstIp)); // loop over hosts in that shard for ( long i = 0 ; i < hpg ; i++ ) { // get a host Host *h = &hosts[i]; // skip if dead! no need to get a reply from dead guys if ( g_hostdb.isDead ( h ) ) continue; // send request to him if ( ! us->sendRequest ( request , // a size of 2 should mean confirm requestSize , 0x12 , // msgType h->m_ip , h->m_port , h->m_hostId , NULL , // retSlotPtrPtr this , // state data gotLockReplyWrapper , 60*60*24*365 ) ) // udpserver returns false and sets g_errno on error return true; // count them m_numRequests++; } // block? if ( m_numRequests > 0 ) return false; // did not block return true; } // use -1 for any collnum long SpiderLoop::getNumSpidersOutPerIp ( long firstIp , collnum_t collnum ) { long count = 0; // count locks HashTableX *ht = &g_spiderLoop.m_lockTable; // scan the slots long ns = ht->m_numSlots; for ( long i = 0 ; i < ns ; i++ ) { // breathe //QUICKPOLL(niceness); // skip if empty if ( ! ht->m_flags[i] ) continue; // cast lock UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i); // skip if not outstanding, just a 5-second expiration wait // when the spiderReply returns, so that in case a lock // request for the same url was in progress, it will be denied. if ( ! lock->m_spiderOutstanding ) continue; // must be confirmed too if ( ! lock->m_confirmed ) continue; // correct collnum? if ( lock->m_collnum != collnum && collnum != -1 ) continue; // skip if not yet expired if ( lock->m_firstIp == firstIp ) count++; } /* for ( long i = 0 ; i <= m_maxUsed ; i++ ) { // get it XmlDoc *xd = m_docs[i]; // skip if empty if ( ! 
xd ) continue; // check it if ( xd->m_firstIp == firstIp ) count++; } */ return count; } void handleRequest12 ( UdpSlot *udpSlot , long niceness ) { // get request char *request = udpSlot->m_readBuf; long reqSize = udpSlot->m_readBufSize; // short cut UdpServer *us = &g_udpServer; // breathe QUICKPOLL ( niceness ); // shortcut char *reply = udpSlot->m_tmpBuf; // // . is it confirming that he got all the locks? // . if so, remove the doledb record and dock the doleiptable count // before adding a waiting tree entry to re-pop the doledb record // if ( reqSize == sizeof(ConfirmRequest) ) { char *msg = NULL; ConfirmRequest *cq = (ConfirmRequest *)request; // confirm the lock HashTableX *ht = &g_spiderLoop.m_lockTable; long slot = ht->getSlot ( &cq->m_lockKeyUh48 ); if ( slot < 0 ) { log("spider: got a confirm request for a key not " "in the table! coll must have been deleted " " or reset " "while lock request was outstanding."); g_errno = EBADENGINEER; us->sendErrorReply ( udpSlot , g_errno ); return; //char *xx=NULL;*xx=0; } } UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot ); lock->m_confirmed = true; // note that if ( g_conf.m_logDebugSpider ) // Wait ) log("spider: got confirm lock request for ip=%s", iptoa(lock->m_firstIp)); // get it SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum); // make it negative cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL; // and add the negative rec to doledb (deletion operation) Rdb *rdb = &g_doledb.m_rdb; if ( ! rdb->addRecord ( cq->m_collnum, (char *)&cq->m_doledbKey, NULL , // data 0 , //dataSize 1 )){ // niceness // tree is dumping or something, probably ETRYAGAIN if ( g_errno != ETRYAGAIN ) {msg = "error adding neg rec to doledb"; log("spider: %s %s",msg,mstrerror(g_errno)); } //char *xx=NULL;*xx=0; us->sendErrorReply ( udpSlot , g_errno ); return; } // now remove from doleiptable since we removed from doledb if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp ); // how many spiders outstanding for this coll and IP? //long out=g_spiderLoop.getNumSpidersOutPerIp ( cq->m_firstIp); // DO NOT add back to waiting tree if max spiders // out per ip was 1 OR there was a crawldelay. but better // yet, take care of that in the winReq code above. // . now add to waiting tree so we add another spiderdb // record for this firstip to doledb // . true = callForScan // . do not add to waiting tree if we have enough outstanding // spiders for this ip. we will add to waiting tree when // we receive a SpiderReply in addSpiderReply() if ( sc && //out < cq->m_maxSpidersOutPerIp && // this will just return true if we are not the // responsible host for this firstip // DO NOT populate from this!!! say "false" here... ! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) && // must be an error... g_errno ) { msg = "FAILED TO ADD TO WAITING TREE"; log("spider: %s %s",msg,mstrerror(g_errno)); us->sendErrorReply ( udpSlot , g_errno ); return; } // success!! reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } // sanity check if ( reqSize != sizeof(LockRequest) ) { log("spider: bad msg12 request size of %li",reqSize); us->sendErrorReply ( udpSlot , EBADREQUEST ); return; } // deny it if we are not synced yet! otherwise we core in // getTimeGlobal() below if ( ! isClockInSync() ) { // log it so we can debug it //log("spider: clock not in sync with host #0. 
so " // "returning etryagain for lock reply"); // let admin know why we are not spidering us->sendErrorReply ( udpSlot , ETRYAGAIN ); return; } LockRequest *lr = (LockRequest *)request; //unsigned long long lockKey = *(long long *)request; //long lockSequence = *(long *)(request+8); // is this a remove operation? assume not //bool remove = false; // get top bit //if ( lockKey & 0x8000000000000000LL ) remove = true; // mask it out //lockKey &= 0x7fffffffffffffffLL; // sanity check, just 6 bytes! (48 bits) if ( lr->m_lockKeyUh48 &0xffff000000000000LL ) { char *xx=NULL;*xx=0; } // note it if ( g_conf.m_logDebugSpider ) log("spider: got msg12 request uh48=%lli remove=%li", lr->m_lockKeyUh48, (long)lr->m_removeLock); // get time long nowGlobal = getTimeGlobal(); // shortcut HashTableX *ht = &g_spiderLoop.m_lockTable; long hostId = g_hostdb.getHostId ( udpSlot->m_ip , udpSlot->m_port ); // this must be legit - sanity check if ( hostId < 0 ) { char *xx=NULL;*xx=0; } // remove expired locks from locktable removeExpiredLocks ( hostId ); long long lockKey = lr->m_lockKeyUh48; // check tree long slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 ); // put it here UrlLock *lock = NULL; // if there say no no if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot ); // if doing a remove operation and that was our hostid then unlock it if ( lr->m_removeLock && lock && lock->m_hostId == hostId && lock->m_lockSequence == lr->m_lockSequence ) { // note it for now if ( g_conf.m_logDebugSpider ) log("spider: removing lock for lockkey=%llu hid=%li", lr->m_lockKeyUh48,hostId); // unlock it ht->removeSlot ( slot ); // it is gone lock = NULL; } // ok, at this point all remove ops return if ( lr->m_removeLock ) { reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } ///////// // // add new lock // ///////// // if lock > 1 hour old then remove it automatically!! if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) { // note it for now log("spider: removing lock after %li seconds " "for lockKey=%llu hid=%li", (nowGlobal - lock->m_timestamp), lr->m_lockKeyUh48,hostId); // unlock it ht->removeSlot ( slot ); // it is gone lock = NULL; } // if lock still there, do not grant another lock if ( lock ) { // note it for now if ( g_conf.m_logDebugSpider ) log("spider: refusing lock for lockkey=%llu hid=%li", lr->m_lockKeyUh48,hostId); reply[0] = 0; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } // make the new lock UrlLock tmp; tmp.m_hostId = hostId; tmp.m_lockSequence = lr->m_lockSequence; tmp.m_timestamp = nowGlobal; tmp.m_expires = 0; tmp.m_firstIp = lr->m_firstIp; tmp.m_collnum = lr->m_collnum; // when the spider returns we remove its lock on reception of the // spiderReply, however, we actually just set the m_expires time // to 5 seconds into the future in case there is a current request // to get a lock for that url in progress. but, we do need to // indicate that the spider has indeed completed by setting // m_spiderOutstanding to true. this way, addToWaitingTree() will // not count it towards a "max spiders per IP" quota when deciding // on if it should add a new entry for this IP. tmp.m_spiderOutstanding = true; // this is set when all hosts in the group (shard) have granted the // lock and the host sends out a confirmLockAcquisition() request. // until then we do not know if the lock will be granted by all hosts // in the group (shard) tmp.m_confirmed = false; // put it into the table if ( ! 
ht->addKey ( &lockKey , &tmp ) ) { // return error if that failed! us->sendErrorReply ( udpSlot , g_errno ); return; } // note it for now if ( g_conf.m_logDebugSpider ) log("spider: granting lock for lockKey=%llu hid=%li", lr->m_lockKeyUh48,hostId); // grant the lock reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } // hostId is the remote hostid sending us the lock request void removeExpiredLocks ( long hostId ) { // when we last cleaned them out static time_t s_lastTime = 0; long nowGlobal = getTimeGlobalNoCore(); long niceness = MAX_NICENESS; // only do this once per second at the most if ( nowGlobal <= s_lastTime ) return; // shortcut HashTableX *ht = &g_spiderLoop.m_lockTable; restart: // scan the slots long ns = ht->m_numSlots; // . clean out expired locks... // . if lock was there and m_expired is up, then nuke it! // . when Rdb.cpp receives the "fake" title rec it removes the // lock, only it just sets the m_expired to a few seconds in the // future to give the negative doledb key time to be absorbed. // that way we don't repeat the same url we just got done spidering. // . this happens when we launch our lock request on a url that we // or a twin is spidering or has just finished spidering, and // we get the lock, but we avoided the negative doledb key. for ( long i = 0 ; i < ns ; i++ ) { // breathe QUICKPOLL(niceness); // skip if empty if ( ! ht->m_flags[i] ) continue; // cast lock UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i); long long lockKey = *(long long *)ht->getKeyFromSlot(i); // if collnum got deleted or reset collnum_t collnum = lock->m_collnum; if ( collnum >= g_collectiondb.m_numRecs || ! g_collectiondb.m_recs[collnum] ) { log("spider: removing lock from missing collnum " "%li",(long)collnum); goto nuke; } // skip if not yet expired if ( lock->m_expires == 0 ) continue; if ( lock->m_expires >= nowGlobal ) continue; // note it for now if ( g_conf.m_logDebugSpider ) log("spider: removing lock after waiting. elapsed=%li." " lockKey=%llu hid=%li expires=%lu nowGlobal=%lu", (nowGlobal - lock->m_timestamp), lockKey,hostId,lock->m_expires,nowGlobal); nuke: // nuke the slot and possibly re-chain ht->removeSlot ( i ); // gotta restart from the top since table may have shrunk goto restart; } // store it s_lastTime = nowGlobal; } ///////////////////////// ///////////////////////// PAGESPIDER ///////////////////////// // don't change name to "State" cuz that might conflict with another class State11 { public: long m_numRecs; Msg5 m_msg5; RdbList m_list; TcpSocket *m_socket; HttpRequest m_r; collnum_t m_collnum; char *m_coll; long m_count; key_t m_startKey; key_t m_endKey; long m_minRecSizes; bool m_done; SafeBuf m_safeBuf; long m_priority; }; static bool loadLoop ( class State11 *st ) ; // . returns false if blocked, true otherwise // . sets g_errno on error // . make a web page displaying the urls we got in doledb // . doledb is sorted by priority complement then spider time // . do not show urls in doledb whose spider time has not yet been reached, // so only show the urls spiderable now // . call g_httpServer.sendDynamicPage() to send it bool sendPageSpiderdb ( TcpSocket *s , HttpRequest *r ) { // set up a msg5 and RdbLists to get the urls from spider queue State11 *st ; try { st = new (State11); } catch ( ... 
) { g_errno = ENOMEM; log("PageSpiderdb: new(%i): %s", sizeof(State11),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State11) , "PageSpiderdb" ); // get the priority/#ofRecs from the cgi vars st->m_numRecs = r->getLong ("n", 20 ); st->m_r.copy ( r ); // get collection name char *coll = st->m_r.getString ( "c" , NULL , NULL ); // get the collection record to see if they have permission //CollectionRec *cr = g_collectiondb.getRec ( coll ); // the socket read buffer will remain until the socket is destroyed // and "coll" points into that st->m_coll = coll; CollectionRec *cr = g_collectiondb.getRec(coll); if ( cr ) st->m_collnum = cr->m_collnum; else st->m_collnum = -1; // set socket for replying in case we block st->m_socket = s; st->m_count = 0; st->m_priority = MAX_SPIDER_PRIORITIES - 1; // get startKeys/endKeys/minRecSizes st->m_startKey = g_doledb.makeFirstKey2 (st->m_priority); st->m_endKey = g_doledb.makeLastKey2 (st->m_priority); st->m_minRecSizes = 20000; st->m_done = false; // returns false if blocked, true otherwise return loadLoop ( st ) ; } static void gotListWrapper3 ( void *state , RdbList *list , Msg5 *msg5 ) ; static bool sendPage ( State11 *st ); static bool printList ( State11 *st ); bool loadLoop ( State11 *st ) { loop: // let's get the local list for THIS machine (use msg5) if ( ! st->m_msg5.getList ( RDB_DOLEDB , st->m_collnum , &st->m_list , st->m_startKey , st->m_endKey , st->m_minRecSizes , true , // include tree false , // add to cache 0 , // max age 0 , // start file # -1 , // # files st , // callback state gotListWrapper3 , 0 , // niceness true )) // do err correction return false; // print it. returns false on error if ( ! printList ( st ) ) st->m_done = true; // check if done if ( st->m_done ) { // send the page back sendPage ( st ); // bail return true; } // otherwise, load more goto loop; } void gotListWrapper3 ( void *state , RdbList *list , Msg5 *msg5 ) { // cast it State11 *st = (State11 *)state; // print it. returns false on error if ( ! printList ( st ) ) st->m_done = true; // check if done if ( st->m_done ) { // send the page back sendPage ( st ); // bail return; } // otherwise, load more loadLoop( (State11 *)state ); } // . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool printList ( State11 *st ) { // useful time_t nowGlobal ; if ( isClockInSync() ) nowGlobal = getTimeGlobal(); else nowGlobal = getTimeLocal(); // print the spider recs we got SafeBuf *sbTable = &st->m_safeBuf; // shorcuts RdbList *list = &st->m_list; // row count long j = 0; // put it in there for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) { // stop if we got enough if ( st->m_count >= st->m_numRecs ) break; // get the doledb key key_t dk = list->getCurrentKey(); // update to that st->m_startKey = dk; // inc by one st->m_startKey += 1; // get spider time from that long spiderTime = g_doledb.getSpiderTime ( &dk ); // skip if in future if ( spiderTime > nowGlobal ) continue; // point to the spider request *RECORD* char *rec = list->getCurrentData(); // skip negatives if ( (dk.n0 & 0x01) == 0 ) continue; // count it st->m_count++; // what is this? if ( list->getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0;} // sanity check. requests ONLY in doledb if ( ! 
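/* doledb keys here follow the delete-bit convention used throughout this
   code: the low bit of k.n0 is 1 for a normal ("positive") record and 0
   for a delete ("negative") record. that is why handleRequest12() above
   clears the bit before re-adding the doledb key, and why this loop skips
   keys with the bit clear. a tiny sketch with made-up helper names:

     static bool isNegative ( key_t k ) {
             return ( k.n0 & 0x01 ) == 0;
     }
     static key_t makeNegative ( key_t k ) {
             k.n0 &= 0xfffffffffffffffeLL; // clear the low bit
             return k;
     }
*/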
g_spiderdb.isSpiderRequest ( (key128_t *)rec )) { char*xx=NULL;*xx=0;} // get the spider rec, encapsed in the data of the doledb rec SpiderRequest *sreq = (SpiderRequest *)rec; // print it into sbTable if ( ! sreq->printToTable ( sbTable,"ready",NULL,j)) return false; // count row j++; } // need to load more? if ( st->m_count >= st->m_numRecs || // if list was a partial, this priority is short then list->getListSize() < st->m_minRecSizes ) { // . try next priority // . if below 0 we are done if ( --st->m_priority < 0 ) st->m_done = true; // get startKeys/endKeys/minRecSizes st->m_startKey = g_doledb.makeFirstKey2 (st->m_priority); st->m_endKey = g_doledb.makeLastKey2 (st->m_priority); // if we printed something, print a blank line after it if ( st->m_count > 0 ) sbTable->safePrintf("..." "\n"); // reset for each priority st->m_count = 0; } return true; } bool sendPage ( State11 *st ) { // sanity check //if ( ! g_errno ) { char *xx=NULL;*xx=0; } //SafeBuf sb; sb.safePrintf("Error = %s",mstrerror(g_errno)); // shortcut SafeBuf *sbTable = &st->m_safeBuf; // generate a query string to pass to host bar char qs[64]; sprintf ( qs , "&n=%li", st->m_numRecs ); // store the page in here! SafeBuf sb; sb.reserve ( 64*1024 ); g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r , qs ); // get spider coll collnum_t collnum = g_collectiondb.getCollnum ( st->m_coll ); // and coll rec CollectionRec *cr = g_collectiondb.getRec ( collnum ); // print reason why spiders are not active for this collection long tmp2; SafeBuf mb; if ( cr ) getSpiderStatusMsg ( cr , &mb , &tmp2 ); if ( mb.length() && tmp2 != SP_INITIALIZING ) sb.safePrintf(//"
" "" "" //"" "" "
" "" "%s" "
\n" , mb.getBufStart() ); // begin the table sb.safePrintf ( "\n" "\n" , TABLE_STYLE , (long)g_spiderLoop.m_numSpidersOut //, g_spiderLoop.m_lockTable.m_numSlotsUsed ); // the table headers so SpiderRequest::printToTable() works if ( ! SpiderRequest::printTableHeader ( &sb , true ) ) return false; // shortcut XmlDoc **docs = g_spiderLoop.m_docs; // count # of spiders out long j = 0; // first print the spider recs we are spidering for ( long i = 0 ; i < (long)MAX_SPIDERS ; i++ ) { // get it XmlDoc *xd = docs[i]; // skip if empty if ( ! xd ) continue; // sanity check if ( ! xd->m_sreqValid ) { char *xx=NULL;*xx=0; } // grab it SpiderRequest *oldsr = &xd->m_sreq; // get status char *status = xd->m_statusMsg; // show that if ( ! oldsr->printToTable ( &sb , status,xd,j) ) return false; // inc count j++; } // end the table sb.safePrintf ( "
" //"
" "Currently Spidering on This Host" " (%li spiders)" //" (%li locks)" //"
" "
\n" ); sb.safePrintf ( "
\n" ); /* if ( g_spiderCache.m_numMsgSamples > 0 ) { sb.safePrintf ( "" "" "" "\n", LIGHT_BLUE , DARK_BLUE ); HashTableT* m = &g_spiderCache.m_spiderMsgs; for(long i = 0; i < m->getNumSlots();i++) { if(m->getKey(i) == 0) continue; sb.safePrintf ( "" "" "" "" "\n", 100*m->getValueFromSlot(i)/ g_spiderCache.m_numMsgSamples, m->getValueFromSlot(i), (char*)m->getKey(i)); } sb.safePrintf ("
" "Proportion of Spider Time Spent in " "Section." "
%.2f%%%.0f%s
\n"); } */ /* // try to put these in tool tips // describe the various parms sb.safePrintf ( "" "" "" "\n" "" //"" "" "" "" "" "" "" "" "" "" "" "" "" "
" "Status descriptions" "
getting link infoperforming " "getting site title bufgetting " "the title and all inlinker text of the root page." "
getting outlink ip vectorgetting " "ips of the outlinks. Gets from tagdb firstip " "tag if it exists." "
getting robots.txtdownloading the " "robots.txt file for this url." "
checking for duplooking up the url's " "docid in checksumdb to see if its content checksum " "is in use by another indexed document from the same " "site. Will index even if it is a dup if it has a " "higher quality." "
getting web pagedownloading the web " "page." "
getting cached web page" "looking up the " "old record for this url in titledb to see how the " "content changed." "
adding linksadding links from the page " "to spiderdb. Links are distributed to the host that " "stores them based on the hash of the link. Make sure " "<tfndbMaxPageCacheMem> is high enough to keep " "tfndb disk seeks down. A tfndb access is done for " "every link added." "


\n\n", LIGHT_BLUE , DARK_BLUE ); */ // then spider collection //SpiderColl *sc = g_spiderCache.m_spiderColls[collnum]; SpiderColl *sc = g_spiderCache.getSpiderColl(collnum); // // spiderdb rec stats, from scanning spiderdb // // if not there, forget about it if ( sc ) sc->printStats ( sb ); // // Spiders Table // long long totalPoints = g_stats.m_totalSpiderSuccessNew + g_stats.m_totalSpiderErrorsNew + g_stats.m_totalSpiderSuccessOld + g_stats.m_totalSpiderErrorsOld; long long totalNew = g_stats.m_totalSpiderSuccessNew + g_stats.m_totalSpiderErrorsNew; long long totalOld = g_stats.m_totalSpiderSuccessOld + g_stats.m_totalSpiderErrorsOld; double tsr = 100.00; double nsr = 100.00; double osr = 100.00; if ( totalPoints > 0 ) { tsr = 100.00* (double)(g_stats.m_totalSpiderSuccessNew + g_stats.m_totalSpiderSuccessOld) / (double)totalPoints; if ( totalNew > 0 ) nsr= 100.00*(double)(g_stats.m_totalSpiderSuccessNew) / (double)(totalNew); if ( totalOld > 0 ) osr= 100.00*(double)(g_stats.m_totalSpiderSuccessOld) / (double)(totalOld); } long points = g_stats.m_spiderSample; if ( points > 1000 ) points = 1000; long sampleNew = g_stats.m_spiderNew; long sampleOld = points - g_stats.m_spiderNew; double tssr = 100.00; double nssr = 100.00; double ossr = 100.00; if ( points > 0 ) { tssr = 100.00* (double)(points - g_stats.m_spiderErrors) / (double)points ; if ( sampleNew > 0 ) nssr = 100.00*(double)(sampleNew - g_stats.m_spiderErrorsNew) / (double)(sampleNew); if ( sampleOld > 0 ) ossr = 100.00*(double)(sampleOld - (g_stats.m_spiderErrors - g_stats.m_spiderErrorsNew)) / (double)(sampleOld); } sb.safePrintf( "\n" , LIGHT_BLUE ); sb.safePrintf ( "" "" "\n" "" "" "" "" "" "" "\n" "\n" //"\n" //"\n" //"\n" //"\n" "" "" "", TABLE_STYLE, DARK_BLUE, totalPoints, totalNew, totalOld, points, sampleNew, sampleOld, //g_stats.m_totalSpiderSuccessNew + //g_stats.m_totalSpiderSuccessOld, //g_stats.m_totalSpiderSuccessNew, //g_stats.m_totalSpiderSuccessOld, //g_stats.m_spiderSuccessNew + //g_stats.m_spiderSuccessOld, //g_stats.m_spiderSuccessNew, //g_stats.m_spiderSuccessOld, //g_stats.m_totalSpiderErrorsNew + //g_stats.m_totalSpiderErrorsOld, //g_stats.m_totalSpiderErrorsNew, //g_stats.m_totalSpiderErrorsOld, //g_stats.m_spiderErrorsNew + //g_stats.m_spiderErrorsOld, //g_stats.m_spiderErrorsNew, //g_stats.m_spiderErrorsOld, tsr, nsr, osr, tssr, nssr, ossr ); long bucketsNew[65536]; long bucketsOld[65536]; memset ( bucketsNew , 0 , 65536*4 ); memset ( bucketsOld , 0 , 65536*4 ); for ( long i = 0 ; i < points; i++ ) { long n = g_stats.m_errCodes[i]; if ( n < 0 || n > 65535 ) { log("admin: Bad spider error code."); continue; } if ( g_stats.m_isSampleNew[i] ) bucketsNew[n]++; else bucketsOld[n]++; } for ( long i = 0 ; i < 65536 ; i++ ) { if ( g_stats.m_allErrorsNew[i] == 0 && g_stats.m_allErrorsOld[i] == 0 && bucketsNew[i] == 0 && bucketsOld[i] == 0 ) continue; sb.safePrintf ( "" "" "" "" "" "" "" "\n" , LIGHT_BLUE, mstrerror(i), g_stats.m_allErrorsNew[i] + g_stats.m_allErrorsOld[i], g_stats.m_allErrorsNew[i], g_stats.m_allErrorsOld[i], bucketsNew[i] + bucketsOld[i] , bucketsNew[i] , bucketsOld[i] ); } sb.safePrintf ( "
" "
Spider Stats
" "TotalTotal NewTotal OldSampleSample NewSample Old" "
Total Spiders" "%lli%lli%lli%li%li%li
Successful Spiders" //"%lli%lli%lli%li%li%li
Failed Spiders" //"%lli%lli%lli%li%li%li
Success Rate" "%.02f%%%.02f%%%.02f%%%.02f%%%.02f%%%.02f%%
%s%lli%lli%lli%li%li%li

\n" ); // describe the various parms /* sb.safePrintf ( "" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "" "" "\n" "
" "Field descriptions" "
hitsThe number of attempts that were " "made by the spider to read a url from the spider " "queue cache.
missesThe number of those attempts that " "failed to get a url to spider.
cachedThe number of urls that are " "currently in the spider queue cache.
waterThe number of urls that were in the " "spider queue cache at any one time, since the start " "of the last disk scan.
kickedThe number of urls that were " "replaced in the spider queue cache with urls loaded " "from disk, since the start of the last disk scan.
addedThe number of urls that were added " "to the spider queue cache since the start of the last " "disk scan. After a document is spidered its url " "if often added again to the spider queue cache.
attemptedThe number of urls that " "Gigablast attempted to add to the spider queue cache " "since the start of the last disk scan. In " "a distributed environment, urls are distributed " "between twins so not all urls read will " "make it into the spider queue cache. Also includes " "spider recs attempted to be re-added to spiderdb " "after being spidering, but usually with a different " "spider time.
nlThis is 1 iff Gigablast currently " "needs to reload the spider queue cache from disk.
rnlThis is 1 iff Gigablast currently " "really needs to reload the spider queue cache from " "disk.
moreThis is 1 iff there are urls on " "the disk that are not in the spider queue cache.
loadingThis is 1 iff Gigablast is " "currently loading this spider cache queue from " "disk.
scannedThe number of bytes that were " "read from disk since the start of the last disk " "scan.
readsThe number of disk read " "operations since the start of the last disk " "scan.
elapsedThe time in seconds that has " "elapsed since the start or end of the last disk " "scan, depending on if a scan is currently in " "progress.
\n", LIGHT_BLUE , DARK_BLUE ); */ // done if no sc if ( ! sc ) { // get the socket TcpSocket *s = st->m_socket; // then we can nuke the state mdelete ( st , sizeof(State11) , "PageSpiderdb" ); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length() ); } ///// // // READY TO SPIDER table // ///// long ns = 0; if ( sc ) ns = sc->m_doleIpTable.getNumSlotsUsed(); // begin the table sb.safePrintf ( "\n" "\n" //,time,nowUTC ); // the table headers so SpiderRequest::printToTable() works if ( ! SpiderRequest::printTableHeader ( &sb ,false ) ) return false; // the the doledb spider recs char *bs = sbTable->getBufStart(); if ( bs && ! sb.safePrintf("%s",bs) ) return false; // end the table sb.safePrintf ( "
" "URLs Ready to Spider for collection " "%s" "" " (%li ips in doleiptable)" , TABLE_STYLE, st->m_coll , ns ); // print time format: 7/23/1971 10:45:32 time_t nowUTC = getTimeGlobal(); struct tm *timeStruct ; char time[256]; timeStruct = gmtime ( &nowUTC ); strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct ); sb.safePrintf("" // (current time = %s = %lu) " "
\n" ); sb.safePrintf ( "
\n" ); ///////////////// // // PRINT WAITING TREE // // each row is an ip. print the next url to spider for that ip. // ///////////////// sb.safePrintf ( "\n" "\n", timems, sc->m_waitingTree.getNumUsedNodes(), sc->m_waitingTable.getNumUsedSlots()); sb.safePrintf("",DARK_BLUE); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); // the the waiting tree long node = sc->m_waitingTree.getFirstNode(); long count = 0; for ( ; node >= 0 ; node = sc->m_waitingTree.getNextNode(node) ) { // breathe QUICKPOLL(MAX_NICENESS); // get key key_t *key = (key_t *)sc->m_waitingTree.getKey(node); // get ip from that long firstIp = (key->n0) & 0xffffffff; // get the time unsigned long long spiderTimeMS = key->n1; // shift upp spiderTimeMS <<= 32; // or in spiderTimeMS |= (key->n0 >> 32); // get the rest of the data sb.safePrintf("" "" "" "\n", LIGHT_BLUE, spiderTimeMS, iptoa(firstIp)); // stop after 20 if ( ++count == 20 ) break; } // ... if ( count ) sb.safePrintf("" "\n", LIGHT_BLUE); // end the table sb.safePrintf ( "
" "IPs Waiting for Selection Scan for collection " "%s" "" , TABLE_STYLE, st->m_coll ); // print time format: 7/23/1971 10:45:32 long long timems = gettimeofdayInMillisecondsGlobal(); sb.safePrintf(" (current time = %llu)(totalcount=%li)" "(waittablecount=%li)
spidertime (MS)firstip
%llu%s
...
\n" ); sb.safePrintf ( "
\n" ); // get the socket TcpSocket *s = st->m_socket; // then we can nuke the state mdelete ( st , sizeof(State11) , "PageSpiderdb" ); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(),sb.length() ); } /////////////////////////////////// // // URLFILTERS // /////////////////////////////////// /* // assign these a value of 1 in s_table hashtable static char *s_ypSites[] = { "www.yellow.com", "www.yellowpages.com", "www.dexknows.com", "yellowpages.aol.com", "www.superpages.com", "citysearch.com", "www.yellowbook.com", "www.magicyellow.com", "home.digitalcity.com", "www.switchboard.com", "cityguide.aol.com", "www.bizrate.com", "www.restaurantica.com", "www.insiderpages.com", "local.yahoo.com" }; // . assign these a value of 2 in s_table hashtable // . mwells@g0:/y$ cat gobyout | awk '{print $4}' | grep -v goby.com | grep -vi goby | grep -v google.com | grep -v mappoint | urlinfo | grep "host: " | awk '{print $2}' | sort | uniq > foo // . then take the top linked to sites on goby and print out for direct // insertion into this file: // then get the popular domains from THAT list: // mwells@g0:/y$ cat foo | awk '{print $2}' | urlinfo | grep "dom: " | awk '{print $2}' | sort | uniq -c | sort > foodom static char *s_aggSites[] = { "isuwmsrugby.tripod.com", "meyerlemon.eventbrite.com", "miami.tourcorp.com", "valentinesdaydatenightcoupleschi.eventbrite.com", "volcano.si.edu", "webpages.csus.edu", "weddingextravaganza.eventbrite.com", "www.alliancerugby.org", "www.asuwrfc.com", "www.btpd.org", "www.chicagodragons.org", "www.chsgeorgia.org", "www.derugbyfoundation.org", "www.foxborosportscenter.com", "www.lynn.edu", "www.owensboroparks.org", "www.scitrek.org", "www.southcarolinaparks.com", "www.usbr.gov", "dummil.eventbrite.com", "jacksonvilleantiqueshow.eventbrite.com", "kidsfest.eventbrite.com", "piuvalentine.eventbrite.com", "www.anytimefitness.com", "www.dumbartonhouse.org", "www.lsurugby.com", "www.maliburugby.com", "www.pitsrugby.com", "www.renegaderugby.org", "www.rotor.com", "www.rugbyrats.com", "www.sanjoserugby.com", "www.seattleartists.com", "www.sixflags.com", "www.vacavillesports.com", "atlcomedyfest.eventbrite.com", "easyweekdaycooking.eventbrite.com", "hartford.citysearch.com", "healthythaicooking.eventbrite.com", "hicaregiversconference.eventbrite.com", "skiing.alpinezone.com", "spirit.lib.uconn.edu", "springfield.ettractions.com", "tomatofest2011.eventbrite.com", "www.abc-of-meditation.com", "www.amf.com", "www.atlantaharlequins.com", "www.chicagoparkdistrict.com", "www.denverwildfirerfc.org", "www.gowaterfalling.com", "www.harlequins.org", "www.ignatius.org", "www.masmacon.com", "www.palmbeachrugby.org", "www.riversiderugby.com", "www.rmne.org", "www.thehilliard.org", "www.woodsmenrugby.com", "devildoll.eventbrite.com", "iexpectcrabfeedfundraiser.eventbrite.com", "sports.groups.yahoo.com", "valentinesdaycookingwithlove.eventbrite.com", "www.agisamazing.com", "www.ascendinglotus.com", "www.auduboninstitute.org", "www.azrugbyref.com", "www.blackicerugby.com", "www.bluegrassmuseum.org", "www.krewerugby.com", "www.lamorugby.com", "www.lsue.edu", "www.norwichrink.com", "www.ombac.org", "www.sdarmada.org", "www.sirensrugby.com", "www.tampabarbarians.org", "www.travellanecounty.org", "www.visit-newhampshire.com", "hawaii.tourcorp.com", "tasteofkorea.eventbrite.com", "www.ballyfitness.com", "www.calpolyrugby.com", "www.destateparks.com", "www.eaa.org", "www.goldsgym.com", 
"www.gonzagarugby.com", "www.greatexplorations.org", "www.heparks.org", "www.imagisphere.org", "www.jeffdavis.org", "www.park.granitecity.com", "www.poets.org", "www.regis.edu", "www.verizoncenter.com", "mybridalsale.eventbrite.com", "pigandsausagetoo.eventbrite.com", "www.gaelrugby.com", "www.independent.com", "www.kohlchildrensmuseum.org", "www.operaamerica.org", "www.recration.du.edu", "www.symmetricalskatingschool.org", "www.telcomhistory.org", "www.texasoutside.com", "reagan.eureka.edu", "stampede2011.eventbrite.com", "synergy2011.eventbrite.com", "theexperience2011.eventbrite.com", "www.24hourfitness.com", "www.dematha.org", "www.facebook.com", "www.iaapa.org", "www.icelandrestoration.com", "www.louisvillewomensrugby.com", "www.manchesterrunningcompany.com", "www.moaonline.org", "www.pvicechalet.com", "www.rendlake.com", "attinuptown.eventbrite.com", "chocolateanddessertfantasy.eventbrite.com", "colorado.ettractions.com", "longbeachstaterugby.webs.com", "volcano.oregonstate.edu", "www.columbiaspacescience.org", "www.eventful.com", "eventful.com", "www.newmexico.org", "www.rmparks.org", "www.sbyouthrugby.org", "www.venturacountyrugbyclub.com", "www.wheatonicearena.com", "faithorigins.eventbrite.com", "jerseyshore.metromix.com", "stlouis.citysearch.com", "valentinesdaydatenightcooking.eventbrite.com", "www.floridarugbyunion.com", "www.rugbyatucf.com", "www.stingrayrugby.com", "www.usfbullsrugby.com", "atlanta.going.com", "klsnzwineday.eventbrite.com", "losangeles.citysearch.com", "sourdough.eventbrite.com", "valentinesdaygourmetdating.eventbrite.com", "web.mit.edu", "www.airmuseum.org", "www.eparugby.org", "www.navicache.com", "www.siliconvalleyrugby.org", "www.yale.edu", "rhodeisland.ettractions.com", "studentorgs.vanderbilt.edu", "www.jaxrugby.org", "www.orlandomagazine.com", "www.plnurugby.com", "www.recreation.du.edu", "www.riversideraptors.com", "www.usarchery.org", "cacspringfling.eventbrite.com", "dallas.going.com", "groups.northwestern.edu", "hpualumniiphonelaunchparty.eventbrite.com", "juliachild.eventbrite.com", "southbaysciencesymposium2011.eventbrite.com", "www.curugby.com", "www.everyoneruns.net", "www.glendalerugby.com", "www.phantomsyouthrugby.org", "www.usdrugby.com", "10000expo-sponsoship-nec.eventbrite.com", "greenville.metromix.com", "spssan.eventbrite.com", "www.cmaathletics.org", "www.csulb.edu", "www.doralrugby.com", "www.neworleansrugbyclub.com", "www.sos.louisiana.gov", "www.southbayrugby.org", "www.travelnevada.com", "www.uicrugbyclub.org", "www.atlantabucksrugby.org", "www.dinodatabase.com", "www.fest21.com", "www.georgiatechrugby.com", "www.gsuwomensrugby.com", "www.siuwomensrugby.com", "www.snowtracks.com", "www.trainweb.com", "www.visitnebraska.gov", "www.visitsanantonio.com", "hometown.aol.com", "next2normal.eventbrite.com", "sixmonthpassatlanta2011.eventbrite.com", "winejazz2.eventbrite.com", "www.amityrugby.org", "www.meetandplay.com", "www.miami.edu", "www.miamirugby.com", "www.phillipscollection.org", "www.tridentsrugby.com", "wwwbloggybootcampsandiego.eventbrite.com", "whale-watching.gordonsguide.com", "www.culturemob.com", "www.denver-rugby.com", "www.hillwoodmuseum.org", "www.peabody.yale.edu", "www.yoursciencecenter.com", "newyorkcity.ettractions.com", "rawfoodcert.eventbrite.com", "www.discoverydepot.org", "www.dukecityrugbyclub.com", "www.jazztimes.com", "www.kissimmeeairmuseum.com", "www.southstreetseaportmuseum.org", "www.wsbarbariansrugby.com", "beerunch2011.eventbrite.com", "milwaukee.ettractions.com", "seminoletampa.casinocity.com", 
"silveroak.eventbrite.com", "tsunamifitclub.eventbrite.com", "walking-tours.gordonsguide.com", "www.alamedarugby.com", "www.atshelicopters.com", "www.camelbackrugby.com", "www.dlshs.org", "www.eteamz.com", "newyork.ettractions.com", "www.allaboutrivers.com", "www.childrensmuseumatl.org", "www.hartfordroses.org", "www.nationalparks.org", "www.seahawkyouthrugby.com", "www.skiingthebackcountry.com", "epcontinental.eventbrite.com", "healthandwellnessshow.eventbrite.com", "www.apopkamuseum.org", "www.condorsrugby.com", "www.dcr.virginia.gov", "www.diabloyouthrugby.org", "www.rockandice.com", "honolulu.metromix.com", "mowcrabfeed2011.eventbrite.com", "ptt-superbowl.eventbrite.com", "whitewater-rafting.gordonsguide.com", "winearomatraining.eventbrite.com", "www.broadway.com", "www.usc.edu", "www.gatorrugby.com", "www.iumudsharks.net", "www.scrrs.net", "www.sfggrugby.com", "www.unco.edu", "hctmspring2011conference.eventbrite.com", "sandiego.going.com", "www.crt.state.la.us", "www.foodhistorynews.com", "www.lancerrugbyclub.org", "www.littlerockrugby.com", "www.sharksrugbyclub.com", "www.channelislandsice.com", "www.idealist.org", "www.mbtykesrugby.com", "katahdicon.eventbrite.com", "foodwineloversfestival.eventbrite.com", "maristeveningseries2011.eventbrite.com", "philadelphia.ettractions.com", "sugarrushla.eventbrite.com", "www.chicagolions.com", "www.skatingsafe.com", "www.themeparkinsider.com", "fremdcraftfairspring2011.eventbrite.com", "gorptravel.away.com", "minnesota.ettractions.com", "www.chicagohopeacademy.org", "www.fmcicesports.com", "www.kitebeaches.com", "www.mixedmartialarts.com", "www.slatermill.org", "www.sunnysideoflouisville.org", "www.visitrochester.com", "careshow.eventbrite.com", "massachusetts.ettractions.com", "edwardianla2011.eventbrite.com", "indianapolis.metromix.com", "www.pasadenamarathon.org", "washington.going.com", "www.sjquiltmuseum.org", "www.wannakitesurf.com", "fauwomensrugby.sports.officelive.com", "newhampshire.ettractions.com", "www.vcmha.org", "milwaukee.going.com", "phoenix.going.com", "www.anrdoezrs.net", "www.temperugby.com", "pampermefabulous2011.eventbrite.com", "www.napavalleyvineyards.org", "r4k11.eventbrite.com", "ramonamusicfest.eventbrite.com", "www.abc-of-rockclimbing.com", "www.geocities.com", "jackson.metromix.com", "www.santamonicarugby.com", "cleveland.metromix.com", "lancaster.ettractions.com", "www.fortnet.org", "www.horseandtravel.com", "www.pubcrawler.com", "kdwp.state.ks.us", "www.berkeleyallblues.com", "www.liferugby.com", "www.socalmedicalmuseum.org", "www.dcsm.org", "www.sutler.net", "desmoines.metromix.com", "www.cavern.com", "www.dotoledo.org", "www.fws.gov", "www.ghosttowngallery.com", "www.museumamericas.org", "www.museumsofboston.org", "www.northshorerugby.com", "geocaching.gpsgames.org", "www.americaeast.com", "www.cwrfc.org", "www.jewelryshowguide.com", "www.livelytimes.com", "www.pascorugbyclub.com", "www.westminsterice.com", "www.claremontrugby.org", "www.jugglingdb.com", "www.metalblade.com", "www.preservationnation.org", "sofla2011.eventbrite.com", "www.belmonticeland.com", "www.dropzone.com", "www.smecc.org", "www.studentgroups.ucla.edu", "www.visitdetroit.com", "honolulu.going.com", "sippingandsaving5.eventbrite.com", "www.connecticutsar.org", "www.guestranches.com", "www.nvtrailmaps.com", "www.visitnh.gov", "illinois.ettractions.com", "www.spymuseum.org", "www.ci.riverside.ca.us", "www.hbnews.us", "www.santaclarayouthrugby.com", "www.thestranger.com", "www.freewebs.com", "www.miamirugbykids.com", 
"www.mtwashingtonvalley.org", "www.ocbucksrugby.com", "bridalpaloozala.eventbrite.com", "maps.yahoo.com", "www.azstateparks.com", "www.paywindowpro.com", "www.rowadventures.com", "parksandrecreation.idaho.gov", "www.artsmemphis.org", "www.lasvegasweekly.com", "www.redmountainrugby.org", "san-francisco.tourcorp.com", "www.khsice.com", "www.vansenusauto.com", "quinceanerasmagazineoc.eventbrite.com", "www.mvc-sports.com", "www.tbsa.com", "www.travelportland.com", "rtnpilgrim.eventbrite.com", "www.bigfishtackle.com", "www.centralmass.org", "cpca2011.eventbrite.com", "www.matadorrecords.com", "www.sebabluegrass.org", "prescott.showup.com", "vintagevoltage2011.eventbrite.com", "www.seattleperforms.com", "www.valleyskating.com", "resetbootcamp.eventbrite.com", "www.abc-of-mountaineering.com", "www.snocountry.com", "events.nytimes.com", "www.icecenter.net", "www.livefrommemphis.com", "www.pasadenarfc.com", "www.ucsdrugby.com", "uclaccim.eventbrite.com", "www.visitchesapeake.com", "www.natureali.org", "www.nordicskiracer.com", "www.nowplayingva.org", "www.sbcounty.gov", "www.seedesmoines.com", "www.world-waterfalls.com", "denver.going.com", "hearstmuseum.berkeley.edu", "www.lmurugby.com", "www.ftlrugby.com", "www.pelicanrugby.com", "rtnharthighschool.eventbrite.com", "www.visitri.com", "www.aba.org", "www.americaonice.us", "www.thecontemporary.org", "www.wherigo.com", "www.drtopo.com", "www.visitseattle.org", "calendar.dancemedia.com", "trips.outdoors.org", "www.chs.org", "www.myneworleans.com", "www.oaklandice.com", "nashville.metromix.com", "www.americangolf.com", "www.fossilmuseum.net", "www.oakparkparks.com", "www.visit-maine.com", "www.oregonlive.com", "www.allwashingtondctours.com", "www.wannadive.net", "www.sportsheritage.org", "hudsonvalley.metromix.com", "www.scificonventions.com", "www.wildernessvolunteers.org", "essencemusicfestival.eventbrite.com", "www.kitesurfatlas.com", "www.ndtourism.com", "valentinesgourmetdatingchicago.eventbrite.com", "www.fingerlakeswinecountry.com", "www.dmnh.org", "www.ticketnetwork.com", "partystroll.eventbrite.com", "www.bedandbreakfastnetwork.com", "www.sternmass.org", "www.visitnh.com", "www.places2ride.com", "www.hawaiieventsonline.com", "www.ucirugby.com", "www.gohawaii.com", "www.writersforum.org", "www.roadracingworld.com", "www.bigisland.org", "www.boatbookings.com", "www.lhs.berkeley.edu", "www.dnr.state.mn.us", "www.mostateparks.com", "www.historicnewengland.org", "www.waza.org", "www.backbayrfc.com", "newyork.metromix.com", "www.larebellion.org", "teetimes.golfhub.com", "10000expo-sponsoship-ceg.eventbrite.com", "10000expo-sponsor-bjm.eventbrite.com", "parks.ky.gov", "www.bostonusa.com", "www.visitbuffaloniagara.com", "www.sharksice.com", "2011burbankapprentice.eventbrite.com", "kansascity.ettractions.com", "www.bicycling.com", "www.cityofchino.org", "www.ridingworld.com", "www.whittierrugby.com", "10000bestjobsam.eventbrite.com", "www.adventurecentral.com", "www.earlymusic.org", "www.upcomingevents.com", "www.sleddogcentral.com", "www.capecodkidz.com", "www.collectorsguide.com", "www.cougarrugby.org", "www.sfvrugby.com", "strivetothrivepabcconf.eventbrite.com", "www.visithoustontexas.com", "www.authorstrack.com", "www.aboutgolfschools.org", "www.huntingspotz.com", "www.lib.az.us", "members.aol.com", "www.fs.fed.us", "www.ncarts.org", "www.vermonttravelplanner.org", //"www.scubadiving.com", "www.waterfallsnorthwest.com", "www.philadelphiausa.travel", "www.usgolfschoolguide.com", "njgin.state.nj.us", "www.artcards.cc", "www.rimonthly.com", 
"www.atlanta.net", "www.glacialgardens.com", "2011superbowlcruise.eventbrite.com", "swimming-with-dolphins.gordonsguide.com", "www.trackpedia.com", // why was this in there? //"www.dailyherald.com", "www.nhm.org", "boston.ettractions.com", "www.geneseefun.com", "www.travelsd.com", "www.golfbuzz.com", "www.in.gov", "cincinnati.metromix.com", "www.sanjose.com", "brevard.metromix.com", "www.dogsledrides.com", "www.orvis.com", "philadelphia.going.com", "twincities.metromix.com", "www.orlandorugby.com", "www.csufrugby.com", "www.larugby.com", "www.washingtonwine.org", "calendar.gardenweb.com", "gulfcoast.metromix.com", "florida.ettractions.com", "www.northeastwaterfalls.com", "www.computerhistory.org", "www.ct.gov", "www.hosteltraveler.com", "www.thinkrentals.com", "www.4x4trailhunters.com", "www.cityweekly.net", "www.yourrunning.com", "www.spasofamerica.com", "www.indoorclimbing.com", "www.utah.com", "boston.going.com", "minneapolisstpaul.ettractions.com", "www.coolrunning.com", "www.greensboronc.org", "www.michigan.org", "www.artfestival.com", "www.divespots.com", "www.oregonstateparks.org", "www.virginiawine.org", "www.morebeach.com", "www.minnesotamonthly.com", "www.texasescapes.com", "www.usatf.org", "www.findrentals.com", "www.hachettebookgroup.com", "www.racesonline.com", "www.usace.army.mil", "web.georgia.org", "detroit.metromix.com", "www.homebrewersassociation.org", "www.baltimore.org", "www.gastateparks.org", "www.arkansasstateparks.com", "www.visitlasvegas.com", "www.whenwerv.com", "www.chilicookoff.com", "www.bikeride.com", "www.eaglerockrugby.com", "www.pickwickgardens.com", "flagstaff.showup.com", "miami.going.com", "www.anchorage.net", "www.wlra.us", "www.thetrustees.org", "www.artnet.com", "www.mthoodterritory.com", "www.hihostels.com", "www.bfa.net", "167.102.232.26", "www.flyins.com", "www.stepintohistory.com", "www.festing.com", "www.pursuetheoutdoors.com", "newyork.going.com", "www.fishingguidenetwork.com", "www.visit-massachusetts.com", "www.visitindy.com", "www.washingtonpost.com", "www.greatamericandays.com", "www.washingtonian.com", "national.citysearch.com", "www.infohub.com", "www.productionhub.com", "www.events.org", "www.traveliowa.com", "www.findmyadventure.com", "delaware.metromix.com", "www.marinmagazine.com", "us.penguingroup.com", "www.bicycletour.com", "www.travelok.com", "www.scububble.com", "www.childrensmuseums.org", "www.conventionscene.com", "www.scubaspots.com", "www.tnvacation.com", "stlouis.ettractions.com", "www.mxparks.com", "florida.greatestdivesites.com", "www.nowplayingaustin.com", "www.skinnyski.com", "www.sportoften.com", "www.zvents.com", "www.visitphoenix.com", "palmsprings.metromix.com", "upcoming.yahoo.com", "www.washington.org", "www.balloonridesacrossamerica.com", "www.playbill.com", "palmbeach.ettractions.com", "louisville.metromix.com", "www.animecons.com", "www.findanartshow.com", "www.usef.org", "www.villagevoice.com", "www.discovergold.org", "www.georgiaoffroad.com", "www.memphistravel.com", "dc.metromix.com", "www.aplf-planetariums.info", "www.skateisi.com", "www.usacycling.org", "www.wine-compass.com", "www.visitdelaware.com", "tucson.metromix.com", "www.happycow.net", "www.indiecraftshows.com", "www.gethep.net", "www.agritourismworld.com", "stlouis.metromix.com", "phoenix.metromix.com", "stream-flow.allaboutrivers.com", "www.festivalsandevents.com", "www.winemcgee.com", "www.aurcade.com", "www.visitjacksonville.com", "www.nashvillescene.com", "www.4x4trails.net", "www.americancraftmag.org", "blog.danceruniverse.com", 
"www.vacationrealty.com", "www.californiasciencecenter.org", "www.rollerhome.com", "www.atvsource.com", "www.hotairballooning.com", "www.freeskateparks.com", "www.ruralbounty.com", "connecticut.ettractions.com", "www.localattractions.com", "www.skategroove.com", "www.hawaiitours.com", "www.visitrhodeisland.com", "www.swac.org", "www.swimmingholes.org", "www.roadfood.com", "www.gotriadscene.com", "www.runnersworld.com", "www.outerquest.com", "www.seattleweekly.com", "www.onlyinsanfrancisco.com", "www.bikereg.com", "www.artslant.com", "www.louisianatravel.com", "www.operabase.com", "www.stepintoplaces.com", "www.vinarium-usa.com", "www.visitconnecticut.com", "www.abc-of-mountainbiking.com", "www.wannask8.com", "www.xcski.org", "www.active-days.org", "www.hawaiiactivities.com", "www.massvacation.com", "www.uspa.org", "miami.ettractions.com", "www.abc-of-hiking.com", "www.bestofneworleans.com", "www.phillyfunguide.com", "www.beermonthclub.com", "www.newenglandwaterfalls.com", "www.lake-link.com", "www.festivalfinder.com", "www.visitmississippi.org", "www.lanierbb.com", "www.thepmga.com", "www.skitown.com", "www.fairsandfestivals.net", "sanfrancisco.going.com", "www.koa.com", "www.wildlifeviewingareas.com", "www.boatrenting.com", "www.nowplayingutah.com", "www.ultimaterollercoaster.com", "www.findacraftfair.com", "www.ababmx.com", "www.abc-of-skiing.com", "www.pw.org", "tampabay.metromix.com", "www.onthesnow.com", "www.sunny.org", "www.visitnewengland.com", "atlanta.metromix.com", "www.allaboutapples.com", "www.monsterjam.com", "www.bnbfinder.com", "www.sandiego.org", "www.worldcasinodirectory.com", "www.yoga.com", "www.1-800-volunteer.org", "www.visitkc.com", "www.theskichannel.com", "www.thephoenix.com", "www.virginia.org", "www.avclub.com", "www.orlandoinfo.com", "www.trustedtours.com", "www.peakradar.com", "web.minorleaguebaseball.com", "www.artshound.com", "www.daytonabeach.com", "chicago.going.com", "www.cetaceanwatching.com", "www.citypages.com", "www.nowplayingnashville.com", "www.discoverlosangeles.com", "www.ratebeer.com", "www.harpercollins.com", "www.seenewengland.com", "www.visitmt.com", "www.goldstar.com", "www.caverbob.com", "www.sanjose.org", "www.backcountrysecrets.com", "authors.simonandschuster.com", "rafting.allaboutrivers.com", "chicago.ettractions.com", "iweb.aam-us.org", "www.theputtingpenguin.com", "www.festivals.com", "www.artsboston.org", "www.aboutskischools.com", "tucson.showup.com", "www.thiswaytothe.net", "www.rei.com", "www.magicseaweed.com", "www.waterfallswest.com", "fortlauderdale.ettractions.com", "www.foodreference.com", "www.californiawineryadvisor.com", "www.teamap.com", "www.neworleanscvb.com", "www.skatetheory.com", "www.visitmaine.com", "www.rollerskating.org", "www.culturecapital.com", "www.delawarescene.com", "www.nyc-arts.org", "www.huntingoutfitters.net", "www.showcaves.com", "www.soccerbars.com", "www.visitnewportbeach.com", "www.beerme.com", "www.pitch.com", "www.museum.com", "www.hauntworld.com", "www.forestcamping.com", "www.dogpark.com", "www.critterplaces.com", "www.visitnj.org", "www.findagrave.com", "www.arcadefly.com", "www.winerybound.com", "www.usms.org", "www.zipscene.com", "www.horsetraildirectory.com", "www.coaster-net.com", "www.anaheimoc.org", "www.visitpa.com", "www.antiquetrader.com", "www.dallasobserver.com", "www.eventsetter.com", "www.goingoutside.com", "www.sightseeingworld.com", "www.artlog.com", "www.bnbstar.com", "www.hostels.com", "www.theartnewspaper.com", "consumer.discoverohio.com", "www.nssio.org", 
"www.wingshootingusa.org", "www.shootata.com", "www.randomhouse.com", "www.artforum.com", "www.bachtrack.com", "www.wayspa.com", "www.visitidaho.org", "www.exploreminnesota.com", "chicago.metromix.com", "www.worldgolf.com", "nysparks.state.ny.us", "www.meetup.com", "www.skateboardparks.com", "www.downtownjacksonville.org", "www.lighthousefriends.com", "www.strikespots.com", "ww2.americancanoe.org", "www.inlandarts.com", "www.horseshowcentral.com", "www.ridingresource.com", "www.experiencewa.com", "database.thrillnetwork.com", "denver.metromix.com", "www.bostoncentral.com", "www.segwayguidedtours.com", "www.colorado.com", "www.artandseek.org", "www.floridastateparks.org", "www.sparkoc.com", "losangeles.going.com", "www.motorcycleevents.com", "www.destination-store.com", "www.scubadviser.com", "www.booktour.com", "www.cloud9living.com", "www.allaboutjazz.com", "www.sacramento365.com", "www.discoversouthcarolina.com", "www.riverfronttimes.com", "www.hauntedhouses.com", "www.arenamaps.com", "www.artsnwct.org", "www.eventbrite.com", "animal.discovery.com", "www.eatfeats.com", "www.1001seafoods.com", "www.malletin.com", "www.yelp.com", "www.wannasurf.com", "www.clubplanet.com", "www.dupagecvb.com", "www.smartdestinations.com", "www.artfaircalendar.com", "www.excitations.com", "www.balloonrideus.com", "www.extravagift.com", "www.skisite.com", "www.orlandoweekly.com", "www.iloveny.com", "www.sandiegoreader.com", "web.usarugby.org", "www.artscalendar.com", "www.sfweekly.com", "store-locator.barnesandnoble.com", "www.realhaunts.com", "trails.mtbr.com", "www.bbonline.com", "www.pickyourownchristmastree.org", "events.myspace.com", "www.alabama.travel", "www.ctvisit.com", "freepages.history.rootsweb.com", "www.waterparks.com", "www.flavorpill.com", "www.marinasdirectory.org", "www.publicgardens.org", "www.alwaysonvacation.com", "www.infosports.com", "www.summitpost.org", "www.exploregeorgia.org", "www.brewerysearch.com", "www.phoenixnewtimes.com", "www.marinas.com", "www.arestravel.com", "www.gamebirdhunts.com", "www.cbssports.com", "tutsan.forest.net", "www.azcentral.com", "www.tennispulse.org", "www.westword.com", "www.factorytoursusa.com", "www.americanwhitewater.org", "www.spamagazine.com", "www.dogparkusa.com", "tps.cr.nps.gov", "www.sfstation.com", "www.abc-of-yoga.com", "www.worldeventsguide.com", "www.active.com", "www.beerexpedition.com", "www.iloveinns.com", "www.warpig.com", "www.artsopolis.com", "www.skatepark.com", "www.offroadnorthamerica.com", "www.visitflorida.com", "www.last.fm", "www.pbplanet.com", "www.traveltex.com", "phoenix.showup.com", "www.travelandleisure.com", "www.kentuckytourism.com", "www.gospelgigs.com", "www.whenwegetthere.com", "www.surfline.com", "www.stubhub.com", "www.centerstagechicago.com", "www.sunshineartist.com", "www.reserveamerica.com", "www.clubzone.com", "www.paddling.net", "www.xperiencedays.com", "www.razorgator.com", "www.dalejtravis.com", "www.pickyourown.org", "www.localhikes.com", "www.parks.ca.gov", "www.casinocity.com", "www.nofouls.com", "www.laweekly.com", "www.denver.org", "www.enjoyillinois.com", "www.livenation.com", "www.viator.com", "members.bikeleague.org", "www.skatespotter.com", "family.go.com", "www.myspace.com", "www.takemefishing.org", "www.localwineevents.com", "www.rinkdirectory.com", "www.walkjogrun.net", "www.nps.gov", "www.ghosttowns.com", "www.theatermania.com", "www.skateboardpark.com", "www.miaminewtimes.com", "www.explorechicago.org", "www.ocweekly.com", "www.ustasearch.com", "www.rateclubs.com", "www.tennismetro.com", 
"www.motorcyclemonster.com", "www.hauntedhouse.com", "www.pumpkinpatchesandmore.org", "www.courtsoftheworld.com", "www.ecoanimal.com", "www.yogafinder.com", "www.traillink.com", "www.equinenow.com", "www.jambase.com", "www.spaemergency.com", //"www.vacationhomerentals.com", "www.ava.org", "affiliate.isango.com", "www.museumland.net", "www.dirtworld.com", "www.rockclimbing.com", "www.kijubi.com", "www.outdoortrips.info", "www.visitcalifornia.com", "www.heritagesites.com", "www.bedandbreakfast.com", "www.discoveramerica.com", "www.singletracks.com", "www.museumstuff.com", "www.opentable.com", "www.homeaway.com", "www.thegolfcourses.net", "www.golflink.com", "www.trekaroo.com", "gocitykids.parentsconnect.com", "www.wildernet.com", "www.10best.com", "swim.isport.com", "www.wheretoshoot.org", "www.hostelworld.com", "www.landbigfish.com", "www.recreation.gov", "www.healthclubdirectory.com", "www.spafinder.com", "www.nationalregisterofhistoricplaces.com", "www.americantowns.com", "www.hmdb.org", "www.golfnow.com", "www.grandparents.com", "www.swimmersguide.com", "www.luxergy.com", "activities.wildernet.com", "events.mapchannels.com", "www.museumsusa.org", "www.rinktime.com", "www.rentandorbuy.com", "www.mytravelguide.com", "playspacefinder.kaboom.org", "www.famplosion.com", "www.eviesays.com", "www.anglerweb.com", "www.trails.com", "www.waymarking.com", "www.priceline.com", "local.yahoo.com", "ticketmaster.com", // rss feeds "trumba.com", // movie times: "cinemark.com", // domains (hand selected from above list filtered with urlinfo) "patch.com", "gordonsguide.com", "tourcorp.com", "americangolf.com", "casinocity.com", "going.com", "metromix.com", "ettractions.com", "citysearch.com", "eventbrite.com" }; */ /* static HashTableX s_table; static bool s_init = false; static char s_buf[25000]; static long s_craigsList; bool initAggregatorTable ( ) { // this hashtable is used for "isyellowpages" and "iseventaggregator" if ( s_init ) return true; // use niceness 0 s_table.set(4,1,4096,s_buf,25000,false,0,"spsitetbl"); // now stock it with yellow pages sites long n = (long)sizeof(s_ypSites)/ sizeof(char *); for ( long i = 0 ; i < n ; i++ ) { char *s = s_ypSites[i]; long slen = gbstrlen ( s ); long h32 = hash32 ( s , slen ); char val = 1; if ( ! s_table.addKey(&h32,&val)) {char*xx=NULL;*xx=0;} } // then stock with event aggregator sites n = (long)sizeof(s_aggSites)/ sizeof(char *); for ( long i = 0 ; i < n ; i++ ) { char *s = s_aggSites[i]; long slen = gbstrlen ( s ); long h32 = hash32 ( s , slen ); char val = 2; if ( ! s_table.addKey(&h32,&val)) {char*xx=NULL;*xx=0;} } // do not repeat this s_init = true; s_craigsList = hash32n("craigslist.org"); return true; } bool isAggregator ( long siteHash32,long domHash32,char *url,long urlLen ) { // make sure its stocked initAggregatorTable(); // is site a hit? char *v = (char *)s_table.getValue ( &siteHash32 ); // hit? if ( v && *v ) return true; // try domain? v = (char *)s_table.getValue ( &domHash32 ); // hit? if ( v && *v ) return true; // these guys mirror eventful.com's db so let's grab it... // abcd.com if ( urlLen>30 && url[11]=='t' && url[18]=='o' && strncmp(url,"http://www.thingstodoin",23) == 0 ) return true; // craigslist if ( domHash32 == s_craigsList && strstr(url,".com/cal/") ) return true; // otherwise, no return false; } */ #define SIGN_EQ 1 #define SIGN_NE 2 #define SIGN_GT 3 #define SIGN_LT 4 #define SIGN_GE 5 #define SIGN_LE 6 // from PageBasic.cpp char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) ; // . 
this is called by SpiderCache.cpp for every url it scans in spiderdb // . we must skip certain rules in getUrlFilterNum() when doing to for Msg20 // because things like "parentIsRSS" can be both true or false since a url // can have multiple spider recs associated with it! long getUrlFilterNum2 ( SpiderRequest *sreq , SpiderReply *srep , long nowGlobal , bool isForMsg20 , long niceness , CollectionRec *cr , bool isOutlink , HashTableX *quotaTable ) { // convert lang to string char *lang = NULL; long langLen = 0; if ( srep ) { // this is NULL on corruption lang = getLanguageAbbr ( srep->m_langId ); langLen = gbstrlen(lang); } char *tld = (char *)-1; long tldLen; long urlLen = sreq->getUrlLen(); char *url = sreq->m_url; char *row; bool checkedRow = false; //SpiderColl *sc = cr->m_spiderColl; SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum); //if ( strstr(url,"http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2" )) // log("hey"); //initAggregatorTable(); //long tldlen2; //char *tld2 = getTLDFast ( sreq->m_url , &tldlen2); //bool bad = true; //if ( tld2[0] == 'c' && tld2[1] == 'o' && tld2[2]=='m' ) bad = false; //if ( tld2[0] == 'o' && tld2[1] == 'r' && tld2[2]=='g' ) bad = false; //if ( tld2[0] == 'u' && tld2[1] == 's' ) bad = false; //if ( tld2[0] == 'g' && tld2[1] == 'o' && tld2[2]=='v' ) bad = false; //if ( tld2[0] == 'e' && tld2[1] == 'd' && tld2[2]=='u' ) bad = false; //if ( tld2[0] == 'i' && tld2[1] == 'n' && tld2[2]=='f' ) bad = false; //if ( bad ) // log("hey"); // shortcut char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart(); char *upp = cr->m_diffbotUrlProcessPattern.getBufStart(); if ( upp && ! upp[0] ) upp = NULL; if ( ucp && ! ucp[0] ) ucp = NULL; // get the compiled regular expressions regex_t *ucr = &cr->m_ucr; regex_t *upr = &cr->m_upr; if ( ! cr->m_hasucr ) ucr = NULL; if ( ! cr->m_hasupr ) upr = NULL; char *ext; char *special; // CONSIDER COMPILING FOR SPEED: // 1) each command can be combined into a bitmask on the spiderRequest // bits, or an access to m_siteNumInlinks, or a substring match // 2) put all the strings we got into the list of Needles // 3) then generate the list of needles the SpiderRequest/url matches // 4) then reduce each line to a list of needles to have, a // min/max/equal siteNumInlinks, min/max/equal hopCount, // and a bitMask to match the bit flags in the SpiderRequest // stop at first regular expression it matches for ( long i = 0 ; i < cr->m_numRegExs ; i++ ) { // breathe QUICKPOLL ( niceness ); // get the ith rule SafeBuf *sb = &cr->m_regExs[i]; //char *p = cr->m_regExs[i]; char *p = sb->getBufStart(); checkNextRule: // skip leading whitespace while ( *p && isspace(*p) ) p++; // do we have a leading '!' bool val = 0; if ( *p == '!' ) { val = 1; p++; } // skip whitespace after the '!' while ( *p && isspace(*p) ) p++; // new rules for when to download (diffbot) page if ( *p == 'm' && p[1]== 'a' && p[2]== 't' && p[3]== 'c' && p[4]== 'h' && p[5]== 'e' && p[6]== 's' && p[7]== 'u' && p[8]== 'c' && p[9]== 'p' ) { // . skip this expression row if does not match // . url must match one of the patterns in there. // . inline this for speed // . "ucp" is a ||-separated list of substrings // . "ucr" is a regex // . regexec returns 0 for a match if ( ucr && regexec(ucr,url,0,NULL,0) && // seed or other manual addition always matches ! sreq->m_isAddUrl && ! sreq->m_isInjecting ) continue; // do not require a match on ucp if ucr is given if ( ucp && ! ucr && ! 
doesStringContainPattern(url,ucp) && // seed or other manual addition always matches ! sreq->m_isAddUrl && ! sreq->m_isInjecting ) continue; p += 10; p = strstr(p,"&&"); if ( ! p ) return i; p += 2; goto checkNextRule; } // new rules for when to "process" (diffbot) page if ( *p == 'm' && p[1]== 'a' && p[2]== 't' && p[3]== 'c' && p[4]== 'h' && p[5]== 'e' && p[6]== 's' && p[7]== 'u' && p[8]== 'p' && p[9]== 'p' ) { // . skip this expression row if does not match // . url must match one of the patterns in there. // . inline this for speed // . "upp" is a ||-separated list of substrings // . "upr" is a regex // . regexec returns 0 for a match if ( upr && regexec(upr,url,0,NULL,0) ) continue; if ( upp && !upr &&!doesStringContainPattern(url,upp)) continue; p += 10; p = strstr(p,"&&"); if ( ! p ) return i; p += 2; goto checkNextRule; } if ( *p=='h' && strncmp(p,"hasauthorityinlink",18) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // skip if not valid (pageaddurl? injection?) if ( ! sreq->m_hasAuthorityInlinkValid ) continue; // if no match continue if ( (bool)sreq->m_hasAuthorityInlink==val)continue; // skip p += 18; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( *p=='h' && strncmp(p,"hascontactinfo",14) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // skip if not valid (pageaddurl? injection?) if ( ! sreq->m_hasContactInfoValid ) continue; // if no match continue if ( (bool)sreq->m_hasContactInfo==val ) continue; // skip p += 14; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( *p=='h' && strncmp(p,"hasaddress",10) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // reply based if ( ! srep ) continue; // skip if not valid (pageaddurl? injection?) if ( ! srep->m_hasAddressValid ) continue; // if no match continue if ( (bool)srep->m_hasAddress==val ) continue; // skip p += 10; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( *p=='h' && strncmp(p,"hastod",6) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // reply based if ( ! srep ) continue; // skip if not valid (pageaddurl? injection?) if ( ! srep->m_hasTODValid ) continue; // if no match continue if ( (bool)srep->m_hasTOD==val ) continue; // skip p += 6; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // if we got a reply, we are not new!! //if ( (bool)srep == (bool)val ) continue; if ( (bool)(sreq->m_hadReply) == (bool)val ) continue; // skip it for speed p += 8; // check for && p = strstr(p, "&&"); // if nothing, else then it is a match if ( ! p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; } // hastmperror, if while spidering, the last reply was // like EDNSTIMEDOUT or ETCPTIMEDOUT or some kind of // usually temporary condition that warrants a retry if ( *p=='h' && strncmp(p,"hastmperror",11) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // reply based if ( ! 
srep ) continue; // get our error code long errCode = srep->m_errCode; // . make it zero if not tmp error // . now have EDOCUNCHANGED and EDOCNOGOODDATE from // Msg13.cpp, so don't count those here... if ( errCode != EDNSTIMEDOUT && errCode != ETCPTIMEDOUT && errCode != EDNSDEAD && // assume diffbot is temporarily experiencing errs errCode != EDIFFBOTINTERNALERROR && // out of memory while crawling? errCode != ENOMEM && errCode != ENETUNREACH && errCode != EHOSTUNREACH ) errCode = 0; // if no match continue if ( (bool)errCode == val ) continue; // skip p += 11; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } /* if ( *p=='h' && strncmp(p,"hassitevenue",12) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // skip if not valid (pageaddurl? injection?) if ( ! sreq->m_hasSiteVenueValid ) continue; // if no match continue if ( (bool)sreq->m_hasSiteVenue==val ) continue; // allow "!isindexed" if no SpiderReply at all //if ( ! srep && val == 0 ) continue; // skip p += 12; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } */ if ( *p != 'i' ) goto skipi; if ( strncmp(p,"isinjected",10) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // if no match continue if ( (bool)sreq->m_isInjecting==val ) continue; // skip p += 10; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( strncmp(p,"isdocidbased",12) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // if no match continue if ( (bool)sreq->m_urlIsDocId==val ) continue; // skip p += 10; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( strncmp(p,"iscontacty",10) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // skip if not valid if ( ! sreq->m_isContactyValid ) continue; // if no match continue if ( (bool)sreq->m_isContacty==val ) continue; // skip p += 10; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } // is it in the big list of sites? if ( strncmp(p,"insitelist",10) == 0 ) { // skip for msg20 //if ( isForMsg20 ) continue; if ( ! checkedRow ) { // only do once for speed checkedRow = true; // this function is in PageBasic.cpp row = getMatchingUrlPattern ( sc, sreq ); } // if we are not submitted from the add url api, skip if ( (bool)row == val ) continue; // skip p += 10; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } // . was it submitted from PageAddUrl.cpp? // . replaces the "add url priority" parm if ( strncmp(p,"isaddurl",8) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // if we are not submitted from the add url api, skip if ( (bool)sreq->m_isAddUrl == val ) continue; // skip p += 8; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( p[0]=='i' && strncmp(p,"ismanualadd",11) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // . if we are not submitted from the add url api, skip // . if we have '!' then val is 1 if ( sreq->m_isAddUrl || sreq->m_isInjecting || sreq->m_isPageParser ) { if ( val ) continue; } else { if ( ! val ) continue; } // skip p += 11; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! 
p ) return i; p += 2; goto checkNextRule; } // does it have an rss inlink? we want to expedite indexing // of such pages. i.e. that we gather from an rss feed that // we got from a pingserver... if ( strncmp(p,"isparentrss",11) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // if we have no such inlink if ( (bool)sreq->m_parentIsRSS == val ) continue; // skip p += 11; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } /* if ( strncmp(p,"isparentindexed",16) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // if we have no such inlink if ( (bool)sreq->m_wasParentIndexed == val ) continue; // skip p += 16; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } */ // we can now handle this guy since we have the latest // SpiderReply, pretty much guaranteed if ( strncmp(p,"isindexed",9) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // skip if reply does not KNOW because of an error // since XmDoc::indexDoc() called // XmlDoc::getNewSpiderReply() and did not have this // info... if ( srep && (bool)srep->m_isIndexedINValid ) continue; // if no match continue if ( srep && (bool)srep->m_isIndexed==val ) continue; // allow "!isindexed" if no SpiderReply at all if ( ! srep && val == 0 ) continue; // skip p += 9; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( strncmp(p,"ingoogle",8) == 0 ) { if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // skip if not valid (pageaddurl? injection?) if ( ! sreq->m_inGoogleValid ) continue; // if no match continue if ( (bool)sreq->m_inGoogle == val ) continue; // allow "!isindexed" if no SpiderReply at all if ( ! srep && val == 0 ) continue; // skip p += 8; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } // . check to see if a page is linked to by // www.weblogs.com/shortChanges.xml and if it is we put // it into a queue that has a respider rate no faster than // 30 days, because we don't need to spider it quick since // it is in the ping server! if ( strncmp(p,"isparentpingserver",18) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // if no match continue if ( (bool)sreq->m_parentIsPingServer == val) continue; // skip p += 18; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( strncmp(p,"ispingserver",12) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // if no match continue if ( (bool)sreq->m_isPingServer == val ) continue; // skip p += 12; // skip to next constraint p = strstr(p, "&&"); // all done? if ( ! p ) return i; p += 2; goto checkNextRule; } if ( strncmp ( p , "isonsamesubdomain",17 ) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; if ( val == 0 && sreq->m_parentHostHash32 != sreq->m_hostHash32 ) continue; if ( val == 1 && sreq->m_parentHostHash32 == sreq->m_hostHash32 ) continue; p += 6; p = strstr(p, "&&"); if ( ! p ) return i; p += 2; goto checkNextRule; } if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; if ( val == 0 && sreq->m_parentDomHash32 != sreq->m_domHash32 ) continue; if ( val == 1 && sreq->m_parentDomHash32 == sreq->m_domHash32 ) continue; p += 6; p = strstr(p, "&&"); if ( ! 
p ) return i; p += 2; goto checkNextRule; } // jpg JPG gif GIF wmv mpg css etc. if ( strncmp ( p , "ismedia",7 ) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // check the extension if ( urlLen<=5 ) continue; ext = url + urlLen - 4; if ( ext[0] == '.' ) { if ( to_lower_a(ext[1]) == 'c' && to_lower_a(ext[2]) == 's' && to_lower_a(ext[3]) == 's' ) goto gotOne; if ( to_lower_a(ext[1]) == 'm' && to_lower_a(ext[2]) == 'p' && to_lower_a(ext[3]) == 'g' ) goto gotOne; if ( to_lower_a(ext[1]) == 'p' && to_lower_a(ext[2]) == 'n' && to_lower_a(ext[3]) == 'g' ) goto gotOne; if ( to_lower_a(ext[1]) == 'w' && to_lower_a(ext[2]) == 'm' && to_lower_a(ext[3]) == 'v' ) goto gotOne; if ( to_lower_a(ext[1]) == 'w' && to_lower_a(ext[2]) == 'a' && to_lower_a(ext[3]) == 'v' ) goto gotOne; if ( to_lower_a(ext[1]) == 'j' && to_lower_a(ext[2]) == 'p' && to_lower_a(ext[3]) == 'g' ) goto gotOne; if ( to_lower_a(ext[1]) == 'g' && to_lower_a(ext[2]) == 'i' && to_lower_a(ext[3]) == 'f' ) goto gotOne; if ( to_lower_a(ext[1]) == 'i' && to_lower_a(ext[2]) == 'c' && to_lower_a(ext[3]) == 'o' ) goto gotOne; if ( to_lower_a(ext[1]) == 'm' && to_lower_a(ext[2]) == 'p' && to_lower_a(ext[3]) == '3' ) goto gotOne; if ( to_lower_a(ext[1]) == 'm' && to_lower_a(ext[2]) == 'p' && to_lower_a(ext[3]) == '4' ) goto gotOne; if ( to_lower_a(ext[1]) == 'm' && to_lower_a(ext[2]) == 'o' && to_lower_a(ext[3]) == 'v' ) goto gotOne; if ( to_lower_a(ext[1]) == 'a' && to_lower_a(ext[2]) == 'v' && to_lower_a(ext[3]) == 'i' ) goto gotOne; } else if ( ext[-1] == '.' ) { if ( to_lower_a(ext[0]) == 'm' && to_lower_a(ext[1]) == 'p' && to_lower_a(ext[2]) == 'e' && to_lower_a(ext[3]) == 'g' ) goto gotOne; if ( to_lower_a(ext[0]) == 'j' && to_lower_a(ext[1]) == 'p' && to_lower_a(ext[2]) == 'e' && to_lower_a(ext[3]) == 'g' ) goto gotOne; } // two letter extensions else if ( ext[1] == '.' ) { if ( to_lower_a(ext[2]) == 'g' && to_lower_a(ext[3]) == 'z' ) goto gotOne; } // check for ".css?" substring special = strstr(url,".css?"); if ( special ) goto gotOne; special = strstr(url,"/print/"); if ( special ) goto gotOne; // no match, try the next rule continue; gotOne: p += 7; p = strstr(p, "&&"); if ( ! p ) return i; p += 2; goto checkNextRule; } // check for "isrss" aka "rss" if ( strncmp(p,"isrss",5) == 0 ) { if ( isOutlink ) return -1; // must have a reply if ( ! srep ) continue; // if we are not rss, we do not match this rule if ( (bool)srep->m_isRSS == val ) continue; // skip it p += 5; // check for && p = strstr(p, "&&"); // if nothing, else then it is a match if ( ! p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; } // check for permalinks. for new outlinks we *guess* if its // a permalink by calling isPermalink() function. if (!strncmp(p,"ispermalink",11) ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // must have a reply if ( ! srep ) continue; // if we are not rss, we do not match this rule if ( (bool)srep->m_isPermalink == val ) continue; // skip it p += 11; // check for && p = strstr(p, "&&"); // if nothing, else then it is a match if ( ! p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; } // supports LF_ISPERMALINK bit for outlinks that *seem* to // be permalinks but might not if (!strncmp(p,"ispermalinkformat",17) ) { // if we are not rss, we do not match this rule if ( (bool)sreq->m_isUrlPermalinkFormat ==val)continue; // check for && p = strstr(p, "&&"); // if nothing, else then it is a match if ( ! 
p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; } // check for this if ( strncmp(p,"isnewoutlink",12) == 0 ) { // skip for msg20 if ( isForMsg20 ) continue; // skip if we do not match this rule if ( (bool)sreq->m_isNewOutlink == val ) continue; // skip it p += 10; // check for && p = strstr(p, "&&"); // if nothing, else then it is a match if ( ! p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; } // check for this if ( strncmp(p,"isnewrequest",12) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // skip if we are a new request and val is 1 (has '!') if ( ! srep && val ) continue; // skip if we are a new request and val is 1 (has '!') if(srep&&sreq->m_addedTime>srep->m_spideredTime &&val) continue; // skip if we are old and val is 0 (does not have '!') if(srep&&sreq->m_addedTime<=srep->m_spideredTime&&!val) continue; // skip it for speed p += 12; // check for && p = strstr(p, "&&"); // if nothing, else then it is a match if ( ! p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; } // kinda like isnewrequest, but has no reply. use hasreply? if ( strncmp(p,"isnew",5) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // if we got a reply, we are not new!! if ( (bool)sreq->m_hadReply != (bool)val ) continue; // skip it for speed p += 5; // check for && p = strstr(p, "&&"); // if nothing, else then it is a match if ( ! p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; } // iswww, means url is like www.xyz.com/... if ( strncmp(p,"iswww", 5) == 0 ) { // now this is a bit if ( (bool)sreq->m_isWWWSubdomain == (bool)val ) continue; /* // skip "iswww" p += 5; // skip over http:// or https:// char *u = sreq->m_url; if ( u[4] == ':' ) u += 7; if ( u[5] == ':' ) u += 8; // url MUST be a www url char isWWW = 0; if( u[0] == 'w' && u[1] == 'w' && u[2] == 'w' ) isWWW = 1; // skip if no match if ( isWWW == val ) continue; */ // TODO: fix www.knightstown.skepter.com // maybe just have a bit in the spider request // another rule? p = strstr(p,"&&"); if ( ! p ) return i; // skip the '&&' p += 2; goto checkNextRule; } // non-boolen junk skipi: // . we always match the "default" reg ex // . this line must ALWAYS exist! if ( *p=='d' && ! strcmp(p,"default" ) ) return i; // set the sign char *s = p; // skip s to after while ( *s && is_alpha_a(*s) ) s++; // skip white space before the operator //char *saved = s; while ( *s && is_wspace_a(*s) ) s++; char sign = 0; if ( *s == '=' ) { s++; if ( *s == '=' ) s++; sign = SIGN_EQ; } else if ( *s == '!' && s[1] == '=' ) { s += 2; sign = SIGN_NE; } else if ( *s == '<' ) { s++; if ( *s == '=' ) { sign = SIGN_LE; s++; } else sign = SIGN_LT; } else if ( *s == '>' ) { s++; if ( *s == '=' ) { sign = SIGN_GE; s++; } else sign = SIGN_GT; } // skip whitespace after the operator while ( *s && is_wspace_a(*s) ) s++; // seed counts. how many seeds this subdomain has. 'siteadds' if ( *p == 's' && p[1] == 'i' && p[2] == 't' && p[3] == 'e' && p[4] == 'a' && p[5] == 'd' && p[6] == 'd' && p[7] == 's' ) { // need a quota table for this if ( ! quotaTable ) continue; // a special hack so it is seeds so we can use same tbl long h32 = sreq->m_siteHash32 ^ 0x123456; long *valPtr =(long *)quotaTable->getValue(&h32); long a; // if no count in table, that is strange, i guess // skip for now??? 
// this happens if INJECTING a url from the // "add url" function on homepage if ( ! valPtr ) a=0;//continue;//{char *xx=NULL;*xx=0;} // shortcut else a = *valPtr; //log("siteadds=%li for %s",a,sreq->m_url); // what is the provided value in the url filter rule? long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // domain seeds. 'domainadds' if ( *p == 'd' && p[1] == 'o' && p[2] == 'm' && p[3] == 'a' && p[4] == 'i' && p[5] == 'n' && p[6] == 'a' && p[7] == 'd' && p[8] == 'd' && p[9] == 's' ) { // need a quota table for this if ( ! quotaTable ) continue; // a special hack so it is seeds so we can use same tbl long h32 = sreq->m_domHash32 ^ 0x123456; long *valPtr ; valPtr = (long *)quotaTable->getValue(&h32); // if no count in table, that is strange, i guess // skip for now??? long a; if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; } else a = *valPtr; // what is the provided value in the url filter rule? long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // new quotas. 'sitepages' = pages from site. // 'sitepages > 20 && seedcount <= 1 --> FILTERED' if ( *p == 's' && p[1] == 'i' && p[2] == 't' && p[3] == 'e' && p[4] == 'p' && p[5] == 'a' && p[6] == 'g' && p[7] == 'e' && p[8] == 's' ) { // need a quota table for this if ( ! quotaTable ) continue; long *valPtr = (long *)quotaTable-> getValue(&sreq->m_siteHash32); // if no count in table, that is strange, i guess // skip for now??? long a; if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; } else a = *valPtr; // shortcut //log("sitepgs=%li for %s",a,sreq->m_url); // what is the provided value in the url filter rule? long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // domain quotas. 'domainpages > 10 && hopcount >= 1 --> FILTERED' if ( *p == 'd' && p[1] == 'o' && p[2] == 'm' && p[3] == 'a' && p[4] == 'i' && p[5] == 'n' && p[6] == 'p' && p[7] == 'a' && p[8] == 'g' && p[9] == 'e' && p[10] == 's' ) { // need a quota table for this if ( ! quotaTable ) continue; long *valPtr ; valPtr=(long*)quotaTable->getValue(&sreq->m_domHash32); // if no count in table, that is strange, i guess // skip for now??? long a; if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; } else a = *valPtr; // what is the provided value in the url filter rule? 
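		// note: this same compare pattern is shared by all the
		// numeric rules in this loop (siteadds, domainadds,
		// sitepages, the domainpages rule here, plus hopcount,
		// errorcount, sitenuminlinks, lastspidertime, etc.):
		// the measured value "a" (here the per-domain page count
		// from quotaTable keyed by m_domHash32; the seed counts
		// key on the hash XOR'd with 0x123456) is compared to
		// the number "b" parsed from the rule text with atoi(s)
		// using the sign parsed above (SIGN_EQ..SIGN_LE), and if
		// the comparison holds we advance past the next "&&" to
		// the following clause, or return this rule's index if
		// there is none. e.g. 'domainpages > 10 && hopcount >= 1'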
long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // tld:cn if ( *p=='t' && strncmp(p,"tld",3)==0){ // set it on demand if ( tld == (char *)-1 ) tld = getTLDFast ( sreq->m_url , &tldLen ); // no match if we have no tld. might be an IP only url, // or not in our list in Domains.cpp::isTLD() if ( ! tld || tldLen == 0 ) continue; // set these up //char *a = tld; //long alen = tldLen; char *b = s; // loop for the comma-separated list of tlds // like tld:us,uk,fr,it,de subloop1: // get length of it in the regular expression box char *start = b; while ( *b && !is_wspace_a(*b) && *b!=',' ) b++; long blen = b - start; //char sm; // if we had tld==com,org,... if ( sign == SIGN_EQ && blen == tldLen && strncasecmp(start,tld,tldLen)==0 ) // if we matched any, that's great goto matched1; // if its tld!=com,org,... // and we equal the string, then we do not matcht his // particular rule!!! if ( sign == SIGN_NE && blen == tldLen && strncasecmp(start,tld,tldLen)==0 ) // we do not match this rule if we matched // and of the tlds in the != list continue; // might have another tld in a comma-separated list if ( *b != ',' ) { // if that was the end of the list and the // sign was == then skip this rule if ( sign == SIGN_EQ ) continue; // otherwise, if the sign was != then we win! if ( sign == SIGN_NE ) goto matched1; // otherwise, bad sign? continue; } // advance to next tld if there was a comma after us b++; // and try again goto subloop1; // otherwise // do we match, if not, try next regex //sm = strncasecmp(a,b,blen); //if ( sm != 0 && sign == SIGN_EQ ) goto miss1; //if ( sm == 0 && sign == SIGN_NE ) goto miss1; // come here on a match matched1: // we matched, now look for && p = strstr ( b , "&&" ); // if nothing, else then it is a match if ( ! p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; // come here if we did not match the tld } // lang:en,zh_cn if ( *p=='l' && strncmp(p,"lang",4)==0){ // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // must have a reply if ( ! srep ) continue; // skip if unknown? no, we support "xx" as unknown now //if ( srep->m_langId == 0 ) continue; // set these up char *b = s; // loop for the comma-separated list of langids // like lang==en,es,... subloop2: // get length of it in the regular expression box char *start = b; while ( *b && !is_wspace_a(*b) && *b!=',' ) b++; long blen = b - start; //char sm; // if we had lang==en,es,... if ( sign == SIGN_EQ && blen == langLen && strncasecmp(start,lang,langLen)==0 ) // if we matched any, that's great goto matched2; // if its lang!=en,es,... // and we equal the string, then we do not matcht his // particular rule!!! if ( sign == SIGN_NE && blen == langLen && strncasecmp(start,lang,langLen)==0 ) // we do not match this rule if we matched // and of the langs in the != list continue; // might have another in the comma-separated list if ( *b != ',' ) { // if that was the end of the list and the // sign was == then skip this rule if ( sign == SIGN_EQ ) continue; // otherwise, if the sign was != then we win! if ( sign == SIGN_NE ) goto matched2; // otherwise, bad sign? 
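			// (only == and != make sense for a
			// comma-separated list rule like 'lang==en,es'
			// or 'tld==us,uk,fr,it,de', so any other sign
			// simply fails this rule)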
continue; } // advance to next list item if was a comma after us b++; // and try again goto subloop2; // come here on a match matched2: // we matched, now look for && p = strstr ( b , "&&" ); // if nothing, else then it is a match if ( ! p ) return i; // skip the '&&' and go to next rule p += 2; goto checkNextRule; // come here if we did not match the tld } // hopcount == 20 [&&] if ( *p=='h' && strncmp(p, "hopcount", 8) == 0){ // skip if not valid if ( ! sreq->m_hopCountValid ) continue; // shortcut long a = sreq->m_hopCount; // make it point to the priority long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // the last time it was spidered if ( *p=='l' && strncmp(p,"lastspidertime",14) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // reply based long a = 0; // if no spider reply we can't match this rule! if ( ! srep ) continue; // shortcut if ( srep ) a = srep->m_spideredTime; // make it point to the retry count long b ; // now "s" can be "{roundstart}" if ( s[0]=='{' && strncmp(s,"{roundstart}",12)==0) b = cr->m_spiderRoundStartTime;//Num; else b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } if ( *p=='e' && strncmp(p,"errorcount",10) == 0 ) { // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // skip for msg20 if ( isForMsg20 ) continue; // reply based if ( ! srep ) continue; // shortcut long a = srep->m_errCount; // make it point to the retry count long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! 
p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // siteNumInlinks >= 300 [&&] if ( *p=='s' && strncmp(p, "sitenuminlinks", 14) == 0){ // these are -1 if they are NOT valid long a1 = sreq->m_siteNumInlinks; // only assign if valid long a2 = -1; if ( srep ) a2 = srep->m_siteNumInlinks; // assume a1 is the best long a ; // assign to the first valid one if ( a1 != -1 ) a = a1; else if ( a2 != -1 ) a = a2; // swap if both are valid, but srep is more recent if ( a1 != -1 && a2 != -1 && srep->m_spideredTime > sreq->m_addedTime ) a = a2; // skip if nothing valid if ( a == -1 ) continue; // make it point to the priority long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; // skip fast p += 14; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } /* // retryNum >= 2 [&&] ... if ( *p=='r' && strncmp(p, "retrynum", 8) == 0){ // shortcut long a = sr->m_retryNum; // make it point to the priority long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } */ // how many days have passed since it was last attempted // to be spidered? used in conjunction with percentchanged // to assign when to re-spider it next if ( *p=='s' && strncmp(p, "spiderwaited", 12) == 0){ // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // must have a reply if ( ! srep ) continue; // skip for msg20 if ( isForMsg20 ) continue; // do not match rule if never attempted if ( srep->m_spideredTime == 0 ) {char*xx=NULL;*xx=0;} if ( srep->m_spideredTime == -1 ) {char*xx=NULL;*xx=0;} // shortcut float af = (srep->m_spideredTime - nowGlobal); // make into days af /= (3600.0*24.0); // back to a long, round it long a = (long)(af + 0.5); // make it point to the priority long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // percentchanged >= 50 [&&] ... if ( *p=='p' && strncmp(p, "percentchangedperday", 20) == 0){ // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // must have a reply if ( ! srep ) continue; // skip for msg20 if ( isForMsg20 ) continue; // shortcut float a = srep->m_percentChangedPerDay; // make it point to the priority float b = atof(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! 
p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // httpStatus == 400 if ( *p=='h' && strncmp(p, "httpstatus", 10) == 0){ // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // must have a reply if ( ! srep ) continue; // shortcut (errCode doubles as g_errno) long a = srep->m_errCode; // make it point to the priority long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // how old is the doc in seconds? age is the pubDate age if ( *p =='a' && strncmp(p, "age", 3) == 0){ // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // must have a reply if ( ! srep ) continue; // shortcut long age; if ( srep->m_pubDate <= 0 ) age = -1; else age = nowGlobal - srep->m_pubDate; // we can not match if invalid if ( age <= 0 ) continue; // make it point to the priority long b = atoi(s); // compare if ( sign == SIGN_EQ && age != b ) continue; if ( sign == SIGN_NE && age == b ) continue; if ( sign == SIGN_GT && age <= b ) continue; if ( sign == SIGN_LT && age >= b ) continue; if ( sign == SIGN_GE && age < b ) continue; if ( sign == SIGN_LE && age > b ) continue; p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } /* MDW: i replaced this with m_contentHash32 to make spiders faster/smarter so let's take this out for now // how many new inlinkers we got since last spidered time? if ( *p =='n' && strncmp(p, "newinlinks", 10) == 0){ // if we do not have enough info for outlink, all done if ( isOutlink ) return -1; // must have a reply if ( ! srep ) continue; // . make it point to the newinlinks. // . # of new SpiderRequests added since // srep->m_spideredTime // . m_dupCache insures that the same ip/hostHash // does not add more than 1 SpiderRequest for the // same url/outlink long a = srep->m_newRequests; long b = atoi(s); // compare if ( sign == SIGN_EQ && a != b ) continue; if ( sign == SIGN_NE && a == b ) continue; if ( sign == SIGN_GT && a <= b ) continue; if ( sign == SIGN_LT && a >= b ) continue; if ( sign == SIGN_GE && a < b ) continue; if ( sign == SIGN_LE && a > b ) continue; // quick p += 10; // look for more p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } */ // our own regex thing (match front of url) if ( *p=='^' ) { // advance over caret p++; // now pstart pts to the string we will match char *pstart = p; // make "p" point to one past the last char in string while ( *p && ! is_wspace_a(*p) ) p++; // how long is the string to match? long plen = p - pstart; // empty? that's kinda an error if ( plen == 0 ) continue; long m = 1; // check to see if we matched if url was long enough if ( urlLen >= plen ) m = strncmp(pstart,url,plen); if ( ( m == 0 && val == 0 ) || // if they used the '!' operator and we // did not match the string, that's a // row match ( m && val == 1 ) ) { // another expression follows? p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! 
p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // no match continue; } // our own regex thing (match end of url) if ( *p=='$' ) { // advance over dollar sign p++; // a hack for $\.css, skip over the backslash too if ( *p=='\\' && *(p+1)=='.' ) p++; // now pstart pts to the string we will match char *pstart = p; // make "p" point to one past the last char in string while ( *p && ! is_wspace_a(*p) ) p++; // how long is the string to match? long plen = p - pstart; // empty? that's kinda an error if ( plen == 0 ) continue; // . do we match it? // . url has to be at least as big // . match our tail long m = 1; // check to see if we matched if url was long enough if ( urlLen >= plen ) m = strncmp(pstart,url+urlLen-plen,plen); if ( ( m == 0 && val == 0 ) || // if they used the '!' operator and we // did not match the string, that's a // row match ( m && val == 1 ) ) { // another expression follows? p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } // no match continue; } // . by default a substring match // . action=edit // . action=history // now pstart pts to the string we will match char *pstart = p; // make "p" point to one past the last char in string while ( *p && ! is_wspace_a(*p) ) p++; // how long is the string to match? long plen = p - pstart; // need something... if ( plen <= 0 ) continue; // must be at least as big //if ( urlLen < plen ) continue; // nullilfy it temporarily char c = *p; *p = '\0'; // does url contain it? haystack=u needle=p char *found = strstr ( url , pstart ); // put char back *p = c; // kinda of a hack fix. if they inject a filtered url // into test coll, do not filter it! fixes the fact that // we filtered facebook, but still add it in our test // collection injection in urls.txt if ( found && sreq->m_isInjecting && cr->m_coll[0]=='t' && cr->m_coll[1]=='e' && cr->m_coll[2]=='s' && cr->m_coll[3]=='t' && cr->m_coll[4]=='\0' && cr->m_spiderPriorities[i] < 0 ) continue; // support "!company" meaning if it does NOT match // then do this ... if ( ( found && val == 0 ) || // if they used the '!' operator and we // did not match the string, that's a // row match ( ! found && val == 1 ) ) { // another expression follows? p = strstr(s, "&&"); //if nothing, else then it is a match if ( ! p ) return i; //skip the '&&' and go to next rule p += 2; goto checkNextRule; } } // sanity check ... must be a default rule! //char *xx=NULL;*xx=0; // return -1 if no match, caller should use a default return -1; } //static bool s_ufnInit = false; //static HashTableX s_ufnTable; //void clearUfnTable ( ) { // s_ufnTable.clear(); // s_ufnTree.clear(); //} long getUrlFilterNum ( SpiderRequest *sreq , SpiderReply *srep , long nowGlobal , bool isForMsg20 , long niceness , CollectionRec *cr , bool isOutlink , HashTableX *quotaTable ) { /* turn this off for now to save memory on the g0 cluster. we should nuke this anyway with rankdb // init table? if ( ! s_ufnInit ) { s_ufnInit = true; if ( ! 
s_ufnTable.set(8, 1, 1024*1024*5, NULL,0, false, MAX_NICENESS, "ufntab") ) { char *xx=NULL;*xx=0; } } // check in cache using date of request and reply and uh48 as the key long long key64 = sreq->getUrlHash48(); key64 ^= (long long)sreq->m_addedTime; if ( srep ) key64 ^= ((long long)srep->m_spideredTime)<<32; char *uv = (char *)s_ufnTable.getValue(&key64); if ( uv ) return *uv; */ char ufn = getUrlFilterNum2 ( sreq, srep, nowGlobal, isForMsg20, niceness, cr, isOutlink, quotaTable ); /* // is table full? clear it if so if ( s_ufnTable.getNumSlotsUsed() > 2000000 ) { log("spider: resetting ufn table"); s_ufnTable.clear(); } // cache it s_ufnTable.addKey ( &key64 , &ufn ); */ return (long)ufn; } bool SpiderColl::printStats ( SafeBuf &sb ) { return true; } // . dedup for spiderdb // . TODO: we can still have spider request dups in this if they are // sandwiched together just right because we only compare to the previous // SpiderRequest we added when looking for dups. just need to hash the // relevant input bits and use that for deduping. // . TODO: we can store ufn/priority/spiderTime in the SpiderRequest along // with the date now, so if url filters do not change then // gotSpiderdbList() can assume those to be valid and save time. BUT it does // have siteNumInlinks... void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) { //long need = list->m_listSize; char *newList = list->m_list;//(char *)mmalloc (need,"dslist"); //if ( ! newList ) { // log("spider: could not dedup spiderdb list: %s", // mstrerror(g_errno)); // return; //} char *dst = newList; char *restorePoint = newList; long long reqUh48 = 0LL; long long repUh48 = 0LL; SpiderReply *oldRep = NULL; SpiderRequest *oldReq = NULL; char *lastKey = NULL; char *prevLastKey = NULL; // save list ptr in case of re-read? //char *saved = list->m_listPtr; // reset it list->resetListPtr(); for ( ; ! list->isExhausted() ; ) { // breathe. NO! assume in thread!! //QUICKPOLL(niceness); // get rec char *rec = list->getCurrentRec(); // pre skip it list->skipCurrentRec(); // skip if negative, just copy over if ( ( rec[0] & 0x01 ) == 0x00 ) { // should not be in here if this was true... if ( removeNegRecs ) { log("spider: filter got negative key"); char *xx=NULL;*xx=0; } // save this prevLastKey = lastKey; lastKey = dst; // otherwise, keep it memmove ( dst , rec , sizeof(key128_t) ); dst += sizeof(key128_t); continue; } // is it a reply? if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) { // cast it SpiderReply *srep = (SpiderReply *)rec; // shortcut long long uh48 = srep->getUrlHash48(); // crazy? if ( ! uh48 ) { //uh48 = hash64b ( srep->m_url ); uh48 = 12345678; log("spider: got uh48 of zero for spider req. " "computing now."); } // does match last reply? if ( repUh48 == uh48 ) { // if he's a later date than us, skip us! if ( oldRep->m_spideredTime >= srep->m_spideredTime ) // skip us! continue; // otherwise, erase him dst = restorePoint; lastKey = prevLastKey; } // save in case we get erased restorePoint = dst; prevLastKey = lastKey; lastKey = dst; // get our size long recSize = srep->getRecSize(); // and add us memmove ( dst , rec , recSize ); // advance dst += recSize; // update this crap for comparing to next reply repUh48 = uh48; oldRep = srep; // get next spiderdb record continue; } // shortcut SpiderRequest *sreq = (SpiderRequest *)rec; // shortcut long long uh48 = sreq->getUrlHash48(); // crazy? if ( ! uh48 ) { //uh48 = hash64b ( sreq->m_url ); uh48 = 12345678; log("spider: got uh48 of zero for spider req. 
" "computing now."); } // update request with SpiderReply if newer, because ultimately // ::getUrlFilterNum() will just look at SpiderRequest's // version of these bits! if ( oldRep && repUh48 == uh48 && oldRep->m_spideredTime > sreq->m_addedTime ) { // if request was a page reindex docid based request // and url has since been spidered, nuke it! if ( sreq->m_urlIsDocId ) continue; // same if indexcode was EFAKEFIRSTIP which XmlDoc.cpp // re-adds to spiderdb with the right firstip. once // those guys have a reply we can ignore them. // TODO: what about diffbotxyz spider requests? those // have a fakefirstip... they should not have requests // though, since their parent url has that. if ( sreq->m_fakeFirstIp ) continue; SpiderReply *old = oldRep; sreq->m_inGoogle = old->m_inGoogle; sreq->m_hasAuthorityInlink = old->m_hasAuthorityInlink; sreq->m_hasContactInfo = old->m_hasContactInfo; //sreq->m_hasSiteVenue = old->m_hasSiteVenue; } // if we are not the same url as last request, add it if ( uh48 != reqUh48 ) { // a nice hook in addIt: // save in case we get erased restorePoint = dst; prevLastKey = lastKey; // get our size long recSize = sreq->getRecSize(); // save this lastKey = dst; // and add us memmove ( dst , rec , recSize ); // advance dst += recSize; // update this crap for comparing to next reply reqUh48 = uh48; oldReq = sreq; // get next spiderdb record continue; } // try to kinda grab the min hop count as well if ( sreq->m_hopCountValid && oldReq->m_hopCountValid ) { if ( oldReq->m_hopCount < sreq->m_hopCount ) sreq->m_hopCount = oldReq->m_hopCount; else oldReq->m_hopCount = sreq->m_hopCount; } // if he's essentially different input parms but for the // same url, we want to keep him because he might map the // url to a different url priority! if ( oldReq->m_siteHash32 != sreq->m_siteHash32 || oldReq->m_isNewOutlink != sreq->m_isNewOutlink || // makes a difference as far a m_minPubDate goes, because // we want to make sure not to delete that request that // has m_parentPrevSpiderTime // no no, we prefer the most recent spider request // from thsi site in the logic above, so this is not // necessary. mdw commented out. //oldReq->m_wasParentIndexed != sreq->m_wasParentIndexed|| oldReq->m_isInjecting != sreq->m_isInjecting || oldReq->m_hasContent != sreq->m_hasContent || oldReq->m_isAddUrl != sreq->m_isAddUrl || oldReq->m_isPageReindex != sreq->m_isPageReindex || oldReq->m_forceDelete != sreq->m_forceDelete ) // we are different enough to coexist goto addIt; // . if the same check who has the most recent added time // . 
if we are not the most recent, just do not add us if ( sreq->m_addedTime <= oldReq->m_addedTime ) continue; // otherwise, erase over him dst = restorePoint; lastKey = prevLastKey; // and add us over top of him goto addIt; } // free the old list //char *oldbuf = list->m_alloc; //long oldSize = list->m_allocSize; // sanity check if ( dst < list->m_list || dst > list->m_list + list->m_listSize ) { char *xx=NULL;*xx=0; } // and stick our newly filtered list in there //list->m_list = newList; list->m_listSize = dst - newList; // set to end i guess list->m_listPtr = dst; //list->m_allocSize = need; //list->m_alloc = newList; list->m_listEnd = list->m_list + list->m_listSize; list->m_listPtrHi = NULL; //KEYSET(list->m_lastKey,lastKey,list->m_ks); if ( lastKey ) KEYSET(list->m_lastKey,lastKey,list->m_ks); //mfree ( oldbuf , oldSize, "oldspbuf"); } /////// // // diffbot uses these for limiting crawls in a collection // /////// void gotCrawlInfoReply ( void *state , UdpSlot *slot); static long s_requests = 0; static long s_replies = 0; static long s_validReplies = 0; static bool s_inUse = false; // . just call this once per second for all collections // . figure out how to backoff on collections that don't need it so much // . ask every host for their crawl infos for each collection rec void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) { // debug test //long mr = g_collectiondb.m_recs[0]->m_maxCrawlRounds; //log("mcr: %li",mr); // i don't know why we have locks in the lock table that are not // getting removed... so log when we remove an expired locks and see. // piggyback on this sleep wrapper call i guess... // perhaps the collection was deleted or reset before the spider // reply could be generated. in that case we'd have a dangling lock. removeExpiredLocks ( -1 ); if ( s_inUse ) return; char *request = ""; long requestSize = 0; s_inUse = true; // reset tmp crawlinfo classes to hold the ones returned to us for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cr = g_collectiondb.m_recs[i]; if ( ! cr ) continue; cr->m_tmpCrawlInfo.reset(); } // send out the msg request for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { Host *h = g_hostdb.getHost(i); // skip if dead. no! we need replies from all hosts // otherwise our counts could be short and we might end up // re-spidering stuff even though we've really hit maxToCrawl //if ( g_hostdb.isDead(i) ) { // if ( g_conf.m_logDebugSpider ) // log("spider: skipping dead host #%li " // "when getting " // "crawl info",i); // continue; //} // count it as launched s_requests++; // launch it if ( ! g_udpServer.sendRequest ( request, requestSize, 0xc1 , // msgtype h->m_ip , h->m_port , h->m_hostId , NULL, // retslot NULL, // state gotCrawlInfoReply ) ) { log("spider: error sending c1 request: %s", mstrerror(g_errno)); s_replies++; } } // return false if we blocked awaiting replies if ( s_replies < s_requests ) return; // how did this happen? log("spider: got bogus crawl info replies!"); s_inUse = false; return; // somehow we did not block... hmmmm... //char *xx=NULL;*xx=0; //gotCrawlInfoReply( cr , NULL ); // we did not block... //return true; } // . Parms.cpp calls this when it receives our "spiderRoundNum" increment above // . all hosts should get it at *about* the same time void spiderRoundIncremented ( CollectionRec *cr ) { log("spider: incrementing spider round for coll %s to %li (%lu)", cr->m_coll,cr->m_spiderRoundNum,cr->m_spiderRoundStartTime); // . need to send a notification for this round // . 
we are only here because the round was incremented and // Parms.cpp just called us... and that only happens in // doneSending... so do not send again!!! //cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0; // . if we set sentCrawlDoneALert to 0 it will immediately // trigger another round increment !! so we have to set these // to true to prevent that. // . if we learnt that there really are no more urls ready to spider // then we'll go to the next round. but that can take like // SPIDER_DONE_TIMER seconds of getting nothing. cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = true; cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true; cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound = 0; cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound = 0; cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound = 0; cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound = 0; cr->m_needsSave = true; } void gotCrawlInfoReply ( void *state , UdpSlot *slot ) { // loop over each LOCAL crawlinfo we received from this host CrawlInfo *ptr = (CrawlInfo *)(slot->m_readBuf); CrawlInfo *end = (CrawlInfo *)(slot->m_readBuf+ slot->m_readBufSize); long allocSize = slot->m_readBufMaxSize; // host sending us this reply Host *h = slot->m_host; // assume it is a valid reply, not an error, like a udptimedout s_validReplies++; // reply is error? then use the last known good reply we had from him if ( ! slot->m_readBuf || g_errno ) { log("spider: got crawlinfo reply error: %s", mstrerror(g_errno)); // just clear it g_errno = 0; // just use his last known good reply ptr = (CrawlInfo *)h->m_lastKnownGoodCrawlInfoReply; end = (CrawlInfo *)h->m_lastKnownGoodCrawlInfoReplyEnd; // if never had any reply... can't be valid then if ( ! ptr ) s_validReplies--; } // otherwise, if reply was good it is the last known good now! else { // free the old good one and replace it with the new one if ( h->m_lastKnownGoodCrawlInfoReply ) { //log("spider: skiipping possible bad free!!!! until we fix"); mfree ( h->m_lastKnownGoodCrawlInfoReply , h->m_replyAllocSize , "lknown" ); } // add in the new good in case he goes down in the future h->m_lastKnownGoodCrawlInfoReply = (char *)ptr; h->m_lastKnownGoodCrawlInfoReplyEnd = (char *)end; // set new alloc size h->m_replyAllocSize = allocSize; // if valid, don't let him free it now! slot->m_readBuf = NULL; } // inc it s_replies++; if ( s_replies > s_requests ) { char *xx=NULL;*xx=0; } // crap, if any host is dead and not reporting it's number then // that seriously fucks us up because our global count will drop // and something that had hit a max limit, like maxToCrawl, will // now be under the limit and the crawl will resume. // what's the best way to fix this? // // perhaps, let's just keep the dead host's counts the same // as the last time we got them. or maybe the simplest way is to // just not allow spidering if a host is dead // the sendbuf should never be freed! it points into collrec slot->m_sendBufAlloc = NULL; ///// // SCAN the list of CrawlInfos we received from this host, // one for each non-null collection ///// // . add the LOCAL stats we got from the remote into the GLOBAL stats // . readBuf is null on an error, so check for that... // . TODO: do not update on error??? for ( ; ptr < end ; ptr++ ) { // get collnum collnum_t collnum = (collnum_t)(ptr->m_collnum); CollectionRec *cr = g_collectiondb.getRec ( collnum ); if ( ! 
cr ) { log("spider: updatecrawlinfo collnum %li " "not found",(long)collnum); continue; } CrawlInfo *stats = ptr; long long *gs = (long long *)&cr->m_tmpCrawlInfo; long long *ss = (long long *)stats; for ( long i = 0 ; i < NUMCRAWLSTATS ; i++ ) { *gs = *gs + *ss; gs++; ss++; } // . special counts // . assume round #'s match! //if ( ss->m_spiderRoundNum == // cr->m_localCrawlInfo.m_spiderRoundNum ) { cr->m_tmpCrawlInfo.m_pageDownloadSuccessesThisRound += stats->m_pageDownloadSuccessesThisRound; cr->m_tmpCrawlInfo.m_pageProcessSuccessesThisRound += stats->m_pageProcessSuccessesThisRound; //} if ( stats->m_hasUrlsReadyToSpider ) { // inc the count otherwise cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider++; // . no longer initializing? // . sometimes other shards get the spider requests // and not us!!! if ( cr->m_spiderStatus == SP_INITIALIZING ) cr->m_spiderStatus = SP_INPROGRESS; // i guess we are back in business even if // m_spiderStatus was SP_MAXTOCRAWL or // SP_ROUNDDONE... cr->m_spiderStatus = SP_INPROGRESS; // unflag the sent flag if we had sent an alert // but only if it was a crawl round done alert, // not a maxToCrawl or maxToProcess or // maxRounds alert. // we can't do this because on startup we end // up setting hasUrlsReadyToSpider to true and // we may have already sent an email, and it // gets RESET here when it shouldn't be //if(cr->m_localCrawlInfo.m_sentCrawlDoneAlert //== SP_ROUNDDONE ) //cr->m_localCrawlInfo.m_sentCrawlDoneAlert=0; // revival? if ( ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) log("spider: reviving crawl %s from host %li", cr->m_coll,slot->m_host->m_hostId); } // if not the last reply, skip this part if ( s_replies < s_requests ) continue; // if it's the last reply we are to receive, and 1 or more // hosts did not have a valid reply, and not even a // "last known good reply" then then we can't do // much, so do not spider then because our counts could be // way off and cause us to start spidering again even though // we hit a maxtocrawl limit!!!!! if ( s_validReplies < s_replies ) { // this will tell us to halt all spidering // because a host is essentially down! s_countsAreValid = false; // might as well stop the loop here since we are // not updating our crawlinfo states. break; } // revival? //if ( cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider && // ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) { // log("spider: reviving crawl %s (%li)",cr->m_coll, // cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider); //} //bool has = cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider; if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider && ! cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider ) log("spider: all %li hosts report %s (%li) has no " "more urls ready to spider", s_replies,cr->m_coll,(long)cr->m_collnum); // now copy over to global crawl info so things are not // half ass should we try to read globalcrawlinfo // in between packets received. memcpy ( &cr->m_globalCrawlInfo , &cr->m_tmpCrawlInfo , sizeof(CrawlInfo) ); // turn not assume we are out of urls just yet if a host // in the network has not reported... //if ( g_hostdb.hasDeadHost() && has ) // cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true; // should we reset our "sent email" flag? bool reset = false; // can't reset if we've never sent an email out yet if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) reset = true; // must have some urls ready to spider now so we can send // another email after another round of spidering if (!cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider) reset=false; // . 
if we have urls ready to be spidered then prepare to send // another email/webhook notification. // . do not reset this flag if SP_MAXTOCRAWL etc otherwise we // end up sending multiple notifications, so this logic here // is only for when we are done spidering a round, which // happens when hasUrlsReadyToSpider goes false for all // shards. if ( reset ) { log("spider: resetting sent crawl done alert to 0 " "for coll %s",cr->m_coll); cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0; } // update cache time cr->m_globalCrawlInfo.m_lastUpdateTime = getTime(); // make it save to disk i guess cr->m_needsSave = true; // and we've examined at least one url. to prevent us from // sending a notification if we haven't spidered anything // because no seed urls have been added/injected. //if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return; if ( cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) continue; // if urls were considered and roundstarttime is still 0 then // set it to the current time... //if ( cr->m_spiderRoundStartTime == 0 ) // // all hosts in the network should sync with host #0 // // on this // cr->m_spiderRoundStartTime = getTimeGlobal(); // but of course if it has urls ready to spider, do not send // alert... or if this is -1, indicating "unknown". if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) continue; // update status if nto already SP_MAXTOCRAWL, etc. we might // just be flat out of urls if ( ! cr->m_spiderStatus || cr->m_spiderStatus == SP_INPROGRESS || cr->m_spiderStatus == SP_INITIALIZING ) cr->m_spiderStatus = SP_ROUNDDONE; // // TODO: set the spiderstatus outright here... // maxtocrawl, maxtoprocess, etc. based on the counts. // // only host #0 sends emails if ( g_hostdb.m_myHost->m_hostId != 0 ) continue; // . if already sent email for this, skip // . localCrawlInfo stores this value on disk so persistent // . we do it this way so SP_ROUNDDONE can be emailed and then // we'd email SP_MAXROUNDS to indicate we've hit the maximum // round count. if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) continue; // do email and web hook... sendNotificationForCollRec ( cr ); // deal with next collection rec } // wait for more replies to come in if ( s_replies < s_requests ) return; // initialize s_replies = 0; s_requests = 0; s_validReplies = 0; s_inUse = false; } void handleRequestc1 ( UdpSlot *slot , long niceness ) { //char *request = slot->m_readBuf; // just a single collnum if ( slot->m_readBufSize != 0 ) { char *xx=NULL;*xx=0;} //if ( ! isClockSynced() ) { //} //collnum_t collnum = *(collnum_t *)request; //CollectionRec *cr = g_collectiondb.getRec(collnum); // deleted from under us? i've seen this happen //if ( ! cr ) { // log("spider: c1: coll deleted returning empty reply"); // g_udpServer.sendReply_ass ( "", // reply // 0, // 0 , // alloc // 0 , //alloc size // slot ); // return; //} // while we are here update CrawlInfo::m_nextSpiderTime // to the time of the next spider request to spider. // if doledb is empty and the next rec in the waiting tree // does not have a time of zero, but rather, in the future, then // return that future time. so if a crawl is enabled we should // actively call updateCrawlInfo a collection every minute or // so. //cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1; //long long nowGlobalMS = gettimeofdayInMillisecondsGlobal(); //long long nextSpiderTimeMS; // this will be 0 for ip's which have not had their SpiderRequests // in spiderdb scanned yet to get the best SpiderRequest, so we // just have to wait for that. 
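	// (overview, roughly) every host answers this c1 request with one
	// CrawlInfo per non-null collection. before appending its local
	// CrawlInfo the host decides whether its part of the crawl round
	// looks finished: if spider attempts have gone SPIDER_DONE_TIMER
	// seconds past the last time one could actually launch, and no
	// spiders are still out, it clears m_hasUrlsReadyToSpider.
	// gotCrawlInfoReply() above then tallies these per-host answers
	// into m_globalCrawlInfo.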
/* nextSpiderTimeMS = sc->getEarliestSpiderTimeFromWaitingTree(0); if ( ! sc->m_waitingTreeNeedsRebuild && sc->m_lastDoledbReadEmpty && cr->m_spideringEnabled && g_conf.m_spideringEnabled && nextSpiderTimeMS > nowGlobalMS +10*60*1000 ) // turn off this flag, "ready queue" is empty cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 0; // but send back a -1 if we do not know yet because we haven't // read the doledblists from disk from all priorities for this coll if ( sc->m_numRoundsDone == 0 ) cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = -1; */ //long now = getTimeGlobal(); SafeBuf replyBuf; long now = getTimeGlobalNoCore(); //SpiderColl *sc = g_spiderCache.getSpiderColl(collnum); for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cr = g_collectiondb.m_recs[i]; if ( ! cr ) continue; // shortcut CrawlInfo *ci = &cr->m_localCrawlInfo; // this is now needed for alignment by the receiver ci->m_collnum = i; SpiderColl *sc = cr->m_spiderColl; ///////// // // ARE WE DONE SPIDERING????? // ///////// // if we haven't spidered anything in 1 min assume the // queue is basically empty... if ( ci->m_lastSpiderAttempt && ci->m_lastSpiderCouldLaunch && ci->m_hasUrlsReadyToSpider && // the next round we are waiting for, if any, must // have had some time to get urls! otherwise we // will increment the round # and wait just // SPIDER_DONE_TIMER seconds and end up setting // hasUrlsReadyToSpider to false! now > cr->m_spiderRoundStartTime + SPIDER_DONE_TIMER && // no spiders currently out. i've seen a couple out // waiting for a diffbot reply. wait for them to // return before ending the round... sc && sc->m_spidersOut == 0 && // it must have launched at least one url! this should // prevent us from incrementing the round # at the gb // process startup //ci->m_numUrlsLaunched > 0 && //cr->m_spideringEnabled && //g_conf.m_spideringEnabled && ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch > (long) SPIDER_DONE_TIMER ) { // this is the MOST IMPORTANT variable so note it log("spider: coll %s has no more urls to spider", cr->m_coll); // assume our crawl on this host is completed i guess ci->m_hasUrlsReadyToSpider = 0; // save that! cr->m_needsSave = true; // set the time that this happens cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore(); } // save it replyBuf.safeMemcpy ( ci , sizeof(CrawlInfo) ); } g_udpServer.sendReply_ass ( replyBuf.getBufStart() , replyBuf.length() , replyBuf.getBufStart() , // alloc replyBuf.getCapacity() , //alloc size slot ); // udp server will free this replyBuf.detachBuf(); } bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) { if ( ! g_conf.m_spideringEnabled && ! cx->m_isCustomCrawl ) return msg->safePrintf("Spidering disabled in " "master controls. You can turn it " "back on there."); if ( g_conf.m_readOnlyMode ) return msg->safePrintf("In read-only mode. Spidering off."); if ( g_dailyMerge.m_mergeMode ) return msg->safePrintf("Daily merge engaged, spidering " "paused."); if ( g_udpServer.getNumUsedSlots() >= 1300 ) return msg->safePrintf("Too many UDP slots in use, " "spidering paused."); if ( g_repairMode ) return msg->safePrintf("In repair mode, spidering paused."); // do not spider until collections/parms in sync with host #0 if ( ! g_parms.m_inSyncWithHost0 ) return msg->safePrintf("Parms not in sync with host #0, " "spidering paused"); // don't spider if not all hosts are up, or they do not all // have the same hosts.conf. 
	if ( g_pingServer.m_hostsConfInDisagreement )
		return msg->safePrintf("Hosts.conf discrepancy, "
				       "spidering paused.");

	long now = getTimeGlobal();

	// . 0 means not to RE-crawl
	// . indicate if we are WAITING for next round...
	if ( cx->m_spiderStatus == SP_MAXTOCRAWL &&
	     cx->m_collectiveRespiderFrequency > 0.0 &&
	     now < cx->m_spiderRoundStartTime ) {
		*status = SP_ROUNDDONE;
		return msg->safePrintf("Job has reached maxToCrawl limit. "
				       "Next crawl round to start "
				       "in %li seconds.",
				       cx->m_spiderRoundStartTime-now );
	}

	if ( cx->m_spiderStatus == SP_MAXTOPROCESS &&
	     cx->m_collectiveRespiderFrequency > 0.0 &&
	     now < cx->m_spiderRoundStartTime ) {
		*status = SP_ROUNDDONE;
		return msg->safePrintf("Job has reached maxToProcess limit. "
				       "Next crawl round to start "
				       "in %li seconds.",
				       cx->m_spiderRoundStartTime-now );
	}

	if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
		*status = SP_MAXTOCRAWL;
		return msg->safePrintf ( "Job has reached maxToCrawl "
					 "limit." );
	}

	if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
		*status = SP_MAXTOPROCESS;
		return msg->safePrintf ( "Job has reached maxToProcess "
					 "limit." );
	}

	if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
		*status = SP_MAXROUNDS;
		return msg->safePrintf ( "Job has reached maxRounds "
					 "limit." );
	}

	// . 0 means not to RE-crawl
	// . indicate if we are WAITING for next round...
	if ( cx->m_collectiveRespiderFrequency > 0.0 &&
	     now < cx->m_spiderRoundStartTime ) {
		*status = SP_ROUNDDONE;
		return msg->safePrintf("Next crawl round to start "
				       "in %li seconds.",
				       cx->m_spiderRoundStartTime-now );
	}

	if ( ! cx->m_spideringEnabled ) {
		*status = SP_PAUSED;
		if ( cx->m_isCustomCrawl )
			return msg->safePrintf("Job paused.");
		else
			return msg->safePrintf("Spidering disabled "
					       "in spider controls.");
	}

	if ( ! g_conf.m_spideringEnabled ) {
		*status = SP_ADMIN_PAUSED;
		return msg->safePrintf("All crawling temporarily paused "
				       "by root administrator for "
				       "maintenance.");
	}

	// our CollectionRec::m_globalCrawlInfo counts do not have a dead
	// host's counts tallied into them, which could make a difference on
	// whether we have exceeded a maxToCrawl limit or some such, so wait...
	if ( ! s_countsAreValid ) {
		*status = SP_ADMIN_PAUSED;
		return msg->safePrintf("All crawling temporarily paused "
				       "because a shard is down.");
	}

	// if spiderdb is empty for this coll, then no url
	// has been added to spiderdb yet... either seed or spot
	//CrawlInfo *cg = &cx->m_globalCrawlInfo;
	//if ( cg->m_pageDownloadAttempts == 0 ) {
	//	*status = SP_NOURLS;
	//	return msg->safePrintf("Crawl is waiting for urls.");
	//}

	if ( cx->m_spiderStatus == SP_INITIALIZING ) {
		*status = SP_INITIALIZING;
		return msg->safePrintf("Job is initializing.");
	}

	// if we sent an email simply because no urls
	// were left and we are not recrawling!
	if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
	     cx->m_isCustomCrawl &&
	     ! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
		*status = SP_COMPLETED;
		return msg->safePrintf("Job has completed and no "
				       "repeat is scheduled.");
	}

	if ( cx->m_spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
		*status = SP_ROUNDDONE;
		return msg->safePrintf ( "Nothing currently "
					 "available to spider. "
					 "Change your url filters, try "
					 "adding new urls, or wait for "
					 "existing urls to be respidered.");
	}

	if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
		*status = SP_ROUNDDONE;
		return msg->safePrintf ( "Job round completed.");
	}

	// otherwise in progress?
	*status = SP_INPROGRESS;
	if ( cx->m_isCustomCrawl )
		return msg->safePrintf("Job is in progress.");
	else
		return msg->safePrintf("Spider is in progress.");
}

// pattern is a ||-separated list of substrings
bool doesStringContainPattern ( char *content , char *pattern ) {
	//bool checkForNegatives ) {

	char *p = pattern;
	long matchedOne = 0;
	bool hadPositive = false;
	long count = 0;
	// scan the " || " separated substrings
	for ( ; *p ; ) {
		// get beginning of this string
		char *start = p;
		// skip white space
		while ( *start && is_wspace_a(*start) ) start++;
		// done?
		if ( ! *start ) break;
		// find end of it
		char *end = start;
		while ( *end && end[0] != '|' ) end++;
		// advance p for next guy
		p = end;
		// should be two |'s
		if ( *p ) p++;
		if ( *p ) p++;
		// temp null this
		char c = *end;
		*end = '\0';
		// count it as an attempt
		count++;
		bool matchFront = false;
		if ( start[0] == '^' ) { start++; matchFront = true; }
		// if pattern is NOT/NEGATIVE...
		bool negative = false;
		if ( start[0] == '!' && start[1] && start[1]!='|' ) {
			start++;
			negative = true;
		}
		else hadPositive = true;
		// . is this substring anywhere in the document
		// . check the rawest content before converting to utf8 i guess
		// . support the ^ operator
		char *foundPtr = NULL;
		if ( matchFront ) {
			// if we match the front, set to bogus 0x01
			if ( strncmp(content,start,end-start)==0 )
				foundPtr = (char *)0x01;
		}
		else {
			foundPtr = strstr ( content , start ) ;
		}
		// debug log statement
		//if ( foundPtr )
		//	log("build: page %s matches ppp of \"%s\"",
		//	    m_firstUrl.m_url,start);
		// revert \0
		*end = c;
		// negative means we should NOT match it
		if ( negative ) {
			// so if it's matched, that is bad
			if ( foundPtr ) return false;
			continue;
		}
		// skip if not found
		if ( ! foundPtr ) continue;
		// did we find it?
		matchedOne++;
		// if no negatives, done
		//if ( ! checkForNegatives )
		//	return true;
	}
	// if we had no attempts, it is ok
	if ( count == 0 ) return true;
	// must have matched one at least
	if ( matchedOne ) return true;
	// if all negative? i.e. !category||!author
	if ( ! hadPositive ) return true;
	// if we had an unfound substring...
	return false;
}

long getFakeIpForUrl1 ( char *url1 ) {
	// make the probable docid
	long long probDocId = g_titledb.getProbableDocId ( url1 );
	// make one up, like we do in PageReindex.cpp
	long firstIp = (probDocId & 0xffffffff);
	return firstIp;
}

long getFakeIpForUrl2 ( Url *url2 ) {
	// make the probable docid
	long long probDocId = g_titledb.getProbableDocId ( url2 );
	// make one up, like we do in PageReindex.cpp
	long firstIp = (probDocId & 0xffffffff);
	return firstIp;
}
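// A minimal sanity-check sketch for doesStringContainPattern() above, in the
// spirit of testWinnerTreeKey(). It is illustrative only and is not called
// anywhere; the content and patterns are made up, but the "||", "^" and "!"
// semantics it exercises come straight from the function itself. Patterns are
// kept in writable buffers because the function temporarily NULL-terminates
// the piece it is testing.
void testDoesStringContainPattern ( ) {
	char content[] = "the quick brown fox";
	// plain substrings: true if ANY ||-separated piece is found
	char pat1[] = "cat || brown";
	if ( ! doesStringContainPattern ( content , pat1 ) ) {
		char *xx=NULL;*xx=0; }
	// '^' anchors a piece to the front of the content, so this one fails
	char pat2[] = "^quick";
	if ( doesStringContainPattern ( content , pat2 ) ) {
		char *xx=NULL;*xx=0; }
	// '!' negates a piece: finding it makes the whole pattern fail
	char pat3[] = "!fox";
	if ( doesStringContainPattern ( content , pat3 ) ) {
		char *xx=NULL;*xx=0; }
	// an all-negative pattern that matches nothing counts as a match
	char pat4[] = "!cat || !dog";
	if ( ! doesStringContainPattern ( content , pat4 ) ) {
		char *xx=NULL;*xx=0; }
	// an empty pattern always matches
	char pat5[] = "";
	if ( ! doesStringContainPattern ( content , pat5 ) ) {
		char *xx=NULL;*xx=0; }
	log("spider: testDoesStringContainPattern passed");
}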
// returns false and sets g_errno on error
bool SpiderRequest::setFromAddUrl ( char *url ) {
	// reset it
	reset();
	// make the probable docid
	long long probDocId = g_titledb.getProbableDocId ( url );
	// make one up, like we do in PageReindex.cpp
	long firstIp = (probDocId & 0xffffffff);
	//long firstIp = getFakeIpForUrl1 ( url );
	// ensure not crazy
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// . now fill it up
	// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
	//   m_siteNumInlinks,...)
	m_isNewOutlink = 1;
	m_isAddUrl = 1;
	m_addedTime = getTimeGlobal();//now;
	m_fakeFirstIp = 1;
	m_probDocId = probDocId;
	m_firstIp = firstIp;
	m_hopCount = 0;
	// new: validate it?
	m_hopCountValid = 1;
	// it's valid if root
	Url uu;
	uu.set ( url );
	if ( uu.isRoot() ) m_hopCountValid = true;
	// too big?
	if ( gbstrlen(url) > MAX_URL_LEN ) {
		g_errno = EURLTOOLONG;
		return false;
	}
	// the url! includes \0
	strcpy ( m_url , url );
	// call this to set m_dataSize now
	setDataSize();
	// make the key dude -- after setting url
	setKey ( firstIp , 0LL, false );
	// need a fake first ip lest we core!
	//m_firstIp = (pdocId & 0xffffffff);
	// how to set m_firstIp? i guess addurl can be throttled independently
	// of the other urls??? use the hash of the domain for it!
	long dlen;
	char *dom = getDomFast ( url , &dlen );
	// fake it for this...
	//m_firstIp = hash32 ( dom , dlen );
	// sanity
	if ( ! dom ) {
		g_errno = EBADURL;
		return false;
		//return sendReply ( st1 , true );
	}
	m_domHash32 = hash32 ( dom , dlen );
	// and "site"
	long hlen = 0;
	char *host = getHostFast ( url , &hlen );
	m_siteHash32 = hash32 ( host , hlen );
	m_hostHash32 = m_siteHash32;
	return true;
}

bool SpiderRequest::setFromInject ( char *url ) {
	// just like add url
	if ( ! setFromAddUrl ( url ) ) return false;
	// but fix this
	m_isAddUrl = 0;
	m_isInjecting = 1;
	return true;
}
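// A hedged usage sketch (not called anywhere in the original code): roughly
// how a caller might build a SpiderRequest with the two setters above. The
// url is made up, and the real add-url and inject paths do more work before
// handing the record to spiderdb; this only shows the g_errno convention and
// the add-url vs. inject flag difference.
void exampleBuildAddUrlRequest ( ) {
	SpiderRequest sreq;
	// a made-up url for illustration only
	char url[] = "http://www.example.com/";
	// add-url style request: sets m_isAddUrl and a fake first ip
	if ( ! sreq.setFromAddUrl ( url ) ) {
		log("spider: example setFromAddUrl failed: %s",
		    mstrerror(g_errno));
		return;
	}
	// dump the request to stdout using the debug printer above
	sreq.print ( NULL );
	// inject style request: same record, but m_isInjecting is set
	// instead of m_isAddUrl
	if ( ! sreq.setFromInject ( url ) ) {
		log("spider: example setFromInject failed: %s",
		    mstrerror(g_errno));
		return;
	}
	sreq.print ( NULL );
}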