#include "SafeBuf.h"
#include "HttpRequest.h"
#include "SearchInput.h"
#include "Pages.h"
#include "Parms.h"
#include "Spider.h"
#include "PageResults.h" // for RESULT_HEIGHT
#include "Stats.h"

bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
			   bool printGigablast ) ;

// 5 seconds
// NOTE(review): comment says "5 seconds" but the value is 1000; if this is a
// millisecond reload interval that is one second — confirm which is intended.
#define DEFAULT_WIDGET_RELOAD 1000

//bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) ;

///////////
//
// main > Basic > Settings
//
///////////

/*
bool sendPageBasicSettings ( TcpSocket *socket , HttpRequest *hr ) {

	char buf [ 128000 ];
	SafeBuf sb(buf,128000);

	// true = usedefault coll?
	CollectionRec *cr = g_collectiondb.getRec ( hr , true );
	if ( ! cr ) {
		g_httpServer.sendErrorReply(socket,500,"invalid collection");
		return true;
	}

	// process any incoming request
	handleSettingsRequest ( socket , hr );

	// . print standard header
	// . this prints the

	NOTE(review): the remainder of this commented-out function was lost
	when this source was garbled (everything from here to the comment
	above updateSiteListBuf is missing).  The closing comment delimiter
	is restored below so the live code that follows is not swallowed by
	this block comment.
*/
// NOTE(review): the comment block that described this function was garbled in
// the source; the stray "m_siteListBuf" token below appears to be its tail.
// m_siteListBuf
// . only adds seeds for the shard we are on iff we are responsible for
//   the fake firstip!!! that way only one shard does the add.
// . parses the site-list text (one pattern per line, "siteListArg") into this
//   collection's SpiderColl lookup structures (m_siteListDomTable,
//   m_posSubstringBuf, m_negSubstringBuf) and, when addSeeds is true, queues
//   a SpiderRequest for each seed url whose fake firstip is assigned to us.
// . returns true on completion or error (errors log and/or set g_errno);
//   returns false only if the final msg4x.addMetaList() call blocks.
bool updateSiteListBuf ( collnum_t collnum ,
			 bool addSeeds ,
			 char *siteListArg ) {

	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	if ( ! cr ) return true;

	// tell spiderloop to update the active list in case this
	// collection suddenly becomes active
	g_spiderLoop.m_activeListValid = false;

	// this might make a new spidercoll...
	SpiderColl *sc = g_spiderCache.getSpiderColl ( cr->m_collnum );

	// sanity. if in use we should not even be here
	if ( sc->m_msg4x.m_inUse ) {
		log("basic: trying to update site list while previous "
		    "update still outstanding.");
		g_errno = EBADENGINEER;
		return true;
	}

	// when sitelist is update Parms.cpp should invalidate this flag!
	//if ( sc->m_siteListTableValid ) return true;

	// hash current sitelist entries, each line so we don't add
	// dup requests into spiderdb i guess...
	HashTableX dedup;
	if ( ! dedup.set ( 4,0,1024,NULL,0,false,0,"sldt") ) return true;

	// this is a safebuf PARM in Parms.cpp now HOWEVER, not really
	// because we set it here from a call to CommandUpdateSiteList()
	// because it requires all this computational crap.
	char *op = cr->m_siteListBuf.getBufStart();

	// scan and hash each line in it
	for ( ; ; ) {
		// done?
		if ( ! *op ) break;
		// skip spaces
		if ( is_wspace_a(*op) ) op++;
		// done?
		if ( ! *op ) break;
		// get end
		char *s = op;
		// skip to end of line marker
		for ( ; *op && *op != '\n' ; op++ ) ;
		// keep it simple
		int32_t h32 = hash32 ( s , op - s );
		// for deduping
		if ( ! dedup.addKey ( &h32 ) ) return true;
	}

	// get the old sitelist Domain Hash to PatternData mapping table
	// which tells us what domains, subdomains or paths we can or
	// can not spider...
	HashTableX *dt = &sc->m_siteListDomTable;

	// reset it
	if ( ! dt->set ( 4 ,
			 sizeof(PatternData),
			 1024 ,
			 NULL ,
			 0 ,
			 true , // allow dup keys?
			 0 , // niceness - at least for now
			 "sldt" ) )
		return true;

	// clear old shit
	sc->m_posSubstringBuf.purge();
	sc->m_negSubstringBuf.purge();

	// we can now free the old site list methinks
	//cr->m_siteListBuf.purge();

	// reset flags
	//sc->m_siteListAsteriskLine = NULL;
	sc->m_siteListHasNegatives = false;
	sc->m_siteListIsEmpty = true;

	// NOTE(review): duplicated assignment kept from the original source;
	// harmless, likely a copy/paste slip.
	sc->m_siteListIsEmptyValid = true;
	sc->m_siteListIsEmptyValid = true;

	// use this so it will be free automatically when msg4 completes!
	SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;

	//char *siteList = cr->m_siteListBuf.getBufStart();

	// scan the list
	char *pn = siteListArg;

	// completely empty?
	if ( ! pn ) return true;

	int32_t lineNum = 1;

	int32_t added = 0;

	Url u;

	for ( ; *pn ; lineNum++ ) {

		// get end
		char *s = pn;
		// skip to end of line marker
		for ( ; *pn && *pn != '\n' ; pn++ ) ;

		// point to the pattern (skips over "tag:xxx " if there)
		char *patternStart = s;

		// back p up over spaces in case ended in spaces
		char *pe = pn;
		for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );

		// skip over the \n so pn points to next line for next time
		if ( *pn == '\n' ) pn++;

		// make hash of the line
		int32_t h32 = hash32 ( s , pe - s );

		bool seedMe = true;
		bool isUrl = true;
		bool isNeg = false;
		bool isFilter = true;

		// skip spaces at start of line
		for ( ; *s && *s == ' ' ; s++ );

		// comment?
		if ( *s == '#' ) continue;

		// empty line?
		if ( s[0] == '\r' && s[1] == '\n' ) {
			s++;
			continue;
		}

		// empty line?
		if ( *s == '\n' ) continue;

		// all?
		//if ( *s == '*' ) {
		//	sc->m_siteListAsteriskLine = start;
		//	continue;
		//}

		char *tag = NULL;
		int32_t tagLen = 0;

	innerLoop:
		// skip spaces
		for ( ; *s && *s == ' ' ; s++ );

		// exact:?
		//if ( strncmp(s,"exact:",6) == 0 ) {
		//	s += 6;
		//	goto innerLoop;
		//}

		// these will be manual adds and should pass url filters
		// because they have the "ismanual" directive override
		if ( strncmp(s,"seed:",5) == 0 ) {
			s += 5;
			isFilter = false;
			goto innerLoop;
		}

		// does it start with "tag:xxxxx "?
		if ( *s == 't' &&
		     s[1] == 'a' &&
		     s[2] == 'g' &&
		     s[3] == ':' ) {
			tag = s+4;
			for ( ; *s && ! is_wspace_a(*s) ; s++ );
			tagLen = s - tag;
			// skip over white space after tag:xxxx so "s"
			// point to the url or contains: or whatever
			for ( ; *s && is_wspace_a(*s) ; s++ );
			// set pattern start to AFTER the tag stuff
			patternStart = s;
		}

		if ( *s == '-' ) {
			sc->m_siteListHasNegatives = true;
			isNeg = true;
			s++;
		}

		if ( strncmp(s,"site:",5) == 0 ) {
			s += 5;
			seedMe = false;
			goto innerLoop;
		}

		if ( strncmp(s,"contains:",9) == 0 ) {
			s += 9;
			seedMe = false;
			isUrl = false;
			goto innerLoop;
		}

		int32_t slen = pe - s;

		// empty line?
		if ( slen <= 0 )
			continue;

		// add to string buffers
		// NOTE(review): a tagged negative "contains:" pattern falls
		// through this branch into the positive-buffer branch below
		// (no trailing continue) — verify that is intended.
		if ( ! isUrl && isNeg ) {
			if ( !sc->m_negSubstringBuf.safeMemcpy(s,slen))
				return true;
			if ( !sc->m_negSubstringBuf.pushChar('\0') )
				return true;
			if ( ! tagLen ) continue;
			// append tag
			if ( !sc->m_negSubstringBuf.safeMemcpy("tag:",4))
				return true;
			if ( !sc->m_negSubstringBuf.safeMemcpy(tag,tagLen) )
				return true;
			if ( !sc->m_negSubstringBuf.pushChar('\0') )
				return true;
		}

		if ( ! isUrl ) {
			// add to string buffers
			if ( ! sc->m_posSubstringBuf.safeMemcpy(s,slen) )
				return true;
			if ( ! sc->m_posSubstringBuf.pushChar('\0') )
				return true;
			if ( ! tagLen ) continue;
			// append tag
			if ( !sc->m_posSubstringBuf.safeMemcpy("tag:",4))
				return true;
			if ( !sc->m_posSubstringBuf.safeMemcpy(tag,tagLen) )
				return true;
			if ( !sc->m_posSubstringBuf.pushChar('\0') )
				return true;
			continue;
		}

		u.set ( s , slen );

		// error? skip it then...
		if ( u.getHostLen() <= 0 ) {
			log("basic: error on line #%"INT32" in sitelist",
			    lineNum);
			continue;
		}

		// is fake ip assigned to us?
		int32_t firstIp = getFakeIpForUrl2 ( &u );

		if ( ! isAssignedToUs( firstIp ) ) continue;

		// see if in existing table for existing site list
		if ( addSeeds &&
		     // a "site:" directive mean no seeding
		     // a "contains:" directive mean no seeding
		     seedMe &&
		     // do not seed stuff after tag:xxx directives
		     // no, we need to seed it to avoid confusion. if
		     // they don't want it seeded they can use site: after
		     // the tag:
		     //! tag &&
		     ! dedup.isInTable ( &h32 ) ) {
			// make spider request
			SpiderRequest sreq;
			sreq.setFromAddUrl ( u.getUrl() );
			if (
			    // . add this url to spiderdb as a spiderrequest
			    // . calling msg4 will be the last thing we do
			    !spiderReqBuf->safeMemcpy(&sreq,sreq.getRecSize()))
				return true;
			// count it
			added++;
		}

		// if it is a "seed: xyz.com" thing it is seed only
		// do not use it for a filter rule
		if ( ! isFilter ) continue;

		// make the data node used for filtering urls during spidering
		PatternData pd;
		// hash of the subdomain or domain for this line in sitelist
		pd.m_thingHash32 = u.getHostHash32();
		// . ptr to the line in CollectionRec::m_siteListBuf.
		// . includes pointing to "exact:" too i guess and tag: later.
		// . store offset since CommandUpdateSiteList() passes us
		//   a temp buf that will be freed before copying the buf
		//   over to its permanent place at cr->m_siteListBuf
		pd.m_patternStrOff = patternStart - siteListArg;
		// offset of the url path in the pattern, 0 means none
		pd.m_pathOff = 0;
		// did we have a tag?
		if ( tag ) {
			pd.m_tagOff = tag - siteListArg;
			pd.m_tagLen = tagLen;
		}
		else {
			pd.m_tagOff = -1;
			pd.m_tagLen = 0;
		}
		// scan url pattern, it should start at "s"
		char *x = s;
		// go all the way to the end
		for ( ; *x && x < pe ; x++ ) {
			// skip ://
			if ( x[0] == ':' && x[1] =='/' && x[2] == '/' ) {
				x += 2;
				continue;
			}
			// stop if we hit another /, that is path start
			if ( x[0] != '/' ) continue;
			x++;
			// empty path besides the /?
			if ( x >= pe ) break;
			// ok, we got something here i think
			// no, might be like http://xyz.com/?poo
			//if ( u.getPathLen() <= 1 ) { char *xx=NULL;*xx=0; }
			// calc length from "start" of line so we can
			// jump to the path quickly for compares. inc "/"
			pd.m_pathOff = (x-1) - patternStart;
			pd.m_pathLen = pe - (x-1);
			break;
		}
		// add to new dt
		int32_t domHash32 = u.getDomainHash32();
		if ( ! dt->addKey ( &domHash32 , &pd ) )
			return true;
		// we have some patterns in there
		sc->m_siteListIsEmpty = false;
	}

	// go back to a high niceness
	dt->m_niceness = MAX_NICENESS;

	//int32_t siteListLen = gbstrlen(siteList);
	//cr->m_siteListBuf.safeMemcpy ( siteList , siteListLen + 1 );

	if ( ! addSeeds ) return true;

	log("spider: adding %"INT32" seed urls",added);

	// use spidercoll to contain this msg4 but if in use it
	// won't be able to be deleted until it comes back..
	if ( ! sc->m_msg4x.addMetaList ( spiderReqBuf ,
					 sc->m_collnum ,
					 // no need for callback since m_msg4x
					 // should set msg4::m_inUse to false
					 // when it comes back
					 NULL , // state
					 NULL , // callback
					 MAX_NICENESS ,
					 RDB_SPIDERDB ) )
		return false;

	return true;
}

// . Spider.cpp calls this to see if a url it wants to spider is
//   in our "site list"
// . we should return the row of the FIRST match really
// . the url patterns all contain a domain now, so this can use the domain
//   hash to speed things up
// . return ptr to the start of the line in case it has "tag:" i guess
char *getMatchingUrlPattern ( SpiderColl *sc ,
			      SpiderRequest *sreq ,
			      char *tagArg ) {

	// tagArg can be NULL

	// if it has * and no negatives, we are in!
	//if ( sc->m_siteListAsteriskLine && !
	// NOTE(review): the leading "//" on the next two lines was lost when
	// the source was garbled; restored so they stay part of the
	// commented-out asterisk-line shortcut begun on the previous line.
	//     sc->m_siteListHasNegatives )
	//	return sc->m_siteListAsteriskLine;

	// if it is just a bunch of comments or blank lines, it is empty
	if ( sc->m_siteListIsEmpty && sc->m_siteListIsEmptyValid )
		return NULL;

	// if we had a list of contains: or regex: directives in the sitelist
	// we have to linear scan those
	char *nb = sc->m_negSubstringBuf.getBufStart();
	char *nbend = nb + sc->m_negSubstringBuf.getLength();
	for ( ; nb && nb < nbend ; ) {
		// return NULL if matches a negative substring
		if ( strstr ( sreq->m_url , nb ) ) return NULL;
		// skip it
		nb += strlen(nb) + 1;
	}

	// lazily-computed url path of sreq, set on first pattern with a path
	char *myPath = NULL;

	// check domain specific tables
	HashTableX *dt = &sc->m_siteListDomTable;

	// get this
	CollectionRec *cr = sc->getCollectionRec();

	// need to build dom table for pattern matching?
	if ( dt->getNumSlotsUsed() == 0 && cr ) {
		// do not add seeds, just make siteListDomTable, etc.
		updateSiteListBuf ( sc->m_collnum ,
				    false , // add seeds?
				    cr->m_siteListBuf.getBufStart() );
	}

	if ( dt->getNumSlotsUsed() == 0 ) {
		// empty site list -- no matches
		return NULL;
		//char *xx=NULL;*xx=0; }
	}

	// this table maps a 32-bit domain hash of a domain to a
	// patternData class. only for those urls that have firstIps that
	// we handle.
	int32_t slot = dt->getSlot ( &sreq->m_domHash32 );

	char *buf = cr->m_siteListBuf.getBufStart();

	// loop over all the patterns that contain this domain and see
	// the first one we match, and if we match a negative one.
	for ( ; slot >= 0 ; slot = dt->getNextSlot(slot,&sreq->m_domHash32)) {
		// get pattern
		PatternData *pd = (PatternData *)dt->getValueFromSlot ( slot );
		// point to string
		char *patternStr = buf + pd->m_patternStrOff;
		// is it negative? return NULL if so so url will be ignored
		//if ( patternStr[0] == '-' )
		//	return NULL;
		// otherwise, it has a path. skip if we don't match path ptrn
		if ( pd->m_pathOff ) {
			if ( ! myPath )
				myPath = sreq->getUrlPath();
			if ( strncmp (myPath,
				      patternStr + pd->m_pathOff,
				      pd->m_pathLen ) )
				continue;
		}
		// for entries like http://domain.com/ we have to match
		// protocol and url can NOT be like www.domain.com to match.
		// this is really like a regex like ^http://xyz.com/poo/boo/
		if ( (patternStr[0]=='h' ||
		      patternStr[0]=='H') &&
		     ( patternStr[1]=='t' || patternStr[1]=='T' ) &&
		     ( patternStr[2]=='t' || patternStr[2]=='T' ) &&
		     ( patternStr[3]=='p' || patternStr[3]=='P' ) ) {
			char *x = patternStr+4;
			// is it https:// ?
			if ( *x == 's' || *x == 'S' ) x++;
			// watch out for subdomains like http.foo.com
			if ( *x != ':' ) goto nomatch;
			// ok, we have to substring match exactly. like
			// ^http://xyssds.com/foobar/
			char *a = patternStr;
			char *b = sreq->m_url;
			for ( ; ; a++, b++ ) {
				// stop matching when pattern is exhausted
				if ( is_wspace_a(*a) || ! *a )
					return patternStr;
				if ( *a != *b ) break;
			}
			// we failed to match "pd" so try next line
			continue;
		}

	nomatch:
		// if caller also gave a tag we'll want to see if this
		// "pd" has an entry for this domain that has that tag
		if ( tagArg ) {
			// skip if entry has no tag
			if ( pd->m_tagLen <= 0 ) continue;
			// skip if does not match domain or host
			if ( pd->m_thingHash32 != sreq->m_domHash32 &&
			     pd->m_thingHash32 != sreq->m_hostHash32 )
				continue;
			// compare tags
			char *pdtag = pd->m_tagOff + buf;
			if ( strncmp(tagArg,pdtag,pd->m_tagLen) ) continue;
			// must be nothing after
			if ( is_alnum_a(tagArg[pd->m_tagLen]) ) continue;
			// that's a match
			return patternStr;
		}

		// was the line just a domain and not a subdomain?
		if ( pd->m_thingHash32 == sreq->m_domHash32 )
			// this will be false if negative pattern i guess
			return patternStr;

		// was it just a subdomain?
		if ( pd->m_thingHash32 == sreq->m_hostHash32 )
			// this will be false if negative pattern i guess
			return patternStr;
	}

	// if we had a list of contains: or regex: directives in the sitelist
	// we have to linear scan those
	char *pb = sc->m_posSubstringBuf.getBufStart();
	char *pend = pb + sc->m_posSubstringBuf.length();
	for ( ; pb && pb < pend ; ) {
		// return NULL if matches a negative substring
		if ( strstr ( sreq->m_url , pb ) ) return pb;
		// skip it
		pb += strlen(pb) + 1;
	}

	// is there an '*' in the patterns?
	//if ( sc->m_siteListAsteriskLine ) return sc->m_siteListAsteriskLine;

	return NULL;
}

// Prints an HTML table of example site-list patterns with explanations
// for the Basic > Settings page.
bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {

	// true = useDefault?
	CollectionRec *cr = g_collectiondb.getRec ( hr , true );
	if ( ! cr ) return true;

	/*
	// it is a safebuf parm
	char *siteList = cr->m_siteListBuf.getBufStart();
	if ( ! siteList ) siteList = "";
	SafeBuf msgBuf;
	char *status = "";
	int32_t max = 1000000;
	if ( cr->m_siteListBuf.length() > max ) {
		msgBuf.safePrintf( ""
				   "Site list is over %"INT32" bytes large, "
				   "too many to "
				   "display on this web page. Please use the "
				   "file upload feature only for now."
				   ""
				   , max );
		status = " disabled";
	}
	*/

	/*
	sb->safePrintf(
		       "On the command like you can issue a command like "
		       ""
		       "gb addurls < fileofurls.txt"
		       " or "
		       ""
		       "gb addfile < *.html"
		       " or "
		       ""
		       "gb injecturls < fileofurls.txt"
		       " or "
		       ""
		       "gb injectfile < *.html"
		       " or "
		       "to schedule downloads or inject content directly "
		       "into Gigablast."
		       ""
		       ""
		       "" );
	*/

	// example table
	// NOTE(review): the HTML markup that was inside the string literals
	// of the safePrintf() calls below was stripped when this source was
	// garbled; the empty "" literals stand where markup was.  The
	// visible text is preserved; restore the markup from version control.
	sb->safePrintf ( ""
			 ""
			 ""
			 //""
			 //""
			 //""
			 //""
			 ""
			 ""
			 ""
			 ""
			 // protocol and subdomain match
			 ""
			 ""
			 ""
			 ""
			 // protocol and subdomain match
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 // protocol and subdomain match
			 ""
			 ""
			 ""
			 ""
			 // subdomain match
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 // domain match
			 ""
			 ""
			 ""
			 ""
			 // spider this subdir
			 ""
			 ""
			 ""
			 ""
			 // exact match
			 //""
			 //""
			 //""
			 //""
			 /*
			 // local subdir match
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 */
			 // connect to a device and index it as a stream
			 //""
			 //""
			 //""
			 // negative subdomain match
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 /*
			 ""
			 ""
			 ""
			 ""
			 */
			 // tag match
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 ""
			 "Site List Examples"
			 ""
			 , TABLE_STYLE );//, DARK_BLUE);

	sb->safePrintf(
		//"*"
		//"Spider all urls encountered. If you just submit "
		//"this by itself, then Gigablast will initiate spidering "
		//"automatically at dmoz.org, an internet "
		//"directory of good sites."
		"goodstuff.com"
		"Spider the url goodstuff.com/ and spider "
		"any links we harvest that have the domain "
		"goodstuff.com"
		""
		"http://www.goodstuff.com/"
		"Spider the url "
		"http://www.goodstuff.com/ and spider "
		"any links we harvest that start with "
		"http://www.goodstuff.com/. NOTE: if the url "
		"www.goodstuff.com redirects to foo.goodstuff.com then "
		"foo.goodstuff.com still gets spidered "
		"because it is considered to be manually added, but "
		"no other urls from foo.goodstuff.com will be spidered."
		""
		"http://justdomain.com/foo/"
		"Spider the url "
		"http://justdomain.com/foo/ and spider "
		"any links we harvest that start with "
		"http://justdomain.com/foo/. "
		"Urls that start with "
		"http://www.justdomain.com/, for example, "
		"will NOT match this."
		""
		"seed:www.goodstuff.com/myurl.html"
		"Spider the url www.goodstuff.com/myurl.html. "
		"Add any outlinks we find into the "
		"spider queue, but those outlinks will only be "
		"spidered if they "
		"match ANOTHER line in this site list."
		""
		"site:http://www.goodstuff.com/"
		"Allow any urls starting with "
		"http://www.goodstuff.com/ to be spidered "
		"if encountered."
		""
		"site:www.goodstuff.com"
		"Allow any urls starting with "
		"www.goodstuff.com/ to be spidered "
		"if encountered."
		""
		"-site:bad.goodstuff.com"
		"Do not spider any urls starting with "
		"bad.goodstuff.com/ to be spidered "
		"if encountered."
		""
		"site:goodstuff.com"
		"Allow any urls starting with "
		"goodstuff.com/ to be spidered "
		"if encountered."
		""
		"site:"
		"http://www.goodstuff.com/goodir/anotherdir/"
		"Allow any urls starting with "
		"http://www.goodstuff.com/goodir/anotherdir/ "
		"to be spidered "
		"if encountered."
		""
		//"exact:http://xyz.goodstuff.com/myurl.html"
		//"Allow this specific url."
		//""
		"file://C/mydir/mysubdir/"
		""
		"Spider all files in the given subdirectory or lower. "
		""
		"-file://C/mydir/mysubdir/baddir/"
		""
		"Do not spider files in this subdirectory."
		""
		//"stream:/dev/eth0"
		//""
		//"Connect to a device and index it as a stream. "
		//"It will be treated like a single huge document for "
		//"searching purposes with chunks being indexed in "
		//"realtime. Or chunk it up into individual document "
		//"chunks, but proximity term searching will have to "
		//"be adjusted to compute query term distances "
		//"inter-document."
		//""
		// NOTE(review): "goodtuff" below looks like a typo for
		// "goodstuff" — it was already this way in the source.
		"contains:goodtuff"
		"Spider any url containing goodstuff."
		""
		"-contains:badstuff"
		"Do not spider any url containing badstuff."
		""
		"regexp:-pid=[0-9A-Z]+/"
		"Url must match this regular expression. "
		"Try to avoid using these if possible; they can slow "
		"things down and are confusing to use."
		""
		// tag match
		""
		//"tag:boots contains:boots"
		"tag:boots site:www.westernfootwear."
		"com"
		"tag:boots cowboyshop.com"
		"tag:boots contains:/boots"
		"tag:boots site:www.moreboots.com"
		"tag:boots http://lotsoffootwear.com/"
		""
		//"t:boots -contains:www.cowboyshop.com/shoes/"
		"Advance users only. "
		"Tag any urls matching these 5 url patterns "
		"so we can use "
		"the expression tag:boots in the "
		"url filters and perhaps "
		"give such urls higher spider priority. "
		"For more "
		"precise spidering control over url subsets. "
		"Preceed any pattern with the tagname followed by "
		"space to tag it."
		""
		"# This line is a comment."
		"Empty lines and lines starting with # are "
		"ignored."
		""
		"" );

	return true;
}

// Emits the HTML/JS for the "infinite scrolling" search-results widget of
// collection "cr" into "sb": an ajax url that fetches results sorted by
// index date, then the widget master div that loads and appends them.
bool printScrollingWidget ( SafeBuf *sb , CollectionRec *cr ) {

	sb->safePrintf("\n\n" );

	int32_t widgetWidth = 300;
	int32_t widgetHeight = 500;

	// make the ajax url that gets the search results
	SafeBuf ub;
	ub.safePrintf("/search"
		      //"format=ajax"
		      "?c=%s"
		      //"&prepend=gbsortbyint%%3Agbspiderdate"
		      "&q=-gbstatus:0+gbsortbyint%%3Agbindexdate"
		      "&sc=0" // no site clustering
		      "&dr=0" // no deduping
		      // 10 results at a time
		      "&n=10"
		      "&widgetheight=%"INT32""
		      "&widgetwidth=%"INT32""
		      , cr->m_coll
		      , widgetHeight
		      , widgetWidth );

	//ub.safePrintf("&topdocid="
	//	      );

	// get the search results from neo as soon as this div is
	// being rendered, and set its contents to them
	// NOTE(review): the javascript format string of this call was
	// stripped when the source was garbled; only its arguments remain.
	sb->safePrintf(//"\n\n"
		       // if (pos < (sd.scrollHeight-%"INT32")) return...
		       // once user scrolls down to within last 5
		       // results then try to append to the results.
		       , widgetHeight +5*((int32_t)RESULT_HEIGHT+2*PADDING)
		       , ub.getBufStart()
		       //,widgetHeight +5*((int32_t)RESULT_HEIGHT+2*PADDING
		       );

	// then the WIDGET MASTER div. set the "id" so that the
	// style tag the user sets can control its appearance.
	// when the browser loads this the ajax sets the contents
	// to the reply from neo.
	// on scroll call widget123_append() which will append
	// more search results if we are near the bottom of the
	// widget.
	// NOTE(review): the format string of this call was stripped when the
	// source was garbled; the quote pairs with the one opening line 40.
	sb->safePrintf(
		       ""
		       , widgetWidth
		       , widgetHeight );

	//sb->safePrintf("");

	sb->safePrintf("Waiting for Server Response...");

	// end the containing div
	// NOTE(review): the div markup inside this literal was stripped when
	// the source was garbled.
	sb->safePrintf("");

	return true;
}

// Serves the /widgets page: front page shell plus the scrolling search
// widget demo for the requested collection.
bool sendPageWidgets ( TcpSocket *socket , HttpRequest *hr ) {

	// true = usedefault coll?
	CollectionRec *cr = g_collectiondb.getRec ( hr , true );
	if ( ! cr ) {
		g_httpServer.sendErrorReply(socket,500,"invalid collection");
		return true;
	}

	char buf [ 128000 ];
	SafeBuf sb(buf,128000);

	printFrontPageShell ( &sb, "widgets", cr , true );

	// NOTE(review): the markup inside these two literals was stripped
	// when the source was garbled.
	sb.safePrintf("");
	sb.safePrintf("");

	//char format = hr->getReplyFormat();
	//if ( format == FORMAT_HTML )
	printGigabotAdvice ( &sb , PAGE_BASIC_STATUS , hr , NULL );

	printScrollingWidget ( &sb , cr );

	return g_httpServer.sendDynamicPage (socket,
					     sb.getBufStart(),
					     sb.length(),
					     0); // cachetime
}

// from pagecrawlbot.cpp for printCrawlDetails()
#include "PageCrawlBot.h"

///////////
//
// main > Basic > Status
//
///////////

// Serves the Basic > Status page.  JSON/XML formats get the raw crawl
// details via printCrawlDetails2(); HTML gets the stats tables, the
// status-code breakdown and the embeddable widget code (continued below).
bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {

	char buf [ 128000 ];
	SafeBuf sb(buf,128000);
	sb.reset();

	// char *fs = hr->getString("format",NULL,NULL);
	// char format = FORMAT_HTML;
	// if ( fs && strcmp(fs,"html") == 0 ) format = FORMAT_HTML;
	// if ( fs && strcmp(fs,"json") == 0 ) format = FORMAT_JSON;
	// if ( fs && strcmp(fs,"xml") == 0 ) format = FORMAT_XML;
	char format = hr->getReplyFormat();

	// true = usedefault coll?
	CollectionRec *cr = g_collectiondb.getRec ( hr , true );
	if ( ! cr ) {
		g_httpServer.sendErrorReply(socket,500,"invalid collection");
		return true;
	}

	if ( format == FORMAT_JSON || format == FORMAT_XML) {
		// this is in PageCrawlBot.cpp
		printCrawlDetails2 ( &sb , cr , format );
		char *ct = "text/xml";
		if ( format == FORMAT_JSON ) ct = "application/json";
		return g_httpServer.sendDynamicPage (socket,
						     sb.getBufStart(),
						     sb.length(),
						     0, // cachetime
						     false,//POSTReply ,
						     ct);
	}

	// print standard header
	if ( format == FORMAT_HTML )
		// this prints the
		// NOTE(review): the body of this branch (its opening "{" and
		// the header-printing safePrintf calls) was lost when the
		// source was garbled; only this closing fragment remains.
		" "");
	}

	int32_t savedLen1, savedLen2;

	//
	// widget
	//
	// put the widget in here, just sort results by spidered date
	//
	// the scripts do "infinite" scrolling both up and down.
	// but if you are at the top then new results will load above
	// you and we try to maintain your current visual state even though
	// the scrollbar position will change.
	//
	if ( format == FORMAT_HTML ) {
		// save position so we can output the widget code
		// so user can embed it into their own web page
		savedLen1 = sb.length();
		printScrollingWidget ( &sb , cr );
		savedLen2 = sb.length();
	}

	// the right table pane is the crawl stats
	if ( format == FORMAT_HTML ) {
		sb.safePrintf("");
	}

	//
	// show stats
	//
	if ( format == FORMAT_HTML ) {

		char *seedStr = cr->m_diffbotSeeds.getBufStart();
		if ( ! seedStr ) seedStr = "";

		SafeBuf tmp;
		int32_t crawlStatus = -1;
		getSpiderStatusMsg ( cr , &tmp , &crawlStatus );
		CrawlInfo *ci = &cr->m_localCrawlInfo;
		int32_t sentAlert = (int32_t)ci->m_sentCrawlDoneAlert;
		if ( sentAlert ) sentAlert = 1;

		//sb.safePrintf(
		//	""
		//	"%s"
		//	, sb.getBufStart() // hidden input token/name/..
		//	);

		char *hurts = "No";
		if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
			hurts = "Yes";

		// NOTE(review): the table markup inside this format string
		// was stripped when the source was garbled; the "" literals
		// stand where markup was.  The six live format arguments
		// below correspond to the six live %-rows; the commented
		// rows match the commented arguments.
		sb.safePrintf(//""
			      //""
			      ""
			      ""
			      ""
			      ""
			      "Crawl Status Code:"
			      "%"INT32""
			      ""
			      "Crawl Status Msg:"
			      "%s"
			      ""
			      //"Rounds Completed:"
			      //"%"INT32""
			      ""
			      "Has Urls Ready to Spider:"
			      "%s"
			      ""
			      // this will have to be in crawlinfo too!
			      //"pages indexed"
			      //"%"INT64""
			      ""
			      "URLs Harvested "
			      "(may include dups)"
			      "%"INT64""
			      ""
			      //"URLs Examined"
			      //"%"INT64""
			      ""
			      "Page Crawl Attempts"
			      "%"INT64""
			      ""
			      "Page Crawl Successes"
			      "%"INT64""
			      ""
			      , crawlStatus
			      , tmp.getBufStart()
			      //, cr->m_spiderRoundNum
			      //, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
			      , hurts
			      , cr->m_globalCrawlInfo.m_urlsHarvested
			      //, cr->m_globalCrawlInfo.m_urlsConsidered
			      , cr->m_globalCrawlInfo.m_pageDownloadAttempts
			      , cr->m_globalCrawlInfo.m_pageDownloadSuccesses
			      );

		//
		// begin status code breakdown
		//
		for ( int32_t i = 0 ; i < 65536 ; i++ ) {
			if ( g_stats.m_allErrorsNew[i] == 0 &&
			     g_stats.m_allErrorsOld[i] == 0 )
				continue;
			// NOTE(review): the format string of this first call
			// (the row prefix linking to a gbstatus search, which
			// consumed cr->m_coll) was stripped when the source
			// was garbled.
			sb.safePrintf(""
				      , cr->m_coll );
			sb.urlEncode(mstrerror(i));
			sb.safePrintf ("%%22>"
				       "%s"
				       ""
				       "%"INT64""
				       "\n"
				       , mstrerror(i),
				       g_stats.m_allErrorsNew[i] +
				       g_stats.m_allErrorsOld[i] );
		}
		//
		// end status code breakdown
		//

		char tmp3[64];
		struct tm *timeStruct;
		time_t tt = (time_t)cr->m_diffbotCrawlStartTime;
		// NOTE(review): localtime() uses a shared static buffer —
		// fine single-threaded; confirm no concurrent callers.
		timeStruct = localtime(&tt);
		// Jan 01 1970 at 10:30:00
		strftime ( tmp3,64 , "%b %d %Y at %H:%M:%S",timeStruct);
		sb.safePrintf(""
			      "Collection Created"
			      "%s (local time)"
			      "",tmp3);

		// print link to embed the code in their own site
		SafeBuf embed;
		embed.htmlEncode(sb.getBufStart()+savedLen1,
				 savedLen2-savedLen1,
				 false); // encodePoundSign #?

		// convert all ''s to "'s for php's echo ''; cmd
		embed.replaceChar('\'','\"');

		sb.safePrintf(""
			      ""
			      , embed.getBufStart() );

		sb.safePrintf(""
			      ""
			      ""
			      , embed.getBufStart() );

		// the expandable "show Widget HTML/PHP code" rows
		sb.safePrintf(""
			      ""
			      ""
			      "show Widget HTML code"
			      ""
			      ""
			      ""
			      ""
			      ""
			      ""
			      ""
			      ""
			      "show Widget PHP code"
			      ""
			      ""
			      ""
			      ""
			      ""
			      "\n\n");
	}

	// end the right table pane
	if ( format == FORMAT_HTML ) {
		sb.safePrintf("");
	}

	//if ( format != FORMAT_JSON )
	//
	// wrap up the form, print a submit button
	//
	g_pages.printAdminBottom ( &sb );

	return g_httpServer.sendDynamicPage (socket,
					     sb.getBufStart(),
					     sb.length(),
					     0); // cachetime
}