// Matt Wells, copyright Sep 2001 // the main program that brings it all together #include "gb-include.h" #include "Blaster.h" #include "Titledb.h" // TITLEREC_CURRENT_VERSION #include "Linkdb.h" Blaster g_blaster; static void gotDocWrapper1 ( void *state , TcpSocket *s ) ; static void gotDocWrapper2 ( void *state , TcpSocket *s ) ; static void gotDocWrapper3 ( void *state , TcpSocket *s ) ; static void gotDocWrapper4 ( void *state , TcpSocket *s ) ; static void sleepWrapper ( int fd , void *state ) ; static void sleepWrapperLog(int fd, void *state); Blaster::Blaster() {} Blaster::~Blaster() { if (m_buf1) mfree(m_buf1,m_bufSize1,"blaster1"); if (m_buf2) mfree(m_buf2,m_bufSize2,"blaster2"); } bool Blaster::init(){ // let's ensure our core file can dump struct rlimit lim; lim.rlim_cur = lim.rlim_max = RLIM_INFINITY; if ( setrlimit(RLIMIT_CORE,&lim) ) log("blaster::setrlimit: %s", mstrerror(errno) ); g_conf.m_maxMem = 500000000; // init our table for doing zobrist hashing if ( ! hashinit() ) { log("blaster::hashinit failed" ); return 0; } // init the memory class after conf since it gets maxMem from Conf if ( ! g_mem.init ( 200000000 ) ) { log("blaster::Mem init failed" ); return 0; } // start up log file if ( ! g_log.init( "/tmp/blasterLog" ) ) { log("blaster::Log open /tmp/blasterLog failed" ); return 0; } /* // get dns ip from /etc/resolv.conf g_conf.m_dnsIps[0] = 0; FILE *fd = fopen ( "/etc/resolv.conf" , "r" ); if ( ! fd ) { log("blaster::fopen: /etc/resolve.conf %s", mstrerror(errno)); return 0; } char tmp[1024]; while ( fgets ( tmp , 1024 , fd ) ) { // tmp buf ptr char *p = tmp; // skip comments if ( *p == '#' ) continue; // skip nameserver name if ( ! isdigit(*p) ) while ( ! isspace ( *p ) ) p++ ; // skip spaces while ( isspace ( *p ) ) p++; // if this is not a digit, continue if ( ! isdigit(*p) ) continue; // get ip g_conf.m_dnsIps[0] = atoip ( p , gbstrlen(p) ); // done break; } fclose ( fd ); // if no dns server found, bail if ( g_conf.m_dnsIps[0] == 0 ) { log("blaster:: no dns ip found in /etc/resolv.conf");return 0;} // hack # of dns servers g_conf.m_numDns = 1; g_conf.m_dnsPorts[0] = 53; */ g_conf.m_askRootNameservers = true; //g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 ); //g_conf.m_dnsClientPort = 9909; g_conf.m_dnsMaxCacheMem = 1024*10; // hack http server port to -1 (none) //g_conf.m_httpPort = 0; g_conf.m_httpMaxSockets = 200; //g_conf.m_httpMaxReadBufSize = 102*1024*1024; g_conf.m_httpMaxSendBufSize = 16*1024; // init the loop if ( ! g_loop.init() ) { log("blaster::Loop init failed" ); return 0; } // . then dns client // . server should listen to a socket and register with g_loop if ( ! g_dns.init(6000) ) { log("blaster::Dns client init failed" ); return 0; } // . then webserver // . server should listen to a socket and register with g_loop if ( ! g_httpServer.init( 8333 , 9334 ) ) { log("blaster::HttpServer init failed" ); return 0; } return 1; } void Blaster::runBlaster(char *file1,char *file2, long maxNumThreads, long wait, bool isLogFile, bool verbose,bool justDisplay, bool useProxy , bool injectUrlWithLinks , bool injectUrl ) { if (!init()) return; m_blasterDiff=true; if (!file2) m_blasterDiff=false; // set File class File f1; f1.set ( file1 ); // open files if ( ! f1.open ( O_RDONLY ) ) { log("blaster:open: %s %s",file1,mstrerror(g_errno)); return; } // get file size long fileSize1 = f1.getFileSize() ; // store a \0 at the end long m_bufSize1 = fileSize1 + 1; m_doInjectionWithLinks = injectUrlWithLinks; m_doInjection = injectUrl; // make buffers to hold all m_buf1 = (char *) mmalloc ( m_bufSize1 , "blaster1" ); if ( ! m_buf1) { log("blaster:mmalloc: %s",mstrerror(errno)); return; } //char *bufEnd = buf + bufSize; // set m_p1 m_p1 = m_buf1; m_p1end = m_buf1 + m_bufSize1 - 1; // read em all in if ( ! f1.read ( m_buf1 , fileSize1 , 0 ) ) { log("blaster:read: %s %s",file1,mstrerror(g_errno)); return; } // change \n to \0 //char *p = buf; long n = 0; for ( long i = 0 ; i < m_bufSize1 ; i++ ) { if ( m_buf1[i] != '\n' ) continue; m_buf1[i] = '\0'; n++; } if (m_blasterDiff){ File f2; f2.set ( file2 ); if ( ! f2.open ( O_RDONLY ) ) { log("blaster:open: %s %s",file2,mstrerror(g_errno)); return; } long fileSize2 = f2.getFileSize() ; long m_bufSize2 = fileSize2 + 1; m_buf2 = (char *) mmalloc ( m_bufSize2 , "blaster2" ); if ( ! m_buf2) { log("blaster:mmalloc: %s",mstrerror(errno)); return; } // set m_p2 m_p2 = m_buf2; m_p2end = m_buf2 + m_bufSize2 - 1; if ( ! f2.read ( m_buf2 , fileSize2 , 0 ) ) { log("blaster:read: %s %s",file2,mstrerror(g_errno)); return; } long m=0; for ( long i = 0 ; i < m_bufSize2 ; i++ ) { if ( m_buf2[i] != '\n' ) continue; m_buf2[i] = '\0'; m++; } // Working on only the least number of urls from both files, //because we need to work in pairs if (mnormal exit status for the shell) return; } void sleepWrapper ( int fd , void *state ) { g_blaster.startBlastering(); } void sleepWrapperLog(int fd, void *state) { // unregister the sleepCallback g_loop.unregisterSleepCallback(state,sleepWrapperLog); g_blaster.processLogFile(state); } void Blaster:: processLogFile(void *state){ // No need to print how many docs processed in log // because this is called at epochs given in the log char *urlStart=(char*)state; if (!urlStart){ log(LOG_WARN,"blaster: got NULL urlStart"); return; } // log(LOG_WARN,"blaster:: Line is %s",urlStart); char tmp[1024]; if (urlStart[0]=='P'){ //POST // advance by "POST /search HTTP/1.1 " = 22 chars urlStart+=22; sprintf(tmp,"http://www.gigablast.com/search?%s",urlStart); } else if (urlStart[0]=='G'){ //GET // advance by "GET "= 4 chars urlStart+=4; char *end=strstr(urlStart," HTTP/1."); if (end) end[0]='\0'; sprintf(tmp,"http://www.gigablast.com%s",urlStart); } // log(LOG_WARN,"blaster: URL=%s",tmp); StateBD *st; try { st = new (StateBD); } catch ( ... ) { g_errno = ENOMEM; log("blaster: Failed. " "Could not allocate %li bytes for query. " "Returning HTTP status of 500.", (long)sizeof(StateBD)); return; } mnew ( st , sizeof(StateBD) , "BlasterDiff3" ); //st->m_u1.set(tmp,gbstrlen(tmp)); st->m_buf1=NULL; // get it bool status = g_httpServer.getDoc ( tmp, // &(st->m_u1) , // url 0 , // ip (none) 0 , // offset -1 , // size 0 , // ifModifiedSince st, // state gotDocWrapper1, // callback 20*1000, // timeout 0, // proxy ip 0, // proxy port 30*1024*1024, //maxLen 30*1024*1024);//maxOtherLen // continue if it blocked if ( status ) // else there was error log("blaster: got doc %s: %s", urlStart,mstrerror(g_errno) ); return; } void Blaster::startBlastering(){ long long now=gettimeofdayInMilliseconds(); if(m_print && m_totalDone>0 && (m_totalDone % 20)==0){ log("blaster: Processed %li urls in %li ms",m_totalDone, (long) (now-m_startTime)); m_print=false; } //Launch the maximum number of threads that are allowed while ( m_p1 < m_p1end && m_launched < m_maxNumThreads && m_totalUrls){ // clear any error g_errno = 0; // make a new state StateBD *st; try { st = new (StateBD); } catch ( ... ) { g_errno = ENOMEM; log("blaster: Failed. " "Could not allocate %li bytes for query. " "Returning HTTP status of 500.", (long)sizeof(StateBD)); return; } mnew ( st , sizeof(StateBD) , "BlasterDiff3" ); st->m_buf1=NULL; m_totalUrls--; // make into a url class. Set both u1 and u2 here. //st->m_u1.set ( m_p1 , gbstrlen(m_p1) ); st->m_u1 = m_p1; // is it an injection url if ( m_doInjection || m_doInjectionWithLinks ) { // get host #0 i guess Host *h0 = g_hostdb.getHost(0); if ( ! h0 ) { char *xx=NULL;*xx=0; } static bool s_flag = true; if ( s_flag ) { s_flag = false; log("blaster: injecting to host #0 at %s on " "http/tcp port %li", iptoa(h0->m_ip), (long)h0->m_httpPort); } // use spiderlinks=1 so we add the outlinks to spiderdb // but that will slow the spider rate down since it // will have to do a dns lookup on the domain of every // outlink. st->m_injectUrl.safePrintf("http://127.0.0.1:8000/" "admin/inject?"); if ( m_doInjectionWithLinks ) st->m_injectUrl.safePrintf("spiderlinks=1&"); else st->m_injectUrl.safePrintf("spiderlinks=0&"); st->m_injectUrl.safePrintf("u="); st->m_injectUrl.urlEncode(m_p1); st->m_injectUrl.pushChar('\0'); st->m_u1 = st->m_injectUrl.getBufStart(); } // skip to next url m_p1 += gbstrlen ( m_p1 ) + 1; if (m_blasterDiff){ //st->m_u2.set ( m_p2 , gbstrlen(m_p2) ); st->m_u2 = m_p2; m_p2 += gbstrlen ( m_p2 ) + 1; } // log(LOG_WARN,"\n"); log(LOG_WARN,"blaster: Downloading %s",st->m_u1); // set port if port switch is true //if ( m_portSwitch ) { // long r = rand() % 32; // u.setPort ( 8000 + r ); //} // count it m_launched++; long ip=0; long port=0; if (m_useProxy){ ip=atoip("66.154.102.20",13); port=3128; } // get it bool status = g_httpServer.getDoc ( st->m_u1 , // url 0, // ip 0 , // offset -1 , // size 0 , // ifModifiedSince st , // state gotDocWrapper1, // callback 60*1000, // timeout ip, port, 30*1024*1024, //maxLen 30*1024*1024); // continue if it blocked if ( ! status ) continue; // If not blocked, there is an error. m_launched--; // log msg log("From file1, got doc1 %s: %s", st->m_u1 , mstrerror(g_errno) ); // we gotta wait break; } // bail if not done yet //if ( m_launched > 0 ) return; if (m_totalUrls) return; //otherwise return if launched have not come back if (m_launched) return; // exit now // g_conf.save(); // closeALL(NULL,NULL); exit ( 0 ); } void gotDocWrapper1 ( void *state , TcpSocket *s ) { g_blaster.gotDoc1(state,s); } void Blaster::gotDoc1( void *state, TcpSocket *s){ StateBD *st=(StateBD *)state; // Even if we loose the request, still count it as done. m_totalDone++; m_print=true; // bail if got cut off if ( s->m_readOffset == 0 ) { log("blaster: lost the Request in gotDoc1"); m_launched--; freeStateBD(st); return; } //if we are not doing diff if (!m_blasterDiff){ m_launched--; } long long now = gettimeofdayInMilliseconds(); // get hash char *reply = s->m_readBuf ; long size = s->m_readOffset; HttpMime mime; mime.set ( reply , size , NULL ); char *content = reply + mime.getMimeLen(); long contentLen = size - mime.getMimeLen(); unsigned long h = hash32 ( content , contentLen ); // log msg if ( g_errno ) logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s", s->m_readOffset , (long)(now - s->m_startTime) , st->m_u1 , mstrerror(g_errno) ); else logf(LOG_INFO,"blaster: got doc (%li) (%li ms) " "(hash=%lx) %s", s->m_readOffset , (long)(now - s->m_startTime) , h , st->m_u1 ); if (!m_blasterDiff){ // try to launch another if not using log file freeStateBD(st); if (!m_isLogFile){ startBlastering(); } if (m_isLogFile && --m_totalUrls==0) exit(0); return; } // Store the buffer from socket so that it does not get destroyed // at the end. Also, add another space because in gotDoc2 xml.set // demands the content to be null ended, so we need to store the // null character there. So as a precaution, just allocating the // max buf size. st->m_buf1=(char*) mcalloc(s->m_readBufSize,"Blaster5"); memcpy(st->m_buf1,s->m_readBuf,s->m_readOffset); //st->m_buf1=(char*) mdup(s->m_readBuf,s->m_readOffset,"Blaster5"); st->m_buf1Len=s->m_readOffset; st->m_buf1MaxLen=s->m_readBufSize; // . don't let TcpServer free m_buf when socket is recycled/closed // . we own it now and are responsible for freeing it. DON'T do this // because I believe this makes malloc crash, since TcpServer says // that it has freed the memory so malloc tries to allocate wrong // memory and gives a seg fault. // s->m_readBuf = NULL; log(LOG_WARN,"blaster: Downloading %s",st->m_u2); //char *ss="www.gigablast.com/search?q=hoopla&code=gbmonitor"; // st->m_u2.set(ss,gbstrlen(ss)); // get it bool status = g_httpServer.getDoc ( st->m_u2 , // url 0,//ip 0 , // offset -1 , // size 0 , // ifModifiedSince st , // state gotDocWrapper2, // callback 60*1000, // timeout 0,//atoip("66.154.102.20",13),//proxy ip 0,//3128,//80, // proxy port 30*1024*1024, //maxLen 30*1024*1024);//maxOtherLen // continue if it blocked if ( ! status ) return; // If not blocked, there is an error. m_launched--; // log msg log("From file2, gotdoc2 %s: %s", st->m_u2, mstrerror(g_errno) ); // No need to point p2 ahead because already been done // Free stateBD freeStateBD(st); return; } void gotDocWrapper2 ( void *state , TcpSocket *s ) { g_blaster.gotDoc2(state,s); } void Blaster::gotDoc2 ( void *state, TcpSocket *s){ StateBD *st=(StateBD *)state; // bail if got cut off if ( s->m_readOffset == 0 ) { log("blaster: Lost the Request in gotDoc2"); m_launched--; //No need to point p2 // Free stateBD freeStateBD(st); return; } // . don't let TcpServer free m_buf when socket is recycled/closed // . we own it now and are responsible for freeing it // s->m_readBuf = NULL; long long now = gettimeofdayInMilliseconds(); // So now after getting both docIds, get their contents char *reply1 = st->m_buf1 ; long size1 = st->m_buf1Len; HttpMime mime1; mime1.set ( reply1 , size1 , NULL ); char *content1 = reply1 + mime1.getMimeLen(); long content1Len = size1 - mime1.getMimeLen(); unsigned long h = hash32 ( content1 , content1Len ); // log msg if ( g_errno ) logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s", s->m_readOffset , (long)(now - s->m_startTime) , st->m_u2 , mstrerror(g_errno) ); else logf(LOG_INFO,"blaster: got doc (%li) (%li ms) " "(hash=%lx) %s", s->m_readOffset , (long)(now - s->m_startTime) , h , st->m_u2 ); if (m_verbose){ log(LOG_WARN,"blaster: content1len=%li, Content1 is =%s", content1Len,content1); log(LOG_WARN,"\n"); } char *reply2 = s->m_readBuf ; long size2 = s->m_readOffset; HttpMime mime2; mime2.set ( reply2 , size2 , NULL ); char *content2 = reply2 + mime2.getMimeLen(); long content2Len = size2 - mime2.getMimeLen(); if (m_verbose) log(LOG_WARN,"blaster: content2len=%li, Content2 is =%s", content2Len,content2); // Now that we've got the contents, lets get the url links out // of these pages.Passing them to function getSearchLinks should // get the first x links found out. /* st->m_links1=(char *) mmalloc(200*MAX_URL_LEN,"Blaster3"); st->m_links2=st->m_links1+100*MAX_URL_LEN; st->m_numLinks1=100; st->m_numLinks2=100;*/ /* long numLinks1=getSearchLinks(content1,content1Len, st->m_links1,st->m_numLinks1); long numLinks2=getSearchLinks(content2,content2Len, st->m_links2,st->m_numLinks2);*/ content1[content1Len]='\0'; //short csEnum1= get_iana_charset(mime1.getCharset(), // mime1.getCharsetLen()); /* if (csEnum1== csUnknown) log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/ Xml xml1; // assume utf8 if (!xml1.set(content1, content1Len, false, 0, false, TITLEREC_CURRENT_VERSION , true , // set parents 0 , // niceness CT_XML )){ // content type log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2"); } Links links1; Url parent; parent.set ( st->m_u1); if (!links1.set(false , // userellnofollow &xml1, &parent,//mime1.getLocationUrl(), parent Url false, // setLinkHashes NULL , // baseUrl TITLEREC_CURRENT_VERSION, // version 0 , // niceness false , // parent is permalink? NULL )) { // oldLinks log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2"); } content2[content2Len]='\0'; //short csEnum2= get_iana_charset(mime2.getCharset(), // mime2.getCharsetLen()); /* if (csEnum2== csUnknown) log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/ Xml xml2; if (!xml2.set(content2, content2Len, false, 0, false, TITLEREC_CURRENT_VERSION, true , // setparents 0 , // niceness CT_XML )){ log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2"); } Links links2; parent.set(st->m_u2); if (!links2.set(0,//siterec xml &xml2, &parent,//&st->m_u2,//mime2.getLocationUrl(), false, NULL, TITLEREC_CURRENT_VERSION, 0, false, NULL)){ log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2"); } // put the hash of the sites into a hashtable, since we have // about a 100 or so of them HashTableT urlHash; // put the urls from doc2 into the hastable, but first check if // they are links to google or gigablast (for now). For msn and // yahoo we have to add other checks. char domain2[256]; long dlen = 0; char *dom = getDomFast ( st->m_u2 , &dlen ); if ( dom ) strncpy(domain2,dom,dlen); domain2[dlen]='\0'; for (long i=0;im_u1 ,&dlen ); if ( dom ) strncpy(domain1,dom,dlen); domain1[dlen]='\0'; for (long i=0;im_numUrlDocsReceived=0; StateBD2 *st2; try { st2 = new (StateBD2); } catch ( ... ) { g_errno = ENOMEM; log("blaster: Failed. " "Could not allocate %li bytes for query. " "Returning HTTP status of 500.", (long)sizeof(StateBD2)); return; } mnew ( st2 , sizeof(StateBD2) , "Blaster4" ); //Point to the big state; st2->m_st=st; //Msg16 does 6 redirects, so I do 6 too st2->m_numRedirects=6; //st2->m_url.set(links1.getLink(i),links1.getLinkLen(i)); st2->m_url = links1.getLink(i); // No need for a proxy ip here, since we are fetching // doc's from different IPs. Faster this way bool status = g_httpServer.getDoc ( st2->m_url, // url 0,//ip 0 , // offset -1 , // size 0 , // ifModifiedSince st2, // state gotDocWrapper3, // callback 60*1000, // timeout 0, // proxy ip 0, // proxy port 30*1024*1024, //maxLen 30*1024*1024);//maxOtherLen // continue if it blocked if ( ! status ) continue; // If not blocked, there is an error. st->m_numUrlDocsReceived++; } st->m_numUrlDocsSent=numUrlsNotFound; //There might have been an error while sending the docs, so if there //has been put a check if ( st->m_numUrlDocsReceived > 0 && st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ){ log(LOG_WARN,"blaster: %li docs could not be sent due to " "error",st->m_numUrlDocsReceived); m_launched--; freeStateBD(st); return; } if (numUrlsNotFound==0){ //job done for this pair log(LOG_WARN,"blaster: All urls from %s found in " "%s",domain1,domain2); m_launched--; // Free stateBD freeStateBD(st); return; } log(LOG_WARN,"blaster: %li urls from %s Not found in %s", numUrlsNotFound,domain1,domain2); if(m_justDisplay){ m_launched--; // Free stateBD freeStateBD(st); } return; } // This is not a generic function as yet. Gigablast stores the link in tag // and google stores it in tag . Takes // the content to search for links, the array in which to store the links and // the length of the array as arguments.Returns number of links it found in // the page. This function is not being used as yet as Xml and Links are used #if 0 long Blaster::getSearchLinks(char *content, long contentLen, char *links, long numLinks){ char *p=content; char *pend=content+contentLen; char *p2; long linksFound=0; //considering code given is raw=1 /* while (p=MAX_URL_LEN) length=255; strncpy(links+linksFound*MAX_URL_LEN,p,length); links[linksFound*MAX_URL_LEN+length]='\0'; log(LOG_WARN,"blaster: The url is=%s", links+linksFound*MAX_URL_LEN); linksFound++; p+=7; } return linksFound;*/ // Deciding if it is gigablast 1 or google 0 or else 2 long isGB; if (contentLen<19) { log(LOG_WARN,"blaster: Contentlen is less"); return 0; } if (strstr(content,"")) isGB=1; else isGB=0; p=content; if (isGB){ while (p && p"); if (!p) break; p2=strstr(p,""); if (!p2) break; //point to the url p+=18; //Check if it is in bounds. Also need to put '\0' at // the end. long length=p2-p; if (length>=MAX_URL_LEN) length=MAX_URL_LEN-1; //Copy into the links buffer strncpy(links+linksFound*MAX_URL_LEN,p,length); links[linksFound*MAX_URL_LEN+length]='\0'; log(LOG_WARN,"blaster:the url is=%s", links+linksFound*MAX_URL_LEN); //advance p2 too p2+=7; linksFound++; } } else{ while (p && p"); if(!p) break; p2=strstr(p,""); if (!p2) break; //point to the url p+=20; //Check if it is in bounds. Also need to put '\0' at // the end. long length=p2-p; if (length>=MAX_URL_LEN) length=255; //Copy into the links buffer strncpy(links+linksFound*MAX_URL_LEN,p,length); links[linksFound*MAX_URL_LEN+length]='\0'; log(LOG_WARN,"blaster:the url is=%s", links+linksFound*MAX_URL_LEN); //advance p2 too p2+=7; linksFound++; } } return linksFound; } #endif void gotDocWrapper3 ( void *state , TcpSocket *s ) { g_blaster.gotDoc3(state,s); } void Blaster::gotDoc3 ( void *state, TcpSocket *s){ StateBD2 *st2=(StateBD2 *)state; StateBD *st=st2->m_st; if (!s) { log (LOG_WARN,"blaster: Got a null s in gotDoc3." "Happened because ip could not be found"); st->m_numUrlDocsReceived++; //Free StateBD2 mdelete(st2,sizeof(StateBD2),"Blaster4"); if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; // Free stateBD freeStateBD(st); } return; } // bail if got cut off if ( s->m_readOffset == 0 ) { log("blasterDiff : lost the Request in gotDoc3"); st->m_numUrlDocsReceived++; //Free StateBD2 mdelete(st2,sizeof(StateBD2),"Blaster4"); if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; // Free stateBD freeStateBD(st); } return; } char *reply = s->m_readBuf ; long size = s->m_readOffset; HttpMime mime; mime.set(reply,size,NULL); long httpStatus=mime.getHttpStatus(); if(httpStatus==404){ if (m_verbose) log(LOG_WARN,"blaster: The page was not found - 404"); st->m_numUrlDocsReceived++; } // If the url is a redirect check if it is still http (might have // become https or something else, in which case we aren't going to // follow it else if (httpStatus>=300){ Url *u=mime.getLocationUrl(); //If max number of redirects done, bail if(!st2->m_numRedirects--){ log(LOG_WARN,"blaster: Max number of redirects " "reached."); st->m_numUrlDocsReceived++; } //check if it is still http (might have become https or // something else, in which case we aren't going to follow it else if (!u->isHttp()){ log(LOG_WARN,"blaster: Redirection not for an http " "page for url %s",u->getUrl()); st->m_numUrlDocsReceived++; } // sometimes idiots don't supply us with a Location: mime else if ( u->getUrlLen() == 0 ) { log(LOG_WARN,"blaster: Redirect url is of 0 length"); st->m_numUrlDocsReceived++; } else{ // I'm not checking as yet if the redirect url is the // same as the earlier url, as I've set the max number // of redirs to 6 Now lets get the redirect url. Do not // increase the numDocsReceived because this wrapper // will be called back for the page if (m_verbose) log(LOG_WARN,"blaster: Downloading redirect" " %s",u->getUrl()); //Changing the url to the new place //st2->m_url.set(u,false); st2->m_url = u->getUrl(); bool status = g_httpServer.getDoc (st2->m_url, // url 0,//ip 0 , // offset -1 , // size 0 , st2 , // state gotDocWrapper3, 60*1000, // timeout 0, // proxy ip 0, // proxy port 30*1024*1024, //maxLen 30*1024*1024); // If not blocked, there is an error. if (status ) st->m_numUrlDocsReceived++; } } else if(httpStatus<200){ log(LOG_WARN,"blaster: Bad HTTP status %li",httpStatus); st->m_numUrlDocsReceived++; } else{ // This means the page is still there, somewhere. Status must // be 200 So find it on server2. This server is assumed to be // running an instance of gb, so it shall be given the query in // the format 'xxxxx.com/search?q=url%3Ayyyy&code=gbmonitor. // Then check if we have the exact page in the search results // that have come back. So now the problem is that we do // not know which url has been got. So I get the location // url from mime. // The site name is in st->m_u2.getSite() // But copy it because it is not nulled. char tmp[1024]; //char site[1024];//how long could a site be? long siteLen = 0; char *site = getHostFast(st->m_u2,&siteLen); char c = site[siteLen]; site[siteLen] = 0; //strncpy(site,st->m_u2.getSite(), // st->m_u2.getSiteLen()); //site[st->m_u2.getSiteLen()]='\0'; sprintf(tmp,"%ssearch?" "code=gbmonitor&" "q=url%%3A%s",site,st2->m_url); site[siteLen] = c; if (m_verbose) log(LOG_WARN,"blaster: Checking %s",tmp); //Url u; //u.set(tmp,gbstrlen(tmp)); //Now get the doc bool status = g_httpServer.getDoc ( tmp,//&u, 0,//ip 0, // offset -1 , // size 0 , st , // state gotDocWrapper4, 60*1000, // timeout 0,//atoip("66.154.102.20",13),//proxy ip 0,//3128,//proxy port 30*1024*1024, 30*1024*1024); // continue if it blocked // If not blocked, there is an error. Since we are // getting the doc from a gigablast server, report it if (status ){ st->m_numUrlDocsReceived++; log(LOG_WARN,"blaster: could not get back" "%s from server in gotDoc3",tmp); } } // If we reached here, that means all the url redirects have been // finished, and there is no need for st2. Free it mdelete(st2,sizeof(StateBD2),"Blaster4"); if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; // Free stateBD freeStateBD(st); } return; } void gotDocWrapper4 ( void *state , TcpSocket *s ) { g_blaster.gotDoc4(state,s); } void Blaster::gotDoc4 ( void *state, TcpSocket *s){ StateBD *st=(StateBD *)state; st->m_numUrlDocsReceived++; if (!s) { //Shouldn't happen, but still putting a checkpoint log (LOG_WARN,"blaster: Got a null s in gotDoc4." "Happened because ip could not be found for gigablast" "server"); if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; // Free stateBD freeStateBD(st); } return; } // bail if got cut off if ( s->m_readOffset == 0 ) { log("blasterDiff : lost the Request in gotDoc4"); if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; freeStateBD(st); } return; } char *reply = s->m_readBuf ; long size = s->m_readOffset; HttpMime mime; mime.set ( reply , size , NULL ); char *content = reply + mime.getMimeLen(); long contentLen = size - mime.getMimeLen(); //short csEnum = get_iana_charset(mime.getCharset(), // mime.getCharsetLen()); /* if (csEnum == csUnknown) log(LOG_DEBUG, "blaster: Unknown charset : %s", mime.getCharset());*/ Xml xml; if (!xml.set( content, contentLen, false, 0, false, TITLEREC_CURRENT_VERSION, true, // setparents 0, // niceness CT_XML )){ log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4"); } Links links; Url *url=mime.getLocationUrl(); if (!links.set(0,//siterec xml &xml, url, false, NULL, TITLEREC_CURRENT_VERSION, 0, false, NULL)){ log(LOG_WARN, "blaster: Coudn't set Links class in gotDoc4"); } for (long i=0;im_sendBuf; char *p1,*p2; // First get the Host, which is the domain. Since socket s is going to // be useless after this function, changing m_sendBuf instead of using // more space p1=strstr(sendBuf,"%3A"); if(p1){ p1+=3; p2=strstr(p1," HTTP"); if (p2){ //Since I do not care about the sendbuf anymore *p2='\0'; } } if (!p1 || !p2){ log(LOG_WARN,"blasterdiff: Could not find search link" "from m_sendBuf in gotdoc4"); } else{ sprintf(tmp,"%s",p1); //log(LOG_WARN,"blaster: tmp in gotDoc4 = %s",tmp); bool isFound=false; // So now we search for tmp in the links for (long i=0;im_u1);//->getQuery() } } if (!isFound) log(LOG_WARN,"blaster: %s in results1 but not" " in results2 for query %s and does NOT exist" " in server2",tmp,st->m_u1); // ->getQuery() } if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; // Free stateBD freeStateBD(st); } return; } void Blaster::freeStateBD(StateBD *st){ // Free stateBD's buf if (!st) return; if (st->m_buf1) mfree(st->m_buf1,st->m_buf1MaxLen,"Blaster5"); mdelete(st,sizeof(StateBD),"Blaster3"); }