#include "gb-include.h"

#include "Msge1.h"
#include "Test.h"

// utility functions (defined at the bottom of this file)
bool getTestIp ( char *url , int32_t *retIp , bool *found , int32_t niceness ,
		 char *testDir ) ;
bool addTestIp ( char *host , int32_t hostLen , int32_t ip ) ;
bool saveTestBuf ( char *testDir ) ;

// . m_buf must be NULL before the first reset() so reset() does not try to
//   free an uninitialized pointer
Msge1::Msge1() {
	m_buf = NULL;
	m_numReplies = 0;
	reset();
}

Msge1::~Msge1() {
	reset();
}

#define SLAB_SIZE (8*1024)

// . free the result buffer (if any) and clear the error/reply state
// . m_ipBuf points into m_buf, so it is cleared along with it
void Msge1::reset() {
	m_errno = 0;
	m_ipBuf = NULL;
	if ( m_buf ) mfree ( m_buf , m_bufSize,"Msge1buf");
	m_buf = NULL;
	m_numReplies = 0;
}

// . get the "firstip" for each url in a list of urls
// . urls in "urlBuf" are \0 terminated
// . used to be called getSiteRecs()
// . you can pass in a list of docIds rather than urlPtrs
// . results land in m_ipBuf[i] and per-url errors in m_ipErrors[i]
// . returns false if we blocked, in which case "callback" is called with
//   "state" once every url has a reply
// . returns true if we completed without blocking
// . NOTE(review): on allocation failure we also return true; presumably
//   mcalloc() sets g_errno for the caller to inspect -- confirm
bool Msge1::getFirstIps ( TagRec **grv ,
			  char **urlPtrs ,
			  linkflags_t *urlFlags ,//Links::m_linkFlags
			  int32_t numUrls ,
			  // if skipOldLinks && urlFlags[i]&LF_OLDLINK, skip it
			  bool skipOldLinks ,
			  char *coll ,
			  int32_t niceness ,
			  void *state ,
			  void (*callback)(void *state) ,
			  int32_t nowGlobal ,
			  bool addTags ,
			  char *testDir ) {

	reset();
	// bail if no urls or linkee
	if ( numUrls <= 0 ) return true;

	// save all input parms
	m_grv          = grv;
	m_urlPtrs      = urlPtrs;
	m_urlFlags     = urlFlags;
	m_numUrls      = numUrls;
	m_skipOldLinks = skipOldLinks;
	m_coll         = coll;
	m_niceness     = niceness;
	m_state        = state;
	m_callback     = callback;
	m_nowGlobal    = nowGlobal;
	m_addTags      = addTags;
	m_testDir      = testDir;

	// . how much mem to alloc?
	// . include an extra 4 bytes for each one to hold possible errno
	int32_t need = 4 + 4; // ip + error
	// one per url
	need *= numUrls;
	// allocate the buffer to hold all the info we gather
	m_buf = (char *)mcalloc ( need , "Msge1buf" );
	if ( ! m_buf ) return true;
	m_bufSize = need;
	// clear it all (kept from the original even though the buffer came
	// from mcalloc)
	memset ( m_buf , 0 , m_bufSize );
	// set the ptrs! the buffer is split into two arrays of numUrls int32s
	char *p = m_buf;
	m_ipBuf    = (int32_t *)p ; p += numUrls * 4;
	m_ipErrors = (int32_t *)p ; p += numUrls * 4;
	// initialize
	m_numRequests = 0;
	m_numReplies  = 0;
	// . point to first url to process
	// . url # m_n
	m_n = 0;
	// clear the m_used flags
	memset ( m_used , 0 , MAX_OUTSTANDING_MSGE1 );
	// . launch the requests
	// . a request can be a msg8a, msgc, msg50 or msg20 request depending
	//   on what we need to get
	// . when a reply returns, the next request is launched for that url
	// . we keep a msge1Slot state for each active url in the buffer
	// . we can have up to MAX_ACTIVE urls active
	if ( ! launchRequests ( 0 ) ) return false;

	// save it? might be a page parser
	if ( m_coll && ! strcmp(m_coll,"qatest123") ) saveTestBuf("qa");

	// none blocked, we are done
	return true;
}

// . we only come back up here 1) in the very beginning or 2) when a url
//   completes its pipeline of requests
// . "starti" is the first slot index considered when hunting for a free
//   slot in m_used[]
// . returns true if every url has been processed and all replies are in
// . returns false if we are still waiting on outstanding requests
bool Msge1::launchRequests ( int32_t starti ) {
	// reset any error code
	g_errno = 0;
 loop:
	// stop if no more urls. return true if we got all replies! no block.
	if ( m_n >= m_numUrls ) return (m_numRequests == m_numReplies);
	// if we are maxed out, we basically blocked!
	if (m_numRequests - m_numReplies >= MAX_OUTSTANDING_MSGE1)return false;
	// . skip if "old"
	// . we are not planning on adding this to spiderdb, so Msg16
	//   want to skip the ip lookup, etc.
	if ( m_urlFlags && (m_urlFlags[m_n] & LF_OLDLINK) && m_skipOldLinks ) {
		m_numRequests++;
		m_numReplies++;
		m_n++;
		goto loop;
	}

	// grab the "firstip" from the tagRec if we can
	TagRec *gr  = m_grv[m_n];
	Tag    *tag = NULL;
	if ( gr ) tag = gr->getTag("firstip");
	// initialized so it never holds an indeterminate value; it is only
	// meaningful below while "tag" is non-NULL
	int32_t ip = 0;
	// grab the ip that was in there
	if ( tag ) ip = atoip(tag->getTagData());
	// if we had it but it was 0 or -1, then time that out
	// after a day or so in case it works again! 0 and -1 mean
	// NXDOMAIN or timeout error, etc.
	if ( tag && ( ip == 0 || ip == -1 ) )
		if ( m_nowGlobal - tag->m_timestamp > 3600*24 ) tag = NULL;

	// . if we still got the tag, use that, even if ip is 0 or -1
	// . this keeps things fast
	// . this makes sure doConsistencyCheck() does not block too in
	//   XmlDoc.cpp... cuz it cores if it does block
	if ( tag ) {
		// now "ip" might actually be -1 or 0 (invalid) so be careful
		m_ipBuf[m_n] = ip;
		// what is this?
		//if ( ip == 3 ) { char *xx=NULL;*xx=0; }
		m_numRequests++;
		m_numReplies++;
		m_n++;
		goto loop;
	}

	// or if banned
	Tag *btag = NULL;
	if ( gr ) btag = gr->getTag("manualban");
	if ( btag && btag->getTagData()[0] !='0') {
		// debug for now
		if ( g_conf.m_logDebugDns )
			log("dns: skipping dns lookup on banned hostname");
		// -1 means time out i guess
		m_ipBuf[m_n] = -1;
		m_numRequests++;
		m_numReplies++;
		m_n++;
		goto loop;
	}

	// . get the next url
	// . if m_xd is set, create the url from the ad id
	char *p = m_urlPtrs[m_n];

	// if it is ip based that makes things easy
	int32_t hlen = 0;
	char *host = getHostFast ( p , &hlen );

	// reset this again
	ip = 0;
	// see if the hostname is actually an ip like "1.2.3.4"
	if ( host && is_digit(host[0]) ) ip = atoip ( host , hlen );
	// if legit this is non-zero
	if ( ip ) {
		// what is this? i no longer have this bug really - i fixed
		// it - but it did core here probably from a bad dns reply!
		// so take this out...
		//if ( ip == 3 ) { char *xx=NULL;*xx=0; }
		m_ipBuf[m_n] = ip;
		m_numRequests++;
		m_numReplies++;
		m_n++;
		goto loop;
	}

	// use domain, we are "firstip" only now!!!
	//int32_t dlen = 0;
	//char *dom = getDomFast ( p , &dlen );

	// get the length
	//int32_t plen = gbstrlen(p);

	/*
	// look up in our m_testBuf.
	if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
		bool found = false;
		// do we got it?
		int32_t quickIp ; bool status = getTestIp ( p , &quickIp, &found);
		// error?
		if ( ! status ) {
			// save it
			m_errno = g_errno;
			// hard exit
			char *xx=NULL; *xx=0;
		}
		// an ip of 0 means we could not find it
		if ( found ) { // quickIp != 0 ) {
			// set it
			m_ipBuf[m_n] = quickIp;
			m_numRequests++;
			m_numReplies++;
			m_n++;
			goto loop;
		}
	}
	*/

	// . grab a slot
	// . m_msg8as[i], m_msgCs[i], m_msg50s[i], m_msg20s[i]
	int32_t i;
	for ( i = starti ; i < MAX_OUTSTANDING_MSGE1 ; i++ )
		if ( ! m_used[i] ) break;
	// sanity check: deliberately core if no free slot was found
	if ( i >= MAX_OUTSTANDING_MSGE1 ) { char *xx = NULL; *xx = 0; }
	// normalize the url
	//m_urls[i].set ( p , plen );
	// save the url number, "n"
	m_ns  [i] = m_n;
	// claim it
	m_used[i] = true;

	// note it
	//if ( g_conf.m_logDebugSpider )
	//	log(LOG_DEBUG,"spider: msge1: processing url %s",p);

	// . start it off
	// . this will start the pipeline for this url
	// . it will set m_used[i] to true if we use it and block
	// . it will increment m_numRequests and NOT m_numReplies if it blocked
	//sendMsgC ( i , dom , dlen );
	sendMsgC ( i , host , hlen );
	// consider it launched
	m_numRequests++;
	// inc the url count
	m_n++;
	// try to do another
	goto loop;
}

static void gotMsgCWrapper ( void *state , int32_t ip ) ;

// . start the ip lookup of "host" for the url held in slot "i"
// . returns false if the lookup blocked (gotMsgCWrapper is called later)
// . returns true if the slot completed without blocking
bool Msge1::sendMsgC ( int32_t i , char *host , int32_t hlen ) {
	// we are processing the nth url
	int32_t   n    = m_ns[i];
	// set m_errno if we should at this point
	if ( ! m_errno && g_errno != ENOTFOUND ) m_errno = g_errno;
	// reset it
	g_errno = 0;

	// using the the ith msgC
	MsgC  *m  = &m_msgCs[i];
	// save i and this in the msgC itself so the callback can find us
	m->m_state2 = this;
	m->m_state3 = (void *)(PTRTYPE)i;

	// note it
	//if ( g_conf.m_logDebugSpider )
	//	logf(LOG_DEBUG,"spider: msge1: getting ip for %s",
	//	     m_urlPtrs[n]);

	//int32_t  hlen = 0;
	//char *host = getHostFast ( m_urlPtrs[n] , &hlen );

	// look up in our m_testBuf.
	if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
		bool found = false;
		// shortcut
		//char *p = m_urlPtrs[n];
		// do we got it?
		//bool status = getTestIp ( p , &m_ipBuf[n], &found);
		bool status = getTestIp ( host, &m_ipBuf[n],&found,m_niceness,
					  m_testDir );
		// error?
		if ( ! status ) {
			// save it
			m_errno = g_errno;
			// hard exit
			char *xx=NULL; *xx=0;
		}
		// an ip of 0 means we could not find it
		if ( found ) return addTag(i);
	}

	//char *xx=NULL;*xx=0;

	if ( ! m->getIp ( host ,
			  hlen ,
			  &m_ipBuf[n] ,
			  m , // state
			  gotMsgCWrapper ))// callback
		return false;
	return doneSending ( i );
}

// . callback for MsgC::getIp()
// . "state" is the MsgC whose m_state2/m_state3 were set in sendMsgC()
void gotMsgCWrapper ( void *state , int32_t ip ) {
	MsgC   *m    = (MsgC *)state;
	Msge1  *THIS = (Msge1 *)m->m_state2;
	int32_t i    = (int32_t  )(PTRTYPE)m->m_state3;
	if ( ! THIS->doneSending ( i ) ) return;
	// try to launch more, returns false if not done
	if ( ! THIS->launchRequests(i) ) return;
	// . save it if we should. might be a page parser
	// . mdw i uncommented this when we cored all the time
	if ( THIS->m_coll&&!strcmp(THIS->m_coll,"qatest123"))saveTestBuf("qa");
	// must be all done, call the callback
	THIS->m_callback ( THIS->m_state );
}

void doneAddingTagWrapper ( void *state ) ;

// . called once the ip lookup for slot "i" has finished
// . records g_errno for that url, then hands off to addTag()
// . returns whatever addTag() returns
bool Msge1::doneSending ( int32_t i ) {
	// we are processing the nth url
	int32_t n = m_ns[i];
	// save the error
	m_ipErrors[n] = g_errno;
	// save m_errno
	if ( g_errno && ! m_errno ) m_errno = g_errno;
	// clear it
	g_errno = 0;

	// get ip we got
	int32_t ip = m_ipBuf[n];

	// what is this?
	//if ( ip == 3 ) { char *xx=NULL;*xx=0; }

	//log ( LOG_DEBUG, "build: Finished Msge1 for url [%"INT32",%"INT32"]: %s ip=%s",
	//      n, i, m_urls[i].getUrl() ,iptoa(ip));

	// store it? m_coll is checked for NULL first, like every other
	// "qatest123" test in this file
	if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
		// get host
		int32_t  hlen = 0;
		char *host = getHostFast ( m_urlPtrs[n] , &hlen );
		// use domain, we are "firstip" only now!!!
		//int32_t  dlen = 0;
		//char *dom = getDomFast ( m_urlPtrs[n] , &dlen );
		// add it to "./test/ips.txt"
		addTestIp ( host , hlen ,ip);
		//addTestIp ( dom,dlen ,ip);
	}

	// . all done if invalid
	// . otherwise, add the "firstip" tag to this the domain in tagdb
	// . we now add invalid ips to keep doConsistencyCheck() from
	//   blocking as well as to keep performance fast so we do not
	//   have to keep re-looking up bad ips to get their "firstip",
	//   but we only respect bad "firstips" for 1 day (see above)
	//   before we try to recompute them
	//if ( ip == 0 || ip == -1 ) {
	//	// close it up
	//	doneAddingTag ( i );
	//	return true;
	//}

	return addTag ( i );
}

// . formerly added the "firstip" tag to tagdb; that code is commented out
//   below, so every path now just finishes the slot via doneAddingTag()
bool Msge1::addTag ( int32_t i ) {
	// we are processing the nth url
	int32_t n = m_ns[i];
	// get ip we got
	//int32_t ip = m_ipBuf[n];

	//
	// HACK: hijack this MsgC to use as a "state" for call to msg9a
	//       so we can add the "firstip" tag, since we did not have one!
	//

	// using the the ith msgC
	MsgC  *m  = &m_msgCs[i];
	// save i and this in the msgC itself
	m->m_state2 = this;
	m->m_state3 = (void *)(PTRTYPE)i;

	// store the domain here
	//char *domBuf = m->m_request;
	// get the domain
	//int32_t  dlen = 0;
	//char *dom = getDomFast ( m_urlPtrs[n] , &dlen );

	// make it all host based
	//char *hostBuf = m->m_request;
	// get the host
	int32_t hlen = 0;
	char *host = getHostFast ( m_urlPtrs[n] , &hlen );

	// if invalid or ip-based, skip it!
	//if ( ! dom || dlen <= 0 )
	if ( ! host || hlen <= 0 )
		return doneAddingTag ( i );

	if ( ! m_addTags ) return doneAddingTag ( i );

	// now let xmldoc add the firstip tags of each outlink!
	return doneAddingTag ( i );

	/*
	// store it
	//strncpy ( domBuf , dom , dlen );
	strncpy ( hostBuf , host , hlen );
	// NULL term it
	//domBuf[dlen] = '\0';
	hostBuf[hlen] = '\0';

	// get time now synced with host #0
	//int32_t nowGlobal = getTimeGlobal();

	// put in buf
	char ipbuf[32];
	sprintf(ipbuf,"%s",iptoa(ip) );

	// . make the tag rec to add
	// . msg9a copies it into a request buffer, so no need to be persistent
	TagRec gr;
	// returns false and sets g_errno on error
	if ( !gr.addTag("firstip",m_nowGlobal,"msge1",ip,ipbuf,gbstrlen(ipbuf))){
		// should never have error
		char *xx=NULL;*xx=0; }

	// shortcut
	Msg9a *m9 = &m_msg9as[i];

	// . now add to "firstip" in tagdb
	// . borrow the ith msg9a (only 40 bytes each)
	// . this should only return control to us once it is safely in tagdb!
	if ( ! m9->addTags ( NULL ,
			     //&domBuf ,
			     &hostBuf ,
			     1 ,
			     m_coll ,
			     m , // state
			     doneAddingTagWrapper ,
			     m_niceness ,
			     &gr ,
			     false ,
			     &ip ))
		// we blocked
		return false;
	return doneAddingTag ( i );
	*/
}

// . callback counterpart of addTag(); "state" is the hijacked MsgC
void doneAddingTagWrapper ( void *state ) {
	// get the hijacked msgc
	MsgC   *m    = (MsgC *)state;
	Msge1  *THIS = (Msge1 *)m->m_state2;
	int32_t i    = (int32_t  )(PTRTYPE)m->m_state3;
	// return if that blocked
	if ( ! THIS->doneAddingTag ( i ) ) return;
	// loop back for more
	if ( ! THIS->launchRequests ( i ) ) return;
	// must be all done, call the callback
	THIS->m_callback ( THIS->m_state );
}

// . count the reply for slot "i" and release the slot
// . always returns true (never blocks)
bool Msge1::doneAddingTag ( int32_t i ) {
	// unmangle
	//*m_pathPtr[i] = '/';
	m_numReplies++;
	// free it
	m_used[i] = false;
	// we did not block
	return true;
}

#include "HashTableX.h"

// in-memory copy of ips.txt plus room for appended entries
static char   *s_testBuf     = NULL ;
// end of the valid data in s_testBuf; addTestIp() appends here
static char   *s_testBufPtr  ;
// allocated size of s_testBuf, needed to free it
static int32_t s_testBufSize ;
// one past the last allocated byte of s_testBuf
static char   *s_testBufEnd  ;
// true until ips.txt has been loaded successfully
static char    s_needsReload = true ;
// last host appended by addTestIp(), used to suppress back-to-back dups
static char   *s_last        = NULL ;
static int32_t s_lastLen     = 0 ;
// maps hash32(host) to ip
static HashTableX s_ht;

// . only call this if the collection is "qatest123"
// . we try to get the ip by accessing the "./test/ips.txt" file
// . we also add ips we lookup to that file if the collection is "qatest123"
// . returns false and sets g_errno on error, true on success
// . on success "*found" says whether "*retIp" was filled in
bool getTestIp ( char *url , int32_t *retIp , bool *found , int32_t niceness ,
		 char *testDir ) {

	// set the url from the url string, "us"
	Url u;
	u.set ( url );
	// get host of the url
	char   *host = u.getHost();
	int32_t hlen = u.getHostLen();

	// if it is an ip, that is easy!
	if ( is_digit(host[0]) ) {
		int32_t aip = atoip(host,hlen);
		// the original did "return aip;", which turned the ip into a
		// bool and never told the caller what it was. hand the ip
		// back through the out parameters like any other hit.
		if ( aip ) {
			*retIp = aip;
			*found = true;
			return true;
		}
	}

	// assume not found
	*found = false;

	// . if we are the "qatest123" collection, check for "./test/ips.txt"
	//   file that gives us the ips of the given urls.
	// . if we end up doing some lookups we should append to that file
	if ( ! s_testBuf || s_needsReload ) {
		// assume needs reload now
		s_needsReload = true;
		// free it
		if ( s_testBuf ) mfree ( s_testBuf , s_testBufSize, "msge1" );
		// hashtable set, map urlhash32 to ip
		if ( !s_ht.set(4,4,400000,NULL,0,false,niceness,"msge1tab")) {
			char *xx=NULL;*xx=0; }
		// null it out now, we freed it
		s_testBuf = NULL;
		//char *testDir = g_test.getTestDir();
		// filename. snprintf so a long working dir cannot overrun fn
		char fn[100];
		snprintf(fn,sizeof(fn),"%s/%s/ips.txt",g_hostdb.m_dir,testDir);
		// set it
		File f;
		f.set ( fn );
		// get size
		int32_t fsize = f.getFileSize ( );
		// < 0 means error? does not exist?
		if ( fsize < 0 ) fsize = 0;
		// how much to alloc? about 3MB of headroom for appended ips
		int32_t need = 3000001;
		// and what we had
		need += fsize;
		// make buf big enough to hold the read
		s_testBuf = (char *)mmalloc ( need , "tmsge1" );
		// this for freeing
		s_testBufSize = need;
		s_testBufEnd  = s_testBuf + need;
		// error?
		if ( ! s_testBuf ) {
			// note it
			log("test: failed to alloc %"INT32" bytes for ip buf",need);
			// error out
			return false;
		}
		// assign end to the beginning, assume nothing to read
		s_testBufPtr = s_testBuf;
		// read in the file, if it was there
		if ( fsize > 0 ) {
			// open it
			f.open ( O_RDWR );
			// read it in
			int32_t rs = f.read ( s_testBuf , fsize , 0 ) ;
			// check it
			if ( rs != fsize ) {
				// note it
				log("test: failed to read %"INT32" bytes of "
				    "./%s/ips.txt file",fsize,testDir);
				// close it
				f.close();
				// error out
				return false;
			}
			// recompute the end of s_testBuf
			s_testBufPtr = s_testBuf + fsize;
			// trim off ending punct like \n, but never back up
			// past the start of the buffer
			for ( ; s_testBufPtr > s_testBuf &&
				      is_punct_a(s_testBufPtr[-1]) ;
			      s_testBufPtr--);
			// and null term
			*s_testBufPtr = '\0';
		}
		// close it
		f.close();
		// good to go
		s_needsReload = false;

		//
		// fill hashtable, s_ht
		//
		char *p = s_testBuf;
	loop:
		// breathe
		QUICKPOLL(niceness);
		// skip over spaces
		for ( ; p < s_testBufPtr && is_wspace_a(*p) ; p++ );
		// assign the url string, "us" to "p"
		char *us = p;
		// skip over that url
		char *next = p;
		// NOTE(review): the copy of this file under review is
		// truncated in the next statement. everything between "next"
		// and "= s_testBufPtr" is missing; it looks like the text
		// between a '<' and the following '>' was stripped. the
		// fragment is reproduced verbatim. restore the lost text from
		// version control rather than reconstructing it.
		for (;next= s_testBufPtr ) { char *xx=NULL;*xx=0; }
		// must be number
		if ( ! is_digit(*ips) ) {
			// there is a single line that is \0 0.0.0.\n
			// so let's fix this by skipping until \n
			// NOTE(review): truncated again in the next
			// statement. the rest of this loop, the remainder of
			// getTestIp() and the opening of
			// addTestIp ( char *host , int32_t hostLen ,
			// int32_t ip ), including its "no room" size test,
			// are missing between "p" and "= s_testBufEnd".
			// reproduced verbatim.
			for ( ; p= s_testBufEnd )
		return log("test: no room to add ip %s",iptoa(ip));

	// ---- remainder of addTestIp() ----

	// did we just add this one? prevent dups this way...
	if ( s_last && hostLen==s_lastLen && !strncmp(s_last,host,hostLen))
		return true;
	// preserve ptr to last one we added
	s_last    = s_testBufPtr;
	s_lastLen = hostLen;
	// print it
	gbmemcpy ( s_testBufPtr , host , hostLen );
	// skip it
	s_testBufPtr += hostLen;
	// then space and ip
	int32_t ps = sprintf ( s_testBufPtr , " %s\n",iptoa(ip));
	// skip that
	s_testBufPtr += ps;
	// add to hash table too
	int32_t u32 = hash32 ( host , hostLen );
	if ( ! s_ht.addKey ( &u32 , &ip ) ) { char *xx=NULL;*xx=0; }
	// success
	return true;
}

void makeQADir();

// . save it back to disk
// . we should call this from Test.cpp when the run is completed!!
// . writes s_testBuf up to s_testBufPtr into <g_hostdb.m_dir>/<testDir>/ips.txt
// . returns true on success, otherwise whatever log() returns
bool saveTestBuf ( char *testDir ) {
	//char *testDir = g_test.getTestDir();

	// ensure ./qa/ subdir exists. in qa.cpp
	makeQADir();

	// filename. snprintf so a long working dir cannot overrun fn
	char fn[100];
	snprintf(fn,sizeof(fn),"%s/%s/ips.txt",g_hostdb.m_dir, testDir);
	// set it
	File f;
	f.set ( fn );
	// open it
	f.open ( O_RDWR | O_CREAT );
	// how much to write?
	int32_t size = s_testBufPtr - s_testBuf;
	// write it out
	int32_t ws = f.write ( s_testBuf , size , 0 );
	// close it
	f.close();
	// complain?
	if ( ws != size )
		return log("test: failed to write %"INT32" bytes to %s",size,fn);
	// note it
	log("test: saved ips.txt");
	// ok
	return true;
}