Merge branch 'diffbot-testing' into testing

mwells 2014-04-05 12:34:46 -07:00
commit bd82145626
30 changed files with 705 additions and 105 deletions

View File

@ -838,8 +838,8 @@ bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
return true;
}
// get the CollectionRec for "test"
CollectionRec *cr = getRec ( coll ); // "test" );
// get the CollectionRec for "qatest123"
CollectionRec *cr = getRec ( coll ); // "qatest123" );
// must be there. if not, we create test i guess
if ( ! cr ) {
@ -972,6 +972,39 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
return true;
}
// moves a file by first trying rename, then copying since cross device renaming doesn't work
// returns 0 on success
int mv(char* src, char* dest) {
int status = rename( src , dest );
if (status == 0)
return 0;
FILE *fsrc, *fdest;
fsrc = fopen(src, "r");
if (fsrc == NULL)
return -1;
fdest = fopen(dest, "w");
if (fdest == NULL) {
fclose(fsrc);
return -1;
}
const int BUF_SIZE = 1024;
char buf[BUF_SIZE];
while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
int read = fread(buf, 1, BUF_SIZE, fsrc);
fwrite(buf, 1, read, fdest);
}
fclose(fsrc);
fclose(fdest);
if (ferror(fdest) || ferror(fsrc))
return -1;
remove(src);
return 0;
}
// . returns false if we need a re-call, true if we completed
// . returns true with g_errno set on error
bool Collectiondb::resetColl2( collnum_t oldCollnum,
@ -982,8 +1015,8 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// save parms in case we block
//we->m_purgeSeeds = purgeSeeds;
// now must be "test" only for now
//if ( strcmp(coll,"test") ) { char *xx=NULL;*xx=0; }
// now must be "qatest123" only for now
//if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
//if ( g_conf.m_spideringEnabled ||
@ -1018,6 +1051,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
//collnum_t oldCollnum = cr->m_collnum;
//collnum_t newCollnum = m_numRecs;
// in case of bulk job, be sure to save list of spots
// copy the existing list to a file in /tmp, from which it will later be transferred back into the new collection folder
char oldbulkurlsname[1036];
snprintf(oldbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)oldCollnum);
char newbulkurlsname[1036];
snprintf(newbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)newCollnum);
char tmpbulkurlsname[1036];
snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);
if (cr->m_isCustomCrawl == 2)
mv( oldbulkurlsname , tmpbulkurlsname );
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
@ -1127,6 +1172,9 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// save coll.conf to new directory
cr->save();
// be sure to copy back the bulk urls for bulk jobs
if (cr->m_isCustomCrawl == 2)
mv( tmpbulkurlsname, newbulkurlsname );
// and clear the robots.txt cache in case we recently spidered a
// robots.txt, we don't want to use it, we want to use the one we
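
For context on the copy fallback in the new mv() helper above: rename(2) cannot move a file across filesystems and fails with errno set to EXDEV, which matters here because /tmp is frequently a separate mount from the collection directory. Below is a minimal sketch of a stricter variant that only falls back to the copy path on EXDEV and reuses the mv() above; mvChecked is a hypothetical name and is not part of this commit.

#include <cstdio>   // rename
#include <cerrno>   // errno, EXDEV

// sketch: try rename first, copy only when src and dest are on
// different filesystems, otherwise report the real failure
int mvChecked ( char *src , char *dest ) {
	if ( rename ( src , dest ) == 0 ) return 0;
	// missing file, permissions, etc. -- do not mask with a copy
	if ( errno != EXDEV ) return -1;
	// cross-device: fall back to the copy+remove helper above
	return mv ( src , dest );
}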

View File

@ -1318,7 +1318,7 @@ sections. -- todo -- might be an alignment issue... check out later
// . make a whole new set of urls for pub date detection
// . grab that sample set from buzz wiki page
// . record the correct pub date for urls in the "test" coll and make sure
// . record the correct pub date for urls in the "qatest123" coll and make sure
// we get them each time, otherwise core dump!!
// . check the date we extract with the rss feed. that is a good test too!
// report on that accuracy in the logs and on the stats page.
@ -2428,7 +2428,7 @@ bool Dates::setPart1 ( //char *u ,
//if ( m_nw != words->m_numWords ) { char *xx=NULL; *xx=0; }
// . get the current time in utc
// . NO! to ensure the "test" collection re-injects docs exactly
// . NO! to ensure the "qatest123" collection re-injects docs exactly
// the same, use the spideredTime from the doc
// . we make sure to save this in the test subdir somehow..
//m_now = nd->m_spideredTime; // getTimeSynced();
@ -3283,7 +3283,7 @@ bool Dates::setPart1 ( //char *u ,
// DF_NOTCLOCK flags from this.
// . current time. sync'd with host #0 who uses ntp supposedly...! :(
// . to ensure that the "test" subdir re-injects docs exactly the
// . to ensure that the "qatest123" subdir re-injects docs exactly the
// same, we need to use this date now
long now = nd->m_spideredTime;
// how long has elapsed since we downloaded it last approx.?
@ -3294,7 +3294,8 @@ bool Dates::setPart1 ( //char *u ,
// might have been different than ours... actually i think our
// spiderdate.txt file had an older date in it from a previous round!
// so disable this when test spidering.
if ( elapsed<0 && g_conf.m_testSpiderEnabled && !strcmp(m_coll,"test"))
if ( elapsed<0 && g_conf.m_testSpiderEnabled && !strcmp(m_coll,
"qatest123"))
elapsed = 0;
// is true.
if ( elapsed < 0 ) {

View File

@ -167,7 +167,7 @@ case EFAKEFIRSTIP: return "Fake firstIp";
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Crawl type mismatch";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

View File

@ -445,7 +445,7 @@ bool Images::downloadImages () {
r->reset();
r->m_maxTextDocLen = 200000;
r->m_maxOtherDocLen = 500000;
if ( ! strcmp(cr->m_coll,"test")) {
if ( ! strcmp(cr->m_coll,"qatest123")) {
r->m_useTestCache = 1;
r->m_addToTestCache = 1;
}

View File

@ -433,7 +433,7 @@ char *JsonItem::getValueAsString ( long *valueLen ) {
// numbers...
static char s_numBuf[64];
if ( m_valueLong == (long)m_valueDouble ) {
if ( (float)m_valueLong == m_valueDouble ) {
*valueLen = sprintf ( s_numBuf,"%li", m_valueLong );
return s_numBuf;
}
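
The switched comparison above changes which JSON values take the integer-printing path. The old test compared the long against the truncated double, so a value like 3.7 (stored with m_valueLong == 3) matched and was printed as "3"; the new test only matches when the double really is integral. A small standalone sketch of the two predicates, with member names borrowed from JsonItem purely for illustration:

#include <cstdio>

int main ( ) {
	double m_valueDouble = 3.7;
	long   m_valueLong   = (long)m_valueDouble;  // 3
	// old predicate: true, so 3.7 would have printed as "3"
	printf ( "old: %d\n" , (int)( m_valueLong == (long)m_valueDouble ) );
	// new predicate: false, so it falls through and prints the double
	printf ( "new: %d\n" , (int)( (float)m_valueLong == m_valueDouble ) );
	return 0;
}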

View File

@ -3935,7 +3935,7 @@ LinkInfo *makeLinkInfo ( char *coll ,
// . how many unique ips link to us?
// . this count includes internal IPs as well
info->m_numUniqueIps = msg25->m_uniqueIps;
// keep things consistent for the "test" coll
// keep things consistent for the "qatest123" coll
info->m_reserved1 = 0;
info->m_reserved2 = 0;
// how many total GOOD inlinks we got. does not include internal cblock

View File

@ -57,7 +57,7 @@ OBJS = UdpSlot.o Rebalance.o \
PostQueryRerank.o Msge0.o Msge1.o \
CountryCode.o DailyMerge.o CatRec.o Tagdb.o \
Users.o Images.o Wiki.o Wiktionary.o Scraper.o \
Dates.o Sections.o SiteGetter.o Syncdb.o \
Dates.o Sections.o SiteGetter.o Syncdb.o qa.o \
Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
Cachedb.o Monitordb.o dlstubs.o PageCrawlBot.o Json.o PageBasic.o

View File

@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// for bulk jobs avoid actual downloads of the page for efficiency
if ( r->m_isCustomCrawl == 2 ) {
char *s =
"HTTP/1.0 200 (OK)\r\n"
"Content-Length: 0\r\n"
"Connection: Close\r\n"
"Content-Type: text/html\r\n\r\n";
long slen = gbstrlen(s);
long fakeBufSize = slen + 1;
char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
gotHttpReply2 ( r ,
fakeBuf,
fakeBufSize, // include \0
fakeBufSize, // allocsize
NULL ); // tcpsock
return;
}
// download it
if ( ! g_httpServer.getDoc ( r->m_url ,
r->m_urlIp ,
@ -1390,7 +1409,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
//
//
// . UTILITY FUNCTIONS for injecting into the "test" collection
// . UTILITY FUNCTIONS for injecting into the "qatest123" collection
// . we need to ensure that the web pages remain constant so we store them
//
//
@ -1400,7 +1419,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
// . now that we are lower level in Msg13.cpp, set "ts" not "slot"
bool getTestDoc ( char *u , TcpSocket *ts , Msg13Request *r ) {
// sanity check
//if ( strcmp(m_coll,"test") ) { char *xx=NULL;*xx=0; }
//if ( strcmp(m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// hash the url into 64 bits
long long h = hash64 ( u , gbstrlen(u) );
// read the spider date file first
@ -1547,7 +1566,7 @@ bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) {
return true;
}
// add it to our "test" subdir
// add it to our "qatest123" subdir
bool addTestDoc ( long long urlHash64 , char *httpReply , long httpReplySize ,
long err , Msg13Request *r ) {

View File

@ -32,6 +32,8 @@ public:
// if doing spider compression, compute contentHash32 of document
// downloaded, and if it matches this then send back EDOCUNCHANGED
long m_contentHash32;
// copy of CollectionRec::m_customCrawl: 0 for a regular collection, 1 for a crawl job, 2 for a bulk job
char m_isCustomCrawl;
// send back error ENOGOODDATE if it does not have one. but if
// harvestLinks is true, just send back a filtered list of links
long m_requireGoodDate:1;

View File

@ -159,7 +159,7 @@ public:
};
// . injecting into the "test" coll flushes after each inject
// . injecting into the "qatest123" coll flushes after each inject
// . returns false if blocked and callback will be called
bool flushMsg4Buffers ( void *state , void (* callback) (void *) ) {
// if all empty, return true now

View File

@ -859,9 +859,9 @@ bool Msg5::needsRecall ( ) {
if ( m_round == 0 ) logIt = false;
if ( logIt )
logf(LOG_DEBUG,"db: Reading %li again from %s (need %li total "
"got %li) this=0x%lx round=%li.",
"got %li) cn=%li this=0x%lx round=%li.",
m_newMinRecSizes , base->m_dbname , m_minRecSizes,
m_list->m_listSize, (long)this , m_round );
m_list->m_listSize, (long)m_collnum,(long)this, m_round );
m_round++;
// record how many screw ups we had so we know if it hurts performance
base->m_rdb->didReSeek ( );

View File

@ -116,7 +116,7 @@ bool Msge1::getFirstIps ( TagRec **grv ,
if ( ! launchRequests ( 0 ) ) return false;
// save it? might be a page parser
//if ( ! strcmp(m_coll,"test") ) saveTestBuf();
//if ( ! strcmp(m_coll,"qatest123") ) saveTestBuf();
// none blocked, we are done
return true;
@ -219,7 +219,7 @@ bool Msge1::launchRequests ( long starti ) {
/*
// look up in our m_testBuf.
if ( m_coll && ! strcmp(m_coll,"test") ) {
if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
bool found = false;
// do we got it?
long quickIp ; bool status = getTestIp ( p , &quickIp, &found);
@ -300,7 +300,7 @@ bool Msge1::sendMsgC ( long i , char *host , long hlen ) {
// look up in our m_testBuf.
if ( m_coll && ! strcmp(m_coll,"test") ) {
if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
bool found = false;
// shortcut
//char *p = m_urlPtrs[n];
@ -340,7 +340,7 @@ void gotMsgCWrapper ( void *state , long ip ) {
if ( ! THIS->launchRequests(i) ) return;
// . save it if we should. might be a page parser
// . mdw i uncommented this when we cored all the time
//if ( ! strcmp(THIS->m_coll,"test")) saveTestBuf();
//if ( ! strcmp(THIS->m_coll,"qatest123")) saveTestBuf();
// must be all done, call the callback
THIS->m_callback ( THIS->m_state );
}
@ -364,7 +364,7 @@ bool Msge1::doneSending ( long i ) {
// n, i, m_urls[i].getUrl() ,iptoa(ip));
// store it?
if ( ! strcmp(m_coll,"test") ) {
if ( ! strcmp(m_coll,"qatest123") ) {
// get host
long hlen = 0;
char *host = getHostFast ( m_urlPtrs[n] , &hlen );
@ -511,9 +511,9 @@ static char *s_last = NULL ;
static long s_lastLen = 0 ;
static HashTableX s_ht;
// . only call this if the collection is "test"
// . only call this if the collection is "qatest123"
// . we try to get the ip by accessing the "./test/ips.txt" file
// . we also add ips we look up to that file if the collection is "test"
// . we also add ips we look up to that file if the collection is "qatest123"
// . returns false and sets g_errno on error, true on success
bool getTestIp ( char *url , long *retIp , bool *found , long niceness ,
char *testDir ) {
@ -533,8 +533,8 @@ bool getTestIp ( char *url , long *retIp , bool *found , long niceness ,
// assume not found
*found = false;
// . if we are the "test" collection, check for "./test/ips.txt" file
// that gives us the ips of the given urls.
// . if we are the "qatest123" collection, check for "./test/ips.txt"
// file that gives us the ips of the given urls.
// . if we end up doing some lookups we should append to that file
if ( ! s_testBuf || s_needsReload ) {
// assume needs reload now

View File

@ -875,6 +875,19 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
);
// but default to csv
else {
if (cr && cr->m_isCustomCrawl == 1 && sreq && !sreq->m_isAddUrl && !sreq->m_isInjecting) {
if (cr->m_diffbotUrlCrawlPattern.m_length == 0
&& cr->m_diffbotUrlProcessPattern.m_length == 0) {
// If a crawl and there are no urlCrawlPattern or urlCrawlRegEx values, only return URLs from seed domain
if (sreq && !sreq->m_sameDom)
continue;
} else {
// TODO: if we get here, we have a crawl with a custom urlCrawlPattern and/or custom
// urlProcessPattern. We have to check if the current url matches the pattern
}
}
sb->safePrintf("\"%s\",\"%s\","
, sreq->m_url
, as
@ -2108,7 +2121,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
char bulkurlsfile[1024];
snprintf(bulkurlsfile, 1024, "%scoll.%s.%li/bulkurls.txt", g_hostdb.m_dir , coll , (long)st->m_collnum );
if ( spots ) {
if ( spots && cr && cr->m_isCustomCrawl == 2 ) {
log("crawlbot: got spots (len=%li) to add coll=%s (%li)",
(long)gbstrlen(spots),coll,(long)st->m_collnum);
FILE *f = fopen(bulkurlsfile, "w");
@ -2120,7 +2133,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
}
// if restart flag is on and the file with bulk urls exists, get spots from there
if ( !spots && restartColl ) {
if ( !spots && restartColl && cr && cr->m_isCustomCrawl ) {
FILE *f = fopen(bulkurlsfile, "r");
if (f != NULL) {
fseek(f, 0, SEEK_END);
@ -3946,7 +3959,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// . do not add dups into m_diffbotSeeds safebuf
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
long isInSeedBuf ( CollectionRec *cr , Url *url ) {
long isInSeedBuf ( CollectionRec *cr , char *url, int len ) {
HashTableX *ht = &cr->m_seedHashTable;
@ -3973,7 +3986,7 @@ long isInSeedBuf ( CollectionRec *cr , Url *url ) {
}
// is this url in the hash table?
long long u64 = hash64 ( url->getUrl() , url->getUrlLen() );
long long u64 = hash64 ( url, len );
if ( ht->isInTable ( &u64 ) ) return 1;
@ -4072,7 +4085,7 @@ bool getSpiderRequestMetaList ( char *doc ,
if ( ! cr ) continue;
// do not add dups into m_diffbotSeeds safebuf
long status = isInSeedBuf ( cr , &url );
long status = isInSeedBuf ( cr , saved , end - saved );
// error?
if ( status == -1 ) {

View File

@ -561,7 +561,7 @@ bool processLoop ( void *state ) {
// . save the ips.txt file if we are the test coll
// . saveTestBuf() is a function in Msge1.cpp
CollectionRec *cr = xd->getCollRec();
if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") )
if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123"))
// use same dir that XmlDoc::getTestDir() would use
saveTestBuf ( "test-page-parser" );
// now get the meta list, in the process it will print out a
@ -855,7 +855,7 @@ bool gotXmlDoc ( void *state ) {
// . save the ips.txt file if we are the test coll
// . saveTestBuf() is a function in Msge1.cpp
//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "test"))
//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123"))
// // use same dir that XmlDoc::getTestDir() would use
// saveTestBuf ( "test-page-parser" );

View File

@ -985,7 +985,7 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("\"currentTimeUTC\":%lu,\n", (long)(globalNowMS/1000));
}
// show response time
// show response time if not doing Quality Assurance
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<responseTimeMS>%lli</responseTimeMS>\n",
st->m_took);
@ -2148,8 +2148,13 @@ bool printResult ( State0 *st, long ix ) {
// so fix that shit here...
//float f = mr->m_lastSpidered;
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
sb->safePrintf(",\"lastCrawlTimeUTC\":%li}\n",
sb->safePrintf(",\"lastCrawlTimeUTC\":%li\n",
mr->m_lastSpidered);
// also include a timestamp field with an RFC 1123 formatted date
char timestamp[50];
struct tm *ptm = gmtime ( &mr->m_lastSpidered );
strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm);
sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp);
}
//mr->size_content );
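
The new "timestamp" field above is produced with gmtime() plus strftime(). Below is a minimal standalone sketch of the same formatting, assuming a time_t input (the patch passes the long m_lastSpidered directly, which works where long and time_t share a size). Note that %X is the locale's time representation (HH:MM:SS in the C locale) and %Z prints "GMT" for gmtime() on glibc, so the output is RFC 1123 style, e.g. "Sat, 05 Apr 2014 12:14:46 GMT".

#include <cstdio>
#include <ctime>

int main ( ) {
	time_t lastSpidered = 1396700086;          // any spider timestamp
	char timestamp[50];
	struct tm *ptm = gmtime ( &lastSpidered ); // UTC breakdown
	strftime ( timestamp , 50 , "%a, %d %b %Y %X %Z" , ptm );
	printf ( "\"timestamp\":\"%s\"\n" , timestamp );
	return 0;
}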

View File

@ -450,7 +450,7 @@ bool CommandParserTestInit ( char *rec ) {
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("test");
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// if we are not host 0, turn on spiders for testing
@ -470,7 +470,7 @@ bool CommandSpiderTestInit ( char *rec ) {
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("test");
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// if we are not host 0, turn on spiders for testing
@ -488,7 +488,7 @@ bool CommandSpiderTestCont ( char *rec ) {
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("test");
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// done
@ -5080,6 +5080,27 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "init QA tests";
m->m_desc = "If initiated gb performs some integrity tests "
"to ensure injecting, spidering and searching work "
"properly. Uses the ./test/ subdirectory. Injects "
"urls in ./test/inject.txt. Spiders urls "
"in ./test/spider.txt. "
"Each of those two files uses a simple format: "
"a url followed by the http reply received from the server "
"for that url. "
// TODO: generate these files
;
m->m_cgi = "qasptei";
m->m_type = TYPE_CMD;
m->m_func = CommandSpiderTestInit;
m->m_def = "1";
m->m_cast = 1;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "init parser test run";
m->m_desc = "If enabled gb injects the urls in the "
"./test-parser/urls.txt "
@ -15299,6 +15320,18 @@ void Parms::init ( ) {
m->m_smin = 0;
m++;
// when we do &qa=1 we do not show things like responseTime in
// search results so we can verify serp checksum consistency for QA
// in qa.cpp
m->m_title = "quality assurance";
m->m_desc = "This is 1 if doing a QA test in qa.cpp";
m->m_def = "0";
m->m_soff = (char *)&si.m_qa - y;
m->m_type = TYPE_CHAR;
m->m_sparm = 1;
m->m_scgi = "qa";
m++;
//m->m_title = "show turk forms";
//m->m_desc = "If enabled summaries in search results will be "
// "turkable input forms.";
@ -16744,7 +16777,6 @@ bool Parms::addCurrentParmToList2 ( SafeBuf *parmList ,
return true;
}
// returns false and sets g_errno on error
bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
long page ){
@ -18268,7 +18300,7 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
CollectionRec *cr = (CollectionRec *)THIS;
// if testUrl is provided, find in the table
char testUrl [ 1025 ];
char *tt = r->getString ( "test" , NULL );
char *tt = r->getString ( "qatest123" , NULL );
testUrl[0]='\0';
if ( tt ) strncpy ( testUrl , tt , 1024 );
char *tu = testUrl;

View File

@ -5158,7 +5158,7 @@ char *Proxy::storeLoginBar ( char *reply ,
}
// point to first digit in there
mp += 16;
// store our new content length as ascii into "test" buf
// store our new content length as ascii into test buf
char test[64];
long len = sprintf(test,"%li",(long)(newReplySize-mimeLen));
// find end

View File

@ -60,6 +60,7 @@ struct SafeBuf {
long fillFromFile(char *filename);
long fillFromFile(char *dir,char *filename);
long load(char *dir,char *fname) { return fillFromFile(dir,fname);};
long load(char *fname) { return fillFromFile(fname);};
void filterTags();
void filterQuotes();

View File

@ -179,6 +179,9 @@ class SearchInput {
long m_queryMatchOffsets;
long m_summaryMode;
// are we doing a QA query for quality assurance consistency
char m_qa;
float m_pqr_demFactSubPhrase;
float m_pqr_demFactCommonInlinks;
float m_pqr_demFactLocTitle;

View File

@ -1288,7 +1288,7 @@ bool Sections::set ( Words *w ,
}
m_isTestColl = ! strcmp(m_coll,"test") ;
m_isTestColl = ! strcmp(m_coll,"qatest123") ;
//
//

View File

@ -146,8 +146,8 @@ bool SiteGetter::getSite ( char *url ,
long age = -1;
//long now = getTimeGlobal();
//if ( tag ) age = now - tag->m_timestamp;
// to parse consistently for the qa test "test" coll use "timestamp"
// as the "current time"
// to parse consistently for the qa test "qatest123" coll use
// "timestamp" as the "current time"
if ( tag ) age = timestamp - tag->m_timestamp;
// if there, at least get it (might be -1)
if ( tag ) m_oldSitePathDepth = atol ( tag->getTagData() );
@ -534,7 +534,7 @@ bool SiteGetter::setSite ( ) {
//TagRec gr;
m_addedTag.addTag ( "sitepathdepth" ,
// now XmlDoc must provide it to ensure that
// injects into the "test" coll are consistent
// injects into the "qatest123" coll are consistent
m_timestamp ,//getTime()// use now as timestamp
"sitegit" , // username
0 , // ip

View File

@ -1082,7 +1082,7 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
// save this
strcpy ( sc->m_coll , cr->m_coll );
// set this
if ( ! strcmp ( cr->m_coll,"test" ) ) sc->m_isTestColl = true;
if ( ! strcmp ( cr->m_coll,"qatest123" ) ) sc->m_isTestColl = true;
else sc->m_isTestColl = false;
// set first doledb scan key
@ -6761,12 +6761,12 @@ bool SpiderLoop::spiderUrl2 ( ) {
char *coll = "collnumwasinvalid";
if ( cr ) coll = cr->m_coll;
// . pass in a pbuf if this is the "test" collection
// . pass in a pbuf if this is the "qatest123" collection
// . we will dump the SafeBuf output into a file in the
// test subdir for comparison with previous versions of gb
// in order to see what changed
SafeBuf *pbuf = NULL;
if ( !strcmp( coll,"test") && g_conf.m_testParserEnabled )
if ( !strcmp( coll,"qatest123") && g_conf.m_testParserEnabled )
pbuf = &xd->m_sbuf;
//
@ -6969,10 +6969,10 @@ bool SpiderLoop::indexedDoc ( XmlDoc *xd ) {
bool respider = false;
if ( xd->m_oldDocValid && xd->m_oldDoc ) respider = true;
// . dump it out to a file in the "test" subdir
// . dump it out to a file in the "qatest123" subdir
// . but only the first time we spider it...
/*
if ( ! strcmp(xd->m_coll,"test") && ! respider &&
if ( ! strcmp(xd->m_coll,"qatest123") && ! respider &&
// no longer need this when qa testing spider, not parser
g_conf.m_testParserEnabled ) {
// save the buffers

View File

@ -3103,7 +3103,7 @@ void TagRec::gotAllReplies ( ) {
// site getter sometimes adds recs to tagdb to add in a new subsite
// it finds... i'd imagine this will create a parsing inconsistency
// when injecting docs into the "test" coll... but oh well!
// when injecting docs into the "qatest123" coll... but oh well!
long timestamp = getTimeGlobal();
// . begin the "inheritance loop"
@ -3288,7 +3288,7 @@ bool Msg9a::addTags ( char *sites ,
// when we add the "site" tag to it use the timestamp from one
// of the tags we are adding... therefore we must require there be
// some tags! we do this to ensure injection consistency into the
// "test" collection.
// "qatest123" collection.
if ( ! tagRec || tagRec->getNumTags() <= 0 ) { char *xx=NULL;*xx=0; }
// use the first timestamp

View File

@ -55,7 +55,7 @@ bool Test::init ( ) {
}
void Test::reset ( ) {
if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test");
if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test999");
//m_spiderLinks = true;//false;
m_bypassMenuElimination = false;
}
@ -122,7 +122,7 @@ void Test::removeFiles ( ) {
long saved = g_conf.m_useQuickpoll;
g_conf.m_useQuickpoll = false;
CollectionRec *cr = g_collectiondb.getRec("test");
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// . reset the qatest collection to zero docs
// . TODO: implement this. only allow it for qatest coll.
@ -172,8 +172,8 @@ void Test::initTestRun ( ) {
//if ( m_testSpiderEnabledSaved ) return;
//if ( m_testParserEnabledSaved ) return;
// you must have the "test" coll already setup!
CollectionRec *cr = g_collectiondb.getRec("test");
// you must have the "qatest123" coll already setup!
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) {
// note it
log("test: please add a collection named \"test\" first.");
@ -233,7 +233,7 @@ void Test::initTestRun ( ) {
// save it
m_runId = i;
cr = g_collectiondb.getRec ( "test" );
cr = g_collectiondb.getRec ( "qatest123" );
if ( ! cr ) {
// and no more of this
g_conf.m_testParserEnabled = false;

View File

@ -71,6 +71,7 @@ void Title::reset() {
mfree ( m_title , m_titleAllocSize , "Title" );
m_title = NULL;
m_titleBytes = 0;
m_titleAllocSize = 0;
m_query = NULL;
m_titleTagStart = -1;
m_titleTagEnd = -1;
@ -113,7 +114,7 @@ bool Title::setTitle ( XmlDoc *xd ,
char *val = NULL;
// look for the "title:" field in json then use that
SafeBuf jsonTitle;
long vlen;
long vlen = 0;
if ( xd->m_contentType == CT_JSON ) {
char *jt;
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
@ -124,7 +125,6 @@ bool Title::setTitle ( XmlDoc *xd ,
val = jsonTitle.getBufStart();
vlen = jsonTitle.length();
}
}
// if we had a title: field in the json...
if ( val && vlen > 0 ) {
@ -135,6 +135,7 @@ bool Title::setTitle ( XmlDoc *xd ,
else {
dst = (char *)mmalloc ( m_titleBytes+1,"titdst" );
if ( ! dst ) return false;
m_titleAllocSize = m_titleBytes+1;
}
m_title = dst;
memcpy ( dst , val , m_titleBytes );
@ -142,6 +143,13 @@ bool Title::setTitle ( XmlDoc *xd ,
return true;
}
// json content, if has no explicit title field, has no title then
if ( xd->m_contentType == CT_JSON ) {
m_localBuf[0] = '\0';
m_title = m_localBuf;
m_titleBytes = 0;
return true;
}
bool status = setTitle4 ( xd ,
xml ,

View File

@ -879,8 +879,8 @@ bool XmlDoc::set1 ( char *url ,
char *XmlDoc::getTestDir ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// return NULL if we are not the "test" collection
if ( strcmp(cr->m_coll,"test") ) return NULL;
// return NULL if we are not the "qatest123" collection
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
// then return "test-spider" otherwise...
if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
@ -914,7 +914,7 @@ long XmlDoc::getSpideredTime ( ) {
if ( ! cr ) return 0;
// if not test collection keep it simple
if ( strcmp(cr->m_coll,"test") ) {
if ( strcmp(cr->m_coll,"qatest123") ) {
// . set spider time to current time
// . this might already be valid if we set it in
// getTestSpideredDate()
@ -3295,13 +3295,13 @@ char *XmlDoc::prepareToMakeTitleRec ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if we are injecting into the "test" coll, then we need to have
// if we are injecting into the "qatest123" coll, then we need to have
// m_spideredTimeValid be true before calling getIsSpam() which calls
// getSiteNumInlinks() which adds tags to tagdb using that date, but
// only for the "test" coll! that keeps our parser output consistent
// across runs!
// only for the "qatest123" coll!
// that keeps our parser output consistent across runs!
char **content = NULL;
if ( ! strcmp ( cr->m_coll,"test") ) {
if ( ! strcmp ( cr->m_coll,"qatest123") ) {
content = getContent ( );
if ( ! content || content == (void *)-1 )
return (char *)content;
@ -11842,7 +11842,7 @@ long *XmlDoc::getSiteNumInlinks ( ) {
// current time
long now = getTimeGlobal();
// use the spidered time for the test collection for consistency
if ( !strcmp(cr->m_coll,"test") ) {
if ( !strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
now = getSpideredTime();//m_spideredTime;
}
@ -12061,8 +12061,8 @@ LinkInfo *XmlDoc::getSiteLinkInfo() {
// get from spider request if there
//bool injected = false;
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
// but be consistent if doing the "test" collection
if ( ! strcmp(cr->m_coll,"test") ) {
// but be consistent if doing the "qatest123" collection
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
lastUpdateTime = getSpideredTime();//m_spideredTime;
}
@ -12164,14 +12164,14 @@ long *XmlDoc::getIp ( ) {
if ( ! cr ) return NULL;
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// when building the "test" collection try to get the ip from
// when building the "qatest123" collection try to get the ip from
// "./test/ips.txt" so our injections are consistent every time
// Test.cpp runs its injection loop into the "test" collection
// Test.cpp runs its injection loop into the "qatest123" collection
if ( useTestCache ) { // && m_useIpsTxtFile ) {
// stolen from msgc.cpp:
// if url is already in a.b.c.d format return that
@ -12204,7 +12204,7 @@ long *XmlDoc::getIp ( ) {
// this basically slows the spider down.
long delay = cr->m_spiderDelayInMilliseconds;
// ignore for testing
if ( ! strcmp(cr->m_coll,"test") ) delay = 0;
if ( ! strcmp(cr->m_coll,"qatest123") ) delay = 0;
// injected?
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
@ -12281,14 +12281,14 @@ long *XmlDoc::gotIp ( bool save ) {
if ( ! cr ) return NULL;
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// when building the "test" collection try to get the ip from
// when building the "qatest123" collection try to get the ip from
// "./test/ips.txt" so our injections are consistent every time
// Test.cpp runs its injection loop into the "test" collection
// Test.cpp runs its injection loop into the "qatest123" collection
if ( save && useTestCache ) {
// ip of 0 means NXDOMAIN i think (-1 means error)
//if ( m_ip == 0 ) {
@ -12592,8 +12592,8 @@ bool *XmlDoc::getIsAllowed ( ) {
return &m_isAllowed;
}
// or if using the "test" collection, assume yes!
//if ( ! strcmp ( m_coll , "test" ) ) {
// or if using the "qatest123" collection, assume yes!
//if ( ! strcmp ( m_coll , "qatest123" ) ) {
// m_isAllowed = true;
// m_isAllowedValid = true;
// return &m_isAllowed;
@ -12939,8 +12939,8 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
if ( ! m_calledMsg25 ) {
// get this
long lastUpdateTime = getTimeGlobal();
// but be consistent if doing the "test" collection
if ( ! strcmp(cr->m_coll,"test") ) {
// but be consistent if doing the "qatest123" collection
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
lastUpdateTime = getSpideredTime();//m_spideredTime;
}
@ -14184,7 +14184,7 @@ char **XmlDoc::getHttpReply ( ) {
// come back up here if a redirect invalidates it
loop:
// sanity test -- only if not the test collection (NO, might be EBADIP)
//if ( m_indexCode && strcmp(m_coll,"test") ) { char *xx=NULL;*xx=0; }
//if ( m_indexCode && strcmp(m_coll,"qatest123")){char*xx=NULL;*xx=0;}
// get the http reply
char **replyPtr = getHttpReply2();
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr;
@ -14382,7 +14382,7 @@ char **XmlDoc::getHttpReply2 ( ) {
// return gotHttpReply ( );
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
@ -14474,11 +14474,12 @@ char **XmlDoc::getHttpReply2 ( ) {
// turn off
r->m_useCompressionProxy = false;
r->m_compressReply = false;
r->m_isCustomCrawl = cr->m_isCustomCrawl;
// set it for this too
if ( g_conf.m_useCompressionProxy &&
// do not use for the test collection ever, that is qa'ing
strcmp(cr->m_coll,"test") ) {
strcmp(cr->m_coll,"qatest123") ) {
r->m_useCompressionProxy = true;
r->m_compressReply = true;
}
@ -14539,7 +14540,7 @@ char **XmlDoc::getHttpReply2 ( ) {
// . msg13 uses XmlDoc::getHttpReply() function to handle
// redirects, etc.? no...
bool isTestColl = false;
if ( ! strcmp(cr->m_coll,"test") ) isTestColl = true;
if ( ! strcmp(cr->m_coll,"qatest123") ) isTestColl = true;
// sanity check. keep injections fast. no downloading!
if ( m_wasInjected ) {
@ -14613,7 +14614,7 @@ char **XmlDoc::gotHttpReply ( ) {
// . i.e. what are you doing downloading the page if there was
// a problem with the page we already know about
if ( m_indexCode && m_indexCodeValid &&
strcmp(cr->m_coll,"test") ) { char *xx=NULL;*xx=0; }
strcmp(cr->m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// fix this
if ( saved == EDOCUNCHANGED ) {
@ -17207,6 +17208,8 @@ long *XmlDoc::getContentHashJson32 ( ) {
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
char *topName = NULL;
// what name level are we?
long numNames = 1;
JsonItem *pi = ji->m_parent;
@ -17214,6 +17217,7 @@ long *XmlDoc::getContentHashJson32 ( ) {
// empty name?
if ( ! pi->m_name ) continue;
if ( ! pi->m_name[0] ) continue;
topName = pi->m_name;
numNames++;
}
@ -17232,6 +17236,22 @@ long *XmlDoc::getContentHashJson32 ( ) {
strcmp(ji->m_name,"resolved_url") == 0 )
continue;
if ( topName && strcmp(topName,"stats") == 0 )
continue;
if ( topName && strcmp(topName,"queryString") == 0 )
continue;
if ( topName && strcmp(topName,"nextPages") == 0 )
continue;
if ( topName && strcmp(topName,"textAnalysis") == 0 )
continue;
if ( topName && strcmp(topName,"links") == 0 )
continue;
// hash the fully compound name
long nameHash32 = 0;
JsonItem *p = ji;
@ -17607,7 +17627,7 @@ long **XmlDoc::getOutlinkFirstIpVector () {
if ( ! cr ) return NULL;
// . go get it
// . if coll is "test" then try to use the file ./test/ips.txt to
// . if coll is "qatest123" then try to use the file ./test/ips.txt to
// see if the ip is in there for the given url hostname
// . this will now update Tagdb with the "firstip" tags if it should!!
// . this just dns looks up the DOMAINS of each outlink because these
@ -17747,7 +17767,7 @@ long *XmlDoc::getUrlFilterNum ( ) {
// . look it up
// . use the old spidered date for "nowGlobal" so we can be consistent
// for injecting into the "test" coll
// for injecting into the "qatest123" coll
long ufn = ::getUrlFilterNum ( oldsr,&fakeReply,spideredTime,false,
m_niceness,cr,
false, // isOutlink?
@ -18754,7 +18774,7 @@ bool XmlDoc::doConsistencyTest ( bool forceTest ) {
return true;
// if not test coll skip this
//if ( strcmp(cr->m_coll,"test") ) return true;
//if ( strcmp(cr->m_coll,"qatest123") ) return true;
// title rec is null if we are reindexing an old doc
// and "unchanged" was true.
@ -19200,7 +19220,7 @@ void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
else if ( rdbId == RDB_TITLEDB ) {
//XmlDoc tr;
//SafeBuf tmp;
//tr.set2 ( rec,recSize ,"test",&tmp,m_niceness);
//tr.set2 ( rec,recSize ,"qatest123",&tmp,m_niceness);
// print each offset and size for the variable crap
sb->safePrintf("<td><nobr>titlerec datasize=%li "
//"sizeofxmldoc=%li "
@ -19273,7 +19293,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
if ( ! cr ) return true;
// do not do this if not test collection for now
if ( strcmp(cr->m_coll,"test") ) return true;
if ( strcmp(cr->m_coll,"qatest123") ) return true;
// store each record in the list into the send buffers
for ( ; p < pend ; ) {
@ -22437,7 +22457,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// . set other fields besides key
// . crap! if we are the "test" collection then m_spideredTime
// . crap! if we are the "qatest123" collection then m_spideredTime
// was read from disk usually and is way in the past! watch out!!
m_srep.m_spideredTime = getSpideredTime();//m_spideredTime;
@ -22447,7 +22467,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// crap, for the test coll this is often a very old time and it
// causes the spider request to be repeatedly executed, so let's
// fix that
if ( ! strcmp(cr->m_coll,"test") )
if ( ! strcmp(cr->m_coll,"qatest123") )
m_srep.m_spideredTime = getTimeGlobal();
@ -23031,7 +23051,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
if ( ! cr ) return NULL;
// do not do this if not test collection for now
bool isTestColl = (! strcmp(cr->m_coll,"test") );
bool isTestColl = (! strcmp(cr->m_coll,"qatest123") );
// turn off for now
isTestColl = false;
@ -30297,6 +30317,9 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
// . this now allows for commas in numbers like "1,500.62"
float f = atof2 ( p , bufEnd - p );
// debug
//log("build: hashing %s %f",hi->m_prefix,f);
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
return false;
@ -33687,7 +33710,7 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
long now = getTimeGlobal();
// actually, use spider download time if we can. that way
// Test.cpp's injection runs will be more consistent!
if ( ! strcmp(cr->m_coll,"test") ) {
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
now = getSpideredTime();//m_spideredTime;
}
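
On the getContentHashJson32() change above: the added skips keep volatile top-level diffbot objects (stats, queryString, nextPages, textAnalysis, links) out of the 32-bit content hash, so two downloads of an otherwise unchanged document produce the same hash. A small illustrative helper expressing the same skip list; the function name is hypothetical and not part of the commit.

#include <cstring>

// sketch: true if a top-level json field should be excluded from the
// content hash because its value varies from crawl to crawl
static bool isVolatileTopLevelField ( const char *topName ) {
	if ( ! topName ) return false;
	const char *skip[] = { "stats" , "queryString" , "nextPages" ,
	                       "textAnalysis" , "links" };
	for ( unsigned i = 0 ; i < sizeof(skip)/sizeof(skip[0]) ; i++ )
		if ( strcmp ( topName , skip[i] ) == 0 ) return true;
	return false;
}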

View File

@ -104,7 +104,7 @@ void timeWrapper ( int fd , void *state ) {
// bail if too many launched
if ( s_count >= s_max ) return;
// new state
StateT *st = (StateT *)mmalloc ( sizeof(StateT) , "test" );
StateT *st = (StateT *)mmalloc ( sizeof(StateT) , "dnstest" );
// get url from stdin into buf
char *p = st->m_buf;
if ( ! fgets ( p , 1023 , stdin ) ) exit ( 0 );
@ -147,6 +147,6 @@ void dnsWrapper ( void *state , long ip ) {
st->m_buf , iptoa(ip) , mstrerror(g_errno));
//if ( g_errno == ETRYAGAIN )
// log("hey");
mfree ( st , sizeof(StateT), "test" );
mfree ( st , sizeof(StateT), "dnstest" );
s_count--;
}

View File

@ -5675,7 +5675,7 @@ void zlibtest() {
// malloc 1,000 bufs of size about 100-64k each
for ( long i = 0 ; i < 100 ; i++ ) {
long bufSize = 1000 + (rand() % 65000);
ptrs[i] = (char *)mmalloc ( bufSize , "test" );
ptrs[i] = (char *)mmalloc ( bufSize , "ztest" );
if ( ! ptrs[i] ) {
log("no mem!"); exit(-1); }
lens[i] = bufSize;
@ -5685,7 +5685,7 @@ void zlibtest() {
}
// now free them
for ( long i = 0 ; i < 100 ; i++ )
mfree (ptrs[i] , lens[i] , "test" );
mfree (ptrs[i] , lens[i] , "ztest" );
}
}
*/
@ -11550,8 +11550,8 @@ bool parseTest ( char *coll , long long docId , char *query ) {
// speed test
t = gettimeofdayInMilliseconds();
for ( long k = 0 ; k < 100 ; k++ ) {
char *mm = (char *)mmalloc ( 300*1024 , "test");
mfree ( mm , 300*1024 ,"test");
char *mm = (char *)mmalloc ( 300*1024 , "ztest");
mfree ( mm , 300*1024 ,"ztest");
}
e = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,"build: Took %.3f ms to do mallocs.",
@ -14828,7 +14828,7 @@ bool cacheTest() {
false , // support lists of recs?
maxCacheNodes ,
false , // use half keys?
"test" , // dbname
"cachetest" , // dbname
false )) // save cache to disk?
return log("test: Cache init failed.");
@ -14901,7 +14901,7 @@ bool cacheTest() {
false , // support lists of recs?
maxCacheNodes ,
false , // use half keys?
"test" , // dbname
"cachetest" , // dbname
false )) // save cache to disk?
return log("test: Cache init failed.");

View File

@ -233,7 +233,6 @@ long g_qn = 0;
char *g_queries[] = {
//"buzzlogic",
//"test",
"broncos",
"ibm",
"yahoo",

qa.cpp (new file, 446 lines)
View File

@ -0,0 +1,446 @@
#include <string.h>
#include "SafeBuf.h"
#include "HttpServer.h"
static long s_failures = 0;
bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
SafeBuf sb;
sb.safePrintf ( "http://%s:%li%s"
, iptoa(g_hostdb.m_myHost->m_ip)
, (long)g_hostdb.m_myHost->m_port
, path
);
Url u;
u.set ( sb.getBufStart() );
if ( ! g_httpServer.getDoc ( u.getUrl() ,
0 , // ip
0 , // offset
-1 , // size
0 , // ifmodsince
NULL ,
callback ,
60*1000, // timeout
0, // proxyip
0, // proxyport
-1, // maxtextdoclen
-1, // maxotherdoclen
NULL ) ) // useragent
return false;
// error?
log("qa: getUrl error: %s",mstrerror(g_errno));
return true;
}
bool qatest ( ) ;
void qatestWrapper ( void *state , TcpSocket *sock ) { qatest(); }
// return false if blocked, true otherwise
bool addColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addcoll?c=qatest123" , qatestWrapper );
}
// first inject a set list of urls
static char **s_urlPtrs = NULL;
static long s_numUrls = 0;
static SafeBuf s_ubuf1;
static SafeBuf s_ubuf2;
bool loadUrls ( ) {
static bool s_loaded = false;
if ( s_loaded ) return true;
// only load the url list once
s_loaded = true;
// use injectme3 file
s_ubuf1.load("./injectme3");
// scan for +++URL: xxxxx
char *s = s_ubuf1.getBufStart();
for ( ; *s ; s++ ) {
if ( strncmp(s,"+++URL: ",8) ) continue;
// got one
// find end of it
s += 8;
char *e = s;
for ( ; *e && ! is_wspace_a(*e); e++ );
// null term it
if ( *e ) *e = '\0';
// store ptr
s_ubuf2.pushLong((long)s);
// skip past that
s = e;
}
// make array of url ptrs
s_urlPtrs = (char **)s_ubuf2.getBufStart();
// one long was pushed per url above, so count them
s_numUrls = s_ubuf2.length() / sizeof(long);
return true;
}
bool injectUrls ( ) {
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_numUrls ; ) {
// pre-inc it
s_ii++;
// inject using html api
SafeBuf sb;
sb.safePrintf("/admin/inject?c=qatest123&delete=0&u=");
sb.urlEncode ( s_urlPtrs[s_ii-1] ); // s_ii was pre-incremented above
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
static char *s_queries[] = {
"the",
"+the",
"cats",
"+cats dog",
"+cats +dog",
"cat OR dog",
"cat AND dog",
"cat AND NOT dog",
"NOT cat AND NOT dog",
"cat -dog",
"site:wisc.edu"
};
static long s_checksums[] = {
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
};
static long s_qi1 = 0;
void doneSearching1 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi1 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}
// ensure search results are consistent
bool searchTest1 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi1 < nq ; ) {
// pre-inc it
s_qi1++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
sb.urlEncode ( s_queries[s_qi1-1] ); // s_qi1 was pre-incremented above
return getUrl ( sb.getBufStart() , doneSearching1 );
}
return true;
}
static long s_qi2 = 0;
void doneSearching2 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi2 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}
// ensure search results are consistent
bool searchTest2 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi2 < nq ; ) {
// pre-inc it
s_qi2++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
sb.urlEncode ( s_queries[s_qi2-1] ); // s_qi2 was pre-incremented above
return getUrl ( sb.getBufStart() , doneSearching2 );
}
return true;
}
bool deleteUrls ( ) {
static long s_ii2 = 0;
for ( ; s_ii2 < s_numUrls ; ) {
// pre-inc it
s_ii2++;
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
sb.urlEncode ( s_urlPtrs[s_ii2-1] ); // s_ii2 was pre-incremented above
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
#include "Msg0.h"
static Msg0 s_msg0;
static RdbList s_list;
void gotList33 ( void *state ) {
long *rdbId = (long *)state;
if ( ! s_list.isEmpty() ) {
log("qa: delete failed. list is not empty rdbid=%li.",*rdbId);
s_failures++;
}
// resume main loop
qatest();
}
// scan all Rdb databases and ensure no recs (it was a clean delete)
bool checkRdbLists ( long *rdbId ) {
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) return true;
collnum_t cn = cr->m_collnum;
for ( ; *rdbId < RDB_END ; ) {
// pre-inc it
*rdbId = *rdbId + 1;
char minKey[MAX_KEY_BYTES];
char maxKey[MAX_KEY_BYTES];
KEYMIN(minKey,MAX_KEY_BYTES);
KEYMAX(maxKey,MAX_KEY_BYTES);
if ( ! s_msg0.getList ( 0 , // hostid
0 , // ip
0 , // port
0 , // cacheage
false, // addtocache
*rdbId , // rdbid
cn , // collnum
&s_list ,
minKey ,
maxKey ,
1000 , // minrecsizes
rdbId , // state
gotList33,
0 // niceness
) )
return false;
}
return true;
}
// once we have triggered the dump this will cause all rdbs to tightmerge
void doneDumping ( void *state , TcpSocket *sock ) {
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return; }
// tight merge the rdb that was dumped
for ( long i = 0 ; i < RDB_END ; i++ ) {
Rdb *rdb = getRdbFromId ( i );
if ( ! rdb ) continue;
RdbBase *base = rdb->getBase ( cr->m_collnum );
if ( ! base ) continue;
// . force a tight merge as soon as dump completes
// . the dump should already be going
base->m_nextMergeForced = true;
}
// wait for tight merges to complete now
qatest();
}
bool dumpTreesToDisk () {
static bool s_done = false;
if ( s_done ) return true;
s_done = true;
// force dump data to disk. dumps all rdbs.
return getUrl("/admin/master?dump=1",doneDumping );
}
void doneAddingUrls ( void *state ) {
qatest();
}
void sleepCallback ( int fd , void *state ) {
qatest();
}
// check every second to see if merges are done
bool waitForMergeToFinish ( ) {
// if registered
static bool s_registered = false;
if ( s_registered ) {
g_loop.unregisterSleepCallback ( NULL , sleepCallback );
s_registered = false;
}
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return true; }
// tight merge the rdb that was dumped
long i; for ( i = 0 ; i < RDB_END ; i++ ) {
Rdb *rdb = getRdbFromId ( i );
if ( ! rdb ) continue;
RdbBase *base = rdb->getBase ( cr->m_collnum );
if ( ! base ) continue;
// . force a tight merge as soon as dump completes
// . the dump should already be going
if ( base->m_nextMergeForced ) return false;
// still waiting on this merge
break;
}
// if not still waiting return true
if ( i >= RDB_END ) return true;
// sleep for 1 second
g_loop.registerSleepCallback ( 1000 , // 1000 ms
NULL , // state
sleepCallback ,
0 ); // niceness
s_registered = true;
return false;
}
bool resetColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
// also turn spiders on
return getUrl("/admin/master?reset=qatest123&se=1", qatestWrapper );
}
bool addUrlTest ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addurl"
"?c=qatest123&u=www.dmoz.org+www.ibm.com+"
"www.diffbot.com"
, qatestWrapper );
}
// check every second to see if spidering phase is completed
bool checkSpidersDone ( ) {
// if registered
static bool s_registered = false;
if ( s_registered ) {
g_loop.unregisterSleepCallback ( NULL , sleepCallback );
s_registered = false;
}
// we have to adjust this once we know how many pages we'll archive
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return true; }
// return true if all done
if ( cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound >= 200 )
return true;
// sleep for 1 second
g_loop.registerSleepCallback ( 1000 , // 1000 ms
NULL , // state
sleepCallback ,
0 ); // niceness
s_registered = true;
return false;
}
bool delColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/delcoll?c=qatest123" , qatestWrapper );
}
static long s_rdbId1 = 0;
static long s_rdbId2 = 0;
//static long s_rdbId3 = 0;
// . run a series of tests to ensure that gb is functioning properly
// . use s_urls[] array of urls for injecting and spider seeding
// . contain an archive copy of all webpages in the injectme3 file and
// in pagearchive1.txt file
// . while initially spidering store pages in pagearchive1.txt so we can
// replay later. store up to 100,000 pages in there.
bool qatest ( ) {
// add the 'qatest123' collection
if ( ! addColl () ) return false;
// inject urls, return false if not done yet
if ( ! injectUrls ( ) ) return false;
// test search results
if ( ! searchTest1 () ) return false;
// delete all urls cleanly now
if ( ! deleteUrls ( ) ) return false;
// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId1 ) ) return false;
// dump, tight merge and ensure no data in our rdbs for this coll
if ( ! dumpTreesToDisk() ) return false;
// wait for tight merge to complete
if ( ! waitForMergeToFinish() ) return false;
// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId2 ) ) return false;
// reset the collection so we can test spidering
if ( ! resetColl ( ) ) return false;
// add urls to seed spider with. make msg13.cpp recognize qatest123
// collection and return 404 on urls not in our official list so
// we can ensure search result consistency. msg13.cpp will initially
// store the pages in a file, like the first 1,000 or so pages.
if ( ! addUrlTest () ) return false;
// wait for spidering to complete. sleep callback. # of spidered urls
// will be x, so we know when to stop
if ( ! checkSpidersDone() ) return false;
// . now search again on the large collection most likely
// . store search queries and checksum into queries2.txt
// . a 0 (or no) checksum means we should fill it in
if ( ! searchTest2 () ) return false;
// try a query delete
//if ( ! queryDeleteTest() ) return false;
// ensure empty
//if ( ! checkRdbLists ( &s_rdbId3 ) ) return false;
// delete the collection
if ( ! delColl() ) return false;
return true;
}
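
All of the steps in qatest() follow the blocking convention used throughout gb: a step returns false when it has launched a request and will re-enter qatest() from its callback, and true when it has nothing left to do, so the sequence resumes wherever it left off each time qatest() is re-called. A hypothetical hook showing how the sequence could be kicked off from a parm command; CommandQATest is an assumed name, since this commit wires the "init QA tests" parm to CommandSpiderTestInit.

// hypothetical glue, not part of this commit
bool CommandQATest ( char *rec ) {
	// start (or resume) the QA sequence; any step that blocks will
	// call qatest() again from its own completion callback
	qatest();
	return true;
}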