open-source-search-engine/qa.cpp

686 lines
14 KiB
C++
Raw Normal View History

2014-04-05 22:33:42 +04:00
#include <string.h>
#include "SafeBuf.h"
#include "HttpServer.h"
// checksum the next reply's content is expected to hash to. set by getUrl()
// and compared in gotReplyWrapper(); 0 lets a mismatch slide by ("learning
// mode") instead of exiting.
static long s_expectedCRC = 0;
2014-07-11 03:42:22 +04:00
// forward declaration; qatest() is defined further down
bool qatest ( ) ;
// after we got the reply and verified expected crc, call the callback
// . this is the qa function to resume: qainject(), qaspider() or qatest()
// . each of those installs itself here only if this is still NULL
static bool (*s_callback)() = NULL;
2014-04-05 22:33:42 +04:00
// first inject a set list of urls
// . all of these are filled in by loadUrls()
// . s_urlPtrs[i] is the i'th url and s_contentPtrs[i] the text following it
static char **s_urlPtrs = NULL;
static char **s_contentPtrs = NULL;
// raw contents of the ./injectme3 file; the ptrs above point into it
static SafeBuf s_ubuf1;
// backing storage for the s_urlPtrs array
static SafeBuf s_ubuf2;
// backing storage for the s_contentPtrs array
static SafeBuf s_cbuf2;
2014-04-05 22:33:42 +04:00
2014-07-07 03:47:04 +04:00
// . find the first occurrence of "needle" in "reply" and overwrite the first
//   run of digits at or after it with spaces, in place
// . used to neutralize values that change from run to run before hashing
// . we space the digits out rather than zero them because a zeroed "10" and
//   a zeroed "9" would still differ in LENGTH; qa_hash32() collapses runs
//   of spaces so the lengths no longer matter
// . does nothing if "reply" is NULL or "needle" is not found
void markOut ( char *reply , char *needle ) {
	// no reply, nothing to mark
	if ( ! reply ) return;
	char *p = strstr ( reply , needle );
	if ( ! p ) return;
	// advance to the first digit
	while ( *p && ! is_digit(*p) ) p++;
	// bury the digit stream under spaces
	while ( *p && is_digit(*p) ) {
		*p = ' ';
		p++;
	}
}
// do not hash
2014-07-07 09:06:33 +04:00
// . hash the NUL-terminated string "s" using the g_hashtab table
// . a space that directly follows another space is skipped entirely, so any
//   run of spaces hashes the same as a single space
// . the table row is selected by the count of characters hashed so far
//   (wrapped to a byte), not by the raw offset into "s"
long qa_hash32 ( char *s ) {
	unsigned long hash = 0;
	// number of characters folded into the hash so far
	long numHashed = 0;
	for ( char *p = s ; *p ; p++ ) {
		// a space right after a space contributes nothing
		bool repeatedSpace = ( *p == ' ' && p > s && p[-1] == ' ' );
		if ( repeatedSpace ) continue;
		hash ^= g_hashtab [(unsigned char)numHashed]
			[(unsigned char)*p];
		numHashed++;
	}
	return hash;
}
2014-07-11 03:42:22 +04:00
// the raw read buffer of the most recent reply. set by gotReplyWrapper()
// and examined by qaspider() when polling the spider status page.
// NOTE(review): this aliases the socket's own read buffer, so it is
// presumably only valid while that socket is alive -- confirm.
static char *s_reply = NULL;
2014-07-07 01:13:00 +04:00
// come here after receiving ANY reply from the gigablast server
static void gotReplyWrapper ( void *state , TcpSocket *sock ) {
// store our current reply
SafeBuf fb2;
fb2.safeMemcpy(sock->m_readBuf,sock->m_readOffset);
fb2.nullTerm();
// log that we got the reply
log("qa: got reply(%li)=%s",sock->m_readOffset,sock->m_readBuf);
2014-07-07 09:06:33 +04:00
2014-07-07 01:13:00 +04:00
// get mime
HttpMime mime;
mime.set ( sock->m_readBuf , sock->m_readOffset , NULL );
// only hash content since mime has a timestamp in it
char *content = mime.getContent();
long contentLen = mime.getContentLen();
2014-07-07 09:06:33 +04:00
if ( content[contentLen] ) { char *xx=NULL;*xx=0; }
2014-07-07 03:47:04 +04:00
char *reply = sock->m_readBuf;
2014-07-11 03:42:22 +04:00
s_reply = reply;
2014-07-07 03:47:04 +04:00
// take out <responseTimeMS>
markOut ( reply , "<currentTimeUTC>");
markOut ( reply , "<responseTimeMS>");
2014-07-07 05:53:05 +04:00
// until i figure this one out, take it out
markOut ( reply , "<docsInCollection>");
2014-07-07 09:06:33 +04:00
// until i figure this one out, take it out
markOut ( reply , "<hits>");
2014-07-07 06:43:00 +04:00
// make checksum. we ignore back to back spaces so this
// hash works for <docsInCollection>10 vs <docsInCollection>9
long replyCRC = qa_hash32 ( content );
// if what we expected, save to disk if not there yet, then
// call s_callback() to resume the qa pipeline
if ( replyCRC == s_expectedCRC ) {
// save reply if good
char fn3[1024];
sprintf(fn3,"%sqa/reply.%li",g_hostdb.m_dir,replyCRC);
File ff; ff.set ( fn3 );
if ( ! ff.doesExist() ) {
// if not there yet then save it
fb2.save(fn3);
}
// . continue on with the qa process
// . which qa function that may be
s_callback();
return;
}
2014-07-07 05:53:05 +04:00
//
// if crc of reply does not match what was expected then do a diff
// so we can see why not
//
const char *emsg = "qa: bad replyCRC of %li should be %li "
"\n";//"phase=%li\n";
fprintf(stderr,emsg,replyCRC,s_expectedCRC);//,s_phase-1);
// get response on file
SafeBuf fb1;
char fn1[1024];
sprintf(fn1,"%sqa/reply.%li",g_hostdb.m_dir,s_expectedCRC);
fb1.load(fn1);
fb1.nullTerm();
// break up into lines
char fn2[1024];
sprintf(fn2,"/tmp/reply.%li",replyCRC);
fb2.save ( fn2 );
// do the diff between the two replies so we can see what changed
char cmd[1024];
sprintf(cmd,"diff %s %s",fn1,fn2);
fprintf(stderr,"%s\n",cmd);
system(cmd);
// if this is zero allow it to slide by. it is learning mode i guess.
// so we can learn what crc we need to use.
// otherwise, stop right there for debugging
if ( s_expectedCRC != 0 ) exit(1);
2014-07-07 05:53:05 +04:00
// keep on going
2014-07-09 07:33:13 +04:00
s_callback();
}
2014-04-05 22:33:42 +04:00
bool getUrl( char *path , long expectedCRC = 0 , char *post = NULL ) {
2014-04-05 22:33:42 +04:00
SafeBuf sb;
sb.safePrintf ( "http://%s:%li%s"
, iptoa(g_hostdb.m_myHost->m_ip)
, (long)g_hostdb.m_myHost->m_httpPort
, path
);
s_expectedCRC = expectedCRC;
Url u;
u.set ( sb.getBufStart() );
log("qa: getting %s",sb.getBufStart());
if ( ! g_httpServer.getDoc ( u.getUrl() ,
0 , // ip
0 , // offset
-1 , // size
0 , // ifmodsince
NULL ,
gotReplyWrapper,
60*1000, // timeout
0, // proxyip
0, // proxyport
-1, // maxtextdoclen
-1, // maxotherdoclen
NULL , // useragent
"HTTP/1.0" , // protocol
true , // doPost
NULL , // cookie
NULL , // additionalHeader
NULL , // fullRequest
post ) )
return false;
// error?
log("qa: getUrl error: %s",mstrerror(g_errno));
return true;
}
2014-04-05 22:33:42 +04:00
bool loadUrls ( ) {
static bool s_loaded = false;
if ( s_loaded ) return true;
2014-07-07 02:04:21 +04:00
s_loaded = true;
2014-04-05 22:33:42 +04:00
// use injectme3 file
s_ubuf1.load("./injectme3");
// scan for +++URL: xxxxx
char *s = s_ubuf1.getBufStart();
for ( ; *s ; s++ ) {
if ( strncmp(s,"+++URL: ",8) ) continue;
// got one
2014-07-07 02:04:21 +04:00
// \0 term it for s_contentPtrs below
*s = '\0';
2014-04-05 22:33:42 +04:00
// find end of it
s += 8;
char *e = s;
for ( ; *e && ! is_wspace_a(*e); e++ );
// null term it
if ( *e ) *e = '\0';
// store ptr
s_ubuf2.pushLong((long)s);
// skip past that
s = e;
2014-07-07 02:04:21 +04:00
// point to content
s_cbuf2.pushLong((long)(s+1));
2014-04-05 22:33:42 +04:00
}
// make array of url ptrs
s_urlPtrs = (char **)s_ubuf2.getBufStart();
2014-07-07 02:04:21 +04:00
s_contentPtrs= (char **)s_cbuf2.getBufStart();
2014-04-05 22:33:42 +04:00
return true;
}
2014-07-07 03:47:04 +04:00
/*
2014-04-05 22:33:42 +04:00
static char *s_queries[] = {
"the",
"+the",
"cats",
"+cats dog",
"+cats +dog",
"cat OR dog",
"cat AND dog",
"cat AND NOT dog",
"NOT cat AND NOT dog",
"cat -dog",
"site:wisc.edu"
};
2014-07-07 03:47:04 +04:00
*/
2014-04-05 22:33:42 +04:00
2014-07-07 09:06:33 +04:00
#undef usleep
//
// the injection qa test suite
//
bool qainject ( ) {
2014-07-07 09:06:33 +04:00
2014-07-09 07:33:13 +04:00
if ( ! s_callback ) s_callback = qainject;
//
// delete the 'qatest123' collection
//
2014-07-07 09:06:33 +04:00
static bool s_x1 = false;
if ( ! s_x1 ) {
s_x1 = true;
getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" );
return false;
2014-07-07 05:53:05 +04:00
}
2014-07-07 01:13:00 +04:00
//
2014-04-05 22:33:42 +04:00
// add the 'qatest123' collection
2014-07-07 01:13:00 +04:00
//
2014-07-07 09:06:33 +04:00
static bool s_x2 = false;
if ( ! s_x2 ) {
s_x2 = true;
getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
// checksum of reply expected
238170006 );
return false;
2014-07-07 01:13:00 +04:00
}
2014-04-05 22:33:42 +04:00
2014-07-07 01:13:00 +04:00
//
2014-04-05 22:33:42 +04:00
// inject urls, return false if not done yet
2014-07-07 01:13:00 +04:00
//
2014-07-07 09:06:33 +04:00
static bool s_x4 = false;
if ( ! s_x4 ) {
2014-07-07 01:13:00 +04:00
// TODO: try delimeter based injection too
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_ubuf2.length()/(long)sizeof(char *) ; ) {
// inject using html api
SafeBuf sb;
2014-07-07 09:06:33 +04:00
sb.safePrintf("&c=qatest123&deleteurl=0&"
2014-07-07 01:13:00 +04:00
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_ii] );
2014-07-07 02:04:21 +04:00
// the content
sb.safePrintf("&hasmime=1");
sb.safePrintf("&content=");
sb.urlEncode(s_contentPtrs[s_ii] );
2014-07-07 01:13:00 +04:00
sb.nullTerm();
2014-07-07 02:04:21 +04:00
// pre-inc it in case getUrl() blocks
s_ii++;
getUrl("/admin/inject",
0, // no idea what crc to expect
sb.getBufStart());
2014-07-07 03:47:04 +04:00
return false;
2014-07-07 01:13:00 +04:00
}
2014-07-07 09:06:33 +04:00
s_x4 = true;
2014-07-07 01:13:00 +04:00
}
2014-07-07 03:47:04 +04:00
// +the
2014-07-07 09:06:33 +04:00
static bool s_x5 = false;
if ( ! s_x5 ) {
2014-07-07 23:42:30 +04:00
usleep(1500000);
2014-07-07 09:06:33 +04:00
s_x5 = true;
2014-07-07 03:47:04 +04:00
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
-1452050577 );
2014-07-07 03:47:04 +04:00
return false;
}
2014-07-07 09:06:33 +04:00
2014-07-07 03:47:04 +04:00
// sports news
2014-07-07 09:06:33 +04:00
static bool s_x7 = false;
if ( ! s_x7 ) {
s_x7 = true;
2014-07-07 03:47:04 +04:00
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports+news",
-1586622518 );
2014-07-07 03:47:04 +04:00
return false;
}
2014-07-07 09:06:33 +04:00
2014-07-07 05:53:05 +04:00
//
// eject/delete the urls
//
static long s_ii2 = 0;
2014-07-07 09:06:33 +04:00
for ( ; s_ii2 < s_ubuf2.length()/(long)sizeof(char *) ; ) {
2014-07-07 05:53:05 +04:00
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_ii2] );
sb.nullTerm();
// pre-inc it in case getUrl() blocks
s_ii2++;
getUrl ( sb.getBufStart() , 0 );
2014-07-07 05:53:05 +04:00
return false;
}
2014-07-07 03:47:04 +04:00
2014-07-07 05:53:05 +04:00
//
// make sure no results left, +the
//
2014-07-07 09:06:33 +04:00
static bool s_x9 = false;
if ( ! s_x9 ) {
2014-07-07 23:42:30 +04:00
usleep(1500000);
2014-07-07 09:06:33 +04:00
s_x9 = true;
2014-07-07 05:53:05 +04:00
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
-1672870556 );
2014-07-07 05:53:05 +04:00
return false;
}
2014-07-07 09:06:33 +04:00
//
// try delimeter based injecting
//
static bool s_y2 = false;
if ( ! s_y2 ) {
s_y2 = true;
SafeBuf sb;
// delim=+++URL:
sb.safePrintf("&c=qatest123&deleteurl=0&"
"delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&"
"hasmime=1&content=");
// use injectme3 file
SafeBuf ubuf;
ubuf.load("./injectme3");
sb.urlEncode(ubuf.getBufStart());
getUrl ( "/admin/inject",
// check reply, seems to have only a single docid init
-1970198487,
sb.getBufStart());
2014-07-07 09:06:33 +04:00
return false;
}
// now query check
static bool s_y4 = false;
if ( ! s_y4 ) {
2014-07-07 23:42:30 +04:00
usleep(1500000);
2014-07-07 09:06:33 +04:00
s_y4 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
-480078278 );
2014-07-07 09:06:33 +04:00
return false;
}
//
// delete the 'qatest123' collection
//
if ( ! s_x1 ) {
s_x1 = true;
getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" );
return false;
2014-07-07 05:53:05 +04:00
}
2014-07-07 09:06:33 +04:00
static bool s_fee2 = false;
if ( ! s_fee2 ) {
s_fee2 = true;
fprintf(stderr,"\n\n\nSUCCESSFULLY COMPLETED "
"QA INJECT TEST\n\n\n");
2014-07-09 07:33:13 +04:00
return true;
}
return true;
}
// . a list of site names for spidering
// . every entry starts with a space, so the adjacent string literals
//   concatenate into one space-separated string
// . not referenced right now: the only use visible here, the
//   sb.urlEncode(s_urls1) call in qaspider(), is commented out
static char *s_urls1 =
2014-07-11 03:42:22 +04:00
" walmart.com"
" cisco.com"
" t7online.com"
" sonyericsson.com"
" netsh.com"
" allegro.pl"
" hotscripts.com"
" sitepoint.com"
" so-net.net.tw"
" aol.co.uk"
" sbs.co.kr"
" chinaacc.com"
" eyou.com"
" spray.se"
" carview.co.jp"
" xcar.com.cn"
" united.com"
" raaga.com"
" primaryads.com"
" szonline.net"
" icbc.com.cn"
" instantbuzz.com"
" sz.net.cn"
" 6to23.com"
" seesaa.net"
" tracking101.com"
" jubii.dk"
" 5566.net"
" prikpagina.nl"
" 7xi.net"
" 91.com"
" jjwxc.com"
" adbrite.com"
" hoplay.com"
" questionmarket.com"
" telegraph.co.uk"
" trendmicro.com"
" google.fi"
" ebay.es"
" tfol.com"
" sleazydream.com"
" websearch.com"
" freett.com"
" dayoo.com"
" interia.pl"
" yymp3.com"
" stanford.edu"
" time.gr.jp"
" telia.com"
" madthumbs.com"
" chinamp3.com"
" oldgames.se"
" buy.com"
" singpao.com"
" cbsnews.com"
" corriere.it"
" cbs.com"
" flickr.com"
" theglobeandmail.com"
" incredifind.com"
" mit.edu"
" chase.com"
" ktv666.com"
" oldnavy.com"
" lego.com"
" eniro.se"
" bloomberg.com"
" ft.com"
" odn.ne.jp"
" pcpop.com"
" ugameasia.com"
" cantv.net"
" allinternal.com"
" aventertainments.com"
" invisionfree.com"
" hangzhou.com.cn"
" zhaopin.com"
" bcentral.com"
" lowes.com"
" adprofile.net"
" yninfo.com"
" jeeran.com"
" twbbs.net.tw"
" yousendit.com"
" aavalue.com"
" google.com.co"
" mysearch.com"
" worldsex.com"
" navisearch.net"
" lele.com"
" msn.co.in"
" officedepot.com"
" xintv.com"
" 204.177.92.193"
" travelzoo.com"
" bol.com.br"
" dtiserv2.com"
" optonline.net"
" hitslink.com"
" freechal.com"
" infojobs.net"
2014-07-09 07:33:13 +04:00
;
//
// the spider qa test suite
//
// . this is a re-entrant state machine. each call launches at most one
//   request with getUrl() and returns false. when the reply comes in,
//   gotReplyWrapper() calls s_callback() which re-enters this function, and
//   the static flags below skip the steps already launched.
// . returns true once every step has been launched and replied to
//
bool qaspider ( ) {

	if ( ! s_callback ) s_callback = qaspider;

	//
	// delete the 'qatest123' collection
	//
	static bool s_x1 = false;
	if ( ! s_x1 ) {
		s_x1 = true;
		getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" );
		return false;
	}

	//
	// add the 'qatest123' collection
	//
	static bool s_x2 = false;
	if ( ! s_x2 ) {
		s_x2 = true;
		getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
			 // checksum of reply expected
			 238170006 );
		return false;
	}

	// set up the url filters so we do not spider too deep. the
	// hopcount rule is out for now, the sitepages quota is used instead.
	static bool s_z1 = false;
	if ( ! s_z1 ) {
		s_z1 = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"

			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"

			      // take out hopcount for now, just test quotas
			      // "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"

			      "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		getUrl ( "/admin/filters",0,sb.getBufStart());
		return false;
	}

	// set the site list to
	// a few sites
	static bool s_z2 = false;
	if ( ! s_z2 ) {
		s_z2 = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode("tag:shallow www.walmart.com\r\n"
			     "tag:shallow http://www.ibm.com/\r\n");
		sb.nullTerm();
		getUrl ("/admin/settings",0,sb.getBufStart() );
		// wait for the reply like every other step. falling through
		// here launched the add url request below at the same time
		// and clobbered s_expectedCRC.
		return false;
	}

	//
	// use the add url interface now
	// walmart.com above was not seeded because of the site: directive
	// so this will seed it.
	//
	static bool s_y2 = false;
	if ( ! s_y2 ) {
		s_y2 = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=www.walmart.com+ibm.com"
			      );
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		//sb.urlEncode(s_urls1);
		getUrl ( "/admin/addurl",0,sb.getBufStart());
		return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	static bool s_k1 = false;
	if ( ! s_k1 ) {
		usleep(5000000); // 5 seconds
		s_k1 = true;
		getUrl ( "/admin/status?format=json&c=qatest123",0);
		return false;
	}

	static bool s_k2 = false;
	if ( ! s_k2 ) {
		// ensure spiders are done. if the status reply does not say
		// so yet then re-arm the status request and poll again.
		if ( s_reply && ! strstr(s_reply,"Job has completed") ) {
			s_k1 = false;
			goto checkagain;
		}
		s_k2 = true;
	}

	// verify no 2 hopcounts in results
	// . these paths are plain strings, not printf formats, so the colon
	//   is encoded as %3A. a doubled %% would be sent as is.
	static bool s_y4 = false;
	if ( ! s_y4 ) {
		s_y4 = true;
		getUrl ( "/search?c=qatest123&qa=1&format=xml&"
			 "q=gbhopcount%3A2",
			 123456 );
		return false;
	}

	// check facet sections query for walmart
	static bool s_y5 = false;
	if ( ! s_y5 ) {
		s_y5 = true;
		getUrl ( "/search?c=qatest123&format=json&"
			 "q=gbfacetstr%3Agbxpathsitehash2492664135",
			 123456 );
		return false;
	}

	static bool s_y6 = false;
	if ( ! s_y6 ) {
		s_y6 = true;
		getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=main&d=61506292&cnsp=0" , 123456 );
		return false;
	}

	// in xml
	static bool s_y7 = false;
	if ( ! s_y7 ) {
		s_y7 = true;
		getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=main&d=61506292&cnsp=0" , 123456 );
		return false;
	}

	// and json
	static bool s_y8 = false;
	if ( ! s_y8 ) {
		s_y8 = true;
		getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=main&d=61506292&cnsp=0" , 123456 );
		return false;
	}

	// delete the collection
	static bool s_fee = false;
	if ( ! s_fee ) {
		s_fee = true;
		return getUrl ( "/admin/delcoll?delcoll=qatest123" );
	}

	// announce success just once
	static bool s_fee2 = false;
	if ( ! s_fee2 ) {
		s_fee2 = true;
		fprintf(stderr,"\n\n\nSUCCESSFULLY COMPLETED "
			"QA SPIDER TEST\n\n\n");
		return true;
	}

	return true;
}
// . run a series of tests to ensure that gb is functioning properly
// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
// ensure consistency between tests for exact replays
bool qatest ( ) {
2014-07-09 07:33:13 +04:00
if ( ! s_callback ) s_callback = qatest;
qainject ( );
qaspider ( );
2014-07-09 07:33:13 +04:00
return true;
}