// Copyright 2009, Gigablast Inc.
// . runs a series of tests on a gigablast instance
// . right now just performs injections to test parsing and indexing
#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500
#include "gb-include.h"
#include "Test.h"
#include "Rdb.h"
#include "Spider.h"
#include "Msg1.h"
#include "Datedb.h"
#include "Pages.h"
#include "PingServer.h"
#include "Spider.h"
#include "Process.h"
#include "Placedb.h"
#include "Threads.h"
#include "Msge1.h"
#include "Parms.h"
//static void testWrapper ( int fd , void *state ) ;
static void injectedWrapper ( void *state ) ;
// the global class
Test g_test;
// Constructor: put the singleton into a clean, not-running state.
// No resources are acquired here; m_urlBuf stays NULL until a run starts.
Test::Test() {
	// no urls.txt buffer loaded yet
	m_urlBuf = NULL;
	// neither a test run nor an injection pass is active
	m_isRunning = false;
	m_isAdding  = false;
	// progress counters start at zero
	m_urlsAdded   = 0;
	m_urlsIndexed = 0;
	//m_spiderLinks = true;//false;
	m_bypassMenuElimination = false;
	// assume if they just turn spiders on we use this
	//m_testDir = "test-spider";
}
// main.cpp calls g_repair.init()
// Called once at startup (from main.cpp, like g_repair.init()).
// Just zeroes the run state; always succeeds.
// @return true always
bool Test::init ( ) {
	// counters back to zero for a fresh process
	m_urlsAdded   = 0;
	m_urlsIndexed = 0;
	// nothing is running or injecting yet
	m_isRunning = false;
	m_isAdding  = false;
	//if( ! g_loop.registerSleepCallback( 1 , NULL , testWrapper ) )
	//	return log("repair: Failed register callback.");
	// record current value
	//m_testSpiderEnabledSaved = g_conf.m_testSpiderEnabled;
	//m_testParserEnabledSaved = g_conf.m_testParserEnabled;
	return true;
}
void Test::reset ( ) {
if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test999");
//m_spiderLinks = true;//false;
m_bypassMenuElimination = false;
}
// . call this once every second
// . this is responsible for advancing from one g_repairMode to the next
//void testWrapper ( int fd , void *state ) {
// // call it from the class
// g_test.loop();
//}
// Return the on-disk directory name for the currently enabled test mode.
// Exactly one of testSpiderEnabled/testParserEnabled must be on; any other
// combination is a configuration bug and we core (deliberate NULL write)
// to catch it immediately.
// @return "test-spider" or "test-parser"; never returns normally otherwise
char *Test::getTestDir ( ) {
	// both modes at once is illegal -- core dump on purpose
	if ( g_conf.m_testSpiderEnabled && g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }
	// spider test has its own directory
	if ( g_conf.m_testSpiderEnabled ) return "test-spider";
	// and so does the parser test
	if ( g_conf.m_testParserEnabled ) return "test-parser";
	// neither mode enabled: caller should never have asked -- core
	char *xx=NULL;*xx=0;
	return NULL;
}
// . remove stale parse/critical/run output files from the test directory so
//   the next run is not diffed against leftovers from a previous run
// . also resets the "qatest123" collection to zero docs (parms preserved)
// . quickpoll is temporarily disabled around the collection reset
void Test::removeFiles ( ) {
	// reset our error state
	m_errno = 0;
	if ( g_conf.m_testParserEnabled ) {
		// remove all old files for now to avoid system diffs
		log("test: removing old parse critical and run files from "
		    "last run.");
		//system ("rm /home/mwells/gigablast/test/parse*.?.*" );
		//system ("rm /home/mwells/gigablast/test/critical*.?.*" );
		char sbuf[1024];
		char *testDir = getTestDir();
		// BUGFIX: use snprintf so a long working-dir path cannot
		// overflow the fixed-size command buffer
		snprintf(sbuf,sizeof(sbuf),"rm %s/%s/run.?.*" ,
			 g_hostdb.m_dir,testDir);
		system (sbuf);
		// use this one instead since rm doesn't always work
		snprintf(sbuf,sizeof(sbuf),
			 "ls -1 %s/%s/ | grep parse | xargs --verbose "
			 "-I xxx rm %s/%s/xxx" ,
			 g_hostdb.m_dir,
			 testDir ,
			 g_hostdb.m_dir,
			 testDir );
		log("test: %s",sbuf);
		system(sbuf);
		snprintf(sbuf,sizeof(sbuf),
			 "ls -1 %s/%s/ | grep critical | xargs --verbose "
			 "-I xxx rm %s/%s/xxx" ,
			 g_hostdb.m_dir,
			 testDir ,
			 g_hostdb.m_dir,
			 testDir );
		log("test: %s",sbuf);
		system(sbuf);
	}
	// do not crash for lack of quickpoll now
	long saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	CollectionRec *cr = g_collectiondb.getRec("qatest123");
	// . reset the qatest collection to zero docs
	// . TODO: implement this. only allow it for qatest coll.
	// . kinda like Collectiondb::deleteRec() i guess but we need to
	//   preserve the parms!!
	// . deletetagdb = false
	if ( cr ) g_collectiondb.resetColl2 ( cr->m_collnum ,
					      cr->m_collnum ,
					      true );
	// reset event count
	//g_collectiondb.countEvents();
	// turn it back on
	g_conf.m_useQuickpoll = saved;
}
// come here once per second i guess
// . kick off a fresh test run (host #0 only)
// . picks the next free run id by probing run.N.collparms.txt files,
//   snapshots global conf and collection parms to disk, records the gb
//   version, loads urls.txt into m_urlBuf, blanks out comment lines and
//   any STOP..RESUME section, NUL-terminates each url in place, then
//   starts injecting via injectLoop()
// . bails out (and disables both test modes) on any setup failure
void Test::initTestRun ( ) {
// clear any stale error from a previous operation
g_errno = 0;
// . all hosts should have their g_conf.m_repairMode parm set
// . it is global now, not collection based, since we need to
// lock down titledb for the scan and there could be recs from
// the collection we are repairing in titledb's rdbtree, which,
// when dumped, would mess up our scan.
// sanity: at least one of the two test modes must be enabled
if ( ! g_conf.m_testSpiderEnabled && ! g_conf.m_testParserEnabled ) {
char *xx=NULL;*xx=0; }
// if both enabled, core
if ( g_conf.m_testSpiderEnabled && g_conf.m_testParserEnabled ) {
char *xx=NULL;*xx=0; }
// if the power went off
if ( ! g_process.m_powerIsOn ) return;
// return if currently running
// no, admin can re-init even if running now
//if ( m_isRunning ) { char *xx=NULL;*xx=0; }//return;
// must be host #0 only
if ( g_hostdb.m_myHost->m_hostId != 0 ) return;
// if was initially in this mode, don't do anything
//if ( m_testSpiderEnabledSaved ) return;
//if ( m_testParserEnabledSaved ) return;
// you must have the "qatest123" coll already setup!
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) {
// note it
log("test: please add a collection named \"test\" first.");
// stop the test
g_conf.m_testParserEnabled = false;
g_conf.m_testSpiderEnabled = false;
// all done
return;
}
char *testDir = getTestDir();
// scan for file named "run.start.%li.txt" which is a dump of all
// the conf and parms
char filename[100];
File f;
// find the first unused run id, 0..9998, by probing for its
// collparms snapshot file on disk
long i; for ( i = 0 ; i < 9999 ; i++ ) {
// make filename. base it off working dir, g_hostdb.m_dir
sprintf ( filename,"%s/%s/run.%li.collparms.txt",
g_hostdb.m_dir,testDir,i );
// exist?
f.set ( filename );
// open files
long status = f.doesExist();
// error?
if ( status == -1 ) {
// note it in the log
log("test: doesExist() returned -1");
// end the test
g_conf.m_testParserEnabled = false;
g_conf.m_testSpiderEnabled = false;
// all done
return;
}
// try next i if this one in use
if ( status ) continue;
// got one
break;
}
// close it
f.close();
// record the gb version and run start time into run.N.version.txt
// via a shell one-liner
char cmd[1000];
char vfile[200];
sprintf(vfile,"%s/%s/run.%li.version.txt",g_hostdb.m_dir,testDir,i);
sprintf(cmd,
"%s/gb -v >& %s ; "
"echo -n \"RUN START TIME: \" >> %s ; "
"date >> %s",
g_hostdb.m_dir,vfile,
vfile,
vfile);
system(cmd);
// save it
m_runId = i;
// NOTE(review): cr was already looked up above and checked; this
// second lookup is redundant but harmless -- confirm before removing
cr = g_collectiondb.getRec ( "qatest123" );
if ( ! cr ) {
// and no more of this
g_conf.m_testParserEnabled = false;
g_conf.m_testSpiderEnabled = false;
return;
}
// set these
m_coll = cr->m_coll;
// turn on spiders
//cr->m_spideringEnabled = 1;
// crap i guess this too!!!
//g_conf.m_spideringEnabled = 1;
//
// log out the global parms
//
char fbuf[100];
// print our global parms into a file called run.%li.start.txt
sprintf(fbuf,"%s/%s/run.%li.confparms.txt",g_hostdb.m_dir,testDir,i);
// this saves it as xml i think
g_parms.saveToXml ( (char *)&g_conf , fbuf , OBJ_CONF);
//
// log out the coll specific parms
//
// update name
sprintf(fbuf,"%s/%s/run.%li.collparms.txt",g_hostdb.m_dir,testDir,i);
// save that
g_parms.saveToXml ( (char *)cr , fbuf , OBJ_COLL);
// get the list of urls to download and inject in order
sprintf(fbuf,"%s/%s/urls.txt",g_hostdb.m_dir,testDir);
// set it
f.set ( fbuf ) ;
// read it in
long fsize = f.getFileSize();
// NOTE(review): if urls.txt does not exist, getFileSize() presumably
// returns -1, making need == 0 below -- confirm and consider an
// explicit existence check before allocating
// add one for \0 termination
long need = fsize + 1;
// read it in
char *buf = (char *)mmalloc ( need ,"qatest");
// error?
if ( ! buf ) {
// note it
log("test: failed to alloc %li bytes for url buf",fsize);
// disable testing
g_conf.m_testParserEnabled = false;
g_conf.m_testSpiderEnabled = false;
// all done
return;
}
// open it
f.open ( O_RDONLY );
// read it in
long rs = f.read ( buf , fsize , 0 ) ;
// NOTE(review): f is never explicitly closed after this read; if
// File's destructor does not close it, the fd leaks on this path
// check it
if ( rs != fsize ) {
// note it
log("test: failed to read %li bytes of urls.txt file",fsize);
// disable testing
g_conf.m_testParserEnabled = false;
g_conf.m_testSpiderEnabled = false;
// all done
return;
}
// save it
m_urlBuf = buf;
// null term it just in case
buf[need-1] = '\0';
// end of it, including the terminating \0
m_urlEnd = buf + need;
// init url offset
m_urlPtr = m_urlBuf;
// reset just in case
//m_spiderLinks = false;
m_bypassMenuElimination = false;
// first check for spiderlinks=1|true
// NOTE(review): this loop body is entirely commented out, so the
// scan is currently a no-op; kept for reference
for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
//if ( p[0] != 's' ) continue;
//if ( p[1] != 'p' ) continue;
//if ( ! strncmp(p,"spiderlinks",11) )
// m_spiderLinks = true;
//if ( ! strncmp(p,"bypassmenuelimination",21) )
// m_bypassMenuElimination = true;
}
// force max spiders to one because one page is often dependent
// on the previous page!
//if ( ! m_spiderLinks ) cr->m_maxNumSpiders = 1;
// need to make it 6 since some priorities essentially lock the
// ips up that have urls in higher priorities. i.e. once we dole
// a url out for ip X, then if later we add a high priority url for
// IP X it can't get spidered until the one that is doled does.
//else cr->m_maxNumSpiders = 6;
// . first space out all comments
// . comments are nice because we know why the url is in urls.txt
for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
// skip if not start of a comment line
if ( *p != '#' ) continue;
// if not preceeded by a \n or start, skip
if ( p > m_urlBuf && *(p-1) != '\n' ) continue;
// ok, nuke it
for ( ; *p && *p !='\n' ; p++ ) *p = ' ';
}
// if we hit "\nSTOP\n" then white out that and all past it
for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
// skip if not start of a comment line
if ( *p != '\n' ) continue;
// check it
if ( strncmp(p,"\nSTOP\n",6) ) continue;
// white out everything from STOP until RESUME (or end of buffer)
for ( ; *p ; p++ ) {
// until we HIT RESUME
if ( *p == '\n' && ! strncmp(p,"\nRESUME\n",8) ) {
// blank the "RESUME" text itself but keep the
// leading \n so line structure survives
p[1] = ' ';
p[2] = ' ';
p[3] = ' ';
p[4] = ' ';
p[5] = ' ';
p[6] = ' ';
break;
}
*p = ' ';
}
// all done
//break;
}
// then NULL terminate all urls by converting all white space to \0s
for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ )
// all non url chars to \0
if ( is_wspace_a(*p) ) *p = '\0';
// flag this
m_isRunning = true;
// and this
m_isAdding = true;
m_testStartTime = gettimeofdayInMilliseconds();
// set up dedup table
m_dt.set ( 8,0,0,NULL,0,false,MAX_NICENESS,"testdedup");
// remove all old files for now to avoid system diffs
log("test: beginning injection");
// . now inject each url in order, one at a time using msg7 i guess
// . returns true if all done
if ( ! injectLoop() ) return;
// close it up
//stopIt();
}
// this should be called when all docs have finished spidering
void Test::stopIt ( ) {
// sanity
if ( m_isAdding ) { char *xx=NULL;*xx=0; }
// flag that we are done
m_isRunning = false;
// print time
log("test: took %lli ms to complete injections.",
gettimeofdayInMilliseconds() - m_testStartTime );
// get this before setting testParserEnabled to false
char *testDir = g_test.getTestDir();
// turn this off now too
g_conf.m_testParserEnabled = false;
g_conf.m_testSpiderEnabled = false;
// save all!
bool disabled = g_threads.m_disabled;
g_threads.disableThreads();
// save it blocking style
g_process.save();
if ( ! disabled ) g_threads.enableThreads();
// save ips.txt
saveTestBuf ( testDir );
log("test: test completed. making qa.html");
//
//
// NOW MAKE THE qa.html FILE
//
//
// only analyze up to last 7 runs
long start = m_runId - 7;
if ( start < 0 ) start = 0;
SafeBuf sb;
sb.safePrintf("
\n");
sb.safePrintf(""
"run id | "
"conf diff | "
"coll diff | "
"run info | "
"
\n");
// take diffs between this run and the last run for confparms
for ( long i = m_runId ; i > start ; i-- ) {
// shortcut
char *dir = g_hostdb.m_dir;
// make diff filename
char diff1[200];
sprintf(diff1,"%s/%s/run.%li.confparms.txt.diff",dir,
testDir,i);
File f1;
f1.set(diff1);
if ( ! f1.doesExist() ) {
char df1[200];
char df2[200];
sprintf(df1,"%s/%s/run.%li.confparms.txt",dir,
testDir,i);
sprintf(df2,"%s/%s/run.%li.confparms.txt",dir,
testDir,i-1);
// do the diff
char cmd[600];
sprintf(cmd,"diff %s %s > %s",df1,df2,diff1);
log("test: system(\"%s\")",cmd);
system (cmd);
}
long fs1 = f1.getFileSize();
sb.safePrintf("%li | %li | ", i,fs1);
// make diff filename
char diff2[200];
sprintf(diff2,"%s/%s/run.%li.collparms.txt.diff",dir,
testDir,i);
File f2;
f2.set(diff2);
if ( ! f2.doesExist() ) {
char df1[200];
char df2[200];
sprintf(df1,"%s/%s/run.%li.collparms.txt",dir,
testDir,i);
sprintf(df2,"%s/%s/run.%li.collparms.txt",dir,
testDir,i-1);
// do the diff
char cmd[600];
sprintf(cmd,"diff %s %s > %s",df1,df2,diff2);
log("test: system(\"%s\")",cmd);
system (cmd);
}
long fs2 = f2.getFileSize();
sb.safePrintf("%li | ", fs2);
// the version
char vf[200];
sprintf(vf,"%s/%s/run.%li.version.txt",dir,testDir,i);
File f3;
f3.set ( vf );
long fs3 = f3.getFileSize();
char vbuf[1000];
vbuf[0] = 0;
if ( fs3 > 0 ) {
f3.open(O_RDONLY);
long rs = f3.read(vbuf,fs3,0);
vbuf[fs3] = '\0';
if ( rs <= 0 ) continue;
f3.close();
}
// show it
sb.safePrintf("%s |
\n", vbuf);
}
sb.safePrintf("
\n");
sb.safePrintf("
\n");
//
// now diff each parser output file for each url in urls.txt
//
//
// loop over url buf first so we can print one table per url
//
char *next = NULL;
// reset the url buf ptr
m_urlPtr = m_urlBuf;
// count em
long count = 0;
// ptrs to each url table
long un = 0;
long uptr [5000]; // offsets now, not char ptr since buf gets reallocd
char udiff[5000];
long ulen [5000];
long uhits[5000]; // critical errors! validateOutput() choked!
long uunchecked[5000]; // events/addresses found but were not validatd
long umiss[5000];
long usort[5000];
long uevents[5000];
SafeBuf tmp;
long niceness = MAX_NICENESS;
// advance to next url
for ( ; m_urlPtr < m_urlEnd ; m_urlPtr = next ) {
// breathe
QUICKPOLL(niceness);
// we converted all non-url chars into \0's so skip those!
for ( ; m_urlPtr= m_urlEnd ) break;
// set this up
next = m_urlPtr;
// compute next url ptr
for ( ; next < m_urlEnd && *next ; next++ );
// point to this url
char *u = m_urlPtr;
// get hash
long long h = hash64 ( u , gbstrlen(u) );
// shortcut
char *dir = g_hostdb.m_dir;
// print into a secondary safe buf with a ptr to
// it so we can sort that and transfer into the
// primary safebuf later
uptr[un] = tmp.length();
// assume no diff
udiff[un] = 0;
// print number
tmp.safePrintf("%li) ",count++);
// . link to our stored http server reply
// . TODO: link it to our [cached] copy in the test coll!!!
char local[1200];
sprintf(local,"/%s/doc.%llu.html",testDir,h);
tmp.safePrintf("%s ",local,u);
// link to live page
tmp.safePrintf(" live ",u);
// link to page parser
char ubuf[2000];
urlEncode(ubuf,2000,u,gbstrlen(u),true);
tmp.safePrintf(" parser ",ubuf);
//tmp.safePrintf(" (%llu)",h);
tmp.safePrintf("
\n");
//tmp.safePrintf("
\n");
tmp.safePrintf("\n");
tmp.safePrintf(""
"run id | "
"crit hits | "
"crit errors | "
"# e | "
"unchecked | "
"diff chars | "
"diff file | "
"full output | "
"
\n");
//SafeBuf sd;
// loop over all the runs now, starting with latest run first
for ( long ri = m_runId ; ri >= start ; ri-- ) {
QUICKPOLL(niceness);
// the diff filename
char pdiff[200];
sprintf(pdiff,"%s/%s/parse.%llu.%li.html.diff",dir,
testDir,h,ri);
File f;
f.set(pdiff);
long fs = f.getFileSize();
if ( ! f.doesExist() && ri > 0 ) {
// make the parse filename
char pbuf1[200];
char pbuf2[200];
sprintf(pbuf1,"%s/%s/parse.%llu.%li.html",
dir,testDir,h,ri);
sprintf(pbuf2,"%s/%s/parse.%llu.%li.html",
dir,testDir,h,ri-1);
// sanity check
//File tf; tf.set(pbuf1);
//if ( ! tf.doesExist()) {char *xx=NULL;*xx=0;}
// tmp file name
char tmp1[200];
char tmp2[200];
sprintf(tmp1,"%s/%s/t1.html",dir,testDir);
sprintf(tmp2,"%s/%s/t2.html",dir,testDir);
// filter first
char cmd[600];
sprintf(cmd,
"cat %s | "
"grep -v \"\" "
" > %s", pbuf1,tmp1);
system(cmd);
sprintf(cmd,
"cat %s | "
"grep -v \"\" "
" > %s", pbuf2,tmp2);
system(cmd);
// make the system cmd to do the diff
sprintf(cmd,
"echo \"\" > %s ; "
"diff -w --text %s %s "
// ignore this table header row
//" | grep -v \"R#4\""
" >> %s",
pdiff,
tmp1,tmp2,pdiff);
log("test: system(\"%s\")",cmd);
system(cmd);
// try again
f.set(pdiff);
fs = f.getFileSize();
}
QUICKPOLL(niceness);
// this means 0 . it just has the tag in it!
if ( fs < 0 || fs == 6 ) fs = 0;
// . if no diff and NOT current run, do not print it
// . print it if the run right before the current
// now always too
if ( ri != m_runId && ri != m_runId-1 && fs == 0 )
continue;
// relative filename
char rel[200];
sprintf(rel,"/%s/parse.%llu.%li.html.diff",
testDir,h,ri);
char full[200];
sprintf(full,"/%s/parse.%llu.%li.html",
testDir,h,ri);
char validate[200];
sprintf(validate,
"/%s/parse-shortdisplay.%llu.%li.html",
testDir,h,ri);
// use red font for current run that has a diff!
char *t1 = "";
char *t2 = "";
if ( ri == m_runId && fs != 0 ) {
t1 = "";
t2 = "";
// a diff
udiff[un] = 1;
}
// . get critical errors
// . i.e. XmlDoc::validateOutput() could not validate
// a particular event or address that was in the
// url's "validated.uh64.txt" file since the admin
// clicked on the checkbox in the page parser output
// . if we do not find such a tag in the parser output
// any more then Spider.cpp creates this file!
if ( ri == m_runId ) {
char cfile[256];
sprintf(cfile,"%s/%s/critical.%llu.%li.txt",
g_hostdb.m_dir,testDir,h,ri);
SafeBuf ttt;
ttt.fillFromFile(cfile);
// first long is misses, then hits then events
umiss[un] = 0;
uhits[un] = 0;
uevents[un] = 0;
uunchecked[un] = 0;
if ( ttt.length() >= 3 )
sscanf(ttt.getBufStart(),
"%li %li %li %li",
&umiss[un],
&uhits[un],
&uevents[un],
&uunchecked[un]);
usort[un] = umiss[un] + uunchecked[un];
//File cf;
//cf.set(cfile);
//if ( cf.doesExist()) ucrit[un] = 1;
//else ucrit[un] = 0;
}
// more critical?
if ( ri == m_runId && umiss[un] != 0 ) {
t1 = "";
t2 = "";
}
// . these are good to have
// . if you don't have 1+ critical hits then you
// probably need to be validate by the qa guy
char *uhb1 = "";
char *uhb2 = "";
if ( ri == m_runId && uhits[un] != 0 ) {
uhb1 = "**";
uhb2 = "**";
}
QUICKPOLL(niceness);
char *e1 = "";
char *e2 = " | ";
long ne = uevents[un];
if ( ne ) {
e1="";
e2=" | ";
}
char *u1 = "";
char *u2 = " | ";
if ( uunchecked[un] ) {
u1="";
u2=" | ";
}
// print the row!
tmp.safePrintf(""
"%s%li%s | "
"%s%li%s | " // critical hits
"%s%li%s | " // critical misses
"%s%li%s" // # events
"%s%li%s" // unchecked
"%s%li%s | " // filesize of diff
// diff filename
"%s%s%s | "
// full parser output
""
"full | "
"validate "
" | "
"
\n",
t1,ri,t2,
uhb1,uhits[un],uhb2,
t1,umiss[un],t2,
e1,ne,e2,
u1,uunchecked[un],u2,
t1,fs,t2,
rel,t1,rel,t2,
full,
validate);
// only fill "sd" for the most recent guy
if ( ri != m_runId ) continue;
// now concatenate the parse-shortdisplay file
// to this little table so qa admin can check/uncheck
// validation checkboxes for addresses and events
//sprintf(cfile,
// "%s/test/parse-shortdisplay.%llu.%li.html",
// g_hostdb.m_dir,h,ri);
//sd.fillFromFile ( cfile );
}
// end table
tmp.safePrintf("
\n");
// . and a separate little section for the checkboxes
// . should already be in tables, etc.
// . each checkbox should provide its own uh64 when it
// calls senddiv() when clicked now
//tmp.cat ( sd );
tmp.safePrintf("
\n");
tmp.safePrintf("
\n");
// set this
ulen[un] = tmp.length() - uptr[un] ;
// sanity check
if ( ulen[un] > 10000000 ) { char *xx=NULL;*xx=0; }
// inc it
un++;
// increase the 5000!!
if ( un >= 5000 ) { char *xx=NULL; *xx=0; }
}
char flag ;
bubble:
flag = 0;
// sort the url tables
for ( long i = 0 ; i < un - 1 ; i++ ) {
QUICKPOLL(niceness);
if ( usort[i] > usort[i+1] ) continue;
if ( usort[i] == usort[i+1] )
if ( udiff[i] >= udiff[i+1] ) continue;
// swap em
long tp = uptr[i];
long td = udiff[i];
long um = umiss[i];
long us = usort[i];
long uh = uhits[i];
long tl = ulen [i];
uptr[i] = uptr[i+1];
umiss[i] = umiss[i+1];
usort[i] = usort[i+1];
uhits[i] = uhits[i+1];
udiff[i] = udiff[i+1];
ulen[i] = ulen[i+1];
uptr[i+1] = tp;
umiss[i+1] = um;
usort[i+1] = us;
uhits[i+1] = uh;
udiff[i+1] = td;
ulen [i+1] = tl;
flag = 1;
}
if ( flag ) goto bubble;
// transfer into primary safe buf now
for ( long i = 0 ; i < un ; i++ )
sb.safeMemcpy(tmp.getBufStart() + uptr[i],ulen[i]);
sb.safePrintf("