open-source-search-engine/getsample.cpp
2014-11-10 14:45:11 -08:00

485 lines
14 KiB
C++

/* Standalone program that takes the query data (queries performed
by various clients and people) from the logs and filters it for download by
our data license clients. The specs of the program are as follows:
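1. syntax:
getsample [-i <directoryContainingTheLogFiles>] [-o <outputDirectory>] <yymmddhh>
(without -i the log is read from stdin; without -o the sample is written to
stdout)
2. year, month, day and hour are each 2 digits (zero padded); any of them can
be replaced by "xx" to match every value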
3. the output of getsample will be stored in a file called
sample.yymmddhh in the output directory, where year, month, day and hour are
2 digits each, like sample.06020312. that sample should then be compressed
using a system() call (this spec asked for bzip2, giving sample.06020312.bz2;
the code currently runs gzip, giving sample.06020312.gz). typically the
output directory will be /a/html/ so our clients can download the samples
over our http server.
4. "getsample <dirOfLogs> <outputDir> lasthour" will use the hour before the
current hour so we can do dumps every hour called by a cron job. lasthour is
the actual string "lasthour" not a number. getsample has to look at the
current time and use the previous hour for this value.
5. see opendir() invoicer.cpp for code that scans the files in a given
directory. we need to scan all files starting with "log" to get the sample
data, just like in invoicer.cpp. all log files (except the current one)
have a date appended to their names. Use that date to avoid scanning them
for queries if the date is before the requested monthdayhour. that date
is when the next log file was opened and they were set aside.
6. all ip addresses in the 8th column of the sample should be consistently
remapped by using hash32() (steal from hash.cpp).
7. all ip addresses that are values for the &uip= cgi parm should likewise
be remapped.
8. all &code= cgi values should be remapped by hashing their current value
(an ascii string) to a 32-bit number using hash32(). the string should be
replaced with that number. so the viewer knows they are coming from the
same client, but the client's actual passcode is not known.
9. all dates are in GMT (UTC). the timestamps in the log are already in UTC.
10. getsample should probably compile independently of the gb source code if
possible.
*/
#include "gb-include.h"
#include <errno.h>
#include <sys/types.h> // for opendir()
#include <dirent.h> // for opendir()
#include <time.h> // for time()
#include <ctype.h>
#include <sys/socket.h> // inet_ntoa()
#include <netinet/in.h> // inet_ntoa()
#include <arpa/inet.h> // inet_ntoa()
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
// let's not read more than 10MB at a time
// (the expansion is parenthesized so that the macro stays a single value when
// it is used inside a larger expression such as "x % MAX_READ_SIZE")
#define MAX_READ_SIZE (10*1024*1024)
// The date filter requested on the command line. Every field holds two
// characters plus the terminating NUL; main() fills them in and processLog()
// compares month, day and hour against each log line.
class GetSample {
public:
	char m_year [3],  // "yy"
	     m_month[3],  // "mm"
	     m_day  [3],  // "dd"
	     m_hour [3];  // "hh"
};
// Three-letter month names indexed by month number, so that 1 -> "Jan" and
// 12 -> "Dec". Slot 0 is a placeholder ("XXX") which lets the month number be
// used as the index directly. The entries are string literals and therefore
// must be reached through pointers to const.
static const char *s_month[] = {
	"XXX",
	"Jan", "Feb", "Mar", "Apr", "May", "Jun",
	"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
};
//Defining getSample as global
class GetSample g_getSample;
// 256 x 256 table of pseudo-random 64-bit values, filled in by hashinit() and
// indexed by hash32() with (position, byte value).
uint64_t g_hashtab[256][256] ;

// Draws one 32-bit half of a table entry. rand() never sets the top bit
// (its maximum is 0x7fffffff here), so a second rand() call decides whether
// bit 31 gets switched on.
static uint64_t s_randomHalf ( void ) {
	uint64_t half = (uint64_t)rand();
	if ( rand() > (0x7fffffff / 2) )
		half |= 0x80000000;
	return half;
}

// Fills g_hashtab. The generator is always seeded with the same constant, so
// every run produces the same table and therefore the same hashes. Only the
// first call does any work; later calls return right away.
// Always returns true.
bool hashinit () {
	static bool s_ready = false;
	if ( ! s_ready ) {
		// same seed every time so we get the same rand sequence
		srand ( 1945687 );
		// walk the table row by row; the high half of each entry is
		// drawn before the low half
		for ( int32_t n = 0 ; n < 256 * 256 ; n++ ) {
			uint64_t hi = s_randomHalf();
			uint64_t lo = s_randomHalf();
			g_hashtab [n / 256][n % 256] = (hi << 32) | lo;
		}
		s_ready = true;
	}
	return true;
}
// Hashes the first "len" bytes of "s" to 32 bits by XOR-ing together one
// g_hashtab entry per byte. The entry is selected by the byte's position
// (which wraps around every 256 bytes) and by the byte's value; only the low
// 32 bits of the entry are used. A length of zero or less yields 0.
uint32_t hash32 ( const char *s, int32_t len ) {
	uint32_t result = 0;
	for ( int32_t pos = 0 ; pos < len ; pos++ ) {
		unsigned char row = (unsigned char)pos;
		unsigned char col = (unsigned char)s[pos];
		result ^= (uint32_t) g_hashtab [row][col];
	}
	return result;
}
// Sizes and the fixed column offsets this parser relies on: in a query line
// the 3-letter month starts at byte 32, the 2-digit day at byte 36, the
// 2-digit hour at byte 39 and the client ip at byte 48.
enum {
	GS_LINE_MAX  = 10*1024, // some urls are very long, so keep 10k
	GS_MONTH_OFF = 32,
	GS_DAY_OFF   = 36,
	GS_HOUR_OFF  = 39,
	GS_IP_OFF    = 48,
	// a hashed ip, code and uip can each be a few bytes longer than the
	// text they replace, so the output buffer gets some extra room
	GS_OUT_SLACK = 64
};

// Appends up to "n" bytes of "src" at "dst" without ever writing past
// "dstLast", NUL-terminates the result and returns the new end of the text.
static char *gsAppendBytes ( char *dst , const char *dstLast ,
			     const char *src , size_t n ) {
	size_t room = (size_t)(dstLast - dst);
	if ( n > room ) n = room;
	memcpy ( dst , src , n );
	dst[n] = '\0';
	return dst + n;
}

// Appends the hash "h" either as a dotted quad (asIp) or as one unsigned
// decimal number and returns the new end of the text.
static char *gsAppendHash ( char *dst , const char *dstLast ,
			    uint32_t h , bool asIp ) {
	char text[16]; // "255.255.255.255" + NUL is the longest case
	int  n;
	if ( asIp )
		n = snprintf ( text , sizeof(text) , "%u.%u.%u.%u" ,
			       (unsigned)(unsigned char)(h>>24) ,
			       (unsigned)(unsigned char)(h>>16) ,
			       (unsigned)(unsigned char)(h>>8 ) ,
			       (unsigned)(unsigned char)h );
	else
		n = snprintf ( text , sizeof(text) , "%u" , (unsigned)h );
	if ( n < 0 ) n = 0;
	return gsAppendBytes ( dst , dstLast , text , (size_t)n );
}

// Copies the request text from *cursor up to and including the parameter name
// found at "name" ("code=" or "uip="), then writes the hash of the parameter's
// value in place of the value. The value ends at the first '&', ' ' or NUL.
// *cursor is moved to the first byte after the value.
// "name" must not lie before *cursor.
static char *gsAppendMaskedParam ( char *dst , const char *dstLast ,
				   const char **cursor , const char *name ,
				   size_t nameLen , bool asIp ) {
	const char *value = name + nameLen;
	dst = gsAppendBytes ( dst , dstLast , *cursor ,
			      (size_t)(value - *cursor) );
	int32_t len = 0;
	while ( value[len] != '\0' && value[len] != '&' && value[len] != ' ' )
		len++;
	// the whole value is hashed in place, so no fixed-size copy is needed
	dst = gsAppendHash ( dst , dstLast , hash32 ( value , len ) , asIp );
	*cursor = value + len;
	return dst;
}

// Reads log lines from "fdip" and writes every search request (a line that
// contains " GET /search" or " POST /search") whose month, day and hour match
// g_getSample to "fdop". A g_getSample field that starts with "xx" matches
// anything. In the copy that is written out
//   - the client ip (everything between byte 48 and the request) is replaced
//     by the dotted-quad form of its hash32(), followed by a space
//   - the value of the first "code=" parameter is replaced by its hash32()
//     printed as an unsigned decimal number
//   - the value of the first "uip=" parameter is replaced by the dotted-quad
//     form of its hash32()
// Everything else on the line is copied as is. The number of matches is
// reported on stderr when it is not zero.
// Returns true on success and false if reading or writing failed.
bool processLog(FILE *fdip, FILE *fdop){
	char line[GS_LINE_MAX];
	char buf [GS_LINE_MAX + GS_OUT_SLACK];
	// appended text may reach up to here; the final "\n" and NUL go after
	const char *bufLast = buf + sizeof(buf) - 2;

	// the filter does not change while we scan, so decode it only once
	const char *wantMonth = NULL; // NULL means any month
	if ( strncasecmp(g_getSample.m_month,"xx",2) != 0 ) {
		int m = atoi(g_getSample.m_month);
		// an out-of-range month falls back to the "XXX" placeholder
		// instead of indexing outside of s_month
		if ( m < 0 || m > 12 ) m = 0;
		wantMonth = s_month[m];
	}
	bool anyDay  = ( strncasecmp(g_getSample.m_day ,"xx",2) == 0 );
	bool anyHour = ( strncasecmp(g_getSample.m_hour,"xx",2) == 0 );

	unsigned int found = 0;
	bool ok = true;
	while ( fgets(line,(int)sizeof(line),fdip) ) {
		// cut the line at its newline. if there is none the line did
		// not fit: keep the first 10k and throw the rest away so the
		// tail is not mistaken for a line of its own
		char *nl = strchr(line,'\n');
		if ( nl ) {
			*nl = '\0';
		}
		else {
			int c;
			while ( (c = fgetc(fdip)) != EOF && c != '\n' )
				;
		}
		// we only want the search requests
		const char *urlStart = strstr(line," GET /search");
		if ( ! urlStart )
			urlStart = strstr(line," POST /search");
		if ( ! urlStart )
			continue;
		// the request has to come after the fixed-width columns;
		// this also guarantees that the date columns really exist
		const char *ipStart = line + GS_IP_OFF;
		if ( urlStart < ipStart )
			continue;
		// check month, day and hour
		if ( wantMonth &&
		     strncasecmp(line+GS_MONTH_OFF,wantMonth,3) != 0 )
			continue;
		if ( ! anyDay &&
		     strncasecmp(line+GS_DAY_OFF,g_getSample.m_day,2) != 0 )
			continue;
		if ( ! anyHour &&
		     strncasecmp(line+GS_HOUR_OFF,g_getSample.m_hour,2) != 0 )
			continue;
		// we found a match. copy everything before the ip ...
		char *q = gsAppendBytes ( buf , bufLast , line , GS_IP_OFF );
		// ... then the remapped ip and a space
		uint32_t ipHash = hash32 ( ipStart ,
					   (int32_t)(urlStart - ipStart) );
		q = gsAppendHash  ( q , bufLast , ipHash , true );
		q = gsAppendBytes ( q , bufLast , " " , 1 );
		// remap code= and uip= in the order they occur in the line
		const char *code   = strstr(urlStart,"code=");
		const char *uip    = strstr(urlStart,"uip=");
		const char *cursor = urlStart;
		bool uipFirst = uip && ( ! code || uip < code );
		for ( int pass = 0 ; pass < 2 ; pass++ ) {
			bool doUip = ( ( pass == 0 ) == uipFirst );
			const char *name = doUip ? uip : code;
			// skip a parameter that is missing or that was
			// swallowed by the value that was just hashed
			if ( ! name || name < cursor )
				continue;
			q = gsAppendMaskedParam ( q , bufLast , &cursor ,
						  name , doUip ? 4 : 5 ,
						  doUip );
		}
		// copy everything after that and end the line
		q = gsAppendBytes ( q , bufLast , cursor , strlen(cursor) );
		*q++ = '\n';
		*q   = '\0';
		if ( fputs(buf,fdop) == EOF ) {
			fprintf(stderr,"getSample::write : %s\n",
				strerror(errno));
			ok = false;
			break;
		}
		found++;
	}
	if ( ferror(fdip) )
		ok = false;
	if (found>0)
		fprintf(stderr,"Found %u queries\n",found);
	return ok;
}
// Prints the command line help to stderr.
static void printUsage ( void ) {
	fprintf(stderr,"Usage: getsample [OPTION]... DATE \n");
	fprintf(stderr,"Output the queries from a gb log file"
		" of a particular date \n"
		"Eg. getsample -i /usr 06050302 \n");
	fprintf(stderr,"OPTION:\n"
		"-i Input is a directory containing log files\n"
		"-o Output to a directory with filename"
		"sample.DATE\n\n\n");
	fprintf(stderr,"\tDATE is in the form of yymmddhh, where "
		"each are 2 digits "
		"(zero padded) and can be skipped "
		"by putting 'xx' in place of them "
		"eg. 06020312, 03xx04xx, 04xxxxxx, xx08xx. \n");
	fprintf(stderr,"\tDATE can be replaced by the string "
		"'lasthour' which dumps the queries "
		"in the hour before the current hour\n");
}

// Returns true if the two characters at "s" are two digits or "xx".
static bool isDateField ( const char *s ) {
	if ( strncasecmp(s,"xx",2) == 0 ) return true;
	return isdigit((unsigned char)s[0]) && isdigit((unsigned char)s[1]);
}

// Returns true if "date" is a usable yymmddhh string: 8 characters, every
// pair either two digits or "xx", and a numeric month between 01 and 12.
static bool isValidDate ( const char *date ) {
	if ( strlen(date) != 8 ) return false;
	for ( int i = 0 ; i < 8 ; i += 2 )
		if ( ! isDateField(date+i) ) return false;
	if ( strncasecmp(date+2,"xx",2) != 0 ) {
		int month = (date[2]-'0')*10 + (date[3]-'0');
		if ( month < 1 || month > 12 ) return false;
	}
	return true;
}

// getsample [-i <logDir>] [-o <outputDir>] <yymmddhh | lasthour>
//
// Stores the requested date in g_getSample and runs processLog() either on
// stdin (no -i) or on every file in <logDir> whose name starts with "log"
// (editor backups ending in '~' are skipped). The sample goes to stdout (no
// -o) or to <outputDir>/sample.yymmddhh, which is compressed with gzip once
// it is complete.
// Returns 0 on success and -1 on any error, so the exit status can be used
// by a calling script.
int main ( int argc , char *argv[] ) {
	if ( argc < 2 ) {
		printUsage();
		return -1;
	}
	// the date is always the last argument; the options come before it
	const int dateArg = argc - 1;
	int ipArg = 0;
	int opArg = 0;
	for ( int i = 1 ; i < dateArg ; i++ ) {
		if(strncmp(argv[i],"-i",2)==0)
			ipArg=i+1;
		else if(strncmp(argv[i],"-o",2)==0)
			opArg=i+1;
	}
	// the directory of an option must not be missing and must not be
	// the date itself
	if ( ipArg >= dateArg || opArg >= dateArg ) {
		printUsage();
		return -1;
	}
	//check if it is yymmddhh or lasthour
	const char *date = argv[dateArg];
	if (strcmp(date,"lasthour")==0){
		// the current time less one hour (3600 secs)
		time_t rawTime = time(NULL) - 3600;
		// the timestamps in the log are UTC, so the filter has to be
		// UTC as well
		struct tm *timeInfo = gmtime(&rawTime);
		if ( ! timeInfo ) {
			fprintf(stderr,"getSample::gmtime : %s\n",
				strerror(errno));
			return -1;
		}
		// tm_year counts from 1900; keep the last two digits only so
		// the value always fits the two-character field
		snprintf(g_getSample.m_hour ,sizeof(g_getSample.m_hour ),
			 "%02d",timeInfo->tm_hour);
		snprintf(g_getSample.m_day  ,sizeof(g_getSample.m_day  ),
			 "%02d",timeInfo->tm_mday);
		snprintf(g_getSample.m_month,sizeof(g_getSample.m_month),
			 "%02d",timeInfo->tm_mon+1);
		snprintf(g_getSample.m_year ,sizeof(g_getSample.m_year ),
			 "%02d",timeInfo->tm_year % 100);
	}
	else{
		if ( ! isValidDate(date) ){
			fprintf(stderr,"yymmddhh are each 2 digits "
				"(zero padded) and can be skipped "
				"by putt 'xx' in place of them "
				"eg. xx020312, 03xx04xx, 04xxxxxx. \n");
			return -1;
		}
		//put the yymmddhh string into different vars, null ended
		memcpy(g_getSample.m_year ,date  ,2);
		memcpy(g_getSample.m_month,date+2,2);
		memcpy(g_getSample.m_day  ,date+4,2);
		memcpy(g_getSample.m_hour ,date+6,2);
		g_getSample.m_year[2]='\0';
		g_getSample.m_month[2]='\0';
		g_getSample.m_day[2]='\0';
		g_getSample.m_hour[2]='\0';
	}
	//would be good to print the date on stderr
	fprintf(stderr,"year=%s, month=%s, day=%s, hour=%s \n",
		g_getSample.m_year,
		g_getSample.m_month,
		g_getSample.m_day,
		g_getSample.m_hour);
	//if we have an output dir given, write to a file in it
	FILE *fdop = stdout;
	char fileop[1024];
	fileop[0] = '\0';
	if(opArg>0){
		int n = snprintf(fileop,sizeof(fileop),"%s/sample.%s%s%s%s",
				 argv[opArg],
				 g_getSample.m_year,
				 g_getSample.m_month,
				 g_getSample.m_day,
				 g_getSample.m_hour);
		if ( n < 0 || (size_t)n >= sizeof(fileop) ) {
			fprintf(stderr,"getSample::output path too long\n");
			return -1;
		}
		fdop=fopen(fileop,"w+");
		if (!fdop){
			fprintf(stderr,"getSample::open %s : %s\n",
				fileop,strerror(errno));
			return -1;
		}
	}
	int status = 0;
	if ( ! hashinit () ) {
		status = -1;
	}
	else if (ipArg==0){
		processLog(stdin,fdop);
	}
	else{
		// open the dir given after -i and scan for log files
		const char *dir = argv[ipArg];
		DIR *edir = opendir ( dir );
		if ( ! edir ) {
			fprintf ( stderr, "getSample::opendir (%s):%s\n",
				  dir,strerror( errno ) );
			status = -1;
		}
		// loop over all the log files in this directory. the date in
		// a file's name is not looked at, every log file is scanned
		struct dirent *ent;
		while ( edir && (ent = readdir ( edir )) ) {
			const char *filename = ent->d_name;
			if ( strncasecmp ( filename , "log" , 3 ) != 0 )
				continue;
			// skip if ends in a ~, it is an emacs backup file
			if ( filename[strlen(filename)-1] == '~' ) continue;
			// make a full filename
			char fileip[1024];
			int n = snprintf ( fileip , sizeof(fileip) ,
					   "%s/%s", dir,filename);
			if ( n < 0 || (size_t)n >= sizeof(fileip) ) {
				fprintf(stderr,"getSample::path too long: "
					"%s/%s\n",dir,filename);
				status = -1;
				break;
			}
			fprintf(stderr, "getSample::opening log file %s \n",
				fileip);
			FILE *fdip=fopen(fileip,"r");
			if (!fdip){
				fprintf(stderr,"getSample::open %s : %s\n",
					fileip,strerror(errno));
				status = -1;
				break;
			}
			// . we got one, process it
			processLog(fdip,fdop);
			//after processing close it
			fclose(fdip);
		}
		if ( edir ) closedir ( edir );
	}
	//if not writing to stdout, close the file and compress it
	if (fdop!=stdout){
		if ( fclose(fdop) != 0 ) {
			fprintf(stderr,"getSample::close %s : %s\n",
				fileop,strerror(errno));
			status = -1;
		}
		// an incomplete sample is left uncompressed
		if ( status == 0 ) {
			// the name is handed to the shell inside single
			// quotes, so it must not contain one itself
			if ( strchr(fileop,'\'') ) {
				fprintf(stderr,"getSample::cannot compress "
					"%s : name contains a quote\n",
					fileop);
				return -1;
			}
			//do a system call to gzip the file
			char tmp[sizeof(fileop)+16];
			snprintf(tmp,sizeof(tmp),"gzip '%s'",fileop);
			if ( system(tmp) != 0 ) {
				fprintf(stderr,"getSample::%s failed\n",tmp);
				status = -1;
			}
		}
	}
	return status;
}