open-source-search-engine/filterquerylogs.cpp
2013-08-02 13:12:24 -07:00

268 lines
6.1 KiB
C++

#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
// upper bound, in bytes, on how much of the log file main() reads into memory
#define MAX_READ_SIZE 2000000100
// number of slots in the ring of recent query hashes that main() scans
// to detect repeated queries
#define MAX_HASHES 1000
// lookup table read by hash32(), indexed as [byte offset][byte value].
// main() fills it in before any hashing is done.
unsigned long long g_hashtab[256][256];
// . xor together one g_hashtab entry for each of the bytes s[0..len-1]
// . the row is the byte's offset truncated to 8 bits, the column is the
//   byte itself taken as an unsigned value
// . each entry is narrowed to unsigned long before it is folded in
// . returns 0 if len <= 0
unsigned long hash32 ( char *s, long len ) {
	unsigned long acc = 0;
	for ( long pos = 0 ; pos < len ; pos++ ) {
		unsigned char row = (unsigned char)pos;
		unsigned char col = (unsigned char)s[pos];
		acc ^= (unsigned long)g_hashtab[row][col];
	}
	return acc;
}
// . convert the ip address text in s[0..slen-1] to its numeric form
// . s does not have to be NUL terminated; only the first slen bytes (at
//   most 1023 of them) are looked at
// . returns the s_addr value that inet_aton() stored
// . returns 0 if s is NULL, if slen <= 0, if inet_aton() rejects the text,
//   or if struct in_addr is not 4 bytes wide
long atoip ( char *s , long slen ) {
	// nothing to parse. this also keeps a negative length out of memcpy.
	if ( ! s || slen <= 0 ) return 0;
	// always parse a private NUL terminated copy. peeking at s[slen] to
	// see if the caller already terminated the text would read one byte
	// beyond the region we were told about.
	char buf[1024];
	if ( slen >= (long)sizeof(buf) ) slen = (long)sizeof(buf) - 1;
	memcpy ( buf , s , (size_t)slen );
	buf [ slen ] = '\0';
	// convert to int
	struct in_addr in;
	in.s_addr = 0;
	// inet_aton() returns 0 when it cannot parse the text
	if ( ! inet_aton ( buf , &in ) ) return 0;
	// ensure this really is 4 bytes before returning the ip
	if ( sizeof(in) != 4 ) return 0;
	return in.s_addr;
}
// . returns -1 on error, 0 on success
// . reads the query log named by the one command line argument, pulls the
//   "q=" parameter out of every url in it, normalizes it in place and writes
//   one "<first three octets of ip> <query>" line to stdout per query kept
// . a query is dropped if it contains %22, %3a, %3A or '|', if it is longer
//   than 150 bytes, if it is empty after normalizing, if its hash matches one
//   of the last MAX_HASHES hashes seen, or if no ip with three dots is
//   found two whitespace-separated fields before the url
// . originally, we read from stdin, but popen was causing problems when called
//   from a thread on linux 2.4.17 with the old linux threads
int main ( int argc , char *argv[] ) {
	// should have one and only 1 arg (excluding program name)
	if ( argc != 2 ) {
		fprintf(stderr,"usage: fql <querylogfilename>\n");
		return -1;
	}
	// each log file should be <= 2GB. the extra byte holds the NUL we
	// append after the data so the parser always finds a terminator.
	char *buf = (char *)malloc ( (size_t)MAX_READ_SIZE + 1 );
	if ( ! buf ) {
		fprintf(stderr,"fql:malloc:%li: %s\n",
			(long)MAX_READ_SIZE,strerror(errno));
		return -1;
	}
	// seed with same value so we get same rand sequence for all
	srand ( 1945687 );
	for ( long i = 0 ; i < 256 ; i++ )
	for ( long j = 0 ; j < 256 ; j++ ) {
		g_hashtab [i][j]  = (unsigned long long)rand();
		// the top bit never gets set, so fix
		if ( rand() > (0x7fffffff / 2) )
			g_hashtab[i][j] |= 0x80000000;
		g_hashtab [i][j] <<= 32;
		g_hashtab [i][j] |= (unsigned long long)rand();
		// the top bit never gets set, so fix
		if ( rand() > (0x7fffffff / 2) )
			g_hashtab[i][j] |= 0x80000000;
	}
	// the table must come out as expected. say so instead of quietly
	// exiting with a success status.
	if ( g_hashtab[0][0] != 6720717044602784129LL ) {
		fprintf(stderr,"fql: rand() did not produce the expected "
			"hash table\n");
		free ( buf );
		return -1;
	}
	fprintf(stderr,"fql: reading %s\n", argv[1]);
	// first and only arg is the input file to read from
	int fd = open ( argv[1] , O_RDONLY );
	if ( fd < 0 ) {
		fprintf(stderr,"fql:open: %s: %s\n",
			argv[1],strerror(errno));
		free ( buf );
		return -1;
	}
	// one read() may hand back less than we asked for, so keep going
	// until end of file or until the buffer is full
	long n = 0;
	int readErr = 0;
	while ( n < (long)MAX_READ_SIZE ) {
		ssize_t got = read ( fd , buf + n ,
				     (size_t)((long)MAX_READ_SIZE - n) );
		if ( got < 0 && errno == EINTR ) continue;
		if ( got < 0 ) {
			// save it, close() and fprintf() may change errno
			readErr = errno;
			n = -1;
			break;
		}
		if ( got == 0 ) break;
		n += (long)got;
	}
	close ( fd );
	fprintf(stderr,"fql: done reading %s\n", argv[1]);
	// return -1 on read error
	if ( n < 0 ) {
		fprintf(stderr,"fql:fread: %s\n",strerror(readErr));
		free ( buf );
		return -1;
	}
	// warn if the doc was bigger than expected
	if ( n >= (long)MAX_READ_SIZE )
		fprintf(stderr,"fql: WARNING: MAX_READ_SIZE "
			"needs boost\n");
	// if nothing came in then nothing goes out, we're done
	if ( n == 0 ) { free ( buf ) ; return 0; }
	// terminate what we read and remember where it stops
	buf[n] = '\0';
	char *bufEnd = buf + n;
	// store last 1000 hashes in a ring. clear the whole array, a long
	// is not 4 bytes everywhere.
	long hashes[MAX_HASHES];
	memset ( hashes, 0 , sizeof(hashes) );
	long nh = 0;
	// parse out query from each url. stop at the end of the data or at
	// the first NUL, whichever comes first. the end-of-data test is
	// needed because the p++ below can step over the final NUL.
	for ( char *p = buf ; p < bufEnd && *p ; p++ ) {
		if ( p[0] != '?' && p[0] != '&' ) continue;
		if ( p[1] != 'q' ) continue;
		if ( p[2] != '=' ) continue;
		p += 3;
		// mark the end
		char *end = p;
		bool good = true;
		for ( ; *end && *end!='&' && *end!='\n' && *end!=' '; end++ ) {
			// double quote?
			if ( *end == '%' &&
			     end[1] == '2' &&
			     end[2] == '2' ) {
				good = false;
				break;
			}
			// colon or pipe operators, ignore
			if ( *end == '|') {
				good = false;
				break;
			}
			if ( *end == '%' &&
			     end[1] == '3' &&
			     end[2] == 'a' ) {
				good = false;
				break;
			}
			if ( *end == '%' &&
			     end[1] == '3' &&
			     end[2] == 'A' ) {
				good = false;
				break;
			}
		}
		// filter out?
		if ( ! good ) continue;
		// limit size. 150 is too big.
		if ( end - p > 150 ) continue;
		// the query gets shortened in place below, so remember where
		// the untouched input picks up again
		char *rawEnd = end;
		// scan backwards to get ip. it is the field two separators
		// before the url: skip back over the url, then over the field
		// in front of it, then over the ip itself.
		char *ips = p;
		for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- );
		if ( ips>buf ) ips--;
		for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- );
		char *ipend = ips;
		if ( ips>buf ) ips--;
		for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- );
		// step off the separator we stopped on. if we ran into the
		// start of the buffer instead there is no separator to skip.
		if ( *ips == ' ' || *ips == '\t' ) ips++;
		// should be ip now!
		long iplen = ipend - ips;
		// must be ip #. cast because isdigit() is only defined for
		// unsigned char values and EOF.
		if ( !isdigit((unsigned char)ips[0]) ) continue;
		// replace comma with '+', the url encoding of a space
		for ( char *r = p ; r < end ; r++ ) {
			if ( *r == ',' ) *r = '+';
		}
		// collapse each %20 into a single '+', shifting the rest down
		char *dst2 = p;
		for ( char *r = p ; r < end ; r++ ) {
			*dst2 = *r;
			if ( *r == '%' &&
			     r[1] == '2' &&
			     r[2] == '0' ) {
				*dst2 = '+';
				r += 2;
			}
			dst2++;
		}
		end = dst2;
		char *query = p;
		// filter out back to back spaces
		char *dst = p;
		bool lastWasSpace = false;
		for ( char *x = p ; x < end ; x++ ) {
			// skip back to back spaces
			if ( *x == '+' && lastWasSpace ) continue;
			// skip initial spaces
			if ( x == p && *x == '+' ) {
				lastWasSpace = true;
				continue;
			}
			*dst++ = *x;
			lastWasSpace = ( *x == '+' );
		}
		// null term the overwritten buffer
		*dst = '\0';
		// get the length of the query
		long queryLen = dst - p;
		// skip that for the for loop. if the query shrank, the bytes
		// between dst and rawEnd are stale leftovers that must not
		// be parsed again, so resume on the delimiter at rawEnd.
		// otherwise the delimiter itself was just overwritten with
		// the NUL and the loop's p++ steps over it.
		p = ( dst < rawEnd ) ? rawEnd - 1 : dst;
		// skip empty queries
		if ( queryLen==0 ) continue;
		// hash it up
		long h = hash32(query,queryLen);
		for ( long i = 0 ; i < MAX_HASHES ; i++ ) {
			if ( hashes[i] == h ) { good = false; break; }
		}
		// remember it even if it was a repeat
		hashes[nh] = h;
		// inc and wrap
		if ( ++nh >= MAX_HASHES ) nh = 0;
		// filter out?
		if ( ! good ) continue;
		// cblock it: cut the ip off at its third dot
		int dotCount = 0;
		for ( long k = 0 ; k < iplen ; k++ ) {
			if ( ips[k] != '.' ) continue;
			if ( ++dotCount < 3 ) continue;
			ips[k] = '\0';
			break;
		}
		if ( dotCount != 3 ) continue;
		// write that out
		fprintf(stdout,"%s %s\n",ips,query);
	}
	free ( buf );
	return 0;
}