open-source-search-engine/rmbots.cpp
mwells 87285ba3cd use gbmemcpy not memcpy so we can get profiler working again
since memcpy can't be interrupted and backtrace() called.
2015-01-13 12:25:42 -07:00

201 lines
4.7 KiB
C++

#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
// maximum number of bytes main() reads from the bot ip file
#define MAX_READ_SIZE 10000000
// number of slots in the table of bot ip hashes that main() builds
#define MAX_HASHES 100000
// . table of 64-bit values indexed by [byte position][byte value]
// . zero until main() fills it in from rand()
uint64_t g_hashtab[256][256];

// . hash the first "len" bytes of "s" down to 32 bits
// . each byte picks one table entry using its position (truncated to 8 bits)
//   as the row and its value as the column; the low 32 bits of the picked
//   entries are XORed together
// . returns 0 when len <= 0
uint32_t hash32 ( char *s, int32_t len ) {
	uint32_t acc = 0;
	for ( int32_t pos = 0 ; pos < len ; pos++ ) {
		unsigned char row = (unsigned char)pos;
		unsigned char col = (unsigned char)s[pos];
		acc ^= (uint32_t)g_hashtab[row][col];
	}
	return acc;
}
// . convert the first "slen" bytes of "s" from text into an IPv4 address
// . the value returned is in.s_addr as filled in by inet_aton(), i.e. the
//   address in network byte order
// . "s" must be readable through s[slen]: that byte is inspected to see
//   whether the string already ends there
// . inputs of 1024 bytes or more are truncated to 1023 bytes before parsing
// . returns 0 if "s" is NULL, slen is negative, or the text is not an ip
int32_t atoip ( char *s , int32_t slen ) {
	// lives at function scope because "p" may still point into it when
	// inet_aton() is called below
	char buf[1024];
	struct in_addr in;
	char *p = s;
	if ( ! s || slen < 0 ) return 0;
	// if the ip is followed by more text, copy it out and NUL terminate it
	if ( s[slen] ) {
		if ( slen >= 1024 ) slen = 1023;
		// plain memcpy: string.h is the only copy routine this file includes
		memcpy ( buf , s , (size_t)slen );
		buf [ slen ] = '\0';
		p = buf;
	}
	in.s_addr = 0;
	// inet_aton() returns 0 when the text is not a valid address
	if ( ! inet_aton ( p , &in ) ) return 0;
	// ensure the address really fits in an int32_t before returning it
	if ( sizeof in.s_addr != 4 ) return 0;
	return (int32_t)in.s_addr;
}
// . returns -1 on error, 0 on success
// . reads HTTP reply from filename given as argument, filters it,
// and then writes it to stdout
// . originally, we read from stdin, but popen was causing problems when called
// from a thread on linux 2.4.17 with the old linux threads
int main ( int argc , char *argv[] ) {
// should have one and only 1 arg (excluding filename)
if ( argc != 2 ) {
fprintf(stderr,"usage: rmbots <fileofbotips>\n");
return -1;
}
// each log file should be <= 2GB
char *buf = (char *)malloc ( MAX_READ_SIZE );
if ( ! buf ) {
fprintf(stderr,"fql:malloc:li: %s: %s\n",
(int32_t)MAX_READ_SIZE,strerror(errno));
return -1;
}
// seed with same value so we get same rand sequence for all
srand ( 1945687 );
for ( int32_t i = 0 ; i < 256 ; i++ )
for ( int32_t j = 0 ; j < 256 ; j++ ) {
g_hashtab [i][j] = (uint64_t)rand();
// the top bit never gets set, so fix
if ( rand() > (0x7fffffff / 2) )
g_hashtab[i][j] |= 0x80000000;
g_hashtab [i][j] <<= 32;
g_hashtab [i][j] |= (uint64_t)rand();
// the top bit never gets set, so fix
if ( rand() > (0x7fffffff / 2) )
g_hashtab[i][j] |= 0x80000000;
}
if ( g_hashtab[0][0] != 6720717044602784129LL ) return false;
// first and only arg is the input file to read from
int fd = open ( argv[1] , O_RDONLY );
if ( fd < 0 ) {
fprintf(stderr,"rmbots:open: %s: %s\n",
argv[1],strerror(errno));
free ( buf );
return -1;
}
int n = read ( fd , buf , MAX_READ_SIZE );
close ( fd );
// return -1 on read error
if ( n < 0 ) {
fprintf(stderr,"rmbots:fread: %s\n",strerror(errno));
free ( buf );
return -1;
}
// warn if the doc was bigger than expected
if ( n >= (int32_t)MAX_READ_SIZE )
fprintf(stderr,"rmbots: WARNING: MAX_READ_SIZE "
"needs boost\n");
// if nothing came in then nothing goes out, we're done
if ( n == 0 ) { free ( buf ) ; return 0; }
// store last 1000 hashes in a ring
int32_t hashes[MAX_HASHES];
memset ( hashes, 0 , MAX_HASHES * 4 );
int32_t nh = 0;
// parse out query from each url
char *p = buf;
for ( ; *p ; p++ ) {
// mark the end
char *end = p;
for ( ; *end && *end!='\n' ; end++ ) ;
// set it
char *ip = p;
// advance p for next call
if ( *end == '\n' ) p = end;
// should be ip now!
int32_t iplen = end - ip;
//int32_t uip = atoip(ips,ipend-ips);
//if ( ! uip ) continue;
// must be ip #
if ( !isdigit(ip[0]) ) continue;
// skip empty ip lines
if ( iplen == 0 ) continue;
// hash it up
uint32_t h = hash32(ip,iplen);
uint32_t n = h % MAX_HASHES;
for ( ; ; ) {
if ( hashes[n] == h ) break;
if ( hashes[n] == 0 ) break;
if ( ++n >= MAX_HASHES ) n = 0;
}
// store it
hashes[n] = h;
}
// now read stdin and filter out the line if it contains
// the bot ip!!
char line[5000];
while ( fgets ( line , 5000 , stdin ) ) {
char *p = line;
bool skip = false;
// scan line for uip
for ( ; *p ; p++ ) {
if ( p[0] != 'u' ) continue;
if ( p[1] != 'i' ) continue;
if ( p[2] != 'p' ) continue;
if ( p[3] != '=' ) continue;
char *ip = p + 4;
// find end of ip
char *end = ip;
for ( ; *end &&*end!='\n'&&*end!='&'; end++);
// get len
int32_t iplen = end - ip;
// hash it now
uint32_t h = hash32(ip,iplen);
uint32_t n = h % MAX_HASHES;
// skip if none
//if (iplen == 0 ) goto printit;
// find it in has htable
for ( ; ; ) {
if ( hashes[n] == h ) break;
if ( hashes[n] == 0 ) break;
if ( ++n >= MAX_HASHES ) n = 0;
}
// skip printing it
if ( hashes[n] == h ) {
skip = true;
break;
}
}
// skip printing it cuz its a bot?
if ( skip )
continue;
// print it now
fprintf(stdout,"%s",line);
}
return 0;
}