open-source-search-engine/numwords.cpp
2014-11-10 14:45:11 -08:00

148 lines
3.2 KiB
C++

#include "gb-include.h"
#include <sys/time.h> // gettimeofday()
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <ctype.h>
int32_t urlDecode ( char *dest , char *s , int32_t slen ) ;
int32_t htob ( char s ) ;
main ( int argc , char *argv[] ) {
// usage check
if ( argc != 3 ) {
fprintf(stderr,"numwords <filename> <numQueryWords>\n");
exit(-1);
}
// set File class
char *fname = argv[1];
int fd = open ( fname , O_RDONLY ) ;
// # words
int32_t nw = atoi ( argv[2] );
// open file
if ( fd < 0 ) {
fprintf(stderr,"numwords::open: %s %s",fname,strerror(errno));
exit(-1);
}
// get file size
struct stat stats;
stats.st_size = 0;
stat ( fname , &stats );
int32_t fileSize = stats.st_size;
// store a \0 at the end
int32_t bufSize = fileSize + 1;
// make buffer to hold all
char *buf = (char *) malloc ( bufSize );
if ( ! buf) {
fprintf(stderr,"numwords::malloc: %s",strerror(errno));
exit(-1);
}
char *bufEnd = buf + bufSize;
// read em all in
int32_t nr = read ( fd , buf , fileSize ) ;
if ( nr != fileSize ) {
fprintf(stderr,"numwords::read: %s %s",fname,strerror(errno));
exit(-1);
}
close ( fd );
fprintf(stderr,"read %"INT32" bytes\n",nr);
// store ptrs to each line
char *lines[100000];
// count lines in the file, which is now in memory, buf
int32_t n = 0;
// set first line
lines[n++] = buf;
// set lines array
char *p = buf;
for ( int32_t i = 0 ; i < bufSize ; i++ )
if ( i > 0 && buf[i-1] == '\n' ) {
lines[n++] = &buf[i];
// set to a \0
buf[i-1] = '\0';
}
fprintf(stderr,"read %"INT32" lines\n",n);
// mix the lines up
for ( int32_t i = 0 ; i < n ; i++ ) {
// switch line #i with a random line
int32_t j = rand() % n;
char *tmp = lines[i];
lines[i] = lines[j];
lines[j] = tmp;
}
char sbuf [2048];
// output to stdout
for ( int32_t i = 0 ; i < n ; i++ ) {
char *s = strstr(lines[i],"q=");
if ( ! s ) continue;
s+= 2;
char *start = s;
// decode
int32_t nd = urlDecode ( sbuf , s , gbstrlen(s) );
sbuf[nd] = '\0';
// count words
s = sbuf;
int32_t n = 0;
int32_t punct = 1;
for ( ; *s ; s++ ) {
//if ( *s == '%' ) {
// if ( *(s+1)=='2' && *(s+1)=='0' )
//}
if ( isalnum(*s) ) {
if ( punct == 1 ) { punct = 0; n++; }
continue;
}
if ( s == start ) continue;
if ( ispunct(*(s-1)) ) continue;
punct = 1;
}
// does it match?
if ( n == nw ) printf("%s\n", lines[i]);
}
}
// . decodes "s/slen" and stores into "dest"
// . returns the number of bytes stored into "dest"
int32_t urlDecode ( char *dest , char *s , int32_t slen ) {
int32_t j = 0;
for ( int32_t i = 0 ; i < slen ; i++ ) {
if ( s[i] == '+' ) { dest[j++]=' '; continue; }
dest[j++] = s[i];
if ( s[i] != '%' ) continue;
if ( i + 2 >= slen ) continue;
// convert hex chars to values
unsigned char a = htob ( s[i+1] ) * 16;
unsigned char b = htob ( s[i+2] ) ;
dest[j-1] = (char) (a + b);
i += 2;
}
return j;
}
// convert hex digit to value
int32_t htob ( char s ) {
if ( isdigit(s) ) return s - '0';
if ( s >= 'a' && s <= 'f' ) return (s - 'a') + 10;
if ( s >= 'A' && s <= 'F' ) return (s - 'A') + 10;
return 0;
}