open-source-search-engine/gbfilter.cpp
2013-08-02 13:12:24 -07:00

300 lines
8.8 KiB
C++

#include "gb-include.h"
#include <errno.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <ctype.h>
// . upper bound, in bytes, on how much of the input file main() will read
//   (currently 20MB); main() prints a warning when the input reaches it
// . original note: if g_conf.m_httpMaxReadSize is ever bigger than this
//   value, this should be inc'd
#define MAX_READ_SIZE (20*1024*1024)
// . the various content types, as returned by getContentType()
// . only CT_PDF, CT_DOC, CT_XLS, CT_PPT and CT_PS are ever detected and
//   filtered in this file; everything else is reported as CT_UNKNOWN
#define CT_UNKNOWN 0
#define CT_HTML 1
#define CT_TEXT 2
#define CT_XML 3
#define CT_PDF 4
#define CT_DOC 5
#define CT_XLS 6
#define CT_PPT 7
#define CT_PS 8
// . declare useful subroutines
// . "buf" is the mime + content, the whole HTTP reply gigabot received
// . "mime" is just the mime of the HTTP reply, the top portion of "buf"
// . getMimeLen() returns the length of the mime including the blank-line
//   boundary that ends it, or -1 if no boundary was found
long getMimeLen ( char *buf , long bufLen ) ;
// . getContentType() returns one of the CT_* values above
char getContentType ( char *mime , long mimeLen ) ;
// . filterContent() writes the mime and then the converted content to
//   stdout; returns 0 on success and -1 on error
// . "id" is used to name the temporary content file
int filterContent ( char *buf , long bufLen , long mimeLen , char ctype ,
long id ) ;
// . returns -1 on error, 0 on success
// . reads HTTP reply from filename given as argument, filters it,
//   and then writes it to stdout
// . originally, we read from stdin, but popen was causing problems when called
//   from a thread on linux 2.4.17 with the old linux threads
// . only pdf, msword, excel, powerpoint and postscript replies produce
//   output; any other content type yields 0 bytes on stdout and returns 0
int main ( int argc , char *argv[] ) {
	// should have one and only 1 arg (excluding filename)
	if ( argc != 2 ) {
		fprintf(stderr,"gbfilter: usage: gbfilter <inputfilename>\n");
		return -1;
	}
	// . read HTTP reply in from file, gigablast will give it to us there
	// . this should be the HTTP mime followed by the content
	// . keep one spare byte past the data so that buf[n] is always
	//   addressable, even when the file fills the whole buffer and the
	//   mime extends to the very end of the data
	char *buf = (char *)malloc ( MAX_READ_SIZE + 1 );
	if ( ! buf ) {
		fprintf(stderr,"gbfilter:malloc: %s: %s\n",
			argv[1],strerror(errno));
		return -1;
	}
	// first and only arg is the input file to read from
	int fd = open ( argv[1] , O_RDONLY );
	if ( fd < 0 ) {
		fprintf(stderr,"gbfilter:open: %s: %s\n",
			argv[1],strerror(errno));
		free ( buf );
		return -1;
	}
	// . a single read() may legally return fewer bytes than are
	//   available, so keep reading until EOF or until the buffer is full
	long n = 0;
	while ( n < MAX_READ_SIZE ) {
		ssize_t r = read ( fd , buf + n , (size_t)(MAX_READ_SIZE - n) );
		// retry if a signal interrupted us before any data arrived
		if ( r < 0 && errno == EINTR ) continue;
		if ( r < 0 ) {
			// save errno, close() is allowed to change it
			int err = errno;
			close ( fd );
			fprintf(stderr,"gbfilter:fread: %s\n",strerror(err));
			free ( buf );
			return -1;
		}
		// end of file
		if ( r == 0 ) break;
		n += r;
	}
	close ( fd );
	// warn if the doc was bigger than expected
	if ( n >= MAX_READ_SIZE )
		fprintf(stderr,"gbfilter: WARNING: MAX_READ_SIZE "
			"needs boost\n");
	// if nothing came in then nothing goes out, we're done
	if ( n == 0 ) { free ( buf ) ; return 0; }
	// get the end of the mime of this HTTP reply
	long mimeLen = getMimeLen ( buf , n );
	// if it is -1, no mime boundary was found, so return an error
	if ( mimeLen < 0 ) {
		fprintf(stderr,"gbfilter: no mime boundary\n");
		free ( buf );
		return -1;
	}
	// . get the id from the input filename
	// . use that for our tmp files as well so parent caller can remove
	//   our cruft if we core
	// . the id is the first run of digits found anywhere in the given
	//   path, or 0 if the path contains no digit
	// . NOTE(review): digits in a directory component would be picked
	//   up before those in the file name -- confirm against the caller
	char *p = argv[1];
	// cast: passing a negative char to isdigit() is undefined
	while ( *p && ! isdigit ( (unsigned char)*p ) ) p++;
	long id = strtol ( p , NULL , 10 );
	// ... begin filter logic here ...
	// get the content type (the various types are #define'd above)
	char ctype = getContentType ( buf , mimeLen );
	if ( ctype == CT_PDF ||
	     ctype == CT_DOC ||
	     ctype == CT_XLS ||
	     ctype == CT_PPT ||
	     ctype == CT_PS  ) {
		int status = filterContent ( buf, n, mimeLen, ctype, id );
		free ( buf );
		return status;
	}
	// ... end filter logic here ...
	// . if not filtered we do NOT echo the input back
	// . we deliberately write 0 bytes to stdout and report success
	free ( buf );
	return 0;
}
// . scans "buf" for the blank line that terminates the HTTP mime
// . recognized boundaries are \r\r , \n\n , \r\n\r\n and \n\r\n\r
// . returns the number of bytes up to and including that boundary
// . returns -1 if no boundary found
long getMimeLen ( char *buf , long bufLen ) {
	// offset of the candidate boundary
	long pos = 0;
	// length of the boundary that was matched at "pos"
	long sepLen = 0;
	for ( ; pos < bufLen ; pos++ ) {
		const char *q = buf + pos;
		// a boundary can only start on a \r or a \n
		if ( q[0] != '\r' && q[0] != '\n' ) continue;
		// how many bytes are left, counting the one at "pos"
		long left = bufLen - pos;
		// need at least two bytes for the short forms
		if ( left < 2 ) continue;
		// \r\r or \n\n : same line ending twice in a row
		if ( q[1] == q[0] ) { sepLen = 2; break; }
		// need at least four bytes for the long forms
		if ( left < 4 ) continue;
		// . \r\n\r\n or \n\r\n\r : an alternating pair, repeated
		// . q[1] is known to differ from q[0] here, so it only has
		//   to be the other line-ending character
		if ( ( q[1] == '\r' || q[1] == '\n' ) &&
		     q[2] == q[0] &&
		     q[3] == q[1] ) { sepLen = 4; break; }
	}
	// ran off the end without seeing a boundary
	if ( pos == bufLen ) return -1;
	return pos + sepLen;
}
// . get content-type
// . looks for a "Content-Type:" field in the first "mimeLen" bytes of "mime"
//   and maps its value to one of the CT_* constants
// . the field name is matched case-insensitively, in any capitalization
// . the value is matched case-insensitively as a prefix, so trailing
//   parameters like "; charset=..." are ignored
// . only pdf, msword, excel, powerpoint and postscript are recognized,
//   everything else (including text/html) is reported as CT_UNKNOWN
// . "mime" is only read, never written, and never accessed at or beyond
//   mime[mimeLen]
char getContentType ( char *mime , long mimeLen ) {
	// the field we are looking for
	static const char field[] = "content-type:";
	const long fieldLen = (long)sizeof(field) - 1;
	// the field values we know how to filter
	static const struct {
		const char *name;
		char        type;
	} types[] = {
		{ "application/pdf"          , CT_PDF },
		{ "application/msword"       , CT_DOC },
		{ "application/vnd.ms-excel" , CT_XLS },
		{ "application/mspowerpoint" , CT_PPT },
		{ "application/postscript"   , CT_PS  }
	};
	const long numTypes = (long)(sizeof(types) / sizeof(types[0]));
	// nothing to look at
	if ( ! mime || mimeLen <= 0 ) return CT_UNKNOWN;
	char *mimeEnd = mime + mimeLen;
	// . find the first occurrence of the field name in the mime
	// . the scan is bounded by mimeLen so the buffer needs no
	//   NUL termination
	char *s = NULL;
	for ( char *p = mime ; mimeEnd - p >= fieldLen ; p++ ) {
		if ( strncasecmp ( p , field , (size_t)fieldLen ) != 0 )
			continue;
		// skip to field data
		s = p + fieldLen;
		break;
	}
	// if no content-type specified, it's unknown
	if ( ! s ) return CT_UNKNOWN;
	// skip spaces
	while ( s < mimeEnd && ( *s == ' ' || *s == '\t' ) ) s++;
	// if s passed end, we had no field data
	if ( s >= mimeEnd ) return CT_UNKNOWN;
	// bytes of field data available
	long left = mimeEnd - s;
	for ( long i = 0 ; i < numTypes ; i++ ) {
		long len = (long)strlen ( types[i].name );
		// the whole type name must fit inside the mime
		if ( left < len ) continue;
		if ( strncasecmp ( s , types[i].name , (size_t)len ) == 0 )
			return types[i].type;
	}
	// otherwise assume unknown even though may be text/html, etc.
	return CT_UNKNOWN;
}
// . writes the mime (the first "mimeLen" bytes of "buf") to stdout unaltered,
//   then runs an external converter on the remaining n - mimeLen bytes of
//   content; the converter inherits our stdout and writes its output there
// . "ctype" selects the converter and must be one of CT_PDF, CT_DOC, CT_XLS,
//   CT_PPT or CT_PS
// . the content is first saved to $HOME/content.<id> so the converter can
//   read it from disk; the converters are run from $HOME as well
// . returns 0 on success, -1 on error
// . the exit status of the converter itself is not inspected
int filterContent ( char *buf , long n , long mimeLen , char ctype , long id) {
	// sanity check so the pointer math below stays inside "buf"
	if ( ! buf || mimeLen < 0 || mimeLen > n ) {
		fprintf(stderr,"gbfilter: bad mime length\n");
		return -1;
	}
	// . different programs (and options) filter different ctypes
	// . pdftohtml options:
	//   -i      : ignore images
	//   -stdout : send output to stdout
	//   -c      : generate complex document (not used)
	// . Google generates complex docs, but the large ones are horribly
	//   slow in the browser, but docs with 2 cols don't display right
	//   w/o -c. damn, -stdout doesn't work when -c is specified.
	const char *tool = NULL;
	if      ( ctype == CT_PDF ) tool = "pdftohtml -q -i -noframes -stdout";
	else if ( ctype == CT_DOC ) tool = "antiword";
	else if ( ctype == CT_XLS ) tool = "xlhtml";
	else if ( ctype == CT_PPT ) tool = "ppthtml";
	else if ( ctype == CT_PS  ) tool = "pstotext";
	// refuse to run an undefined command for any other type
	if ( ! tool ) {
		fprintf(stderr,"gbfilter: unsupported content type %i\n",
			(int)ctype);
		return -1;
	}
	// this is set on the call from gigablast server
	char *wdir = getenv ( "HOME" );
	if ( ! wdir ) {
		fprintf(stderr,"gbfilter: HOME is not set\n");
		return -1;
	}
	// . name of the file the content is saved to so pdftohtml, etc. can
	//   work with it
	// . snprintf() never writes past the buffer; a result >= the buffer
	//   size means the name was truncated and must not be used
	char in[1024];
	int len = snprintf ( in , sizeof(in) , "%s/content.%li" , wdir , id );
	if ( len < 0 || (size_t)len >= sizeof(in) ) {
		fprintf(stderr,"gbfilter: HOME is too long\n");
		return -1;
	}
	// . the shell command that runs the converter
	// . these ulimit sizes are max virtual memory in kilobytes and max
	//   cpu time in seconds
	char cmd[4096];
	len = snprintf ( cmd , sizeof(cmd) ,
			 "ulimit -v 25000 -t 30 ; nice -n 19 %s/%s %s" ,
			 wdir , tool , in );
	if ( len < 0 || (size_t)len >= sizeof(cmd) ) {
		fprintf(stderr,"gbfilter: command is too long\n");
		return -1;
	}
	// write mime to stdout unaltered
	size_t w = fwrite ( buf , 1 , (size_t)mimeLen , stdout );
	if ( w != (size_t)mimeLen ) {
		// note any errors
		fprintf(stderr,"gbfilter: fwrite: %s\n",strerror(errno));
		return -1;
	}
	// flush it so it comes first, before filtered content
	if ( fflush ( stdout ) != 0 ) {
		fprintf(stderr,"gbfilter: fflush: %s\n",strerror(errno));
		return -1;
	}
	// . O_TRUNC so a stale, longer file left behind by an earlier run
	//   cannot leave trailing garbage after our content
	int fd = open ( in , O_CREAT | O_RDWR | O_TRUNC , S_IRWXU );
	if ( fd < 0 ) {
		fprintf(stderr,"gbfilter: open: %s\n",strerror(errno));
		return -1;
	}
	// . save the content, everything after the mime
	// . write() may write less than asked for, so loop until done
	const char *src  = buf + mimeLen;
	long        left = n - mimeLen;
	while ( left > 0 ) {
		ssize_t wr = write ( fd , src , (size_t)left );
		// retry if a signal interrupted us before anything was written
		if ( wr < 0 && errno == EINTR ) continue;
		if ( wr <= 0 ) {
			// save errno, close() is allowed to change it
			int err = errno;
			close ( fd );
			if ( wr < 0 )
				fprintf(stderr,"gbfilter: write: %s\n",
					strerror(err));
			else
				fprintf(stderr,"gbfilter: write: "
					"wrote 0 bytes\n");
			unlink ( in );
			return -1;
		}
		src  += wr;
		left -= wr;
	}
	// a failed close() can mean the data never made it to disk
	if ( close ( fd ) != 0 ) {
		fprintf(stderr,"gbfilter: close: %s\n",strerror(errno));
		unlink ( in );
		return -1;
	}
	// . open a pipe to the converter program
	// . we send it nothing; its output goes straight to our stdout
	FILE *pd = popen ( cmd , "w" );
	if ( ! pd ) {
		fprintf(stderr,"gbfilter: popen: %s\n",strerror(errno));
		unlink ( in );
		return -1;
	}
	// . wait for the converter to finish
	// . its exit status is not checked, same as before
	// . NOTE(review): consider reporting a failed conversion here
	pclose ( pd );
	fflush ( stdout );
	// clean up the binary file from disk
	if ( unlink ( in ) == 0 ) return 0;
	fprintf(stderr,"gbfilter: unlink (%s): %s\n",in,strerror(errno));
	// ignore it, since it was not a processing error per se
	errno = 0;
	return 0;
}