#include "gb-include.h" #include #include #include #include #include #include // . we should not read in more than 1M from input file // . if g_conf.m_httpMaxReadSize is ever bigger than 1M, this should be inc'd #define MAX_READ_SIZE (20*1024*1024) // the various content types #define CT_UNKNOWN 0 #define CT_HTML 1 #define CT_TEXT 2 #define CT_XML 3 #define CT_PDF 4 #define CT_DOC 5 #define CT_XLS 6 #define CT_PPT 7 #define CT_PS 8 // . declare useful subroutines // . "buf" is the mime + content, the whole HTTP reply gigabot received // . "mime" is just the mime of the HTTP reply, the top portion of "buf" long getMimeLen ( char *buf , long bufLen ) ; char getContentType ( char *mime , long mimeLen ) ; int filterContent ( char *buf , long bufLen , long mimeLen , char ctype , long id ) ; // . returns -1 on error, 0 on success // . reads HTTP reply from filename given as argument, filters it, // and then writes it to stdout // . originally, we read from stdin, but popen was causing problems when called // from a thread on linux 2.4.17 with the old linux threads int main ( int argc , char *argv[] ) { // should have one and only 1 arg (excluding filename) if ( argc != 2 ) { fprintf(stderr,"gbfilter: usage: gbfilter \n"); return -1; } // . read HTTP reply in from file, gigablast will give it to us there // . this should be the HTTP mime followed by the content char *buf = (char *)malloc ( MAX_READ_SIZE ); if ( ! buf ) { fprintf(stderr,"gbfilter:malloc:%s: %s: %s\n", argv[1],strerror(errno)); return -1; } // first and only arg is the input file to read from int fd = open ( argv[1] , O_RDONLY ); if ( fd < 0 ) { fprintf(stderr,"gbfilter:open: %s: %s\n", argv[1],strerror(errno)); free ( buf ); return -1; } int n = read ( fd , buf , MAX_READ_SIZE ); close ( fd ); // return -1 on read error if ( n < 0 ) { fprintf(stderr,"gbfilter:fread: %s\n",strerror(errno)); free ( buf ); return -1; } // warn if the doc was bigger than expected if ( n >= MAX_READ_SIZE ) fprintf(stderr,"gbfilter: WARNING: MAX_READ_SIZE " "needs boost\n"); //sleep(45); //srand(time(NULL)); //long i = rand() % 30; //fprintf(stderr,"sleep(%li)\n",i); //sleep(i); // if nothing came in then nothing goes out, we're done if ( n == 0 ) { free ( buf ) ; return 0; } // get the end of the mime of this HTTP reply long mimeLen = getMimeLen ( buf , n ); // if it is -1, no mime boundary was found, so return an error if ( mimeLen < 0 ) { fprintf(stderr,"gbfilter: no mime boundary\n"); free ( buf ); return -1; } // . get the id from the input filename // . use that for out tmp files as well so parent caller can remove // our cruft if we core long id ; char *p = argv[1]; // get id in the file while ( *p && ! isdigit(*p) ) p++; id = atol ( p ); // ... begin filter logic here ... // get the content type (the various types are #define'd above) char ctype = getContentType ( buf , mimeLen ); bool filter = false; if ( ctype == CT_PDF ) filter = true ; if ( ctype == CT_DOC ) filter = true ; if ( ctype == CT_XLS ) filter = true ; if ( ctype == CT_PPT ) filter = true ; if ( ctype == CT_PS ) filter = true ; if ( filter ) { int status = filterContent ( buf, n, mimeLen, ctype, id ); free ( buf ); return status; } // ... end filter logic here ... // if not filtered, write the input to stdout unaltered // no! make it 0 bytes! //long w = fwrite ( buf , 1 , n , stdout ); //if ( w == n ) { free ( buf ) ; return 0; } free ( buf ); return 0; // note any errors fprintf(stderr,"gbfilter: fwrite: %s\n",strerror(errno)); free ( buf ); return -1; } // returns -1 if no boundary found long getMimeLen ( char *buf , long bufLen ) { // size of the boundary long bsize = 0; // find the boundary long i; for ( i = 0 ; i < bufLen ; i++ ) { // continue until we hit a \r or \n if ( buf[i] != '\r' && buf[i] != '\n' ) continue; // boundary check if ( i + 1 >= bufLen ) continue; // prepare for a smaller mime size bsize = 2; // \r\r if ( buf[i ] == '\r' && buf[i+1] == '\r' ) break; // \n\n if ( buf[i ] == '\n' && buf[i+1] == '\n' ) break; // boundary check if ( i + 3 >= bufLen ) continue; // prepare for a larger mime size bsize = 4; // \r\n\r\n if ( buf[i ] == '\r' && buf[i+1] == '\n' && buf[i+2] == '\r' && buf[i+3] == '\n' ) break; // \n\r\n\r if ( buf[i ] == '\n' && buf[i+1] == '\r' && buf[i+2] == '\n' && buf[i+3] == '\r' ) break; } // return false if could not find the end of the MIME if ( i == bufLen ) return -1; return i + bsize; } // get content-type char getContentType ( char *mime , long mimeLen ) { // temp null terminate so we can call strstr char c = mime [ mimeLen ]; mime [ mimeLen ] = '\0'; // find "content-type:" field in mime char *s = strstr ( mime , "Content-Type:" ); if ( ! s ) s = strstr ( mime , "content-type:" ); if ( ! s ) s = strstr ( mime , "Content-type:" ); if ( ! s ) s = strstr ( mime , "CONTENT-TYPE:" ); // set back mime [ mimeLen ] = c; // if no content-type specified, it's unknown if ( ! s ) return CT_UNKNOWN ; // otherwise, is it application/pdf ? char *mimeEnd = mime + mimeLen; // skip to field data s += 13; // skip spaces while ( s < mimeEnd && (*s == ' ' || *s == '\t') ) s++; // if s passed end, we had no field data, assume not pdf if ( s >= mimeEnd ) return CT_UNKNOWN ; // is it pdf? if ( s + 15 < mimeEnd && strncasecmp ( s , "application/pdf" , 15 ) == 0 ) return CT_PDF; // it it word? if ( s + 18 < mimeEnd && strncasecmp ( s , "application/msword",18 ) == 0 ) return CT_DOC; // it it xls? if ( s + 24 < mimeEnd && strncasecmp ( s , "application/vnd.ms-excel",24 ) == 0 ) return CT_XLS; // it it ppt? if ( s + 24 < mimeEnd && strncasecmp ( s , "application/mspowerpoint",24 ) == 0 ) return CT_PPT; // it it ps? if ( s + 22 < mimeEnd && strncasecmp ( s , "application/postscript",22 ) == 0 ) return CT_PS; // otherwise assume unknown even though may be text/html, etc. return CT_UNKNOWN; } int filterContent ( char *buf , long n , long mimeLen , char ctype , long id) { // write mime to stdout unaltered int w = fwrite ( buf , 1 , mimeLen , stdout ); if ( w != mimeLen ) { // note any errors fprintf(stderr,"gbfilter: fwrite: %s\n",strerror(errno)); return -1; } // flush it so it comes first, before filtered content fflush ( stdout ); // this is set on the call from gigablast server char *wdir = getenv ("HOME" ); // save the content to a file so pdftohtml,etc. can work with it char in[64]; sprintf ( in , "%s/content.%li", wdir , id ); // (long)getpid() ); //fprintf(stderr,"in=%s\n",in); int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU ); if ( fd < 0 ) { fprintf(stderr,"gbfilter: open: %s\n",strerror(errno)); return -1; } long b = n - mimeLen ; if ( write ( fd , buf + mimeLen , b ) != b ) { close ( fd ); fprintf(stderr,"gbfilter: write: %s\n",strerror(errno)); unlink ( in ); return -1; } close(fd); // . open a pipe to pdf2html program // . the output will go to stdout char cmd[128]; // different commands to filter differt ctypes // -i : ignore images // -stdout: send output to stdout // -c : generate complex document // Google generates complex docs, but the large ones are horribly slow // in the browser, but docs with 2 cols don't display right w/o -c. // damn, -stdout doesn't work when -c is specified. // These ulimit sizes are max virtual memory in kilobytes. let's // keep them to 25 Megabytes if ( ctype == CT_PDF ) sprintf ( cmd , "ulimit -v 25000 -t 30 ; nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s", wdir , in ); else if ( ctype == CT_DOC ) sprintf ( cmd , "ulimit -v 25000 -t 30 ; nice -n 19 %s/antiword %s" , wdir , in ); else if ( ctype == CT_XLS ) sprintf ( cmd , "ulimit -v 25000 -t 30 ; nice -n 19 %s/xlhtml %s" , wdir , in ); else if ( ctype == CT_PPT ) sprintf ( cmd , "ulimit -v 25000 -t 30 ; nice -n 19 %s/ppthtml %s" , wdir , in ); else if ( ctype == CT_PS ) sprintf ( cmd , "ulimit -v 25000 -t 30; nice -n 19 %s/pstotext %s" , wdir , in ); // don't use too much memory, i think xhtml uses so much that it // swaps out all the gb processes? //struct rlimit lim; //lim.rlim_cur = lim.rlim_max = 24 * 1024 * 1024 ; //if ( setrlimit ( RLIMIT_AS , &lim ) ) // fprintf (stderr,"gbfilter:setrlimit: %s", strerror(errno) ); FILE *pd = popen ( cmd , "w" ); if ( ! pd ) { fprintf(stderr,"gbfilter: popen: %s\n",strerror(errno)); unlink ( in ); return -1; } // success pclose(pd); fflush ( stdout ); // clean up the binary file from disk if ( unlink ( in ) == 0 ) return 0; fprintf(stderr,"gbfilter: unlink (%s): %s\n",in,strerror(errno)); // ignore it, since it was not a processing error per se errno = 0; return 0; }