From faf2c06d29d712123144f5276f73a1370bffcc59 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 28 Apr 2015 22:30:58 -0700 Subject: [PATCH] some fixes for indexing warcs/arcs. --- HttpMime.cpp | 24 ++++++- HttpMime.h | 5 ++ HttpServer.cpp | 190 +++++++++++++++++++++++++++++++------------------ Makefile | 2 +- XmlDoc.cpp | 13 +++- 5 files changed, 161 insertions(+), 73 deletions(-) diff --git a/HttpMime.cpp b/HttpMime.cpp index 08f67625..853be0f2 100644 --- a/HttpMime.cpp +++ b/HttpMime.cpp @@ -45,6 +45,7 @@ void HttpMime::reset ( ) { m_locationFieldLen = 0; m_contentEncodingPos = NULL; m_contentLengthPos = NULL; + m_contentTypePos = NULL; } // . returns false if could not get a valid mime @@ -157,8 +158,12 @@ bool HttpMime::parse ( char *mime , int32_t mimeLen , Url *url ) { time_t now = time(NULL); if (m_lastModifiedDate > now) m_lastModifiedDate = now; } - else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) + else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) { m_contentType = getContentTypePrivate ( p + 13 ); + char *s = p + 13; + while ( *s == ' ' || *s == '\t' ) s++; + m_contentTypePos = s; + } else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) { m_cookie = p + 11; if ( m_cookie[0] == ' ' ) m_cookie++; @@ -540,6 +545,7 @@ int32_t getContentTypeFromStr ( char *s ) { else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE; else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS; else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS; + else if (!strcasecmp(s,"application/x-gzip" ) ) ct = CT_GZ; else if (!strcasecmp(s,"text/javascript" ) ) ct = CT_JS; else if (!strcasecmp(s,"text/x-js" ) ) ct = CT_JS; else if (!strcasecmp(s,"text/js" ) ) ct = CT_JS; @@ -626,6 +632,17 @@ void resetHttpMime ( ) { s_mimeTable.reset(); } +const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) { + // assume text/html if no extension provided + if ( ! ext || ! ext[0] ) return NULL; + if ( elen <= 0 ) return NULL; + // get hash for table look up + int32_t key = hash32 ( ext , elen ); + char **pp = (char **)s_mimeTable.getValue ( &key ); + if ( ! pp ) return NULL; + return *pp; +} + const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) { // assume text/html if no extension provided if ( ! ext || ! ext[0] ) return "text/html"; @@ -1051,7 +1068,10 @@ static char *s_ext[] = { "xwd" , "image/x-xwindowdump", "xyz" , "chemical/x-pdb", "zip" , "application/zip" , - "xpi", "application/x-xpinstall" + "xpi", "application/x-xpinstall", + // newstuff + "warc", "application/warc", + "arc", "application/arc" }; // . init s_mimeTable in this call diff --git a/HttpMime.h b/HttpMime.h index 881f49bb..32f408e4 100644 --- a/HttpMime.h +++ b/HttpMime.h @@ -9,6 +9,8 @@ // convert application/json to CT_JSON for instance int32_t getContentTypeFromStr ( char *s ) ; +const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) ; + #include void getTime ( char *s , int *sec , int *min , int *hour ) ; @@ -42,6 +44,7 @@ time_t atotime5 ( char *s ) ; #define CT_JSON 16 #define CT_IMAGE 17 #define CT_STATUS 18 // an internal type indicating spider reply +#define CT_GZ 19 #define ET_IDENTITY 0 #define ET_GZIP 1 @@ -127,6 +130,7 @@ class HttpMime { int32_t getContentEncoding () {return m_contentEncoding;} char *getContentEncodingPos() {return m_contentEncodingPos;} char *getContentLengthPos() {return m_contentLengthPos;} + char *getContentTypePos() {return m_contentTypePos;} // private: @@ -166,6 +170,7 @@ class HttpMime { int32_t m_contentEncoding; char *m_contentEncodingPos; char *m_contentLengthPos; + char *m_contentTypePos; // the size of the terminating boundary, either 1 or 2 bytes. // just the last \n in the case of a \n\n or \r in the case diff --git a/HttpServer.cpp b/HttpServer.cpp index 814a95b6..4a061cf9 100644 --- a/HttpServer.cpp +++ b/HttpServer.cpp @@ -1035,19 +1035,21 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) { if ( strncmp ( path , "/download/", 10 ) == 0 ) return sendBackDump ( s , r ); - if ( strncmp ( path , "/iagbcoll/" , 10 ) == 0 ) { + if ( strncmp ( path , "/gbiaitem/" , 10 ) == 0 ) { SafeBuf cmd; char *iaItem = path + 10; char c = iaItem[pathLen]; iaItem[pathLen] = '\0'; // iaItem is like "webgroup-20100422114008-00011" // print out the warc files as if they were urls - // so we can spider them through the spider pipeline as-is - cmd.safePrintf("%s/ia list %s --glob='*arc.gz' | " + // so we can spider them through the spider pipeline as-is. + // this hack only works on internet archive servers + // that have the '/home/mwells/ia' obviously + cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | " "awk '{print \"\"}' > ./tmpiaout" - , g_hostdb.m_dir + //, g_hostdb.m_dir ,iaItem ,iaItem ); @@ -2388,6 +2390,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) { } // if has no content then it must end in \n\r\n\r or \r\n\r\n if ( ! hasContent ) return bufSize; + // look for a Content-Type: field because we now limit how much // we read based on this char *p = buf; @@ -2411,45 +2414,71 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) { // as well index that at least. if ( p + 15 < pend && strncasecmp( p,"application/pdf",15)==0) allOrNothing = true; + if ( p + 15 < pend&&strncasecmp(p,"application/x-gzip",18)==0) + allOrNothing = true; // adjust "max to read" if we don't have an html/plain doc if ( ! isPost ) { max = s->m_maxOtherDocLen + 10*1024 ; if ( s->m_maxOtherDocLen == -1 ) max = 0x7fffffff; } } + + // // if it is a warc or arc.gz allow it for now but we should + // // only allow one spider at a time per host + if ( s->m_sendBuf ) { + char *p = s->m_sendBuf; + char *pend = p + s->m_sendBufSize; + if ( strncmp(p,"GET /",5) == 0 ) p += 4; + // find end of url we are getting + char *e = p; + for ( ; *e && e < pend && ! is_wspace_a(*e) ; e++ ); + if ( e - 8 > p && strncmp(e-8,".warc.gz", 8 ) == 0 ) + max = 0x7fffffff; + if ( e - 7 > p && strncmp(e-7, ".arc.gz", 7 ) == 0 ) + max = 0x7fffffff; + } + + int32_t contentSize = 0; + int32_t totalReplySize = 0; + // now look for Content-Length in the mime - for ( int32_t j = 0; j < i ; j++ ) { + int32_t j; for ( j = 0; j < i ; j++ ) { if ( buf[j] != 'c' && buf[j] != 'C' ) continue; if ( j + 16 >= i ) break; if ( strncasecmp ( &buf[j], "Content-Length:" , 15 ) != 0 ) continue; - int32_t contentSize = atol2 ( &buf[j+15] , i - (j+15) ); - int32_t totalReplySize = contentSize + mimeSize ; - // all-or-nothing filter - if ( totalReplySize > max && allOrNothing ) { - log(LOG_INFO, - "http: pdf reply/request size of %"INT32" is larger " - "than limit of %"INT32". Cutoff pdf's are useless. " - "Abandoning.",totalReplySize,max); - // do not read any more than what we have - return bufSize; - } - // warn if we received a post that was truncated - if ( totalReplySize > max && isPost ) { - log("http: Truncated POST request from %"INT32" " - "to %"INT32" bytes. Increase \"max other/text doc " - "len\" in Spider Controls page to prevent this.", - totalReplySize,max); - } - // truncate the reply if we have to - if ( totalReplySize > max ) { - log("http: truncating reply of %"INT32" to %"INT32" bytes", - totalReplySize,max); - totalReplySize = max; - } - // truncate if we need to - return totalReplySize; + contentSize = atol2 ( &buf[j+15] , i - (j+15) ); + totalReplySize = contentSize + mimeSize ; + break; } + + // all-or-nothing filter + if ( totalReplySize > max && allOrNothing ) { + log(LOG_INFO, + "http: reply/request size of %"INT32" is larger " + "than limit of %"INT32". Cutoff documents " + "of this type are useless. " + "Abandoning.",totalReplySize,max); + // do not read any more than what we have + return bufSize; + } + // warn if we received a post that was truncated + if ( totalReplySize > max && isPost ) { + log("http: Truncated POST request from %"INT32" " + "to %"INT32" bytes. Increase \"max other/text doc " + "len\" in Spider Controls page to prevent this.", + totalReplySize,max); + } + // truncate the reply if we have to + if ( totalReplySize > max ) { + log("http: truncating reply of %"INT32" to %"INT32" bytes", + totalReplySize,max); + totalReplySize = max; + } + // truncate if we need to + if ( totalReplySize ) + return totalReplySize; + // if it is a POST request with content but no content length... // we don't know how big it is... if ( isPost ) { @@ -2880,16 +2909,32 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) { // so we need to rewrite the Content-Length: and the // Content-Encoding: http mime field values so they are no longer // "gzip" and use the uncompressed content-length. - char *ptr1 = NULL; - char *ptr2 = NULL; - if(mime.getContentEncodingPos() && - mime.getContentEncodingPos() < mime.getContentLengthPos()) { - ptr1 = mime.getContentEncodingPos(); - ptr2 = mime.getContentLengthPos(); - } - else { - ptr1 = mime.getContentLengthPos(); - ptr2 = mime.getContentEncodingPos(); + char *ptr1 = mime.getContentEncodingPos(); + char *ptr2 = mime.getContentLengthPos(); + char *ptr3 = NULL; + + // change the content type based on the extension before the + // .gz extension since we are uncompressing it + char *p = s->m_readBuf + 4; + char *pend = s->m_readBuf + s->m_readBufSize; + const char *newCT = NULL; + char *lastPeriod = NULL; + // get the extension, if any, before the .gz + for ( ; *p && ! is_wspace_a(*p) && p < pend ; p++ ) { + if ( p[0] != '.' ) continue; + if ( p[1] != 'g' ) { lastPeriod = p; continue; } + if ( p[2] != 'z' ) { lastPeriod = p; continue; } + if ( ! is_wspace_a(p[3]) ) { lastPeriod = p; continue; } + // no prev? + if ( ! lastPeriod ) break; + // back up + newCT = extensionToContentTypeStr2 (lastPeriod,p-lastPeriod); + // this is NULL if the file extension is unrecognized + if ( ! newCT ) break; + // this should be like text/html or + // WARC/html or something like that... + ptr3 = mime.getContentTypePos(); + break; } // this was writing a number at the start of the mime and messing @@ -2901,38 +2946,45 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) { char *src = s->m_readBuf; // sometimes they are missing Content-Length: - if ( ptr1 ) { - // copy ptr1 to src - gbmemcpy ( pnew, src, ptr1 - src ); - pnew += ptr1 - src; - src += ptr1 - src; - // store either the new content encoding or new length - if(ptr1 == mime.getContentEncodingPos()) - pnew += sprintf(pnew, " identity"); - else - pnew += sprintf(pnew, " %"INT32"",newSize); - // scan to \r\n at end of that line we replace - while ( *src != '\r' && *src != '\n') src++; - } - if ( ptr2 ) { - // copy ptr2 to src - gbmemcpy ( pnew , src , ptr2 - src ); - pnew += ptr2 - src; - src += ptr2 - src; - // now insert the new shit - if(ptr2 == mime.getContentEncodingPos()) - pnew += sprintf(pnew, " identity"); - else - pnew += sprintf(pnew, " %"INT32"",newSize); - // scan to \r\n at end of that line we replace - while ( *src != '\r' && *src != '\n') src++; + subloop: + + char *nextMin = (char *)-1; + if ( ptr1 && (ptr1 < nextMin || nextMin==(char *)-1)) nextMin = ptr1; + if ( ptr2 && (ptr2 < nextMin || nextMin==(char *)-1)) nextMin = ptr2; + if ( ptr3 && (ptr3 < nextMin || nextMin==(char *)-1)) nextMin = ptr3; + + // if all ptrs are NULL then copy the tail + if ( nextMin == (char *)-1 ) nextMin = mimeEnd; + + // copy ptr1 to src + gbmemcpy ( pnew, src, nextMin - src ); + pnew += nextMin - src; + src += nextMin - src; + // store either the new content encoding or new length + if ( nextMin == mime.getContentEncodingPos()) { + pnew += sprintf(pnew, " identity"); + ptr1 = NULL; } + else if ( nextMin == mime.getContentLengthPos() ) { + pnew += sprintf(pnew, " %"INT32"",newSize); + ptr2 = NULL; + } + else if ( nextMin == mime.getContentTypePos() ) { + pnew += sprintf(pnew," %s",newCT); + ptr3 = NULL; + } + // scan to \r\n at end of that line we replace + while ( *src != '\r' && *src != '\n') src++; + + // loop for more + if ( nextMin < mimeEnd ) goto subloop; + // copy the rest - gbmemcpy ( pnew , src , mimeEnd - src ); - pnew += mimeEnd - src; - src += mimeEnd - src; + // gbmemcpy ( pnew , src , mimeEnd - src ); + // pnew += mimeEnd - src; + // src += mimeEnd - src; // before restLen was negative because we were skipping over diff --git a/Makefile b/Makefile index eb1b3f63..41e4a168 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ OS_DEB := true STATIC := -static # MDW: i get some parsing inconsistencies when running the first qa injection # test if this is -O3. strange. -XMLDOCOPT := -O2 +XMLDOCOPT := -O0 endif diff --git a/XmlDoc.cpp b/XmlDoc.cpp index fde133ed..fae5c442 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -2720,6 +2720,10 @@ bool XmlDoc::indexDoc2 ( ) { // scan it using delimeters. the file consists of multiple documents // separated by this content delimeter. if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) { + // we need the doc + char **replyPtr = getHttpReply (); + if ( ! replyPtr ) return true; + if ( replyPtr == (void *)-1 ) return false; // already called inject? if ( m_calledWarcInject ) // then we are done @@ -10196,7 +10200,14 @@ Url **XmlDoc::getRedirUrl() { // http-equiv refresh tag, but that added an element of // recursion that is just too confusing to deal with. so // let's just parse out the meta tag by hand - if ( ! isRobotsTxt ) { + bool checkMeta = true; + if ( isRobotsTxt ) checkMeta = false; + // warc and arc files have a list of html docs + // in them that we need to index, so skip this check + // for them as well + if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) + checkMeta = false; + if ( checkMeta ) { Url **mrup = getMetaRedirUrl(); if ( ! mrup || mrup == (void *)-1) return (Url **)mrup; // set it. might be NULL if not there.