some fixes for indexing warcs/arcs.

2024-10-04 12:17:35 +03:00 · 2015-04-28 22:30:58 -07:00 · 2015-04-28 22:30:58 -07:00 · faf2c06d29
commit faf2c06d29
parent b6ff0b0173
5 changed files with 161 additions and 73 deletions
--- a/HttpMime.cpp
+++ b/HttpMime.cpp
@ -45,6 +45,7 @@ void HttpMime::reset ( ) {
 	m_locationFieldLen = 0;
 	m_contentEncodingPos = NULL;
 	m_contentLengthPos = NULL;
+	m_contentTypePos   = NULL;
 }

 // . returns false if could not get a valid mime
@ -157,8 +158,12 @@ bool HttpMime::parse ( char *mime , int32_t mimeLen , Url *url ) {
 			time_t now = time(NULL);
 			if (m_lastModifiedDate > now) m_lastModifiedDate = now;
 		}
-		else if ( strncasecmp ( p , "Content-Type:"   ,13) == 0 ) 
+		else if ( strncasecmp ( p , "Content-Type:"   ,13) == 0 ) {
 			m_contentType = getContentTypePrivate ( p + 13 );
+			char *s = p + 13;
+			while ( *s == ' ' || *s == '\t' ) s++;
+			m_contentTypePos = s;
+		}
 		else if ( strncasecmp ( p , "Set-Cookie:"   ,10) == 0 ) {
 			m_cookie = p + 11;
 			if ( m_cookie[0] == ' ' ) m_cookie++;
@ -540,6 +545,7 @@ int32_t getContentTypeFromStr ( char *s ) {
        else if (!strncasecmp(s,"image/",6               ) ) ct = CT_IMAGE;
 	else if (!strcasecmp(s,"application/javascript"  ) ) ct = CT_JS;
 	else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
+	else if (!strcasecmp(s,"application/x-gzip"      ) ) ct = CT_GZ;
 	else if (!strcasecmp(s,"text/javascript"         ) ) ct = CT_JS;
 	else if (!strcasecmp(s,"text/x-js"               ) ) ct = CT_JS;
 	else if (!strcasecmp(s,"text/js"                 ) ) ct = CT_JS;
@ -626,6 +632,17 @@ void resetHttpMime ( ) {
 	s_mimeTable.reset();
 }

+const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) {
+	// return NULL, not "text/html", if no extension provided
+	if ( ! ext || ! ext[0] ) return NULL;
+	if ( elen <= 0 ) return NULL;
+	// hash the extension for the s_mimeTable look up; NULL if not found
+	int32_t key = hash32 ( ext , elen );
+	char **pp = (char **)s_mimeTable.getValue ( &key );
+	if ( ! pp ) return NULL;
+	return *pp;
+}
+
 const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) {
 	// assume text/html if no extension provided
 	if ( ! ext || ! ext[0] ) return "text/html";
@ -1051,7 +1068,10 @@ static char *s_ext[] = {
     "xwd" , "image/x-xwindowdump",
     "xyz" , "chemical/x-pdb",
      "zip" , "application/zip" ,
-      "xpi", "application/x-xpinstall"
+      "xpi", "application/x-xpinstall",
+      // newstuff
+      "warc", "application/warc",
+      "arc", "application/arc"
 };

 // . init s_mimeTable in this call
--- a/HttpMime.h
+++ b/HttpMime.h
@ -9,6 +9,8 @@
 // convert application/json to CT_JSON for instance
 int32_t getContentTypeFromStr ( char *s ) ;

+const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) ;
+
 #include <time.h>

 void   getTime    ( char *s , int *sec , int *min , int *hour ) ;
@ -42,6 +44,7 @@ time_t atotime5   ( char *s ) ;
 #define CT_JSON   16
 #define CT_IMAGE  17
 #define CT_STATUS 18 // an internal type indicating spider reply
+#define CT_GZ     19

 #define ET_IDENTITY 0
 #define ET_GZIP 1
@ -127,6 +130,7 @@ class HttpMime {
 	int32_t  getContentEncoding () {return m_contentEncoding;}
 	char *getContentEncodingPos() {return m_contentEncodingPos;}
 	char *getContentLengthPos()      {return m_contentLengthPos;}
+	char *getContentTypePos()      {return m_contentTypePos;}


 	// private:
@ -166,6 +170,7 @@ class HttpMime {
 	int32_t    m_contentEncoding;
 	char   *m_contentEncodingPos;
 	char   *m_contentLengthPos;
+	char   *m_contentTypePos;

 	// the size of the terminating boundary, either 1 or 2 bytes.
 	// just the last \n in the case of a \n\n or \r in the case
--- a/HttpServer.cpp
+++ b/HttpServer.cpp
@ -1035,19 +1035,21 @@ bool HttpServer::sendReply ( TcpSocket  *s , HttpRequest *r , bool isAdmin) {
 	if ( strncmp ( path , "/download/", 10 ) == 0 )
 		return sendBackDump ( s , r );

-	if ( strncmp ( path , "/iagbcoll/" , 10 ) == 0 ) {
+	if ( strncmp ( path , "/gbiaitem/" , 10 ) == 0 ) {
 		SafeBuf cmd;
 		char *iaItem = path + 10;
 		char c = iaItem[pathLen];
 		iaItem[pathLen] = '\0';
 		// iaItem is like "webgroup-20100422114008-00011"
 		// print out the warc files as if they were urls
-		// so we can spider them through the spider pipeline as-is
-		cmd.safePrintf("%s/ia list %s --glob='*arc.gz' | "
+		// so we can spider them through the spider pipeline as-is.
+		// this hack only works on internet archive servers
+		// that have the '/home/mwells/ia' obviously
+		cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
 			       "awk '{print \"<a "
 			       "href=http://archive.org/download/"
 			       "%s/\" $1\">\"}' > ./tmpiaout"
-			       , g_hostdb.m_dir
+			       //, g_hostdb.m_dir
 			       ,iaItem
 			       ,iaItem
 			       );
@ -2388,6 +2390,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
 	}
 	// if has no content then it must end  in \n\r\n\r or \r\n\r\n
 	if ( ! hasContent ) return bufSize;
+
 	// look for a Content-Type: field because we now limit how much
 	// we read based on this
 	char *p          = buf;
@ -2411,45 +2414,71 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
 		//   as well index that at least.
 		if ( p + 15 < pend && strncasecmp( p,"application/pdf",15)==0)
 			allOrNothing = true;
+		if ( p + 15 < pend&&strncasecmp(p,"application/x-gzip",18)==0)
+			allOrNothing = true;
 		// adjust "max to read" if we don't have an html/plain doc
 		if ( ! isPost ) {
 			max = s->m_maxOtherDocLen + 10*1024 ;
 			if ( s->m_maxOtherDocLen == -1 ) max = 0x7fffffff;
 		}
 	}
+
+	// if it is a .warc.gz or .arc.gz allow any size for now, but we
+	// should only allow one spider at a time per host
+	if ( s->m_sendBuf ) {
+		char *p = s->m_sendBuf;
+		char *pend = p + s->m_sendBufSize;
+		if ( strncmp(p,"GET /",5) == 0 ) p += 4;
+		// find end of url we are getting
+		char *e = p;
+		for ( ; *e && e < pend && ! is_wspace_a(*e) ; e++ );
+		if ( e - 8 > p && strncmp(e-8,".warc.gz", 8 ) == 0 )
+			max = 0x7fffffff;
+		if ( e - 7 > p && strncmp(e-7, ".arc.gz", 7 ) == 0 )
+			max = 0x7fffffff;
+	}
+
+	int32_t contentSize = 0;
+	int32_t totalReplySize = 0;
+
 	// now look for Content-Length in the mime
-	for ( int32_t j = 0; j < i ; j++ ) {
+	int32_t j; for ( j = 0; j < i ; j++ ) {
 		if ( buf[j] != 'c' && buf[j] != 'C' ) continue;
 		if ( j + 16 >= i ) break;
 		if ( strncasecmp ( &buf[j], "Content-Length:" , 15 ) != 0 )
 			continue;
-		int32_t contentSize = atol2 ( &buf[j+15] , i - (j+15) );
-		int32_t totalReplySize = contentSize + mimeSize ;
-		// all-or-nothing filter
-		if ( totalReplySize > max && allOrNothing ) {
-			log(LOG_INFO,
-			    "http: pdf reply/request size of %"INT32" is larger "
-			    "than limit of %"INT32". Cutoff pdf's are useless. "
-			    "Abandoning.",totalReplySize,max);
-			// do not read any more than what we have
-			return bufSize;
-		}
-		// warn if we received a post that was truncated
-		if ( totalReplySize > max && isPost ) {
-			log("http: Truncated POST request from %"INT32" "
-			    "to %"INT32" bytes. Increase \"max other/text doc "
-			    "len\" in Spider Controls page to prevent this.",
-			    totalReplySize,max);
-		}
-		// truncate the reply if we have to
-		if ( totalReplySize > max ) {
-			log("http: truncating reply of %"INT32" to %"INT32" bytes",
-			    totalReplySize,max);
-			totalReplySize = max;
-		}
-		// truncate if we need to
-		return totalReplySize;
+		contentSize = atol2 ( &buf[j+15] , i - (j+15) );
+		totalReplySize = contentSize + mimeSize ;
+		break;
 	}
+
+	// all-or-nothing filter
+	if ( totalReplySize > max && allOrNothing ) {
+		log(LOG_INFO,
+		    "http: reply/request size of %"INT32" is larger "
+		    "than limit of %"INT32". Cutoff documents "
+		    "of this type are useless. "
+		    "Abandoning.",totalReplySize,max);
+		// do not read any more than what we have
+		return bufSize;
+	}
+	// warn if we received a post that was truncated
+	if ( totalReplySize > max && isPost ) {
+		log("http: Truncated POST request from %"INT32" "
+		    "to %"INT32" bytes. Increase \"max other/text doc "
+		    "len\" in Spider Controls page to prevent this.",
+		    totalReplySize,max);
+	}
+	// truncate the reply if we have to
+	if ( totalReplySize > max ) {
+		log("http: truncating reply of %"INT32" to %"INT32" bytes",
+		    totalReplySize,max);
+		totalReplySize = max;
+	}
+	// truncate if we need to
+	if ( totalReplySize )
+		return totalReplySize;
+
 	// if it is a POST request with content but no content length...
 	// we don't know how big it is...
 	if ( isPost ) {
@ -2880,16 +2909,32 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
 	// so we need to rewrite the Content-Length: and the 
 	// Content-Encoding: http mime field values so they are no longer
 	// "gzip" and use the uncompressed content-length.
-	char *ptr1 = NULL;
-	char *ptr2 = NULL;
-	if(mime.getContentEncodingPos() &&
-	   mime.getContentEncodingPos() < mime.getContentLengthPos()) {
-		ptr1 = mime.getContentEncodingPos();
-		ptr2 = mime.getContentLengthPos();
-	}
-	else {
-		ptr1 = mime.getContentLengthPos();
-		ptr2 = mime.getContentEncodingPos();
+	char *ptr1 = mime.getContentEncodingPos();
+	char *ptr2 = mime.getContentLengthPos();
+	char *ptr3 = NULL;
+
+	// change the content type based on the extension before the
+	// .gz extension since we are uncompressing it
+	char *p = s->m_readBuf + 4;
+	char *pend = s->m_readBuf + s->m_readBufSize;
+	const char *newCT = NULL;
+	char *lastPeriod = NULL;
+	// get the extension, if any, before the .gz
+	for ( ; *p && ! is_wspace_a(*p) && p < pend ; p++ ) {
+		if ( p[0] != '.' ) continue;
+		if ( p[1] != 'g' ) { lastPeriod = p; continue; }
+		if ( p[2] != 'z' ) { lastPeriod = p; continue; }
+		if ( ! is_wspace_a(p[3]) ) { lastPeriod = p; continue; }
+		// no prev?
+		if ( ! lastPeriod ) break;
+		// look up the extension before ".gz"; span starts at the '.'
+		newCT = extensionToContentTypeStr2 (lastPeriod,p-lastPeriod);
+		// this is NULL if the file extension is unrecognized
+		if ( ! newCT ) break;
+		// newCT is a mime type string like "application/warc".
+		// note where the Content-Type value starts so we can replace it
+		ptr3 = mime.getContentTypePos();
+		break;
 	}

 	// this was writing a number at the start of the mime and messing
@ -2901,38 +2946,45 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
 	char *src = s->m_readBuf;

 	// sometimes they are missing Content-Length:
-	if ( ptr1 ) {
-		// copy ptr1 to src
-		gbmemcpy ( pnew, src, ptr1 - src );
-		pnew += ptr1 - src;
-		src  += ptr1 - src;
-		// store either the new content encoding or new length
-		if(ptr1 == mime.getContentEncodingPos())
-			pnew += sprintf(pnew, " identity");
-		else	
-			pnew += sprintf(pnew, " %"INT32"",newSize);
-		// scan to \r\n at end of that line we replace
-		while ( *src != '\r' && *src != '\n') src++;
-	}

-	if ( ptr2 ) {
-		// copy ptr2 to src
-		gbmemcpy ( pnew , src , ptr2 - src );
-		pnew += ptr2 - src;
-		src  += ptr2 - src;
-		// now insert the new shit
-		if(ptr2 == mime.getContentEncodingPos())
-			pnew += sprintf(pnew, " identity");
-		else	
-			pnew += sprintf(pnew, " %"INT32"",newSize);
-		// scan to \r\n at end of that line we replace
-		while ( *src != '\r' && *src != '\n') src++;
+ subloop:
+
+	char *nextMin = (char *)-1;
+	if ( ptr1 && (ptr1 < nextMin || nextMin==(char *)-1)) nextMin = ptr1;
+	if ( ptr2 && (ptr2 < nextMin || nextMin==(char *)-1)) nextMin = ptr2;
+	if ( ptr3 && (ptr3 < nextMin || nextMin==(char *)-1)) nextMin = ptr3;
+
+	// if all ptrs are NULL then copy the tail
+	if ( nextMin == (char *)-1 ) nextMin = mimeEnd;
+
+	// copy the bytes from src up to nextMin into pnew
+	gbmemcpy ( pnew, src, nextMin - src );
+	pnew += nextMin - src;
+	src  += nextMin - src;
+	// store either the new content encoding or new length
+	if ( nextMin == mime.getContentEncodingPos()) {
+		pnew += sprintf(pnew, " identity");
+		ptr1 = NULL;
 	}
+	else if ( nextMin == mime.getContentLengthPos() ) {
+		pnew += sprintf(pnew, " %"INT32"",newSize);
+		ptr2 = NULL;
+	}
+	else if ( nextMin == mime.getContentTypePos() ) {
+		pnew += sprintf(pnew," %s",newCT);
+		ptr3 = NULL;
+	}
+	// scan to \r\n at end of that line we replace
+	while ( *src != '\r' && *src != '\n') src++;
+
+	// loop for more
+	if ( nextMin < mimeEnd ) goto subloop;
+

 	// copy the rest
-	gbmemcpy ( pnew , src , mimeEnd - src );
-	pnew += mimeEnd - src;
-	src  += mimeEnd - src;
+	// gbmemcpy ( pnew , src , mimeEnd - src );
+	// pnew += mimeEnd - src;
+	// src  += mimeEnd - src;


 	// before restLen was negative because we were skipping over
--- a/2
+++ b/2
@ -89,7 +89,7 @@ OS_DEB := true
 STATIC := -static
 # MDW: i get some parsing inconsistencies when running the first qa injection
 # test if this is -O3. strange.
-XMLDOCOPT := -O2
+XMLDOCOPT := -O0
 endif


--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -2720,6 +2720,10 @@ bool XmlDoc::indexDoc2 ( ) {
 	// scan it using delimeters. the file consists of multiple documents
 	// separated by this content delimeter.
 	if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
+		// we need the doc
+		char **replyPtr = getHttpReply ();
+		if ( ! replyPtr ) return true;
+		if ( replyPtr == (void *)-1 ) return false;
 		// already called inject?
 		if ( m_calledWarcInject )
 			// then we are done
@ -10196,7 +10200,14 @@ Url **XmlDoc::getRedirUrl() {
 		// http-equiv refresh tag, but that added an element of 
 		// recursion that is just too confusing to deal with. so 
 		// let's just parse out the meta tag by hand
-		if ( ! isRobotsTxt ) {
+		bool checkMeta = true;
+		if ( isRobotsTxt ) checkMeta = false;
+		// warc and arc files have a list of html docs
+		// in them that we need to index, so skip this check
+		// for them as well
+		if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() )
+			checkMeta = false;
+		if ( checkMeta ) {
 			Url **mrup = getMetaRedirUrl();
 			if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
 			// set it. might be NULL if not there.