some fixes for indexing warcs/arcs.

2024-10-04 20:27:43 +03:00 · 2015-04-28 22:30:58 -07:00 · 2015-04-28 22:30:58 -07:00 · faf2c06d29
commit faf2c06d29
parent b6ff0b0173
5 changed files with 161 additions and 73 deletions
--- a/HttpMime.cpp
+++ b/HttpMime.cpp
@ -45,6 +45,7 @@ void HttpMime::reset ( ) {
 	m_locationFieldLen = 0;
 	m_contentEncodingPos = NULL;
 	m_contentLengthPos = NULL;
 	m_contentTypePos   = NULL;
 }
 // . returns false if could not get a valid mime
@ -157,8 +158,12 @@ bool HttpMime::parse ( char *mime , int32_t mimeLen , Url *url ) {
 			time_t now = time(NULL);
 			if (m_lastModifiedDate > now) m_lastModifiedDate = now;
 		}
-		else if ( strncasecmp ( p , "Content-Type:"   ,13) == 0 ) 
+		else if ( strncasecmp ( p , "Content-Type:"   ,13) == 0 ) {
 			m_contentType = getContentTypePrivate ( p + 13 );
 			char *s = p + 13;
 			while ( *s == ' ' || *s == '\t' ) s++;
 			m_contentTypePos = s;
 		}
 		else if ( strncasecmp ( p , "Set-Cookie:"   ,10) == 0 ) {
 			m_cookie = p + 11;
 			if ( m_cookie[0] == ' ' ) m_cookie++;
@ -540,6 +545,7 @@ int32_t getContentTypeFromStr ( char *s ) {
        else if (!strncasecmp(s,"image/",6               ) ) ct = CT_IMAGE;
 	else if (!strcasecmp(s,"application/javascript"  ) ) ct = CT_JS;
 	else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
 	else if (!strcasecmp(s,"application/x-gzip"      ) ) ct = CT_GZ;
 	else if (!strcasecmp(s,"text/javascript"         ) ) ct = CT_JS;
 	else if (!strcasecmp(s,"text/x-js"               ) ) ct = CT_JS;
 	else if (!strcasecmp(s,"text/js"                 ) ) ct = CT_JS;
@ -626,6 +632,17 @@ void resetHttpMime ( ) {
 	s_mimeTable.reset();
 }
 // . look up the content-type string registered for the file extension "ext"
 // . "elen" is the number of bytes of "ext" to hash for the table lookup
 // . one leading '.' is tolerated and skipped so that "warc" and ".warc"
 //   resolve to the same entry: the s_ext[] keys are bare extensions, but a
 //   caller scanning a url may hand us a pointer to the period itself
 // . unlike HttpMime::getContentTypeFromExtension() this does NOT fall back
 //   to "text/html". it returns NULL if no extension was provided or if the
 //   extension has no entry in s_mimeTable
 const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) {
 	// no extension provided, so there is no content type to report
 	if ( ! ext || ! ext[0] ) return NULL;
 	if ( elen <= 0 ) return NULL;
 	// skip a single leading period so it is not hashed as part of the key
 	if ( ext[0] == '.' ) { ext++; elen--; }
 	// nothing left after the period?
 	if ( elen <= 0 || ! ext[0] ) return NULL;
 	// get hash for table look up
 	int32_t key = hash32 ( ext , elen );
 	char **pp = (char **)s_mimeTable.getValue ( &key );
 	// unrecognized extension
 	if ( ! pp ) return NULL;
 	return *pp;
 }
 const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) {
 	// assume text/html if no extension provided
 	if ( ! ext || ! ext[0] ) return "text/html";
@ -1051,7 +1068,10 @@ static char *s_ext[] = {
     "xwd" , "image/x-xwindowdump",
     "xyz" , "chemical/x-pdb",
      "zip" , "application/zip" ,
-      "xpi", "application/x-xpinstall"
+      "xpi", "application/x-xpinstall",
      // newstuff
      "warc", "application/warc",
      "arc", "application/arc"
 };
 // . init s_mimeTable in this call
--- a/HttpMime.h
+++ b/HttpMime.h
@ -9,6 +9,8 @@
 // convert application/json to CT_JSON for instance
 int32_t getContentTypeFromStr ( char *s ) ;
 const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) ;
 #include <time.h>
 void   getTime    ( char *s , int *sec , int *min , int *hour ) ;
@ -42,6 +44,7 @@ time_t atotime5   ( char *s ) ;
 #define CT_JSON   16
 #define CT_IMAGE  17
 #define CT_STATUS 18 // an internal type indicating spider reply
 #define CT_GZ     19
 #define ET_IDENTITY 0
 #define ET_GZIP 1
@ -127,6 +130,7 @@ class HttpMime {
 	int32_t  getContentEncoding () {return m_contentEncoding;}
 	char *getContentEncodingPos() {return m_contentEncodingPos;}
 	char *getContentLengthPos()      {return m_contentLengthPos;}
 	char *getContentTypePos()      {return m_contentTypePos;}
 	// private:
@ -166,6 +170,7 @@ class HttpMime {
 	int32_t    m_contentEncoding;
 	char   *m_contentEncodingPos;
 	char   *m_contentLengthPos;
 	char   *m_contentTypePos;
 	// the size of the terminating boundary, either 1 or 2 bytes.
 	// just the last \n in the case of a \n\n or \r in the case
--- a/HttpServer.cpp
+++ b/HttpServer.cpp
@ -1035,19 +1035,21 @@ bool HttpServer::sendReply ( TcpSocket  *s , HttpRequest *r , bool isAdmin) {
 	if ( strncmp ( path , "/download/", 10 ) == 0 )
 		return sendBackDump ( s , r );
-	if ( strncmp ( path , "/iagbcoll/" , 10 ) == 0 ) {
+	if ( strncmp ( path , "/gbiaitem/" , 10 ) == 0 ) {
 		SafeBuf cmd;
 		char *iaItem = path + 10;
 		char c = iaItem[pathLen];
 		iaItem[pathLen] = '\0';
 		// iaItem is like "webgroup-20100422114008-00011"
 		// print out the warc files as if they were urls
-		// so we can spider them through the spider pipeline as-is
+		// so we can spider them through the spider pipeline as-is.
-		cmd.safePrintf("%s/ia list %s --glob='*arc.gz' | "
+		// this hack only works on internet archive servers
 		// that have the '/home/mwells/ia' obviously
 		cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
 			       "awk '{print \"<a "
 			       "href=http://archive.org/download/"
 			       "%s/\" $1\">\"}' > ./tmpiaout"
-			       , g_hostdb.m_dir
+			       //, g_hostdb.m_dir
 			       ,iaItem
 			       ,iaItem
 			       );
@ -2388,6 +2390,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
 	}
 	// if has no content then it must end  in \n\r\n\r or \r\n\r\n
 	if ( ! hasContent ) return bufSize;
 	// look for a Content-Type: field because we now limit how much
 	// we read based on this
 	char *p          = buf;
@ -2411,45 +2414,71 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
 		//   as well index that at least.
 		if ( p + 15 < pend && strncasecmp( p,"application/pdf",15)==0)
 			allOrNothing = true;
 		if ( p + 15 < pend&&strncasecmp(p,"application/x-gzip",18)==0)
 			allOrNothing = true;
 		// adjust "max to read" if we don't have an html/plain doc
 		if ( ! isPost ) {
 			max = s->m_maxOtherDocLen + 10*1024 ;
 			if ( s->m_maxOtherDocLen == -1 ) max = 0x7fffffff;
 		}
 	}
 	// if it is a warc or arc.gz allow it for now but we should
 	// only allow one spider at a time per host
 	if ( s->m_sendBuf ) {
 		char *p = s->m_sendBuf;
 		char *pend = p + s->m_sendBufSize;
 		if ( strncmp(p,"GET /",5) == 0 ) p += 4;
 		// find end of url we are getting
 		char *e = p;
 		for ( ; *e && e < pend && ! is_wspace_a(*e) ; e++ );
 		if ( e - 8 > p && strncmp(e-8,".warc.gz", 8 ) == 0 )
 			max = 0x7fffffff;
 		if ( e - 7 > p && strncmp(e-7, ".arc.gz", 7 ) == 0 )
 			max = 0x7fffffff;
 	}
 	int32_t contentSize = 0;
 	int32_t totalReplySize = 0;
 	// now look for Content-Length in the mime
-	for ( int32_t j = 0; j < i ; j++ ) {
+	int32_t j; for ( j = 0; j < i ; j++ ) {
 		if ( buf[j] != 'c' && buf[j] != 'C' ) continue;
 		if ( j + 16 >= i ) break;
 		if ( strncasecmp ( &buf[j], "Content-Length:" , 15 ) != 0 )
 			continue;
-		int32_t contentSize = atol2 ( &buf[j+15] , i - (j+15) );
+		contentSize = atol2 ( &buf[j+15] , i - (j+15) );
-		int32_t totalReplySize = contentSize + mimeSize ;
+		totalReplySize = contentSize + mimeSize ;
-		// all-or-nothing filter
+		break;
 		if ( totalReplySize > max && allOrNothing ) {
 			log(LOG_INFO,
 			    "http: pdf reply/request size of %"INT32" is larger "
 			    "than limit of %"INT32". Cutoff pdf's are useless. "
 			    "Abandoning.",totalReplySize,max);
 			// do not read any more than what we have
 			return bufSize;
 		}
 		// warn if we received a post that was truncated
 		if ( totalReplySize > max && isPost ) {
 			log("http: Truncated POST request from %"INT32" "
 			    "to %"INT32" bytes. Increase \"max other/text doc "
 			    "len\" in Spider Controls page to prevent this.",
 			    totalReplySize,max);
 		}
 		// truncate the reply if we have to
 		if ( totalReplySize > max ) {
 			log("http: truncating reply of %"INT32" to %"INT32" bytes",
 			    totalReplySize,max);
 			totalReplySize = max;
 		}
 		// truncate if we need to
 		return totalReplySize;
 	}
 	// all-or-nothing filter
 	if ( totalReplySize > max && allOrNothing ) {
 		log(LOG_INFO,
 		    "http: reply/request size of %"INT32" is larger "
 		    "than limit of %"INT32". Cutoff documents "
 		    "of this type are useless. "
 		    "Abandoning.",totalReplySize,max);
 		// do not read any more than what we have
 		return bufSize;
 	}
 	// warn if we received a post that was truncated
 	if ( totalReplySize > max && isPost ) {
 		log("http: Truncated POST request from %"INT32" "
 		    "to %"INT32" bytes. Increase \"max other/text doc "
 		    "len\" in Spider Controls page to prevent this.",
 		    totalReplySize,max);
 	}
 	// truncate the reply if we have to
 	if ( totalReplySize > max ) {
 		log("http: truncating reply of %"INT32" to %"INT32" bytes",
 		    totalReplySize,max);
 		totalReplySize = max;
 	}
 	// truncate if we need to
 	if ( totalReplySize )
 		return totalReplySize;
 	// if it is a POST request with content but no content length...
 	// we don't know how big it is...
 	if ( isPost ) {
@ -2880,16 +2909,32 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
 	// so we need to rewrite the Content-Length: and the 
 	// Content-Encoding: http mime field values so they are no longer
 	// "gzip" and use the uncompressed content-length.
-	char *ptr1 = NULL;
+	char *ptr1 = mime.getContentEncodingPos();
-	char *ptr2 = NULL;
+	char *ptr2 = mime.getContentLengthPos();
-	if(mime.getContentEncodingPos() &&
+	char *ptr3 = NULL;
-	   mime.getContentEncodingPos() < mime.getContentLengthPos()) {
+
-		ptr1 = mime.getContentEncodingPos();
+	// change the content type based on the extension before the
-		ptr2 = mime.getContentLengthPos();
+	// .gz extension since we are uncompressing it
-	}
+	char *p = s->m_readBuf + 4;
-	else {
+	char *pend = s->m_readBuf + s->m_readBufSize;
-		ptr1 = mime.getContentLengthPos();
+	const char *newCT = NULL;
-		ptr2 = mime.getContentEncodingPos();
+	char *lastPeriod = NULL;
 	// get the extension, if any, before the .gz
 	for ( ; *p && ! is_wspace_a(*p) && p < pend ; p++ ) {
 		if ( p[0] != '.' ) continue;
 		if ( p[1] != 'g' ) { lastPeriod = p; continue; }
 		if ( p[2] != 'z' ) { lastPeriod = p; continue; }
 		if ( ! is_wspace_a(p[3]) ) { lastPeriod = p; continue; }
 		// no prev?
 		if ( ! lastPeriod ) break;
 		// back up
 		newCT = extensionToContentTypeStr2 (lastPeriod,p-lastPeriod);
 		// this is NULL if the file extension is unrecognized
 		if ( ! newCT ) break;
 		// this should be like text/html or
 		// WARC/html or something like that...
 		ptr3 = mime.getContentTypePos();
 		break;
 	}
 	// this was writing a number at the start of the mime and messing
@ -2901,38 +2946,45 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
 	char *src = s->m_readBuf;
 	// sometimes they are missing Content-Length:
 	if ( ptr1 ) {
 		// copy ptr1 to src
 		gbmemcpy ( pnew, src, ptr1 - src );
 		pnew += ptr1 - src;
 		src  += ptr1 - src;
 		// store either the new content encoding or new length
 		if(ptr1 == mime.getContentEncodingPos())
 			pnew += sprintf(pnew, " identity");
 		else	
 			pnew += sprintf(pnew, " %"INT32"",newSize);
 		// scan to \r\n at end of that line we replace
 		while ( *src != '\r' && *src != '\n') src++;
 	}
-	if ( ptr2 ) {
+ subloop:
-		// copy ptr2 to src
+
-		gbmemcpy ( pnew , src , ptr2 - src );
+	char *nextMin = (char *)-1;
-		pnew += ptr2 - src;
+	if ( ptr1 && (ptr1 < nextMin || nextMin==(char *)-1)) nextMin = ptr1;
-		src  += ptr2 - src;
+	if ( ptr2 && (ptr2 < nextMin || nextMin==(char *)-1)) nextMin = ptr2;
-		// now insert the new shit
+	if ( ptr3 && (ptr3 < nextMin || nextMin==(char *)-1)) nextMin = ptr3;
-		if(ptr2 == mime.getContentEncodingPos())
+
-			pnew += sprintf(pnew, " identity");
+	// if all ptrs are NULL then copy the tail
-		else	
+	if ( nextMin == (char *)-1 ) nextMin = mimeEnd;
-			pnew += sprintf(pnew, " %"INT32"",newSize);
+
-		// scan to \r\n at end of that line we replace
+	// copy ptr1 to src
-		while ( *src != '\r' && *src != '\n') src++;
+	gbmemcpy ( pnew, src, nextMin - src );
 	pnew += nextMin - src;
 	src  += nextMin - src;
 	// store the new content encoding, content length or content type
 	if ( nextMin == mime.getContentEncodingPos()) {
 		pnew += sprintf(pnew, " identity");
 		ptr1 = NULL;
 	}
 	else if ( nextMin == mime.getContentLengthPos() ) {
 		pnew += sprintf(pnew, " %"INT32"",newSize);
 		ptr2 = NULL;
 	}
 	else if ( nextMin == mime.getContentTypePos() ) {
 		pnew += sprintf(pnew," %s",newCT);
 		ptr3 = NULL;
 	}
 	// scan to \r\n at end of that line we replace
 	while ( *src != '\r' && *src != '\n') src++;
 	// loop for more
 	if ( nextMin < mimeEnd ) goto subloop;
 	// copy the rest
-	gbmemcpy ( pnew , src , mimeEnd - src );
+	// gbmemcpy ( pnew , src , mimeEnd - src );
-	pnew += mimeEnd - src;
+	// pnew += mimeEnd - src;
-	src  += mimeEnd - src;
+	// src  += mimeEnd - src;
 	// before restLen was negative because we were skipping over
--- a/2
+++ b/2
@ -89,7 +89,7 @@ OS_DEB := true
 STATIC := -static
 # MDW: i get some parsing inconsistencies when running the first qa injection
 # test if this is -O3. strange.
-XMLDOCOPT := -O2
+XMLDOCOPT := -O0
 endif
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -2720,6 +2720,10 @@ bool XmlDoc::indexDoc2 ( ) {
 	// scan it using delimiters. the file consists of multiple documents
 	// separated by this content delimiter.
 	if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
 		// we need the doc
 		char **replyPtr = getHttpReply ();
 		if ( ! replyPtr ) return true;
 		if ( replyPtr == (void *)-1 ) return false;
 		// already called inject?
 		if ( m_calledWarcInject )
 			// then we are done
@ -10196,7 +10200,14 @@ Url **XmlDoc::getRedirUrl() {
 		// http-equiv refresh tag, but that added an element of 
 		// recursion that is just too confusing to deal with. so 
 		// let's just parse out the meta tag by hand
-		if ( ! isRobotsTxt ) {
+		bool checkMeta = true;
 		if ( isRobotsTxt ) checkMeta = false;
 		// warc and arc files have a list of html docs
 		// in them that we need to index, so skip this check
 		// for them as well
 		if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() )
 			checkMeta = false;
 		if ( checkMeta ) {
 			Url **mrup = getMetaRedirUrl();
 			if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
 			// set it. might be NULL if not there.