added preliminary support for spidering .warc.gz and .arc.gz files

2024-10-04 04:07:13 +03:00 · 2015-04-27 21:41:22 -06:00 · 2015-04-27 21:41:22 -06:00 · 0eb415d408
commit 0eb415d408
parent ccb53eb4e7
7 changed files with 214 additions and 4 deletions
--- a/HttpServer.cpp
+++ b/HttpServer.cpp
@ -1043,14 +1043,16 @@ bool HttpServer::sendReply ( TcpSocket  *s , HttpRequest *r , bool isAdmin) {
 		// iaItem is like "webgroup-20100422114008-00011"
 		// print out the warc files as if they were urls
 		// so we can spider them through the spider pipeline as-is
-		cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
+		cmd.safePrintf("%s/ia list %s --glob='*arc.gz' | "
 			       "awk '{print \"<a "
 			       "href=http://archive.org/download/"
 			       "%s/\" $1\">\"}' > ./tmpiaout"
+			       , g_hostdb.m_dir
 			       ,iaItem
 			       ,iaItem
 			       );
 		iaItem[pathLen] = c;
+		log("system: %s",cmd.getBufStart());
 		gbsystem ( cmd.getBufStart() );
 		SafeBuf sb;
 		sb.load ( "./tmpiaout" );
--- a/PageInject.cpp
+++ b/PageInject.cpp
@ -586,7 +586,39 @@ bool Msg7::inject ( void *state ,
 		// we've saved m_start as "start" above, 
 		// so find the next delimeter after it and set that to m_start
 		// add +1 to avoid infinite loop
-		m_start = strstr(start+1,delim);
+		if ( ! gr->m_isMimeDelimeted )
+			m_start = strstr(start+1,delim);
+
+		// WARC files are mime delimited. the http reply, which
+		// contains a mime, has a mime a level above that whose
+		// content-length: field includes the original http reply mime
+		// as part of its content.
+		if ( gr->m_isMimeDelimeted ) {
+			char *mm = strstr(start,"Content-Length:");
+			char *mmend = NULL;
+			if ( mm ) mmend = strstr (mm,"\n");
+			if ( ! mm || ! mmend ) {
+				log("inject: all done");
+				return true;
+			}
+			char c = *mmend;
+			*mmend = '\0';
+			int64_t recordSize = atoll ( mm + 15 );
+			*mmend = c;
+			// end of mime header
+			char *hend = strstr ( mmend, "\r\n\r\n");
+			if ( ! hend ) {
+				log("inject: could not find header end.");
+				return true;
+			}
+			// skip that 
+			hend += 4;
+			// adjust start to point to start of the content really
+			start = hend;
+			// and over record 
+			m_start = start + recordSize;
+		}
+
 		// for injecting "start" set this to \0
 		if ( m_start ) {
 			// null term it
--- a/Parms.h
+++ b/Parms.h
@ -123,6 +123,7 @@ class GigablastRequest {
 	char *m_url; // also for /get
 	char *m_queryToScrape;
 	char *m_contentDelim;
+	bool  m_isMimeDelimeted; // are recs delimeted using Content-Length:
 	char *m_contentTypeStr;
 	char *m_contentFile;
 	char *m_content;
--- a/Url.cpp
+++ b/Url.cpp
@ -32,6 +32,8 @@ void Url::reset() {
 	//m_siteLen   = 0;
 	// ip related stuff
 	m_ip          = 0;
+	// m_isWarcValid = false;
+	// m_isArcValid  = false;
 }

 // set from another Url, does a copy
@ -1426,14 +1428,53 @@ bool Url::isBadExtension ( int32_t version ) {
 		s_badExtInitialized = true;
 	}

-	
+
 	int myKey = hash64Lower_a(m_extension,m_elen);
 	//zero unless we have a bad extention, otherwise
 	//we return TR version in which it was banned
 	int32_t badVersion = s_badExtTable.getValue(myKey);
 	if (badVersion == 0) return false;
-	if(badVersion <= version) return true;
+	//if(badVersion <= version) return true;
+	if ( badVersion > version ) return false;
+	// exceptions for .gz
+	if ( isCompressedArcOrWarc() ) return false;
+	return true;
+}
+
+bool Url::isCompressedArcOrWarc ( ) { // returns true iff the bytes ending at the extension spell ".arc.gz" or ".warc.gz" (lowercase only), false otherwise
+
+	// hack to allow for .gz if it is .warc.gz or .arc.gz (isBadExtension() calls this to exempt such urls)
+	if ( m_elen == 2 && 
+	     m_extension[0] == 'g' &&
+	     m_extension[1] == 'z' &&
+	     m_ulen > 10 && // NOTE(review): presumably keeps the negative indexes below inside the url buffer -- confirm
+	     m_extension[-1] == '.' && // the five bytes right before "gz" must be ".arc."
+	     m_extension[-2] == 'c' &&
+	     m_extension[-3] == 'r' &&
+	     m_extension[-4] == 'a' &&
+	     m_extension[-5] == '.' ) {
+		// m_isArc = true;
+		// m_isArcValid = true;
+		return true;
+	}
+
+	if ( m_elen == 2 && 
+	     m_extension[0] == 'g' &&
+	     m_extension[1] == 'z' &&
+	     m_ulen > 10 && // NOTE(review): same length guard as the .arc.gz case above -- confirm it is sufficient for index -6
+	     m_extension[-1] == '.' && // the six bytes right before "gz" must be ".warc."
+	     m_extension[-2] == 'c' &&
+	     m_extension[-3] == 'r' &&
+	     m_extension[-4] == 'a' &&
+	     m_extension[-5] == 'w' &&
+	     m_extension[-6] == '.' ) {
+		// m_isWarc = true;
+		// m_isWarcValid = true;
+		return true;
+	}
+
 	return false; // neither pattern matched
+
 }

 // see Url.h for a description of this.
--- a/Url.h
+++ b/Url.h
@ -92,6 +92,8 @@ public:
 	bool isBadExtension(int32_t xxx);
 	bool isSet()            { return m_ulen != 0; }

+	bool isCompressedArcOrWarc ( ) ;
+
 	// does it end in .xml, .rdb or .rss, etc. kinda thing
 	//bool isRSSFormat ( ) ;

--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -114,6 +114,8 @@ XmlDoc::XmlDoc() {
 	m_freed = false;
 	m_contentInjected = false;
 	m_wasContentInjected = false;
+	m_calledWarcInject = false;
+	m_msg7 = NULL;
 	//m_coll  = NULL;
 	m_ubuf = NULL;
 	m_pbuf = NULL;
@ -189,6 +191,8 @@ static int64_t s_lastTimeStart = 0LL;

 void XmlDoc::reset ( ) {

+	m_calledWarcInject = false;
+
 	m_ipStartTime = 0;
 	m_ipEndTime   = 0;
 	m_diffbotReplyRetries = 0;
@ -249,6 +253,12 @@ void XmlDoc::reset ( ) {
 		//log("diffbot: deleting m_dx2");
 	}

+	if ( m_msg7 ) {
+		mdelete ( m_msg7, sizeof(Msg7), "xdmsg7" );
+		delete ( m_msg7 );
+		m_msg7 = NULL;
+	}
+
 	m_isDiffbotJSONObject = false;

 	m_dmozBuf.purge();
@ -2540,6 +2550,12 @@ bool XmlDoc::indexDoc ( ) {
 	return true;
 }

+void doneInjectingWarc ( void *state ) { // callback handed to Msg7::inject() by XmlDoc::indexDoc2(); state is the XmlDoc that started the inject
+	XmlDoc *THIS = (XmlDoc *)state;
+	// resume the index pipeline by calling the doc's stored master loop with its stored state
+	THIS->m_masterLoop ( THIS->m_masterState );
+}
+
 // . returns false if blocked, true otherwise
 // . sets g_errno on error and returns true
 bool XmlDoc::indexDoc2 ( ) {
@ -2691,6 +2707,78 @@ bool XmlDoc::indexDoc2 ( ) {
 		// call it
 		if ( ! injectAhrefsLinks () ) return false;
 	}
+
+	// if we are a warc/arc doc for the internet archive then
+	// scan it using delimeters. the file consists of multiple documents
+	// separated by this content delimeter.
+	if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
+		// already called inject?
+		if ( m_calledWarcInject )
+			// then we are done
+			return true;
+		int8_t *hc = getHopCount();
+		if ( ! hc ) return true;
+		if ( hc == (void *)-1 ) return false;
+		// first download
+		char **warcContent = getUtf8Content();
+		// return true with g_errno set on error
+		if ( ! warcContent ) {
+			if ( ! g_errno ) { char *xx=NULL;*xx=0; }
+			return true;
+		}
+		// would block? return false then
+		if ( warcContent == (void *)-1 )
+			return false;
+		// do not re-call this
+		m_calledWarcInject = true;
+		// need this. it is almost 1MB in size, so alloc it
+		if ( ! m_msg7 ) {
+			try { m_msg7 = new ( Msg7 ); }
+			catch ( ... ) {
+				g_errno = ENOMEM;
+				return true;
+			}
+			mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
+		}
+		// set the input parms
+		GigablastRequest *gr = &m_msg7->m_gr;
+		// reset it
+		memset ( gr , 0 , sizeof(GigablastRequest) );
+		// now set the parameters
+		gr->m_contentDelim = "WARC/";
+		//if ( isArc ) gr->m_contentDelim = "somethingelse";
+		// let injector know about the mime delimeterization
+		// which uses Content-Length: to indicate record size.
+		gr->m_isMimeDelimeted = true;
+		gr->m_spiderLinks = false;
+		gr->m_injectLinks = false;
+		// what happens if coll gets nuked from under us? use collnum
+		gr->m_coll = cr->m_coll;
+		gr->m_hopCount = *hc + 1;
+		// if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
+		// gr->m_collnum = m_collnum;
+		// we could also use m_contentFile if it was on disk
+		gr->m_content = *warcContent;
+		// will this work on a content delimeterized doc?
+		gr->m_deleteUrl = m_deleteFromIndex;
+		// each subdoc will have a mime since it is a warc
+		gr->m_hasMime = true;
+		// TODO: set these based on the date in the warc mime!!
+		//gr->m_firstIndexed = ;
+		//gr->m_lastSpidered = ;
+		// then process. this will scan over each delimeted 
+		// doc in the arc/warc file and inject each one individually.
+		if ( ! m_msg7->inject ( this , doneInjectingWarc ) )
+			// it would block, callback will be called later
+			return false;
+		// error?
+		if ( g_errno )
+			log("buid: warc error %s",mstrerror(g_errno));
+		return true;
+	}
+		
+
+
 	// . now get the meta list from it to add
 	// . returns NULL and sets g_errno on error
 	char *metaList = getMetaList ( );
@ -16153,6 +16241,47 @@ char **XmlDoc::gotHttpReply ( ) {
 		m_httpReplyAllocSize = 0;
 	}

+	/*
+	// no, we should have encoding type ET_GZIP so httpserver
+	// should have unzipped it already... in HttpServer.cpp
+	//
+	// if we just downloaded a file ending in warc.gz, arc.gz or
+	// whatever.gz then it was statically compressed. so in the case
+	// of warc or arc, at least try to uncompress it so we can index
+	// the documents it contains using PageInject.cpp's injection loop
+	// based on some content delimeter in the file.
+	if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
+		// make a buffer to hold it
+		int32_t us = getUncompressedSize (m_httpReply,m_httpReplySize);
+		char *ubuf = (char *)mmalloc ( us , "warcbuf" );
+		int32_t realSize = us;
+		int err = gbuncompress ( (unsigned char *)  ubuf ,
+					 (uint32_t *) &realSize   ,
+					 (unsigned char *)  m_httpReply , 
+					 (uint32_t  ) m_httpReplySize );
+		// free it i guess
+		mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
+		// error uncompressing?
+		if ( err ) {
+			log("build: warc uncompress error %s",
+			    mstrerror(g_errno));
+			mfree ( ubuf , us , "warcbuf" );
+			// and reset it
+			m_httpReplySize      = 0;
+			m_httpReply          = NULL;
+			m_httpReplyAllocSize = 0;
+		}
+		else {
+			// ok, successful.
+			m_httpReply = ubuf;
+			m_httpReplySize = realSize;
+			m_httpReplyAllocSize = us;
+			log("build: warc uncompress successful");
+		}
+	}
+	*/
+
+
 	// if errors were not local, reset g_errno and set m_indexCode
 	//if ( g_errno == ETCPTIMEDOUT ) m_indexCode = ETCPTIMEDOUT;
 	//if ( g_errno == EBADMIME     ) m_indexCode = EBADMIME;
--- a/XmlDoc.h
+++ b/XmlDoc.h
@ -2342,6 +2342,9 @@ class XmlDoc {

 	bool          m_contentInjected;

+	bool          m_calledWarcInject;
+	class Msg7   *m_msg7;
+
 	bool          m_recycleContent;
 	//bool        m_loadFromOldTitleRec;