added preliminary support for spidering .warc.gz and .arc.gz files

This commit is contained in:
Matt 2015-04-27 21:41:22 -06:00
parent ccb53eb4e7
commit 0eb415d408
7 changed files with 214 additions and 4 deletions

View File

@ -1043,14 +1043,16 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// iaItem is like "webgroup-20100422114008-00011"
// print out the warc files as if they were urls
// so we can spider them through the spider pipeline as-is
cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
cmd.safePrintf("%s/ia list %s --glob='*arc.gz' | "
"awk '{print \"<a "
"href=http://archive.org/download/"
"%s/\" $1\">\"}' > ./tmpiaout"
, g_hostdb.m_dir
,iaItem
,iaItem
);
iaItem[pathLen] = c;
log("system: %s",cmd.getBufStart());
gbsystem ( cmd.getBufStart() );
SafeBuf sb;
sb.load ( "./tmpiaout" );

View File

@ -586,7 +586,39 @@ bool Msg7::inject ( void *state ,
// we've saved m_start as "start" above,
// so find the next delimeter after it and set that to m_start
// add +1 to avoid infinite loop
m_start = strstr(start+1,delim);
if ( ! gr->m_isMimeDelimeted )
m_start = strstr(start+1,delim);
// WARC files are mime delimited. the http reply, which
// contains a mime, has a mime a level above it whose
// Content-Length: field includes the original http reply mime
// as part of its content.
if ( gr->m_isMimeDelimeted ) {
char *mm = strstr(start,"Content-Length:");
char *mmend = NULL;
if ( mm ) mmend = strstr (mm,"\n");
if ( ! mm || ! mmend ) {
log("inject: all done");
return true;
}
char c = *mmend;
*mmend = '\0';
int64_t recordSize = atoll ( mm + 15 );
*mmend = c;
// end of mime header
char *hend = strstr ( mmend, "\r\n\r\n");
if ( ! hend ) {
log("inject: could not find header end.");
return true;
}
// skip that
hend += 4;
// adjust start to point to start of the content really
start = hend;
// and over record
m_start = start + recordSize;
}
// for injecting "start" set this to \0
if ( m_start ) {
// null term it

View File

@ -123,6 +123,7 @@ class GigablastRequest {
char *m_url; // also for /get
char *m_queryToScrape;
char *m_contentDelim;
bool m_isMimeDelimeted; // are recs delimeted using Content-Length:
char *m_contentTypeStr;
char *m_contentFile;
char *m_content;

45
Url.cpp
View File

@ -32,6 +32,8 @@ void Url::reset() {
//m_siteLen = 0;
// ip related stuff
m_ip = 0;
// m_isWarcValid = false;
// m_isArcValid = false;
}
// set from another Url, does a copy
@ -1426,14 +1428,53 @@ bool Url::isBadExtension ( int32_t version ) {
s_badExtInitialized = true;
}
int myKey = hash64Lower_a(m_extension,m_elen);
//zero unless we have a bad extention, otherwise
//we return TR version in which it was banned
int32_t badVersion = s_badExtTable.getValue(myKey);
if (badVersion == 0) return false;
if(badVersion <= version) return true;
//if(badVersion <= version) return true;
if ( badVersion > version ) return false;
// exceptions for .gz
if ( isCompressedArcOrWarc() ) return false;
return true;
}
// hack to allow for .gz if it is .warc.gz or .arc.gz: returns true when the
// extension is exactly "gz" and is immediately preceded by ".arc." or
// ".warc.", false otherwise.
bool Url::isCompressedArcOrWarc ( ) {
	// the extension itself must be exactly "gz"
	if ( m_elen != 2 ) return false;
	if ( m_extension[0] != 'g' ) return false;
	if ( m_extension[1] != 'z' ) return false;
	// length check comes before we index backwards from the extension
	if ( m_ulen <= 10 ) return false;
	// both accepted forms share the "arc." right before the extension
	if ( m_extension[-1] != '.' ) return false;
	if ( m_extension[-2] != 'c' ) return false;
	if ( m_extension[-3] != 'r' ) return false;
	if ( m_extension[-4] != 'a' ) return false;
	// ".arc.gz"
	if ( m_extension[-5] == '.' ) return true;
	// ".warc.gz"
	if ( m_extension[-5] == 'w' && m_extension[-6] == '.' ) return true;
	// some other kind of .gz
	return false;
}
// see Url.h for a description of this.

2
Url.h
View File

@ -92,6 +92,8 @@ public:
bool isBadExtension(int32_t xxx);
bool isSet() { return m_ulen != 0; }
bool isCompressedArcOrWarc ( ) ;
// does it end in .xml, .rdb or .rss, etc. kinda thing
//bool isRSSFormat ( ) ;

View File

@ -114,6 +114,8 @@ XmlDoc::XmlDoc() {
m_freed = false;
m_contentInjected = false;
m_wasContentInjected = false;
m_calledWarcInject = false;
m_msg7 = NULL;
//m_coll = NULL;
m_ubuf = NULL;
m_pbuf = NULL;
@ -189,6 +191,8 @@ static int64_t s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) {
m_calledWarcInject = false;
m_ipStartTime = 0;
m_ipEndTime = 0;
m_diffbotReplyRetries = 0;
@ -249,6 +253,12 @@ void XmlDoc::reset ( ) {
//log("diffbot: deleting m_dx2");
}
if ( m_msg7 ) {
mdelete ( m_msg7, sizeof(Msg7), "xdmsg7" );
delete ( m_msg7 );
m_msg7 = NULL;
}
m_isDiffbotJSONObject = false;
m_dmozBuf.purge();
@ -2540,6 +2550,12 @@ bool XmlDoc::indexDoc ( ) {
return true;
}
void doneInjectingWarc ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// resume the index pipeline
THIS->m_masterLoop ( THIS->m_masterState );
}
// . returns false if blocked, true otherwise
// . sets g_errno on error and returns true
bool XmlDoc::indexDoc2 ( ) {
@ -2691,6 +2707,78 @@ bool XmlDoc::indexDoc2 ( ) {
// call it
if ( ! injectAhrefsLinks () ) return false;
}
// if we are a warc/arc doc for the internet archive then
// scan it using delimeters. the file consists of multiple documents
// separated by this content delimeter.
if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
// already called inject?
if ( m_calledWarcInject )
// then we are done
return true;
int8_t *hc = getHopCount();
if ( ! hc ) return true;
if ( hc == (void *)-1 ) return false;
// first download
char **warcContent = getUtf8Content();
// return true with g_errno set on error
if ( ! warcContent ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// would block? return false then
if ( warcContent == (void *)-1 )
return false;
// do not re-call this
m_calledWarcInject = true;
// need this. it is almost 1MB in size, so alloc it
if ( ! m_msg7 ) {
try { m_msg7 = new ( Msg7 ); }
catch ( ... ) {
g_errno = ENOMEM;
return true;
}
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
}
// set the input parms
GigablastRequest *gr = &m_msg7->m_gr;
// reset it
memset ( gr , 0 , sizeof(GigablastRequest) );
// now set the parameters
gr->m_contentDelim = "WARC/";
//if ( isArc ) gr->m_contentDelim = "somethingelse";
// let injector know about the mime delimeterization
// which uses Content-Length: to indicate record size.
gr->m_isMimeDelimeted = true;
gr->m_spiderLinks = false;
gr->m_injectLinks = false;
// what happens if coll gets nuked from under us? use collnum
gr->m_coll = cr->m_coll;
gr->m_hopCount = *hc + 1;
// if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
// gr->m_collnum = m_collnum;
// we could also use m_contentFile if it was on disk
gr->m_content = *warcContent;
// will this work on a content delimeterized doc?
gr->m_deleteUrl = m_deleteFromIndex;
// each subdoc will have a mime since it is a warc
gr->m_hasMime = true;
// TODO: set these based on the date in the warc mime!!
//gr->m_firstIndexed = ;
//gr->m_lastSpidered = ;
// then process. this will scan over each delimeted
// doc in the arc/warc file and inject each one individually.
if ( ! m_msg7->inject ( this , doneInjectingWarc ) )
// it would block, callback will be called later
return false;
// error?
if ( g_errno )
log("buid: warc error %s",mstrerror(g_errno));
return true;
}
// . now get the meta list from it to add
// . returns NULL and sets g_errno on error
char *metaList = getMetaList ( );
@ -16153,6 +16241,47 @@ char **XmlDoc::gotHttpReply ( ) {
m_httpReplyAllocSize = 0;
}
/*
// no, we should have encoding type ET_GZIP so httpserver
// should have unzipped it already... in HttpServer.cpp
//
// if we just downloaded a file ending in warc.gz arc.gz or
// whatever.gz then it was statically compressed. so in the case
// of warc or arc, at least try to uncompress it so we can index
// the documents it contains using PageInject.cpp's injection loop
// based on some content delimeter in the file.
if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
// make a buffer to hold it
int32_t us = getUncompressedSize (m_httpReply,m_httpReplySize);
char *ubuf = (char *)mmalloc ( us , "warcbuf" );
int32_t realSize = us;
int err = gbuncompress ( (unsigned char *) ubuf ,
(uint32_t *) &realSize ,
(unsigned char *) m_httpReply ,
(uint32_t ) m_httpReplySize );
// free it i guess
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
// error uncompressing?
if ( err ) {
log("build: warc uncompress error %s",
mstrerror(g_errno));
mfree ( ubuf , us , "warcbuf" );
// and reset it
m_httpReplySize = 0;
m_httpReply = NULL;
m_httpReplyAllocSize = 0;
}
else {
// ok, successful.
m_httpReply = ubuf;
m_httpReplySize = realSize;
m_httpReplyAllocSize = us;
log("build: warc uncompress successful");
}
}
*/
// if errors were not local, reset g_errno and set m_indexCode
//if ( g_errno == ETCPTIMEDOUT ) m_indexCode = ETCPTIMEDOUT;
//if ( g_errno == EBADMIME ) m_indexCode = EBADMIME;

View File

@ -2342,6 +2342,9 @@ class XmlDoc {
bool m_contentInjected;
bool m_calledWarcInject;
class Msg7 *m_msg7;
bool m_recycleContent;
//bool m_loadFromOldTitleRec;