mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
added preliminary support for spidering .warc.gz and .arc.gz files
This commit is contained in:
parent
ccb53eb4e7
commit
0eb415d408
@ -1043,14 +1043,16 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
// iaItem is like "webgroup-20100422114008-00011"
|
||||
// print out the warc files as if they were urls
|
||||
// so we can spider them through the spider pipeline as-is
|
||||
cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
|
||||
cmd.safePrintf("%s/ia list %s --glob='*arc.gz' | "
|
||||
"awk '{print \"<a "
|
||||
"href=http://archive.org/download/"
|
||||
"%s/\" $1\">\"}' > ./tmpiaout"
|
||||
, g_hostdb.m_dir
|
||||
,iaItem
|
||||
,iaItem
|
||||
);
|
||||
iaItem[pathLen] = c;
|
||||
log("system: %s",cmd.getBufStart());
|
||||
gbsystem ( cmd.getBufStart() );
|
||||
SafeBuf sb;
|
||||
sb.load ( "./tmpiaout" );
|
||||
|
@ -586,7 +586,39 @@ bool Msg7::inject ( void *state ,
|
||||
// we've saved m_start as "start" above,
|
||||
// so find the next delimeter after it and set that to m_start
|
||||
// add +1 to avoid infinite loop
|
||||
if ( ! gr->m_isMimeDelimeted )
|
||||
m_start = strstr(start+1,delim);
|
||||
|
||||
// WARC files are mime delimeted. the http reply, which
|
||||
// contains a mime, has a mime a level above that whose
|
||||
// content-length: field includes the original http reply mime
|
||||
// as part of its content.
|
||||
if ( gr->m_isMimeDelimeted ) {
|
||||
char *mm = strstr(start,"Content-Length:");
|
||||
char *mmend = NULL;
|
||||
if ( mm ) mmend = strstr (mm,"\n");
|
||||
if ( ! mm || ! mmend ) {
|
||||
log("inject: all done");
|
||||
return true;
|
||||
}
|
||||
char c = *mmend;
|
||||
*mmend = '\0';
|
||||
int64_t recordSize = atoll ( mm + 15 );
|
||||
*mmend = c;
|
||||
// end of mime header
|
||||
char *hend = strstr ( mmend, "\r\n\r\n");
|
||||
if ( ! hend ) {
|
||||
log("inject: could not find header end.");
|
||||
return true;
|
||||
}
|
||||
// skip that
|
||||
hend += 4;
|
||||
// adjust start to point to start of the content really
|
||||
start = hend;
|
||||
// and over record
|
||||
m_start = start + recordSize;
|
||||
}
|
||||
|
||||
// for injecting "start" set this to \0
|
||||
if ( m_start ) {
|
||||
// null term it
|
||||
|
1
Parms.h
1
Parms.h
@ -123,6 +123,7 @@ class GigablastRequest {
|
||||
char *m_url; // also for /get
|
||||
char *m_queryToScrape;
|
||||
char *m_contentDelim;
|
||||
bool m_isMimeDelimeted; // are recs delimeted using Content-Length:
|
||||
char *m_contentTypeStr;
|
||||
char *m_contentFile;
|
||||
char *m_content;
|
||||
|
43
Url.cpp
43
Url.cpp
@ -32,6 +32,8 @@ void Url::reset() {
|
||||
//m_siteLen = 0;
|
||||
// ip related stuff
|
||||
m_ip = 0;
|
||||
// m_isWarcValid = false;
|
||||
// m_isArcValid = false;
|
||||
}
|
||||
|
||||
// set from another Url, does a copy
|
||||
@ -1432,8 +1434,47 @@ bool Url::isBadExtension ( int32_t version ) {
|
||||
//we return TR version in which it was banned
|
||||
int32_t badVersion = s_badExtTable.getValue(myKey);
|
||||
if (badVersion == 0) return false;
|
||||
if(badVersion <= version) return true;
|
||||
//if(badVersion <= version) return true;
|
||||
if ( badVersion > version ) return false;
|
||||
// exceptions for .gz
|
||||
if ( isCompressedArcOrWarc() ) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Url::isCompressedArcOrWarc ( ) {
|
||||
|
||||
// hack to allow for .gz if it is .warc.gz or .arc.gz
|
||||
if ( m_elen == 2 &&
|
||||
m_extension[0] == 'g' &&
|
||||
m_extension[1] == 'z' &&
|
||||
m_ulen > 10 &&
|
||||
m_extension[-1] == '.' &&
|
||||
m_extension[-2] == 'c' &&
|
||||
m_extension[-3] == 'r' &&
|
||||
m_extension[-4] == 'a' &&
|
||||
m_extension[-5] == '.' ) {
|
||||
// m_isArc = true;
|
||||
// m_isArcValid = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( m_elen == 2 &&
|
||||
m_extension[0] == 'g' &&
|
||||
m_extension[1] == 'z' &&
|
||||
m_ulen > 10 &&
|
||||
m_extension[-1] == '.' &&
|
||||
m_extension[-2] == 'c' &&
|
||||
m_extension[-3] == 'r' &&
|
||||
m_extension[-4] == 'a' &&
|
||||
m_extension[-5] == 'w' &&
|
||||
m_extension[-6] == '.' ) {
|
||||
// m_isWarc = true;
|
||||
// m_isWarcValid = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
// see Url.h for a description of this.
|
||||
|
2
Url.h
2
Url.h
@ -92,6 +92,8 @@ public:
|
||||
bool isBadExtension(int32_t xxx);
|
||||
bool isSet() { return m_ulen != 0; }
|
||||
|
||||
bool isCompressedArcOrWarc ( ) ;
|
||||
|
||||
// does it end in .xml, .rdb or .rss, etc. kinda thing
|
||||
//bool isRSSFormat ( ) ;
|
||||
|
||||
|
129
XmlDoc.cpp
129
XmlDoc.cpp
@ -114,6 +114,8 @@ XmlDoc::XmlDoc() {
|
||||
m_freed = false;
|
||||
m_contentInjected = false;
|
||||
m_wasContentInjected = false;
|
||||
m_calledWarcInject = false;
|
||||
m_msg7 = NULL;
|
||||
//m_coll = NULL;
|
||||
m_ubuf = NULL;
|
||||
m_pbuf = NULL;
|
||||
@ -189,6 +191,8 @@ static int64_t s_lastTimeStart = 0LL;
|
||||
|
||||
void XmlDoc::reset ( ) {
|
||||
|
||||
m_calledWarcInject = false;
|
||||
|
||||
m_ipStartTime = 0;
|
||||
m_ipEndTime = 0;
|
||||
m_diffbotReplyRetries = 0;
|
||||
@ -249,6 +253,12 @@ void XmlDoc::reset ( ) {
|
||||
//log("diffbot: deleting m_dx2");
|
||||
}
|
||||
|
||||
if ( m_msg7 ) {
|
||||
mdelete ( m_msg7, sizeof(Msg7), "xdmsg7" );
|
||||
delete ( m_msg7 );
|
||||
m_msg7 = NULL;
|
||||
}
|
||||
|
||||
m_isDiffbotJSONObject = false;
|
||||
|
||||
m_dmozBuf.purge();
|
||||
@ -2540,6 +2550,12 @@ bool XmlDoc::indexDoc ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void doneInjectingWarc ( void *state ) {
|
||||
XmlDoc *THIS = (XmlDoc *)state;
|
||||
// resume the index pipeline
|
||||
THIS->m_masterLoop ( THIS->m_masterState );
|
||||
}
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error and returns true
|
||||
bool XmlDoc::indexDoc2 ( ) {
|
||||
@ -2691,6 +2707,78 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
// call it
|
||||
if ( ! injectAhrefsLinks () ) return false;
|
||||
}
|
||||
|
||||
// if we are a warc/arc doc for the internet archive then
|
||||
// scan it using delimeters. the file consists of multiple documents
|
||||
// separated by this content delimeter.
|
||||
if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
|
||||
// already called inject?
|
||||
if ( m_calledWarcInject )
|
||||
// then we are done
|
||||
return true;
|
||||
int8_t *hc = getHopCount();
|
||||
if ( ! hc ) return true;
|
||||
if ( hc == (void *)-1 ) return false;
|
||||
// first download
|
||||
char **warcContent = getUtf8Content();
|
||||
// return true with g_errno set on error
|
||||
if ( ! warcContent ) {
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
return true;
|
||||
}
|
||||
// would block? return false then
|
||||
if ( warcContent == (void *)-1 )
|
||||
return false;
|
||||
// do not re-call this
|
||||
m_calledWarcInject = true;
|
||||
// need this. it is almost 1MB in size, so alloc it
|
||||
if ( ! m_msg7 ) {
|
||||
try { m_msg7 = new ( Msg7 ); }
|
||||
catch ( ... ) {
|
||||
g_errno = ENOMEM;
|
||||
return true;
|
||||
}
|
||||
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
|
||||
}
|
||||
// set the input parms
|
||||
GigablastRequest *gr = &m_msg7->m_gr;
|
||||
// reset it
|
||||
memset ( gr , 0 , sizeof(GigablastRequest) );
|
||||
// now set the parameters
|
||||
gr->m_contentDelim = "WARC/";
|
||||
//if ( isArc ) gr->m_contentDelim = "somethingelse";
|
||||
// let injector know about the mime delimeterization
|
||||
// which uses Content-Length: to indicate record size.
|
||||
gr->m_isMimeDelimeted = true;
|
||||
gr->m_spiderLinks = false;
|
||||
gr->m_injectLinks = false;
|
||||
// what happens if coll gets nuked from under us? use collnum
|
||||
gr->m_coll = cr->m_coll;
|
||||
gr->m_hopCount = *hc + 1;
|
||||
// if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
||||
// gr->m_collnum = m_collnum;
|
||||
// we could also use m_contentFile if it was on disk
|
||||
gr->m_content = *warcContent;
|
||||
// will this work on a content delimeterized doc?
|
||||
gr->m_deleteUrl = m_deleteFromIndex;
|
||||
// each subdoc will have a mime since it is a warc
|
||||
gr->m_hasMime = true;
|
||||
// TODO: set these based on the date in the warc mime!!
|
||||
//gr->m_firstIndexed = ;
|
||||
//gr->m_lastSpidered = ;
|
||||
// then process. this will scan over each delimeted
|
||||
// doc in the arc/warc file and inject each one individually.
|
||||
if ( ! m_msg7->inject ( this , doneInjectingWarc ) )
|
||||
// it would block, callback will be called later
|
||||
return false;
|
||||
// error?
|
||||
if ( g_errno )
|
||||
log("buid: warc error %s",mstrerror(g_errno));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// . now get the meta list from it to add
|
||||
// . returns NULL and sets g_errno on error
|
||||
char *metaList = getMetaList ( );
|
||||
@ -16153,6 +16241,47 @@ char **XmlDoc::gotHttpReply ( ) {
|
||||
m_httpReplyAllocSize = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
// no, we should have encoding type ET_GZIP so httpserver
|
||||
// should have unzipped it already... in HttpServer.cpp
|
||||
//
|
||||
// if we just downloaded a file ending in warc.gz arc.gz or
|
||||
// whatever.gz then it was statically compressed. so in the case
|
||||
// of warc or arc, at least try to uncompress it so we can index
|
||||
// the documents it contains using PageInject.cpp's injection loop
|
||||
// based on some content delimeter in the file.
|
||||
if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
|
||||
// make a buffer to hold it
|
||||
int32_t us = getUncompressedSize (m_httpReply,m_httpReplySize);
|
||||
char *ubuf = (char *)mmalloc ( us , "warcbuf" );
|
||||
int32_t realSize = us;
|
||||
int err = gbuncompress ( (unsigned char *) ubuf ,
|
||||
(uint32_t *) &realSize ,
|
||||
(unsigned char *) m_httpReply ,
|
||||
(uint32_t ) m_httpReplySize );
|
||||
// free it i guess
|
||||
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
|
||||
// error uncompressing?
|
||||
if ( err ) {
|
||||
log("build: warc uncompress error %s",
|
||||
mstrerror(g_errno));
|
||||
mfree ( ubuf , us , "warcbuf" );
|
||||
// and reset it
|
||||
m_httpReplySize = 0;
|
||||
m_httpReply = NULL;
|
||||
m_httpReplyAllocSize = 0;
|
||||
}
|
||||
else {
|
||||
// ok, successful.
|
||||
m_httpReply = ubuf;
|
||||
m_httpReplySize = realSize;
|
||||
m_httpReplyAllocSize = us;
|
||||
log("build: warc uncompress successful");
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// if errors were not local, reset g_errno and set m_indexCode
|
||||
//if ( g_errno == ETCPTIMEDOUT ) m_indexCode = ETCPTIMEDOUT;
|
||||
//if ( g_errno == EBADMIME ) m_indexCode = EBADMIME;
|
||||
|
Loading…
Reference in New Issue
Block a user