some fixes for indexing warcs/arcs.

This commit is contained in:
Matt 2015-04-28 22:30:58 -07:00
parent b6ff0b0173
commit faf2c06d29
5 changed files with 161 additions and 73 deletions

View File

@ -45,6 +45,7 @@ void HttpMime::reset ( ) {
m_locationFieldLen = 0; m_locationFieldLen = 0;
m_contentEncodingPos = NULL; m_contentEncodingPos = NULL;
m_contentLengthPos = NULL; m_contentLengthPos = NULL;
m_contentTypePos = NULL;
} }
// . returns false if could not get a valid mime // . returns false if could not get a valid mime
@ -157,8 +158,12 @@ bool HttpMime::parse ( char *mime , int32_t mimeLen , Url *url ) {
time_t now = time(NULL); time_t now = time(NULL);
if (m_lastModifiedDate > now) m_lastModifiedDate = now; if (m_lastModifiedDate > now) m_lastModifiedDate = now;
} }
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) {
m_contentType = getContentTypePrivate ( p + 13 ); m_contentType = getContentTypePrivate ( p + 13 );
char *s = p + 13;
while ( *s == ' ' || *s == '\t' ) s++;
m_contentTypePos = s;
}
else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) { else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) {
m_cookie = p + 11; m_cookie = p + 11;
if ( m_cookie[0] == ' ' ) m_cookie++; if ( m_cookie[0] == ' ' ) m_cookie++;
@ -540,6 +545,7 @@ int32_t getContentTypeFromStr ( char *s ) {
else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE; else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE;
else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS; else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS;
else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS; else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
else if (!strcasecmp(s,"application/x-gzip" ) ) ct = CT_GZ;
else if (!strcasecmp(s,"text/javascript" ) ) ct = CT_JS; else if (!strcasecmp(s,"text/javascript" ) ) ct = CT_JS;
else if (!strcasecmp(s,"text/x-js" ) ) ct = CT_JS; else if (!strcasecmp(s,"text/x-js" ) ) ct = CT_JS;
else if (!strcasecmp(s,"text/js" ) ) ct = CT_JS; else if (!strcasecmp(s,"text/js" ) ) ct = CT_JS;
@ -626,6 +632,17 @@ void resetHttpMime ( ) {
s_mimeTable.reset(); s_mimeTable.reset();
} }
// . look up the content-type string (like "application/warc") stored in
//   s_mimeTable for the file extension "ext" whose length is "elen"
// . a single leading '.' is skipped: the s_ext[] entries are listed
//   without one ("warc", "arc", "zip", ...), while a caller that scans a
//   url for its last period will naturally hand us ".warc"
// . unlike HttpMime::getContentTypeFromExtension() this does NOT fall
//   back to "text/html" when no extension is provided
// . returns NULL if there is no extension or it is not in the table
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) {
	// no extension provided? then we have nothing to look up
	if ( ! ext || ! ext[0] ) return NULL;
	if ( elen <= 0 ) return NULL;
	// tolerate "ext" pointing at the period itself
	if ( ext[0] == '.' ) { ext++; elen--; }
	// nothing left after the period?
	if ( elen <= 0 || ! ext[0] ) return NULL;
	// get hash for table look up
	int32_t key = hash32 ( ext , elen );
	char **pp = (char **)s_mimeTable.getValue ( &key );
	// unrecognized extension
	if ( ! pp ) return NULL;
	return *pp;
}
const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) { const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) {
// assume text/html if no extension provided // assume text/html if no extension provided
if ( ! ext || ! ext[0] ) return "text/html"; if ( ! ext || ! ext[0] ) return "text/html";
@ -1051,7 +1068,10 @@ static char *s_ext[] = {
"xwd" , "image/x-xwindowdump", "xwd" , "image/x-xwindowdump",
"xyz" , "chemical/x-pdb", "xyz" , "chemical/x-pdb",
"zip" , "application/zip" , "zip" , "application/zip" ,
"xpi", "application/x-xpinstall" "xpi", "application/x-xpinstall",
// newstuff
"warc", "application/warc",
"arc", "application/arc"
}; };
// . init s_mimeTable in this call // . init s_mimeTable in this call

View File

@ -9,6 +9,8 @@
// convert application/json to CT_JSON for instance // convert application/json to CT_JSON for instance
int32_t getContentTypeFromStr ( char *s ) ; int32_t getContentTypeFromStr ( char *s ) ;
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) ;
#include <time.h> #include <time.h>
void getTime ( char *s , int *sec , int *min , int *hour ) ; void getTime ( char *s , int *sec , int *min , int *hour ) ;
@ -42,6 +44,7 @@ time_t atotime5 ( char *s ) ;
#define CT_JSON 16 #define CT_JSON 16
#define CT_IMAGE 17 #define CT_IMAGE 17
#define CT_STATUS 18 // an internal type indicating spider reply #define CT_STATUS 18 // an internal type indicating spider reply
#define CT_GZ 19
#define ET_IDENTITY 0 #define ET_IDENTITY 0
#define ET_GZIP 1 #define ET_GZIP 1
@ -127,6 +130,7 @@ class HttpMime {
int32_t getContentEncoding () {return m_contentEncoding;} int32_t getContentEncoding () {return m_contentEncoding;}
char *getContentEncodingPos() {return m_contentEncodingPos;} char *getContentEncodingPos() {return m_contentEncodingPos;}
char *getContentLengthPos() {return m_contentLengthPos;} char *getContentLengthPos() {return m_contentLengthPos;}
char *getContentTypePos() {return m_contentTypePos;}
// private: // private:
@ -166,6 +170,7 @@ class HttpMime {
int32_t m_contentEncoding; int32_t m_contentEncoding;
char *m_contentEncodingPos; char *m_contentEncodingPos;
char *m_contentLengthPos; char *m_contentLengthPos;
char *m_contentTypePos;
// the size of the terminating boundary, either 1 or 2 bytes. // the size of the terminating boundary, either 1 or 2 bytes.
// just the last \n in the case of a \n\n or \r in the case // just the last \n in the case of a \n\n or \r in the case

View File

@ -1035,19 +1035,21 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
if ( strncmp ( path , "/download/", 10 ) == 0 ) if ( strncmp ( path , "/download/", 10 ) == 0 )
return sendBackDump ( s , r ); return sendBackDump ( s , r );
if ( strncmp ( path , "/iagbcoll/" , 10 ) == 0 ) { if ( strncmp ( path , "/gbiaitem/" , 10 ) == 0 ) {
SafeBuf cmd; SafeBuf cmd;
char *iaItem = path + 10; char *iaItem = path + 10;
char c = iaItem[pathLen]; char c = iaItem[pathLen];
iaItem[pathLen] = '\0'; iaItem[pathLen] = '\0';
// iaItem is like "webgroup-20100422114008-00011" // iaItem is like "webgroup-20100422114008-00011"
// print out the warc files as if they were urls // print out the warc files as if they were urls
// so we can spider them through the spider pipeline as-is // so we can spider them through the spider pipeline as-is.
cmd.safePrintf("%s/ia list %s --glob='*arc.gz' | " // this hack only works on internet archive servers
// that have the '/home/mwells/ia' obviously
cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
"awk '{print \"<a " "awk '{print \"<a "
"href=http://archive.org/download/" "href=http://archive.org/download/"
"%s/\" $1\">\"}' > ./tmpiaout" "%s/\" $1\">\"}' > ./tmpiaout"
, g_hostdb.m_dir //, g_hostdb.m_dir
,iaItem ,iaItem
,iaItem ,iaItem
); );
@ -2388,6 +2390,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
} }
// if has no content then it must end in \n\r\n\r or \r\n\r\n // if has no content then it must end in \n\r\n\r or \r\n\r\n
if ( ! hasContent ) return bufSize; if ( ! hasContent ) return bufSize;
// look for a Content-Type: field because we now limit how much // look for a Content-Type: field because we now limit how much
// we read based on this // we read based on this
char *p = buf; char *p = buf;
@ -2411,45 +2414,71 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
// as well index that at least. // as well index that at least.
if ( p + 15 < pend && strncasecmp( p,"application/pdf",15)==0) if ( p + 15 < pend && strncasecmp( p,"application/pdf",15)==0)
allOrNothing = true; allOrNothing = true;
if ( p + 15 < pend&&strncasecmp(p,"application/x-gzip",18)==0)
allOrNothing = true;
// adjust "max to read" if we don't have an html/plain doc // adjust "max to read" if we don't have an html/plain doc
if ( ! isPost ) { if ( ! isPost ) {
max = s->m_maxOtherDocLen + 10*1024 ; max = s->m_maxOtherDocLen + 10*1024 ;
if ( s->m_maxOtherDocLen == -1 ) max = 0x7fffffff; if ( s->m_maxOtherDocLen == -1 ) max = 0x7fffffff;
} }
} }
// // if it is a warc or arc.gz allow it for now but we should
// // only allow one spider at a time per host
if ( s->m_sendBuf ) {
char *p = s->m_sendBuf;
char *pend = p + s->m_sendBufSize;
if ( strncmp(p,"GET /",5) == 0 ) p += 4;
// find end of url we are getting
char *e = p;
for ( ; *e && e < pend && ! is_wspace_a(*e) ; e++ );
if ( e - 8 > p && strncmp(e-8,".warc.gz", 8 ) == 0 )
max = 0x7fffffff;
if ( e - 7 > p && strncmp(e-7, ".arc.gz", 7 ) == 0 )
max = 0x7fffffff;
}
int32_t contentSize = 0;
int32_t totalReplySize = 0;
// now look for Content-Length in the mime // now look for Content-Length in the mime
for ( int32_t j = 0; j < i ; j++ ) { int32_t j; for ( j = 0; j < i ; j++ ) {
if ( buf[j] != 'c' && buf[j] != 'C' ) continue; if ( buf[j] != 'c' && buf[j] != 'C' ) continue;
if ( j + 16 >= i ) break; if ( j + 16 >= i ) break;
if ( strncasecmp ( &buf[j], "Content-Length:" , 15 ) != 0 ) if ( strncasecmp ( &buf[j], "Content-Length:" , 15 ) != 0 )
continue; continue;
int32_t contentSize = atol2 ( &buf[j+15] , i - (j+15) ); contentSize = atol2 ( &buf[j+15] , i - (j+15) );
int32_t totalReplySize = contentSize + mimeSize ; totalReplySize = contentSize + mimeSize ;
// all-or-nothing filter break;
if ( totalReplySize > max && allOrNothing ) {
log(LOG_INFO,
"http: pdf reply/request size of %"INT32" is larger "
"than limit of %"INT32". Cutoff pdf's are useless. "
"Abandoning.",totalReplySize,max);
// do not read any more than what we have
return bufSize;
}
// warn if we received a post that was truncated
if ( totalReplySize > max && isPost ) {
log("http: Truncated POST request from %"INT32" "
"to %"INT32" bytes. Increase \"max other/text doc "
"len\" in Spider Controls page to prevent this.",
totalReplySize,max);
}
// truncate the reply if we have to
if ( totalReplySize > max ) {
log("http: truncating reply of %"INT32" to %"INT32" bytes",
totalReplySize,max);
totalReplySize = max;
}
// truncate if we need to
return totalReplySize;
} }
// all-or-nothing filter
if ( totalReplySize > max && allOrNothing ) {
log(LOG_INFO,
"http: reply/request size of %"INT32" is larger "
"than limit of %"INT32". Cutoff documents "
"of this type are useless. "
"Abandoning.",totalReplySize,max);
// do not read any more than what we have
return bufSize;
}
// warn if we received a post that was truncated
if ( totalReplySize > max && isPost ) {
log("http: Truncated POST request from %"INT32" "
"to %"INT32" bytes. Increase \"max other/text doc "
"len\" in Spider Controls page to prevent this.",
totalReplySize,max);
}
// truncate the reply if we have to
if ( totalReplySize > max ) {
log("http: truncating reply of %"INT32" to %"INT32" bytes",
totalReplySize,max);
totalReplySize = max;
}
// truncate if we need to
if ( totalReplySize )
return totalReplySize;
// if it is a POST request with content but no content length... // if it is a POST request with content but no content length...
// we don't know how big it is... // we don't know how big it is...
if ( isPost ) { if ( isPost ) {
@ -2880,16 +2909,32 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
// so we need to rewrite the Content-Length: and the // so we need to rewrite the Content-Length: and the
// Content-Encoding: http mime field values so they are no longer // Content-Encoding: http mime field values so they are no longer
// "gzip" and use the uncompressed content-length. // "gzip" and use the uncompressed content-length.
char *ptr1 = NULL; char *ptr1 = mime.getContentEncodingPos();
char *ptr2 = NULL; char *ptr2 = mime.getContentLengthPos();
if(mime.getContentEncodingPos() && char *ptr3 = NULL;
mime.getContentEncodingPos() < mime.getContentLengthPos()) {
ptr1 = mime.getContentEncodingPos(); // change the content type based on the extension before the
ptr2 = mime.getContentLengthPos(); // .gz extension since we are uncompressing it
} char *p = s->m_readBuf + 4;
else { char *pend = s->m_readBuf + s->m_readBufSize;
ptr1 = mime.getContentLengthPos(); const char *newCT = NULL;
ptr2 = mime.getContentEncodingPos(); char *lastPeriod = NULL;
// get the extension, if any, before the .gz
for ( ; *p && ! is_wspace_a(*p) && p < pend ; p++ ) {
if ( p[0] != '.' ) continue;
if ( p[1] != 'g' ) { lastPeriod = p; continue; }
if ( p[2] != 'z' ) { lastPeriod = p; continue; }
if ( ! is_wspace_a(p[3]) ) { lastPeriod = p; continue; }
// no prev?
if ( ! lastPeriod ) break;
// back up
newCT = extensionToContentTypeStr2 (lastPeriod,p-lastPeriod);
// this is NULL if the file extension is unrecognized
if ( ! newCT ) break;
// this should be like text/html or
// WARC/html or something like that...
ptr3 = mime.getContentTypePos();
break;
} }
// this was writing a number at the start of the mime and messing // this was writing a number at the start of the mime and messing
@ -2901,38 +2946,45 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
char *src = s->m_readBuf; char *src = s->m_readBuf;
// sometimes they are missing Content-Length: // sometimes they are missing Content-Length:
if ( ptr1 ) {
// copy ptr1 to src
gbmemcpy ( pnew, src, ptr1 - src );
pnew += ptr1 - src;
src += ptr1 - src;
// store either the new content encoding or new length
if(ptr1 == mime.getContentEncodingPos())
pnew += sprintf(pnew, " identity");
else
pnew += sprintf(pnew, " %"INT32"",newSize);
// scan to \r\n at end of that line we replace
while ( *src != '\r' && *src != '\n') src++;
}
if ( ptr2 ) { subloop:
// copy ptr2 to src
gbmemcpy ( pnew , src , ptr2 - src ); char *nextMin = (char *)-1;
pnew += ptr2 - src; if ( ptr1 && (ptr1 < nextMin || nextMin==(char *)-1)) nextMin = ptr1;
src += ptr2 - src; if ( ptr2 && (ptr2 < nextMin || nextMin==(char *)-1)) nextMin = ptr2;
// now insert the new shit if ( ptr3 && (ptr3 < nextMin || nextMin==(char *)-1)) nextMin = ptr3;
if(ptr2 == mime.getContentEncodingPos())
pnew += sprintf(pnew, " identity"); // if all ptrs are NULL then copy the tail
else if ( nextMin == (char *)-1 ) nextMin = mimeEnd;
pnew += sprintf(pnew, " %"INT32"",newSize);
// scan to \r\n at end of that line we replace // copy ptr1 to src
while ( *src != '\r' && *src != '\n') src++; gbmemcpy ( pnew, src, nextMin - src );
pnew += nextMin - src;
src += nextMin - src;
// store either the new content encoding or new length
if ( nextMin == mime.getContentEncodingPos()) {
pnew += sprintf(pnew, " identity");
ptr1 = NULL;
} }
else if ( nextMin == mime.getContentLengthPos() ) {
pnew += sprintf(pnew, " %"INT32"",newSize);
ptr2 = NULL;
}
else if ( nextMin == mime.getContentTypePos() ) {
pnew += sprintf(pnew," %s",newCT);
ptr3 = NULL;
}
// scan to \r\n at end of that line we replace
while ( *src != '\r' && *src != '\n') src++;
// loop for more
if ( nextMin < mimeEnd ) goto subloop;
// copy the rest // copy the rest
gbmemcpy ( pnew , src , mimeEnd - src ); // gbmemcpy ( pnew , src , mimeEnd - src );
pnew += mimeEnd - src; // pnew += mimeEnd - src;
src += mimeEnd - src; // src += mimeEnd - src;
// before restLen was negative because we were skipping over // before restLen was negative because we were skipping over

View File

@ -89,7 +89,7 @@ OS_DEB := true
STATIC := -static STATIC := -static
# MDW: i get some parsing inconsistencies when running the first qa injection # MDW: i get some parsing inconsistencies when running the first qa injection
# test if this is -O3. strange. # test if this is -O3. strange.
XMLDOCOPT := -O2 XMLDOCOPT := -O0
endif endif

View File

@ -2720,6 +2720,10 @@ bool XmlDoc::indexDoc2 ( ) {
// scan it using delimiters. the file consists of multiple documents // scan it using delimiters. the file consists of multiple documents
// separated by this content delimiter. // separated by this content delimiter.
if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) { if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
// we need the doc
char **replyPtr = getHttpReply ();
if ( ! replyPtr ) return true;
if ( replyPtr == (void *)-1 ) return false;
// already called inject? // already called inject?
if ( m_calledWarcInject ) if ( m_calledWarcInject )
// then we are done // then we are done
@ -10196,7 +10200,14 @@ Url **XmlDoc::getRedirUrl() {
// http-equiv refresh tag, but that added an element of // http-equiv refresh tag, but that added an element of
// recursion that is just too confusing to deal with. so // recursion that is just too confusing to deal with. so
// let's just parse out the meta tag by hand // let's just parse out the meta tag by hand
if ( ! isRobotsTxt ) { bool checkMeta = true;
if ( isRobotsTxt ) checkMeta = false;
// warc and arc files have a list of html docs
// in them that we need to index, so skip this check
// for them as well
if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() )
checkMeta = false;
if ( checkMeta ) {
Url **mrup = getMetaRedirUrl(); Url **mrup = getMetaRedirUrl();
if ( ! mrup || mrup == (void *)-1) return (Url **)mrup; if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
// set it. might be NULL if not there. // set it. might be NULL if not there.