mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
some fixes for indexing warcs/arcs.
This commit is contained in:
parent
b6ff0b0173
commit
faf2c06d29
24
HttpMime.cpp
24
HttpMime.cpp
@ -45,6 +45,7 @@ void HttpMime::reset ( ) {
|
||||
m_locationFieldLen = 0;
|
||||
m_contentEncodingPos = NULL;
|
||||
m_contentLengthPos = NULL;
|
||||
m_contentTypePos = NULL;
|
||||
}
|
||||
|
||||
// . returns false if could not get a valid mime
|
||||
@ -157,8 +158,12 @@ bool HttpMime::parse ( char *mime , int32_t mimeLen , Url *url ) {
|
||||
time_t now = time(NULL);
|
||||
if (m_lastModifiedDate > now) m_lastModifiedDate = now;
|
||||
}
|
||||
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 )
|
||||
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) {
|
||||
m_contentType = getContentTypePrivate ( p + 13 );
|
||||
char *s = p + 13;
|
||||
while ( *s == ' ' || *s == '\t' ) s++;
|
||||
m_contentTypePos = s;
|
||||
}
|
||||
else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) {
|
||||
m_cookie = p + 11;
|
||||
if ( m_cookie[0] == ' ' ) m_cookie++;
|
||||
@ -540,6 +545,7 @@ int32_t getContentTypeFromStr ( char *s ) {
|
||||
else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE;
|
||||
else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"application/x-gzip" ) ) ct = CT_GZ;
|
||||
else if (!strcasecmp(s,"text/javascript" ) ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"text/x-js" ) ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"text/js" ) ) ct = CT_JS;
|
||||
@ -626,6 +632,17 @@ void resetHttpMime ( ) {
|
||||
s_mimeTable.reset();
|
||||
}
|
||||
|
||||
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) {
|
||||
// assume text/html if no extension provided
|
||||
if ( ! ext || ! ext[0] ) return NULL;
|
||||
if ( elen <= 0 ) return NULL;
|
||||
// get hash for table look up
|
||||
int32_t key = hash32 ( ext , elen );
|
||||
char **pp = (char **)s_mimeTable.getValue ( &key );
|
||||
if ( ! pp ) return NULL;
|
||||
return *pp;
|
||||
}
|
||||
|
||||
const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) {
|
||||
// assume text/html if no extension provided
|
||||
if ( ! ext || ! ext[0] ) return "text/html";
|
||||
@ -1051,7 +1068,10 @@ static char *s_ext[] = {
|
||||
"xwd" , "image/x-xwindowdump",
|
||||
"xyz" , "chemical/x-pdb",
|
||||
"zip" , "application/zip" ,
|
||||
"xpi", "application/x-xpinstall"
|
||||
"xpi", "application/x-xpinstall",
|
||||
// newstuff
|
||||
"warc", "application/warc",
|
||||
"arc", "application/arc"
|
||||
};
|
||||
|
||||
// . init s_mimeTable in this call
|
||||
|
@ -9,6 +9,8 @@
|
||||
// convert application/json to CT_JSON for instance
|
||||
int32_t getContentTypeFromStr ( char *s ) ;
|
||||
|
||||
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) ;
|
||||
|
||||
#include <time.h>
|
||||
|
||||
void getTime ( char *s , int *sec , int *min , int *hour ) ;
|
||||
@ -42,6 +44,7 @@ time_t atotime5 ( char *s ) ;
|
||||
#define CT_JSON 16
|
||||
#define CT_IMAGE 17
|
||||
#define CT_STATUS 18 // an internal type indicating spider reply
|
||||
#define CT_GZ 19
|
||||
|
||||
#define ET_IDENTITY 0
|
||||
#define ET_GZIP 1
|
||||
@ -127,6 +130,7 @@ class HttpMime {
|
||||
int32_t getContentEncoding () {return m_contentEncoding;}
|
||||
char *getContentEncodingPos() {return m_contentEncodingPos;}
|
||||
char *getContentLengthPos() {return m_contentLengthPos;}
|
||||
char *getContentTypePos() {return m_contentTypePos;}
|
||||
|
||||
|
||||
// private:
|
||||
@ -166,6 +170,7 @@ class HttpMime {
|
||||
int32_t m_contentEncoding;
|
||||
char *m_contentEncodingPos;
|
||||
char *m_contentLengthPos;
|
||||
char *m_contentTypePos;
|
||||
|
||||
// the size of the terminating boundary, either 1 or 2 bytes.
|
||||
// just the last \n in the case of a \n\n or \r in the case
|
||||
|
190
HttpServer.cpp
190
HttpServer.cpp
@ -1035,19 +1035,21 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
if ( strncmp ( path , "/download/", 10 ) == 0 )
|
||||
return sendBackDump ( s , r );
|
||||
|
||||
if ( strncmp ( path , "/iagbcoll/" , 10 ) == 0 ) {
|
||||
if ( strncmp ( path , "/gbiaitem/" , 10 ) == 0 ) {
|
||||
SafeBuf cmd;
|
||||
char *iaItem = path + 10;
|
||||
char c = iaItem[pathLen];
|
||||
iaItem[pathLen] = '\0';
|
||||
// iaItem is like "webgroup-20100422114008-00011"
|
||||
// print out the warc files as if they were urls
|
||||
// so we can spider them through the spider pipeline as-is
|
||||
cmd.safePrintf("%s/ia list %s --glob='*arc.gz' | "
|
||||
// so we can spider them through the spider pipeline as-is.
|
||||
// this hack only works on internet archive servers
|
||||
// that have the '/home/mwells/ia' obviously
|
||||
cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
|
||||
"awk '{print \"<a "
|
||||
"href=http://archive.org/download/"
|
||||
"%s/\" $1\">\"}' > ./tmpiaout"
|
||||
, g_hostdb.m_dir
|
||||
//, g_hostdb.m_dir
|
||||
,iaItem
|
||||
,iaItem
|
||||
);
|
||||
@ -2388,6 +2390,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
|
||||
}
|
||||
// if has no content then it must end in \n\r\n\r or \r\n\r\n
|
||||
if ( ! hasContent ) return bufSize;
|
||||
|
||||
// look for a Content-Type: field because we now limit how much
|
||||
// we read based on this
|
||||
char *p = buf;
|
||||
@ -2411,45 +2414,71 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
|
||||
// as well index that at least.
|
||||
if ( p + 15 < pend && strncasecmp( p,"application/pdf",15)==0)
|
||||
allOrNothing = true;
|
||||
if ( p + 15 < pend&&strncasecmp(p,"application/x-gzip",18)==0)
|
||||
allOrNothing = true;
|
||||
// adjust "max to read" if we don't have an html/plain doc
|
||||
if ( ! isPost ) {
|
||||
max = s->m_maxOtherDocLen + 10*1024 ;
|
||||
if ( s->m_maxOtherDocLen == -1 ) max = 0x7fffffff;
|
||||
}
|
||||
}
|
||||
|
||||
// // if it is a warc or arc.gz allow it for now but we should
|
||||
// // only allow one spider at a time per host
|
||||
if ( s->m_sendBuf ) {
|
||||
char *p = s->m_sendBuf;
|
||||
char *pend = p + s->m_sendBufSize;
|
||||
if ( strncmp(p,"GET /",5) == 0 ) p += 4;
|
||||
// find end of url we are getting
|
||||
char *e = p;
|
||||
for ( ; *e && e < pend && ! is_wspace_a(*e) ; e++ );
|
||||
if ( e - 8 > p && strncmp(e-8,".warc.gz", 8 ) == 0 )
|
||||
max = 0x7fffffff;
|
||||
if ( e - 7 > p && strncmp(e-7, ".arc.gz", 7 ) == 0 )
|
||||
max = 0x7fffffff;
|
||||
}
|
||||
|
||||
int32_t contentSize = 0;
|
||||
int32_t totalReplySize = 0;
|
||||
|
||||
// now look for Content-Length in the mime
|
||||
for ( int32_t j = 0; j < i ; j++ ) {
|
||||
int32_t j; for ( j = 0; j < i ; j++ ) {
|
||||
if ( buf[j] != 'c' && buf[j] != 'C' ) continue;
|
||||
if ( j + 16 >= i ) break;
|
||||
if ( strncasecmp ( &buf[j], "Content-Length:" , 15 ) != 0 )
|
||||
continue;
|
||||
int32_t contentSize = atol2 ( &buf[j+15] , i - (j+15) );
|
||||
int32_t totalReplySize = contentSize + mimeSize ;
|
||||
// all-or-nothing filter
|
||||
if ( totalReplySize > max && allOrNothing ) {
|
||||
log(LOG_INFO,
|
||||
"http: pdf reply/request size of %"INT32" is larger "
|
||||
"than limit of %"INT32". Cutoff pdf's are useless. "
|
||||
"Abandoning.",totalReplySize,max);
|
||||
// do not read any more than what we have
|
||||
return bufSize;
|
||||
}
|
||||
// warn if we received a post that was truncated
|
||||
if ( totalReplySize > max && isPost ) {
|
||||
log("http: Truncated POST request from %"INT32" "
|
||||
"to %"INT32" bytes. Increase \"max other/text doc "
|
||||
"len\" in Spider Controls page to prevent this.",
|
||||
totalReplySize,max);
|
||||
}
|
||||
// truncate the reply if we have to
|
||||
if ( totalReplySize > max ) {
|
||||
log("http: truncating reply of %"INT32" to %"INT32" bytes",
|
||||
totalReplySize,max);
|
||||
totalReplySize = max;
|
||||
}
|
||||
// truncate if we need to
|
||||
return totalReplySize;
|
||||
contentSize = atol2 ( &buf[j+15] , i - (j+15) );
|
||||
totalReplySize = contentSize + mimeSize ;
|
||||
break;
|
||||
}
|
||||
|
||||
// all-or-nothing filter
|
||||
if ( totalReplySize > max && allOrNothing ) {
|
||||
log(LOG_INFO,
|
||||
"http: reply/request size of %"INT32" is larger "
|
||||
"than limit of %"INT32". Cutoff documents "
|
||||
"of this type are useless. "
|
||||
"Abandoning.",totalReplySize,max);
|
||||
// do not read any more than what we have
|
||||
return bufSize;
|
||||
}
|
||||
// warn if we received a post that was truncated
|
||||
if ( totalReplySize > max && isPost ) {
|
||||
log("http: Truncated POST request from %"INT32" "
|
||||
"to %"INT32" bytes. Increase \"max other/text doc "
|
||||
"len\" in Spider Controls page to prevent this.",
|
||||
totalReplySize,max);
|
||||
}
|
||||
// truncate the reply if we have to
|
||||
if ( totalReplySize > max ) {
|
||||
log("http: truncating reply of %"INT32" to %"INT32" bytes",
|
||||
totalReplySize,max);
|
||||
totalReplySize = max;
|
||||
}
|
||||
// truncate if we need to
|
||||
if ( totalReplySize )
|
||||
return totalReplySize;
|
||||
|
||||
// if it is a POST request with content but no content length...
|
||||
// we don't know how big it is...
|
||||
if ( isPost ) {
|
||||
@ -2880,16 +2909,32 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
|
||||
// so we need to rewrite the Content-Length: and the
|
||||
// Content-Encoding: http mime field values so they are no longer
|
||||
// "gzip" and use the uncompressed content-length.
|
||||
char *ptr1 = NULL;
|
||||
char *ptr2 = NULL;
|
||||
if(mime.getContentEncodingPos() &&
|
||||
mime.getContentEncodingPos() < mime.getContentLengthPos()) {
|
||||
ptr1 = mime.getContentEncodingPos();
|
||||
ptr2 = mime.getContentLengthPos();
|
||||
}
|
||||
else {
|
||||
ptr1 = mime.getContentLengthPos();
|
||||
ptr2 = mime.getContentEncodingPos();
|
||||
char *ptr1 = mime.getContentEncodingPos();
|
||||
char *ptr2 = mime.getContentLengthPos();
|
||||
char *ptr3 = NULL;
|
||||
|
||||
// change the content type based on the extension before the
|
||||
// .gz extension since we are uncompressing it
|
||||
char *p = s->m_readBuf + 4;
|
||||
char *pend = s->m_readBuf + s->m_readBufSize;
|
||||
const char *newCT = NULL;
|
||||
char *lastPeriod = NULL;
|
||||
// get the extension, if any, before the .gz
|
||||
for ( ; *p && ! is_wspace_a(*p) && p < pend ; p++ ) {
|
||||
if ( p[0] != '.' ) continue;
|
||||
if ( p[1] != 'g' ) { lastPeriod = p; continue; }
|
||||
if ( p[2] != 'z' ) { lastPeriod = p; continue; }
|
||||
if ( ! is_wspace_a(p[3]) ) { lastPeriod = p; continue; }
|
||||
// no prev?
|
||||
if ( ! lastPeriod ) break;
|
||||
// back up
|
||||
newCT = extensionToContentTypeStr2 (lastPeriod,p-lastPeriod);
|
||||
// this is NULL if the file extension is unrecognized
|
||||
if ( ! newCT ) break;
|
||||
// this should be like text/html or
|
||||
// WARC/html or something like that...
|
||||
ptr3 = mime.getContentTypePos();
|
||||
break;
|
||||
}
|
||||
|
||||
// this was writing a number at the start of the mime and messing
|
||||
@ -2901,38 +2946,45 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
|
||||
char *src = s->m_readBuf;
|
||||
|
||||
// sometimes they are missing Content-Length:
|
||||
if ( ptr1 ) {
|
||||
// copy ptr1 to src
|
||||
gbmemcpy ( pnew, src, ptr1 - src );
|
||||
pnew += ptr1 - src;
|
||||
src += ptr1 - src;
|
||||
// store either the new content encoding or new length
|
||||
if(ptr1 == mime.getContentEncodingPos())
|
||||
pnew += sprintf(pnew, " identity");
|
||||
else
|
||||
pnew += sprintf(pnew, " %"INT32"",newSize);
|
||||
// scan to \r\n at end of that line we replace
|
||||
while ( *src != '\r' && *src != '\n') src++;
|
||||
}
|
||||
|
||||
if ( ptr2 ) {
|
||||
// copy ptr2 to src
|
||||
gbmemcpy ( pnew , src , ptr2 - src );
|
||||
pnew += ptr2 - src;
|
||||
src += ptr2 - src;
|
||||
// now insert the new shit
|
||||
if(ptr2 == mime.getContentEncodingPos())
|
||||
pnew += sprintf(pnew, " identity");
|
||||
else
|
||||
pnew += sprintf(pnew, " %"INT32"",newSize);
|
||||
// scan to \r\n at end of that line we replace
|
||||
while ( *src != '\r' && *src != '\n') src++;
|
||||
subloop:
|
||||
|
||||
char *nextMin = (char *)-1;
|
||||
if ( ptr1 && (ptr1 < nextMin || nextMin==(char *)-1)) nextMin = ptr1;
|
||||
if ( ptr2 && (ptr2 < nextMin || nextMin==(char *)-1)) nextMin = ptr2;
|
||||
if ( ptr3 && (ptr3 < nextMin || nextMin==(char *)-1)) nextMin = ptr3;
|
||||
|
||||
// if all ptrs are NULL then copy the tail
|
||||
if ( nextMin == (char *)-1 ) nextMin = mimeEnd;
|
||||
|
||||
// copy everything from src up to nextMin into the new buffer
|
||||
gbmemcpy ( pnew, src, nextMin - src );
|
||||
pnew += nextMin - src;
|
||||
src += nextMin - src;
|
||||
// store the new content encoding, content length or content type
|
||||
if ( nextMin == mime.getContentEncodingPos()) {
|
||||
pnew += sprintf(pnew, " identity");
|
||||
ptr1 = NULL;
|
||||
}
|
||||
else if ( nextMin == mime.getContentLengthPos() ) {
|
||||
pnew += sprintf(pnew, " %"INT32"",newSize);
|
||||
ptr2 = NULL;
|
||||
}
|
||||
else if ( nextMin == mime.getContentTypePos() ) {
|
||||
pnew += sprintf(pnew," %s",newCT);
|
||||
ptr3 = NULL;
|
||||
}
|
||||
// scan to \r\n at end of that line we replace
|
||||
while ( *src != '\r' && *src != '\n') src++;
|
||||
|
||||
// loop for more
|
||||
if ( nextMin < mimeEnd ) goto subloop;
|
||||
|
||||
|
||||
// copy the rest
|
||||
gbmemcpy ( pnew , src , mimeEnd - src );
|
||||
pnew += mimeEnd - src;
|
||||
src += mimeEnd - src;
|
||||
// gbmemcpy ( pnew , src , mimeEnd - src );
|
||||
// pnew += mimeEnd - src;
|
||||
// src += mimeEnd - src;
|
||||
|
||||
|
||||
// before restLen was negative because we were skipping over
|
||||
|
2
Makefile
2
Makefile
@ -89,7 +89,7 @@ OS_DEB := true
|
||||
STATIC := -static
|
||||
# MDW: i get some parsing inconsistencies when running the first qa injection
|
||||
# test if this is -O3. strange.
|
||||
XMLDOCOPT := -O2
|
||||
XMLDOCOPT := -O0
|
||||
endif
|
||||
|
||||
|
||||
|
13
XmlDoc.cpp
13
XmlDoc.cpp
@ -2720,6 +2720,10 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
// scan it using delimiters. the file consists of multiple documents
|
||||
// separated by this content delimiter.
|
||||
if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() ) {
|
||||
// we need the doc
|
||||
char **replyPtr = getHttpReply ();
|
||||
if ( ! replyPtr ) return true;
|
||||
if ( replyPtr == (void *)-1 ) return false;
|
||||
// already called inject?
|
||||
if ( m_calledWarcInject )
|
||||
// then we are done
|
||||
@ -10196,7 +10200,14 @@ Url **XmlDoc::getRedirUrl() {
|
||||
// http-equiv refresh tag, but that added an element of
|
||||
// recursion that is just too confusing to deal with. so
|
||||
// let's just parse out the meta tag by hand
|
||||
if ( ! isRobotsTxt ) {
|
||||
bool checkMeta = true;
|
||||
if ( isRobotsTxt ) checkMeta = false;
|
||||
// warc and arc files have a list of html docs
|
||||
// in them that we need to index, so skip this check
|
||||
// for them as well
|
||||
if ( m_firstUrlValid && m_firstUrl.isCompressedArcOrWarc() )
|
||||
checkMeta = false;
|
||||
if ( checkMeta ) {
|
||||
Url **mrup = getMetaRedirUrl();
|
||||
if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
|
||||
// set it. might be NULL if not there.
|
||||
|
Loading…
Reference in New Issue
Block a user