Mirror of https://github.com/gigablast/open-source-search-engine.git
(synced 2024-10-04 04:07:13 +03:00)
Fix a couple of cores happening on crawlbot.
Fix a bug where a urls.csv (or other streaming) download could be truncated
because gb thinks a shard is down: even if the shard really is down, wait
for it to come back up instead of dropping its results.
commit fc4731b11c
parent ec5c38bab5
@@ -1778,8 +1778,8 @@ bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
 	else now = getTimeLocal();
 	// . buffer for the MIME request and brief html err msg
 	// . NOTE: ctime appends a \n to the time, so we don't need to
-	char msg[1024];
-	SafeBuf sb(msg,1024,0,false);
+	char msg[1524];
+	SafeBuf sb(msg,1524,0,false);
 
 	char *tt = asctime(gmtime ( &now ));
 	tt [ gbstrlen(tt) - 1 ] = '\0';
@@ -1838,7 +1838,7 @@ bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
 
 	// use this new function that will compress the reply now if the
 	// request was a ZET instead of a GET
-	return sendReply2 ( msg , sb.length() , NULL , 0 , s );
+	return sendReply2 ( sb.getBufStart(), sb.length() , NULL , 0 , s );
 }
 
 bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
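The two hunks above fix a core: sendReply2() was handed the stack array
msg, but SafeBuf only borrows that array as its initial storage. Once the
reply outgrows it, SafeBuf reallocates onto the heap and msg no longer
points at the data, so the old call could send stale bytes or crash.
Growing the array to 1524 bytes makes reallocation rarer, and
sb.getBufStart() always returns the live buffer either way. A minimal
sketch of that borrow-then-grow pattern, using a simplified stand-in
for SafeBuf (this is not the real SafeBuf API):

#include <cstring>
#include <cstdlib>

// Simplified stand-in for SafeBuf: starts on a caller-supplied stack
// array and silently moves to the heap when it runs out of room.
struct GrowBuf {
	char *m_buf;     // current storage: the stack array at first
	int   m_cap;
	int   m_len;
	bool  m_ownsBuf; // true once we have malloc'd our own storage

	GrowBuf ( char *buf , int cap )
		: m_buf(buf), m_cap(cap), m_len(0), m_ownsBuf(false) {}
	~GrowBuf ( ) { if ( m_ownsBuf ) free ( m_buf ); }

	bool safeMemcpy ( const char *s , int n ) {
		if ( m_len + n > m_cap ) {
			// grow: the data relocates to the heap HERE, which
			// is why the caller's stack array goes stale
			int newCap = ( m_len + n ) * 2;
			char *p = (char *)malloc ( newCap );
			if ( ! p ) return false;
			memcpy ( p , m_buf , m_len );
			if ( m_ownsBuf ) free ( m_buf );
			m_buf = p; m_cap = newCap; m_ownsBuf = true;
		}
		memcpy ( m_buf + m_len , s , n );
		m_len += n;
		return true;
	}
	int   length      ( ) { return m_len; }
	char *getBufStart ( ) { return m_buf; } // always the live buffer
};

After any append, getBufStart() is the only safe way to reach the bytes;
the original array is valid only while the buffer has never grown.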
@@ -1851,8 +1851,8 @@ bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
 	else now = getTimeLocal();
 
 	int32_t format = gr->m_hr.getReplyFormat();
-	char msg[1024];
-	SafeBuf sb(msg,1024,0,false);
+	char msg[1524];
+	SafeBuf sb(msg,1524,0,false);
 	char *tt = asctime(gmtime ( &now ));
 	tt [ gbstrlen(tt) - 1 ] = '\0';
 
@@ -1904,7 +1904,7 @@ bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
 
 	// use this new function that will compress the reply now if the
 	// request was a ZET instead of a GET
-	return sendReply2 ( msg , sb.length() , NULL , 0 , gr->m_socket );
+	return sendReply2 ( sb.getBufStart(),sb.length(),NULL,0,gr->m_socket );
 }
 
 // . send an error reply, like "HTTP/1.1 404 Not Found"
@@ -1931,8 +1931,8 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , int32_t error , char *errmsg ,
 
 	// . buffer for the MIME request and brief html err msg
 	// . NOTE: ctime appends a \n to the time, so we don't need to
-	char msg[1024];
-	SafeBuf sb(msg,1024,0,false);
+	char msg[1524];
+	SafeBuf sb(msg,1524,0,false);
 	// if it's a 404, redirect to home page
 	/*
 	if ( error == 404 )
@@ -2000,8 +2000,8 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , int32_t error , char *errmsg ,
 	// record it
 	if ( bytesSent ) *bytesSent = sb.length();//sendBufSize;
 	// use this new function that will compress the reply now if the
-	// request was a ZET instead of a GET
-	return sendReply2 ( msg , sb.length() , NULL , 0 , s );
+	// request was a ZET instead of a GET mdw
+	return sendReply2 ( sb.getBufStart() , sb.length() , NULL , 0 , s );
 
 	/*
 	// . this returns false if blocked, true otherwise
Msg40.cpp (34 lines changed)
@@ -109,6 +109,8 @@ Msg40::Msg40() {
 	m_printCount = 0;
 	//m_numGigabitInfos = 0;
 	m_numCollsToSearch = 0;
+	m_numMsg20sIn = 0;
+	m_numMsg20sOut = 0;
 }
 
 #define MAX2 50
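The constructor hunk zeroes the new Msg20 fan-out counters so completion
checks never compare garbage. The idea, reduced to a standalone sketch
(hypothetical names, not the repo's types):

// Track an asynchronous fan-out: bump "out" per request launched,
// "in" per reply received; the batch is done only when they meet.
struct FanOutCounters {
	int m_numMsg20sIn;
	int m_numMsg20sOut;
	FanOutCounters ( ) : m_numMsg20sIn(0), m_numMsg20sOut(0) {}
	void launched ( ) { m_numMsg20sOut++; }
	void replied  ( ) { m_numMsg20sIn++;  }
	bool allDone  ( ) { return m_numMsg20sIn >= m_numMsg20sOut; }
};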
@@ -1500,14 +1502,27 @@ bool Msg40::launchMsg20s ( bool recalled ) {
 		int64_t docId = m_msg3a.m_docIds[i];
 		uint32_t shardNum = g_hostdb.getShardNumFromDocId ( docId );
 		if ( g_hostdb.isShardDead ( shardNum ) ) {
-			log("msg40: skipping summary lookup #%"INT32" of "
-			    "docid %"INT64" for dead shard #%"INT32""
-			    , i
-			    , docId
-			    , shardNum );
-			m_numRequests++;
-			m_numReplies++;
-			continue;
+			CollectionRec *cr ;
+			cr = g_collectiondb.getRec(m_firstCollnum);
+			if ( cr &&
+			     // diffbot urls.csv downloads often encounter dead
+			     // hosts that are not really dead, so wait for it
+			     ! cr->m_isCustomCrawl &&
+			     // this is causing us to truncate streamed results
+			     // too early when we have false positives that a
+			     // host is dead because the server is locking up
+			     // periodically
+			     ! m_si->m_streamResults ) {
+				log("msg40: skipping summary "
+				    "lookup #%"INT32" of "
+				    "docid %"INT64" for dead shard #%"INT32""
+				    , i
+				    , docId
+				    , shardNum );
+				m_numRequests++;
+				m_numReplies++;
+				continue;
+			}
 		}
 
 
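This is the heart of the truncation fix. Previously every docid on a dead
shard was skipped outright; now the skip happens only when losing those
results is acceptable, i.e. for a plain query that is neither a custom
(diffbot) crawl nor a streamed download such as urls.csv. Distilled into
a standalone predicate (a hypothetical helper for illustration, not code
from the repo):

// Return true only when it is safe to drop a dead shard's summaries.
// Custom crawls and streamed downloads must instead wait for the shard,
// since the "dead" flag is often a false positive from a locked-up host.
static bool canSkipDeadShard ( bool shardDead ,
                               bool isCustomCrawl ,
                               bool streamResults ) {
	if ( ! shardDead   ) return false; // shard is fine, nothing to skip
	if ( isCustomCrawl ) return false; // diffbot crawl: wait it out
	if ( streamResults ) return false; // streaming: skipping truncates
	return true;
}

When a skip does happen, bumping m_numRequests and m_numReplies together
keeps the m_numReplies < m_numRequests completion check in gotSummary()
consistent without waiting on a host that will never answer.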
@@ -2215,12 +2230,11 @@ bool Msg40::gotSummary ( ) {
 
 complete:
 
-	// . ok, now i wait for everybody.
+	// . ok, now i wait for all msg20s (getsummary) to come back in.
 	// . TODO: evaluate if this hurts us
 	if ( m_numReplies < m_numRequests )
 		return false;
 
-
 	// if streaming results, we are done
 	if ( m_si && m_si->m_streamResults ) {
 		// unless waiting for last transmit to complete
@@ -8300,7 +8300,13 @@ bool SpiderLoop::spiderUrl2 ( ) {
 	// count it as a hit
 	//g_stats.m_spiderUrlsHit++;
 	// sanity check
-	if (m_sreq->m_priority <= -1 ) { char *xx=NULL;*xx=0; }
+	if (m_sreq->m_priority <= -1 ) {
+		log("spider: fixing bogus spider req priority of %i for "
+		    "url %s",
+		    (int)m_sreq->m_priority,m_sreq->m_url);
+		m_sreq->m_priority = 0;
+		//char *xx=NULL;*xx=0;
+	}
 	//if(m_sreq->m_priority >= MAX_SPIDER_PRIORITIES){char *xx=NULL;*xx=0;}
 	// update this
 	m_sc->m_outstandingSpiders[(unsigned char)m_sreq->m_priority]++;
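This hunk removes one of the crawlbot cores. The pattern
char *xx=NULL;*xx=0; is gb's idiom for a deliberate segfault on a failed
sanity check, and spider requests arriving with a bogus negative priority
were tripping it. The fix logs the bad value and clamps it to 0 instead
of crashing. The same repair pattern as a self-contained sketch
(hypothetical names, not the repo's API):

#include <cstdio>

// Clamp an out-of-range spider priority instead of crashing on it:
// log the bogus value so it can still be investigated, then continue
// with the lowest valid priority.
static int sanitizePriority ( int priority , const char *url ) {
	if ( priority <= -1 ) {
		fprintf ( stderr ,
		          "spider: fixing bogus spider req priority of %d "
		          "for url %s\n" , priority , url );
		priority = 0;
	}
	return priority;
}

Trading the hard crash for a logged clamp keeps a single malformed
request from taking down the whole spider loop.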