mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Added logic to retry the diffbot request on connection reset,
connection timed out, or gateway timed out (HTTP status 504) errors. Added logic to detect truncated JSON (missing the final '}') and not print it. Also, at index time, we now set g_errno to a diffbot "missing curly" error so the whole URL can be retried later.
This commit is contained in:
parent
eccb969e5b
commit
e346a14a47
@ -190,6 +190,7 @@ case EINLINESECTIONS: return "Error generating section votes";
|
||||
case EREADONLYMODE: return "In read only mode. Failed.";
|
||||
case ENOTITLEREC: return "No title rec found when recycling content";
|
||||
case EQUERYINGDISABLED: return "Querying is disabled in the master controls";
|
||||
case EJSONMISSINGLASTCURLY: return "JSON was missing last curly bracket";
|
||||
}
|
||||
// if the remote error bit is clear it must be a regular errno
|
||||
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
||||
|
3
Errno.h
3
Errno.h
@ -194,6 +194,7 @@ enum {
|
||||
EINLINESECTIONS,
|
||||
EREADONLYMODE,
|
||||
ENOTITLEREC,
|
||||
EQUERYINGDISABLED
|
||||
EQUERYINGDISABLED,
|
||||
EJSONMISSINGLASTCURLY
|
||||
};
|
||||
#endif
|
||||
|
14
Json.cpp
14
Json.cpp
@ -474,3 +474,17 @@ char *JsonItem::getValueAsString ( int32_t *valueLen ) {
|
||||
*valueLen = sprintf ( s_numBuf,"%f", m_valueDouble );
|
||||
return s_numBuf;
|
||||
}
|
||||
|
||||
// . returns true if the buffer "s" of "slen" bytes ends in a '}'
// . a single trailing \0 and any trailing whitespace are ignored, but we
//   never look back more than 30 chars from the last byte, so a buffer
//   with more trailing whitespace than that is reported as NOT ending in
//   a curly
// . used to detect truncated json (missing the final '}')
// . returns false for a NULL or empty buffer
bool endsInCurly ( char *s , int32_t slen ) {
	// nothing to scan. this also keeps us from forming a pointer
	// before the start of the buffer or dereferencing a NULL "s"
	if ( ! s || slen <= 0 ) return false;
	// point to the last byte
	char *e = s + slen - 1;
	// don't backup more than 30 chars. only compute "e - 30" when it
	// is known to land inside the buffer.
	char *m = s;
	if ( slen > 31 ) m = e - 30;
	// \0?
	if ( e > m && *e == '\0' ) e--;
	// scan backwards, skipping whitespace
	for ( ; e > m && is_wspace_a(*e) ; e-- );
	// should be a } now to be valid json. e >= m always holds here.
	if ( *e == '}' ) return true;
	return false;
}
|
||||
|
2
Json.h
2
Json.h
@ -15,6 +15,8 @@
|
||||
|
||||
#define MAXJSONPARENTS 64
|
||||
|
||||
// . returns true if the "slen"-byte buffer "s" ends in a '}', ignoring one
//   trailing \0 and trailing whitespace
// . only about the last 30 bytes are examined
// . callers use it to detect truncated json that is missing its final '}'
bool endsInCurly ( char *s , int32_t slen );
|
||||
|
||||
class JsonItem {
|
||||
|
||||
public:
|
||||
|
@ -3930,6 +3930,20 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// a dud? just print empty {}'s
|
||||
if ( mr->size_content == 1 )
|
||||
sb->safePrintf("{}");
|
||||
// must have an ending } otherwise it was truncated json.
|
||||
// i'm seeing this happen sometimes, i do not know if diffbot
|
||||
// or gigablast is truncating the json
|
||||
else if ( ! endsInCurly ( mr->ptr_content, mr->size_content )){
|
||||
sb->safePrintf("{"
|
||||
"\"error\":"
|
||||
"\"Bad JSON. "
|
||||
"Diffbot reply was missing final "
|
||||
"curly bracket. Truncated JSON.\""
|
||||
"}");
|
||||
// make a note of it
|
||||
log("results: omitting diffbot reply missing curly "
|
||||
"for %s",mr->ptr_ubuf);
|
||||
}
|
||||
// if it's a diffbot object just print it out directly
|
||||
// into the json. it is already json.
|
||||
else
|
||||
|
68
XmlDoc.cpp
68
XmlDoc.cpp
@ -1101,7 +1101,8 @@ CollectionRec *XmlDoc::getCollRec ( ) {
|
||||
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
||||
CollectionRec *cr = g_collectiondb.m_recs[m_collnum];
|
||||
if ( ! cr ) {
|
||||
log("build: got NULL collection rec.");
|
||||
log("build: got NULL collection rec for collnum=%"INT32".",
|
||||
(int32_t)m_collnum);
|
||||
g_errno = ENOCOLLREC;
|
||||
return NULL;
|
||||
}
|
||||
@ -14271,6 +14272,17 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
|
||||
hadError = true;
|
||||
}
|
||||
|
||||
// just retry if connection got reset by peer!
|
||||
if ( g_errno == ECONNRESET ||
|
||||
g_errno == ETIMEDOUT ) {
|
||||
retry:
|
||||
log("buld: retrying diffbot reply");
|
||||
// resume. this checks g_errno for being set.
|
||||
THIS->m_masterLoop ( THIS->m_masterState );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
//char *buf = s->m_readBuf;
|
||||
// do not allow TcpServer.cpp to free it since m_diffbotReply
|
||||
// is now responsible for that
|
||||
@ -14284,19 +14296,30 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
|
||||
// g_errno should be set
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
// note it
|
||||
log("xmldoc: error setting diffbot mime");
|
||||
log("build: error setting diffbot mime");
|
||||
THIS->m_diffbotReplyError = EDIFFBOTMIMEERROR;
|
||||
hadError = true;
|
||||
}
|
||||
|
||||
bool retryUrl = false;
|
||||
|
||||
// check the status
|
||||
if ( ! hadError && mime.getHttpStatus() != 200 ) {
|
||||
THIS->m_diffbotReplyError = EDIFFBOTBADHTTPSTATUS;
|
||||
log("xmldoc: diffbot reply mime was %"INT32"",
|
||||
mime.getHttpStatus());
|
||||
hadError = true;
|
||||
// gateway timed out? then retry.
|
||||
if ( mime.getHttpStatus() == 504 )
|
||||
retryUrl = true;
|
||||
}
|
||||
|
||||
if ( hadError )
|
||||
log("build: diffbot error for url %s",
|
||||
THIS->m_diffbotUrl.getBufStart());
|
||||
|
||||
if ( retryUrl )
|
||||
goto retry;
|
||||
|
||||
// get page content
|
||||
char *page = NULL;
|
||||
@ -14381,10 +14404,22 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// reply is now valid but might be empty
|
||||
THIS->m_diffbotReplyValid = true;
|
||||
|
||||
// if json reply was truncated, that is an error as well.
|
||||
// likewise we have to check if such bad json is in the serps
|
||||
// when doing an icc=1 and print 'bad json' in json instead.
|
||||
if ( ! THIS->m_diffbotReplyError && s->m_readOffset > 1 &&
|
||||
// json must end with '}' (ignores trailing whitespace)
|
||||
! endsInCurly ( s->m_readBuf , s->m_readOffset ) ) {
|
||||
// hopefully this can be re-tried later.
|
||||
THIS->m_diffbotReplyError = EJSONMISSINGLASTCURLY;
|
||||
// make a note of it
|
||||
log("build: got diffbot reply missing curly for %s",
|
||||
THIS->m_firstUrl.m_url);
|
||||
}
|
||||
|
||||
//if ( ! cr ) return;
|
||||
|
||||
bool countIt = true;
|
||||
@ -14886,6 +14921,14 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
|
||||
// . the diffbot reply will be a list of json objects we want to index
|
||||
SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
|
||||
// got reply of malformed json missing final '}'
|
||||
if ( m_diffbotReplyValid &&
|
||||
m_diffbotReplyError == EJSONMISSINGLASTCURLY ) {
|
||||
// hopefully spider will retry later
|
||||
g_errno = m_diffbotReplyError;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( m_diffbotReplyValid )
|
||||
return &m_diffbotReply;
|
||||
|
||||
@ -15178,6 +15221,10 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
//if ( api && api[0] == '/' ) { api++; apiLen--; }
|
||||
// append the custom url. i.e. /api/analyze?mode=auto&u=
|
||||
//if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
|
||||
|
||||
// reset it in case we are a re-call from gotDiffbotReplyWrapper()
|
||||
// if g_errno == ECONNRESET
|
||||
m_diffbotUrl.reset();
|
||||
// store the api url into here
|
||||
m_diffbotUrl.safeMemcpy ( apiUrl.getUrl() , apiUrl.getUrlLen() );
|
||||
|
||||
@ -15323,6 +15370,11 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
|
||||
// mark as tried
|
||||
if ( m_srepValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// might have been a recall if gotDiffbotReplyWrapper() sensed
|
||||
// g_errno == ECONNRESET and it will retry
|
||||
if ( ! m_sentToDiffbot ) {
|
||||
|
||||
m_sentToDiffbot = 1;
|
||||
|
||||
// count it for stats
|
||||
@ -15333,6 +15385,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
cr->localCrawlInfoUpdate();
|
||||
|
||||
cr->m_needsSave = true;
|
||||
}
|
||||
|
||||
char *additionalHeaders = NULL;
|
||||
if ( headers.length() > 0 )
|
||||
@ -15356,7 +15409,11 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
0 , // ifmodifiedsince
|
||||
this , // state
|
||||
gotDiffbotReplyWrapper ,
|
||||
180*1000, // 180 sec timeout
|
||||
// MDW: boost timeout from 180 to 18000
|
||||
// seconds so we can figure out why
|
||||
// diffbot times out, etc. what is
|
||||
// going on.
|
||||
18000*1000, // 180 sec timeout
|
||||
0,//proxyip
|
||||
0,//proxyport
|
||||
// unlimited replies i guess
|
||||
@ -20148,6 +20205,9 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
|
||||
sb->safePrintf("diffbotjsonobjects=%"INT32" ",
|
||||
(int32_t)m_diffbotJSONCount);
|
||||
|
||||
if ( m_diffbotReplyValid )
|
||||
sb->safePrintf("diffboterror=%"INT32" ",m_diffbotReplyError);
|
||||
|
||||
if ( m_siteValid )
|
||||
sb->safePrintf("site=%s ",ptr_site);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user