Added logic to retry the diffbot request on connection reset,

connection timed out, or gateway timed out (HTTP status 504)
errors. Added logic to detect truncated JSON (missing final })
and not print it. Also, at index time, we set g_errno to the diffbot
missing-curly error so the whole URL can be retried later.
This commit is contained in:
Matt Wells 2015-03-09 20:54:34 -07:00
parent eccb969e5b
commit e346a14a47
6 changed files with 104 additions and 12 deletions

View File

@ -190,6 +190,7 @@ case EINLINESECTIONS: return "Error generating section votes";
case EREADONLYMODE: return "In read only mode. Failed.";
case ENOTITLEREC: return "No title rec found when recycling content";
case EQUERYINGDISABLED: return "Querying is disabled in the master controls";
case EJSONMISSINGLASTCURLY: return "JSON was missing last curly bracket";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

View File

@ -194,6 +194,7 @@ enum {
EINLINESECTIONS,
EREADONLYMODE,
ENOTITLEREC,
EQUERYINGDISABLED
EQUERYINGDISABLED,
EJSONMISSINGLASTCURLY
};
#endif

View File

@ -474,3 +474,17 @@ char *JsonItem::getValueAsString ( int32_t *valueLen ) {
*valueLen = sprintf ( s_numBuf,"%f", m_valueDouble );
return s_numBuf;
}
// Returns true if the buffer "s" of "slen" bytes ends in a '}' once a
// single optional trailing '\0' and any trailing whitespace are skipped.
// Used to detect truncated json. Only the last 31 bytes of the buffer are
// examined, so a '}' followed by a longer run of whitespace is reported
// as missing. Returns false for a NULL or empty buffer.
bool endsInCurly ( char *s , int32_t slen ) {
	// nothing to look at. bail out before forming s + slen - 1, which
	// would point outside the buffer and could then be dereferenced.
	if ( ! s || slen <= 0 ) return false;
	// last byte of the buffer
	char *e = s + slen - 1;
	// don't backup more than 30 chars. only compute e - 30 when it
	// lands inside the buffer, otherwise the floor is the buffer start.
	char *m = s;
	if ( slen > 31 ) m = e - 30;
	// \0?
	if ( e > m && *e == '\0' ) e--;
	// scan backwards, skipping whitespace
	for ( ; e > m && is_wspace_a(*e) ; e-- );
	// should be a } now to be valid json. e is always within [m,s+slen)
	// here so it is safe to read.
	return ( *e == '}' );
}

2
Json.h
View File

@ -15,6 +15,8 @@
#define MAXJSONPARENTS 64
bool endsInCurly ( char *s , int32_t slen );
class JsonItem {
public:

View File

@ -3930,6 +3930,20 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
// a dud? just print empty {}'s
if ( mr->size_content == 1 )
sb->safePrintf("{}");
// must have an ending } otherwise it was truncated json.
// i'm seeing this happen sometimes, i do not know if diffbot
// or gigablast is truncating the json
else if ( ! endsInCurly ( mr->ptr_content, mr->size_content )){
sb->safePrintf("{"
"\"error\":"
"\"Bad JSON. "
"Diffbot reply was missing final "
"curly bracket. Truncated JSON.\""
"}");
// make a note of it
log("results: omitting diffbot reply missing curly "
"for %s",mr->ptr_ubuf);
}
// if it's a diffbot object just print it out directly
// into the json. it is already json.
else

View File

@ -1101,7 +1101,8 @@ CollectionRec *XmlDoc::getCollRec ( ) {
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.m_recs[m_collnum];
if ( ! cr ) {
log("build: got NULL collection rec.");
log("build: got NULL collection rec for collnum=%"INT32".",
(int32_t)m_collnum);
g_errno = ENOCOLLREC;
return NULL;
}
@ -14271,6 +14272,17 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
hadError = true;
}
// just retry if connection got reset by peer!
if ( g_errno == ECONNRESET ||
g_errno == ETIMEDOUT ) {
retry:
log("buld: retrying diffbot reply");
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
return;
}
//char *buf = s->m_readBuf;
// do not allow TcpServer.cpp to free it since m_diffbotReply
// is now responsible for that
@ -14284,19 +14296,30 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
// g_errno should be set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("xmldoc: error setting diffbot mime");
log("build: error setting diffbot mime");
THIS->m_diffbotReplyError = EDIFFBOTMIMEERROR;
hadError = true;
}
bool retryUrl = false;
// check the status
if ( ! hadError && mime.getHttpStatus() != 200 ) {
THIS->m_diffbotReplyError = EDIFFBOTBADHTTPSTATUS;
log("xmldoc: diffbot reply mime was %"INT32"",
mime.getHttpStatus());
hadError = true;
// gateway timed out? then retry.
if ( mime.getHttpStatus() == 504 )
retryUrl = true;
}
if ( hadError )
log("build: diffbot error for url %s",
THIS->m_diffbotUrl.getBufStart());
if ( retryUrl )
goto retry;
// get page content
char *page = NULL;
@ -14381,10 +14404,22 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
}
}
// reply is now valid but might be empty
THIS->m_diffbotReplyValid = true;
// if json reply was truncated, that is an error as well.
// likewise we have to check if such bad json is in the serps
// when doing an icc=1 and print 'bad json' in json instead.
if ( ! THIS->m_diffbotReplyError && s->m_readOffset > 1 &&
// json must end with '}' (ignores trailing whitespace)
! endsInCurly ( s->m_readBuf , s->m_readOffset ) ) {
// hopefully this can be re-tried later.
THIS->m_diffbotReplyError = EJSONMISSINGLASTCURLY;
// make a note of it
log("build: got diffbot reply missing curly for %s",
THIS->m_firstUrl.m_url);
}
//if ( ! cr ) return;
bool countIt = true;
@ -14886,6 +14921,14 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
// . the diffbot reply will be a list of json objects we want to index
SafeBuf *XmlDoc::getDiffbotReply ( ) {
// got reply of malformed json missing final '}'
if ( m_diffbotReplyValid &&
m_diffbotReplyError == EJSONMISSINGLASTCURLY ) {
// hopefully spider will retry later
g_errno = m_diffbotReplyError;
return NULL;
}
if ( m_diffbotReplyValid )
return &m_diffbotReply;
@ -15178,6 +15221,10 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
//if ( api && api[0] == '/' ) { api++; apiLen--; }
// append the custom url. i.e. /api/analyze?mode=auto&u=
//if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
// reset it in case we are a re-call from gotDiffbotReplyWrapper()
// if g_errno == ECONNRESET
m_diffbotUrl.reset();
// store the api url into here
m_diffbotUrl.safeMemcpy ( apiUrl.getUrl() , apiUrl.getUrlLen() );
@ -15323,6 +15370,11 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// mark as tried
if ( m_srepValid ) { char *xx=NULL;*xx=0; }
// might have been a recall if gotDiffbotReplyWrapper() sensed
// g_errno == ECONNRESET and it will retry
if ( ! m_sentToDiffbot ) {
m_sentToDiffbot = 1;
// count it for stats
@ -15333,6 +15385,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
cr->localCrawlInfoUpdate();
cr->m_needsSave = true;
}
char *additionalHeaders = NULL;
if ( headers.length() > 0 )
@ -15356,7 +15409,11 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
0 , // ifmodifiedsince
this , // state
gotDiffbotReplyWrapper ,
180*1000, // 180 sec timeout
// MDW: boost timeout from 180 to 18000
// seconds so we can figure out why
// diffbot times out, etc. what is
// going on.
18000*1000, // 18000 sec timeout
0,//proxyip
0,//proxyport
// unlimited replies i guess
@ -20148,6 +20205,9 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
sb->safePrintf("diffbotjsonobjects=%"INT32" ",
(int32_t)m_diffbotJSONCount);
if ( m_diffbotReplyValid )
sb->safePrintf("diffboterror=%"INT32" ",m_diffbotReplyError);
if ( m_siteValid )
sb->safePrintf("site=%s ",ptr_site);