Fix a core dump. Do not get sections of non-HTML
or non-text documents; this was causing an EMALFORMED
sections error on Diffbot JSON.
This commit is contained in:
Matt Wells 2014-01-25 07:02:14 -08:00
parent 308106673c
commit 99c6390a69
2 changed files with 24 additions and 22 deletions

View File

@ -3373,6 +3373,8 @@ bool SpiderColl::scanListForWinners ( ) {
for ( ; ! list->isExhausted() ; ) { for ( ; ! list->isExhausted() ; ) {
// breathe // breathe
QUICKPOLL ( MAX_NICENESS ); QUICKPOLL ( MAX_NICENESS );
// stop coring on empty lists
if ( list->isEmpty() ) break;
// get spiderdb rec in its serialized form // get spiderdb rec in its serialized form
char *rec = list->getCurrentRec(); char *rec = list->getCurrentRec();
// sanity // sanity

View File

@ -5810,12 +5810,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
if ( m_explicitSectionsValid ) return &m_sections; if ( m_explicitSectionsValid ) return &m_sections;
// if json forget this it is only html // if json forget this it is only html
//uint8_t *ct = getContentType(); uint8_t *ct = getContentType();
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
// m_sectionsValid = true; m_sectionsValid = true;
// return &m_sections; return &m_sections;
//} }
setStatus ( "getting explicit sections" ); setStatus ( "getting explicit sections" );
@ -5853,8 +5853,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
long long *d = getDocId(); long long *d = getDocId();
if ( ! d || d == (long long *)-1 ) return (Sections *)d; if ( ! d || d == (long long *)-1 ) return (Sections *)d;
// get the content type // get the content type
uint8_t *ct = getContentType(); //uint8_t *ct = getContentType();
if ( ! ct ) return NULL; //if ( ! ct ) return NULL;
CollectionRec *cr = getCollRec(); CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL; if ( ! cr ) return NULL;
@ -5916,12 +5916,12 @@ Sections *XmlDoc::getImpliedSections ( ) {
if ( m_impliedSectionsValid ) return &m_sections; if ( m_impliedSectionsValid ) return &m_sections;
// if json forget this it is only html // if json forget this it is only html
//uint8_t *ct = getContentType(); uint8_t *ct = getContentType();
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
// m_sectionsValid = true; m_sectionsValid = true;
// return &m_sections; return &m_sections;
//} }
// get the sections without implied sections // get the sections without implied sections
Sections *sections = getExplicitSections(); Sections *sections = getExplicitSections();
@ -5941,8 +5941,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
// bail on error // bail on error
if ( ! bits ) return NULL; if ( ! bits ) return NULL;
// get the content type // get the content type
uint8_t *ct = getContentType(); //uint8_t *ct = getContentType();
if ( ! ct ) return NULL; //if ( ! ct ) return NULL;
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
@ -6004,12 +6004,12 @@ Sections *XmlDoc::getImpliedSections ( ) {
Sections *XmlDoc::getSections ( ) { Sections *XmlDoc::getSections ( ) {
// if json forget this it is only html // if json forget this it is only html
//uint8_t *ct = getContentType(); uint8_t *ct = getContentType();
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
// m_sectionsValid = true; m_sectionsValid = true;
// return &m_sections; return &m_sections;
//} }
// get the sections without implied sections // get the sections without implied sections
Sections *ss = getImpliedSections(); Sections *ss = getImpliedSections();