Fix a core dump. Do not get sections of non-HTML

or non-text documents. This was causing an EMALFORMED sections error on Diffbot JSON.
2024-10-04 12:17:35 +03:00 · 2014-01-25 07:02:14 -08:00 · 2014-01-25 07:02:14 -08:00 · 99c6390a69
commit 99c6390a69
parent 308106673c
2 changed files with 24 additions and 22 deletions
--- a/Spider.cpp
+++ b/Spider.cpp
@ -3373,6 +3373,8 @@ bool SpiderColl::scanListForWinners ( ) {
 	for ( ; ! list->isExhausted() ; ) {
 		// breathe
 		QUICKPOLL ( MAX_NICENESS );
+		// stop coring on empty lists
+		if ( list->isEmpty() ) break;
 		// get spiderdb rec in its serialized form
 		char *rec = list->getCurrentRec();
 		// sanity
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -5810,12 +5810,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
 	if ( m_explicitSectionsValid ) return &m_sections;

 	// if json forget this it is only html
-	//uint8_t *ct = getContentType();
-	//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
-	//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
-	//	m_sectionsValid = true;
-	//	return &m_sections;
-	//}
+	uint8_t *ct = getContentType();
+	if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
+	if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
+		m_sectionsValid = true;
+		return &m_sections;
+	}


 	setStatus ( "getting explicit sections" );
@ -5853,8 +5853,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
 	long long *d = getDocId();
 	if ( ! d || d == (long long *)-1 ) return (Sections *)d;
 	// get the content type
-	uint8_t *ct = getContentType();
-	if ( ! ct ) return NULL;
+	//uint8_t *ct = getContentType();
+	//if ( ! ct ) return NULL;

 	CollectionRec *cr = getCollRec();
 	if ( ! cr ) return NULL;
@ -5916,12 +5916,12 @@ Sections *XmlDoc::getImpliedSections ( ) {
 	if ( m_impliedSectionsValid ) return &m_sections;

 	// if json forget this it is only html
-	//uint8_t *ct = getContentType();
-	//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
-	//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
-	//	m_sectionsValid = true;
-	//	return &m_sections;
-	//}
+	uint8_t *ct = getContentType();
+	if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
+	if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
+		m_sectionsValid = true;
+		return &m_sections;
+	}

 	// get the sections without implied sections
 	Sections *sections = getExplicitSections();
@ -5941,8 +5941,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
 	// bail on error
 	if ( ! bits ) return NULL;
 	// get the content type
-	uint8_t *ct = getContentType();
-	if ( ! ct ) return NULL;
+	//uint8_t *ct = getContentType();
+	//if ( ! ct ) return NULL;

 	if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }

@ -6004,12 +6004,12 @@ Sections *XmlDoc::getImpliedSections ( ) {
 Sections *XmlDoc::getSections ( ) {

 	// if json forget this it is only html
-	//uint8_t *ct = getContentType();
-	//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
-	//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
-	//	m_sectionsValid = true;
-	//	return &m_sections;
-	//}
+	uint8_t *ct = getContentType();
+	if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
+	if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
+		m_sectionsValid = true;
+		return &m_sections;
+	}

 	// get the sections without implied sections
 	Sections *ss = getImpliedSections();