mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Fix a core dump. Do not get sections of non-HTML
or non-text documents; this was causing an EMALFORMED sections error on Diffbot JSON.
This commit is contained in:
parent
308106673c
commit
99c6390a69
@ -3373,6 +3373,8 @@ bool SpiderColl::scanListForWinners ( ) {
|
|||||||
for ( ; ! list->isExhausted() ; ) {
|
for ( ; ! list->isExhausted() ; ) {
|
||||||
// breathe
|
// breathe
|
||||||
QUICKPOLL ( MAX_NICENESS );
|
QUICKPOLL ( MAX_NICENESS );
|
||||||
|
// stop coring on empty lists
|
||||||
|
if ( list->isEmpty() ) break;
|
||||||
// get spiderdb rec in its serialized form
|
// get spiderdb rec in its serialized form
|
||||||
char *rec = list->getCurrentRec();
|
char *rec = list->getCurrentRec();
|
||||||
// sanity
|
// sanity
|
||||||
|
44
XmlDoc.cpp
44
XmlDoc.cpp
@ -5810,12 +5810,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
|||||||
if ( m_explicitSectionsValid ) return &m_sections;
|
if ( m_explicitSectionsValid ) return &m_sections;
|
||||||
|
|
||||||
// if json forget this it is only html
|
// if json forget this it is only html
|
||||||
//uint8_t *ct = getContentType();
|
uint8_t *ct = getContentType();
|
||||||
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||||
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||||
// m_sectionsValid = true;
|
m_sectionsValid = true;
|
||||||
// return &m_sections;
|
return &m_sections;
|
||||||
//}
|
}
|
||||||
|
|
||||||
|
|
||||||
setStatus ( "getting explicit sections" );
|
setStatus ( "getting explicit sections" );
|
||||||
@ -5853,8 +5853,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
|||||||
long long *d = getDocId();
|
long long *d = getDocId();
|
||||||
if ( ! d || d == (long long *)-1 ) return (Sections *)d;
|
if ( ! d || d == (long long *)-1 ) return (Sections *)d;
|
||||||
// get the content type
|
// get the content type
|
||||||
uint8_t *ct = getContentType();
|
//uint8_t *ct = getContentType();
|
||||||
if ( ! ct ) return NULL;
|
//if ( ! ct ) return NULL;
|
||||||
|
|
||||||
CollectionRec *cr = getCollRec();
|
CollectionRec *cr = getCollRec();
|
||||||
if ( ! cr ) return NULL;
|
if ( ! cr ) return NULL;
|
||||||
@ -5916,12 +5916,12 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
|||||||
if ( m_impliedSectionsValid ) return &m_sections;
|
if ( m_impliedSectionsValid ) return &m_sections;
|
||||||
|
|
||||||
// if json forget this it is only html
|
// if json forget this it is only html
|
||||||
//uint8_t *ct = getContentType();
|
uint8_t *ct = getContentType();
|
||||||
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||||
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||||
// m_sectionsValid = true;
|
m_sectionsValid = true;
|
||||||
// return &m_sections;
|
return &m_sections;
|
||||||
//}
|
}
|
||||||
|
|
||||||
// get the sections without implied sections
|
// get the sections without implied sections
|
||||||
Sections *sections = getExplicitSections();
|
Sections *sections = getExplicitSections();
|
||||||
@ -5941,8 +5941,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
|||||||
// bail on error
|
// bail on error
|
||||||
if ( ! bits ) return NULL;
|
if ( ! bits ) return NULL;
|
||||||
// get the content type
|
// get the content type
|
||||||
uint8_t *ct = getContentType();
|
//uint8_t *ct = getContentType();
|
||||||
if ( ! ct ) return NULL;
|
//if ( ! ct ) return NULL;
|
||||||
|
|
||||||
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
||||||
|
|
||||||
@ -6004,12 +6004,12 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
|||||||
Sections *XmlDoc::getSections ( ) {
|
Sections *XmlDoc::getSections ( ) {
|
||||||
|
|
||||||
// if json forget this it is only html
|
// if json forget this it is only html
|
||||||
//uint8_t *ct = getContentType();
|
uint8_t *ct = getContentType();
|
||||||
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||||
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||||
// m_sectionsValid = true;
|
m_sectionsValid = true;
|
||||||
// return &m_sections;
|
return &m_sections;
|
||||||
//}
|
}
|
||||||
|
|
||||||
// get the sections without implied sections
|
// get the sections without implied sections
|
||||||
Sections *ss = getImpliedSections();
|
Sections *ss = getImpliedSections();
|
||||||
|
Loading…
Reference in New Issue
Block a user