mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Fix a core dump: do not compute sections for documents that are not
HTML, plain text, or XML. Doing so was causing EMALFORMED sections errors on Diffbot JSON.
This commit is contained in:
parent
308106673c
commit
99c6390a69
@ -3373,6 +3373,8 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
for ( ; ! list->isExhausted() ; ) {
|
||||
// breathe
|
||||
QUICKPOLL ( MAX_NICENESS );
|
||||
// stop coring on empty lists
|
||||
if ( list->isEmpty() ) break;
|
||||
// get spiderdb rec in its serialized form
|
||||
char *rec = list->getCurrentRec();
|
||||
// sanity
|
||||
|
44
XmlDoc.cpp
44
XmlDoc.cpp
@ -5810,12 +5810,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
if ( m_explicitSectionsValid ) return &m_sections;
|
||||
|
||||
// if json forget this it is only html
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
// m_sectionsValid = true;
|
||||
// return &m_sections;
|
||||
//}
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
m_sectionsValid = true;
|
||||
return &m_sections;
|
||||
}
|
||||
|
||||
|
||||
setStatus ( "getting explicit sections" );
|
||||
@ -5853,8 +5853,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
long long *d = getDocId();
|
||||
if ( ! d || d == (long long *)-1 ) return (Sections *)d;
|
||||
// get the content type
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct ) return NULL;
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct ) return NULL;
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
@ -5916,12 +5916,12 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
||||
if ( m_impliedSectionsValid ) return &m_sections;
|
||||
|
||||
// if json forget this it is only html
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
// m_sectionsValid = true;
|
||||
// return &m_sections;
|
||||
//}
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
m_sectionsValid = true;
|
||||
return &m_sections;
|
||||
}
|
||||
|
||||
// get the sections without implied sections
|
||||
Sections *sections = getExplicitSections();
|
||||
@ -5941,8 +5941,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
||||
// bail on error
|
||||
if ( ! bits ) return NULL;
|
||||
// get the content type
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct ) return NULL;
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct ) return NULL;
|
||||
|
||||
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
@ -6004,12 +6004,12 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
||||
Sections *XmlDoc::getSections ( ) {
|
||||
|
||||
// if json forget this it is only html
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
// m_sectionsValid = true;
|
||||
// return &m_sections;
|
||||
//}
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
m_sectionsValid = true;
|
||||
return &m_sections;
|
||||
}
|
||||
|
||||
// get the sections without implied sections
|
||||
Sections *ss = getImpliedSections();
|
||||
|
Loading…
Reference in New Issue
Block a user