diff --git a/Blaster.cpp b/Blaster.cpp index 685362eb..b94fcdab 100644 --- a/Blaster.cpp +++ b/Blaster.cpp @@ -651,7 +651,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){ false, 0, false, - TITLEREC_CURRENT_VERSION)){ + TITLEREC_CURRENT_VERSION , + true , // set parents + 0 , // niceness + CT_XML )){ // content type log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2"); } Links links1; @@ -679,7 +682,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){ false, 0, false, - TITLEREC_CURRENT_VERSION)){ + TITLEREC_CURRENT_VERSION, + true , // setparents + 0 , // niceness + CT_XML )){ log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2"); } Links links2; @@ -1170,7 +1176,10 @@ void Blaster::gotDoc4 ( void *state, TcpSocket *s){ false, 0, false, - TITLEREC_CURRENT_VERSION)){ + TITLEREC_CURRENT_VERSION, + true, // setparents + 0, // niceness + CT_XML )){ log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4"); } Links links; diff --git a/Linkdb.cpp b/Linkdb.cpp index 998f65c2..7d885384 100644 --- a/Linkdb.cpp +++ b/Linkdb.cpp @@ -3648,7 +3648,8 @@ bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) { true , // pure xml? TITLEREC_CURRENT_VERSION , false , // no need to now - niceness ); + niceness , + CT_XML ); } // only Title.cpp uses this right now diff --git a/PageGet.cpp b/PageGet.cpp index b8cf5d01..0e9908a8 100644 --- a/PageGet.cpp +++ b/PageGet.cpp @@ -712,7 +712,7 @@ bool processLoop ( void *state ) { //Words *ww = xd->getWords(); if ( ! xml.set ( content , contentLen , false , 0 , false , TITLEREC_CURRENT_VERSION , - false , 0 ) ) { // niceness is 0 + false , 0 , CT_HTML ) ) { // niceness is 0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } diff --git a/Parms.cpp b/Parms.cpp index d6d8724a..8b2d138b 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -4226,7 +4226,10 @@ bool Parms::setXmlFromFile(Xml *xml, char *filename, char *buf, long bufSize){ false , // ownData 0 , // allocSize false , // pureXml? - 0 );// version + 0 , // version + true , // setParents + 0 , // niceness + CT_XML ); } #define MAX_CONF_SIZE 200000 diff --git a/Xml.cpp b/Xml.cpp index f9153ccb..7f6f4920 100644 --- a/Xml.cpp +++ b/Xml.cpp @@ -196,6 +196,8 @@ void Xml::reset ( ) { m_allocSize = 0; } +#include "HttpMime.h" // CT_JSON + // "s" must be in utf8 bool Xml::set ( char *s , long slen , @@ -204,7 +206,8 @@ bool Xml::set ( char *s , bool pureXml , long version , bool setParentsArg , - long niceness ) { + long niceness , + char contentType ) { // just in case reset(); @@ -234,6 +237,28 @@ bool Xml::set ( char *s , return false; } + // if json go no further. TODO: also do this for CT_TEXT etc. + if ( contentType == CT_JSON ) { + m_numNodes = 0; + // make the array + m_maxNumNodes = 1; + m_nodes =(XmlNode *)mmalloc(sizeof(XmlNode)*m_maxNumNodes,"x"); + if ( ! m_nodes ) return false; + XmlNode *xd = &m_nodes[m_numNodes]; + // hack the node + xd->m_node = s; + xd->m_nodeLen = slen; + xd->m_isSelfLink = 0; + // . nodeId for text nodes is 0 + xd->m_nodeId = 0; + xd->m_hasBackTag = false; + xd->m_hash = 0; + xd->m_pairTagNum = -1; + m_numNodes++; + return true; + } + + QUICKPOLL((niceness)); long i; diff --git a/Xml.h b/Xml.h index 30abac2a..795b74a7 100644 --- a/Xml.h +++ b/Xml.h @@ -30,8 +30,10 @@ class Xml { bool ownData , long allocSize, //=0, bool pureXml, // =false ); long version , - bool setParents = true, - long niceness = 0); + bool setParents , // = true, + long niceness , // = 0 + char contentType ); + void reset ( ); diff --git a/XmlDoc.cpp b/XmlDoc.cpp index 041322f9..960fb52a 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -4727,7 +4727,8 @@ void XmlDoc::gotWikiResults ( UdpSlot *slot ) { false , TITLEREC_CURRENT_VERSION , false , // setParents? - m_niceness )) + m_niceness , + CT_HTML )) // return if g_errno got set return; @@ -5206,6 +5207,10 @@ Xml *XmlDoc::getXml ( ) { char **u8 = getUtf8Content(); if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8; long u8len = size_utf8Content - 1; + + uint8_t *ct = getContentType(); + if ( ! ct || ct == (void *)-1 ) return (Xml *)ct; + // note it setStatus ( "getting xml"); // set it @@ -5216,7 +5221,8 @@ Xml *XmlDoc::getXml ( ) { false , // pure xml? m_version , false , // setParentsArg? - m_niceness ) ) + m_niceness , + *ct ) ) // return NULL on error with g_errno set return NULL; // set just once @@ -5813,13 +5819,12 @@ Sections *XmlDoc::getExplicitSections ( ) { if ( m_explicitSectionsValid ) return &m_sections; // if json forget this it is only html - uint8_t *ct = getContentType(); - if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; - if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { - m_sectionsValid = true; - return &m_sections; - } - + //uint8_t *ct = getContentType(); + //if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; + //if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { + // m_sectionsValid = true; + // return &m_sections; + //} setStatus ( "getting explicit sections" ); // use the old title rec to make sure we parse consistently! @@ -5856,8 +5861,8 @@ Sections *XmlDoc::getExplicitSections ( ) { long long *d = getDocId(); if ( ! d || d == (long long *)-1 ) return (Sections *)d; // get the content type - //uint8_t *ct = getContentType(); - //if ( ! ct ) return NULL; + uint8_t *ct = getContentType(); + if ( ! ct ) return NULL; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; @@ -5873,6 +5878,8 @@ Sections *XmlDoc::getExplicitSections ( ) { // this uses the sectionsReply to see which sections are "text", etc. // rather than compute it expensively if ( ! m_calledSections && + // we get malformed sections error for some diffbot replies + //*ct != CT_JSON && ! m_sections.set ( &m_words , &m_phrases , bits , @@ -5918,14 +5925,6 @@ Sections *XmlDoc::getExplicitSections ( ) { Sections *XmlDoc::getImpliedSections ( ) { if ( m_impliedSectionsValid ) return &m_sections; - // if json forget this it is only html - uint8_t *ct = getContentType(); - if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; - if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { - m_sectionsValid = true; - return &m_sections; - } - // get the sections without implied sections Sections *sections = getExplicitSections(); if ( ! sections || sections==(void *)-1) return (Sections *)sections; @@ -5944,8 +5943,8 @@ Sections *XmlDoc::getImpliedSections ( ) { // bail on error if ( ! bits ) return NULL; // get the content type - //uint8_t *ct = getContentType(); - //if ( ! ct ) return NULL; + uint8_t *ct = getContentType(); + if ( ! ct ) return NULL; if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } @@ -6006,14 +6005,6 @@ Sections *XmlDoc::getImpliedSections ( ) { // add in Section::m_sentFlags bits having to do with our voting tables Sections *XmlDoc::getSections ( ) { - // if json forget this it is only html - uint8_t *ct = getContentType(); - if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; - if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { - m_sectionsValid = true; - return &m_sections; - } - // get the sections without implied sections Sections *ss = getImpliedSections(); if ( ! ss || ss==(void *)-1) return (Sections *)ss; @@ -17865,6 +17856,7 @@ bool XmlDoc::logIt ( ) { // coll // sb.safePrintf("coll=%s ",coll); + sb.safePrintf("collnum=%li ",(long)m_collnum); // // print ip @@ -25077,6 +25069,9 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) { setStatus ( "hashing rss info" ); + uint8_t *ct = getContentType(); + if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; } + // . finally hash in the linkText terms from the LinkInfo // . the LinkInfo class has all the terms of hashed anchor text for us // . if we're using an old TitleRec linkTermList is just a ptr to @@ -25190,7 +25185,10 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) { false , // own data? 0 , // allocSize false , // pure xml? - m_version ) ) + m_version , + true , // set parents? + m_niceness , + *ct ) ) return false; // set the words class from the xml, returns false and sets // g_errno on error diff --git a/fctypes.cpp b/fctypes.cpp index 9aa88566..34ede819 100644 --- a/fctypes.cpp +++ b/fctypes.cpp @@ -2046,6 +2046,7 @@ char* getNextNum(char* input, char** numPtr) { return nextspace; } +#include "HttpMime.h" // CT_HTML // returns length of stripped content, but will set g_errno and return -1 // on error @@ -2066,7 +2067,7 @@ long stripHtml( char *content, long contentLen, long version, long strip ) { // . parse as utf8 since all we are doing is messing with // the tags...content manipulation comes later if ( ! tmpXml.set ( content , contentLen, - false, 0, false, version ) ) + false, 0, false, version , true , 0 , CT_HTML ) ) return -1; //if( strip == 4 ) diff --git a/main.cpp b/main.cpp index 346fe533..d12f71df 100644 --- a/main.cpp +++ b/main.cpp @@ -11464,7 +11464,10 @@ bool parseTest ( char *coll , long long docId , char *query ) { t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! xml.set ( content , contentLen , - false, 0, false, xd.m_version ) ) + false, 0, false, xd.m_version , + true , // setparents + 0 , // niceness + CT_HTML ) ) return log("build: speedtestxml: xml set: %s", mstrerror(g_errno)); // print time it took @@ -11480,7 +11483,8 @@ bool parseTest ( char *coll , long long docId , char *query ) { t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! xml.set ( content , contentLen , - false, 0, false, xd.m_version , false ) ) + false, 0, false, xd.m_version , false , + 0 , CT_HTML ) ) return log("build: xml(setparents=false): %s", mstrerror(g_errno)); // print time it took @@ -11842,7 +11846,10 @@ bool summaryTest1 ( char *rec , long listSize, char *coll , long long docId , // now parse into xhtml (takes 15ms on lenny) Xml xml; xml.set ( content, contentLen , - false/*ownData?*/, 0, false, xd.m_version ); + false/*ownData?*/, 0, false, xd.m_version , + true , // setparents + 0 , // niceness + CT_HTML ); xd.getSummary();