For JSON docs, only give them a single

XmlNode in the Xml.cpp class. Hopefully we will not get the "malformed sections" error anymore. I think that error was a result of the JSON having HTML tags in it, which produced improperly nested HTML structures that the Sections class did not like. TODO: probably do this for CT_TEXT etc. as well.
2024-10-04 04:07:13 +03:00 · 2014-01-25 08:17:38 -08:00 · 2014-01-25 08:17:38 -08:00 · bc78b21dc6
commit bc78b21dc6
parent 4d0a09f1e4
9 changed files with 89 additions and 43 deletions
--- a/Blaster.cpp
+++ b/Blaster.cpp
@ -651,7 +651,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
 		     false,
 		     0,
 		     false,
-		     TITLEREC_CURRENT_VERSION)){
+		      TITLEREC_CURRENT_VERSION ,
+		      true , // set parents
+		      0 , // niceness 
+		      CT_XML )){ // content type
 		log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
 	}
 	Links links1;
@ -679,7 +682,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
 		     false,
 		     0,
 		     false,
-		     TITLEREC_CURRENT_VERSION)){
+		      TITLEREC_CURRENT_VERSION,
+		      true , // setparents
+		      0 , // niceness
+		      CT_XML )){
 		log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
 	}
 	Links links2;
@ -1170,7 +1176,10 @@ void Blaster::gotDoc4 ( void *state, TcpSocket *s){
 		     false,
 		     0,
 		     false,
-		     TITLEREC_CURRENT_VERSION)){
+		     TITLEREC_CURRENT_VERSION,
+		     true, // setparents
+		     0, // niceness
+		     CT_XML )){
 		log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
 	}
 	Links links;
--- a/Linkdb.cpp
+++ b/Linkdb.cpp
@ -3648,7 +3648,8 @@ bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
 			  true                     , // pure xml?
 			  TITLEREC_CURRENT_VERSION ,
 			  false                    , // no need to now
-			  niceness                 );
+			  niceness                 ,
+			  CT_XML );
 }

 // only Title.cpp uses this right now
--- a/PageGet.cpp
+++ b/PageGet.cpp
@ -712,7 +712,7 @@ bool processLoop ( void *state ) {
 		//Words *ww = xd->getWords();
 		if ( ! xml.set ( content , contentLen , false ,
 				 0 , false , TITLEREC_CURRENT_VERSION ,
-				 false , 0 ) ) { // niceness is 0
+				 false , 0 , CT_HTML ) ) { // niceness is 0
 			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
 			return sendErrorReply ( st , g_errno );
 		}			
--- a/Parms.cpp
+++ b/Parms.cpp
@ -4226,7 +4226,10 @@ bool Parms::setXmlFromFile(Xml *xml, char *filename, char *buf, long bufSize){
 			  false   , // ownData
 			  0       , // allocSize
 			  false   , // pureXml?
-			  0       );// version
+			  0       , // version
+			  true    , // setParents
+			  0       , // niceness
+			  CT_XML  );
 }

 #define MAX_CONF_SIZE 200000
--- a/Xml.cpp
+++ b/Xml.cpp
@ -196,6 +196,8 @@ void Xml::reset ( ) {
 	m_allocSize   = 0;
 }

+#include "HttpMime.h" // CT_JSON
+
 // "s" must be in utf8
 bool Xml::set ( char  *s             , 
 	        long   slen          , 
@ -204,7 +206,8 @@ bool Xml::set ( char  *s             ,
 	        bool   pureXml       ,
 	        long   version       ,
 	        bool   setParentsArg ,
-	        long   niceness      ) {
+	        long   niceness      ,
+		char   contentType ) {

 	// just in case
 	reset();
@ -234,6 +237,28 @@ bool Xml::set ( char  *s             ,
 		return false;
 	}

+	// if json go no further. TODO: also do this for CT_TEXT etc.
+	if ( contentType == CT_JSON ) {
+		m_numNodes = 0;
+		// make the array
+		m_maxNumNodes = 1;
+		m_nodes =(XmlNode *)mmalloc(sizeof(XmlNode)*m_maxNumNodes,"x");
+		if ( ! m_nodes ) return false;
+		XmlNode *xd = &m_nodes[m_numNodes];
+		// hack the node
+		xd->m_node       = s;
+		xd->m_nodeLen    = slen;
+		xd->m_isSelfLink = 0;
+		// . nodeId for text nodes is 0
+		xd->m_nodeId     = 0;
+		xd->m_hasBackTag = false;
+		xd->m_hash       = 0;
+		xd->m_pairTagNum = -1;
+		m_numNodes++;
+		return true;
+	}
+
+
 	QUICKPOLL((niceness));
 	long i;

--- a/Xml.h
+++ b/Xml.h
@ -30,8 +30,10 @@ class Xml {
 		 bool ownData , long allocSize, //=0, 
 		 bool pureXml, // =false );
 		 long version ,
-		 bool setParents = true,
-		 long niceness = 0);
+		 bool setParents , // = true,
+		 long niceness , // = 0
+		 char contentType );
+


 	void  reset ( );
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -4727,7 +4727,8 @@ void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
 			 false                    ,
 			 TITLEREC_CURRENT_VERSION ,
 			 false                    , // setParents?
-			 m_niceness               ))
+			 m_niceness               ,
+			 CT_HTML                  ))
 		// return if g_errno got set
 		return;

@ -5206,6 +5207,10 @@ Xml *XmlDoc::getXml ( ) {
 	char **u8 = getUtf8Content();
 	if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
 	long u8len = size_utf8Content - 1;
+	
+	uint8_t *ct = getContentType();
+	if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
+
 	// note it
 	setStatus ( "getting xml");
 	// set it
@ -5216,7 +5221,8 @@ Xml *XmlDoc::getXml ( ) {
 			   false      ,  // pure xml?
 			   m_version  ,
 			   false      ,  // setParentsArg? 
-			   m_niceness ) )
+			   m_niceness ,
+			   *ct ) )
 		// return NULL on error with g_errno set
 		return NULL;
 	// set just once
@ -5813,13 +5819,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
 	if ( m_explicitSectionsValid ) return &m_sections;

 	// if json forget this it is only html
-	uint8_t *ct = getContentType();
-	if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
-	if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
-		m_sectionsValid = true;
-		return &m_sections;
-	}
-
+	//uint8_t *ct = getContentType();
+	//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
+	//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
+	//	m_sectionsValid = true;
+	//	return &m_sections;
+	//}

 	setStatus ( "getting explicit sections" );
 	// use the old title rec to make sure we parse consistently!
@ -5856,8 +5861,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
 	long long *d = getDocId();
 	if ( ! d || d == (long long *)-1 ) return (Sections *)d;
 	// get the content type
-	//uint8_t *ct = getContentType();
-	//if ( ! ct ) return NULL;
+	uint8_t *ct = getContentType();
+	if ( ! ct ) return NULL;

 	CollectionRec *cr = getCollRec();
 	if ( ! cr ) return NULL;
@ -5873,6 +5878,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
 	// this uses the sectionsReply to see which sections are "text", etc.
 	// rather than compute it expensively
 	if ( ! m_calledSections &&
+	     // we get malformed sections error for some diffbot replies
+	     //*ct != CT_JSON &&
 	     ! m_sections.set ( &m_words      ,
 				&m_phrases    ,
 				bits          ,
@ -5918,14 +5925,6 @@ Sections *XmlDoc::getExplicitSections ( ) {
 Sections *XmlDoc::getImpliedSections ( ) {
 	if ( m_impliedSectionsValid ) return &m_sections;

-	// if json forget this it is only html
-	uint8_t *ct = getContentType();
-	if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
-	if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
-		m_sectionsValid = true;
-		return &m_sections;
-	}
-
 	// get the sections without implied sections
 	Sections *sections = getExplicitSections();
 	if ( ! sections || sections==(void *)-1) return (Sections *)sections;
@ -5944,8 +5943,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
 	// bail on error
 	if ( ! bits ) return NULL;
 	// get the content type
-	//uint8_t *ct = getContentType();
-	//if ( ! ct ) return NULL;
+	uint8_t *ct = getContentType();
+	if ( ! ct ) return NULL;

 	if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }

@ -6006,14 +6005,6 @@ Sections *XmlDoc::getImpliedSections ( ) {
 // add in Section::m_sentFlags bits having to do with our voting tables
 Sections *XmlDoc::getSections ( ) {

-	// if json forget this it is only html
-	uint8_t *ct = getContentType();
-	if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
-	if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
-		m_sectionsValid = true;
-		return &m_sections;
-	}
-
 	// get the sections without implied sections
 	Sections *ss = getImpliedSections();
 	if ( ! ss || ss==(void *)-1) return (Sections *)ss;
@ -17865,6 +17856,7 @@ bool XmlDoc::logIt ( ) {
 	// coll
 	//
 	sb.safePrintf("coll=%s ",coll);
+	sb.safePrintf("collnum=%li ",(long)m_collnum);

 	//
 	// print ip
@ -25077,6 +25069,9 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {

 	setStatus ( "hashing rss info" );

+	uint8_t *ct = getContentType();
+	if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
+
 	// . finally hash in the linkText terms from the LinkInfo
 	// . the LinkInfo class has all the terms of hashed anchor text for us
 	// . if we're using an old TitleRec linkTermList is just a ptr to
@ -25190,7 +25185,10 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
 				  false            , // own data?
 				  0                , // allocSize
 				  false            , // pure xml?
-				  m_version ) )
+				  m_version ,
+				  true , // set parents?
+				  m_niceness ,
+				  *ct ) )
 			return false;
 		// set the words class from the xml, returns false and sets
 		// g_errno on error
--- a/fctypes.cpp
+++ b/fctypes.cpp
@ -2046,6 +2046,7 @@ char* getNextNum(char* input, char** numPtr) {
 	return nextspace;
 }

+#include "HttpMime.h" // CT_HTML

 // returns length of stripped content, but will set g_errno and return -1
 // on error
@ -2066,7 +2067,7 @@ long stripHtml( char *content, long contentLen, long version, long strip ) {
 	// . parse as utf8 since all we are doing is messing with 
 	//   the tags...content manipulation comes later
 	if ( ! tmpXml.set ( content , contentLen,
-			    false, 0, false, version ) )
+			    false, 0, false, version , true , 0 , CT_HTML ) )
 		return -1;

 	//if( strip == 4 )
--- a/main.cpp
+++ b/main.cpp
@ -11464,7 +11464,10 @@ bool parseTest ( char *coll , long long docId , char *query ) {
 	t = gettimeofdayInMilliseconds();
 	for ( long i = 0 ; i < 100 ; i++ ) 
 		if ( ! xml.set ( content , contentLen , 
-				 false, 0, false, xd.m_version ) )
+				 false, 0, false, xd.m_version ,
+				 true , // setparents
+				 0 , // niceness 
+				 CT_HTML ) )
 			return log("build: speedtestxml: xml set: %s",
 				   mstrerror(g_errno));
 	// print time it took
@ -11480,7 +11483,8 @@ bool parseTest ( char *coll , long long docId , char *query ) {
 	t = gettimeofdayInMilliseconds();
 	for ( long i = 0 ; i < 100 ; i++ ) 
 		if ( ! xml.set ( content , contentLen , 
-				 false, 0, false, xd.m_version , false ) )
+				 false, 0, false, xd.m_version , false ,
+				 0 , CT_HTML ) )
 			return log("build: xml(setparents=false): %s",
 				   mstrerror(g_errno));
 	// print time it took
@ -11842,7 +11846,10 @@ bool summaryTest1   ( char *rec , long listSize, char *coll , long long docId ,
 		// now parse into xhtml (takes 15ms on lenny)
 		Xml xml;
 		xml.set ( content, contentLen , 
-			  false/*ownData?*/, 0, false, xd.m_version );
+			  false/*ownData?*/, 0, false, xd.m_version ,
+			  true , // setparents
+			  0 , // niceness
+			  CT_HTML );

 		xd.getSummary();