For JSON docs, give them only a single

XmlNode in the Xml class (Xml.cpp). Hopefully
we will not get the "malformed sections" error
anymore. I think that was a result of the
JSON having HTML tags in it, which produced
improperly nested HTML structures that the
Sections class did not like.
TODO: probably do this for CT_TEXT etc.
as well.
This commit is contained in:
Matt Wells 2014-01-25 08:17:38 -08:00
parent 4d0a09f1e4
commit bc78b21dc6
9 changed files with 89 additions and 43 deletions

View File

@ -651,7 +651,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
false, false,
0, 0,
false, false,
TITLEREC_CURRENT_VERSION)){ TITLEREC_CURRENT_VERSION ,
true , // set parents
0 , // niceness
CT_XML )){ // content type
log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2"); log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
} }
Links links1; Links links1;
@ -679,7 +682,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
false, false,
0, 0,
false, false,
TITLEREC_CURRENT_VERSION)){ TITLEREC_CURRENT_VERSION,
true , // setparents
0 , // niceness
CT_XML )){
log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2"); log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
} }
Links links2; Links links2;
@ -1170,7 +1176,10 @@ void Blaster::gotDoc4 ( void *state, TcpSocket *s){
false, false,
0, 0,
false, false,
TITLEREC_CURRENT_VERSION)){ TITLEREC_CURRENT_VERSION,
true, // setparents
0, // niceness
CT_XML )){
log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4"); log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
} }
Links links; Links links;

View File

@ -3648,7 +3648,8 @@ bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
true , // pure xml? true , // pure xml?
TITLEREC_CURRENT_VERSION , TITLEREC_CURRENT_VERSION ,
false , // no need to now false , // no need to now
niceness ); niceness ,
CT_XML );
} }
// only Title.cpp uses this right now // only Title.cpp uses this right now

View File

@ -712,7 +712,7 @@ bool processLoop ( void *state ) {
//Words *ww = xd->getWords(); //Words *ww = xd->getWords();
if ( ! xml.set ( content , contentLen , false , if ( ! xml.set ( content , contentLen , false ,
0 , false , TITLEREC_CURRENT_VERSION , 0 , false , TITLEREC_CURRENT_VERSION ,
false , 0 ) ) { // niceness is 0 false , 0 , CT_HTML ) ) { // niceness is 0
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
return sendErrorReply ( st , g_errno ); return sendErrorReply ( st , g_errno );
} }

View File

@ -4226,7 +4226,10 @@ bool Parms::setXmlFromFile(Xml *xml, char *filename, char *buf, long bufSize){
false , // ownData false , // ownData
0 , // allocSize 0 , // allocSize
false , // pureXml? false , // pureXml?
0 );// version 0 , // version
true , // setParents
0 , // niceness
CT_XML );
} }
#define MAX_CONF_SIZE 200000 #define MAX_CONF_SIZE 200000

27
Xml.cpp
View File

@ -196,6 +196,8 @@ void Xml::reset ( ) {
m_allocSize = 0; m_allocSize = 0;
} }
#include "HttpMime.h" // CT_JSON
// "s" must be in utf8 // "s" must be in utf8
bool Xml::set ( char *s , bool Xml::set ( char *s ,
long slen , long slen ,
@ -204,7 +206,8 @@ bool Xml::set ( char *s ,
bool pureXml , bool pureXml ,
long version , long version ,
bool setParentsArg , bool setParentsArg ,
long niceness ) { long niceness ,
char contentType ) {
// just in case // just in case
reset(); reset();
@ -234,6 +237,28 @@ bool Xml::set ( char *s ,
return false; return false;
} }
// if json go no further. TODO: also do this for CT_TEXT etc.
if ( contentType == CT_JSON ) {
m_numNodes = 0;
// make the array
m_maxNumNodes = 1;
m_nodes =(XmlNode *)mmalloc(sizeof(XmlNode)*m_maxNumNodes,"x");
if ( ! m_nodes ) return false;
XmlNode *xd = &m_nodes[m_numNodes];
// hack the node
xd->m_node = s;
xd->m_nodeLen = slen;
xd->m_isSelfLink = 0;
// . nodeId for text nodes is 0
xd->m_nodeId = 0;
xd->m_hasBackTag = false;
xd->m_hash = 0;
xd->m_pairTagNum = -1;
m_numNodes++;
return true;
}
QUICKPOLL((niceness)); QUICKPOLL((niceness));
long i; long i;

6
Xml.h
View File

@ -30,8 +30,10 @@ class Xml {
bool ownData , long allocSize, //=0, bool ownData , long allocSize, //=0,
bool pureXml, // =false ); bool pureXml, // =false );
long version , long version ,
bool setParents = true, bool setParents , // = true,
long niceness = 0); long niceness , // = 0
char contentType );
void reset ( ); void reset ( );

View File

@ -4727,7 +4727,8 @@ void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
false , false ,
TITLEREC_CURRENT_VERSION , TITLEREC_CURRENT_VERSION ,
false , // setParents? false , // setParents?
m_niceness )) m_niceness ,
CT_HTML ))
// return if g_errno got set // return if g_errno got set
return; return;
@ -5206,6 +5207,10 @@ Xml *XmlDoc::getXml ( ) {
char **u8 = getUtf8Content(); char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8; if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
long u8len = size_utf8Content - 1; long u8len = size_utf8Content - 1;
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
// note it // note it
setStatus ( "getting xml"); setStatus ( "getting xml");
// set it // set it
@ -5216,7 +5221,8 @@ Xml *XmlDoc::getXml ( ) {
false , // pure xml? false , // pure xml?
m_version , m_version ,
false , // setParentsArg? false , // setParentsArg?
m_niceness ) ) m_niceness ,
*ct ) )
// return NULL on error with g_errno set // return NULL on error with g_errno set
return NULL; return NULL;
// set just once // set just once
@ -5813,13 +5819,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
if ( m_explicitSectionsValid ) return &m_sections; if ( m_explicitSectionsValid ) return &m_sections;
// if json forget this it is only html // if json forget this it is only html
uint8_t *ct = getContentType(); //uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; //if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { //if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
m_sectionsValid = true; // m_sectionsValid = true;
return &m_sections; // return &m_sections;
} //}
setStatus ( "getting explicit sections" ); setStatus ( "getting explicit sections" );
// use the old title rec to make sure we parse consistently! // use the old title rec to make sure we parse consistently!
@ -5856,8 +5861,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
long long *d = getDocId(); long long *d = getDocId();
if ( ! d || d == (long long *)-1 ) return (Sections *)d; if ( ! d || d == (long long *)-1 ) return (Sections *)d;
// get the content type // get the content type
//uint8_t *ct = getContentType(); uint8_t *ct = getContentType();
//if ( ! ct ) return NULL; if ( ! ct ) return NULL;
CollectionRec *cr = getCollRec(); CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL; if ( ! cr ) return NULL;
@ -5873,6 +5878,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
// this uses the sectionsReply to see which sections are "text", etc. // this uses the sectionsReply to see which sections are "text", etc.
// rather than compute it expensively // rather than compute it expensively
if ( ! m_calledSections && if ( ! m_calledSections &&
// we get malformed sections error for some diffbot replies
//*ct != CT_JSON &&
! m_sections.set ( &m_words , ! m_sections.set ( &m_words ,
&m_phrases , &m_phrases ,
bits , bits ,
@ -5918,14 +5925,6 @@ Sections *XmlDoc::getExplicitSections ( ) {
Sections *XmlDoc::getImpliedSections ( ) { Sections *XmlDoc::getImpliedSections ( ) {
if ( m_impliedSectionsValid ) return &m_sections; if ( m_impliedSectionsValid ) return &m_sections;
// if json forget this it is only html
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
m_sectionsValid = true;
return &m_sections;
}
// get the sections without implied sections // get the sections without implied sections
Sections *sections = getExplicitSections(); Sections *sections = getExplicitSections();
if ( ! sections || sections==(void *)-1) return (Sections *)sections; if ( ! sections || sections==(void *)-1) return (Sections *)sections;
@ -5944,8 +5943,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
// bail on error // bail on error
if ( ! bits ) return NULL; if ( ! bits ) return NULL;
// get the content type // get the content type
//uint8_t *ct = getContentType(); uint8_t *ct = getContentType();
//if ( ! ct ) return NULL; if ( ! ct ) return NULL;
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
@ -6006,14 +6005,6 @@ Sections *XmlDoc::getImpliedSections ( ) {
// add in Section::m_sentFlags bits having to do with our voting tables // add in Section::m_sentFlags bits having to do with our voting tables
Sections *XmlDoc::getSections ( ) { Sections *XmlDoc::getSections ( ) {
// if json forget this it is only html
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
m_sectionsValid = true;
return &m_sections;
}
// get the sections without implied sections // get the sections without implied sections
Sections *ss = getImpliedSections(); Sections *ss = getImpliedSections();
if ( ! ss || ss==(void *)-1) return (Sections *)ss; if ( ! ss || ss==(void *)-1) return (Sections *)ss;
@ -17865,6 +17856,7 @@ bool XmlDoc::logIt ( ) {
// coll // coll
// //
sb.safePrintf("coll=%s ",coll); sb.safePrintf("coll=%s ",coll);
sb.safePrintf("collnum=%li ",(long)m_collnum);
// //
// print ip // print ip
@ -25077,6 +25069,9 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
setStatus ( "hashing rss info" ); setStatus ( "hashing rss info" );
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
// . finally hash in the linkText terms from the LinkInfo // . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us // . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to // . if we're using an old TitleRec linkTermList is just a ptr to
@ -25190,7 +25185,10 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
false , // own data? false , // own data?
0 , // allocSize 0 , // allocSize
false , // pure xml? false , // pure xml?
m_version ) ) m_version ,
true , // set parents?
m_niceness ,
*ct ) )
return false; return false;
// set the words class from the xml, returns false and sets // set the words class from the xml, returns false and sets
// g_errno on error // g_errno on error

View File

@ -2046,6 +2046,7 @@ char* getNextNum(char* input, char** numPtr) {
return nextspace; return nextspace;
} }
#include "HttpMime.h" // CT_HTML
// returns length of stripped content, but will set g_errno and return -1 // returns length of stripped content, but will set g_errno and return -1
// on error // on error
@ -2066,7 +2067,7 @@ long stripHtml( char *content, long contentLen, long version, long strip ) {
// . parse as utf8 since all we are doing is messing with // . parse as utf8 since all we are doing is messing with
// the tags...content manipulation comes later // the tags...content manipulation comes later
if ( ! tmpXml.set ( content , contentLen, if ( ! tmpXml.set ( content , contentLen,
false, 0, false, version ) ) false, 0, false, version , true , 0 , CT_HTML ) )
return -1; return -1;
//if( strip == 4 ) //if( strip == 4 )

View File

@ -11464,7 +11464,10 @@ bool parseTest ( char *coll , long long docId , char *query ) {
t = gettimeofdayInMilliseconds(); t = gettimeofdayInMilliseconds();
for ( long i = 0 ; i < 100 ; i++ ) for ( long i = 0 ; i < 100 ; i++ )
if ( ! xml.set ( content , contentLen , if ( ! xml.set ( content , contentLen ,
false, 0, false, xd.m_version ) ) false, 0, false, xd.m_version ,
true , // setparents
0 , // niceness
CT_HTML ) )
return log("build: speedtestxml: xml set: %s", return log("build: speedtestxml: xml set: %s",
mstrerror(g_errno)); mstrerror(g_errno));
// print time it took // print time it took
@ -11480,7 +11483,8 @@ bool parseTest ( char *coll , long long docId , char *query ) {
t = gettimeofdayInMilliseconds(); t = gettimeofdayInMilliseconds();
for ( long i = 0 ; i < 100 ; i++ ) for ( long i = 0 ; i < 100 ; i++ )
if ( ! xml.set ( content , contentLen , if ( ! xml.set ( content , contentLen ,
false, 0, false, xd.m_version , false ) ) false, 0, false, xd.m_version , false ,
0 , CT_HTML ) )
return log("build: xml(setparents=false): %s", return log("build: xml(setparents=false): %s",
mstrerror(g_errno)); mstrerror(g_errno));
// print time it took // print time it took
@ -11842,7 +11846,10 @@ bool summaryTest1 ( char *rec , long listSize, char *coll , long long docId ,
// now parse into xhtml (takes 15ms on lenny) // now parse into xhtml (takes 15ms on lenny)
Xml xml; Xml xml;
xml.set ( content, contentLen , xml.set ( content, contentLen ,
false/*ownData?*/, 0, false, xd.m_version ); false/*ownData?*/, 0, false, xd.m_version ,
true , // setparents
0 , // niceness
CT_HTML );
xd.getSummary(); xd.getSummary();