For JSON docs, only give them a single

XmlNode in the Xml.cpp class. Hopefully we
will not get the "malformed sections" error
anymore. I think that was a result of the
JSON having HTML tags in it and producing
improperly nested HTML structures, which the
Sections class did not like.
TODO: probably do this for CT_TEXT etc.
as well.
This commit is contained in:
Matt Wells 2014-01-25 08:17:38 -08:00
parent 4d0a09f1e4
commit bc78b21dc6
9 changed files with 89 additions and 43 deletions

View File

@ -651,7 +651,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION ,
true , // set parents
0 , // niceness
CT_XML )){ // content type
log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
}
Links links1;
@ -679,7 +682,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION,
true , // setparents
0 , // niceness
CT_XML )){
log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
}
Links links2;
@ -1170,7 +1176,10 @@ void Blaster::gotDoc4 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION,
true, // setparents
0, // niceness
CT_XML )){
log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
}
Links links;

View File

@ -3648,7 +3648,8 @@ bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
true , // pure xml?
TITLEREC_CURRENT_VERSION ,
false , // no need to now
niceness );
niceness ,
CT_XML );
}
// only Title.cpp uses this right now

View File

@ -712,7 +712,7 @@ bool processLoop ( void *state ) {
//Words *ww = xd->getWords();
if ( ! xml.set ( content , contentLen , false ,
0 , false , TITLEREC_CURRENT_VERSION ,
false , 0 ) ) { // niceness is 0
false , 0 , CT_HTML ) ) { // niceness is 0
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
return sendErrorReply ( st , g_errno );
}

View File

@ -4226,7 +4226,10 @@ bool Parms::setXmlFromFile(Xml *xml, char *filename, char *buf, long bufSize){
false , // ownData
0 , // allocSize
false , // pureXml?
0 );// version
0 , // version
true , // setParents
0 , // niceness
CT_XML );
}
#define MAX_CONF_SIZE 200000

27
Xml.cpp
View File

@ -196,6 +196,8 @@ void Xml::reset ( ) {
m_allocSize = 0;
}
#include "HttpMime.h" // CT_JSON
// "s" must be in utf8
bool Xml::set ( char *s ,
long slen ,
@ -204,7 +206,8 @@ bool Xml::set ( char *s ,
bool pureXml ,
long version ,
bool setParentsArg ,
long niceness ) {
long niceness ,
char contentType ) {
// just in case
reset();
@ -234,6 +237,28 @@ bool Xml::set ( char *s ,
return false;
}
// if json go no further. TODO: also do this for CT_TEXT etc.
if ( contentType == CT_JSON ) {
m_numNodes = 0;
// make the array
m_maxNumNodes = 1;
m_nodes =(XmlNode *)mmalloc(sizeof(XmlNode)*m_maxNumNodes,"x");
if ( ! m_nodes ) return false;
XmlNode *xd = &m_nodes[m_numNodes];
// hack the node
xd->m_node = s;
xd->m_nodeLen = slen;
xd->m_isSelfLink = 0;
// . nodeId for text nodes is 0
xd->m_nodeId = 0;
xd->m_hasBackTag = false;
xd->m_hash = 0;
xd->m_pairTagNum = -1;
m_numNodes++;
return true;
}
QUICKPOLL((niceness));
long i;

6
Xml.h
View File

@ -30,8 +30,10 @@ class Xml {
bool ownData , long allocSize, //=0,
bool pureXml, // =false );
long version ,
bool setParents = true,
long niceness = 0);
bool setParents , // = true,
long niceness , // = 0
char contentType );
void reset ( );

View File

@ -4727,7 +4727,8 @@ void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
false ,
TITLEREC_CURRENT_VERSION ,
false , // setParents?
m_niceness ))
m_niceness ,
CT_HTML ))
// return if g_errno got set
return;
@ -5206,6 +5207,10 @@ Xml *XmlDoc::getXml ( ) {
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
long u8len = size_utf8Content - 1;
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
// note it
setStatus ( "getting xml");
// set it
@ -5216,7 +5221,8 @@ Xml *XmlDoc::getXml ( ) {
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ) )
m_niceness ,
*ct ) )
// return NULL on error with g_errno set
return NULL;
// set just once
@ -5813,13 +5819,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
if ( m_explicitSectionsValid ) return &m_sections;
// if json forget this it is only html
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
m_sectionsValid = true;
return &m_sections;
}
//uint8_t *ct = getContentType();
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
// m_sectionsValid = true;
// return &m_sections;
//}
setStatus ( "getting explicit sections" );
// use the old title rec to make sure we parse consistently!
@ -5856,8 +5861,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
long long *d = getDocId();
if ( ! d || d == (long long *)-1 ) return (Sections *)d;
// get the content type
//uint8_t *ct = getContentType();
//if ( ! ct ) return NULL;
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
@ -5873,6 +5878,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
// this uses the sectionsReply to see which sections are "text", etc.
// rather than compute it expensively
if ( ! m_calledSections &&
// we get malformed sections error for some diffbot replies
//*ct != CT_JSON &&
! m_sections.set ( &m_words ,
&m_phrases ,
bits ,
@ -5918,14 +5925,6 @@ Sections *XmlDoc::getExplicitSections ( ) {
Sections *XmlDoc::getImpliedSections ( ) {
if ( m_impliedSectionsValid ) return &m_sections;
// if json forget this it is only html
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
m_sectionsValid = true;
return &m_sections;
}
// get the sections without implied sections
Sections *sections = getExplicitSections();
if ( ! sections || sections==(void *)-1) return (Sections *)sections;
@ -5944,8 +5943,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
// bail on error
if ( ! bits ) return NULL;
// get the content type
//uint8_t *ct = getContentType();
//if ( ! ct ) return NULL;
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
@ -6006,14 +6005,6 @@ Sections *XmlDoc::getImpliedSections ( ) {
// add in Section::m_sentFlags bits having to do with our voting tables
Sections *XmlDoc::getSections ( ) {
// if json forget this it is only html
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
m_sectionsValid = true;
return &m_sections;
}
// get the sections without implied sections
Sections *ss = getImpliedSections();
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
@ -17865,6 +17856,7 @@ bool XmlDoc::logIt ( ) {
// coll
//
sb.safePrintf("coll=%s ",coll);
sb.safePrintf("collnum=%li ",(long)m_collnum);
//
// print ip
@ -25077,6 +25069,9 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
setStatus ( "hashing rss info" );
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
@ -25190,7 +25185,10 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
false , // own data?
0 , // allocSize
false , // pure xml?
m_version ) )
m_version ,
true , // set parents?
m_niceness ,
*ct ) )
return false;
// set the words class from the xml, returns false and sets
// g_errno on error

View File

@ -2046,6 +2046,7 @@ char* getNextNum(char* input, char** numPtr) {
return nextspace;
}
#include "HttpMime.h" // CT_HTML
// returns length of stripped content, but will set g_errno and return -1
// on error
@ -2066,7 +2067,7 @@ long stripHtml( char *content, long contentLen, long version, long strip ) {
// . parse as utf8 since all we are doing is messing with
// the tags...content manipulation comes later
if ( ! tmpXml.set ( content , contentLen,
false, 0, false, version ) )
false, 0, false, version , true , 0 , CT_HTML ) )
return -1;
//if( strip == 4 )

View File

@ -11464,7 +11464,10 @@ bool parseTest ( char *coll , long long docId , char *query ) {
t = gettimeofdayInMilliseconds();
for ( long i = 0 ; i < 100 ; i++ )
if ( ! xml.set ( content , contentLen ,
false, 0, false, xd.m_version ) )
false, 0, false, xd.m_version ,
true , // setparents
0 , // niceness
CT_HTML ) )
return log("build: speedtestxml: xml set: %s",
mstrerror(g_errno));
// print time it took
@ -11480,7 +11483,8 @@ bool parseTest ( char *coll , long long docId , char *query ) {
t = gettimeofdayInMilliseconds();
for ( long i = 0 ; i < 100 ; i++ )
if ( ! xml.set ( content , contentLen ,
false, 0, false, xd.m_version , false ) )
false, 0, false, xd.m_version , false ,
0 , CT_HTML ) )
return log("build: xml(setparents=false): %s",
mstrerror(g_errno));
// print time it took
@ -11842,7 +11846,10 @@ bool summaryTest1 ( char *rec , long listSize, char *coll , long long docId ,
// now parse into xhtml (takes 15ms on lenny)
Xml xml;
xml.set ( content, contentLen ,
false/*ownData?*/, 0, false, xd.m_version );
false/*ownData?*/, 0, false, xd.m_version ,
true , // setparents
0 , // niceness
CT_HTML );
xd.getSummary();