mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
For JSON docs, only give them a single
XmlNode in the Xml class (Xml.cpp). Hopefully we will not get the "malformed sections" error anymore. I think that error was a result of the JSON having HTML tags in it, which produced improperly nested HTML structures that the Sections class did not like. TODO: probably do this for CT_TEXT etc. as well.
This commit is contained in:
parent
4d0a09f1e4
commit
bc78b21dc6
15
Blaster.cpp
15
Blaster.cpp
@ -651,7 +651,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
|
|||||||
false,
|
false,
|
||||||
0,
|
0,
|
||||||
false,
|
false,
|
||||||
TITLEREC_CURRENT_VERSION)){
|
TITLEREC_CURRENT_VERSION ,
|
||||||
|
true , // set parents
|
||||||
|
0 , // niceness
|
||||||
|
CT_XML )){ // content type
|
||||||
log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
|
log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
|
||||||
}
|
}
|
||||||
Links links1;
|
Links links1;
|
||||||
@ -679,7 +682,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
|
|||||||
false,
|
false,
|
||||||
0,
|
0,
|
||||||
false,
|
false,
|
||||||
TITLEREC_CURRENT_VERSION)){
|
TITLEREC_CURRENT_VERSION,
|
||||||
|
true , // setparents
|
||||||
|
0 , // niceness
|
||||||
|
CT_XML )){
|
||||||
log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
|
log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
|
||||||
}
|
}
|
||||||
Links links2;
|
Links links2;
|
||||||
@ -1170,7 +1176,10 @@ void Blaster::gotDoc4 ( void *state, TcpSocket *s){
|
|||||||
false,
|
false,
|
||||||
0,
|
0,
|
||||||
false,
|
false,
|
||||||
TITLEREC_CURRENT_VERSION)){
|
TITLEREC_CURRENT_VERSION,
|
||||||
|
true, // setparents
|
||||||
|
0, // niceness
|
||||||
|
CT_XML )){
|
||||||
log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
|
log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
|
||||||
}
|
}
|
||||||
Links links;
|
Links links;
|
||||||
|
@ -3648,7 +3648,8 @@ bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
|
|||||||
true , // pure xml?
|
true , // pure xml?
|
||||||
TITLEREC_CURRENT_VERSION ,
|
TITLEREC_CURRENT_VERSION ,
|
||||||
false , // no need to now
|
false , // no need to now
|
||||||
niceness );
|
niceness ,
|
||||||
|
CT_XML );
|
||||||
}
|
}
|
||||||
|
|
||||||
// only Title.cpp uses this right now
|
// only Title.cpp uses this right now
|
||||||
|
@ -712,7 +712,7 @@ bool processLoop ( void *state ) {
|
|||||||
//Words *ww = xd->getWords();
|
//Words *ww = xd->getWords();
|
||||||
if ( ! xml.set ( content , contentLen , false ,
|
if ( ! xml.set ( content , contentLen , false ,
|
||||||
0 , false , TITLEREC_CURRENT_VERSION ,
|
0 , false , TITLEREC_CURRENT_VERSION ,
|
||||||
false , 0 ) ) { // niceness is 0
|
false , 0 , CT_HTML ) ) { // niceness is 0
|
||||||
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
|
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
|
||||||
return sendErrorReply ( st , g_errno );
|
return sendErrorReply ( st , g_errno );
|
||||||
}
|
}
|
||||||
|
@ -4226,7 +4226,10 @@ bool Parms::setXmlFromFile(Xml *xml, char *filename, char *buf, long bufSize){
|
|||||||
false , // ownData
|
false , // ownData
|
||||||
0 , // allocSize
|
0 , // allocSize
|
||||||
false , // pureXml?
|
false , // pureXml?
|
||||||
0 );// version
|
0 , // version
|
||||||
|
true , // setParents
|
||||||
|
0 , // niceness
|
||||||
|
CT_XML );
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MAX_CONF_SIZE 200000
|
#define MAX_CONF_SIZE 200000
|
||||||
|
27
Xml.cpp
27
Xml.cpp
@ -196,6 +196,8 @@ void Xml::reset ( ) {
|
|||||||
m_allocSize = 0;
|
m_allocSize = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "HttpMime.h" // CT_JSON
|
||||||
|
|
||||||
// "s" must be in utf8
|
// "s" must be in utf8
|
||||||
bool Xml::set ( char *s ,
|
bool Xml::set ( char *s ,
|
||||||
long slen ,
|
long slen ,
|
||||||
@ -204,7 +206,8 @@ bool Xml::set ( char *s ,
|
|||||||
bool pureXml ,
|
bool pureXml ,
|
||||||
long version ,
|
long version ,
|
||||||
bool setParentsArg ,
|
bool setParentsArg ,
|
||||||
long niceness ) {
|
long niceness ,
|
||||||
|
char contentType ) {
|
||||||
|
|
||||||
// just in case
|
// just in case
|
||||||
reset();
|
reset();
|
||||||
@ -234,6 +237,28 @@ bool Xml::set ( char *s ,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if json go no further. TODO: also do this for CT_TEXT etc.
|
||||||
|
if ( contentType == CT_JSON ) {
|
||||||
|
m_numNodes = 0;
|
||||||
|
// make the array
|
||||||
|
m_maxNumNodes = 1;
|
||||||
|
m_nodes =(XmlNode *)mmalloc(sizeof(XmlNode)*m_maxNumNodes,"x");
|
||||||
|
if ( ! m_nodes ) return false;
|
||||||
|
XmlNode *xd = &m_nodes[m_numNodes];
|
||||||
|
// hack the node
|
||||||
|
xd->m_node = s;
|
||||||
|
xd->m_nodeLen = slen;
|
||||||
|
xd->m_isSelfLink = 0;
|
||||||
|
// . nodeId for text nodes is 0
|
||||||
|
xd->m_nodeId = 0;
|
||||||
|
xd->m_hasBackTag = false;
|
||||||
|
xd->m_hash = 0;
|
||||||
|
xd->m_pairTagNum = -1;
|
||||||
|
m_numNodes++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
QUICKPOLL((niceness));
|
QUICKPOLL((niceness));
|
||||||
long i;
|
long i;
|
||||||
|
|
||||||
|
6
Xml.h
6
Xml.h
@ -30,8 +30,10 @@ class Xml {
|
|||||||
bool ownData , long allocSize, //=0,
|
bool ownData , long allocSize, //=0,
|
||||||
bool pureXml, // =false );
|
bool pureXml, // =false );
|
||||||
long version ,
|
long version ,
|
||||||
bool setParents = true,
|
bool setParents , // = true,
|
||||||
long niceness = 0);
|
long niceness , // = 0
|
||||||
|
char contentType );
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void reset ( );
|
void reset ( );
|
||||||
|
58
XmlDoc.cpp
58
XmlDoc.cpp
@ -4727,7 +4727,8 @@ void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
|
|||||||
false ,
|
false ,
|
||||||
TITLEREC_CURRENT_VERSION ,
|
TITLEREC_CURRENT_VERSION ,
|
||||||
false , // setParents?
|
false , // setParents?
|
||||||
m_niceness ))
|
m_niceness ,
|
||||||
|
CT_HTML ))
|
||||||
// return if g_errno got set
|
// return if g_errno got set
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -5206,6 +5207,10 @@ Xml *XmlDoc::getXml ( ) {
|
|||||||
char **u8 = getUtf8Content();
|
char **u8 = getUtf8Content();
|
||||||
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
|
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
|
||||||
long u8len = size_utf8Content - 1;
|
long u8len = size_utf8Content - 1;
|
||||||
|
|
||||||
|
uint8_t *ct = getContentType();
|
||||||
|
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
|
||||||
|
|
||||||
// note it
|
// note it
|
||||||
setStatus ( "getting xml");
|
setStatus ( "getting xml");
|
||||||
// set it
|
// set it
|
||||||
@ -5216,7 +5221,8 @@ Xml *XmlDoc::getXml ( ) {
|
|||||||
false , // pure xml?
|
false , // pure xml?
|
||||||
m_version ,
|
m_version ,
|
||||||
false , // setParentsArg?
|
false , // setParentsArg?
|
||||||
m_niceness ) )
|
m_niceness ,
|
||||||
|
*ct ) )
|
||||||
// return NULL on error with g_errno set
|
// return NULL on error with g_errno set
|
||||||
return NULL;
|
return NULL;
|
||||||
// set just once
|
// set just once
|
||||||
@ -5813,13 +5819,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
|||||||
if ( m_explicitSectionsValid ) return &m_sections;
|
if ( m_explicitSectionsValid ) return &m_sections;
|
||||||
|
|
||||||
// if json forget this it is only html
|
// if json forget this it is only html
|
||||||
uint8_t *ct = getContentType();
|
//uint8_t *ct = getContentType();
|
||||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||||
m_sectionsValid = true;
|
// m_sectionsValid = true;
|
||||||
return &m_sections;
|
// return &m_sections;
|
||||||
}
|
//}
|
||||||
|
|
||||||
|
|
||||||
setStatus ( "getting explicit sections" );
|
setStatus ( "getting explicit sections" );
|
||||||
// use the old title rec to make sure we parse consistently!
|
// use the old title rec to make sure we parse consistently!
|
||||||
@ -5856,8 +5861,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
|||||||
long long *d = getDocId();
|
long long *d = getDocId();
|
||||||
if ( ! d || d == (long long *)-1 ) return (Sections *)d;
|
if ( ! d || d == (long long *)-1 ) return (Sections *)d;
|
||||||
// get the content type
|
// get the content type
|
||||||
//uint8_t *ct = getContentType();
|
uint8_t *ct = getContentType();
|
||||||
//if ( ! ct ) return NULL;
|
if ( ! ct ) return NULL;
|
||||||
|
|
||||||
CollectionRec *cr = getCollRec();
|
CollectionRec *cr = getCollRec();
|
||||||
if ( ! cr ) return NULL;
|
if ( ! cr ) return NULL;
|
||||||
@ -5873,6 +5878,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
|||||||
// this uses the sectionsReply to see which sections are "text", etc.
|
// this uses the sectionsReply to see which sections are "text", etc.
|
||||||
// rather than compute it expensively
|
// rather than compute it expensively
|
||||||
if ( ! m_calledSections &&
|
if ( ! m_calledSections &&
|
||||||
|
// we get malformed sections error for some diffbot replies
|
||||||
|
//*ct != CT_JSON &&
|
||||||
! m_sections.set ( &m_words ,
|
! m_sections.set ( &m_words ,
|
||||||
&m_phrases ,
|
&m_phrases ,
|
||||||
bits ,
|
bits ,
|
||||||
@ -5918,14 +5925,6 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
|||||||
Sections *XmlDoc::getImpliedSections ( ) {
|
Sections *XmlDoc::getImpliedSections ( ) {
|
||||||
if ( m_impliedSectionsValid ) return &m_sections;
|
if ( m_impliedSectionsValid ) return &m_sections;
|
||||||
|
|
||||||
// if json forget this it is only html
|
|
||||||
uint8_t *ct = getContentType();
|
|
||||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
|
||||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
|
||||||
m_sectionsValid = true;
|
|
||||||
return &m_sections;
|
|
||||||
}
|
|
||||||
|
|
||||||
// get the sections without implied sections
|
// get the sections without implied sections
|
||||||
Sections *sections = getExplicitSections();
|
Sections *sections = getExplicitSections();
|
||||||
if ( ! sections || sections==(void *)-1) return (Sections *)sections;
|
if ( ! sections || sections==(void *)-1) return (Sections *)sections;
|
||||||
@ -5944,8 +5943,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
|||||||
// bail on error
|
// bail on error
|
||||||
if ( ! bits ) return NULL;
|
if ( ! bits ) return NULL;
|
||||||
// get the content type
|
// get the content type
|
||||||
//uint8_t *ct = getContentType();
|
uint8_t *ct = getContentType();
|
||||||
//if ( ! ct ) return NULL;
|
if ( ! ct ) return NULL;
|
||||||
|
|
||||||
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
||||||
|
|
||||||
@ -6006,14 +6005,6 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
|||||||
// add in Section::m_sentFlags bits having to do with our voting tables
|
// add in Section::m_sentFlags bits having to do with our voting tables
|
||||||
Sections *XmlDoc::getSections ( ) {
|
Sections *XmlDoc::getSections ( ) {
|
||||||
|
|
||||||
// if json forget this it is only html
|
|
||||||
uint8_t *ct = getContentType();
|
|
||||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
|
||||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
|
||||||
m_sectionsValid = true;
|
|
||||||
return &m_sections;
|
|
||||||
}
|
|
||||||
|
|
||||||
// get the sections without implied sections
|
// get the sections without implied sections
|
||||||
Sections *ss = getImpliedSections();
|
Sections *ss = getImpliedSections();
|
||||||
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
|
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
|
||||||
@ -17865,6 +17856,7 @@ bool XmlDoc::logIt ( ) {
|
|||||||
// coll
|
// coll
|
||||||
//
|
//
|
||||||
sb.safePrintf("coll=%s ",coll);
|
sb.safePrintf("coll=%s ",coll);
|
||||||
|
sb.safePrintf("collnum=%li ",(long)m_collnum);
|
||||||
|
|
||||||
//
|
//
|
||||||
// print ip
|
// print ip
|
||||||
@ -25077,6 +25069,9 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
|
|||||||
|
|
||||||
setStatus ( "hashing rss info" );
|
setStatus ( "hashing rss info" );
|
||||||
|
|
||||||
|
uint8_t *ct = getContentType();
|
||||||
|
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
||||||
|
|
||||||
// . finally hash in the linkText terms from the LinkInfo
|
// . finally hash in the linkText terms from the LinkInfo
|
||||||
// . the LinkInfo class has all the terms of hashed anchor text for us
|
// . the LinkInfo class has all the terms of hashed anchor text for us
|
||||||
// . if we're using an old TitleRec linkTermList is just a ptr to
|
// . if we're using an old TitleRec linkTermList is just a ptr to
|
||||||
@ -25190,7 +25185,10 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
|
|||||||
false , // own data?
|
false , // own data?
|
||||||
0 , // allocSize
|
0 , // allocSize
|
||||||
false , // pure xml?
|
false , // pure xml?
|
||||||
m_version ) )
|
m_version ,
|
||||||
|
true , // set parents?
|
||||||
|
m_niceness ,
|
||||||
|
*ct ) )
|
||||||
return false;
|
return false;
|
||||||
// set the words class from the xml, returns false and sets
|
// set the words class from the xml, returns false and sets
|
||||||
// g_errno on error
|
// g_errno on error
|
||||||
|
@ -2046,6 +2046,7 @@ char* getNextNum(char* input, char** numPtr) {
|
|||||||
return nextspace;
|
return nextspace;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "HttpMime.h" // CT_HTML
|
||||||
|
|
||||||
// returns length of stripped content, but will set g_errno and return -1
|
// returns length of stripped content, but will set g_errno and return -1
|
||||||
// on error
|
// on error
|
||||||
@ -2066,7 +2067,7 @@ long stripHtml( char *content, long contentLen, long version, long strip ) {
|
|||||||
// . parse as utf8 since all we are doing is messing with
|
// . parse as utf8 since all we are doing is messing with
|
||||||
// the tags...content manipulation comes later
|
// the tags...content manipulation comes later
|
||||||
if ( ! tmpXml.set ( content , contentLen,
|
if ( ! tmpXml.set ( content , contentLen,
|
||||||
false, 0, false, version ) )
|
false, 0, false, version , true , 0 , CT_HTML ) )
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
//if( strip == 4 )
|
//if( strip == 4 )
|
||||||
|
13
main.cpp
13
main.cpp
@ -11464,7 +11464,10 @@ bool parseTest ( char *coll , long long docId , char *query ) {
|
|||||||
t = gettimeofdayInMilliseconds();
|
t = gettimeofdayInMilliseconds();
|
||||||
for ( long i = 0 ; i < 100 ; i++ )
|
for ( long i = 0 ; i < 100 ; i++ )
|
||||||
if ( ! xml.set ( content , contentLen ,
|
if ( ! xml.set ( content , contentLen ,
|
||||||
false, 0, false, xd.m_version ) )
|
false, 0, false, xd.m_version ,
|
||||||
|
true , // setparents
|
||||||
|
0 , // niceness
|
||||||
|
CT_HTML ) )
|
||||||
return log("build: speedtestxml: xml set: %s",
|
return log("build: speedtestxml: xml set: %s",
|
||||||
mstrerror(g_errno));
|
mstrerror(g_errno));
|
||||||
// print time it took
|
// print time it took
|
||||||
@ -11480,7 +11483,8 @@ bool parseTest ( char *coll , long long docId , char *query ) {
|
|||||||
t = gettimeofdayInMilliseconds();
|
t = gettimeofdayInMilliseconds();
|
||||||
for ( long i = 0 ; i < 100 ; i++ )
|
for ( long i = 0 ; i < 100 ; i++ )
|
||||||
if ( ! xml.set ( content , contentLen ,
|
if ( ! xml.set ( content , contentLen ,
|
||||||
false, 0, false, xd.m_version , false ) )
|
false, 0, false, xd.m_version , false ,
|
||||||
|
0 , CT_HTML ) )
|
||||||
return log("build: xml(setparents=false): %s",
|
return log("build: xml(setparents=false): %s",
|
||||||
mstrerror(g_errno));
|
mstrerror(g_errno));
|
||||||
// print time it took
|
// print time it took
|
||||||
@ -11842,7 +11846,10 @@ bool summaryTest1 ( char *rec , long listSize, char *coll , long long docId ,
|
|||||||
// now parse into xhtml (takes 15ms on lenny)
|
// now parse into xhtml (takes 15ms on lenny)
|
||||||
Xml xml;
|
Xml xml;
|
||||||
xml.set ( content, contentLen ,
|
xml.set ( content, contentLen ,
|
||||||
false/*ownData?*/, 0, false, xd.m_version );
|
false/*ownData?*/, 0, false, xd.m_version ,
|
||||||
|
true , // setparents
|
||||||
|
0 , // niceness
|
||||||
|
CT_HTML );
|
||||||
|
|
||||||
xd.getSummary();
|
xd.getSummary();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user