mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
For JSON docs, only give them a single
XmlNode in the Xml.cpp class. Hopefully we will not get the "malformed sections" error anymore. I think that was a result of the JSON having HTML tags in it, which produced improperly nested HTML structures that the Sections class did not like. TODO: probably do this for CT_TEXT etc. as well.
This commit is contained in:
parent
4d0a09f1e4
commit
bc78b21dc6
15
Blaster.cpp
15
Blaster.cpp
@ -651,7 +651,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
|
||||
false,
|
||||
0,
|
||||
false,
|
||||
TITLEREC_CURRENT_VERSION)){
|
||||
TITLEREC_CURRENT_VERSION ,
|
||||
true , // set parents
|
||||
0 , // niceness
|
||||
CT_XML )){ // content type
|
||||
log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
|
||||
}
|
||||
Links links1;
|
||||
@ -679,7 +682,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
|
||||
false,
|
||||
0,
|
||||
false,
|
||||
TITLEREC_CURRENT_VERSION)){
|
||||
TITLEREC_CURRENT_VERSION,
|
||||
true , // setparents
|
||||
0 , // niceness
|
||||
CT_XML )){
|
||||
log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
|
||||
}
|
||||
Links links2;
|
||||
@ -1170,7 +1176,10 @@ void Blaster::gotDoc4 ( void *state, TcpSocket *s){
|
||||
false,
|
||||
0,
|
||||
false,
|
||||
TITLEREC_CURRENT_VERSION)){
|
||||
TITLEREC_CURRENT_VERSION,
|
||||
true, // setparents
|
||||
0, // niceness
|
||||
CT_XML )){
|
||||
log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
|
||||
}
|
||||
Links links;
|
||||
|
@ -3648,7 +3648,8 @@ bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
|
||||
true , // pure xml?
|
||||
TITLEREC_CURRENT_VERSION ,
|
||||
false , // no need to now
|
||||
niceness );
|
||||
niceness ,
|
||||
CT_XML );
|
||||
}
|
||||
|
||||
// only Title.cpp uses this right now
|
||||
|
@ -712,7 +712,7 @@ bool processLoop ( void *state ) {
|
||||
//Words *ww = xd->getWords();
|
||||
if ( ! xml.set ( content , contentLen , false ,
|
||||
0 , false , TITLEREC_CURRENT_VERSION ,
|
||||
false , 0 ) ) { // niceness is 0
|
||||
false , 0 , CT_HTML ) ) { // niceness is 0
|
||||
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
|
||||
return sendErrorReply ( st , g_errno );
|
||||
}
|
||||
|
@ -4226,7 +4226,10 @@ bool Parms::setXmlFromFile(Xml *xml, char *filename, char *buf, long bufSize){
|
||||
false , // ownData
|
||||
0 , // allocSize
|
||||
false , // pureXml?
|
||||
0 );// version
|
||||
0 , // version
|
||||
true , // setParents
|
||||
0 , // niceness
|
||||
CT_XML );
|
||||
}
|
||||
|
||||
#define MAX_CONF_SIZE 200000
|
||||
|
27
Xml.cpp
27
Xml.cpp
@ -196,6 +196,8 @@ void Xml::reset ( ) {
|
||||
m_allocSize = 0;
|
||||
}
|
||||
|
||||
#include "HttpMime.h" // CT_JSON
|
||||
|
||||
// "s" must be in utf8
|
||||
bool Xml::set ( char *s ,
|
||||
long slen ,
|
||||
@ -204,7 +206,8 @@ bool Xml::set ( char *s ,
|
||||
bool pureXml ,
|
||||
long version ,
|
||||
bool setParentsArg ,
|
||||
long niceness ) {
|
||||
long niceness ,
|
||||
char contentType ) {
|
||||
|
||||
// just in case
|
||||
reset();
|
||||
@ -234,6 +237,28 @@ bool Xml::set ( char *s ,
|
||||
return false;
|
||||
}
|
||||
|
||||
// if json go no further. TODO: also do this for CT_TEXT etc.
|
||||
if ( contentType == CT_JSON ) {
|
||||
m_numNodes = 0;
|
||||
// make the array
|
||||
m_maxNumNodes = 1;
|
||||
m_nodes =(XmlNode *)mmalloc(sizeof(XmlNode)*m_maxNumNodes,"x");
|
||||
if ( ! m_nodes ) return false;
|
||||
XmlNode *xd = &m_nodes[m_numNodes];
|
||||
// hack the node
|
||||
xd->m_node = s;
|
||||
xd->m_nodeLen = slen;
|
||||
xd->m_isSelfLink = 0;
|
||||
// . nodeId for text nodes is 0
|
||||
xd->m_nodeId = 0;
|
||||
xd->m_hasBackTag = false;
|
||||
xd->m_hash = 0;
|
||||
xd->m_pairTagNum = -1;
|
||||
m_numNodes++;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
QUICKPOLL((niceness));
|
||||
long i;
|
||||
|
||||
|
6
Xml.h
6
Xml.h
@ -30,8 +30,10 @@ class Xml {
|
||||
bool ownData , long allocSize, //=0,
|
||||
bool pureXml, // =false );
|
||||
long version ,
|
||||
bool setParents = true,
|
||||
long niceness = 0);
|
||||
bool setParents , // = true,
|
||||
long niceness , // = 0
|
||||
char contentType );
|
||||
|
||||
|
||||
|
||||
void reset ( );
|
||||
|
58
XmlDoc.cpp
58
XmlDoc.cpp
@ -4727,7 +4727,8 @@ void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
|
||||
false ,
|
||||
TITLEREC_CURRENT_VERSION ,
|
||||
false , // setParents?
|
||||
m_niceness ))
|
||||
m_niceness ,
|
||||
CT_HTML ))
|
||||
// return if g_errno got set
|
||||
return;
|
||||
|
||||
@ -5206,6 +5207,10 @@ Xml *XmlDoc::getXml ( ) {
|
||||
char **u8 = getUtf8Content();
|
||||
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
|
||||
long u8len = size_utf8Content - 1;
|
||||
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
|
||||
|
||||
// note it
|
||||
setStatus ( "getting xml");
|
||||
// set it
|
||||
@ -5216,7 +5221,8 @@ Xml *XmlDoc::getXml ( ) {
|
||||
false , // pure xml?
|
||||
m_version ,
|
||||
false , // setParentsArg?
|
||||
m_niceness ) )
|
||||
m_niceness ,
|
||||
*ct ) )
|
||||
// return NULL on error with g_errno set
|
||||
return NULL;
|
||||
// set just once
|
||||
@ -5813,13 +5819,12 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
if ( m_explicitSectionsValid ) return &m_sections;
|
||||
|
||||
// if json forget this it is only html
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
m_sectionsValid = true;
|
||||
return &m_sections;
|
||||
}
|
||||
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
// m_sectionsValid = true;
|
||||
// return &m_sections;
|
||||
//}
|
||||
|
||||
setStatus ( "getting explicit sections" );
|
||||
// use the old title rec to make sure we parse consistently!
|
||||
@ -5856,8 +5861,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
long long *d = getDocId();
|
||||
if ( ! d || d == (long long *)-1 ) return (Sections *)d;
|
||||
// get the content type
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct ) return NULL;
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct ) return NULL;
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
@ -5873,6 +5878,8 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
// this uses the sectionsReply to see which sections are "text", etc.
|
||||
// rather than compute it expensively
|
||||
if ( ! m_calledSections &&
|
||||
// we get malformed sections error for some diffbot replies
|
||||
//*ct != CT_JSON &&
|
||||
! m_sections.set ( &m_words ,
|
||||
&m_phrases ,
|
||||
bits ,
|
||||
@ -5918,14 +5925,6 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
Sections *XmlDoc::getImpliedSections ( ) {
|
||||
if ( m_impliedSectionsValid ) return &m_sections;
|
||||
|
||||
// if json forget this it is only html
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
m_sectionsValid = true;
|
||||
return &m_sections;
|
||||
}
|
||||
|
||||
// get the sections without implied sections
|
||||
Sections *sections = getExplicitSections();
|
||||
if ( ! sections || sections==(void *)-1) return (Sections *)sections;
|
||||
@ -5944,8 +5943,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
||||
// bail on error
|
||||
if ( ! bits ) return NULL;
|
||||
// get the content type
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct ) return NULL;
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct ) return NULL;
|
||||
|
||||
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
@ -6006,14 +6005,6 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
||||
// add in Section::m_sentFlags bits having to do with our voting tables
|
||||
Sections *XmlDoc::getSections ( ) {
|
||||
|
||||
// if json forget this it is only html
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
|
||||
if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
|
||||
m_sectionsValid = true;
|
||||
return &m_sections;
|
||||
}
|
||||
|
||||
// get the sections without implied sections
|
||||
Sections *ss = getImpliedSections();
|
||||
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
|
||||
@ -17865,6 +17856,7 @@ bool XmlDoc::logIt ( ) {
|
||||
// coll
|
||||
//
|
||||
sb.safePrintf("coll=%s ",coll);
|
||||
sb.safePrintf("collnum=%li ",(long)m_collnum);
|
||||
|
||||
//
|
||||
// print ip
|
||||
@ -25077,6 +25069,9 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
|
||||
|
||||
setStatus ( "hashing rss info" );
|
||||
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// . finally hash in the linkText terms from the LinkInfo
|
||||
// . the LinkInfo class has all the terms of hashed anchor text for us
|
||||
// . if we're using an old TitleRec linkTermList is just a ptr to
|
||||
@ -25190,7 +25185,10 @@ bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
|
||||
false , // own data?
|
||||
0 , // allocSize
|
||||
false , // pure xml?
|
||||
m_version ) )
|
||||
m_version ,
|
||||
true , // set parents?
|
||||
m_niceness ,
|
||||
*ct ) )
|
||||
return false;
|
||||
// set the words class from the xml, returns false and sets
|
||||
// g_errno on error
|
||||
|
@ -2046,6 +2046,7 @@ char* getNextNum(char* input, char** numPtr) {
|
||||
return nextspace;
|
||||
}
|
||||
|
||||
#include "HttpMime.h" // CT_HTML
|
||||
|
||||
// returns length of stripped content, but will set g_errno and return -1
|
||||
// on error
|
||||
@ -2066,7 +2067,7 @@ long stripHtml( char *content, long contentLen, long version, long strip ) {
|
||||
// . parse as utf8 since all we are doing is messing with
|
||||
// the tags...content manipulation comes later
|
||||
if ( ! tmpXml.set ( content , contentLen,
|
||||
false, 0, false, version ) )
|
||||
false, 0, false, version , true , 0 , CT_HTML ) )
|
||||
return -1;
|
||||
|
||||
//if( strip == 4 )
|
||||
|
13
main.cpp
13
main.cpp
@ -11464,7 +11464,10 @@ bool parseTest ( char *coll , long long docId , char *query ) {
|
||||
t = gettimeofdayInMilliseconds();
|
||||
for ( long i = 0 ; i < 100 ; i++ )
|
||||
if ( ! xml.set ( content , contentLen ,
|
||||
false, 0, false, xd.m_version ) )
|
||||
false, 0, false, xd.m_version ,
|
||||
true , // setparents
|
||||
0 , // niceness
|
||||
CT_HTML ) )
|
||||
return log("build: speedtestxml: xml set: %s",
|
||||
mstrerror(g_errno));
|
||||
// print time it took
|
||||
@ -11480,7 +11483,8 @@ bool parseTest ( char *coll , long long docId , char *query ) {
|
||||
t = gettimeofdayInMilliseconds();
|
||||
for ( long i = 0 ; i < 100 ; i++ )
|
||||
if ( ! xml.set ( content , contentLen ,
|
||||
false, 0, false, xd.m_version , false ) )
|
||||
false, 0, false, xd.m_version , false ,
|
||||
0 , CT_HTML ) )
|
||||
return log("build: xml(setparents=false): %s",
|
||||
mstrerror(g_errno));
|
||||
// print time it took
|
||||
@ -11842,7 +11846,10 @@ bool summaryTest1 ( char *rec , long listSize, char *coll , long long docId ,
|
||||
// now parse into xhtml (takes 15ms on lenny)
|
||||
Xml xml;
|
||||
xml.set ( content, contentLen ,
|
||||
false/*ownData?*/, 0, false, xd.m_version );
|
||||
false/*ownData?*/, 0, false, xd.m_version ,
|
||||
true , // setparents
|
||||
0 , // niceness
|
||||
CT_HTML );
|
||||
|
||||
xd.getSummary();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user