#include "gb-include.h"
#include "Xml.h"
#include "Mem.h" // mfree(), mmalloc()
#include "Unicode.h" // for html entities that return unicode
#include "Titledb.h"
#include "Words.h"
//#include "DateParse2.h"
// . constructs an empty Xml: no document referenced, no nodes, nothing owned
// . every member that reset() later hands to mfree() as a size gets a
//   defined value here, so the object never carries indeterminate sizes
Xml::Xml () {
	m_xml         = NULL;
	m_xmlLen      = 0;
	m_nodes       = NULL;
	m_numNodes    = 0;
	// sizes that reset() passes to mfree() for m_nodes and m_xml
	m_maxNumNodes = 0;
	m_allocSize   = 0;
	m_ownData     = false;
	m_version     = TITLEREC_CURRENT_VERSION;
}
// . releases the node array, and the document buffer as well when
//   m_ownData is set, by delegating to reset()
Xml::~Xml () {
	this->reset();
}
// . for parsing xml conf files
// . finds "tagName" among nodes [n0,n1) and converts the text node that
//   follows it, leading whitespace skipped, with atob()
// . returns "defaultBool" if the tag is not found or has no
//   non-white-space text
bool Xml::getBool ( long n0 , long n1 , char *tagName , bool defaultBool ) {
	long textLen;
	char *text = getTextForXmlTag ( n0 , n1 , tagName , &textLen , true );
	// nothing usable, so the caller's default wins
	if ( ! text ) return defaultBool;
	return atob ( text , textLen );
}
// . for parsing xml conf files
// . finds "tagName" among nodes [n0,n1) and converts the text node that
//   follows it with atol2(); leading whitespace is NOT skipped here
// . returns "defaultLong" if no text was returned for the tag
long Xml::getLong ( long n0 , long n1 , char *tagName , long defaultLong ) {
	long textLen;
	char *text = getTextForXmlTag ( n0 , n1 , tagName , &textLen , false );
	// nothing usable, so the caller's default wins
	if ( ! text ) return defaultLong;
	return atol2 ( text , textLen );
}
// . for parsing xml conf files
// . finds "tagName" among nodes [n0,n1) and converts the text node that
//   follows it with atoll2(); leading whitespace is NOT skipped here
// . returns "defaultLongLong" if no text was returned for the tag
long long Xml::getLongLong ( long n0 , long n1 , char *tagName ,
			     long long defaultLongLong ) {
	long textLen;
	char *text = getTextForXmlTag ( n0 , n1 , tagName , &textLen , false );
	// nothing usable, so the caller's default wins
	if ( ! text ) return defaultLongLong;
	return atoll2 ( text , textLen );
}
// . for parsing xml conf files
// . finds "tagName" among nodes [n0,n1) and converts the text node that
//   follows it with atof2(); leading whitespace is NOT skipped here
// . returns "defaultFloat" if no text was returned for the tag
float Xml::getFloat (long n0 , long n1 , char *tagName,float defaultFloat){
	long textLen;
	char *text = getTextForXmlTag ( n0 , n1 , tagName , &textLen , false );
	// nothing usable, so the caller's default wins
	if ( ! text ) return defaultFloat;
	return atof2 ( text , textLen );
}
// . returns the text that follows "tagName" in nodes [n0,n1) and stores
//   its byte length in "*len"
// . returns NULL, with "*len" as left by getTextForXmlTag(), if there is
//   no such text
// . simply forwards to getTextForXmlTag()
char *Xml::getString ( long n0 , long n1 , char *tagName, long *len ,
		       bool skipLeadingSpaces ) const {
	return getTextForXmlTag ( n0, n1, tagName, len, skipLeadingSpaces );
}
// . used by getBool()/getLong()/getLongLong()/getFloat()/getString()
// . tagName is compound for xml tags, simple for html tags
// . looks the tag up in nodes [n0,n1) and returns the text of the node
//   right after it, storing that text's length in "*len"
// . leading whitespace is skipped only if "skipLeadingSpaces" is true
// . returns NULL with "*len" set to 0 if the tag is not found
char *Xml::getTextForXmlTag ( long n0 , long n1 , char *tagName , long *len ,
			      bool skipLeadingSpaces ) const {
	// start from an empty result
	*len = 0;
	// first tag in the range whose hash matches tagName, or -1
	const long tagNum = getNodeNum ( n0 , n1 , tagName , gbstrlen(tagName) );
	// found it, hand back whatever text follows that tag
	if ( tagNum >= 0 ) return getString ( tagNum , skipLeadingSpaces , len );
	return NULL;
}
// . returns the text held by the node right after node #"num" and stores
//   its length in "*len"
// . returns NULL with "*len" set to 0 if there is no following node or
//   if that node is not a text node
// . if "skipLeadingSpaces" is true, leading whitespace is stepped over and
//   NULL is returned when nothing but whitespace was there
// . if "skipLeadingSpaces" is false the node text is returned as is, even
//   when its length is 0
char *Xml::getString ( long num , bool skipLeadingSpaces , long *len ) const {
	// assume nothing found
	*len = 0;
	// the text of a tag (if any) is the very next node
	const long textNum = num + 1;
	// reject both ends: a negative index would read before m_nodes
	if ( textNum < 0 || textNum >= m_numNodes ) return NULL;
	if ( ! m_nodes[textNum].isText() ) return NULL;
	char *s    = m_nodes[textNum].m_node;
	long  slen = m_nodes[textNum].m_nodeLen;
	// if we don't skip leading spaces return it as is
	if ( ! skipLeadingSpaces ) {
		*len = slen;
		return s;
	}
	// . skip leading spaces
	// . test the remaining length BEFORE looking at *s so that we never
	//   inspect a byte that lies beyond this node's text
	// . NOTE(review): this advances one byte per whitespace character;
	//   presumably is_wspace_utf8() can also match multi-byte
	//   whitespace, in which case a partial character would be left
	//   behind -- confirm against its definition
	while ( slen > 0 && is_wspace_utf8 ( s ) ) { s++; slen--; }
	// set len
	*len = slen;
	// nothing but whitespace
	if ( slen == 0 ) return NULL;
	return s;
}
char *Xml::getNode ( char *tagName , long *len ) {
// assume len is 0
*len = 0;
// get a matching xml TAG
long num = getNodeNum ( 0 , m_numNodes, tagName , gbstrlen(tagName) );
if ( num < 0 ) return NULL;
// no back tag if its like it won't have one
XmlNode *node = &m_nodes[num];
if ( ! node->m_hasBackTag ) return NULL;
// scan for ending back tag
long i ; for ( i = num + 1 ; i < m_numNodes ; i++ ) {
if ( m_nodes[i].m_hash != node->m_hash ) continue;
break;
}
if ( i >= m_numNodes ) return NULL;
// got the back tag
char *end = m_nodes[i].m_node;
char *s = m_nodes[num+1].m_node;
// trim spaces
while ( s < end && is_wspace_a ( *s ) ) s++;
while ( end-1 > s && is_wspace_a ( end[-1] ) ) end--;
*len = end - s;
return s;
}
long long Xml::getCompoundHash ( char *s , long len ) const {
// setup
char *p = s;
char *start = s;
long i = 0;
long long h = 0;
loop:
// find fisrt .
while ( i < len && p[i] != '.' ) i++;
// . hash from p to p[i]
// . tag names are always ascii, so use the ascii hasher, not utf8
h = hash64Upper_a ( start , &p[i] - start , h );
// bail if done
if ( i >= len ) return h;
// then period
h = hash64 ( "." , 1 , h );
// skip period
i++;
// start now points to next word
start = &p[i];
// continue
goto loop;
}
// . returns the number of the first tag node in [n0,n1) whose hash equals
//   the compound hash of "tagName", or -1 if there is none
// . "tagName" is compound (i.e. "myhouse.myroom" )
// . n0 and n1 are clamped to [0,m_numNodes] before scanning
long Xml::getNodeNum ( long n0 , long n1 , char *tagName , long tagNameLen ) const {
	// . the hash is a zobrist hash, so hashing "dns.ip" in one go is not
	//   the same as hashing "dns" then "." then "ip" with each result
	//   seeding the next
	// . getCompoundHash() therefore parses the name up piece by piece
	const long long want = getCompoundHash ( tagName , tagNameLen );
	// keep both bounds inside the node array
	if ( n1 > m_numNodes ) n1 = m_numNodes;
	if ( n0 > m_numNodes ) n0 = m_numNodes;
	if ( n1 < 0 ) n1 = 0;
	if ( n0 < 0 ) n0 = 0;
	for ( long k = n0 ; k < n1 ; k++ ) {
		// text (non-tag) nodes never match
		if ( m_nodes[k].isTag() && m_nodes[k].m_hash == want )
			return k;
	}
	// not found at all
	return -1;
}
void Xml::reset ( ) {
// free old nodes array if any
if ( m_nodes ) mfree ( m_nodes, m_maxNumNodes*sizeof(XmlNode),"Xml1");
if ( m_ownData && m_xml ) mfree ( m_xml, m_allocSize, "Xml1");
m_xml = NULL;
m_nodes = NULL;
m_numNodes = 0;
m_maxNumNodes = 0;
m_allocSize = 0;
}
#include "HttpMime.h" // CT_JSON
// "s" must be in utf8
bool Xml::set ( char *s ,
long slen ,
bool ownData ,
long allocSize ,
bool pureXml ,
long version ,
bool setParentsArg ,
long niceness ,
char contentType ) {
// just in case
reset();
m_niceness = niceness;
// clear it
g_errno = 0;
// if we own the data we free on reset/destruction
m_ownData = ownData;
m_version = version;
// use explicit allocSize if we passed one
m_allocSize = allocSize?allocSize:slen+1;
// make pointers to data
m_xml = s;
m_xmlLen = slen; //i;
// debug msg time
if ( g_conf.m_logTimingBuild )
logf(LOG_TIMING,
"build: xml: set: 4a. %llu",gettimeofdayInMilliseconds());
// sanity check
if ( !s || slen <= 0) return true;
if ( s[slen] != '\0' ) {
log(LOG_LOGIC,"build: Xml: Content is not null terminated.");
char *xx = NULL; *xx = 0;
//sleep(100);
g_errno = EBADENGINEER;
return false;
}
// if json go no further. TODO: also do this for CT_TEXT etc.
if ( contentType == CT_JSON ) {
m_numNodes = 0;
// make the array
m_maxNumNodes = 1;
m_nodes =(XmlNode *)mmalloc(sizeof(XmlNode)*m_maxNumNodes,"x");
if ( ! m_nodes ) return false;
XmlNode *xd = &m_nodes[m_numNodes];
// hack the node
xd->m_node = s;
xd->m_nodeLen = slen;
xd->m_isSelfLink = 0;
// . nodeId for text nodes is 0
xd->m_nodeId = 0;
xd->m_hasBackTag = false;
xd->m_hash = 0;
xd->m_pairTagNum = -1;
m_numNodes++;
return true;
}
QUICKPOLL((niceness));
long i;
// . replacing NULL bytes with spaces in the buffer
// . utf8 should never have any 0 bytes in it either!
for ( i = 0 ; i < slen ; i++ ) if ( !s[i] ) s[i] = ' ';
// counting the max num nodes
for ( i = 0 ; s[i] ; i++ ) if ( s[i] == '<' ) m_maxNumNodes++;
// account for the text (non-tag) nodes (padding nodes between tags)
m_maxNumNodes *= 2 ;
// if we only have one tag we can still have 3 nodes!
m_maxNumNodes++;
// debug msg time
if ( g_conf.m_logTimingBuild )
logf(LOG_TIMING,
"build: xml: set: 4b. %llu",gettimeofdayInMilliseconds());
// . truncate it to avoid spammers
// . now i limit to 30k nodes because of those damned xls docs!
// . they have 300,000+ nodes some of 'em
// now allow 35k nodes for every 100k doclen
long num100k = slen/(100*1024);
if (num100k <= 0) num100k = 1;
long bigMax = 35*1024 * num100k;
if (m_maxNumNodes > bigMax){
log(LOG_WARN, "build: xml: doclen %ld, "
"too many nodes: counted %ld, max %ld "
"...truncating", slen, m_maxNumNodes, bigMax);
m_maxNumNodes = bigMax;
}
// breathe
QUICKPOLL ( niceness );
m_nodes = (XmlNode *) mmalloc (sizeof(XmlNode) * m_maxNumNodes,"Xml1");
if ( ! m_nodes ) {
reset();
return log("build: Could not allocate %li "
"bytes need to parse document.",
sizeof(XmlNode)*m_maxNumNodes);
}
// debug msg time
if ( g_conf.m_logTimingBuild )
logf(LOG_TIMING,
"build: xml: set: 4c. %llu",gettimeofdayInMilliseconds());
// . TODO: do this on demand
// . now fill our nodes array
// . loop over the xml
// . i is byte-index in buffer
long oldi;
for ( i = 0 ; i < m_xmlLen && m_numNodes < m_maxNumNodes ; ) {
// breathe
QUICKPOLL(niceness);
// remember oldi
oldi = i;
// set that node
i += m_nodes[m_numNodes].set (&m_xml[i],pureXml,version);
// in script?
if ( m_nodes[m_numNodes].m_nodeId != TAG_SCRIPT ) {
m_numNodes++;
continue;
}
if ( ! m_nodes[m_numNodes].isFrontTag() ) {
m_numNodes++;
continue;
}
// ok, we got a
char *pstart = &m_xml[i];
char *p = pstart;
char *pend = &m_xml[0] + m_xmlLen;
// scan -- 5 continues -- node 1570 is text of script
for ( ; p < pend ; p++ ) {
if ( p[0] != '<' ) continue;
if ( to_lower_a(p[1]) != '/' ) continue;
if ( to_lower_a(p[2]) != 's' ) continue;
if ( to_lower_a(p[3]) != 'c' ) continue;
if ( to_lower_a(p[4]) != 'r' ) continue;
if ( to_lower_a(p[5]) != 'i' ) continue;
if ( to_lower_a(p[6]) != 'p' ) continue;
if ( to_lower_a(p[7]) != 't' ) continue;
break;
}
// make sure we do not breach! i saw this happen once!
if ( m_numNodes >= m_maxNumNodes ) break;
XmlNode *xn = &m_nodes[m_numNodes++];
xn->m_nodeId = TAG_SCRIPTTEXT;//0; // TEXT NODE
xn->m_node = pstart;
xn->m_nodeLen = p - pstart;
xn->m_tagName = NULL;
xn->m_tagNameLen = 0;
xn->m_hasBackTag = false;
xn->m_hash = 0;
xn->m_isVisible = false;
xn->m_isBreaking = false;
// advance i to get to the
i = p - &m_xml[0] ;
}
// sanity
if ( m_numNodes > m_maxNumNodes ) { char *xx=NULL;*xx=0; }
// trim off last node if empty! it is causing a core in isBackTag()
if ( m_numNodes > 0 && m_nodes[m_numNodes-1].m_nodeLen == 0 )
m_numNodes--;
// debug msg time
if ( g_conf.m_logTimingBuild )
logf(LOG_TIMING,
"build: xml: set: 4d. %llu",gettimeofdayInMilliseconds());
return true;
}
// for translating HTML entities to an iso char
#include "Entities.h"
// . replaces line-breaking html tags with 2 returns if "includeTags" is false
// . stores tags too if "includeTags" is true
// . returns # chars written to buf
// . NOTE: see XmlNode.cpp for list of tag types in "NodeType" structure
// . used to get xml subtrees as text
// . used to get the text of individual tags (the tag name was lost here)
// . must write to your buf rather than just return a pointer since we may
// have to concatenate several nodes together, we may have to replace tags,..
// . TODO: nuke this in favor of Pos.cpp::filter() -- but that needs Words.cpp
long Xml::getText ( char *buf ,
long bufMaxSize ,
long node1 ,
long node2 ,
bool includeTags ,
bool visibleTextOnly ,
bool filter , // convert entities, \r's
bool filterSpaces , // filter excessive punct/spaces
bool useStopIndexTag ) { // indexable text only?
// init some vars
long i = node1;
long n = node2;
// truncate n to the # of nodes we have
if ( n > m_numNodes ) n = m_numNodes;
// keep a non visible tag stack
long notVisible = 0;
// are we in indexable area?
bool inStopTag = false;
// the destination
char *dst = buf;
char *dstEnd = buf + bufMaxSize;
char cs = -1;
// cannot allow nested script tags, messed up our summary generator
// when a page tried to print a