open-source-search-engine/XmlNode.cpp
Matt a54471849b sitemap.xml support for harvesting loc urls.
parse xml docs as pure xml again but set nodeid
to TAG_LINK etc. so Linkdb.cpp can get links again.
added isparentsitemap url filter to prioritize urls
from sitemaps. added isrssext to url filters to
prioritize new possible rss feed urls. added numinlinks
to url filters to prioritize popular urls for spidering.
use those filters in default web filter set.
fix filters that delete urls from the index using
the 'DELETE' priority. they weren't getting deleted.
2015-03-17 14:26:16 -06:00

742 lines
24 KiB
C++

#include "gb-include.h"
#include "XmlNode.h"
#include "Mem.h"
// . g_nodes[] is the master table of every tag/node type this parser knows
// . each row lists, in order: the tag name, whether the tag has a back
// (closing) tag, whether it is a breaking tag, whether the text it
// encloses is visible, two filterKeep values, the node id (TAG_*) and
// finally an isXml flag
// . ROW ORDER MATTERS: row i must carry the node id whose value is i.
// XmlNode::set() verifies g_nodes[i].m_nodeId == i on its first call and
// deliberately crashes if that does not hold, and isBreakingTagId() and
// hasBackTag() index this array directly by tag id. So a new tag must be
// placed at the position that matches its TAG_* value.
// . isVisible is true if text in between front and end tags is visible on page
// . isVisible is used by Xml::getText()
// . filterKeep is 1 if we should keep it when &strip=1 is given when getting
// the cached document. i added this for faisal
// . a filterKeep of 0 means remove tag and text between it and its back tag.
// . a filterKeep of 1 means keep the tag and text between it and its back tag.
// . a filterKeep of 2 means remove tag BUT keep the text between
// it and its back tag.
// . NOTE(review): the two filterKeep columns presumably correspond to the
// strip=1 and strip=2 options respectively -- confirm against NodeType
NodeType g_nodes[] = {
// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
// isXml? (the last field)
// --------------------------
// -- text node --- 0
{"textNode" , 0, 0, 1, 1,1, TAG_TEXTNODE ,0},
// -- xml tag node --- 1
{"xmlTag" , 1, 1, 1, 2,2, TAG_XMLTAG ,0},
{"A" , 1, 0, 1, 1,1, TAG_A ,0},
{"ABBREV" , 1, 1, 1, 2,2, TAG_ABBREV ,0},
{"ACRONYM" , 1, 1, 1, 2,1, TAG_ACRONYM ,0},
{"ADDRESS" , 1, 1, 1, 2,2, TAG_ADDRESS ,0},
{"APPLET" , 1, 1, 1, 0,0, TAG_APPLET ,0},
{"AREA" , 0, 1, 1, 0,0, TAG_AREA ,0},
{"AU" , 1, 1, 1, 0,0, TAG_AU ,0},
{"AUTHOR" , 1, 1, 1, 0,0, TAG_AUTHOR ,0},
{"B" , 1, 0, 1, 1,1, TAG_B ,0},
{"BANNER" , 1, 1, 1, 0,0, TAG_BANNER ,0},
{"BASE" , 0, 1, 1, 0,0, TAG_BASE ,0},
{"BASEFONT" , 0, 1, 1, 2,2, TAG_BASEFONT ,0},
{"BGSOUND" , 0, 1, 1, 0,0, TAG_BGSOUND ,0},
{"BIG" , 1, 0, 1, 2,1, TAG_BIG ,0},
{"BLINK" , 1, 0, 1, 2,2, TAG_BLINK ,0},
{"BLOCKQUOTE",1, 1, 1, 2,1, TAG_BLOCKQUOTE ,0},
{"BQ" , 1, 1, 1, 0,0, TAG_BQ ,0},
{"BODY" , 1, 1, 1, 1,1, TAG_BODY ,0},
{"BR" , 0, 1, 1, 1,1, TAG_BR ,0},
{"CAPTION" , 1, 1, 1, 2,1, TAG_CAPTION ,0},
{"CENTER" , 1, 1, 1, 1,1, TAG_CENTER ,0},
{"CITE" , 1, 1, 1, 2,1, TAG_CITE ,0},
{"CODE" , 1, 1, 1, 2,1, TAG_CODE ,0},
{"COL" , 1, 1, 1, 2,2, TAG_COL ,0},
{"COLGROUP" , 1, 1, 1, 0,0, TAG_COLGROUP ,0},
{"CREDIT" , 1, 1, 1, 0,0, TAG_CREDIT ,0},
{"DEL" , 1, 1, 1, 2,1, TAG_DEL ,0},
{"DFN" , 1, 1, 1, 2,1, TAG_DFN ,0},
{"DIR" , 1, 1, 1, 0,0, TAG_DIR ,0},
// MDW: these used to be listed without back tags even though they
// have them
// MDW: that is fixed now
{"DIV" , 1, 1, 1, 1,1, TAG_DIV ,0},
{"DL" , 1, 1, 1, 1,1, TAG_DL ,0},
// this may not have a back tag!
{"DT" , 1, 1, 1, 1,1, TAG_DT ,0},
// this may not have a back tag!
{"DD" , 1, 1, 1, 1,1, TAG_DD ,0},
{"EM" , 1, 0, 1, 2,1, TAG_EM ,0}, // emphasized text
{"EMBED" , 0, 1, 1, 0,0, TAG_EMBED ,0},
{"FIG" , 1, 1, 1, 0,0, TAG_FIG ,0},
{"FN" , 1, 1, 1, 0,0, TAG_FN ,0},
{"FONT" , 1, 0, 1, 1,1, TAG_FONT ,0},
{"FORM" , 1, 1, 1, 2,2, TAG_FORM ,0},
// this may not have a back tag!
{"FRAME" , 1, 1, 1, 0,0, TAG_FRAME ,0},
{"FRAMESET" , 1, 1, 1, 0,0, TAG_FRAMESET ,0},
{"H1" , 1, 1, 1, 1,1, TAG_H1 ,0},
{"H2" , 1, 1, 1, 1,1, TAG_H2 ,0},
{"H3" , 1, 1, 1, 1,1, TAG_H3 ,0},
{"H4" , 1, 1, 1, 1,1, TAG_H4 ,0},
{"H5" , 1, 1, 1, 1,1, TAG_H5 ,0},
{"H6" , 1, 1, 1, 1,1, TAG_H6 ,0},
{"HEAD" , 1, 1, 1, 1,1, TAG_HEAD ,0},
{"HR" , 0, 1, 1, 1,1, TAG_HR ,0},
{"HTML" , 1, 1, 1, 1,1, TAG_HTML ,0},
{"I" , 1, 0, 1, 2,1, TAG_I ,0},
{"IFRAME" , 1, 1, 1, 2,2, TAG_IFRAME ,0},
// filter = 1, but the tag is turned into its alt text
{"IMG" , 0, 1, 1, 1,1, TAG_IMG ,0},
{"INPUT" , 0, 1, 1, 0,0, TAG_INPUT ,0},
{"INS" , 1, 1, 1, 2,1, TAG_INS ,0},
{"ISINDEX" , 0, 1, 1, 0,0, TAG_ISINDEX ,0},
{"KBD" , 1, 1, 1, 2,1, TAG_KBD ,0},
{"LANG" , 1, 1, 1, 0,0, TAG_LANG ,0},
{"LH" , 1, 1, 1, 0,0, TAG_LH ,0},
// this may or may not have a back tag
{"LI" , 1, 1, 1, 1,1, TAG_LI ,0},
// this may or may not have a back tag
{"LINK" , 0, 1, 1, 0,0, TAG_LINK ,0},
{"LISTING" , 1, 1, 1, 0,0, TAG_LISTING ,0},
{"MAP" , 1, 1, 1, 0,0, TAG_MAP ,0},
// don't index marquee text
{"MARQUEE" , 1, 1, 0, 2,2, TAG_MARQUEE ,0},
{"MATH" , 1, 1, 1, 0,0, TAG_MATH ,0},
{"MENU" , 1, 1, 1, 1,1, TAG_MENU ,0},
{"META" , 0, 1, 1, 1,1, TAG_META ,0},
{"MULTICOL" , 0, 1, 1, 0,0, TAG_MULTICOL ,0},
{"NOBR" , 1, 0, 1, 0,0, TAG_NOBR ,0},
{"NOFRAMES" , 1, 1, 1, 0,0, TAG_NOFRAMES ,0},
{"NOTE" , 1, 1, 1, 0,0, TAG_NOTE ,0},
{"OL" , 1, 1, 1, 1,1, TAG_OL ,0},
{"OVERLAY" , 0, 1, 1, 0,0, TAG_OVERLAY ,0},
// this may not have a back tag!
{"P" , 0, 1, 1, 1,1, TAG_P ,0},
{"PARAM" , 0, 1, 1, 0,0, TAG_PARAM ,0},
{"PERSON" , 1, 1, 1, 0,0, TAG_PERSON ,0},
{"PLAINTEXT", 1, 1, 1, 0,0, TAG_PLAINTEXT ,0},
{"PRE" , 1, 1, 1, 2,1, TAG_PRE ,0},
{"Q" , 1, 1, 1, 2,1, TAG_Q ,0},
{"RANGE" , 0, 1, 1, 0,0, TAG_RANGE ,0},
{"SAMP" , 1, 1, 1, 2,1, TAG_SAMP ,0},
{"SCRIPT" , 1, 1, 0, 0,0, TAG_SCRIPT ,0},
{"SELECT" , 1, 1, 0, 0,0, TAG_SELECT ,0},
{"SMALL" , 1, 0, 1, 2,1, TAG_SMALL ,0},
{"SPACER" , 0, 1, 1, 2,1, TAG_SPACER ,0},
{"SPOT" , 0, 1, 1, 0,0, TAG_SPOT ,0},
{"STRIKE" , 1, 1, 1, 2,1, TAG_STRIKE ,0},
{"STRONG" , 1, 0, 1, 2,1, TAG_STRONG ,0},
{"SUB" , 1, 0, 1, 2,2, TAG_SUB ,0},
{"SUP" , 1, 0, 1, 2,2, TAG_SUP ,0},
{"TAB" , 0, 1, 1, 0,0, TAG_TAB ,0},
{"TABLE" , 1, 1, 1, 1,1, TAG_TABLE ,0},
{"TBODY" , 1, 1, 1, 1,1, TAG_TBODY ,0},
// this may not have a back tag!
{"TD" , 1, 1, 1, 1,1, TAG_TD ,0},
{"TEXTAREA" , 1, 1, 1, 2,2, TAG_TEXTAREA ,0},
{"TEXTFLOW" , 0, 1, 1, 0,0, TAG_TEXTFLOW ,0},
{"TFOOT" , 0, 1, 1, 0,0, TAG_TFOOT ,0},
// this DOES have a back tag
{"TH" , 1, 1, 1, 0,0, TAG_TH ,0},
{"THEAD" , 0, 1, 1, 0,0, TAG_THEAD ,0},
{"TITLE" , 1, 1, 1, 1,1, TAG_TITLE ,0},
// this may not have a back tag!
{"TR" , 1, 1, 1, 1,1, TAG_TR ,0},
{"TT" , 1, 1, 1, 2,1, TAG_TT ,0},
{"U" , 1, 0, 1, 1,1, TAG_U ,0},
{"UL" , 1, 0, 1, 1,1, TAG_UL ,0},
{"VAR" , 1, 1, 1, 2,1, TAG_VAR ,0},
{"WBR" , 0, 1, 1, 0,0, TAG_WBR ,0},
{"XMP" , 1, 1, 1, 0,0, TAG_XMP ,0},
{"!--" , 0, 1, 1, 0,0, TAG_COMMENT ,0}, // comment tag!
{"OPTION" , 0, 1, 1, 2,2, TAG_OPTION ,0},
{"STYLE" , 1, 1, 0, 0,1, TAG_STYLE ,0},
// doctype tag <!DOCTYPE ...>
{"DOCTYPE" , 0, 1, 1, 0,0, TAG_DOCTYPE ,0},
// used in office.microsoft.com <?xml ...>
{"XML" , 0, 1, 1, 0,0, TAG_XML ,0},
// <start index> <stop index>
{"START" , 0, 1, 1, 0,0, TAG_START ,0},
{"STOP" , 0, 1, 1, 0,0, TAG_STOP ,0},
// . i added these tags for faisal, but don't really need them
// since our XML tag condition handles this case
// . we can no longer treat them as generic XML tags since faisal wanted
// the strip=2 option
{"SPAN" , 1, 0, 1, 2,1, TAG_SPAN ,0}, // not breaking!
{"LEGEND" , 1, 1, 1, 2,1, TAG_LEGEND ,0},
{"S" , 1, 1, 1, 2,1, TAG_S ,0}, // strike tag
{"ABBR" , 1, 0, 1, 2,1, TAG_ABBR ,0},
{"![CDATA[" , 0, 1, 1, 0,0, TAG_CDATA ,0}, // <![CDATA[ tag
{"NOSCRIPT" , 1, 1, 0, 0,0, TAG_NOSCRIPT,0},
{"FIELDSET" , 1, 1, 1, 0,0, TAG_FIELDSET,0},
// feedburner uses these in the xml
{"FEEDBURNER:ORIGLINK", 0, 1, 1, 0,0, TAG_FBORIGLINK ,1},
// ahrefs uses these as links
{"RDF:RDF",0, 1, 1, 0,0, TAG_RDF ,1},
{"RSS",0, 1, 1, 0,0, TAG_RSS ,1},
{"FEED",0, 1, 1, 0,0, TAG_FEED ,1},
{"ITEM",1, 1, 0, 0,0, TAG_ITEM ,1},
{"ENTRY",1, 1, 0, 0,0, TAG_ENTRY ,1},
{"CHANNEL",1, 1, 0, 0,0, TAG_CHANNEL ,1},
// NOTE(review): isXml is 0 here while the neighbouring feed tags use 1
// -- confirm that is intentional
{"ENCLOSURE",1, 1, 0, 0,0, TAG_ENCLOSURE ,0},
{"WEBLOG",0, 1, 0, 0,0, TAG_WEBLOG ,1},
{"GBFRAME", 1, 1, 1, 1,1, TAG_GBFRAME ,0},
{"TC" , 1, 1, 1, 1,1, TAG_TC ,0},// HACK: tbl column section
{"GBXMLTITLE", 1, 1, 1, 1,1, TAG_GBXMLTITLE,1},
// facebook xml
{"START_TIME", 1, 1, 1, 1,1, TAG_FBSTARTTIME,1},
{"END_TIME", 1, 1, 1, 1,1, TAG_FBENDTIME,1},
{"NAME", 1, 1, 1, 1,1, TAG_FBNAME,1},
{"PIC_SQUARE", 1, 1, 1, 1,1, TAG_FBPICSQUARE,1},
{"HIDE_GUEST_LIST", 1, 1, 1, 1,1, TAG_FBHIDEGUESTLIST,1},
{"scriptText",0, 1, 0, 0,0, TAG_SCRIPTTEXT,0 },
{"BUTTON" , 1, 1, 1, 0,0, TAG_BUTTON ,0},
{"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1},
// for sitemap.xml
{"LOC" , 0, 1, 1, 0,0, TAG_LOC,0}
// candidate rows that were never enabled:
//{"BUTTON" , 1, 1, 1, 2, 122,0},
//{"BDO" , 1, 1, 1, 2, 123,0},
//{"LABEL" , 1, 1, 1, 2, 124,0},
//{"LAYER" , 1, 1, 1, 2, 125}
};
// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
// . called by Xml class
// . parses the one node (tag, comment, CDATA section or run of text) that
//   begins at "node" and fills in this XmlNode's members from it
// . "node" must be \0 terminated
// . "version" is accepted but never consulted
// . returns the length of the node in bytes
int32_t XmlNode::set ( char *node , bool pureXml , int32_t version ) {
	// remember where the node begins
	m_node = node;

	// one-time sanity check: every g_nodes[] row must sit at the index
	// equal to its own node id because other code indexes by tag id
	static bool s_check = false;
	if ( ! s_check ) {
		s_check = true;
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		for ( int32_t k = 0 ; k < nn ; k++ ) {
			if ( g_nodes[k].m_nodeId == k ) continue;
			// table is out of order: crash on purpose
			char *xx=NULL;*xx=0;
		}
	}

	// . clear this here rather than in Links.cpp: an anchor tag can
	//   look like a link yet really be a
	//   <a href="javascript:..."> call that Links.cpp ignores, and
	//   callers still expect the flag to be valid
	m_isSelfLink = 0;

	// . a node opening with "<![CDATA[" is a CDATA node
	// . the compare stops at the first mismatching char, so it never
	//   reads beyond the \0 that terminates "node"
	static const char s_cdataOpen[] = "<![CDATA[";
	int32_t matched = 0;
	while ( s_cdataOpen[matched] &&
		node[matched] == s_cdataOpen[matched] )
		matched++;
	if ( ! s_cdataOpen[matched] )
		return setCDATANode ( node );

	// anything that does not open a tag is a text node (node id 0)
	bool opensTag = ( *node == '<' && isTagStart ( node ) );
	if ( ! opensTag ) {
		m_nodeId     = 0;
		m_node       = node;
		m_hasBackTag = false;
		m_hash       = 0;
		// the text runs until the next real tag start or the \0
		int32_t len = 0;
		for ( ; node[len] ; len++ ) {
			if ( node[len] == '<' && isTagStart ( node + len ) )
				break;
		}
		m_nodeLen    = len;
		m_pairTagNum = -1;
		return m_nodeLen;
	}

	// comments get special handling since they end differently
	if ( node[1] == '!' ) {
		// <!-- ... -->
		if ( node[2]=='-' && node[3]=='-' )
			return setCommentNode ( node );
		// <![if ....]> is treated as a comment too
		if ( node[2]=='[' )
			return setCommentNode2 ( node );
	}

	// . a regular tag from here on
	// . could still be <!DOCTYPE ...> or similar
	m_nodeLen = getTagLen ( node );

	// . the tag name starts right after the '<', or one char later when
	//   that char is not alphanumeric (the '/' of a back tag, '?', '!')
	int32_t nameStart = 1;
	if ( ! is_alnum_a(node[nameStart]) ) nameStart = 2;

	// the name extends over every following tag name char
	int32_t nameEnd = nameStart;
	while ( nameEnd < m_nodeLen && is_tagname_char(node[nameEnd]) )
		nameEnd++;

	m_tagName    = node + nameStart;
	m_tagNameLen = nameEnd - nameStart;

	// . hash the upper-cased name; comparing hashes beats strcmp
	m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL);

	// . pure xml first defaults these three flags to true
	// . setNodeInfo() assigns the same three members on every one of
	//   its paths, so the values it picks are the ones that remain
	if ( pureXml ) {
		m_hasBackTag = true;
		m_isBreaking = true;
		m_isVisible  = true;
	}

	// . look up the node id and flags for this tag name
	// . yields 1 when the name is not in the g_nodes list
	m_nodeId = setNodeInfo ( m_hash );

	// . a tag closing with "/>" or "?>" has no back tag
	// . this applies to every tag, not only pure xml ones
	char beforeEnd = m_node [ m_nodeLen - 2 ];
	if ( beforeEnd == '/' || beforeEnd == '?' ) m_hasBackTag = false;

	return m_nodeLen;
}
// . return the length, in bytes, of the tag that starts at "node"
// . "node" must point at the tag's opening '<' and be \0 terminated
// . a '<' or '>' that sits inside a quoted attribute value (double or
//   single quotes) does not end the tag
// . when the quote-aware scan finds the closing '>' it is included in the
//   returned length
// . when that scan finds no '>' the tag is re-scanned ignoring quotes and
//   the length stops just before the first '>' or '<' (or at the \0)
int32_t getTagLen ( char *node ) { // , int32_t version ) {
	int32_t i ;
	// . keep looping until we hit a < or > that is not in quotes
	// . start at 1 to skip over the opening <
	for ( i = 1 ; node[i] ; i++ ) {
		// only these four chars need any attention
		if ( node[i] != '<'  &&
		     node[i] != '>'  &&
		     node[i] != '\"' &&
		     node[i] != '\'' )
			continue;
		// an unquoted < or > ends the tag
		if ( node[i] == '<' ) break;
		if ( node[i] == '>' ) break;
		// we can have double quotes within single quotes
		if ( node [ i ] == '\"' ) {
			// scan back over whitespace looking for an equal sign
			int32_t k;
			for ( k = i - 1 ; k > 1 ; k-- ) {
				if ( is_wspace_a(node[k]) ) continue;
				break;
			}
			if ( k <= 1 ) continue;
			// . if an equal sign did not immediately precede
			//   this double quote then ignore the double quote
			// . this handles an img tag running into a </a>, as
			//   seen on harwoodmuseum.org
			if ( node[k] != '=' ) continue;
			// skip over this first quote
			i++;
			while ( node[i] && node[i]!='\"' ) {
				// some pages have unbalanced quotes: treat
				// a '>' right after a '"' as the tag end.
				// see /test/doc.14541556377486183454.html
				if ( node[i  ]=='>' &&
				     node[i-1]=='\"' ) {
					i--;
					break;
				}
				// . do NOT stop on "</" here: that breaks
				//   onclick="tb_show('<b>Calendar</b>'
				//   where the </b> is legitimately quoted
				// same as above but with one space between
				if ( node[i  ]=='>' &&
				     node[i-1]==' ' &&
				     node[i-2]=='\"' ) {
					i--;
					break;
				}
				// skip this char
				i++;
			}
			// return the length if tag ended abruptly
			if ( ! node[i] ) return i;
			// back-to-back quotes? common mistake
			if ( node[i+1] == '\"' ) i++;
			continue;
		}
		// only a single quote preceded by '=' or whitespace opens a
		// quoted value
		if ( node [ i ] != '\'' ) continue;
		if ( node[i-1] != '=' && !is_wspace_a( node[i-1] ) ) continue;
		// . skip to the closing single quote
		// . the scan must begin AFTER the opening quote; starting on
		//   the opening quote itself would match it at once and
		//   skip nothing, letting a quoted '<' or '>' end the tag
		int32_t q = i + 1;
		while ( node[q] && node[q]!='\'' ) q++;
		// unbalanced quote: treat it as an ordinary char and keep
		// scanning from where we were
		if ( ! node[q] ) continue;
		// land on the closing quote; the loop's i++ steps past it
		i = q;
	}
	// skip i over the >
	if ( node[i] == '>' ) i++;
	// . else we found no closure outside of quotes so be more stringent
	// . take the first > or < regardless of quotes
	else for ( i=1; node[i] && node[i] != '>' && node[i] != '<';i++);
	// return the LENGTH of the whole node
	return i ;
}
// . set this node up as a "<!-- ... -->" comment that begins at "node"
// . the comment ends at the first "-->" found; quotes and nested comments
//   get no special treatment
// . returns the node length, which includes the closing '>' when there is
//   one (the doc may end before the comment is closed)
int32_t XmlNode::setCommentNode ( char *node ) {
	m_node       = node;
	// the tag name is the "!--" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 3;
	m_hash       = hash64 ( "!--" , 3 , 0LL );
	m_nodeId     = TAG_COMMENT;
	m_hasBackTag = false;
	m_isBreaking = true;
	m_isVisible  = true;
	// walk forward until a '>' that directly follows "--"
	int32_t len = 3;
	while ( node[len] ) {
		if ( node[len  ] == '>' &&
		     node[len-1] == '-' &&
		     node[len-2] == '-' )
			break;
		len++;
	}
	// include the '>' if we stopped on one and not on the \0
	if ( node[len] == '>' ) len++;
	m_nodeLen = len;
	return len;
}
// . set this node up as a "<![ ... ]>" style comment, such as
//   <![if gt IE 6]> or <![endif]-->, that begins at "node"
// . unlike a regular comment this one is neither breaking nor visible
// . returns the node length, which includes the closing '>' when there is
//   one (the doc may end first)
int32_t XmlNode::setCommentNode2 ( char *node ) {
	m_node       = node;
	// the tag name is the "![" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 2;
	m_hash       = hash64 ( "![" , 2 , 0LL );
	m_nodeId     = TAG_COMMENT;
	m_hasBackTag = false;
	m_isBreaking = false;
	m_isVisible  = false;
	// walk forward to the '>' that closes the node
	int32_t len = 2;
	while ( node[len] ) {
		if ( node[len] == '>' ) {
			// ends in "]>" like <![if gt IE 6]>
			if ( node[len-1] == ']' ) break;
			// ends in "-->" like <![endif]-->
			if ( node[len-1] == '-' &&
			     node[len-2] == '-' ) break;
		}
		len++;
	}
	// include the '>' if we stopped on one and not on the \0
	if ( node[len] == '>' ) len++;
	m_nodeLen = len;
	return len;
}
// . set this node up as a "<![CDATA[ ... ]]>" section that begins at "node"
// . the section ends only at a full "]]>"; a bare "]]" is not enough,
//   since stopping there would cut regexes that end in [0-9]]
// . returns the node length, which includes the "]]>" when it is present;
//   otherwise the node runs to the end of the doc
int32_t XmlNode::setCDATANode ( char *node ) {
	m_node       = node;
	// the tag name is the "![CDATA[" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 8;
	m_hash       = hash64 ( "![CDATA[" , 8 , 0LL );
	m_nodeId     = TAG_CDATA;
	m_hasBackTag = false;
	m_isBreaking = true;
	m_isVisible  = true;
	// . scan for the terminating "]]>"
	// . the && chain only looks at the next char after a ']' matched,
	//   so it never reads beyond the terminating \0
	int32_t end = 8;
	for ( ; node[end] ; end++ ) {
		if ( node[end  ] == ']' &&
		     node[end+1] == ']' &&
		     node[end+2] == '>' ) {
			// step over the "]]>" so it is part of this node
			end += 3;
			break;
		}
	}
	m_nodeLen = end;
	return end;
}
// . return the value of the attribute named "field" within this node
// . the case of "field" does not matter
// . "valueLen" receives the length of the value; it is 0 when NULL is
//   returned
// . the returned pointer points into the node's own text, the value is NOT
//   \0 terminated and any surrounding quotes are excluded
// . returns NULL when the field is not in the node or is not followed by
//   an '=' (this includes a node that ends right after the field name)
char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) {
	// reset this to 0
	*valueLen = 0;
	int32_t flen = gbstrlen(field);
	char inQuotes = '\0';
	int32_t i;
	// scan the characters in the node, looking for the field name
	for ( i = 1; i + flen < m_nodeLen ; i++ ) {
		// text inside a quoted value can never be a field name
		if ( inQuotes ) {
			if ( m_node[i] == inQuotes ) inQuotes = 0;
			continue;
		}
		// remember which quote char opened the value
		if ( m_node[i]=='\"' || m_node[i]=='\'' ) {
			inQuotes = m_node[i];
			continue;
		}
		// a field name must be preceded by a non-alnum
		if ( is_alnum_a ( m_node[i-1] ) ) continue;
		// cheap first-char test before the full compare
		if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue;
		// field must be immediately followed by an = or whitespace
		if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue;
		// field names must match
		if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue;
		// got a match for our field name
		break;
	}
	// return NULL if no matching field
	if ( i + flen >= m_nodeLen ) return NULL;
	// advance i over the fieldname so it pts to = or space
	i += flen;
	// advance i over spaces
	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
	// . there must be an equal sign here
	// . running off the end of the node also means there is none; the
	//   '=' test must not be skipped just because the node ended
	if ( i >= m_nodeLen || m_node[i] != '=' ) return NULL;
	// advance over the equal sign
	i++;
	// advance i over spaces after the equal sign
	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
	// the value may be quoted; only look at chars inside this node
	inQuotes = '\0';
	if ( i < m_nodeLen && ( m_node[i]=='\"' || m_node[i]=='\'' ) )
		inQuotes = m_node[i++];
	// mark this as the start of the value
	int32_t start = i;
	// a quoted value runs to the matching quote, an unquoted one to the
	// next whitespace or '>'
	if ( inQuotes ) {
		while ( i < m_nodeLen && m_node[i] != inQuotes )
			i++;
	}
	else {
		while ( i < m_nodeLen &&
			! is_wspace_a ( m_node[i] ) &&
			m_node[i] != '>' )
			i++;
	}
	// set the length of the value
	*valueLen = i - start;
	// return a ptr to the value
	return m_node + start;
}
#include "HashTableX.h"
// . map the tag name beginning at "s" to its node id
// . "s" points at the first char of the name (after any '<' or '/'); the
//   name ends at the first char that is not alphanumeric, '-' or '_'
// . the lookup ignores case because names are hashed upper-cased
// . if "retp" is not NULL it receives a pointer to the matching g_nodes[]
//   row, or NULL when the name is unknown
// . returns 0 when the name is unknown
// . the lookup table is built on the first call
nodeid_t getTagId ( char *s , NodeType **retp ) {
	// init table?
	static bool s_init = false;
	static HashTableX s_ht;
	static char s_buf[10000];
	if ( ! s_init ) {
		// set before filling so the self-test below does not recurse
		// back into this init
		s_init = true;
		// NOTE(review): the two leading 4s are presumably the key
		// and data sizes in bytes -- confirm against HashTableX.h
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
		// how many NodeTypes do we have in g_nodes?
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		// set the hash table
		for ( int32_t i = 0 ; i < nn ; i++ ) {
			char   *name = g_nodes[i].m_nodeName;
			int32_t nlen = gbstrlen(name);
			int64_t h    = hash64Upper_a ( name,nlen,0LL );
			// . store the row's INDEX, not a pointer to the row
			// . a pointer is 8 bytes on 64-bit builds and would
			//   not fit a 4-byte data slot
			int32_t idx = i;
			if ( ! s_ht.addKey(&h,&idx) ) {
				char *xx=NULL;*xx=0; }
		}
		// sanity: the table must not have grown
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// sanity test
		nodeid_t tt = getTagId ( "br" );
		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
	}
	// find end of tag name. hyphens are ok to be in name.
	// facebook uses underscores like <start_time>
	char *e = s; for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
	// hash it for lookup
	int64_t h = hash64Upper_a ( s , e - s , 0 );
	// look it up; the stored value is an index into g_nodes[]
	int32_t *idxp = (int32_t *)s_ht.getValue(&h);
	// assume none
	if ( retp ) *retp = NULL;
	// none?
	if ( ! idxp ) return 0;
	// got one
	NodeType *nt = &g_nodes[*idxp];
	if ( retp ) *retp = nt;
	return nt->m_nodeId;
}
// . look up the tag whose upper-cased name hashes to "nodeHash" and copy
//   its m_hasBackTag, m_isBreaking and m_isVisible flags into this node
// . returns the tag's node id from g_nodes[]
// . returns 1 (generic xml node) when the hash matches no known tag; the
//   node is then marked breaking, visible and as having a back tag
nodeid_t XmlNode::setNodeInfo ( int64_t nodeHash ){
	// . open-addressed table, built once, that maps the hash of a node
	//   name to that name's row number in g_nodes[]
	// . 512 buckets; a bucket whose hash is 0 is empty
	static bool     s_isHashed = false;
	static int64_t  s_hash [512];
	static nodeid_t s_num  [512];
	// how many NodeTypes do we have in g_nodes?
	static int32_t s_numNodeTypes = sizeof(g_nodes) / sizeof(NodeType);

	// the table is static so it only needs filling on the first call
	if ( ! s_isHashed ) {
		// mark every bucket empty
		memset ( s_hash , 0 , 8*512 );
		for ( int32_t i = 0 ; i < s_numNodeTypes ; i++ ) {
			int64_t h = hash64Upper_a (
				g_nodes[i].m_nodeName ,
				gbstrlen ( g_nodes[i].m_nodeName ) ,
				0LL );
			// linear probe, wrapping, to the first free bucket
			int32_t slot = (uint64_t)h & 511;
			while ( s_hash[slot] ) {
				slot++;
				if ( slot == 512 ) slot = 0;
			}
			s_hash [ slot ] = h;
			s_num  [ slot ] = i;
		}
		// don't do the hashing again
		s_isHashed = true;
	}

	// probe for nodeHash until it turns up or an empty bucket is hit
	int32_t b = (uint64_t)nodeHash & 511;
	for ( ; s_hash[b] ; ) {
		if ( s_hash[b] == nodeHash ) break;
		b++;
		if ( b == 512 ) b = 0;
	}

	// an empty bucket means an xml node (or unrecognized html node)
	if ( s_hash[b] == 0 ) {
		// default is breaking, has back tag and is indexable
		m_isBreaking = true;
		m_hasBackTag = true;
		m_isVisible  = true;
		return 1;
	}

	// otherwise take the flags and node id from the matching row
	int32_t n = s_num[b];
	m_hasBackTag = g_nodes [ n ].m_hasBackTag;
	m_isBreaking = g_nodes [ n ].m_isBreaking;
	m_isVisible  = g_nodes [ n ].m_isVisible;
	return g_nodes [ n ].m_nodeId;
}
// . return the number of rows (node types) in the g_nodes[] table
// . divide by the size of a g_nodes[] element; dividing by
//   sizeof(XmlNode), an unrelated class, gives the wrong count
int32_t getNumXmlNodes ( ) {
	return (int32_t)( sizeof(g_nodes) / sizeof(g_nodes[0]) );
}
#include "Words.h" // BACKBITCOMP
// . report whether the tag with id "tagId" is a breaking tag
// . "tagId" is masked with BACKBITCOMP and the result is used directly as
//   an index into g_nodes[]
bool isBreakingTagId ( nodeid_t tagId ) {
	NodeType *nt = &g_nodes [ tagId & BACKBITCOMP ];
	return nt->m_isBreaking;
}
// . report whether the tag with id "tagId" has a back (closing) tag
// . "tagId" is masked with BACKBITCOMP and the result is used directly as
//   an index into g_nodes[]
bool hasBackTag ( nodeid_t tagId ) {
	NodeType *nt = &g_nodes [ tagId & BACKBITCOMP ];
	return nt->m_hasBackTag;
}