// open-source-search-engine/XmlNode.cpp
//
// 738 lines
// 24 KiB
// C++

#include "gb-include.h"
#include "XmlNode.h"
#include "Mem.h"
// . Here is the list of all the HTML node (tag) names we know about, along
//   with whether each one has a back tag, whether it is a breaking node, and
//   its node id
// . isVisible is true if the text between the front and back tags is visible
//   on the page
// . isVisible is used by Xml::getText()
// . filterKeep says what to do with the tag when &strip=1 is given when
//   getting the cached document. i added this for faisal
// . a filterKeep of 0 means remove the tag and the text between it and its
//   back tag.
// . a filterKeep of 1 means keep the tag and the text between it and its
//   back tag.
// . a filterKeep of 2 means remove the tag BUT keep the text between it and
//   its back tag.
NodeType g_nodes[] = {
// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
// isXml? (the last field)
// --------------------------
// -- text node --- 0
{"textNode" , 0, 0, 1, 1,1, TAG_TEXTNODE ,0},
// -- xml tag node --- 1
{"xmlTag" , 1, 1, 1, 2,2, TAG_XMLTAG ,0},
{"A" , 1, 0, 1, 1,1, TAG_A ,0},
{"ABBREV" , 1, 1, 1, 2,2, TAG_ABBREV ,0},
{"ACRONYM" , 1, 1, 1, 2,1, TAG_ACRONYM ,0},
{"ADDRESS" , 1, 1, 1, 2,2, TAG_ADDRESS ,0},
{"APPLET" , 1, 1, 1, 0,0, TAG_APPLET ,0},
{"AREA" , 0, 1, 1, 0,0, TAG_AREA ,0},
{"AU" , 1, 1, 1, 0,0, TAG_AU ,0},
{"AUTHOR" , 1, 1, 1, 0,0, TAG_AUTHOR ,0},
{"B" , 1, 0, 1, 1,1, TAG_B ,0},
{"BANNER" , 1, 1, 1, 0,0, TAG_BANNER ,0},
{"BASE" , 0, 1, 1, 0,0, TAG_BASE ,0},
{"BASEFONT" , 0, 1, 1, 2,2, TAG_BASEFONT ,0},
{"BGSOUND" , 0, 1, 1, 0,0, TAG_BGSOUND ,0},
{"BIG" , 1, 0, 1, 2,1, TAG_BIG ,0},
{"BLINK" , 1, 0, 1, 2,2, TAG_BLINK ,0},
{"BLOCKQUOTE",1, 1, 1, 2,1, TAG_BLOCKQUOTE ,0},
{"BQ" , 1, 1, 1, 0,0, TAG_BQ ,0},
{"BODY" , 1, 1, 1, 1,1, TAG_BODY ,0},
{"BR" , 0, 1, 1, 1,1, TAG_BR ,0},
{"CAPTION" , 1, 1, 1, 2,1, TAG_CAPTION ,0},
{"CENTER" , 1, 1, 1, 1,1, TAG_CENTER ,0},
{"CITE" , 1, 1, 1, 2,1, TAG_CITE ,0},
{"CODE" , 1, 1, 1, 2,1, TAG_CODE ,0},
{"COL" , 1, 1, 1, 2,2, TAG_COL ,0},
{"COLGROUP" , 1, 1, 1, 0,0, TAG_COLGROUP ,0},
{"CREDIT" , 1, 1, 1, 0,0, TAG_CREDIT ,0},
{"DEL" , 1, 1, 1, 2,1, TAG_DEL ,0},
{"DFN" , 1, 1, 1, 2,1, TAG_DFN ,0},
{"DIR" , 1, 1, 1, 0,0, TAG_DIR ,0},
// MDW: wtf, these have back tags!
// MDW: ok, i fixed it!
{"DIV" , 1, 1, 1, 1,1, TAG_DIV ,0},
{"DL" , 1, 1, 1, 1,1, TAG_DL ,0},
// this may not have a back tag!
{"DT" , 1, 1, 1, 1,1, TAG_DT ,0},
// this may not have a back tag!
{"DD" , 1, 1, 1, 1,1, TAG_DD ,0},
{"EM" , 1, 0, 1, 2,1, TAG_EM ,0}, // emphasized text
{"EMBED" , 0, 1, 1, 0,0, TAG_EMBED ,0},
{"FIG" , 1, 1, 1, 0,0, TAG_FIG ,0},
{"FN" , 1, 1, 1, 0,0, TAG_FN ,0},
{"FONT" , 1, 0, 1, 1,1, TAG_FONT ,0},
{"FORM" , 1, 1, 1, 2,2, TAG_FORM ,0},
// this may not have a back tag!
{"FRAME" , 1, 1, 1, 0,0, TAG_FRAME ,0},
{"FRAMESET" , 1, 1, 1, 0,0, TAG_FRAMESET ,0},
{"H1" , 1, 1, 1, 1,1, TAG_H1 ,0},
{"H2" , 1, 1, 1, 1,1, TAG_H2 ,0},
{"H3" , 1, 1, 1, 1,1, TAG_H3 ,0},
{"H4" , 1, 1, 1, 1,1, TAG_H4 ,0},
{"H5" , 1, 1, 1, 1,1, TAG_H5 ,0},
{"H6" , 1, 1, 1, 1,1, TAG_H6 ,0},
{"HEAD" , 1, 1, 1, 1,1, TAG_HEAD ,0},
{"HR" , 0, 1, 1, 1,1, TAG_HR ,0},
{"HTML" , 1, 1, 1, 1,1, TAG_HTML ,0},
{"I" , 1, 0, 1, 2,1, TAG_I ,0},
{"IFRAME" , 1, 1, 1, 2,2, TAG_IFRAME ,0},
// filter = 1,but tag is turned to alt
{"IMG" , 0, 1, 1, 1,1, TAG_IMG ,0},
{"INPUT" , 0, 1, 1, 0,0, TAG_INPUT ,0},
{"INS" , 1, 1, 1, 2,1, TAG_INS ,0},
{"ISINDEX" , 0, 1, 1, 0,0, TAG_ISINDEX ,0},
{"KBD" , 1, 1, 1, 2,1, TAG_KBD ,0},
{"LANG" , 1, 1, 1, 0,0, TAG_LANG ,0},
{"LH" , 1, 1, 1, 0,0, TAG_LH ,0},
// this may or may not have a back tag
{"LI" , 1, 1, 1, 1,1, TAG_LI ,0},
// this may or may not have a back tag
{"LINK" , 0, 1, 1, 0,0, TAG_LINK ,0},
{"LISTING" , 1, 1, 1, 0,0, TAG_LISTING ,0},
{"MAP" , 1, 1, 1, 0,0, TAG_MAP ,0},
// don't index marquee text
{"MARQUEE" , 1, 1, 0, 2,2, TAG_MARQUEE ,0},
{"MATH" , 1, 1, 1, 0,0, TAG_MATH ,0},
{"MENU" , 1, 1, 1, 1,1, TAG_MENU ,0},
{"META" , 0, 1, 1, 1,1, TAG_META ,0},
{"MULTICOL" , 0, 1, 1, 0,0, TAG_MULTICOL ,0},
{"NOBR" , 1, 0, 1, 0,0, TAG_NOBR ,0},
{"NOFRAMES" , 1, 1, 1, 0,0, TAG_NOFRAMES ,0},
{"NOTE" , 1, 1, 1, 0,0, TAG_NOTE ,0},
{"OL" , 1, 1, 1, 1,1, TAG_OL ,0},
{"OVERLAY" , 0, 1, 1, 0,0, TAG_OVERLAY ,0},
// this may not have a back tag!
{"P" , 0, 1, 1, 1,1, TAG_P ,0},
{"PARAM" , 0, 1, 1, 0,0, TAG_PARAM ,0},
{"PERSON" , 1, 1, 1, 0,0, TAG_PERSON ,0},
{"PLAINTEXT", 1, 1, 1, 0,0, TAG_PLAINTEXT ,0},
{"PRE" , 1, 1, 1, 2,1, TAG_PRE ,0},
{"Q" , 1, 1, 1, 2,1, TAG_Q ,0},
{"RANGE" , 0, 1, 1, 0,0, TAG_RANGE ,0},
{"SAMP" , 1, 1, 1, 2,1, TAG_SAMP ,0},
{"SCRIPT" , 1, 1, 0, 0,0, TAG_SCRIPT ,0},
{"SELECT" , 1, 1, 0, 0,0, TAG_SELECT ,0},
{"SMALL" , 1, 0, 1, 2,1, TAG_SMALL ,0},
{"SPACER" , 0, 1, 1, 2,1, TAG_SPACER ,0},
{"SPOT" , 0, 1, 1, 0,0, TAG_SPOT ,0},
{"STRIKE" , 1, 1, 1, 2,1, TAG_STRIKE ,0},
{"STRONG" , 1, 0, 1, 2,1, TAG_STRONG ,0},
{"SUB" , 1, 0, 1, 2,2, TAG_SUB ,0},
{"SUP" , 1, 0, 1, 2,2, TAG_SUP ,0},
{"TAB" , 0, 1, 1, 0,0, TAG_TAB ,0},
{"TABLE" , 1, 1, 1, 1,1, TAG_TABLE ,0},
{"TBODY" , 1, 1, 1, 1,1, TAG_TBODY ,0},
// this may not have a back tag!
{"TD" , 1, 1, 1, 1,1, TAG_TD ,0},
{"TEXTAREA" , 1, 1, 1, 2,2, TAG_TEXTAREA ,0},
{"TEXTFLOW" , 0, 1, 1, 0,0, TAG_TEXTFLOW ,0},
{"TFOOT" , 0, 1, 1, 0,0, TAG_TFOOT ,0},
// this DOES have a back tag
{"TH" , 1, 1, 1, 0,0, TAG_TH ,0},
{"THEAD" , 0, 1, 1, 0,0, TAG_THEAD ,0},
{"TITLE" , 1, 1, 1, 1,1, TAG_TITLE ,0},
// this may not have a back tag!
{"TR" , 1, 1, 1, 1,1, TAG_TR ,0},
{"TT" , 1, 1, 1, 2,1, TAG_TT ,0},
{"U" , 1, 0, 1, 1,1, TAG_U ,0},
{"UL" , 1, 0, 1, 1,1, TAG_UL ,0},
{"VAR" , 1, 1, 1, 2,1, TAG_VAR ,0},
{"WBR" , 0, 1, 1, 0,0, TAG_WBR ,0},
{"XMP" , 1, 1, 1, 0,0, TAG_XMP ,0},
{"!--" , 0, 1, 1, 0,0, TAG_COMMENT ,0}, // comment tag!
{"OPTION" , 0, 1, 1, 2,2, TAG_OPTION ,0},
{"STYLE" , 1, 1, 0, 0,1, TAG_STYLE ,0},
// doctype tag <!DOCTYPE ...>
{"DOCTYPE" , 0, 1, 1, 0,0, TAG_DOCTYPE ,0},
// used in office.microsoft.com <?xml ...>
{"XML" , 0, 1, 1, 0,0, TAG_XML ,0},
// <start index> <stop index>
{"START" , 0, 1, 1, 0,0, TAG_START ,0},
{"STOP" , 0, 1, 1, 0,0, TAG_STOP ,0},
// . i added these tags for faisal, but don't really need them
// since our XML tag condition handles this case
// . we can no longer treat as a generic XML tags since faisal wanted
// the strip=2 option
{"SPAN" , 1, 0, 1, 2,1, TAG_SPAN ,0}, // not breaking!
{"LEGEND" , 1, 1, 1, 2,1, TAG_LEGEND ,0},
{"S" , 1, 1, 1, 2,1, TAG_S ,0}, // strike tag
{"ABBR" , 1, 0, 1, 2,1, TAG_ABBR ,0},
{"![CDATA[" , 0, 1, 1, 0,0, TAG_CDATA ,0}, // <![CDATA[ tag
{"NOSCRIPT" , 1, 1, 0, 0,0, TAG_NOSCRIPT,0},
{"FIELDSET" , 1, 1, 1, 0,0, TAG_FIELDSET,0},
// feedburner uses these in the xml
{"FEEDBURNER:ORIGLINK", 0, 1, 1, 0,0, TAG_FBORIGLINK ,1},
// ahrefs uses these as links
{"RDF:RDF",0, 1, 1, 0,0, TAG_RDF ,1},
{"RSS",0, 1, 1, 0,0, TAG_RSS ,1},
{"FEED",0, 1, 1, 0,0, TAG_FEED ,1},
{"ITEM",1, 1, 0, 0,0, TAG_ITEM ,1},
{"ENTRY",1, 1, 0, 0,0, TAG_ENTRY ,1},
{"CHANNEL",1, 1, 0, 0,0, TAG_CHANNEL ,1},
{"ENCLOSURE",1, 1, 0, 0,0, TAG_ENCLOSURE ,0},
{"WEBLOG",0, 1, 0, 0,0, TAG_WEBLOG ,1},
{"GBFRAME", 1, 1, 1, 1,1, TAG_GBFRAME ,0},
{"TC" , 1, 1, 1, 1,1, TAG_TC ,0},// HACK: tbl column section
{"GBXMLTITLE", 1, 1, 1, 1,1, TAG_GBXMLTITLE,1},
// facebook xml
{"START_TIME", 1, 1, 1, 1,1, TAG_FBSTARTTIME,1},
{"END_TIME", 1, 1, 1, 1,1, TAG_FBENDTIME,1},
{"NAME", 1, 1, 1, 1,1, TAG_FBNAME,1},
{"PIC_SQUARE", 1, 1, 1, 1,1, TAG_FBPICSQUARE,1},
{"HIDE_GUEST_LIST", 1, 1, 1, 1,1, TAG_FBHIDEGUESTLIST,1},
{"scriptText",0, 1, 0, 0,0, TAG_SCRIPTTEXT,0 },
{"BUTTON" , 1, 1, 1, 0,0, TAG_BUTTON ,0},
{"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1}
//{"BUTTON" , 1, 1, 1, 2, 122,0},
//{"BDO" , 1, 1, 1, 2, 123,0},
//{"LABEL" , 1, 1, 1, 2, 124,0},
//{"LAYER" , 1, 1, 1, 2, 125}
};
// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
// . called by the Xml class to parse the single node that starts at "node"
// . classifies the node as a CDATA node, a text node, a comment node or a
//   regular tag and fills in this XmlNode's members accordingly
// . if "pureXml" is true every regular tag is treated as a generic xml tag
// . "version" is currently unused
// . returns the length of the node in bytes
// . TODO: "node" is now guaranteed to be \0 terminated -- make this faster
long XmlNode::set ( char *node , bool pureXml , long version ) {
	// remember where this node begins
	m_node = node;
	// . one-time sanity check of the g_nodes table
	// . every entry must sit at the index equal to its own m_nodeId
	static bool s_check = false;
	if ( ! s_check ) {
		s_check = true;
		// how many NodeTypes do we have in g_nodes?
		static long nn = sizeof(g_nodes) / sizeof(NodeType);
		for ( long k = 0 ; k < nn ; k++ ) {
			if ( g_nodes[k].m_nodeId == k ) continue;
			// table is out of order: crash on purpose
			char *xx=NULL;*xx=0;
		}
	}
	// . reset this
	// . need to do here instead of in Links.cpp because sometimes
	//   we think an anchor tag indicates a link, but it is really
	//   just an <a href="javascript:..."> function call and Links.cpp
	//   ignored it but we are expecting this to be valid!
	m_isSelfLink = 0;
	// . CDATA used to be identified as a text node in earlier versions,
	//   now it gets its own tag node regardless of the version passed in
	// . compare char by char so we never read past a \0 in "node"
	static const char s_cdataOpen[] = "<![CDATA[";
	long c = 0;
	while ( s_cdataOpen[c] && node[c] == s_cdataOpen[c] ) c++;
	if ( ! s_cdataOpen[c] ) return setCDATANode ( node );
	// anything that does not begin a tag is a text node
	if ( *node != '<' || ! isTagStart ( node ) ) {
		// nodeId for text nodes is 0
		m_nodeId     = 0;
		m_node       = node;
		m_hasBackTag = false;
		m_hash       = 0;
		// the text runs until the next char that begins a real tag
		long len = 0;
		for ( ; node[len] ; len++ ) {
			if ( node[len] == '<' && isTagStart ( node + len ) )
				break;
		}
		m_nodeLen    = len;
		m_pairTagNum = -1;
		return m_nodeLen;
	}
	// . "<!" introduces the special cases
	// . "<!--" is a regular comment (ends in "-->")
	// . "<![" as in <![if ....]> is treated as a comment too
	if ( node[1] == '!' ) {
		if ( node[2] == '-' && node[3] == '-' )
			return setCommentNode ( node );
		if ( node[2] == '[' )
			return setCommentNode2 ( node );
	}
	// . otherwise it's a regular tag
	// . might be <!DOCTYPE ...> or something though
	m_nodeLen = getTagLen ( node );
	// . the tag name normally starts right after the '<'
	// . but skip one leading non-alnum char like the / in back tags,
	//   or a ? or !
	long nameBegin = 1;
	if ( ! is_alnum_a ( node[nameBegin] ) ) nameBegin++;
	// the name ends at the first char that is not a tag name char
	long nameEnd = nameBegin;
	while ( nameEnd < m_nodeLen && is_tagname_char ( node[nameEnd] ) )
		nameEnd++;
	m_tagName    = node + nameBegin;
	m_tagNameLen = nameEnd - nameBegin;
	// . hash the name as upper case, comparing hashes beats strcmp
	// . tag names are never utf8, so use the ascii hash
	m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL );
	if ( ! pureXml ) {
		// look up the node id; this also sets m_hasBackTag,
		// m_isBreaking and m_isVisible
		m_nodeId = setNodeInfo ( m_hash );
	}
	else {
		// in pure xml mode do not recognize any html tags
		m_hasBackTag = true;
		m_isBreaking = true;
		m_isVisible  = true;
		m_nodeId     = TAG_XMLTAG;
	}
	// . no back tag if the tag closes itself like <br/> or <?xml ... ?>
	// . this was only for "pureXml" but now i do it for all tags!
	char beforeClose = m_node [ m_nodeLen - 2 ];
	if ( beforeClose == '/' || beforeClose == '?' ) m_hasBackTag = false;
	return m_nodeLen;
}
// . return the length, in bytes, of the tag that starts at "node"
// . "node" must point at the opening '<' and must be \0 terminated
// . a '<' or '>' inside a quoted attribute value does not end the tag
// . if a '>' is found outside of quotes it is included in the length
// . otherwise we fall back to the first '<' or '>' after the opening '<'
//   (or the end of the buffer) and that char is NOT included in the length
// . if a double quoted value runs to the end of the buffer, the length up
//   to the end of the buffer is returned
long getTagLen ( char *node ) { // , long version ) {
	long i ;
	// . keep looping until we hit a < or > OR while we're in quotes
	// . ignore < and > when they're in quotes
	for ( i = 1 ; node[i] ; i++ ) {
		// quickly skip every char that cannot affect the scan
		if ( node[i] != '<'  &&
		     node[i] != '>'  &&
		     node[i] != '\"' &&
		     node[i] != '\'' )
			continue;
		// an unquoted < or > ends the scan
		if ( node[i] == '<' ) break;
		if ( node[i] == '>' ) break;
		// we can have double quotes within single quotes
		if ( node [ i ] == '\"' ) {
			// scan back over whitespace looking for equal sign...
			long k; for ( k = i - 1 ; k > 1 ; k-- ) {
				if ( is_wspace_a(node[k]) ) continue;
				break;
			}
			if ( k <= 1 ) continue;
			// . if an equal sign did not immediately preceed
			//   this double quote then ignore the double quote
			// . this fixes the harwoodmuseum.org issue where an
			//   img tag ran into a following </a>
			//   (http://www.harwoodmuseum.org/press_detail.php?ID=44)
			if ( node[k] != '=' ) continue;
			// skip over this first quote
			i++;
			while ( node[i] && node[i]!='\"' ) {
				// some pages have unbalanced quotes, so
				// treat ="> as the end of the value.
				// see /test/doc.14541556377486183454.html
				if ( node[i  ]=='>' &&
				     node[i-1]=='\"' ) {
					i--;
					break;
				}
				// . we do NOT stop on a "</" in here since
				//   that breaks legit values like
				//   onclick="tb_show('<b>Community Calendar</b>'
				// . but do treat =" > as the end of the value
				if ( node[i  ]=='>' &&
				     node[i-1]==' ' &&
				     node[i-2]=='\"' ) {
					i--;
					break;
				}
				// skip this char
				i++;
			}
			// return the length if tag ended abruptly
			if ( ! node[i] ) return i;
			// back-to-back quotes? common mistake
			if ( node[i+1] == '\"' ) i++;
			continue;
		}
		// . it must be a single quote at this point
		// . only treat it as an opening quote if it follows an equal
		//   sign or whitespace, i.e. "='" or " '"
		if ( node[i-1] != '=' && !is_wspace_a( node[i-1] ) ) continue;
		// . skip to the closing quote
		// . start scanning AFTER the opening quote. the old code
		//   started ON it, so it stopped right away and a < or >
		//   inside a single quoted value ended the tag too early.
		long q = i + 1;
		while ( node[q] && node[q] != '\'' ) q++;
		// . only jump ahead if we actually found the closing quote
		// . if the quote is unbalanced treat it like any other char,
		//   which is what the old code always did
		if ( node[q] ) i = q;
	}
	// skip i over the >
	if ( node[i] == '>' ) i++;
	// . else we stopped on a < or on the end of the buffer
	// . be more stringent: ignore quotes altogether and end the tag at
	//   the first < or > we can find
	else for ( i=1; node[i] && node[i] != '>' && node[i] != '<';i++);
	// return the LENGTH of the whole node
	return i ;
}
// . set this node to the "<!-- ... -->" comment that starts at "node"
// . the node ends just past the first "-->", or at the end of the buffer
//   if the comment is never closed
// . returns the length of the node
long XmlNode::setCommentNode ( char *node ) {
	m_node       = node;
	m_nodeId     = TAG_COMMENT;
	m_hasBackTag = false;
	m_isBreaking = true;
	m_isVisible  = true;
	// the tag name is the "!--" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 3;
	m_hash       = hash64 ( "!--" , 3 , 0LL );
	// . find the '>' that is directly preceded by "--"
	// . TODO: do we have to deal with quotes????
	// . TODO: what about nested comments?
	long end = 3;
	while ( node[end] ) {
		if ( node[end  ] == '>' &&
		     node[end-1] == '-' &&
		     node[end-2] == '-' )
			break;
		end++;
	}
	// include the '>' itself, if any (could be end of doc)
	if ( node[end] == '>' ) end++;
	m_nodeLen = end;
	return end;
}
// . set this node to the "<![ ... ]>" style comment that starts at "node",
//   like <![if gt IE 6]> or <![endif]-->
// . unlike a regular comment this one is neither breaking nor visible
// . the node ends just past the first '>' preceded by ']' or by "--", or at
//   the end of the buffer if there is none
// . returns the length of the node
long XmlNode::setCommentNode2 ( char *node ) {
	m_node       = node;
	m_nodeId     = TAG_COMMENT;
	m_hasBackTag = false;
	m_isBreaking = false;
	m_isVisible  = false;
	// the tag name is the "![" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 2;
	m_hash       = hash64 ( "![" , 2 , 0LL );
	// . TODO: do we have to deal with quotes????
	// . TODO: what about nested comments?
	long end = 2;
	for ( ; node[end] ; end++ ) {
		// only a '>' can close the node
		if ( node[end] != '>' ) continue;
		// closed by "]>" like <![if gt IE 6]>
		bool bracketClose = ( node[end-1] == ']' );
		// closed by "-->" like <![endif]-->
		bool dashClose    = ( node[end-1] == '-' &&
				      node[end-2] == '-' );
		if ( bracketClose || dashClose ) break;
	}
	// include the '>' itself, if any (could be end of doc)
	if ( node[end] == '>' ) end++;
	m_nodeLen = end;
	return end;
}
// . set this node to the "<![CDATA[ ... ]]>" section that starts at "node"
// . the node ends just past the first "]]>", or at the end of the buffer if
//   the section is never closed
// . returns the length of the node
long XmlNode::setCDATANode ( char *node ) {
	m_node       = node;
	m_nodeId     = TAG_CDATA;
	m_hasBackTag = false;
	m_isBreaking = true;
	m_isVisible  = true;
	// the tag name is the "![CDATA[" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 8;
	m_hash       = hash64 ( "![CDATA[" , 8 , 0LL );
	// . look for the full "]]>" terminator
	// . a bare "]]" does not end the section since that would hurt a
	//   regex ending in [0-9]]
	// . the && chain stops at the first mismatch so we never read past
	//   the \0 that ends the buffer
	long pos = 8;
	while ( node[pos] ) {
		if ( node[pos  ] == ']' &&
		     node[pos+1] == ']' &&
		     node[pos+2] == '>' ) {
			// include the terminator in the node
			pos += 3;
			break;
		}
		pos++;
	}
	m_nodeLen = pos;
	return pos;
}
// . return the value of the attribute named "field" within this node
// . the case of "field" does not matter
// . returns a ptr into m_node (NOT \0 terminated) and stores the length of
//   the value in "*valueLen"; surrounding quotes are not part of the value
// . returns NULL, with *valueLen set to 0, if the field was not found or
//   the first match of its name is followed by something other than '='
char *XmlNode::getFieldValue ( char *field , long *valueLen ) {
	// assume no value
	*valueLen = 0;
	long flen = gbstrlen(field);
	// the quote char we are inside of, or \0 if not in quotes
	char quote = '\0';
	// scan the node for the field name, ignoring anything in quotes
	long pos;
	for ( pos = 1 ; pos + flen < m_nodeLen ; pos++ ) {
		char c = m_node[pos];
		// inside a quoted value: just wait for the closing quote
		if ( quote ) {
			if ( c == quote ) quote = '\0';
			continue;
		}
		// entering a quoted value?
		if ( c == '\"' || c == '\'' ) {
			quote = c;
			continue;
		}
		// a field name must be preceded by a non-alnum char
		if ( is_alnum_a ( m_node[pos-1] ) ) continue;
		// cheap reject: first chars must match, ignoring case
		if ( to_lower_a ( c ) != to_lower_a ( field[0] ) ) continue;
		// the name must be directly followed by an = or a space
		char after = m_node[pos+flen];
		if ( after != '=' && ! is_wspace_a ( after ) ) continue;
		// finally compare the whole name, ignoring case
		if ( strncasecmp ( m_node + pos , field , flen ) == 0 ) break;
	}
	// ran off the end of the node without a match?
	if ( pos + flen >= m_nodeLen ) return NULL;
	// step over the field name so we point to the = or space
	pos += flen;
	// step over spaces before the equal sign
	while ( pos < m_nodeLen && is_wspace_a ( m_node[pos] ) ) pos++;
	// step over the equal sign, a field without one has no value
	if ( pos < m_nodeLen ) {
		if ( m_node[pos] != '=' ) return NULL;
		pos++;
	}
	// step over spaces after the equal sign
	while ( pos < m_nodeLen && is_wspace_a ( m_node[pos] ) ) pos++;
	// the value may be wrapped in single or double quotes
	quote = '\0';
	if ( m_node[pos] == '\"' || m_node[pos] == '\'' ) {
		quote = m_node[pos];
		pos++;
	}
	// the value starts here
	long start = pos;
	if ( quote ) {
		// a quoted value runs until the matching quote
		while ( pos < m_nodeLen && m_node[pos] != quote ) pos++;
	}
	else {
		// an unquoted value runs until whitespace or the closing >
		while ( pos < m_nodeLen ) {
			if ( is_wspace_a ( m_node[pos] ) ) break;
			if ( m_node[pos] == '>' ) break;
			pos++;
		}
	}
	*valueLen = pos - start;
	return m_node + start;
}
#include "HashTableX.h"
// . return the node id of the tag whose name starts at "s"
// . the name ends at the first char that is not alnum, '-' or '_'
// . the case of the name does not matter since it is hashed as upper case
// . if "retp" is not NULL, *retp is set to the matching g_nodes entry, or
//   to NULL if the name is not in g_nodes
// . returns 0 if the name is not in g_nodes
// . the lookup table is built from g_nodes on the first call
nodeid_t getTagId ( char *s , NodeType **retp ) {
	// init table?
	static bool s_init = false;
	static HashTableX s_ht;
	// . static storage handed to the hash table
	// . enlarged from 10000 bytes so the 1024 slots still fit when each
	//   slot holds a full-size pointer
	// . NOTE(review): assumes HashTableX needs roughly
	//   (keySize + dataSize + 1) bytes per slot -- confirm in HashTableX
	static char s_buf[32768];
	if ( ! s_init ) {
		s_init = true;
		// . each slot stores a NodeType ptr, so size the data to a ptr
		// . this used to be hardcoded to 4, which cuts the ptr off
		//   wherever ptrs are 8 bytes
		// . NOTE(review): assumes the first two args are the key and
		//   data sizes in bytes -- confirm against HashTableX.h
		s_ht.set ( 4 , sizeof(NodeType *) , 1024 ,
			   s_buf , sizeof(s_buf) , false , 0 , "tagids" );
		// how many NodeTypes do we have in g_nodes?
		static long nn = sizeof(g_nodes) / sizeof(NodeType);
		// add every known tag name to the hash table
		for ( long i = 0 ; i < nn ; i++ ) {
			char *name = g_nodes[i].m_nodeName;
			long  nlen = gbstrlen(name);
			long long h = hash64Upper_a ( name,nlen,0LL );
			NodeType *nt = &g_nodes[i];
			// crash on purpose if we could not add it
			if ( ! s_ht.addKey(&h,&nt) ) {
				char *xx=NULL;*xx=0; }
		}
		// sanity: the table must not have grown
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// . sanity test
		// . s_init is already true so this call does not re-enter
		//   the init code
		nodeid_t tt = getTagId ( "br" );
		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
	}
	// find end of tag name. hyphens are ok to be in name.
	// facebook uses underscores like <start_time>
	char *e = s; for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
	// hash it for lookup
	long long h = hash64Upper_a ( s , e - s , 0 );
	// look it up
	NodeType **ntp = (NodeType **)s_ht.getValue(&h);
	// assume none
	if ( retp ) *retp = NULL;
	// none?
	if ( ! ntp ) return 0;
	// got one
	if ( retp ) *retp = *ntp;
	// get id otherwise
	return (*ntp)->m_nodeId;
}
// . look up the tag whose upper-cased name hashes to "nodeHash" and set
//   m_hasBackTag, m_isBreaking and m_isVisible from its g_nodes entry
// . returns the nodeId
// . 1 means it's an xml node (or an html tag we do not recognize), in which
//   case the node defaults to breaking, visible and having a back tag
// . > 1 is reserved for pre-defined html nodes
nodeid_t XmlNode::setNodeInfo ( long long nodeHash ){
	// . we have a list of all node types called "g_nodes"
	// . each node type is a NodeType struct
	// . hash all these node types into an open-addressed table keyed by
	//   the hash of their node name
	// . the table has 512 buckets and a hash of 0 marks an empty bucket
	// . given the hash of your node name you can look it up in this table
	static bool      s_isHashed = false;
	static long long s_hash [512];
	static nodeid_t  s_num  [512];
	// how many NodeTypes do we have in g_nodes?
	static long s_numNodeTypes = sizeof(g_nodes) / sizeof(NodeType);
	// we only need to fill in the hash table once since it's static
	if ( ! s_isHashed ) {
		// clear the hash table
		memset ( s_hash , 0 , sizeof(s_hash) );
		// add each node type
		for ( long n = 0 ; n < s_numNodeTypes ; n++ ) {
			char *name = g_nodes[n].m_nodeName;
			long long h = hash64Upper_a ( name ,
						      gbstrlen(name) , 0LL );
			// probe linearly, wrapping around, for a free bucket
			long slot = (unsigned long long)h & 511;
			while ( s_hash[slot] ) {
				if ( ++slot == 512 ) slot = 0;
			}
			s_hash [ slot ] = h;
			s_num  [ slot ] = n;
		}
		// set this to true so we don't do the hashing again
		s_isHashed = true;
	}
	// . look up nodeHash in the hash table
	// . stop at the bucket holding it or at the first empty bucket
	long slot = (unsigned long long)nodeHash & 511;
	for ( ; s_hash[slot] && s_hash[slot] != nodeHash ; ) {
		if ( ++slot == 512 ) slot = 0;
	}
	// if it wasn't found it must be an xml node(or unrecognized html node)
	if ( ! s_hash[slot] ) {
		// default is breaking, has back tag and is indexable
		m_isBreaking = true;
		m_hasBackTag = true;
		m_isVisible  = true;
		return 1;
	}
	// otherwise copy the flags and the nodeId from the matching entry
	long n = s_num[slot];
	m_hasBackTag = g_nodes [ n ].m_hasBackTag;
	m_isBreaking = g_nodes [ n ].m_isBreaking;
	m_isVisible  = g_nodes [ n ].m_isVisible;
	// return the tag/node Id
	return g_nodes [ n ].m_nodeId;
}
// . return the number of entries in the g_nodes table
// . divide by the size of one table element, not by sizeof(XmlNode), which
//   is an unrelated class and gave the wrong count
long getNumXmlNodes ( ) {
	return (long)( sizeof(g_nodes) / sizeof(g_nodes[0]) );
}
#include "Words.h" // BACKBITCOMP
// . return the m_isBreaking flag of the g_nodes entry for "tagId"
// . "tagId" is masked with BACKBITCOMP first, presumably to clear the bit
//   that marks a back tag -- confirm in Words.h
// . no bounds check is done on the resulting index
bool isBreakingTagId ( nodeid_t tagId ) {
	const NodeType &entry = g_nodes [ tagId & BACKBITCOMP ];
	return entry.m_isBreaking;
}
// . return the m_hasBackTag flag of the g_nodes entry for "tagId"
// . "tagId" is masked with BACKBITCOMP first, presumably to clear the bit
//   that marks a back tag -- confirm in Words.h
// . no bounds check is done on the resulting index
bool hasBackTag ( nodeid_t tagId ) {
	const NodeType &entry = g_nodes [ tagId & BACKBITCOMP ];
	return entry.m_hasBackTag;
}