mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-05 12:47:37 +03:00
713 lines
23 KiB
C++
713 lines
23 KiB
C++
#include "gb-include.h"
|
|
|
|
#include "XmlNode.h"
|
|
#include "Mem.h"
|
|
|
|
// . called by Xml class
|
|
// . returns the length of the node
|
|
// . TODO: "node" is now guaranteed to be \0 terminated -- make this faster
|
|
long XmlNode::set ( char *node , bool pureXml , long version ) {
|
|
// save head of node
|
|
m_node = node;
|
|
|
|
// . reset this
|
|
// . need to do here instead of in Links.cpp because sometimes
|
|
// we think an anchor tag indicates a link, but it is really
|
|
// just an <a href="javascript:..."> function call and Links.cpp
|
|
// ignored it but we are expecting this to be valid!
|
|
m_isSelfLink = 0;
|
|
|
|
// reset
|
|
//m_linkNum = -1;
|
|
|
|
// CDATA tag was identified in earlier versions as a text node. Now
|
|
// it is identified as a CDATA tag node. But gb.conf and others always
|
|
// pass their version as 0
|
|
if ( node[0] == '<' &&
|
|
node[1] == '!' &&
|
|
node[2] == '[' &&
|
|
node[3] == 'C' &&
|
|
node[4] == 'D' &&
|
|
node[5] == 'A' &&
|
|
node[6] == 'T' &&
|
|
node[7] == 'A' &&
|
|
node[8] == '[' )
|
|
return setCDATANode ( node );
|
|
|
|
// if "node" isn't the start of a tag then set it as a Text Node
|
|
if ( *node != '<' || ! isTagStart ( node ) ) {//, 0, version ) ) {
|
|
// . set this node as a text node!
|
|
// . nodeId for text nodes is 0
|
|
m_nodeId = 0;
|
|
m_node = node;
|
|
m_hasBackTag = false;
|
|
m_hash = 0;
|
|
long i = 0;
|
|
//char inCDATA = 0;
|
|
// inc i as long as it's NOT the beginning of a tag
|
|
while ( node[i] &&
|
|
(node[i] != '<' || ! isTagStart ( node+i)))//,versin)))
|
|
i++;
|
|
m_nodeLen = i;
|
|
m_pairTagNum = -1;
|
|
return m_nodeLen;
|
|
}
|
|
|
|
// . see if it's a comment (node end is "-->" for comments)
|
|
// . comments are special cases
|
|
if ( node[1] == '!' ) {
|
|
if ( node[2]=='-' && node[3]=='-' )
|
|
return setCommentNode ( node );
|
|
// this means comment too:
|
|
// <![if ....]>
|
|
if ( node[2]=='[' )
|
|
return setCommentNode2 ( node );
|
|
}
|
|
|
|
// . otherwise it's a regular tag
|
|
// . might be <!DOCTYPE ...> or something though
|
|
m_nodeLen = getTagLen ( node );//, version );
|
|
// . get the node's name's length (i-1)
|
|
// . node name ends at non alnum char
|
|
// . we can have hyphens in node name (TODO: even at beginning???)
|
|
long tagNameStart = 1;
|
|
// . skip over backslash in the back tags
|
|
// . or skip over / or ? or ! now
|
|
// . tag names must start with a letter, fwiw
|
|
if ( ! is_alnum_a(node[tagNameStart]) /* == '/'*/ ) tagNameStart++;
|
|
long i = tagNameStart;
|
|
// skip i to end of tagName. this should only allow ascii chars
|
|
// to be "tag name chars"
|
|
for ( ; i < m_nodeLen && is_tagname_char(node[i]) ; i++ );
|
|
// set the tagName and tagNameLen
|
|
m_tagName = &node [ tagNameStart ];
|
|
m_tagNameLen = i - tagNameStart;
|
|
|
|
// break point
|
|
//if ( m_tagNameLen == 3 && m_tagName[0]=='!' &&
|
|
// m_tagName[1]=='-' && m_tagName[2]=='-' )
|
|
// fprintf(stderr,"man!");
|
|
// . set the node's hash -- used cuz it's faster than strcmp
|
|
// . just hash the letters as upper case
|
|
// . tag names are never utf8, so use the ascii ha
|
|
m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL);
|
|
|
|
// if we're pure xml, don't allow any html tags accept <!-- -->
|
|
if ( pureXml ) {
|
|
m_hasBackTag = true;
|
|
m_isBreaking = true;
|
|
m_isVisible = true;
|
|
m_nodeId = TAG_XMLTAG;//1;
|
|
}
|
|
// . determine if the nodeId for this node
|
|
// . determine if it breaks lines (for phrasing purposes)
|
|
else
|
|
m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag ,
|
|
//&m_isBreaking , &m_isVisible );
|
|
|
|
|
|
// . no back tag if / follow name
|
|
// . this was only for "pureXml" but now i do it for all tags!
|
|
if ( m_node [ m_nodeLen - 2 ] == '/' ) m_hasBackTag = false;
|
|
if ( m_node [ m_nodeLen - 2 ] == '?' ) m_hasBackTag = false;
|
|
|
|
return m_nodeLen;
|
|
}
|
|
|
|
// . return the length of a node starting at "node"
|
|
long getTagLen ( char *node ) { // , long version ) {
|
|
// see if it's not a node
|
|
//if ( node[0] != '<' ) return 0;
|
|
// skip over first <
|
|
long i ;
|
|
// . keep looping until we hit a < or > OR while we're in quotes
|
|
// . ignore < and > when they're in quotes
|
|
for ( i = 1 ; node[i] ; i++ ) {
|
|
// this switch should speed things up... no!
|
|
if ( node[i] != '<' &&
|
|
node[i] != '>' &&
|
|
node[i] != '\"' &&
|
|
node[i] != '\'' )
|
|
continue;
|
|
// this is about 1.3 times faster than above (with -O2 on both)
|
|
//if ( ! is_tag_control_char ( node[i] ) ) continue;
|
|
if ( node[i] == '<' ) break;
|
|
if ( node[i] == '>' ) {
|
|
break;
|
|
//if ( node[i-1]!='b') break;
|
|
//if ( i -2 < 0 ) break;
|
|
//if ( node[i-2]!='g') break;
|
|
// we had a "gb>" which means that these 3 chars
|
|
// we originally a > html encoded entity which
|
|
// we decoded for easier parsing
|
|
//continue;
|
|
}
|
|
//if (version >= 70 && version < 77) continue;
|
|
|
|
// we can have double quotes within single quotes
|
|
if ( node [ i ] == '\"' ) {
|
|
// scan back looking for equal sign...
|
|
long k; for ( k = i - 1 ; k > 1 ; k-- ) {
|
|
if ( is_wspace_a(node[k]) ) continue;
|
|
break;
|
|
}
|
|
if ( k <= 1 ) continue;
|
|
// . if an equal sign did not immediately preceed
|
|
// this double quote then ignore the double quote
|
|
// . this now fixes the harwoodmuseum.org issue
|
|
// talked about below
|
|
if ( node[k] != '=' ) continue;
|
|
// skip over this first quote
|
|
i++;
|
|
while ( node[i] && node[i]!='\"' ) {
|
|
// crap some pages have unbalanced quotes.
|
|
// see /test/doc.14541556377486183454.html
|
|
if ( node[i ]=='>' &&
|
|
node[i-1]=='\"' ) {
|
|
i--;
|
|
break;
|
|
}
|
|
// like an img tag hits a </a> for
|
|
// http://www.harwoodmuseum.org/press_deta
|
|
// il.php?ID=44
|
|
// BUT this fucks up
|
|
// onclick="tb_show('<b>Community Calendar</b>'
|
|
// on the </b> which is legitamately in quotes
|
|
//if ( node[i ]=='<' &&
|
|
// node[i+1]=='/' ) {
|
|
// i--;
|
|
// break;
|
|
//}
|
|
if ( node[i ]=='>' &&
|
|
node[i-1]==' ' &&
|
|
node[i-2]=='\"' ) {
|
|
i--;
|
|
break;
|
|
}
|
|
// skip this char
|
|
i++;
|
|
}
|
|
// return the length if tag ended abuptly
|
|
if ( ! node[i] ) return i;
|
|
// back-to-back quotes? common mistake
|
|
if ( node[i+1] == '\"' ) i++;
|
|
continue;
|
|
}
|
|
// continue if we don't have a " '" or "='"
|
|
if ( node [ i ] != '\'' ) continue;
|
|
if ( node[i-1] != '=' && !is_wspace_a( node[i-1] ) ) continue;
|
|
// skip to end of quote
|
|
while ( node[i] && node[i]!='\'' ) i++;
|
|
}
|
|
// skip i over the >
|
|
if ( node[i] == '>' ) i++;
|
|
// . else we found no closure outside of quotes so be more stringent
|
|
// . look for closure with regard to quotes
|
|
else for ( i=1; node[i] && node[i] != '>' && node[i] != '<';i++);
|
|
// return the LENGTH of the whole node
|
|
return i ;
|
|
}
|
|
|
|
// . set this node up as a "<!-- ... -->" comment starting at "node"
// . the caller has already verified that "node" begins with "<!--"
// . returns the node length, which includes the closing "-->" if present
long XmlNode::setCommentNode ( char *node ) {
	m_node       = node;
	m_nodeId     = TAG_COMMENT;
	m_hasBackTag = false;
	m_isBreaking = true;
	m_isVisible  = true;
	// the "tag name" of a comment is the "!--" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 3;
	m_hash       = hash64 ( "!--" , 3 , 0LL );

	// . find the '>' that is preceded by "--"
	// . TODO: do we have to deal with quotes????
	// . TODO: what about nested comments?
	long pos = 3;
	while ( node[pos] ) {
		if ( node[pos]   == '>' &&
		     node[pos-1] == '-' &&
		     node[pos-2] == '-' )
			break;
		pos++;
	}

	// include the '>' in the node, if there is one (the document may
	// have ended inside the comment)
	if ( node[pos] == '>' ) pos++;

	m_nodeLen = pos;
	return pos;
}
|
|
|
|
|
|
// . set this node up as a "<![ ... ]>" style comment, like <![if gt IE 6]>
// . the caller has already verified that "node" begins with "<!["
// . unlike "<!--" comments these are marked non-breaking and not visible
// . returns the node length, which includes the closing "]>" if present
long XmlNode::setCommentNode2 ( char *node ) {
	m_node       = node;
	m_nodeId     = TAG_COMMENT;
	m_hasBackTag = false;
	m_isBreaking = false;
	m_isVisible  = false;
	// the "tag name" is the "![" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 2;
	m_hash       = hash64 ( "![" , 2 , 0LL );

	// . find the '>' that is preceded by a ']'
	// . TODO: do we have to deal with quotes????
	// . TODO: what about nested comments?
	long pos = 2;
	while ( node[pos] ) {
		if ( node[pos] == '>' && node[pos-1] == ']' ) break;
		pos++;
	}

	// include the '>' in the node, if there is one (the document may
	// have ended first)
	if ( node[pos] == '>' ) pos++;

	m_nodeLen = pos;
	return pos;
}
|
|
|
|
// . set this node up as a "<![CDATA[ ... ]]>" section starting at "node"
// . the caller has already verified that "node" begins with "<![CDATA["
// . the section ends at the first "]]"; a '>' right after it is included
//   in the node but is not required
// . returns the node length
long XmlNode::setCDATANode ( char *node ) {
	m_node       = node;
	m_nodeId     = TAG_CDATA;
	m_hasBackTag = false;
	m_isBreaking = true;
	m_isVisible  = true;
	// the "tag name" is the "![CDATA[" right after the '<'
	m_tagName    = node + 1;
	m_tagNameLen = 8;
	m_hash       = hash64 ( "![CDATA[" , 8 , 0LL );

	// . scan for "]]"
	// . node[pos+1] is only read when node[pos] is a ']', so we never
	//   read past the terminating \0
	long pos = 8;
	while ( node[pos] && ( node[pos] != ']' || node[pos+1] != ']' ) )
		pos++;

	// . if we found "]]" step over it, and over the '>' too if it
	//   immediately follows
	// . otherwise we hit the end of the document and pos is already
	//   the length
	if ( node[pos] ) {
		if ( node[pos+2] == '>' ) pos += 3;
		else                      pos += 2;
	}

	m_nodeLen = pos;
	return pos;
}
|
|
|
|
// Return the value of the specified "field" within this node.
|
|
// the case of "field" does not matter.
|
|
char *XmlNode::getFieldValue ( char *field , long *valueLen ) {
|
|
// reset this to 0
|
|
*valueLen = 0;
|
|
// scan for the field name in our node
|
|
long flen = gbstrlen(field);
|
|
char inQuotes = '\0';
|
|
long i;
|
|
|
|
// scan the characters in the node, looking for the field name in ascii
|
|
for ( i = 1; i + flen < m_nodeLen ; i++ ) {
|
|
// skip the field if it's quoted
|
|
if ( inQuotes) {
|
|
if (m_node[i] == inQuotes ) inQuotes = 0;
|
|
continue;
|
|
}
|
|
// set inQuotes to the quote if we're in quotes
|
|
if ( (m_node[i]=='\"' || m_node[i]=='\'')){
|
|
inQuotes = m_node[i];
|
|
continue;
|
|
}
|
|
// a field name must be preceeded by non-alnum
|
|
if ( is_alnum_a ( m_node[i-1] ) ) continue;
|
|
// the first character of this field shout match field[0]
|
|
if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue;
|
|
// field just be immediately followed by an = or space
|
|
if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue;
|
|
// field names must match
|
|
if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue;
|
|
// break cuz we got a match for our field name
|
|
break;
|
|
}
|
|
|
|
|
|
// return NULL if no matching field
|
|
if ( i + flen >= m_nodeLen ) return NULL;
|
|
|
|
// advance i over the fieldname so it pts to = or space
|
|
i += flen;
|
|
|
|
// advance i over spaces
|
|
while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
|
|
|
|
// advance over the equal sign, return NULL if does not exist
|
|
if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL;
|
|
|
|
// advance i over spaces after the equal sign
|
|
while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
|
|
|
|
// now parse out the value of this field (could be in quotes)
|
|
inQuotes = '\0';
|
|
|
|
// set inQuotes to the quote if we're in quotes
|
|
if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++];
|
|
|
|
// mark this as the start of the value
|
|
int start=i;
|
|
|
|
// advance i until we hit a space, or we hit a that quote if inQuotes
|
|
if (inQuotes) {
|
|
while (i<m_nodeLen && m_node[i] != inQuotes )
|
|
i++;
|
|
}
|
|
else {
|
|
while ( i<m_nodeLen &&
|
|
!is_wspace_a(m_node[i])&&
|
|
m_node[i]!='>')
|
|
i++;
|
|
}
|
|
|
|
// set the length of the value
|
|
*valueLen = i - start;
|
|
|
|
// return a ptr to the value
|
|
return m_node + start;
|
|
}
|
|
|
|
|
|
// . Here's a nice list of all the html nodes names, lengths, whether they're
|
|
// a breaking node or not and their node id
|
|
// . isVisible is true if text in between front and end tags is visible on page
|
|
// . isVisible is used by Xml::getText()
|
|
// . filterKeep is 1 if we should keep it when &strip=1 is given when getting
|
|
// the cached document. i added this for faisal
|
|
// . a filterKeep of 0 means remove tag and text between it and its back tag.
|
|
// . a filterKeep of 1 means keep the tag and text between it and its back tag.
|
|
// . a filterKeep of 2 means remove tag BUT keep the text between
|
|
// it and its back tag.
|
|
NodeType g_nodes[] = {
|
|
// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
|
|
// isXml? (the last field)
|
|
// --------------------------
|
|
// -- text node --- 0
|
|
{"textNode" , 0, 0, 1, 1,1, TAG_TEXTNODE ,0},
|
|
// -- xml tag node --- 1
|
|
{"xmlTag" , 1, 1, 1, 2,2, TAG_XMLTAG ,0},
|
|
{"A" , 1, 0, 1, 1,1, TAG_A ,0},
|
|
{"ABBREV" , 1, 1, 1, 2,2, TAG_ABBREV ,0},
|
|
{"ACRONYM" , 1, 1, 1, 2,1, TAG_ACRONYM ,0},
|
|
{"ADDRESS" , 1, 1, 1, 2,2, TAG_ADDRESS ,0},
|
|
{"APPLET" , 1, 1, 1, 0,0, TAG_APPLET ,0},
|
|
{"AREA" , 0, 1, 1, 0,0, TAG_AREA ,0},
|
|
{"AU" , 1, 1, 1, 0,0, TAG_AU ,0},
|
|
{"AUTHOR" , 1, 1, 1, 0,0, TAG_AUTHOR ,0},
|
|
{"B" , 1, 0, 1, 1,1, TAG_B ,0},
|
|
{"BANNER" , 1, 1, 1, 0,0, TAG_BANNER ,0},
|
|
{"BASE" , 0, 1, 1, 0,0, TAG_BASE ,0},
|
|
{"BASEFONT" , 0, 1, 1, 2,2, TAG_BASEFONT ,0},
|
|
{"BGSOUND" , 0, 1, 1, 0,0, TAG_BGSOUND ,0},
|
|
{"BIG" , 1, 0, 1, 2,1, TAG_BIG ,0},
|
|
{"BLINK" , 1, 0, 1, 2,2, TAG_BLINK ,0},
|
|
{"BLOCKQUOTE",1, 1, 1, 2,1, TAG_BLOCKQUOTE ,0},
|
|
{"BQ" , 1, 1, 1, 0,0, TAG_BQ ,0},
|
|
{"BODY" , 1, 1, 1, 1,1, TAG_BODY ,0},
|
|
{"BR" , 0, 1, 1, 1,1, TAG_BR ,0},
|
|
{"CAPTION" , 1, 1, 1, 2,1, TAG_CAPTION ,0},
|
|
{"CENTER" , 1, 1, 1, 1,1, TAG_CENTER ,0},
|
|
{"CITE" , 1, 1, 1, 2,1, TAG_CITE ,0},
|
|
{"CODE" , 1, 1, 1, 2,1, TAG_CODE ,0},
|
|
{"COL" , 1, 1, 1, 2,2, TAG_COL ,0},
|
|
{"COLGROUP" , 1, 1, 1, 0,0, TAG_COLGROUP ,0},
|
|
{"CREDIT" , 1, 1, 1, 0,0, TAG_CREDIT ,0},
|
|
{"DEL" , 1, 1, 1, 2,1, TAG_DEL ,0},
|
|
{"DFN" , 1, 1, 1, 2,1, TAG_DFN ,0},
|
|
{"DIR" , 1, 1, 1, 0,0, TAG_DIR ,0},
|
|
// MDW: wtf, these have back tags!
|
|
// MDW: ok, i fixed it!
|
|
{"DIV" , 1, 1, 1, 1,1, TAG_DIV ,0},
|
|
{"DL" , 1, 1, 1, 1,1, TAG_DL ,0},
|
|
// this may not have a back tag!
|
|
{"DT" , 1, 1, 1, 1,1, TAG_DT ,0},
|
|
// this may not have a back tag!
|
|
{"DD" , 1, 1, 1, 1,1, TAG_DD ,0},
|
|
{"EM" , 1, 0, 1, 2,1, TAG_EM ,0}, // emphasized text
|
|
{"EMBED" , 0, 1, 1, 0,0, TAG_EMBED ,0},
|
|
{"FIG" , 1, 1, 1, 0,0, TAG_FIG ,0},
|
|
{"FN" , 1, 1, 1, 0,0, TAG_FN ,0},
|
|
{"FONT" , 1, 0, 1, 1,1, TAG_FONT ,0},
|
|
{"FORM" , 1, 1, 1, 2,2, TAG_FORM ,0},
|
|
// this may not have a back tag!
|
|
{"FRAME" , 1, 1, 1, 0,0, TAG_FRAME ,0},
|
|
{"FRAMESET" , 1, 1, 1, 0,0, TAG_FRAMESET ,0},
|
|
{"H1" , 1, 1, 1, 1,1, TAG_H1 ,0},
|
|
{"H2" , 1, 1, 1, 1,1, TAG_H2 ,0},
|
|
{"H3" , 1, 1, 1, 1,1, TAG_H3 ,0},
|
|
{"H4" , 1, 1, 1, 1,1, TAG_H4 ,0},
|
|
{"H5" , 1, 1, 1, 1,1, TAG_H5 ,0},
|
|
{"H6" , 1, 1, 1, 1,1, TAG_H6 ,0},
|
|
{"HEAD" , 1, 1, 1, 1,1, TAG_HEAD ,0},
|
|
{"HR" , 0, 1, 1, 1,1, TAG_HR ,0},
|
|
{"HTML" , 1, 1, 1, 1,1, TAG_HTML ,0},
|
|
{"I" , 1, 0, 1, 2,1, TAG_I ,0},
|
|
{"IFRAME" , 1, 1, 1, 2,2, TAG_IFRAME ,0},
|
|
// filter = 1,but tag is turned to alt
|
|
{"IMG" , 0, 1, 1, 1,1, TAG_IMG ,0},
|
|
{"INPUT" , 0, 1, 1, 0,0, TAG_INPUT ,0},
|
|
{"INS" , 1, 1, 1, 2,1, TAG_INS ,0},
|
|
{"ISINDEX" , 0, 1, 1, 0,0, TAG_ISINDEX ,0},
|
|
{"KBD" , 1, 1, 1, 2,1, TAG_KBD ,0},
|
|
{"LANG" , 1, 1, 1, 0,0, TAG_LANG ,0},
|
|
{"LH" , 1, 1, 1, 0,0, TAG_LH ,0},
|
|
// this may or may not have a back tag
|
|
{"LI" , 1, 1, 1, 1,1, TAG_LI ,0},
|
|
// this may or may not have a back tag
|
|
{"LINK" , 0, 1, 1, 0,0, TAG_LINK ,0},
|
|
{"LISTING" , 1, 1, 1, 0,0, TAG_LISTING ,0},
|
|
{"MAP" , 1, 1, 1, 0,0, TAG_MAP ,0},
|
|
// don't index marquee text
|
|
{"MARQUEE" , 1, 1, 0, 2,2, TAG_MARQUEE ,0},
|
|
{"MATH" , 1, 1, 1, 0,0, TAG_MATH ,0},
|
|
{"MENU" , 1, 1, 1, 1,1, TAG_MENU ,0},
|
|
{"META" , 0, 1, 1, 1,1, TAG_META ,0},
|
|
{"MULTICOL" , 0, 1, 1, 0,0, TAG_MULTICOL ,0},
|
|
{"NOBR" , 1, 0, 1, 0,0, TAG_NOBR ,0},
|
|
{"NOFRAMES" , 1, 1, 1, 0,0, TAG_NOFRAMES ,0},
|
|
{"NOTE" , 1, 1, 1, 0,0, TAG_NOTE ,0},
|
|
{"OL" , 1, 1, 1, 1,1, TAG_OL ,0},
|
|
{"OVERLAY" , 0, 1, 1, 0,0, TAG_OVERLAY ,0},
|
|
// this may not have a back tag!
|
|
{"P" , 0, 1, 1, 1,1, TAG_P ,0},
|
|
{"PARAM" , 0, 1, 1, 0,0, TAG_PARAM ,0},
|
|
{"PERSON" , 1, 1, 1, 0,0, TAG_PERSON ,0},
|
|
{"PLAINTEXT", 1, 1, 1, 0,0, TAG_PLAINTEXT ,0},
|
|
{"PRE" , 1, 1, 1, 2,1, TAG_PRE ,0},
|
|
{"Q" , 1, 1, 1, 2,1, TAG_Q ,0},
|
|
{"RANGE" , 0, 1, 1, 0,0, TAG_RANGE ,0},
|
|
{"SAMP" , 1, 1, 1, 2,1, TAG_SAMP ,0},
|
|
{"SCRIPT" , 1, 1, 0, 0,0, TAG_SCRIPT ,0},
|
|
{"SELECT" , 1, 1, 0, 0,0, TAG_SELECT ,0},
|
|
{"SMALL" , 1, 0, 1, 2,1, TAG_SMALL ,0},
|
|
{"SPACER" , 0, 1, 1, 2,1, TAG_SPACER ,0},
|
|
{"SPOT" , 0, 1, 1, 0,0, TAG_SPOT ,0},
|
|
{"STRIKE" , 1, 1, 1, 2,1, TAG_STRIKE ,0},
|
|
{"STRONG" , 1, 0, 1, 2,1, TAG_STRONG ,0},
|
|
{"SUB" , 1, 0, 1, 2,2, TAG_SUB ,0},
|
|
{"SUP" , 1, 0, 1, 2,2, TAG_SUP ,0},
|
|
{"TAB" , 0, 1, 1, 0,0, TAG_TAB ,0},
|
|
{"TABLE" , 1, 1, 1, 1,1, TAG_TABLE ,0},
|
|
{"TBODY" , 1, 1, 1, 1,1, TAG_TBODY ,0},
|
|
// this may not have a back tag!
|
|
{"TD" , 1, 1, 1, 1,1, TAG_TD ,0},
|
|
{"TEXTAREA" , 1, 1, 1, 2,2, TAG_TEXTAREA ,0},
|
|
{"TEXTFLOW" , 0, 1, 1, 0,0, TAG_TEXTFLOW ,0},
|
|
{"TFOOT" , 0, 1, 1, 0,0, TAG_TFOOT ,0},
|
|
// this DOES have a back tag
|
|
{"TH" , 1, 1, 1, 0,0, TAG_TH ,0},
|
|
{"THEAD" , 0, 1, 1, 0,0, TAG_THEAD ,0},
|
|
{"TITLE" , 1, 1, 1, 1,1, TAG_TITLE ,0},
|
|
// this may not have a back tag!
|
|
{"TR" , 1, 1, 1, 1,1, TAG_TR ,0},
|
|
{"TT" , 1, 1, 1, 2,1, TAG_TT ,0},
|
|
{"U" , 1, 0, 1, 1,1, TAG_U ,0},
|
|
{"UL" , 1, 0, 1, 1,1, TAG_UL ,0},
|
|
{"VAR" , 1, 1, 1, 2,1, TAG_VAR ,0},
|
|
{"WBR" , 0, 1, 1, 0,0, TAG_WBR ,0},
|
|
{"XMP" , 1, 1, 1, 0,0, TAG_XMP ,0},
|
|
{"!--" , 0, 1, 1, 0,0, TAG_COMMENT ,0}, // comment tag!
|
|
{"OPTION" , 0, 1, 1, 2,2, TAG_OPTION ,0},
|
|
{"STYLE" , 1, 1, 0, 0,1, TAG_STYLE ,0},
|
|
// doctype tag <!DOCTYPE ...>
|
|
{"DOCTYPE" , 0, 1, 1, 0,0, TAG_DOCTYPE ,0},
|
|
// used in office.microsoft.com <?xml ...>
|
|
{"XML" , 0, 1, 1, 0,0, TAG_XML ,0},
|
|
// <start index> <stop index>
|
|
{"START" , 0, 1, 1, 0,0, TAG_START ,0},
|
|
{"STOP" , 0, 1, 1, 0,0, TAG_STOP ,0},
|
|
// . i added these tags for faisal, but don't really need them
|
|
// since our XML tag condition handles this case
|
|
// . we can no longer treat as a generic XML tags since faisal wanted
|
|
// the strip=2 option
|
|
{"SPAN" , 1, 0, 1, 2,1, TAG_SPAN ,0}, // not breaking!
|
|
{"LEGEND" , 1, 1, 1, 2,1, TAG_LEGEND ,0},
|
|
{"S" , 1, 1, 1, 2,1, TAG_S ,0}, // strike tag
|
|
{"ABBR" , 1, 0, 1, 2,1, TAG_ABBR ,0},
|
|
{"![CDATA[" , 0, 1, 1, 0,0, TAG_CDATA ,0}, // <![CDATA[ tag
|
|
{"NOSCRIPT" , 1, 1, 0, 0,0, TAG_NOSCRIPT,0},
|
|
{"FIELDSET" , 1, 1, 1, 0,0, TAG_FIELDSET,0},
|
|
// feedburner uses these in the xml
|
|
{"FEEDBURNER:ORIGLINK", 0, 1, 1, 0,0, TAG_FBORIGLINK ,1},
|
|
// ahrefs uses these as links
|
|
{"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1},
|
|
{"RDF:RDF",0, 1, 1, 0,0, TAG_RDF ,1},
|
|
{"RSS",0, 1, 1, 0,0, TAG_RSS ,1},
|
|
{"FEED",0, 1, 1, 0,0, TAG_FEED ,1},
|
|
|
|
{"ITEM",1, 1, 0, 0,0, TAG_ITEM ,1},
|
|
{"ENTRY",1, 1, 0, 0,0, TAG_ENTRY ,1},
|
|
{"CHANNEL",1, 1, 0, 0,0, TAG_CHANNEL ,1},
|
|
{"ENCLOSURE",1, 1, 0, 0,0, TAG_ENCLOSURE ,0},
|
|
{"WEBLOG",0, 1, 0, 0,0, TAG_WEBLOG ,1},
|
|
|
|
{"GBFRAME", 1, 1, 1, 1,1, TAG_GBFRAME ,0},
|
|
{"TC" , 1, 1, 1, 1,1, TAG_TC ,0}, // HACK: tbl column section
|
|
{"BUTTON" , 1, 1, 1, 0,0, TAG_BUTTON ,0},
|
|
|
|
{"GBXMLTITLE", 1, 1, 1, 1,1, TAG_GBXMLTITLE,1},
|
|
|
|
// facebook xml
|
|
{"START_TIME", 1, 1, 1, 1,1, TAG_FBSTARTTIME,1},
|
|
{"END_TIME", 1, 1, 1, 1,1, TAG_FBENDTIME,1},
|
|
{"NAME", 1, 1, 1, 1,1, TAG_FBNAME,1},
|
|
{"PIC_SQUARE", 1, 1, 1, 1,1, TAG_FBPICSQUARE,1},
|
|
{"HIDE_GUEST_LIST", 1, 1, 1, 1,1, TAG_FBHIDEGUESTLIST,1},
|
|
|
|
|
|
{"scriptText",0, 1, 0, 0,0, TAG_SCRIPTTEXT,0 }
|
|
//{"BUTTON" , 1, 1, 1, 2, 122,0},
|
|
//{"BDO" , 1, 1, 1, 2, 123,0},
|
|
//{"LABEL" , 1, 1, 1, 2, 124,0},
|
|
//{"LAYER" , 1, 1, 1, 2, 125}
|
|
};
|
|
// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
|
|
|
|
#include "HashTableX.h"
|
|
|
|
// . look up the tag whose name starts at "s" and return its node id
// . the name ends at the first char that is not alnum, '-' or '_'
//   (facebook uses underscores like <start_time>) and is matched
//   ignoring case
// . if "retp" is not NULL, *retp is set to the matching NodeType, or to
//   NULL if there is no match
// . returns 0 if the name is not in g_nodes
nodeid_t getTagId ( char *s , NodeType **retp ) {

	// the lookup table is built from g_nodes on the first call
	static bool       s_init = false;
	static HashTableX s_ht;
	static char       s_buf[10000];
	if ( ! s_init ) {
		// set this first: the sanity test below calls us again
		s_init = true;
		// NOTE(review): the first two args are presumably the key
		// size and data size; they are 4 and 4 while addKey() below
		// is handed a long long key and a NodeType* value -- confirm
		// this matches HashTableX on the target (e.g. 64-bit)
		// platform
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
		// how many NodeTypes do we have in g_nodes?
		static long nn = sizeof(g_nodes) / sizeof(NodeType);
		// map the upper-case hash of each name to its NodeType
		for ( long i = 0 ; i < nn ; i++ ) {
			NodeType *nt   = &g_nodes[i];
			char     *name = nt->m_nodeName;
			long long h = hash64Upper_a ( name , gbstrlen(name) , 0LL );
			// crash on purpose if the table cannot take the key
			if ( ! s_ht.addKey(&h,&nt) ) {
				char *xx=NULL;*xx=0; }
		}
		// sanity: the table must not have been resized
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// . sanity test: "br" must map to TAG_BR
		// . cast the literal explicitly; passing a string literal as
		//   a plain char * is not valid since C++11
		nodeid_t tt = getTagId ( (char *)"br" );
		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
	}

	// find end of tag name. hyphens and underscores are ok in the name.
	char *e = s;
	while ( *e && ( is_alnum_a(*e) || *e == '-' || *e == '_' ) ) e++;

	// hash it for lookup
	long long h = hash64Upper_a ( s , e - s , 0 );
	NodeType **ntp = (NodeType **)s_ht.getValue(&h);

	// no such tag
	if ( ! ntp ) {
		if ( retp ) *retp = NULL;
		return 0;
	}

	// got one
	if ( retp ) *retp = *ntp;
	return (*ntp)->m_nodeId;
}
|
|
|
|
// . returns the nodeId
|
|
// . 0 means not a node
|
|
// . 1 means it's an xml node
|
|
// . > 1 is reserved for pre-defined html nodes
|
|
nodeid_t XmlNode::setNodeInfo ( long long nodeHash ){// , char *hasBackTag ,
|
|
//char *isBreaking , char *isVisible ) {
|
|
/*
|
|
// sanity check
|
|
static bool s_init = false;
|
|
if ( ! s_init ) {
|
|
s_init = true;
|
|
// how many NodeTypes do we have in g_nodes?
|
|
static long nn = sizeof(g_nodes) / sizeof(NodeType);
|
|
// set the hash table
|
|
for ( long i = 0 ; i < nn ; i++ ) {
|
|
// sanity check
|
|
if(g_nodes[i].m_nodeId != i ) { char *xx=NULL;*xx=0;}
|
|
}
|
|
}
|
|
*/
|
|
|
|
// . we have a list of all node types called "g_nodes"
|
|
// . each node type is a NodeType struct
|
|
// . hash all these node types into a hash table by their node name
|
|
// . we have 108 node names so we'll use 512 buckets
|
|
// . given the hash of your node name you can look it up in this table
|
|
static bool s_isHashed = false;
|
|
static long long s_hash [512];
|
|
static nodeid_t s_num [512];
|
|
// how many NodeTypes do we have in g_nodes?
|
|
static long s_numNodeTypes = sizeof(g_nodes) / sizeof(NodeType);
|
|
// we only need to fill in the hash table once since it's static
|
|
if ( s_isHashed ) goto ready;
|
|
// clear the hash table
|
|
memset ( s_hash , 0 , 8*512 );
|
|
// set the hash table
|
|
for ( long i = 0 ; i < s_numNodeTypes ; i++ ) {
|
|
long long h = hash64Upper_a ( g_nodes[i].m_nodeName,
|
|
gbstrlen(g_nodes[i].m_nodeName),0LL);
|
|
//long b = (unsigned long long)h % 512;
|
|
long b = (unsigned long long)h & 511;
|
|
// debug msg
|
|
//fprintf(stderr,"node #%li has bucket #%li, hash =%lli\n",i,b,h);
|
|
while ( s_hash[b] ) if ( ++b == 512 ) b = 0;
|
|
s_hash [ b ] = h;
|
|
s_num [ b ] = i;
|
|
}
|
|
// set this to true so we don't do the hashing again
|
|
s_isHashed = true;
|
|
|
|
ready:
|
|
// look up nodeHash in hash table
|
|
//long b = (unsigned long long)nodeHash % 512;
|
|
long b = (unsigned long long)nodeHash & 511;
|
|
while ( s_hash[b] ) {
|
|
if ( s_hash[b] == nodeHash ) break;
|
|
if ( ++b == 512 ) b = 0;
|
|
}
|
|
// if it wasn't found it must be an xml node(or unrecognized html node)
|
|
if ( ! s_hash[b] ) {
|
|
// default is breaking, has back tag and is indexable
|
|
m_isBreaking = true;
|
|
m_hasBackTag = true;
|
|
m_isVisible = true;
|
|
return 1;
|
|
}
|
|
// otherwise extract the isBreaking and the nodeId from the hit bucket
|
|
long n = s_num[b];
|
|
m_hasBackTag = g_nodes [ n ].m_hasBackTag;
|
|
m_isBreaking = g_nodes [ n ].m_isBreaking;
|
|
m_isVisible = g_nodes [ n ].m_isVisible;
|
|
// return the tag/node Id
|
|
return g_nodes [ n ].m_nodeId;
|
|
}
|
|
|
|
long getNumXmlNodes ( ) {
|
|
return (long)sizeof(g_nodes) / sizeof(XmlNode);
|
|
}
|
|
|
|
#include "Words.h" // BACKBITCOMP
|
|
|
|
// . returns the m_isBreaking flag of the g_nodes row for "tagId"
// . tagId is masked with BACKBITCOMP before it is used as the row index
bool isBreakingTagId ( nodeid_t tagId ) {
	const NodeType &row = g_nodes [ tagId & BACKBITCOMP ];
	return row.m_isBreaking;
}
|
|
|
|
// . returns the m_hasBackTag flag of the g_nodes row for "tagId"
// . tagId is masked with BACKBITCOMP before it is used as the row index
bool hasBackTag ( nodeid_t tagId ) {
	const NodeType &row = g_nodes [ tagId & BACKBITCOMP ];
	return row.m_hasBackTag;
}
|