#include "gb-include.h" #include "XmlNode.h" #include "Mem.h" // . Here's a nice list of all the html nodes names, lengths, whether they're // a breaking node or not and their node id // . isVisible is true if text in between front and end tags is visible on page // . isVisible is used by Xml::getText() // . filterKeep is 1 if we should keep it when &strip=1 is given when getting // the cached document. i added this for faisal // . a filterKeep of 0 means remove tag and text between it and its back tag. // . a filterKeep of 1 means keep the tag and text between it and its back tag. // . a filterKeep of 2 means remove tag BUT keep the text between // it and its back tag. NodeType g_nodes[] = { // NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i] // isXml? (the last field) // -------------------------- // -- text node --- 0 {"textNode" , 0, 0, 1, 1,1, TAG_TEXTNODE ,0}, // -- xml tag node --- 1 {"xmlTag" , 1, 1, 1, 2,2, TAG_XMLTAG ,0}, {"A" , 1, 0, 1, 1,1, TAG_A ,0}, {"ABBREV" , 1, 1, 1, 2,2, TAG_ABBREV ,0}, {"ACRONYM" , 1, 1, 1, 2,1, TAG_ACRONYM ,0}, {"ADDRESS" , 1, 1, 1, 2,2, TAG_ADDRESS ,0}, {"APPLET" , 1, 1, 1, 0,0, TAG_APPLET ,0}, {"AREA" , 0, 1, 1, 0,0, TAG_AREA ,0}, {"AU" , 1, 1, 1, 0,0, TAG_AU ,0}, {"AUTHOR" , 1, 1, 1, 0,0, TAG_AUTHOR ,0}, {"B" , 1, 0, 1, 1,1, TAG_B ,0}, {"BANNER" , 1, 1, 1, 0,0, TAG_BANNER ,0}, {"BASE" , 0, 1, 1, 0,0, TAG_BASE ,0}, {"BASEFONT" , 0, 1, 1, 2,2, TAG_BASEFONT ,0}, {"BGSOUND" , 0, 1, 1, 0,0, TAG_BGSOUND ,0}, {"BIG" , 1, 0, 1, 2,1, TAG_BIG ,0}, {"BLINK" , 1, 0, 1, 2,2, TAG_BLINK ,0}, {"BLOCKQUOTE",1, 1, 1, 2,1, TAG_BLOCKQUOTE ,0}, {"BQ" , 1, 1, 1, 0,0, TAG_BQ ,0}, {"BODY" , 1, 1, 1, 1,1, TAG_BODY ,0}, {"BR" , 0, 1, 1, 1,1, TAG_BR ,0}, {"CAPTION" , 1, 1, 1, 2,1, TAG_CAPTION ,0}, {"CENTER" , 1, 1, 1, 1,1, TAG_CENTER ,0}, {"CITE" , 1, 1, 1, 2,1, TAG_CITE ,0}, {"CODE" , 1, 1, 1, 2,1, TAG_CODE ,0}, {"COL" , 1, 1, 1, 2,2, TAG_COL ,0}, {"COLGROUP" , 1, 1, 1, 0,0, TAG_COLGROUP ,0}, {"CREDIT" , 1, 1, 1, 0,0, TAG_CREDIT ,0}, {"DEL" , 1, 1, 1, 2,1, TAG_DEL ,0}, {"DFN" , 1, 1, 1, 2,1, TAG_DFN ,0}, {"DIR" , 1, 1, 1, 0,0, TAG_DIR ,0}, // MDW: wtf, these have back tags! // MDW: ok, i fixed it! {"DIV" , 1, 1, 1, 1,1, TAG_DIV ,0}, {"DL" , 1, 1, 1, 1,1, TAG_DL ,0}, // this may not have a back tag! {"DT" , 1, 1, 1, 1,1, TAG_DT ,0}, // this may not have a back tag! {"DD" , 1, 1, 1, 1,1, TAG_DD ,0}, {"EM" , 1, 0, 1, 2,1, TAG_EM ,0}, // emphasized text {"EMBED" , 0, 1, 1, 0,0, TAG_EMBED ,0}, {"FIG" , 1, 1, 1, 0,0, TAG_FIG ,0}, {"FN" , 1, 1, 1, 0,0, TAG_FN ,0}, {"FONT" , 1, 0, 1, 1,1, TAG_FONT ,0}, {"FORM" , 1, 1, 1, 2,2, TAG_FORM ,0}, // this may not have a back tag! {"FRAME" , 1, 1, 1, 0,0, TAG_FRAME ,0}, {"FRAMESET" , 1, 1, 1, 0,0, TAG_FRAMESET ,0}, {"H1" , 1, 1, 1, 1,1, TAG_H1 ,0}, {"H2" , 1, 1, 1, 1,1, TAG_H2 ,0}, {"H3" , 1, 1, 1, 1,1, TAG_H3 ,0}, {"H4" , 1, 1, 1, 1,1, TAG_H4 ,0}, {"H5" , 1, 1, 1, 1,1, TAG_H5 ,0}, {"H6" , 1, 1, 1, 1,1, TAG_H6 ,0}, {"HEAD" , 1, 1, 1, 1,1, TAG_HEAD ,0}, {"HR" , 0, 1, 1, 1,1, TAG_HR ,0}, {"HTML" , 1, 1, 1, 1,1, TAG_HTML ,0}, {"I" , 1, 0, 1, 2,1, TAG_I ,0}, {"IFRAME" , 1, 1, 1, 2,2, TAG_IFRAME ,0}, // filter = 1,but tag is turned to alt {"IMG" , 0, 1, 1, 1,1, TAG_IMG ,0}, {"INPUT" , 0, 1, 1, 0,0, TAG_INPUT ,0}, {"INS" , 1, 1, 1, 2,1, TAG_INS ,0}, {"ISINDEX" , 0, 1, 1, 0,0, TAG_ISINDEX ,0}, {"KBD" , 1, 1, 1, 2,1, TAG_KBD ,0}, {"LANG" , 1, 1, 1, 0,0, TAG_LANG ,0}, {"LH" , 1, 1, 1, 0,0, TAG_LH ,0}, // this may or may not have a back tag {"LI" , 1, 1, 1, 1,1, TAG_LI ,0}, // this may or may not have a back tag {"LINK" , 0, 1, 1, 0,0, TAG_LINK ,0}, {"LISTING" , 1, 1, 1, 0,0, TAG_LISTING ,0}, {"MAP" , 1, 1, 1, 0,0, TAG_MAP ,0}, // don't index marquee text {"MARQUEE" , 1, 1, 0, 2,2, TAG_MARQUEE ,0}, {"MATH" , 1, 1, 1, 0,0, TAG_MATH ,0}, {"MENU" , 1, 1, 1, 1,1, TAG_MENU ,0}, {"META" , 0, 1, 1, 1,1, TAG_META ,0}, {"MULTICOL" , 0, 1, 1, 0,0, TAG_MULTICOL ,0}, {"NOBR" , 1, 0, 1, 0,0, TAG_NOBR ,0}, {"NOFRAMES" , 1, 1, 1, 0,0, TAG_NOFRAMES ,0}, {"NOTE" , 1, 1, 1, 0,0, TAG_NOTE ,0}, {"OL" , 1, 1, 1, 1,1, TAG_OL ,0}, {"OVERLAY" , 0, 1, 1, 0,0, TAG_OVERLAY ,0}, // this may not have a back tag! {"P" , 0, 1, 1, 1,1, TAG_P ,0}, {"PARAM" , 0, 1, 1, 0,0, TAG_PARAM ,0}, {"PERSON" , 1, 1, 1, 0,0, TAG_PERSON ,0}, {"PLAINTEXT", 1, 1, 1, 0,0, TAG_PLAINTEXT ,0}, {"PRE" , 1, 1, 1, 2,1, TAG_PRE ,0}, {"Q" , 1, 1, 1, 2,1, TAG_Q ,0}, {"RANGE" , 0, 1, 1, 0,0, TAG_RANGE ,0}, {"SAMP" , 1, 1, 1, 2,1, TAG_SAMP ,0}, {"SCRIPT" , 1, 1, 0, 0,0, TAG_SCRIPT ,0}, {"SELECT" , 1, 1, 0, 0,0, TAG_SELECT ,0}, {"SMALL" , 1, 0, 1, 2,1, TAG_SMALL ,0}, {"SPACER" , 0, 1, 1, 2,1, TAG_SPACER ,0}, {"SPOT" , 0, 1, 1, 0,0, TAG_SPOT ,0}, {"STRIKE" , 1, 1, 1, 2,1, TAG_STRIKE ,0}, {"STRONG" , 1, 0, 1, 2,1, TAG_STRONG ,0}, {"SUB" , 1, 0, 1, 2,2, TAG_SUB ,0}, {"SUP" , 1, 0, 1, 2,2, TAG_SUP ,0}, {"TAB" , 0, 1, 1, 0,0, TAG_TAB ,0}, {"TABLE" , 1, 1, 1, 1,1, TAG_TABLE ,0}, {"TBODY" , 1, 1, 1, 1,1, TAG_TBODY ,0}, // this may not have a back tag! {"TD" , 1, 1, 1, 1,1, TAG_TD ,0}, {"TEXTAREA" , 1, 1, 1, 2,2, TAG_TEXTAREA ,0}, {"TEXTFLOW" , 0, 1, 1, 0,0, TAG_TEXTFLOW ,0}, {"TFOOT" , 0, 1, 1, 0,0, TAG_TFOOT ,0}, // this DOES have a back tag {"TH" , 1, 1, 1, 0,0, TAG_TH ,0}, {"THEAD" , 0, 1, 1, 0,0, TAG_THEAD ,0}, {"TITLE" , 1, 1, 1, 1,1, TAG_TITLE ,0}, // this may not have a back tag! {"TR" , 1, 1, 1, 1,1, TAG_TR ,0}, {"TT" , 1, 1, 1, 2,1, TAG_TT ,0}, {"U" , 1, 0, 1, 1,1, TAG_U ,0}, {"UL" , 1, 0, 1, 1,1, TAG_UL ,0}, {"VAR" , 1, 1, 1, 2,1, TAG_VAR ,0}, {"WBR" , 0, 1, 1, 0,0, TAG_WBR ,0}, {"XMP" , 1, 1, 1, 0,0, TAG_XMP ,0}, {"!--" , 0, 1, 1, 0,0, TAG_COMMENT ,0}, // comment tag! {"OPTION" , 0, 1, 1, 2,2, TAG_OPTION ,0}, {"STYLE" , 1, 1, 0, 0,1, TAG_STYLE ,0}, // doctype tag {"DOCTYPE" , 0, 1, 1, 0,0, TAG_DOCTYPE ,0}, // used in office.microsoft.com {"XML" , 0, 1, 1, 0,0, TAG_XML ,0}, // {"START" , 0, 1, 1, 0,0, TAG_START ,0}, {"STOP" , 0, 1, 1, 0,0, TAG_STOP ,0}, // . i added these tags for faisal, but don't really need them // since our XML tag condition handles this case // . we can no longer treat as a generic XML tags since faisal wanted // the strip=2 option {"SPAN" , 1, 0, 1, 2,1, TAG_SPAN ,0}, // not breaking! {"LEGEND" , 1, 1, 1, 2,1, TAG_LEGEND ,0}, {"S" , 1, 1, 1, 2,1, TAG_S ,0}, // strike tag {"ABBR" , 1, 0, 1, 2,1, TAG_ABBR ,0}, {"![CDATA[" , 0, 1, 1, 0,0, TAG_CDATA ,0}, // function call and Links.cpp // ignored it but we are expecting this to be valid! m_isSelfLink = 0; // reset //m_linkNum = -1; // CDATA tag was identified in earlier versions as a text node. Now // it is identified as a CDATA tag node. But gb.conf and others always // pass their version as 0 if ( node[0] == '<' && node[1] == '!' && node[2] == '[' && node[3] == 'C' && node[4] == 'D' && node[5] == 'A' && node[6] == 'T' && node[7] == 'A' && node[8] == '[' ) return setCDATANode ( node ); // if "node" isn't the start of a tag then set it as a Text Node if ( *node != '<' || ! isTagStart ( node ) ) {//, 0, version ) ) { // . set this node as a text node! // . nodeId for text nodes is 0 m_nodeId = 0; m_node = node; m_hasBackTag = false; m_hash = 0; int32_t i = 0; //char inCDATA = 0; // inc i as int32_t as it's NOT the beginning of a tag while ( node[i] && (node[i] != '<' || ! isTagStart ( node+i)))//,versin))) i++; m_nodeLen = i; m_pairTagNum = -1; return m_nodeLen; } // . see if it's a comment (node end is "-->" for comments) // . comments are special cases if ( node[1] == '!' ) { if ( node[2]=='-' && node[3]=='-' ) return setCommentNode ( node ); // this means comment too: // if ( node[2]=='[' ) return setCommentNode2 ( node ); } // . otherwise it's a regular tag // . might be or something though m_nodeLen = getTagLen ( node );//, version ); // . get the node's name's length (i-1) // . node name ends at non alnum char // . we can have hyphens in node name (TODO: even at beginning???) int32_t tagNameStart = 1; // . skip over backslash in the back tags // . or skip over / or ? or ! now // . tag names must start with a letter, fwiw if ( ! is_alnum_a(node[tagNameStart]) /* == '/'*/ ) tagNameStart++; int32_t i = tagNameStart; // skip i to end of tagName. this should only allow ascii chars // to be "tag name chars" for ( ; i < m_nodeLen && is_tagname_char(node[i]) ; i++ ); // set the tagName and tagNameLen m_tagName = &node [ tagNameStart ]; m_tagNameLen = i - tagNameStart; // break point //if ( m_tagNameLen == 3 && m_tagName[0]=='!' && // m_tagName[1]=='-' && m_tagName[2]=='-' ) // fprintf(stderr,"man!"); // . set the node's hash -- used cuz it's faster than strcmp // . just hash the letters as upper case // . tag names are never utf8, so use the ascii ha m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL); // if we're pure xml, don't allow any html tags accept if ( pureXml ) { m_hasBackTag = true; m_isBreaking = true; m_isVisible = true; //m_nodeId = TAG_XMLTAG;//1; // this returns 1 if tag is not in the list m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag , } // . determine if the nodeId for this node // . determine if it breaks lines (for phrasing purposes) else m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag , //&m_isBreaking , &m_isVisible ); // . no back tag if / follow name // . this was only for "pureXml" but now i do it for all tags! if ( m_node [ m_nodeLen - 2 ] == '/' ) m_hasBackTag = false; if ( m_node [ m_nodeLen - 2 ] == '?' ) m_hasBackTag = false; return m_nodeLen; } // . return the length of a node starting at "node" int32_t getTagLen ( char *node ) { // , int32_t version ) { // see if it's not a node //if ( node[0] != '<' ) return 0; // skip over first < int32_t i ; // . keep looping until we hit a < or > OR while we're in quotes // . ignore < and > when they're in quotes for ( i = 1 ; node[i] ; i++ ) { // this switch should speed things up... no! if ( node[i] != '<' && node[i] != '>' && node[i] != '\"' && node[i] != '\'' ) continue; // this is about 1.3 times faster than above (with -O2 on both) //if ( ! is_tag_control_char ( node[i] ) ) continue; if ( node[i] == '<' ) break; if ( node[i] == '>' ) { break; //if ( node[i-1]!='b') break; //if ( i -2 < 0 ) break; //if ( node[i-2]!='g') break; // we had a "gb>" which means that these 3 chars // we originally a > html encoded entity which // we decoded for easier parsing //continue; } //if (version >= 70 && version < 77) continue; // we can have double quotes within single quotes if ( node [ i ] == '\"' ) { // scan back looking for equal sign... int32_t k; for ( k = i - 1 ; k > 1 ; k-- ) { if ( is_wspace_a(node[k]) ) continue; break; } if ( k <= 1 ) continue; // . if an equal sign did not immediately preceed // this double quote then ignore the double quote // . this now fixes the harwoodmuseum.org issue // talked about below if ( node[k] != '=' ) continue; // skip over this first quote i++; while ( node[i] && node[i]!='\"' ) { // crap some pages have unbalanced quotes. // see /test/doc.14541556377486183454.html if ( node[i ]=='>' && node[i-1]=='\"' ) { i--; break; } // like an img tag hits a for // http://www.harwoodmuseum.org/press_deta // il.php?ID=44 // BUT this fucks up // onclick="tb_show('Community Calendar' // on the which is legitamately in quotes //if ( node[i ]=='<' && // node[i+1]=='/' ) { // i--; // break; //} if ( node[i ]=='>' && node[i-1]==' ' && node[i-2]=='\"' ) { i--; break; } // skip this char i++; } // return the length if tag ended abuptly if ( ! node[i] ) return i; // back-to-back quotes? common mistake if ( node[i+1] == '\"' ) i++; continue; } // continue if we don't have a " '" or "='" if ( node [ i ] != '\'' ) continue; if ( node[i-1] != '=' && !is_wspace_a( node[i-1] ) ) continue; // skip to end of quote while ( node[i] && node[i]!='\'' ) i++; } // skip i over the > if ( node[i] == '>' ) i++; // . else we found no closure outside of quotes so be more stringent // . look for closure with regard to quotes else for ( i=1; node[i] && node[i] != '>' && node[i] != '<';i++); // return the LENGTH of the whole node return i ; } int32_t XmlNode::setCommentNode ( char *node ) { m_nodeId = TAG_COMMENT; m_isBreaking = true; m_isVisible = true; m_hasBackTag = false; m_hash = hash64 ( "!--" , 3 , 0LL ); m_node = node; m_tagName = node + 1; // !-- m_tagNameLen = 3; // . compute node length // . TODO: do we have to deal with quotes???? // . TODO: what about nested comments? int32_t i; for ( i = 3 ; node[i] ; i++ ) { if ( node[i] !='>' ) continue; if ( node[i-1] !='-' ) continue; if ( node[i-2] =='-' ) break; } // skip i over the >, if any (could be end of doc) if ( node[i] == '>' ) i++; m_nodeLen = i; return i; } int32_t XmlNode::setCommentNode2 ( char *node ) { m_nodeId = TAG_COMMENT; m_isBreaking = false;//true; m_isVisible = false;//true; m_hasBackTag = false; m_hash = hash64 ( "![" , 2 , 0LL ); m_node = node; m_tagName = node + 1; m_tagNameLen = 2; // . compute node length // . TODO: do we have to deal with quotes???? // . TODO: what about nested comments? int32_t i; for ( i = 2 ; node[i] ; i++ ) { // look for ending of ]> like for if ( node[i] !='>' ) continue; if ( node[i-1] ==']' ) break; // look for ending of --> like for if ( node[i-1] == '-' && node[i-2] == '-' ) break; } // skip i over the >, if any (could be end of doc) if ( node[i] == '>' ) i++; m_nodeLen = i; return i; } int32_t XmlNode::setCDATANode ( char *node ) { m_nodeId = TAG_CDATA; m_isBreaking = true; m_isVisible = true; m_hasBackTag = false; m_hash = hash64 ( "![CDATA[" , 8 , 0LL ); m_node = node; m_tagName = node + 1; // !-- m_tagNameLen = 8; // . compute node length // . TODO: do we have to deal with quotes???? // . TODO: what about nested comments? int32_t i; for ( i = 8 ; node[i] ; i++ ) { // seems like just ]] is good enough! don't need "]]>" //if ( node[i] !='>' ) continue; if ( node[i ] !=']' ) continue; if ( node[i+1] !=']' ) continue;//{ i++; break; } // but skip it if we got it if ( node[i+2] !='>' ) continue; //if ( node[i+2] == '>' ) { i+=3; break;} i += 3; break; // if does not end in '>', skip the ']' anyway // no! hurts regex ending in [0-9] //i+=2; break; } // skip i over the >, if any (could be end of doc) //if ( node[i] == '>' ) i++; m_nodeLen = i; return i; } // Return the value of the specified "field" within this node. // the case of "field" does not matter. char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) { // reset this to 0 *valueLen = 0; // scan for the field name in our node int32_t flen = gbstrlen(field); char inQuotes = '\0'; int32_t i; // scan the characters in the node, looking for the field name in ascii for ( i = 1; i + flen < m_nodeLen ; i++ ) { // skip the field if it's quoted if ( inQuotes) { if (m_node[i] == inQuotes ) inQuotes = 0; continue; } // set inQuotes to the quote if we're in quotes if ( (m_node[i]=='\"' || m_node[i]=='\'')){ inQuotes = m_node[i]; continue; } // a field name must be preceeded by non-alnum if ( is_alnum_a ( m_node[i-1] ) ) continue; // the first character of this field shout match field[0] if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue; // field just be immediately followed by an = or space if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue; // field names must match if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue; // break cuz we got a match for our field name break; } // return NULL if no matching field if ( i + flen >= m_nodeLen ) return NULL; // advance i over the fieldname so it pts to = or space i += flen; // advance i over spaces while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++; // advance over the equal sign, return NULL if does not exist if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL; // advance i over spaces after the equal sign while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++; // now parse out the value of this field (could be in quotes) inQuotes = '\0'; // set inQuotes to the quote if we're in quotes if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++]; // mark this as the start of the value int start=i; // advance i until we hit a space, or we hit a that quote if inQuotes if (inQuotes) { while (i char *e = s; for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++); // hash it for lookup int64_t h = hash64Upper_a ( s , e - s , 0 ); // look it up NodeType **ntp = (NodeType **)s_ht.getValue(&h); // assume none if ( retp ) *retp = NULL; // none? if ( ! ntp ) return 0; // got one if ( retp ) *retp = *ntp; // get id otherwise return (*ntp)->m_nodeId; } // . returns the nodeId // . 0 means not a node // . 1 means it's an xml node // . > 1 is reserved for pre-defined html nodes nodeid_t XmlNode::setNodeInfo ( int64_t nodeHash ){// , char *hasBackTag , //char *isBreaking , char *isVisible ) { /* // sanity check static bool s_init = false; if ( ! s_init ) { s_init = true; // how many NodeTypes do we have in g_nodes? static int32_t nn = sizeof(g_nodes) / sizeof(NodeType); // set the hash table for ( int32_t i = 0 ; i < nn ; i++ ) { // sanity check if(g_nodes[i].m_nodeId != i ) { char *xx=NULL;*xx=0;} } } */ // . we have a list of all node types called "g_nodes" // . each node type is a NodeType struct // . hash all these node types into a hash table by their node name // . we have 108 node names so we'll use 512 buckets // . given the hash of your node name you can look it up in this table static bool s_isHashed = false; static int64_t s_hash [512]; static nodeid_t s_num [512]; // how many NodeTypes do we have in g_nodes? static int32_t s_numNodeTypes = sizeof(g_nodes) / sizeof(NodeType); // we only need to fill in the hash table once since it's static if ( s_isHashed ) goto ready; // clear the hash table memset ( s_hash , 0 , 8*512 ); // set the hash table for ( int32_t i = 0 ; i < s_numNodeTypes ; i++ ) { int64_t h = hash64Upper_a ( g_nodes[i].m_nodeName, gbstrlen(g_nodes[i].m_nodeName),0LL); //int32_t b = (uint64_t)h % 512; int32_t b = (uint64_t)h & 511; // debug msg //fprintf(stderr,"node #%"INT32" has bucket #%"INT32", hash =%"INT64"\n",i,b,h); while ( s_hash[b] ) if ( ++b == 512 ) b = 0; s_hash [ b ] = h; s_num [ b ] = i; } // set this to true so we don't do the hashing again s_isHashed = true; ready: // look up nodeHash in hash table //int32_t b = (uint64_t)nodeHash % 512; int32_t b = (uint64_t)nodeHash & 511; while ( s_hash[b] ) { if ( s_hash[b] == nodeHash ) break; if ( ++b == 512 ) b = 0; } // if it wasn't found it must be an xml node(or unrecognized html node) if ( ! s_hash[b] ) { // default is breaking, has back tag and is indexable m_isBreaking = true; m_hasBackTag = true; m_isVisible = true; return 1; } // otherwise extract the isBreaking and the nodeId from the hit bucket int32_t n = s_num[b]; m_hasBackTag = g_nodes [ n ].m_hasBackTag; m_isBreaking = g_nodes [ n ].m_isBreaking; m_isVisible = g_nodes [ n ].m_isVisible; // return the tag/node Id return g_nodes [ n ].m_nodeId; } int32_t getNumXmlNodes ( ) { return (int32_t)sizeof(g_nodes) / sizeof(XmlNode); } #include "Words.h" // BACKBITCOMP bool isBreakingTagId ( nodeid_t tagId ) { return g_nodes [ tagId & BACKBITCOMP ].m_isBreaking; } bool hasBackTag ( nodeid_t tagId ) { return g_nodes [ tagId & BACKBITCOMP ].m_hasBackTag; }