#ifndef _XMLNODE_H_ #define _XMLNODE_H_ #include "gb-include.h" // . an xml node can be text or tag (html or xml tag) typedef int16_t nodeid_t; // . get how many xml/html tags we have classified in our g_nodes[] array // . used by Weights.cpp int32_t getNumXmlNodes ( ) ; bool isBreakingTagId ( nodeid_t tagId ) ; bool hasBackTag ( nodeid_t tagId ) ; int32_t getTagLen ( char *node ) ; bool isTagStart ( char *s );//, int32_t i, int32_t version ) ; // s points to tag name - first char nodeid_t getTagId ( char *s , class NodeType **retp = NULL ); class XmlNode { public: friend class Xml; // needs to access our private parts ;) friend class XmlDoc; // needs to access our private parts ;) bool isText () { return m_nodeId == 0; }; bool isTag () { return m_nodeId > 0; }; bool isHtmlTag () { return m_nodeId > 1; }; bool isXmlTag () { return m_nodeId == 1; }; nodeid_t getNodeId () { return m_nodeId; }; int64_t getNodeHash() { return m_hash; }; char *getNode () { return m_node; }; // m_nodeLen is in bytes int32_t getNodeLen () { return m_nodeLen; }; //int32_t getXmlParent () { return m_xmlParentTagNum; }; bool isBreaking () { return m_isBreaking; }; bool isVisible () { return m_isVisible; }; bool hasBackTag () { return m_hasBackTag; }; // exclude meta tags and comment tags (they are not front or back) bool isFrontTag () { return m_nodeId > 0 && m_node[1]!='/' && m_nodeId != 68 && m_nodeId != 109; }; // . get the value of a field like "href" in the tag char *getFieldValue ( char *fieldName , int32_t *valueLen ); // . used exclusively by Xml class which contains an array of XmlNodes // . "node" points to the beginning of the node, the '<' if it's a tag // . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId // . returns the length of the node // . pureXml is true if node cannot be an html tag, except comment //int32_t set ( char *node , bool pureXml ); int32_t set ( char *node , bool pureXml , int32_t version ); // private: // . called by set() to get the length of a tag node //int32_t getTagLen ( char *node , int32_t version); //int32_t getTagLen ( UChar *node , int32_t version ); // . called by set() to get the length of a TEXT node (and set it) //int32_t setTextNode ( char *node ); // . called by set() to get the length of a COMMENT node (and set it) int32_t setCommentNode ( char *node ); //int32_t setCommentNode ( UChar *node ); int32_t setCommentNode2 ( char *node ); // . called by set() to get the length of a CDATA node (and set it) int32_t setCDATANode ( char *node ); //int32_t setCDATANode ( UChar *node ); // . called by set() to get nodeId and isBreaking of a tag node // . returns the nodeId nodeid_t setNodeInfo ( int64_t nodeHash ); char *m_node; // tag data, or text data if not a tag int32_t m_nodeLen; // m_nodeLen is in bytes char *m_tagName; // iff this node is a tag int32_t m_tagNameLen; int64_t m_hash; // iff this node is a tag //int64_t m_compoundHash; // set by Xml class //int32_t m_parentTagNum; // set by Xml class //int32_t m_xmlParentTagNum; // set by Xml class int16_t m_depth; // set by Xml class (xml depth only) nodeid_t m_nodeId; // 0 for text,1 for xml tag, 1+ for html char m_hasBackTag:1; char m_isBreaking:1; // does tag (if it is) line break? char m_isVisible:1; char m_isSelfLink:1; // an a href tag link to self? int32_t m_pairTagNum; // paired opening or closing tag // . "m_linkNum" references a link in Links.cpp // . use for xml nodes only right now // . used so XmlDoc.cpp::getContactUsLink() works better //int32_t m_linkNum; class XmlNode *m_parent; }; // . does "s" start a tag? (regular tag , back tag or comment tag) inline bool isTagStart ( char *s ) { // , int32_t i, int32_t version ) { // it must start with < to be a tag if ( *s != '<' ) return false; // a = len ) return false; // next char can be an alnum, !-- or / then alnum if ( is_alnum_a ( s[1] ) ) return true; // next char can be 1 of 3 things to be a tag //switch ( s[1] ) { // / is also acceptable, followed only by an alnum or > if ( s[1]== '/' ) { if ( is_alnum_a(s[2]) ) return true; if ( s[2] == '>' ) return true; return false; } // office.microsoft.com uses tags if ( s[1]=='?' ) { if ( is_alnum_a(s[2]) ) return true; //if ( s[2] == '>' ) return true; is tag??? return false; } // make sure the double hyphens follow the ! or alnum if ( s[1]=='!' ) { // this is for i guess if ( is_alnum_a(s[2]) ) return true; // and the famous comment tag if ( s[2]=='-' && s[3]=='-' ) return true; // and i've seen too //