#ifndef _XMLNODE_H_
#define _XMLNODE_H_
#include "gb-include.h"
// . a node in a parsed document is either plain text or a tag (html or xml)
// . the id tells them apart: XmlNode::isText() tests for 0, isXmlTag()
//   for 1 and isHtmlTag() for anything greater than 1
typedef int16_t nodeid_t;

// . number of xml/html tags classified in the g_nodes[] array
// . Weights.cpp is the consumer of this count
int32_t getNumXmlNodes();

// . per-tag-id properties, looked up by id rather than on a parsed node
// . NOTE(review): presumably these feed XmlNode::m_isBreaking and
//   XmlNode::m_hasBackTag -- confirm against XmlNode.cpp
bool isBreakingTagId(nodeid_t tagId);
bool hasBackTag(nodeid_t tagId);

// . does "s" begin a tag? (regular tag, back tag or comment tag)
// . the inline definition lives further down in this header
bool isTagStart(char *s);

// . length of the tag node that "node" points to
// . NOTE(review): units are presumably bytes, like m_nodeLen -- confirm
int32_t getTagLen(char *node);

// . "s" points to the first char of the tag name
// . "retp" is optional; NOTE(review): it presumably receives the matching
//   NodeType entry -- confirm against XmlNode.cpp
nodeid_t getTagId(char *s, class NodeType **retp = NULL);
// . one node of a parsed xml/html document: either a run of text or a tag
// . the Xml class contains an array of these and fills each one via set()
// . every member below is public (the "private:" label was commented out
//   long ago); Xml and XmlDoc are declared friends as well
// . all read-only accessors are const so they can be used through a
//   const XmlNode reference or pointer
class XmlNode {
public:
	friend class Xml;    // needs to access our private parts ;)
	friend class XmlDoc; // needs to access our private parts ;)

	// . node classification, derived purely from m_nodeId:
	//   0 = text, 1 = xml tag, >1 = html tag
	bool isText    () const { return m_nodeId == 0; }
	bool isTag     () const { return m_nodeId >  0; }
	bool isHtmlTag () const { return m_nodeId >  1; }
	bool isXmlTag  () const { return m_nodeId == 1; }

	nodeid_t getNodeId   () const { return m_nodeId; }
	int64_t  getNodeHash () const { return m_hash; }

	// . raw node data and its length in bytes
	char    *getNode    () const { return m_node; }
	int32_t  getNodeLen () const { return m_nodeLen; }

	// . flag accessors; each just reports the corresponding bit-field
	bool isBreaking () const { return m_isBreaking; }
	bool isVisible  () const { return m_isVisible; }
	bool hasBackTag () const { return m_hasBackTag; }

	// . true for a tag whose second char is not '/' (i.e. not a back
	//   tag) and whose id is neither 68 nor 109
	// . per the original note, meta tags and comment tags are excluded
	//   because they are neither front nor back tags
	// . NOTE(review): 68 and 109 are presumably the meta and comment
	//   tag ids -- confirm and replace with named constants
	// . reads m_node[1] only once m_nodeId > 0 has been established
	bool isFrontTag () const {
		return m_nodeId > 0 &&
		       m_node[1] != '/' &&
		       m_nodeId != 68 &&
		       m_nodeId != 109;
	}

	// . get the value of a field like "href" in the tag
	// . NOTE(review): "valueLen" presumably receives the value's
	//   length -- confirm against XmlNode.cpp
	char *getFieldValue ( char *fieldName , int32_t *valueLen );

	// . used exclusively by the Xml class, which contains an array of
	//   XmlNodes
	// . "node" points to the beginning of the node, the '<' if it's a tag
	// . sets m_node, m_nodeLen, m_hash, m_isBreaking and m_nodeId
	// . returns the length of the node
	// . "pureXml" is true if node cannot be an html tag, except comment
	// . NOTE(review): the meaning of "version" is not visible from this
	//   header
	int32_t set ( char *node , bool pureXml , int32_t version );

	// . called by set() to get the length of a COMMENT node (and set it)
	int32_t setCommentNode  ( char *node );
	// . NOTE(review): undocumented variant of setCommentNode(); see
	//   XmlNode.cpp for how it differs
	int32_t setCommentNode2 ( char *node );

	// . called by set() to get the length of a CDATA node (and set it)
	int32_t setCDATANode ( char *node );

	// . called by set() to get nodeId and isBreaking of a tag node
	// . returns the nodeId
	nodeid_t setNodeInfo ( int64_t nodeHash );

	// . data members; order and types are kept exactly as they were so
	//   the object layout does not change
	char     *m_node;       // tag data, or text data if not a tag
	int32_t   m_nodeLen;    // in bytes
	char     *m_tagName;    // iff this node is a tag
	int32_t   m_tagNameLen;
	int64_t   m_hash;       // iff this node is a tag
	int16_t   m_depth;      // set by Xml class (xml depth only)
	nodeid_t  m_nodeId;     // 0 for text, 1 for xml tag, >1 for html tag

	// . one-bit flags
	// . NOTE(review): the signedness of a plain "char" bit-field is
	//   implementation-defined; where it is signed, a set flag reads
	//   back as -1, not 1, so only test these for truthiness
	char m_hasBackTag:1;
	char m_isBreaking:1; // does tag (if it is) line break?
	char m_isVisible:1;
	char m_isSelfLink:1; // an a href tag link to self?

	int32_t m_pairTagNum; // paired opening or closing tag

	// . parent node; nothing in this header assigns it
	class XmlNode *m_parent;
};
// . does "s" start a tag? (regular tag , back tag or comment tag)
inline bool isTagStart ( char *s ) { // , int32_t i, int32_t version ) {
// it must start with < to be a tag
if ( *s != '<' ) return false;
// a = len ) return false;
// next char can be an alnum, !-- or / then alnum
if ( is_alnum_a ( s[1] ) ) return true;
// next char can be 1 of 3 things to be a tag
//switch ( s[1] ) {
// / is also acceptable, followed only by an alnum or >
if ( s[1]== '/' ) {
if ( is_alnum_a(s[2]) ) return true;
if ( s[2] == '>' ) return true;
return false;
}
// office.microsoft.com uses <?...> tags (e.g. <?xml ...>)
if ( s[1]=='?' ) {
if ( is_alnum_a(s[2]) ) return true;
//if ( s[2] == '>' ) return true; > is tag???
return false;
}
// make sure the double hyphens follow the ! or alnum
if ( s[1]=='!' ) {
// this is for tags like <!DOCTYPE ...> i guess
if ( is_alnum_a(s[2]) ) return true;
// and the famous comment tag
if ( s[2]=='-' && s[3]=='-' ) return true;
// and i've seen too
//