open-source-search-engine/XmlNode.h

339 lines
8.3 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
#ifndef _XMLNODE_H_
#define _XMLNODE_H_
#include "gb-include.h"
// . an xml node can be text or tag (html or xml tag)
typedef int16_t nodeid_t;
// . get how many xml/html tags we have classified in our g_nodes[] array
// . used by Weights.cpp
2014-11-11 01:45:11 +03:00
int32_t getNumXmlNodes ( ) ;
2013-08-03 00:12:24 +04:00
bool isBreakingTagId ( nodeid_t tagId ) ;
bool hasBackTag ( nodeid_t tagId ) ;
2014-11-11 01:45:11 +03:00
int32_t getTagLen ( char *node ) ;
bool isTagStart ( char *s );//, int32_t i, int32_t version ) ;
2013-08-03 00:12:24 +04:00
// s points to tag name - first char
nodeid_t getTagId ( char *s , class NodeType **retp = NULL );
class XmlNode {
public:
friend class Xml; // needs to access our private parts ;)
friend class XmlDoc; // needs to access our private parts ;)
bool isText () { return m_nodeId == 0; };
bool isTag () { return m_nodeId > 0; };
bool isHtmlTag () { return m_nodeId > 1; };
bool isXmlTag () { return m_nodeId == 1; };
nodeid_t getNodeId () { return m_nodeId; };
2014-10-30 22:36:39 +03:00
int64_t getNodeHash() { return m_hash; };
2013-08-03 00:12:24 +04:00
char *getNode () { return m_node; };
// m_nodeLen is in bytes
2014-11-11 01:45:11 +03:00
int32_t getNodeLen () { return m_nodeLen; };
//int32_t getXmlParent () { return m_xmlParentTagNum; };
2013-08-03 00:12:24 +04:00
bool isBreaking () { return m_isBreaking; };
bool isVisible () { return m_isVisible; };
bool hasBackTag () { return m_hasBackTag; };
// exclude meta tags and comment tags (they are not front or back)
bool isFrontTag () {
return m_nodeId > 0 && m_node[1]!='/' &&
m_nodeId != 68 && m_nodeId != 109; };
// . get the value of a field like "href" in the <a href="blah"> tag
2014-11-11 01:45:11 +03:00
char *getFieldValue ( char *fieldName , int32_t *valueLen );
2013-08-03 00:12:24 +04:00
// . used exclusively by Xml class which contains an array of XmlNodes
// . "node" points to the beginning of the node, the '<' if it's a tag
// . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId
// . returns the length of the node
// . pureXml is true if node cannot be an html tag, except comment
2014-11-11 01:45:11 +03:00
//int32_t set ( char *node , bool pureXml );
int32_t set ( char *node , bool pureXml , int32_t version );
2013-08-03 00:12:24 +04:00
// private:
// . called by set() to get the length of a tag node
2014-11-11 01:45:11 +03:00
//int32_t getTagLen ( char *node , int32_t version);
//int32_t getTagLen ( UChar *node , int32_t version );
2013-08-03 00:12:24 +04:00
// . called by set() to get the length of a TEXT node (and set it)
2014-11-11 01:45:11 +03:00
//int32_t setTextNode ( char *node );
2013-08-03 00:12:24 +04:00
// . called by set() to get the length of a COMMENT node (and set it)
2014-11-11 01:45:11 +03:00
int32_t setCommentNode ( char *node );
//int32_t setCommentNode ( UChar *node );
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
int32_t setCommentNode2 ( char *node );
2013-08-03 00:12:24 +04:00
// . called by set() to get the length of a CDATA node (and set it)
2014-11-11 01:45:11 +03:00
int32_t setCDATANode ( char *node );
//int32_t setCDATANode ( UChar *node );
2013-08-03 00:12:24 +04:00
// . called by set() to get nodeId and isBreaking of a tag node
// . returns the nodeId
2014-10-30 22:36:39 +03:00
nodeid_t setNodeInfo ( int64_t nodeHash );
2013-08-03 00:12:24 +04:00
char *m_node; // tag data, or text data if not a tag
2014-11-11 01:45:11 +03:00
int32_t m_nodeLen; // m_nodeLen is in bytes
2013-08-03 00:12:24 +04:00
char *m_tagName; // iff this node is a tag
2014-11-11 01:45:11 +03:00
int32_t m_tagNameLen;
2014-10-30 22:36:39 +03:00
int64_t m_hash; // iff this node is a tag
//int64_t m_compoundHash; // set by Xml class
2014-11-11 01:45:11 +03:00
//int32_t m_parentTagNum; // set by Xml class
//int32_t m_xmlParentTagNum; // set by Xml class
int16_t m_depth; // set by Xml class (xml depth only)
2013-08-03 00:12:24 +04:00
nodeid_t m_nodeId; // 0 for text,1 for xml tag, 1+ for html
char m_hasBackTag:1;
char m_isBreaking:1; // does tag (if it is) line break?
char m_isVisible:1;
char m_isSelfLink:1; // an a href tag link to self?
2014-11-11 01:45:11 +03:00
int32_t m_pairTagNum; // paired opening or closing tag
2013-08-03 00:12:24 +04:00
// . "m_linkNum" references a link in Links.cpp
// . use for <a href> xml nodes only right now
// . used so XmlDoc.cpp::getContactUsLink() works better
2014-11-11 01:45:11 +03:00
//int32_t m_linkNum;
2014-09-28 19:39:46 +04:00
class XmlNode *m_parent;
2013-08-03 00:12:24 +04:00
};
// . does "s" start a tag? (regular tag , back tag or comment tag)
2014-11-11 01:45:11 +03:00
inline bool isTagStart ( char *s ) { // , int32_t i, int32_t version ) {
2013-08-03 00:12:24 +04:00
// it must start with < to be a tag
if ( *s != '<' ) return false;
// a <gb is a fake tag because we now decode all html entites
// so in htmlDecode() in fctypes.cpp we decode &lt; to
// "<gb"
//if ( s[i+1]=='g' && s[i+2]=='b') return false;
// minimal tag is 3 chars
// if ( !s[ii + 2 >= len ) return false;
// next char can be an alnum, !-- or / then alnum
if ( is_alnum_a ( s[1] ) ) return true;
// next char can be 1 of 3 things to be a tag
//switch ( s[1] ) {
// / is also acceptable, followed only by an alnum or >
if ( s[1]== '/' ) {
if ( is_alnum_a(s[2]) ) return true;
if ( s[2] == '>' ) return true;
return false;
}
// office.microsoft.com uses <?xml ...?> tags
if ( s[1]=='?' ) {
if ( is_alnum_a(s[2]) ) return true;
//if ( s[2] == '>' ) return true; <?> is tag???
return false;
}
// make sure the double hyphens follow the ! or alnum
if ( s[1]=='!' ) {
// this is for <!xml> i guess
if ( is_alnum_a(s[2]) ) return true;
// and the <![CDATA[
if ( s[2]=='[' && s[3]=='C' && s[4]=='D' &&
s[5]=='A' && s[6]=='T' && s[7]=='A' &&
s[8]=='[' ) return true;
// and the <!-- comment here--> famous comment tag
if ( s[2]=='-' && s[3]=='-' ) return true;
// and <![....]> i've seen too
// <![if gt IE 6]><script>.... for waterfordcoc.org
if ( s[2] == '[' ) return true;
}
return false;
};
// Now set up a structure for describing ALL the available HTML nodes.
// . Each HTML node has a name, name length, does it break a word?
// a format bit. (most HTML tags have 0 for their format bit
// because we really don't care about what they do -- we use format
// bits for extracting title, summaries, et al.
// . the is indexable is false for tags like <script> <option> whose contents
// are not visible/indexable
class NodeType {
public:
2013-08-03 00:12:24 +04:00
char *m_nodeName;
bool m_hasBackTag;
char m_isBreaking;
char m_isVisible;
char m_filterKeep1; // for &strip=1 option
char m_filterKeep2; // for &strip=2 option
nodeid_t m_nodeId;
char m_isXmlTag;
};
extern class NodeType g_nodes[];
inline char *getTagName ( nodeid_t tagId ) {return g_nodes[tagId].m_nodeName;};
// . each tag has a number
enum {
TAG_TEXTNODE = 0,
TAG_XMLTAG,
TAG_A,
TAG_ABBREV,
TAG_ACRONYM,
TAG_ADDRESS,
TAG_APPLET,
TAG_AREA,
TAG_AU,
TAG_AUTHOR,
TAG_B, // 10
TAG_BANNER,
TAG_BASE,
TAG_BASEFONT,
TAG_BGSOUND,
TAG_BIG,
TAG_BLINK,
TAG_BLOCKQUOTE,
TAG_BQ,
TAG_BODY,
TAG_BR, // 20
TAG_CAPTION,
TAG_CENTER,
TAG_CITE,
TAG_CODE,
TAG_COL,
TAG_COLGROUP,
TAG_CREDIT,
TAG_DEL,
TAG_DFN,
TAG_DIR, // 30
TAG_DIV,
TAG_DL,
TAG_DT,
TAG_DD,
TAG_EM,
TAG_EMBED,
TAG_FIG,
TAG_FN,
TAG_FONT,
TAG_FORM, // 40
TAG_FRAME,
TAG_FRAMESET,
TAG_H1,
TAG_H2,
TAG_H3,
TAG_H4,
TAG_H5,
TAG_H6,
TAG_HEAD,
TAG_HR, // 50
TAG_HTML,
TAG_I,
TAG_IFRAME,
TAG_IMG,
TAG_INPUT,
TAG_INS,
TAG_ISINDEX,
TAG_KBD,
TAG_LANG,
TAG_LH, // 60
TAG_LI,
TAG_LINK,
TAG_LISTING,
TAG_MAP,
TAG_MARQUEE,
TAG_MATH,
TAG_MENU,
TAG_META,
TAG_MULTICOL,
TAG_NOBR, // 70
TAG_NOFRAMES,
TAG_NOTE,
TAG_OL,
TAG_OVERLAY,
TAG_P,
TAG_PARAM,
TAG_PERSON,
TAG_PLAINTEXT,
TAG_PRE,
TAG_Q, // 80
TAG_RANGE,
TAG_SAMP,
TAG_SCRIPT,
TAG_SELECT,
TAG_SMALL,
TAG_SPACER,
TAG_SPOT,
TAG_STRIKE,
TAG_STRONG,
TAG_SUB, // 90
TAG_SUP,
TAG_TAB,
TAG_TABLE,
TAG_TBODY,
2013-08-03 00:12:24 +04:00
TAG_TD,
TAG_TEXTAREA,
TAG_TEXTFLOW,
TAG_TFOOT,
TAG_TH,
TAG_THEAD, // 100
TAG_TITLE,
TAG_TR,
TAG_TT,
2013-08-03 00:12:24 +04:00
TAG_U,
TAG_UL,
TAG_VAR,
TAG_WBR,
TAG_XMP,
TAG_COMMENT,
2013-08-03 00:12:24 +04:00
TAG_OPTION, // 110
TAG_STYLE,
TAG_DOCTYPE,
TAG_XML,
TAG_START,
TAG_STOP,
TAG_SPAN,
TAG_LEGEND,
TAG_S,
2013-08-03 00:12:24 +04:00
TAG_ABBR,
TAG_CDATA, // 120
TAG_NOSCRIPT,
TAG_FIELDSET,
TAG_FBORIGLINK, // "feedburner:origlink" special feedburner link
TAG_RDF , // rdf:RDF
TAG_RSS , // rss
TAG_FEED , // atom feed tag
TAG_ITEM,
TAG_ENTRY,
TAG_CHANNEL,
TAG_ENCLOSURE,
TAG_WEBLOG,
// a tag we insert in XmlDoc.cpp to indicate expanded frame/iframe src
TAG_GBFRAME,
TAG_TC,
TAG_GBXMLTITLE,
// facebook xml tags
TAG_FBSTARTTIME, // 135
TAG_FBENDTIME, // 136
TAG_FBNAME,
TAG_FBPICSQUARE,
TAG_FBHIDEGUESTLIST,
// . do not parse this up into words!! it is text in <script> tags
// . consider it a whole tag i guess
TAG_SCRIPTTEXT,
TAG_BUTTON,
TAG_URLFROM, // for ahrefs.com
// support sitemap.xml
TAG_LOC,
//
// fake tags below here
//
2013-08-03 00:12:24 +04:00
// a fake tag used by Sections.cpp
TAG_SENTENCE,
LAST_TAG
};
#endif