open-source-search-engine/XmlNode.h
Matt a54471849b sitemap.xml support for harvesting loc urls.
parse xml docs as pure xml again but set nodeid
to TAG_LINK etc. so Linkdb.cpp can get links again.
added isparentsitemap url filter to prioritize urls
from sitemaps. added isrssext to url filters to
prioritize new possible rss feed urls. added numinlinks
to url filters to prioritize popular urls for spidering.
use those filters in default web filter set.
fix filters that delete urls from the index using
the 'DELETE' priority. they weren't getting deleted.
2015-03-17 14:26:16 -06:00

339 lines
8.3 KiB
C++

#ifndef _XMLNODE_H_
#define _XMLNODE_H_
#include "gb-include.h"
// . an xml node can be text or tag (html or xml tag)
// . nodeid_t is the numeric id of a node: XmlNode below treats 0 as text,
//   1 as a generic xml tag and anything greater than 1 as an html tag
typedef int16_t nodeid_t;
// . get how many xml/html tags we have classified in our g_nodes[] array
// . used by Weights.cpp
int32_t getNumXmlNodes ( ) ;
// . per-tag-id property lookups (defined outside this header)
// . NOTE(review): presumably these read the matching NodeType fields
//   (m_isBreaking / m_hasBackTag) of g_nodes[tagId] -- confirm in XmlNode.cpp
bool isBreakingTagId ( nodeid_t tagId ) ;
bool hasBackTag ( nodeid_t tagId ) ;
// . "node" presumably points at the '<' that starts a tag; the return value
//   is presumably the tag's length in bytes -- confirm in XmlNode.cpp
int32_t getTagLen ( char *node ) ;
// . does "s" start a tag? defined inline further down in this header
// . used to take an offset and a version as well; now only takes the pointer
bool isTagStart ( char *s );//, int32_t i, int32_t version ) ;
// s points to tag name - first char
// . "retp" is optional (defaults to NULL)
// . NOTE(review): presumably receives the matching NodeType entry -- confirm
nodeid_t getTagId ( char *s , class NodeType **retp = NULL );
// . one node of a parsed xml/html document: either a run of text or a tag
// . the Xml class holds an array of these and fills them in via set()
// . all members are effectively public (the "private:" label is disabled)
// . the read-only accessors are const so they can be used through a
//   const XmlNode reference or pointer
class XmlNode {

 public:

	friend class Xml;    // needs to access our private parts ;)
	friend class XmlDoc; // needs to access our private parts ;)

	// . node classification, all derived from m_nodeId
	// . 0 = text, 1 = generic xml tag, >1 = html tag
	bool      isText      () const { return m_nodeId == 0; }
	bool      isTag       () const { return m_nodeId >  0; }
	bool      isHtmlTag   () const { return m_nodeId >  1; }
	bool      isXmlTag    () const { return m_nodeId == 1; }

	nodeid_t  getNodeId   () const { return m_nodeId; }
	int64_t   getNodeHash () const { return m_hash; }
	char     *getNode     () const { return m_node; }

	// m_nodeLen is in bytes
	int32_t   getNodeLen  () const { return m_nodeLen; }

	bool      isBreaking  () const { return m_isBreaking; }
	bool      isVisible   () const { return m_isVisible; }
	bool      hasBackTag  () const { return m_hasBackTag; }

	// . true for an opening tag, i.e. a tag whose 2nd char is not '/'
	// . exclude meta tags and comment tags (they are not front or back)
	// . 68 and 109 are the values of TAG_META and TAG_COMMENT in the tag
	//   enum at the bottom of this file; that enum is declared after this
	//   class so the names cannot be used here
	bool isFrontTag () const {
		if ( m_nodeId <= 0     ) return false;
		if ( m_node[1] == '/'  ) return false;
		return m_nodeId != 68 && m_nodeId != 109;
	}

	// . get the value of a field like "href" in the <a href="blah"> tag
	char *getFieldValue ( char *fieldName , int32_t *valueLen );

	// . used exclusively by Xml class which contains an array of XmlNodes
	// . "node" points to the beginning of the node, the '<' if it's a tag
	// . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId
	// . returns the length of the node
	// . pureXml is true if node cannot be an html tag, except comment
	int32_t set ( char *node , bool pureXml , int32_t version );

	// private:

	// . called by set() to get the length of a COMMENT node (and set it)
	int32_t setCommentNode  ( char *node );
	int32_t setCommentNode2 ( char *node );

	// . called by set() to get the length of a CDATA node (and set it)
	int32_t setCDATANode ( char *node );

	// . called by set() to get nodeId and isBreaking of a tag node
	// . returns the nodeId
	nodeid_t setNodeInfo ( int64_t nodeHash );

	char     *m_node;       // tag data, or text data if not a tag
	int32_t   m_nodeLen;    // m_nodeLen is in bytes
	char     *m_tagName;    // iff this node is a tag
	int32_t   m_tagNameLen;
	int64_t   m_hash;       // iff this node is a tag
	int16_t   m_depth;      // set by Xml class (xml depth only)
	nodeid_t  m_nodeId;     // 0 for text, 1 for xml tag, >1 for html tag

	// one-bit flags
	char      m_hasBackTag:1;
	char      m_isBreaking:1; // does tag (if it is) line break?
	char      m_isVisible:1;
	char      m_isSelfLink:1; // an a href tag link to self?

	int32_t   m_pairTagNum;   // paired opening or closing tag

	class XmlNode *m_parent;
};
// . does "s" start a tag? (regular tag , back tag or comment tag)
// . "s" must be NUL-terminated; we only look past a char once it has matched
// . accepted forms, where X is an alnum as judged by is_alnum_a():
//     <X...      regular tag
//     </X...     back tag, and the bare "</>"
//     <?X...     processing instruction, e.g. <?xml ...?>
//     <!X...     e.g. <!doctype ...>
//     <!--...    comment
//     <![...     <![CDATA[ ... and conditional blocks like <![if gt IE 6]>
inline bool isTagStart ( char *s ) {
	// it must start with < to be a tag
	if ( s[0] != '<' ) return false;
	// an alnum right after the < means a regular tag
	if ( is_alnum_a ( s[1] ) ) return true;
	// otherwise the 2nd char must be one of a few markers
	switch ( s[1] ) {
	case '/':
		// back tag: followed only by an alnum or >
		return is_alnum_a ( s[2] ) || s[2] == '>';
	case '?':
		// office.microsoft.com uses <?xml ...?> tags
		// . a bare <?> is not treated as a tag
		return is_alnum_a ( s[2] );
	case '!':
		// this is for <!xml> i guess
		if ( is_alnum_a ( s[2] ) ) return true;
		// <![CDATA[ as well as any other <![....]> construct, like
		// <![if gt IE 6]><script>.... for waterfordcoc.org
		if ( s[2] == '[' ) return true;
		// and the <!-- comment here--> famous comment tag
		return s[2] == '-' && s[3] == '-';
	}
	return false;
}
// Now set up a structure for describing ALL the available HTML nodes.
// . Each HTML node has a name, a flag saying whether it has a back tag,
//   a flag saying whether it breaks a word, and a visibility flag.
// . The visibility flag is false for tags like <script> <option> whose
//   contents are not visible/indexable.
// . One NodeType describes one tag; the table of them is g_nodes[].
// . NOTE(review): g_nodes[] is presumably brace-initialized in field order
//   in XmlNode.cpp, so do not reorder these members without checking.
class NodeType {
public:
// tag name
char *m_nodeName;
// NOTE(review): presumably true if the tag has a closing form -- confirm
bool m_hasBackTag;
// does the tag break a word/line?
char m_isBreaking;
// false for tags whose contents are not visible/indexable
char m_isVisible;
char m_filterKeep1; // for &strip=1 option
char m_filterKeep2; // for &strip=2 option
// numeric tag id (see the TAG_* enum in this header)
nodeid_t m_nodeId;
// NOTE(review): presumably marks tags that only occur in xml -- confirm
char m_isXmlTag;
};
// the table of all known tags; getTagName() indexes it by tag id
extern class NodeType g_nodes[];

// . return the name stored for tag id "tagId"
// . "tagId" is used directly as an index into g_nodes[] with no range
//   check, so the caller must pass a valid id
inline char *getTagName ( nodeid_t tagId ) {
	class NodeType *entry = &g_nodes[tagId];
	return entry->m_nodeName;
}
// . each tag has a number
// . ids are consecutive starting at 0; every tenth id is written out
//   explicitly so a tag's numeric value can be read straight off the list
// . XmlNode::isFrontTag() hard-codes 68 (TAG_META) and 109 (TAG_COMMENT),
//   so those values must not shift
enum {
	// 0 - 9
	TAG_TEXTNODE = 0, TAG_XMLTAG, TAG_A, TAG_ABBREV, TAG_ACRONYM,
	TAG_ADDRESS, TAG_APPLET, TAG_AREA, TAG_AU, TAG_AUTHOR,
	// 10 - 19
	TAG_B = 10, TAG_BANNER, TAG_BASE, TAG_BASEFONT, TAG_BGSOUND,
	TAG_BIG, TAG_BLINK, TAG_BLOCKQUOTE, TAG_BQ, TAG_BODY,
	// 20 - 29
	TAG_BR = 20, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE,
	TAG_COL, TAG_COLGROUP, TAG_CREDIT, TAG_DEL, TAG_DFN,
	// 30 - 39
	TAG_DIR = 30, TAG_DIV, TAG_DL, TAG_DT, TAG_DD,
	TAG_EM, TAG_EMBED, TAG_FIG, TAG_FN, TAG_FONT,
	// 40 - 49
	TAG_FORM = 40, TAG_FRAME, TAG_FRAMESET, TAG_H1, TAG_H2,
	TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD,
	// 50 - 59
	TAG_HR = 50, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG,
	TAG_INPUT, TAG_INS, TAG_ISINDEX, TAG_KBD, TAG_LANG,
	// 60 - 69
	TAG_LH = 60, TAG_LI, TAG_LINK, TAG_LISTING, TAG_MAP,
	TAG_MARQUEE, TAG_MATH, TAG_MENU, TAG_META, TAG_MULTICOL,
	// 70 - 79
	TAG_NOBR = 70, TAG_NOFRAMES, TAG_NOTE, TAG_OL, TAG_OVERLAY,
	TAG_P, TAG_PARAM, TAG_PERSON, TAG_PLAINTEXT, TAG_PRE,
	// 80 - 89
	TAG_Q = 80, TAG_RANGE, TAG_SAMP, TAG_SCRIPT, TAG_SELECT,
	TAG_SMALL, TAG_SPACER, TAG_SPOT, TAG_STRIKE, TAG_STRONG,
	// 90 - 99
	TAG_SUB = 90, TAG_SUP, TAG_TAB, TAG_TABLE, TAG_TBODY,
	TAG_TD, TAG_TEXTAREA, TAG_TEXTFLOW, TAG_TFOOT, TAG_TH,
	// 100 - 109
	TAG_THEAD = 100, TAG_TITLE, TAG_TR, TAG_TT, TAG_U,
	TAG_UL, TAG_VAR, TAG_WBR, TAG_XMP, TAG_COMMENT,
	// 110 - 119
	TAG_OPTION = 110, TAG_STYLE, TAG_DOCTYPE, TAG_XML, TAG_START,
	TAG_STOP, TAG_SPAN, TAG_LEGEND, TAG_S, TAG_ABBR,
	// 120 - 122
	TAG_CDATA = 120, TAG_NOSCRIPT, TAG_FIELDSET,
	// 123 - 131
	TAG_FBORIGLINK, // "feedburner:origlink" special feedburner link
	TAG_RDF,        // rdf:RDF
	TAG_RSS,        // rss
	TAG_FEED,       // atom feed tag
	TAG_ITEM, TAG_ENTRY, TAG_CHANNEL, TAG_ENCLOSURE, TAG_WEBLOG,
	// 132: a tag we insert in XmlDoc.cpp to indicate expanded
	// frame/iframe src
	TAG_GBFRAME,
	// 133 - 134
	TAG_TC, TAG_GBXMLTITLE,
	// 135 - 139: facebook xml tags
	TAG_FBSTARTTIME = 135, TAG_FBENDTIME, TAG_FBNAME, TAG_FBPICSQUARE,
	TAG_FBHIDEGUESTLIST,
	// 140
	// . do not parse this up into words!! it is text in <script> tags
	// . consider it a whole tag i guess
	TAG_SCRIPTTEXT = 140,
	// 141 - 142
	TAG_BUTTON,
	TAG_URLFROM, // for ahrefs.com
	// 143: support sitemap.xml
	TAG_LOC,
	//
	// fake tags below here
	//
	// 144: a fake tag used by Sections.cpp
	TAG_SENTENCE,
	// 145: one past the last tag id
	LAST_TAG
};
#endif