// open-source-search-engine/XmlNode.h

#ifndef _XMLNODE_H_
#define _XMLNODE_H_

#include "gb-include.h"
// . an xml node can be text or tag (html or xml tag)

// . numeric id of a node type
// . XmlNode treats 0 as a text node, 1 as a generic xml tag and anything
//   above 1 as a tag that has its own id (see the TAG_* enum in this header)
typedef int16_t nodeid_t;

// . get how many xml/html tags we have classified in our g_nodes[] array
// . used by Weights.cpp
int32_t getNumXmlNodes ( ) ;
// . NOTE(review): defined outside this header; presumably reports whether
//   tags of type "tagId" are line-breaking -- confirm in XmlNode.cpp
bool isBreakingTagId ( nodeid_t tagId ) ;
// . NOTE(review): defined outside this header; presumably reports whether
//   tags of type "tagId" have a closing (back) tag -- confirm in XmlNode.cpp
bool hasBackTag ( nodeid_t tagId ) ;
// . NOTE(review): defined outside this header; presumably returns the length
//   of the tag that starts at "node" -- confirm in XmlNode.cpp
int32_t getTagLen ( char *node ) ;
// . does "s" start a tag? defined inline further down in this header
bool isTagStart ( char *s );//, int32_t i, int32_t version ) ;
// s points to tag name - first char
// . "retp" is optional (defaults to NULL)
// . NOTE(review): presumably "*retp" receives the matching NodeType entry
//   when "retp" is non-NULL -- confirm in XmlNode.cpp
nodeid_t getTagId ( char *s , class NodeType **retp = NULL ); 

// . one node of a parsed xml/html document: either a run of text or a tag
// . used by the Xml class, which contains an array of XmlNodes and fills
//   each one in through set()
class XmlNode {

 public:

	friend class Xml;    // needs to access our private parts ;)
	friend class XmlDoc; // needs to access our private parts ;)

	// . node classification, all derived from m_nodeId
	// . 0 is a text node, 1 is a generic xml tag and anything above 1
	//   is a tag that has its own id
	// . these accessors only read members, so they are const and can be
	//   called through a const XmlNode as well
	bool  isText       () const { return m_nodeId == 0; }
	bool  isTag        () const { return m_nodeId >  0; }
	bool  isHtmlTag    () const { return m_nodeId >  1; }
	bool  isXmlTag     () const { return m_nodeId == 1; }
	nodeid_t getNodeId    () const { return m_nodeId; }
	// the hash is only meaningful if this node is a tag
	int64_t getNodeHash() const { return m_hash; }
	// start of the node's data (tag data, or text data if not a tag)
	char *getNode      () const { return m_node; }
	// m_nodeLen is in bytes
	int32_t  getNodeLen   () const { return m_nodeLen; }
	//int32_t  getXmlParent () { return m_xmlParentTagNum; };
	bool  isBreaking   () const { return m_isBreaking; }
	bool  isVisible    () const { return m_isVisible; }
	bool  hasBackTag   () const { return m_hasBackTag; }

	// . is this an opening tag?
	// . exclude meta tags and comment tags (they are not front or back)
	bool  isFrontTag () const {
		// text nodes are not tags at all
		if ( m_nodeId <= 0 ) return false;
		// a '/' right after the first char marks a back tag
		if ( m_node[1] == '/' ) return false;
		// . 68 is TAG_META and 109 is TAG_COMMENT
		// . the tag enum is declared after this class, so the names
		//   are not visible here and the raw values must be kept in
		//   sync with it by hand
		return m_nodeId != 68 && m_nodeId != 109;
	}

	// . get the value of a field like "href" in the <a href="blah"> tag
	char *getFieldValue ( char *fieldName , int32_t *valueLen );

	// . used exclusively by Xml class which contains an array of XmlNodes
	// . "node" points to the beginning of the node, the '<' if it's a tag
	// . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId
	// . returns the length of the node
	// . pureXml is true if node cannot be an html tag, except comment
	//int32_t set ( char *node , bool pureXml );
	int32_t set ( char *node , bool pureXml , int32_t version );

	// private:

	// . called by set() to get the length of a tag node
	//int32_t getTagLen      ( char *node , int32_t version);
	//int32_t getTagLen      ( UChar *node , int32_t version );

	// . called by set() to get the length of a TEXT node (and set it)
	//int32_t setTextNode    ( char *node );

	// . called by set() to get the length of a COMMENT node (and set it)
	int32_t setCommentNode ( char *node );
	//int32_t setCommentNode ( UChar *node );

	// NOTE(review): presumably a variant of setCommentNode(); the
	// difference is not visible from this header
	int32_t setCommentNode2 ( char *node );

	// . called by set() to get the length of a CDATA node (and set it)
	int32_t setCDATANode ( char *node );
	//int32_t setCDATANode ( UChar *node );

	// . called by set() to get nodeId and isBreaking of a tag node
	// . returns the nodeId
	nodeid_t setNodeInfo    ( int64_t  nodeHash );

	char      *m_node;             // tag data, or text data if not a tag
	int32_t       m_nodeLen;          // m_nodeLen is in bytes
	char      *m_tagName;          // iff this node is a tag
	int32_t       m_tagNameLen;
	int64_t  m_hash;             // iff this node is a tag
	//int64_t  m_compoundHash;     // set by Xml class
	//int32_t       m_parentTagNum;     // set by Xml class
	//int32_t       m_xmlParentTagNum;  // set by Xml class
	int16_t      m_depth;            // set by Xml class (xml depth only)
	// 0 for text, 1 for xml tag, 2+ for html (see isHtmlTag() above)
	nodeid_t   m_nodeId;
	char       m_hasBackTag:1;
	char       m_isBreaking:1;     // does tag (if it is) line break?
	char       m_isVisible:1;
	char       m_isSelfLink:1;  // an a href tag link to self?
	int32_t       m_pairTagNum;    // paired opening or closing tag
	// . "m_linkNum" references a link in Links.cpp
	// . use for <a href> xml nodes only right now
	// . used so XmlDoc.cpp::getContactUsLink() works better
	//int32_t       m_linkNum;        
	// NOTE(review): presumably the enclosing node; it is not set anywhere
	// in this header
	class XmlNode *m_parent;
};

// . does "s" start a tag? (regular tag , back tag or comment tag)
// . recognized openings: "<x" , "</x" , "</>" , "<?x" , "<!x" , "<!--" and
//   "<![" where x is any char accepted by is_alnum_a()
// . no length is passed in: chars after the '<' are read directly, but a
//   char is only looked at once every char before it has matched
// . an older check that rejected the fake "<gb" tag (produced when
//   htmlDecode() in fctypes.cpp decodes &lt;) is no longer applied
inline bool isTagStart ( char *s ) {
	// every tag opens with '<'
	if ( s[0] != '<' ) return false;
	// plain front tag like <a ...>
	if ( is_alnum_a ( s[1] ) ) return true;
	// otherwise only 3 chars may follow the '<'
	switch ( s[1] ) {
	case '/':
		// back tag: an alnum or an immediate '>' must come next
		return is_alnum_a ( s[2] ) || s[2] == '>';
	case '?':
		// office.microsoft.com uses <?xml ...?> tags
		// a bare "<?>" is not treated as a tag
		return is_alnum_a ( s[2] );
	case '!':
		// this is for <!xml> i guess
		if ( is_alnum_a ( s[2] ) ) return true;
		// . any "<![" counts, which covers "<![CDATA[" as well as
		//   things like <![if gt IE 6]><script>....
		//   for waterfordcoc.org
		if ( s[2] == '[' ) return true;
		// and the <!-- comment here--> famous comment tag
		return s[2] == '-' && s[3] == '-';
	default:
		return false;
	}
}


// Now set up a structure for describing ALL the available HTML nodes.
// . Each HTML node has a name and a set of flags: does it have a back tag,
//   does it break a word, are its contents visible, is it kept by the
//   strip filters, and is it an xml tag.
// . NOTE(review): the original remark here also mentioned a name length and
//   a format bit (used for extracting titles, summaries, et al.); neither is
//   declared in this class, so they were presumably removed at some point.
// . the visible flag is presumably false for tags like <script> and <option>
//   whose contents are not visible/indexable -- confirm against g_nodes[]
class NodeType {
 public:
	// tag name; this is what getTagName() returns
	char    *m_nodeName;
	// NOTE(review): presumably true if the tag has a closing tag
	bool     m_hasBackTag;
	// NOTE(review): presumably non-zero if the tag breaks a line/word
	char     m_isBreaking;
	// NOTE(review): presumably non-zero if the tag's contents are visible
	char     m_isVisible;
	char     m_filterKeep1; // for &strip=1 option
	char     m_filterKeep2; // for &strip=2 option
	// numeric id of this tag type
	nodeid_t m_nodeId;
	// NOTE(review): presumably non-zero for xml-only (non-html) tags
	char     m_isXmlTag;
};

// table of tag descriptions, defined outside this header; getTagName()
// indexes it by tag id
extern class NodeType g_nodes[];

// . look up the name stored in g_nodes[] for tag id "tagId"
// . "tagId" is used as the array index without any range check
inline char *getTagName ( nodeid_t tagId ) {
	class NodeType *entry = &g_nodes[tagId];
	return entry->m_nodeName;
}

// . each tag has a number
// . enumerators are numbered consecutively starting at 0, so inserting or
//   removing one in the middle shifts the value of every one after it
// . XmlNode::isFrontTag() compares against the raw values 68 (TAG_META) and
//   109 (TAG_COMMENT), so those two must keep their values
// . NOTE(review): getTagName() indexes g_nodes[] by tag id, so presumably
//   the order here has to match the order of that table -- confirm in
//   XmlNode.cpp
// . a trailing "// N" gives the numeric value of the enumerator on that line
enum {
	TAG_TEXTNODE = 0, // a text node, not a tag
	TAG_XMLTAG, // 1, a generic xml tag
	TAG_A,
	TAG_ABBREV,
	TAG_ACRONYM,
	TAG_ADDRESS,
	TAG_APPLET,
	TAG_AREA,
	TAG_AU,
	TAG_AUTHOR,
	TAG_B, // 10
	TAG_BANNER,
	TAG_BASE,
	TAG_BASEFONT,
	TAG_BGSOUND,
	TAG_BIG,
	TAG_BLINK,
	TAG_BLOCKQUOTE,
	TAG_BQ,
	TAG_BODY,
	TAG_BR, // 20
	TAG_CAPTION,
	TAG_CENTER,
	TAG_CITE,
	TAG_CODE,
	TAG_COL,
	TAG_COLGROUP,
	TAG_CREDIT,
	TAG_DEL,
	TAG_DFN,
	TAG_DIR, // 30
	TAG_DIV,
	TAG_DL,
	TAG_DT,
	TAG_DD,
	TAG_EM,
	TAG_EMBED,
	TAG_FIG,
	TAG_FN,
	TAG_FONT,
	TAG_FORM, // 40
	TAG_FRAME,
	TAG_FRAMESET,
	TAG_H1,
	TAG_H2,
	TAG_H3,
	TAG_H4,
	TAG_H5,
	TAG_H6,
	TAG_HEAD,
	TAG_HR, // 50
	TAG_HTML,
	TAG_I,
	TAG_IFRAME,
	TAG_IMG,
	TAG_INPUT,
	TAG_INS,
	TAG_ISINDEX,
	TAG_KBD,
	TAG_LANG,
	TAG_LH, // 60
	TAG_LI,
	TAG_LINK,
	TAG_LISTING,
	TAG_MAP,
	TAG_MARQUEE,
	TAG_MATH,
	TAG_MENU,
	TAG_META, // 68, hard-coded in XmlNode::isFrontTag()
	TAG_MULTICOL,
	TAG_NOBR, // 70
	TAG_NOFRAMES,
	TAG_NOTE,
	TAG_OL,
	TAG_OVERLAY,
	TAG_P,
	TAG_PARAM,
	TAG_PERSON,
	TAG_PLAINTEXT,
	TAG_PRE,
	TAG_Q, // 80
	TAG_RANGE,
	TAG_SAMP,
	TAG_SCRIPT,
	TAG_SELECT,
	TAG_SMALL,
	TAG_SPACER,
	TAG_SPOT,
	TAG_STRIKE,
	TAG_STRONG,
	TAG_SUB, // 90
	TAG_SUP,
	TAG_TAB,
	TAG_TABLE,
	TAG_TBODY,

	TAG_TD,
	TAG_TEXTAREA,
	TAG_TEXTFLOW,
	TAG_TFOOT,
	TAG_TH,
	TAG_THEAD, // 100
	TAG_TITLE,
	TAG_TR,
	TAG_TT,

	TAG_U,
	TAG_UL,
	TAG_VAR,
	TAG_WBR,
	TAG_XMP,
	TAG_COMMENT, // 109, hard-coded in XmlNode::isFrontTag()

	TAG_OPTION, // 110
	TAG_STYLE,
	TAG_DOCTYPE,
	TAG_XML,
	TAG_START,
	TAG_STOP,
	TAG_SPAN,
	TAG_LEGEND,
	TAG_S,

	TAG_ABBR,
	TAG_CDATA, // 120
	TAG_NOSCRIPT,
	TAG_FIELDSET,
	TAG_FBORIGLINK, // "feedburner:origlink" special feedburner link
	TAG_RDF  ,      // rdf:RDF
	TAG_RSS  ,      // rss
	TAG_FEED ,      // atom feed tag

	TAG_ITEM,
	TAG_ENTRY,
	TAG_CHANNEL,
	TAG_ENCLOSURE, // 130
	TAG_WEBLOG,
	// a tag we insert in XmlDoc.cpp to indicate expanded frame/iframe src
	TAG_GBFRAME,
	TAG_TC,
	TAG_GBXMLTITLE,

	// facebook xml tags
	TAG_FBSTARTTIME, // 135
	TAG_FBENDTIME, // 136
	TAG_FBNAME,
	TAG_FBPICSQUARE,
	TAG_FBHIDEGUESTLIST,

	// . do not parse this up into words!! it is text in <script> tags
	// . consider it a whole tag i guess
	TAG_SCRIPTTEXT, // 140
	TAG_BUTTON,
	TAG_URLFROM, // for ahrefs.com

	// support sitemap.xml
	TAG_LOC,

	//
	// fake tags below here
	//
	// a fake tag used by Sections.cpp
	TAG_SENTENCE,

	// 145, one past the last tag id, i.e. the number of ids defined above
	LAST_TAG
};
#endif
Initial file population. 2013-08-03 00:12:24 +04:00			`#ifndef _XMLNODE_H_`
			`#define _XMLNODE_H_`

			`#include "gb-include.h"`
			`// . an xml node can be text or tag (html or xml tag)`

			`typedef int16_t nodeid_t;`

			`// . get how many xml/html tags we have classified in our g_nodes[] array`
			`// . used by Weights.cpp`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t getNumXmlNodes ( ) ;`
Initial file population. 2013-08-03 00:12:24 +04:00			`bool isBreakingTagId ( nodeid_t tagId ) ;`
			`bool hasBackTag ( nodeid_t tagId ) ;`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t getTagLen ( char *node ) ;`
			`bool isTagStart ( char *s );//, int32_t i, int32_t version ) ;`
Initial file population. 2013-08-03 00:12:24 +04:00			`// s points to tag name - first char`
			`nodeid_t getTagId ( char s , class NodeType *retp = NULL );`

			`class XmlNode {`

			`public:`

			`friend class Xml; // needs to access our private parts ;)`
			`friend class XmlDoc; // needs to access our private parts ;)`

			`bool isText () { return m_nodeId == 0; };`
			`bool isTag () { return m_nodeId > 0; };`
			`bool isHtmlTag () { return m_nodeId > 1; };`
			`bool isXmlTag () { return m_nodeId == 1; };`
			`nodeid_t getNodeId () { return m_nodeId; };`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t getNodeHash() { return m_hash; };`
Initial file population. 2013-08-03 00:12:24 +04:00			`char *getNode () { return m_node; };`
			`// m_nodeLen is in bytes`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t getNodeLen () { return m_nodeLen; };`
			`//int32_t getXmlParent () { return m_xmlParentTagNum; };`
Initial file population. 2013-08-03 00:12:24 +04:00			`bool isBreaking () { return m_isBreaking; };`
			`bool isVisible () { return m_isVisible; };`
			`bool hasBackTag () { return m_hasBackTag; };`

			`// exclude meta tags and comment tags (they are not front or back)`
			`bool isFrontTag () {`
			`return m_nodeId > 0 && m_node[1]!='/' &&`
			`m_nodeId != 68 && m_nodeId != 109; };`

			`// . get the value of a field like "href" in the <a href="blah"> tag`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`char getFieldValue ( char fieldName , int32_t *valueLen );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . used exclusively by Xml class which contains an array of XmlNodes`
			`// . "node" points to the beginning of the node, the '<' if it's a tag`
			`// . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId`
			`// . returns the length of the node`
			`// . pureXml is true if node cannot be an html tag, except comment`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t set ( char *node , bool pureXml );`
			`int32_t set ( char *node , bool pureXml , int32_t version );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// private:`

			`// . called by set() to get the length of a tag node`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t getTagLen ( char *node , int32_t version);`
			`//int32_t getTagLen ( UChar *node , int32_t version );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . called by set() to get the length of a TEXT node (and set it)`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t setTextNode ( char *node );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . called by set() to get the length of a COMMENT node (and set it)`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t setCommentNode ( char *node );`
			`//int32_t setCommentNode ( UChar *node );`
Initial file population. 2013-08-03 00:12:24 +04:00
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t setCommentNode2 ( char *node );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . called by set() to get the length of a CDATA node (and set it)`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t setCDATANode ( char *node );`
			`//int32_t setCDATANode ( UChar *node );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . called by set() to get nodeId and isBreaking of a tag node`
			`// . returns the nodeId`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`nodeid_t setNodeInfo ( int64_t nodeHash );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`char *m_node; // tag data, or text data if not a tag`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t m_nodeLen; // m_nodeLen is in bytes`
Initial file population. 2013-08-03 00:12:24 +04:00			`char *m_tagName; // iff this node is a tag`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t m_tagNameLen;`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t m_hash; // iff this node is a tag`
			`//int64_t m_compoundHash; // set by Xml class`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t m_parentTagNum; // set by Xml class`
			`//int32_t m_xmlParentTagNum; // set by Xml class`
			`int16_t m_depth; // set by Xml class (xml depth only)`
Initial file population. 2013-08-03 00:12:24 +04:00			`nodeid_t m_nodeId; // 0 for text,1 for xml tag, 1+ for html`
			`char m_hasBackTag:1;`
			`char m_isBreaking:1; // does tag (if it is) line break?`
			`char m_isVisible:1;`
			`char m_isSelfLink:1; // an a href tag link to self?`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t m_pairTagNum; // paired opening or closing tag`
Initial file population. 2013-08-03 00:12:24 +04:00			`// . "m_linkNum" references a link in Links.cpp`
			`// . use for <a href> xml nodes only right now`
			`// . used so XmlDoc.cpp::getContactUsLink() works better`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t m_linkNum;`
added Xml::getCompoundName() 2014-09-28 19:39:46 +04:00			`class XmlNode *m_parent;`
Initial file population. 2013-08-03 00:12:24 +04:00			`};`

			`// . does "s" start a tag? (regular tag , back tag or comment tag)`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`inline bool isTagStart ( char *s ) { // , int32_t i, int32_t version ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`// it must start with < to be a tag`
			`if ( *s != '<' ) return false;`
			`// a <gb is a fake tag because we now decode all html entites`
			`// so in htmlDecode() in fctypes.cpp we decode < to`
			`// "<gb"`
			`//if ( s[i+1]=='g' && s[i+2]=='b') return false;`
			`// minimal tag is 3 chars`
			`// if ( !s[ii + 2 >= len ) return false;`
			`// next char can be an alnum, !-- or / then alnum`
			`if ( is_alnum_a ( s[1] ) ) return true;`
			`// next char can be 1 of 3 things to be a tag`
			`//switch ( s[1] ) {`
			`// / is also acceptable, followed only by an alnum or >`
			`if ( s[1]== '/' ) {`
			`if ( is_alnum_a(s[2]) ) return true;`
			`if ( s[2] == '>' ) return true;`
			`return false;`
			`}`
			`// office.microsoft.com uses <?xml ...?> tags`
			`if ( s[1]=='?' ) {`
			`if ( is_alnum_a(s[2]) ) return true;`
			`//if ( s[2] == '>' ) return true; <?> is tag???`
			`return false;`
			`}`
			`// make sure the double hyphens follow the ! or alnum`
			`if ( s[1]=='!' ) {`
			`// this is for <!xml> i guess`
			`if ( is_alnum_a(s[2]) ) return true;`
			`// and the <![CDATA[`
			`if ( s[2]=='[' && s[3]=='C' && s[4]=='D' &&`
			`s[5]=='A' && s[6]=='T' && s[7]=='A' &&`
			`s[8]=='[' ) return true;`
			`// and the <!-- comment here--> famous comment tag`
			`if ( s[2]=='-' && s[3]=='-' ) return true;`
			`// and <![....]> i've seen too`
			`// <![if gt IE 6]><script>.... for waterfordcoc.org`
			`if ( s[2] == '[' ) return true;`
			`}`
			`return false;`
			`};`


			`// Now set up a structure for describing ALL the available HTML nodes.`
			`// . Each HTML node has a name, name length, does it break a word?`
			`// a format bit. (most HTML tags have 0 for their format bit`
			`// because we really don't care about what they do -- we use format`
			`// bits for extracting title, summaries, et al.`
			`// . the is indexable is false for tags like <script> <option> whose contents`
			`// are not visible/indexable`
fixes to make easier to compile on max os x. 2014-08-28 23:55:02 +04:00			`class NodeType {`
			`public:`
Initial file population. 2013-08-03 00:12:24 +04:00			`char *m_nodeName;`
			`bool m_hasBackTag;`
			`char m_isBreaking;`
			`char m_isVisible;`
			`char m_filterKeep1; // for &strip=1 option`
			`char m_filterKeep2; // for &strip=2 option`
			`nodeid_t m_nodeId;`
			`char m_isXmlTag;`
			`};`

			`extern class NodeType g_nodes[];`

			`inline char *getTagName ( nodeid_t tagId ) {return g_nodes[tagId].m_nodeName;};`

			`// . each tag has a number`
			`enum {`
			`TAG_TEXTNODE = 0,`
			`TAG_XMLTAG,`
			`TAG_A,`
			`TAG_ABBREV,`
			`TAG_ACRONYM,`
			`TAG_ADDRESS,`
			`TAG_APPLET,`
			`TAG_AREA,`
			`TAG_AU,`
			`TAG_AUTHOR,`
			`TAG_B, // 10`
			`TAG_BANNER,`
			`TAG_BASE,`
			`TAG_BASEFONT,`
			`TAG_BGSOUND,`
			`TAG_BIG,`
			`TAG_BLINK,`
			`TAG_BLOCKQUOTE,`
			`TAG_BQ,`
			`TAG_BODY,`
			`TAG_BR, // 20`
			`TAG_CAPTION,`
			`TAG_CENTER,`
			`TAG_CITE,`
			`TAG_CODE,`
			`TAG_COL,`
			`TAG_COLGROUP,`
			`TAG_CREDIT,`
			`TAG_DEL,`
			`TAG_DFN,`
			`TAG_DIR, // 30`
			`TAG_DIV,`
			`TAG_DL,`
			`TAG_DT,`
			`TAG_DD,`
			`TAG_EM,`
			`TAG_EMBED,`
			`TAG_FIG,`
			`TAG_FN,`
			`TAG_FONT,`
			`TAG_FORM, // 40`
			`TAG_FRAME,`
			`TAG_FRAMESET,`
			`TAG_H1,`
			`TAG_H2,`
			`TAG_H3,`
			`TAG_H4,`
			`TAG_H5,`
			`TAG_H6,`
			`TAG_HEAD,`
			`TAG_HR, // 50`
			`TAG_HTML,`
			`TAG_I,`
			`TAG_IFRAME,`
			`TAG_IMG,`
			`TAG_INPUT,`
			`TAG_INS,`
			`TAG_ISINDEX,`
			`TAG_KBD,`
			`TAG_LANG,`
			`TAG_LH, // 60`
			`TAG_LI,`
			`TAG_LINK,`
			`TAG_LISTING,`
			`TAG_MAP,`
			`TAG_MARQUEE,`
			`TAG_MATH,`
			`TAG_MENU,`
			`TAG_META,`
			`TAG_MULTICOL,`
			`TAG_NOBR, // 70`
			`TAG_NOFRAMES,`
			`TAG_NOTE,`
			`TAG_OL,`
			`TAG_OVERLAY,`
			`TAG_P,`
			`TAG_PARAM,`
			`TAG_PERSON,`
			`TAG_PLAINTEXT,`
			`TAG_PRE,`
			`TAG_Q, // 80`
			`TAG_RANGE,`
			`TAG_SAMP,`
			`TAG_SCRIPT,`
			`TAG_SELECT,`
			`TAG_SMALL,`
			`TAG_SPACER,`
			`TAG_SPOT,`
			`TAG_STRIKE,`
			`TAG_STRONG,`
			`TAG_SUB, // 90`
			`TAG_SUP,`
			`TAG_TAB,`
			`TAG_TABLE,`
			`TAG_TBODY,`
fix <script> tags that immediately end in </script> or never end but hit another <script> or a </gbiframe> tag. 2014-07-15 04:24:20 +04:00
Initial file population. 2013-08-03 00:12:24 +04:00			`TAG_TD,`
			`TAG_TEXTAREA,`
			`TAG_TEXTFLOW,`
			`TAG_TFOOT,`
			`TAG_TH,`
			`TAG_THEAD, // 100`
			`TAG_TITLE,`
			`TAG_TR,`
			`TAG_TT,`
fix <script> tags that immediately end in </script> or never end but hit another <script> or a </gbiframe> tag. 2014-07-15 04:24:20 +04:00
Initial file population. 2013-08-03 00:12:24 +04:00			`TAG_U,`
			`TAG_UL,`
			`TAG_VAR,`
			`TAG_WBR,`
			`TAG_XMP,`
			`TAG_COMMENT,`
fix <script> tags that immediately end in </script> or never end but hit another <script> or a </gbiframe> tag. 2014-07-15 04:24:20 +04:00
Initial file population. 2013-08-03 00:12:24 +04:00			`TAG_OPTION, // 110`
			`TAG_STYLE,`
			`TAG_DOCTYPE,`
			`TAG_XML,`
			`TAG_START,`
			`TAG_STOP,`
			`TAG_SPAN,`
			`TAG_LEGEND,`
			`TAG_S,`
fix <script> tags that immediately end in </script> or never end but hit another <script> or a </gbiframe> tag. 2014-07-15 04:24:20 +04:00
Initial file population. 2013-08-03 00:12:24 +04:00			`TAG_ABBR,`
			`TAG_CDATA, // 120`
			`TAG_NOSCRIPT,`
			`TAG_FIELDSET,`
			`TAG_FBORIGLINK, // "feedburner:origlink" special feedburner link`
			`TAG_RDF , // rdf:RDF`
			`TAG_RSS , // rss`
			`TAG_FEED , // atom feed tag`

			`TAG_ITEM,`
			`TAG_ENTRY,`
			`TAG_CHANNEL,`
			`TAG_ENCLOSURE,`
			`TAG_WEBLOG,`
			`// a tag we insert in XmlDoc.cpp to indicate expanded frame/iframe src`
			`TAG_GBFRAME,`
			`TAG_TC,`
			`TAG_GBXMLTITLE,`

			`// facebook xml tags`
			`TAG_FBSTARTTIME, // 135`
			`TAG_FBENDTIME, // 136`
			`TAG_FBNAME,`
			`TAG_FBPICSQUARE,`
			`TAG_FBHIDEGUESTLIST,`

			`// . do not parse this up into words!! it is text in <script> tags`
			`// . consider it a whole tag i guess`
			`TAG_SCRIPTTEXT,`
fix <script> tags that immediately end in </script> or never end but hit another <script> or a </gbiframe> tag. 2014-07-15 04:24:20 +04:00			`TAG_BUTTON,`
			`TAG_URLFROM, // for ahrefs.com`

sitemap.xml support for harvesting loc urls. parse xml docs as pure xml again but set nodeid to TAG_LINK etc. so Linkdb.cpp can get links again. added isparentsitemap url filter to prioritize urls from sitemaps. added isrssext to url filters to prioritize new possible rss feed urls. added numinlinks to url filters to prioritize popular urls for spidering. use those filters in default web filter set. fix filters that delete urls from the index using the 'DELETE' priority. they weren't getting deleted. 2015-03-17 23:26:16 +03:00			`// support sitemap.xml`
			`TAG_LOC,`

fix <script> tags that immediately end in </script> or never end but hit another <script> or a </gbiframe> tag. 2014-07-15 04:24:20 +04:00			`//`
			`// fake tags below here`
			`//`
Initial file population. 2013-08-03 00:12:24 +04:00			`// a fake tag used by Sections.cpp`
			`TAG_SENTENCE,`

			`LAST_TAG`
			`};`
			`#endif`