#ifndef _XML_H_ #define _XML_H_ // . this is used for parsing tagdb records // . used for pasrsing tagdb records, conf file, and html/xml documents // . NOTE: ALL tags are case INsensitive so equals #include "XmlNode.h" #include "Lang.h" class Xml { public: Xml () ; // . should free m_xml if m_copy is true ~Xml () ; // do we have any xml in here? bool isEmpty () { return (m_xml == NULL); }; // . set this xml class from a string // . should be called before calling anything else // . if "copy" makes a copy of the string "s" and references into that // . s must be NULL terminated // . if it's pure xml then set pureXml to true otherwise we assume it // is html or xhtml bool set(char *s , int32_t slen , bool ownData , int32_t allocSize, //=0, bool pureXml, // =false ); int32_t version , bool setParents , // = true, int32_t niceness , // = 0 char contentType ); void reset ( ); char *getContent () { return m_xml; }; char *getContentEnd () { return m_xml + m_xmlLen; }; int32_t getContentLen () { return m_xmlLen; }; int32_t getNumNodes ( ) { return m_numNodes; }; int32_t getVersion ( ) { return m_version; }; // . tagName is compound for xml tags, simple for html tags // . xml compound tag name example = "myhouse.bedroom.nightstand" // . html simple tag name example = "title" or "table" // . obsolete compound name = myhouse[0].bedroom[2].nightstand[1] // . returns -1 if not found // . only searches nodes in [n0,n1] node range int32_t getNodeNum ( int32_t n0, int32_t n1, char *tagName, int32_t tagNameLen) const; // some wrappers for it int32_t getNodeNum ( char *tagName ) { if ( ! tagName ) { char *xx=NULL;*xx=0; } return getNodeNum ( 0,m_numNodes,tagName,strlen(tagName)); }; int32_t getNodeNum ( char *tagName , int32_t tagNameLen ) { return getNodeNum ( 0,m_numNodes,tagName,tagNameLen); }; int32_t findNodeNum ( char *nodeText); int32_t getPingServerCount ( ) ; // . get the back tag node for a given node // . return the last node considered to be a kid or part of node n // . used by XmlDoc to parse out tags and find // int32_t getEndNode ( int32_t n ); bool isTag ( int32_t n ) {return m_nodes[n].isTag(); }; bool isBreakingTag ( int32_t n ) {return m_nodes[n].m_isBreaking;}; bool isBackTag ( int32_t n ) {return m_nodes[n].m_node[1]=='/';}; bool isXmlTag ( int32_t n ) {return m_nodes[n].m_nodeId == 1;}; char *getNode ( int32_t n ) {return m_nodes[n].m_node; }; int32_t getNodeLen ( int32_t n ) {return m_nodes[n].m_nodeLen;}; nodeid_t getNodeId ( int32_t n ) {return m_nodes[n].m_nodeId;}; int64_t getNodeHash ( int32_t n ) {return m_nodes[n].m_hash;}; bool isVisible ( int32_t n ) {return m_nodes[n].m_isVisible;}; // get all nodes! XmlNode *getNodes ( ) { return m_nodes; }; XmlNode *getNodePtr ( int32_t n ) { return &m_nodes[n]; }; // . store the the full xml compound name of a tag into "buf" (w/ \0) // . return the length stored // . ie. "xml.country.state.city" // . fullTag option returns the entire node text // . ie. "... //int32_t getCompoundName ( int32_t n , char *buf , int32_t bufMaxLen, // bool fullTag = false ) ; // get like compound name like "node1.node2.node3\0" bool getCompoundName ( int32_t node , class SafeBuf *sb ) ; // . used for parsing xml conf files // . used for getting the title in an html doc, etc. // . gets the value of the text field immediately following the tag // . "tagName" is always compound // . only searches nodes in [n0,n1] node range bool getBool ( int32_t n0 , int32_t n1 , char *tagName , bool defaultBool = 0 ); int32_t getLong ( int32_t n0 , int32_t n1 , char *tagName , int32_t defaultLong = 0 ); int64_t getLongLong ( int32_t n0 , int32_t n1 , char *tagName , int64_t defaultLongLong = 0LL ); float getFloat ( int32_t n0 , int32_t n1 , char *tagName , float defaultFloat = 0.0 ); char *getString ( int32_t n0 , int32_t n1 , char *tagName , int32_t *len , bool skipLeadingSpaces = true ) const; // for parsing facebook replies: char *getNode ( char *tagName , int32_t *len ) ; // like above routines but we search all nodes bool getBool ( char *tagName, bool defaultBool = false ) { return getBool(0,m_numNodes,tagName,defaultBool); } int32_t getLong ( char *tagName, int32_t defaultLong = 0 ) { return getLong(0,m_numNodes,tagName,defaultLong); } int64_t getLongLong (char *tagName, int64_t defaultLongLong = 0LL){ return getLongLong(0,m_numNodes,tagName,defaultLongLong); } float getFloat ( char *tagName, float defaultFloat = 0.0 ) { return getFloat(0,m_numNodes,tagName,defaultFloat); } char *getString ( char *tagName , int32_t *len , bool skipLeadingSpaces = true ) const { return getString(0,m_numNodes,tagName,len,skipLeadingSpaces); } // . used for getting links in the tag // . used for getting data from meta tags //bool getBool ( int32_t node, char *field, bool defaultBool ); //int32_t getLong ( int32_t node, char *field, int32_t defaultLong ); char *getString ( int32_t node, char *field, int32_t *valueLen ) { if ( node >= m_numNodes ) { char *xx=NULL;*xx=0; } return m_nodes[node].getFieldValue ( field , valueLen);} // called by getTextForXmlTag() below char *getString ( int32_t node , bool skipLeadingSpaces, int32_t *len )const; // . like getText() below but gets the content from a meta tag // . stores it in "buf" and NULL terminates it // . returns the length // . field can be stuff like "summary","description","keywords",... // . use "http-equiv" for "name" for meta redirect tags // . if "convertHtmlEntites" is true we change < to < and > to > int32_t getMetaContent ( char *buf , int32_t bufLen , char *field , int32_t fieldLen , char *name = "name" , bool convertHtmlEntities = false , int32_t startNode = 0 , int32_t *matchedNode = NULL ) ; // just get a pointer to it char *getMetaContentPointer ( char *field , int32_t fieldLen , char *name = "name" , int32_t *len = NULL ); // . filters out tags (uses html entities) and stores in "buf" // . replaces "line breaking" html tags with 2 returns // . only get text of nodes in [node1,node2] // . returns # chars written to buf // . buf is NULL terminated // . bufMaxSize is usually at least getContentLen() + 1 (m_xmlLen+1) // . maxDepth is RELATIVE to node # nodeNumber's depth // . if "filter" then convert html entities and \r's to spaces // . get kid text of node #"nodeNumber" unless it's -1 // . if "filterSpaces" then don't allow back to back spaces or \n's // and replace tags with ".." not \n (but no back to back ..'s) int32_t getText ( char *buf , int32_t bufMaxSize , int32_t node1 = 0 , int32_t node2 = 999999, bool includeTags = false , bool visibleTextOnly = true , bool filter = false , bool filterSpaces = false , bool useStopIndexTag = false ); // do they have a possible search box for gigablast on their page? // if url and urlLen are non-null they will be filled in with the // target url bool hasGigablastForm(char **url = NULL, int32_t *urlLen = NULL); unsigned char getLanguage() { return langUnknown; } int32_t isRSSFeed ( ); //bool isAtomFeed ( ); //bool isRDFFeed ( ); //int32_t getRSSPublishDate ( int32_t niceness ); char *getRSSTitle ( int32_t *titleLen , bool *isHtmlEncoded ) const; char *getRSSDescription ( int32_t *titleLen , bool *isHtmlEncoded ); // get the link to the RSS feed for this page if it has one char *getRSSPointer ( int32_t *length, int32_t startNode = 0, int32_t *matchNode = NULL ); // get the link from this RSS item to the article char *getItemLink ( int32_t *length ); int32_t getMemUsed() { return m_allocSize + m_maxNumNodes*sizeof(XmlNode); }; // private: // . used by getValueAsBool/Long/String() // . tagName is compound for xml tags, simple for html tags char *getTextForXmlTag ( int32_t n0, int32_t n1, char *tagName, int32_t *len , bool skipLeadingSpaces ) const; // used because "s" may have words separated by periods int64_t getCompoundHash ( char *s , int32_t len ) const; // . set the m_parentNum of each XmlNode in our m_nodes array // . TODO: do it on demand? void setParents (); // . must be called after setParents() // . sets the m_compoundHash member variable in each XmlNode in m_nodes void setCompoundHashes(); // . for mapping one xml tag to another // . we map single and compound tags // . TODO: implement this // class Xml *map; XmlNode *m_nodes; int32_t m_numNodes; int32_t m_maxNumNodes; bool m_pureXml; char *m_xml; int32_t m_xmlLen; // If this is a unicode buffer, then m_xml is encoded in UTF-16 // m_xmlLen is still the size of the buffer IN BYTES int32_t m_version; int32_t m_niceness; // if we own the data we free m_xml on reset or destruction bool m_ownData; int32_t m_allocSize; // size of buffer, if we allocated it }; #endif