// Matt Wells, copyright Jul 201 // . the record retrieved from tagdb // . used for describing a site // . can parse out record from our rdb or from a network msg // . has siteUrl and filenum of the file that holds the Xml that has the // parsing rules and quotas for docs in that site // . we have the fields you can use at the bottom of this file #ifndef _CATREC_H_ #define _CATREC_H_ #include "Conf.h" #include "Xml.h" #include "RdbList.h" #include "Tagdb.h" #include "Categories.h" #include "Lang.h" #include "Tagdb.h" #include "Catdb.h" #define MAX_IND_CATIDS 1024 #define MAX_SITE_TYPES 12 // url, catids, indirect catids, numCatids, numIndCatids, filenum #define CATREC_BUF_SIZE MAX_URL_LEN + MAX_CATIDS*4 + 9 class CatRec { public: // these just set m_xml to NULL void reset() ; CatRec(); ~CatRec(); // . extract the site url for "url" // . extract the filenum of the file that holds the xml we want // . returns false and sets errno on error setting // . if rec is NULL we use the default rec for this collection bool set ( Url *url, char *data,long dataSize, bool gotByIp ); // , char rdbId = RDB_TAGDB ); // we're empty if m_xml is NULL //bool isEmpty() { return (! m_xml); }; // . used to by Msg9 to make a CatRec to add // . serializes filenum/site into our m_data/m_dataSize // . returns false and sets errno on error /* bool set ( Url *site , char *coll , long collLen , long filenum , char version , char rdbId = RDB_TAGDB , long timeStamp = 0, char *comment = NULL, char *username = NULL, long *catids = NULL, unsigned char numCatids = 0, unsigned char spamBits = 0, char siteQuality = 0, char adultLevel = 0, SiteType *siteTypes = NULL, uint8_t numTypes = 0, SiteType *langs = NULL, uint8_t numLangs = 0); */ bool set ( Url *site , long filenum , long *catids = NULL, unsigned char numCatids = 0 ); //Xml *getXml() { return m_xml; }; //bool set ( long filenum ) ; // . this method just sets the filenum, version, url and url-len from // data-pointer "data" // . 
this method is written as an alternative to the above set methods // Useful if the caller is interested just in the url and url len // saves time bool set (char *data, long dataSize);//, char rdbId ); // set the indirect catids void setIndirectCatids ( long *indCatids, long numIndCatids ); // . did this url have an entry in tagdb? // . we need this to know because if it didn't it will have default rec // . Msg16 will override Url::isSpam() if this record is not default // . Msg25 will also not bother checking for link bans via Msg18 bool hadRec() { return m_hadRec; }; // . did we get it by ip? (if not, we got it by canonical domain name) // . if we got it by IP and it was banned, admin has the option to // tell gigablast to automatically add the domain name as banned // to tagdb in Msg14.cpp bool gotByIp() { return m_gotByIp; }; // get the record itself (just templateNum/site/coll) char *getData ( ) { return m_data; }; long getDataSize ( ) { return m_dataSize; }; // along with coll/collLen identifies a unique xml file //long getFilenum ( ) { return m_filenum; }; //long getRuleset ( ) { return m_filenum; }; // . these should both be NULL terminated // . they both reference into the data contained in m_list // or m_buf if the list doesn't have a site record for us Url *getSite ( ) { return &m_site; }; //char *getCollection ( ) { return m_coll; }; //long getCollectionLen ( ) { return m_collLen; }; /* char* printFormattedRec(char* p); void printFormattedRec(SafeBuf *sb); char* printXmlRec (char* p); void printXmlRec ( SafeBuf *sb ); //status of manually set bits. 
bool isSpamUnknown() { return m_spamBits == SPAM_UNKNOWN; } bool isSpam() { return m_spamBits == SPAM_BIT; } bool isNotSpam() { return m_spamBits == NOT_SPAM; } char* getSpamStr(); unsigned char getSpamStatus() { return m_spamBits; } // bool isRatingUnknown() { return m_adultLevel == NOT_RATED; } bool isAdultButNotPorn() { return m_adultLevel == RATED_R; } bool isPorn() { return m_adultLevel == RATED_X; } bool isKidSafe() { return m_adultLevel == RATED_G; } char* getAdultStr(); char *getPubDateFmtStr(); long getTimeStamp() { return m_timeStamp; } char *getComment() { return m_comment; } char *getUsername() { return m_username; } char getSiteQuality() { return m_siteQuality; } long getNumSiteTypes () { return m_numTypes; } long getNumSiteLangs () { return m_numLangs; } SiteType *getSiteTypes () { return m_siteTypes; } SiteType *getSiteLangs () { return m_siteLangs; } uint32_t getScoreForType(uint8_t type); // . mod functions // . pain in the butt cuz we gotta change m_data/m_dataSize buffer too void addSiteType (uint8_t type, uint32_t score ) ; void setFilenum (long newFilenum ); // . [n0,n1] constitute an xml node range in "xml" // . "len" is the length of another node's data in another xml doc // . gets the scoreWeight from docQuality and a node's dataLen // . 
2nd one gets the maxScore from docQuality long getScoreWeightFromQuality ( long n0, long n1, long quality ); long getScoreWeightFromQuality2( long quality ); long getMaxScoreFromQuality ( long n0, long n1, long quality ); long getMaxLenFromQuality ( long n0, long n1, long quality ); //bool hasMaxCountFromQualityTag ( long n0, long n1 ) ; //long getMaxCountFromQuality ( long n0, long n1, long quality ) ; long getScoreWeightFromLen ( long n0, long n1, long len ); long getScoreWeightFromLen2 ( long len ); long getScoreWeightFromNumWords( long n0, long n1, long len ); long getMaxScoreFromLen ( long n0, long n1, long quality ); long getMaxScoreFromNumWords ( long n0, long n1, long quality ); // 2 new maps for boosting base quality from link statistics long getQualityBoostFromNumLinks ( long numLinks ); long getQualityBoostFromLinkQualitySum ( long linkBaseQualitySum ); // 2 new maps for maxScore/scoreWeight of outgoing linkText long getLinkTextScoreWeightFromLinkerQuality ( long quality ); long getLinkTextScoreWeightFromLinkeeQuality ( long quality ); long getLinkTextMaxScoreFromQuality ( long quality ); long getLinkTextScoreWeightFromNumWords( long numWords ); // . another new map for boosting quality from the link-adjusted // quality of our root page // . root page is just our site url (i.e. http://about.com/) // . "rootQuality" is link-adjusted long getQualityBoostFromRootQuality ( long rootQuality ) ; long getQuotaBoostFromRootQuality ( long rootQuality ) ; long getQuotaBoostFromQuality ( long quality ) ; // if X% of the words are spammed, consider ALL the words to be spammed long getMaxPercentForSpamFromQuality ( long quality ) ; //private: // . parses and accesses a map/graph in the xml for us // . returns default "def" if map not present or x's in map unordered long getY (long n0,long n1,long X,char *strx,char *stry,long def) ; */ // these reference into m_data??? 
Url m_site; //char m_coll[64]; //long m_collLen; // filenum determines the xml uniquely long m_filenum; // did this rec have it's own entry in tagdb? bool m_hadRec; // did we get it by ip? (if not, we got it by canonical domain name) bool m_gotByIp; /* // . the xml describing this site // . references into an Xml stored in Sitedb class Xml *m_xml; */ // a buffer for holding the little site record itself char m_data[CATREC_BUF_SIZE]; long m_dataSize; // category ID info for catdb unsigned char m_numCatids; long *m_catids; long m_numIndCatids; long m_indCatids[MAX_IND_CATIDS]; // version unsigned char m_version; /* unsigned char m_spamBits; unsigned char m_adultLevel; char m_siteQuality; uint8_t m_numTypes; uint8_t m_numLangs; SiteType m_siteTypes[MAX_SITE_TYPES]; SiteType m_siteLangs[MAX_SITE_TYPES]; */ // url pointer char *m_url; long m_urlLen; /* // time stamp, comment, username long m_timeStamp; char *m_comment; char *m_username; // hack for addSiteType() long *m_incHere; char *m_addHere ; // hack for changeFilenum() char *m_filenumPtr; */ }; #endif // format of a template or default record in xml: // ## NOTE: the key of the record is the sitename prefixed with the collection: // ## NOTE: "collectionName:" is prefixed to all hashed terms before hashing // ## LATER: do permission system // ## all indexed terms will be preceeded by "collection:" when indexed so you // ## can do a search within that collection. // %s // ## %s (stored as a long) // %s (text, html?) // %s (used iff allowAllExtensions is false) // ## the base quality of all docs from this site // %c (0-100%,default 30,qual of docs in site) // ## the computed link-adjusted quality should not exceed this // %c (0-100%, def 100) // ## should we treat incoming link text as if it were on our page? // ## score weights and maxes for the link text is determined by the linker's // ## own link-adjusted quality. 
// ## (see graphs/maps below)
// %b (0-100, default = 100, a %)
// ## do links from this site always point to clean pages?
// %b (default no)
// ## a doc w/ link-adjusted quality LESS THAN this will not be indexed
// %c (default 0% )
// ## a doc w/ link-adjusted quality at or below this will be checked for
// ## adult content.
// %c (default 0%, 0 means none)
// ## how often do we re-spider it?
// ## we try to compute the best spider rate based on last modified times
// %i (default 60*60*24*30=1month, in seconds)
// %i (default 60*60*24*30=1month, in seconds)
// %b (default true)
// %li (0-7, default -1) -1 means parentPriority-1
// %li (0-7, default 7)
// ## these are fairly self-explanatory
// %i (default 0, 0 means none)
// %i (default 6 )
// %b (default no )
// %b (default no )
// %b (default no )
// %b (default no )
// %b (default yes)
// %b (default yes)
// %b (default yes)
// %b (default yes)
// %b (default yes)
// %b (default yes) from cache/titledb
// %b (default yes)
// %b (default yes) site: terms
// %b (default yes) subsite: terms
// %b (default yes) url: terms
// %b (default yes) suburl: terms
// %b (default yes) ip: terms
// %b (default yes) link:/href: terms
// %ul (default -1 = no max)
// ## we don't have a security system... yet...
// ## TODO: %ul (default 1024*1024)
// ## TODO: %s (256bit seal for maxScore tag above)
// ## Now for some maps/graphs.
// ## we list the 5 X components followed by the 5 Y components.
// ## all maps/graphs linearly interpolate between the points.
// ## the edge pieces are horizontal.
// ## these maps can have up to 32 points but i typically just use 5.
// ## we map the NUMBER of incoming links to a baseQuality BOOST for our doc.
// ## the resulting new quality is the link-adjusted quality of the linkee doc.
// ## These boosts are ADDED to the existing quality.
// %i (default 0 )
// %i (default 5 )
// %i (default 10 )
// %i (default 20 )
// %i (default 50 )
// %i (default 0% )
// %i (default 5% )
// %i (default 10% )
// %i (default 15% )
// %i (default 20% )
// ## we map the SUM of the baseQuality of all linkers to a baseQuality BOOST.
// ## the resulting new quality is the link-adjusted quality of the linkee doc.
// ## we only add up BASE quality of the linkers.
// ## we only add up 1 linker's BASE quality per site.
// ## These boosts are ADDED to the existing quality.
// %i (default 0 )
// %i (default 50 )
// %i (default 100 )
// %i (default 150 )
// %i (default 200 )
// %i (default 0% )
// %i (default 5% )
// %i (default 10% )
// %i (default 15% )
// %i (default 20% )
// ## we map the LINK-ADJUSTED QUALITY of our root page (site url) to a
// ## quality BOOST for us.
// ## the site url is just our site, could be like http://about.com/
// ## These boosts are ADDED to the existing quality.
// %i (default 0 )
// %i (default 50 )
// %i (default 100 )
// %i (default 200 )
// %i (default 500 )
// %i (default 0% )
// %i (default 5% )
// %i (default 10% )
// %i (default 15% )
// %i (default 20% )
// ## TODO: make based on quality of doc and length of link text!!
// ## currently we limit link text to up to 256 chars in LinkInfo.cpp.
// ## map doc's link-adjusted quality to scoreWeight of its outgoing link text
// %i (default 0% )
// %i (default 30% )
// %i (default 50% )
// %i (default 70% )
// %i (default 85% )
// %i (default 50% )
// %i (default 100% )
// %i (default 130% )
// %i (default 180% )
// %i (default 250% )
// ## map doc's link-adjusted quality to maxScore of its outgoing link text.
// ## maxScore applies to all docs from this site so as to limit a site's impact.
// %i (default
// %i
// %i
// %i
// %i
// %i
// %i
// %i
// %i
// %i
// ## we map the LINK-ADJUSTED QUALITY of our ROOT page (site url) to a quota
// ## boost.
// ## (can be negative)
// ## the site url is just our site, could be like http://about.com/
// ## These boosts are MULTIPLIED by the existing quota.
// %i (default 0 )
// %i (default 50 )
// %i (default 100 )
// %i (default 200 )
// %i (default 500 )
// %i (default 0% )
// %i (default 0% )
// %i (default 0% )
// %i (default 0% )
// %i (default 0% )
// ## we map the LINK-ADJUSTED QUALITY of our page (site url) to a quota
// ## boost. (can be negative)
// ## the site url is just our site, could be like http://about.com/
// ## These boosts are MULTIPLIED by the existing quota.
// %i (default 0 )
// %i (default 50 )
// %i (default 100 )
// %i (default 200 )
// %i (default 500 )
// %i (default 0% )
// %i (default 0% )
// %i (default 0% )
// %i (default 0% )
// %i (default 0% )
// ## the node describes parsing/indexing rtu
// ## used for xhtml tags (title, meta summary/keywords/description)
// ## NOTE: defines a point on the #words-to-score function
// ## NOTE: omit to index whole body (excludes meta tags and xml tags)
// ## NOTE: set to "meta.summary" for indexing meta tag summary
// ## NOTE: set to "meta.keywords" for indexing meta tag keywords
// ## NOTE: set to "meta.description" for indexing meta tag description
// ## NOTE: set to "Xml" for indexing ALL xml tags
// ## NOTE: set to ??? for indexing text under that tag ...
//
// %s ("title","meta.summary","Xml","W")
// %s (for mapping pure xml tags)
// %s (like "title", "myTag:" -can omit)
// %c (default 0, 0 means none)
// %ul (0-255, default 0 ) do not index
// %ul (0-inf, default 0 )
// %ul (0-inf, default inf)
// %ul (0-inf, default inf)
// %b (default no) (ex.: no for title)
// %b (default no ) index checksum?
// %b (default yes)
// %b (default no ) hash word iff unique
// %b (default yes)
// %b (default yes)
// %b (default no ) hash a checksum
// %b (default yes)
// %b (default yes)
//
// ## Map doc's (link-adjusted) quality to a maxLen for this field.
// ## 30% quality is probably average.
// ## NOTE: there really are no defaults for these, use tagdb default rec. // %c (default 15% ) // %c (default 30% ) // %c (default 45% ) // %c (default 60% ) // %c (default 80% ) // %ul (default 80k ) // %ul (default 100k) // %ul (default 150k) // %ul (default 200k) // %ul (default 250k) // // ## Map doc's (link-adjusted) quality to a maxScore for this field. // %c (default 15% ) // %c (default 30% ) // %c (default 45% ) // %c (default 60% ) // %c (default 80% ) // %ul (default 30% ) // %ul (default 45% ) // %ul (default 60% ) // %ul (default 80% ) // %ul (default 100%) // // ## map doc (link-adjusted) quality to a scoreWeight for this field // %c (default 15% ) // %c (default 30% ) // %c (default 45% ) // %c (default 60% ) // %c (default 80% ) // %ul (default 60% ) // %ul (default 100%) // %ul (default 150%) // %ul (default 200%) // %ul (default 250%) // // ## map field length to a scoreWeight for this field // %ul (default 100) #w<100 -->wght=300 // %ul (default 500) score in[200,300] // %ul (default 1000) // %ul (default 2000) // %ul (default 5000) if under/over 5000 // %ul (default 300%) // %ul (default 200%) // %ul (default 150%) // %ul (default 100%) // %ul (default 50%) // // ## map field length to a maxScore for this field // %ul (default 100) #w<100 -->wght=300 // %ul (default 500) score in[200,300] // %ul (default 1000) // %ul (default 2000) // %ul (default 5000) if under/over 5000 // %ul (default 30% ) // %ul (default 45% ) // %ul (default 60% ) // %ul (default 80% ) // %ul (default 100%) // // // TODO: // , , ... for pure xml tags w/ special meaning //