// Gigablast, Inc. Copyright April 2007

// Linkdb - stores link information

// . format of a 28-byte key in linkdb
// . used by Msg25::getPageLinkInfo()
// .
// . HHHHHHHH HHHHHHHH HHHHHHHH HHHHHHHH  H = sitehash32 of linkEE
// . pppppppp pppppppp pppppppp pppppppp  p = linkEEHash, q = ~linkerSiteRank
// . pppppppp pppppppS qqqqqqqq cccccccc  c = lower ip byte, S = isLinkSpam?
// . IIIIIIII IIIIIIII IIIIIIII dddddddd  I = upper 3 bytes of ip
// . dddddddd dddddddd dddddddd dddddd00  d = linkerdocid, h = half bit, Z = delbit
// . mmmmmmmm mmmmmm0N xxxxxxxx xxxxxxss  N = 1 if it was added to existing page
// . ssssssss ssssssss ssssssss sssssshZ  s = sitehash32 of linker
// .                                      m = discovery date in days since jan 1
// .                                      x = estimated date it was lost
// .                                          (0 if not yet lost)
//
// NOTE: the "c" bits were the hopcount of the inlinker, but we changed
// them to the lower ip byte so steve can show the # of unique ips linking
// to your page or site.

#ifndef _LINKDB_H_
#define _LINKDB_H_

#define LDBKS sizeof(key224_t)

#define LDB_MAXSITERANK 0xff
#define LDB_MAXHOPCOUNT 0xff
#define LDB_MAXURLHASH  0x00007fffffffffffLL

#define LINKDBEPOCH (1325376000-365*86400*4)
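// . illustrative sketch (not in the original file): how a unix timestamp
//   maps into the 14-bit "m" (discovery date) field above. the key stores
//   whole days since LINKDBEPOCH (1325376000 minus four 365-day years,
//   i.e. early jan 2008), so 14 bits covers roughly 44 years:
//
//     uint32_t t    = 1356998400;                   // jan 1, 2013 utc
//     uint32_t days = ( t - LINKDBEPOCH ) / 86400;  // 1826, stored in key
//     uint32_t back = days * 86400 + LINKDBEPOCH;   // what the accessor
//                                                   // below reconstructs,
//                                                   // day granularity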
#include "Conf.h"
#include "Rdb.h"
//#include "DiskPageCache.h"
#include "Titledb.h"

void handleRequest25 ( UdpSlot *slot , int32_t netnice ) ;

// . get the inlinkers to this SITE (any page on this site)
// . use that to compute a site quality
// . also get the inlinkers sorted by date and see how many good inlinkers
//   we had since X days ago. (each inlinker needs a pub/birth date)
class Msg25Request {
public:
        // either MODE_PAGELINKINFO or MODE_SITELINKINFO
        char       m_mode;
        //bool     m_isSiteLinkInfo ;
        int32_t    m_ip ;
        int64_t    m_docId ;
        collnum_t  m_collnum ;
        bool       m_isInjecting ;
        bool       m_printInXml ;

        // when we get a reply we call this
        void      *m_state ;
        void     (*m_callback)(void *state) ;

        // server-side parms so it doesn't have to allocate a state
        //SafeBuf  m_pbuf ;
        //SafeBuf  m_linkInfoBuf ;

        //char    *coll ;
        //char    *qbuf ;
        //int32_t  qbufSize ;
        //XmlDoc  *xd ;

        int32_t    m_siteNumInlinks ;
        class LinkInfo *m_oldLinkInfo ;
        int32_t    m_niceness ;
        bool       m_doLinkSpamCheck ;
        bool       m_oneVotePerIpDom ;
        bool       m_canBeCancelled ;
        int32_t    m_lastUpdateTime ;
        bool       m_onlyNeedGoodInlinks ;
        bool       m_getLinkerTitles ;
        int32_t    m_ourHostHash32 ;
        int32_t    m_ourDomHash32 ;

        uint8_t    m_waitingInLine:1;
        uint8_t    m_reserved1:1;
        uint8_t    m_reserved2:1;
        uint8_t    m_reserved3:1;
        uint8_t    m_reserved4:1;
        uint8_t    m_reserved5:1;
        uint8_t    m_reserved6:1;
        uint8_t    m_reserved7:1;

        // new stuff
        int32_t    m_siteHash32;
        int64_t    m_siteHash64;
        int64_t    m_linkHash64;

        // for linked list of these guys in g_lineTable in Linkdb.cpp,
        // but only used on the server end, not the client end
        class Msg25Request *m_next;

        // the multicast we use
        class Multicast *m_mcast;

        UdpSlot   *m_udpSlot;
        bool       m_printDebugMsgs;

        // store final LinkInfo reply in here
        SafeBuf   *m_linkInfoBuf;

        char      *ptr_site;
        char      *ptr_url;
        char      *ptr_oldLinkInfo;

        int32_t    size_site;
        int32_t    size_url;
        int32_t    size_oldLinkInfo;

        // variable data begins here

        int32_t getStoredSize();
        void    serialize();
        void    deserialize();
};

// . returns false if blocked, true otherwise
// . sets g_errno on error
// . your req->m_callback will be called with the Msg25Reply
bool getLinkInfo ( SafeBuf   *reqBuf , // store msg25 request in here
                   Multicast *mcast ,  // use this to send msg 0x25 request
                   char      *site ,
                   char      *url ,
                   bool       isSiteLinkInfo ,
                   int32_t    ip ,
                   int64_t    docId ,
                   collnum_t  collnum ,
                   char      *qbuf ,
                   int32_t    qbufSize ,
                   void      *state ,
                   void     (*callback)(void *state) ,
                   bool       isInjecting ,
                   SafeBuf   *pbuf ,
                   //class XmlDoc *xd ,
                   bool       printInXml ,
                   int32_t    siteNumInlinks ,
                   //int32_t  sitePop ,
                   LinkInfo  *oldLinkInfo ,
                   int32_t    niceness ,
                   bool       doLinkSpamCheck ,
                   bool       oneVotePerIpDom ,
                   bool       canBeCancelled ,
                   int32_t    lastUpdateTime ,
                   bool       onlyNeedGoodInlinks ,
                   bool       getLinkerTitles , //= false ,
                   // if an inlinking document has an outlink
                   // of one of these hashes then we set
                   // Msg20Reply::m_hadLinkToOurDomOrHost.
                   // it is used to remove an inlinker to a related
                   // docid, which also links to our main seo url
                   // being processed. so we do not recommend
                   // such links since they already link to a page
                   // on your domain or hostname. set BOTH to zero
                   // to not perform this algo in handleRequest20()'s
                   // call to XmlDoc::getMsg20Reply().
                   int32_t    ourHostHash32 , // = 0 ,
                   int32_t    ourDomHash32 , // = 0 );
                   SafeBuf   *myLinkInfoBuf );

void handleRequest25 ( UdpSlot *slot , int32_t netnice ) ;

int32_t getSiteRank ( int32_t sni ) ;

class Linkdb {
public:
        void reset();

        bool init  ( );
        bool init2 ( int32_t treeMem );

        bool verify  ( char *coll );
        bool addColl ( char *coll , bool doVerify = true );

        bool setMetaList ( char         *metaList ,
                           char         *metaListEnd ,
                           class XmlDoc *oldDoc ,
                           class XmlDoc *newDoc ,
                           int32_t       niceness ,
                           int32_t      *numBytesWritten ) ;

        // this makes a "url" key
        key224_t makeKey_uk ( uint32_t      linkeeSiteHash32 ,
                              uint64_t      linkeeUrlHash64 ,
                              bool          isLinkSpam ,
                              unsigned char linkerSiteRank , // 0-15 i guess
                              unsigned char linkerHopCount ,
                              uint32_t      linkerIp ,
                              int64_t       linkerDocId ,
                              uint32_t      discoveryDate ,
                              uint32_t      lostDate ,
                              bool          newAddToOldPage ,
                              uint32_t      linkerSiteHash32 ,
                              bool          isDelete );

        key224_t makeStartKey_uk ( uint32_t linkeeSiteHash32 ,
                                   uint64_t linkeeUrlHash64 = 0LL ) {
                return makeKey_uk ( linkeeSiteHash32 ,
                                    linkeeUrlHash64 ,
                                    false ,      // linkspam?
                                    255 , // 15, // ~siterank
                                    0 ,          // hopcount
                                    0 ,          // ip
                                    0 ,          // docid
                                    0 ,          // discovery date
                                    0 ,          // lostdate
                                    false ,      // newaddtopage
                                    0 ,          // linkersitehash
                                    true );      // is delete?
        }

        key224_t makeEndKey_uk ( uint32_t linkeeSiteHash32 ,
                                 uint64_t linkeeUrlHash64 =
                                 0xffffffffffffffffLL ) {
                return makeKey_uk ( linkeeSiteHash32 ,
                                    linkeeUrlHash64 ,
                                    true ,       // linkspam?
                                    0 ,          // ~siterank
                                    0xff ,       // hopcount
                                    0xffffffff , // ip
                                    MAX_DOCID ,  // docid
                                    0xffffffff , // discovery date
                                    0xffffffff , // lostdate
                                    true ,       // newaddtopage
                                    0xffffffff , // linkersitehash
                                    false );     // is delete?
        }
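        // . illustrative usage sketch (not in the original file): to scan
        //   every linkdb rec for one linkee url, bracket its hashes with
        //   the two helpers above and read the range from linkdb.
        //   "siteHash32" and "urlHash64" are assumed caller inputs:
        //
        //     key224_t sk = g_linkdb.makeStartKey_uk ( siteHash32, urlHash64 );
        //     key224_t ek = g_linkdb.makeEndKey_uk   ( siteHash32, urlHash64 );
        //     // hand [sk,ek] to Msg5 to read the linkdb list; every
        //     // 28-byte key in the returned RdbList is one inlinker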
        /*
        int64_t getUrlHash ( Url *url ) {
                // . now use the probable docid (UNMASKED)
                // . this makes it so we reside on the same host as the
                //   titleRec and spiderRec of this url. that way Msg14
                //   is assured of adding the Linkdb rec before the Spiderdb
                //   rec and therefore of getting its parent as an inlinker.
                int64_t h = g_titledb.getProbableDocId(url,false);
                // now it is the lower 47 bits since we added the spam bit
                return h & 0x00007fffffffffffLL;
        }
        */

        //
        // accessors for "url" keys in linkdb
        //

        uint32_t getLinkeeSiteHash32_uk ( key224_t *key ) {
                return (key->n3) >> 32;
        }

        uint64_t getLinkeeUrlHash64_uk ( key224_t *key ) {
                uint64_t h = key->n3;
                h &= 0x00000000ffffffffLL;
                h <<= 15;
                h |= key->n2 >> 49;
                return h;
        }

        char isLinkSpam_uk ( key224_t *key ) {
                if ( (key->n2) & 0x1000000000000LL ) return true;
                return false;
        }

        unsigned char getLinkerSiteRank_uk ( key224_t *k ) {
                unsigned char rank = (k->n2 >> 40) & 0xff;
                // complement it back
                rank = (unsigned char)~rank; //LDB_MAXSITERANK - rank;
                return rank;
        }

        //unsigned char getLinkerHopCount_uk ( key224_t *k ) {
        //      return (k->n2 >> 32) & 0xff;
        //}

        int32_t getLinkerIp_uk ( key224_t *k ) {
                uint32_t ip ;
                // the most significant part of the ip is the lower byte!!!
                ip  = (uint32_t)((k->n2>>8)&0x00ffffff);
                ip |= ((k->n2>>8) & 0xff000000);
                return ip;
        }

        void setIp32_uk ( void *k , uint32_t ip ) {
                char *ips = (char *)&ip;
                char *ks  = (char *)k;
                ks[16] = ips[3];
                ks[15] = ips[2];
                ks[14] = ips[1];
                ks[13] = ips[0];
        }

        // we are missing the lower byte, it will be zero
        int32_t getLinkerIp24_uk ( key224_t *k ) {
                return (int32_t)((k->n2>>8)&0x00ffffff);
        }

        int64_t getLinkerDocId_uk ( key224_t *k ) {
                uint64_t d = k->n2 & 0xff;
                d <<= 30;
                d |= k->n1 >> 34;
                return d;
        }

        // . in days since jan 1, 2012 utc
        // . timestamp of jan 1, 2012 utc is 1325376000
        int32_t getDiscoveryDate_uk ( void *k ) {
                uint32_t date = ((key224_t *)k)->n1 >> 18;
                date &= 0x00003fff;
                // if 0 return that
                if ( date == 0 ) return 0;
                // multiply by seconds in days then
                date *= 86400;
                // add OUR epoch
                date += LINKDBEPOCH;
                // and use that
                return date;
        }

        // . in days since jan 1, 2012 utc
        // . timestamp of jan 1, 2012 utc is 1325376000
        void setDiscoveryDate_uk ( void *k , int32_t date ) {
                // subtract jan 1 2012
                date -= LINKDBEPOCH;
                // convert into days
                date /= 86400;
                // sanity
                if ( date > 0x3fff || date < 0 ) { char *xx=NULL;*xx=0; }
                // clear old bits (14 date bits at offset 18)
                ((key224_t *)k)->n1 &= 0xffffffff0003ffffLL;
                // scale us into it
                ((key224_t *)k)->n1 |= ((uint64_t)date) << 18;
        }

        int32_t getLostDate_uk ( void *k ) {
                uint32_t date = ((key224_t *)k)->n1 >> 2;
                date &= 0x00003fff;
                // if 0 return that
                if ( date == 0 ) return 0;
                // multiply by seconds in days then
                date *= 86400;
                // add OUR epoch
                date += LINKDBEPOCH;
                // and use that
                return date;
        }

        // . in days since jan 1, 2012 utc
        // . timestamp of jan 1, 2012 utc is 1325376000
        void setLostDate_uk ( void *k , int32_t date ) {
                // subtract jan 1 2012
                date -= LINKDBEPOCH;
                // convert into days
                date /= 86400;
                // sanity
                if ( date > 0x3fff || date < 0 ) { char *xx=NULL;*xx=0; }
                // clear old bits
                ((key224_t *)k)->n1 &= 0xffffffffffff0003LL;
                // scale us into it
                ((key224_t *)k)->n1 |= ((uint64_t)date) << 2;
        }

        uint32_t getLinkerSiteHash32_uk ( void *k ) {
                uint32_t sh32 = ((key224_t *)k)->n1 & 0x00000003;
                sh32 <<= 30;
                sh32 |= ((key224_t *)k)->n0 >> 2;
                return sh32;
        }

        Rdb *getRdb() { return &m_rdb; };

        //DiskPageCache *getDiskPageCache() { return &m_pc; };
        //DiskPageCache m_pc;

private:
        Rdb m_rdb;
};

extern class Linkdb g_linkdb;
extern class Linkdb g_linkdb2;
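// . illustrative sketch (not in the original file): pulling the packed
//   fields back out of a linkdb key "k" with the accessors above:
//
//     uint32_t siteHash = g_linkdb.getLinkeeSiteHash32_uk ( &k );
//     int32_t  ip       = g_linkdb.getLinkerIp_uk         ( &k );
//     int64_t  docId    = g_linkdb.getLinkerDocId_uk      ( &k );
//     int32_t  born     = g_linkdb.getDiscoveryDate_uk    ( &k );
//     // "born" is a unix timestamp rounded down to day granularity,
//     // or 0 if no discovery date was ever set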
// . get ALL the linkText classes for a url and merge 'em into a LinkInfo class
// . also gets the link-adjusted quality of our site's url (root url)
// . first gets all docIds of docs that link to that url via a link: search
// . gets the LinkText, customized for our url, from each docId in that list
// . merge them into a final LinkInfo class for your url

//#include "LinkText.h"
#include "Msg2.h"  // for getting IndexLists from Indexdb
#include "Msg20.h" // for getting this url's LinkInfo from another cluster
#include "SafeBuf.h"
#include "HashTableX.h"
#include "Msg22.h"
#include "CatRec.h"

#define MAX_LINKERS 3000

// if a linker is a "title rec not found" or log spam, then we get another
// linker's titleRec. churn through up to this many titleRecs in an attempt
// to get MAX_LINKERS good titleRecs before giving up.
//#define MAX_DOCIDS_TO_SAMPLE 25000
// on news.google.com, 22393 of the 25000 are link spam, and we only end
// up getting 508 good inlinks, so raise from 25000 to 50000
//#define MAX_DOCIDS_TO_SAMPLE 50000
// try a ton of lookups so we can ditch xfactor and keep the posdb key as
// simple as possible. just make sure we recycle link info a lot!
#define MAX_DOCIDS_TO_SAMPLE 1000000

// go down from 300 to 100 so XmlDoc::getRecommendLinksBuf() can launch
// like 5 msg25s and have no fear of having >500 msg20 requests outstanding,
// which clogs things up.
// crap, no, on gk144 we got 128 hosts now, so put back to 300...
// if we have fewer hosts then limit this proportionately in Linkdb.cpp
#define MAX_MSG20_OUTSTANDING 300

#define MAX_NOTE_BUF_LEN 20000

#define MSG25_MAX_REQUEST_SIZE (MAX_URL_LEN+MAX_COLL_LEN+64)
//#define MSG25_MAX_REPLY_SIZE 1024

void handleRequest25 ( UdpSlot *slot , int32_t netnice ) ;

class Msg25 {
public:
        // . returns false if blocked, true otherwise
        // . sets g_errno on error
        // . this sets Msg25::m_siteRootQuality and Msg25::m_linkInfo
        // . "url/coll" should NOT be on the stack in case we block
        // . if "reallyGetLinkInfo" is false we don't actually try to fetch
        //   any link text and return true right away. really saves a bunch
        //   of disk seeks when spidering small collections that don't need
        //   link text/info indexing/analysis.
        bool getLinkInfo2 ( char      *site ,
                            char      *url ,
                            bool       isSiteLinkInfo ,
                            int32_t    ip ,
                            int64_t    docId ,
                            //char    *coll ,
                            collnum_t  collnum ,
                            char      *qbuf ,
                            int32_t    qbufSize ,
                            void      *state ,
                            void     (*callback)(void *state) ,
                            bool       isInjecting ,
                            //SafeBuf *pbuf ,
                            bool       printDebugMsgs , // into "Msg25::m_pbuf"
                            //class XmlDoc *xd ,
                            bool       printInXml ,
                            int32_t    siteNumInlinks ,
                            //int32_t  sitePop ,
                            LinkInfo  *oldLinkInfo ,
                            int32_t    niceness ,
                            bool       doLinkSpamCheck ,
                            bool       oneVotePerIpDom ,
                            bool       canBeCancelled ,
                            int32_t    lastUpdateTime ,
                            bool       onlyNeedGoodInlinks ,
                            bool       getLinkerTitles , //= false ,
                            // if an inlinking document has an outlink
                            // of one of these hashes then we set
                            // Msg20Reply::m_hadLinkToOurDomOrHost.
                            // it is used to remove an inlinker to a related
                            // docid, which also links to our main seo url
                            // being processed. so we do not recommend
                            // such links since they already link to a page
                            // on your domain or hostname. set BOTH to zero
                            // to not perform this algo in handleRequest20()'s
                            // call to XmlDoc::getMsg20Reply().
                            int32_t    ourHostHash32 , // = 0 ,
                            int32_t    ourDomHash32 , // = 0 );
                            SafeBuf   *myLinkInfoBuf );

        Msg25();
        ~Msg25();
        void reset();
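        // . illustrative control-flow sketch (reconstructed from the method
        //   names below, not copied from Linkdb.cpp): one Msg25 round
        //   roughly does
        //
        //     getLinkingDocIds() -> m_msg5 reads the linkdb list for linkee
        //     gotList()          -> dedup docids/ips, cap at MAX_LINKERS
        //     sendRequests()     -> fan out Msg20s for each linker's text
        //     gotLinkText()      -> tally good/spam/dup votes per reply
        //     gotMsg25Reply()    -> makeLinkInfo() serializes the winners
        //                           into m_linkInfoBuf, then
        //                           m_callback(m_state) fires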
        // . based on the linkInfo, are we banned/unbanned/clean/dirty?
        // . the linking url may set these bits in its link: term score
        // . these bits are set based on the linker's siteRec
        //bool isBanned   () {return (m_msg18.isBanned  () || m_isBanned  );};
        //bool isUnbanned () {return (m_msg18.isUnbanned() || m_isUnbanned);};
        //bool isDirty    () {return (m_msg18.isDirty   () || m_isDirty   );};
        //bool isClean    () {return (m_msg18.isClean   () || m_isClean   );};

        // we also set these bits from looking at url's link scores
        //bool m_isBanned;
        //bool m_isUnbanned;
        //bool m_isDirty;
        //bool m_isClean;

        //char getMinInlinkerHopCount() { return m_minInlinkerHopCount; };

        // a new parm referencing the request we got over the network
        class Msg25Request *m_req25;

        class Msg20Reply *getLoser ( class Msg20Reply *r ,
                                     class Msg20Reply *p );
        char *isDup ( class Msg20Reply *r , class Msg20Reply *p );

        bool addNote ( char *note , int32_t noteLen , int64_t docId );

        //class LinkInfo *getLinkInfo() { return m_linkInfo; };

        // m_linkInfo ptr references into here. provided by caller.
        SafeBuf *m_linkInfoBuf;

        SafeBuf m_realBuf;

        // private:

        // these need to be public for wrappers to call:
        bool gotTermFreq      ( bool msg42Called ) ;
        bool getRootTitleRec  ( ) ;
        bool gotRootTitleRec  ( ) ;
        bool gotDocId         ( ) ;
        //bool gotRootQuality2 ( ) ;
        bool gotRootLinkText  ( ) ;
        bool gotRootLinkText2 ( ) ;
        bool getLinkingDocIds ( ) ;
        bool gotList          ( ) ;
        bool gotClusterRecs   ( ) ;
        bool sendRequests     ( ) ;
        bool gotLinkText      ( class Msg20Request *req ) ; //int32_t j );
        bool gotMsg25Reply    ( ) ;
        bool doReadLoop       ( ) ;

        // input vars
        //Url *m_url;
        //Url  m_tmpUrl;
        char *m_url;
        char *m_site;

        int32_t m_ourHostHash32;
        int32_t m_ourDomHash32;

        int32_t  m_round;
        uint64_t m_linkHash64;
        key224_t m_nextKey;

        bool m_retried;
        bool m_prependWWW;
        bool m_onlyNeedGoodInlinks;
        bool m_getLinkerTitles;

        int64_t m_docId;
        //char *m_coll;
        collnum_t m_collnum;
        //int32_t m_collLen;

        //LinkInfo *m_linkInfo;

        void  *m_state;
        void (*m_callback) ( void *state );

        int32_t m_siteNumInlinks;
        //int32_t m_sitePop;
        int32_t m_mode;
        bool    m_printInXml;

        //class XmlDoc *m_xd;

        // private:

        // url info
        int32_t m_ip;
        int32_t m_top;
        int32_t m_midDomHash;

        bool m_gettingList;

        // hack for seo pipeline in xmldoc.cpp
        int32_t m_hackrd;

        // . we use Msg0 to get an indexList for href: terms
        // . the href: IndexList's docIds are docs that link to us
        // . we now use Msg2 since it has "restrictIndexdb" support to limit
        //   indexdb searches to just the root file to decrease disk seeks
        //Msg0 m_msg0;
        Msg5    m_msg5;
        RdbList m_list;

        class Inlink *m_k;

        // for getting the root title rec so we can share its pwids
        Msg22 m_msg22;

        int32_t m_maxNumLinkers;

        // should we free the m_replyPtrs on destruction? default=true
        bool m_ownReplies;

        // Now we just save the replies we get back from Msg20::getSummary().
        // We point to them with a LinkTextReply, which is just a pointer
        // and some access functions.
        Msg20Reply *m_replyPtrs  [ MAX_LINKERS ];
        int32_t     m_replySizes [ MAX_LINKERS ];
        int32_t     m_numReplyPtrs;

        //LinkText *m_linkTexts [ MAX_LINKERS ];

        Msg20        m_msg20s        [ MAX_MSG20_OUTSTANDING ];
        Msg20Request m_msg20Requests [ MAX_MSG20_OUTSTANDING ];
        char         m_inUse         [ MAX_MSG20_OUTSTANDING ];
        // for "fake" replies
        Msg20Reply   m_msg20Replies  [ MAX_MSG20_OUTSTANDING ];
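        // . illustrative note (not in the original file): m_msg20s is a
        //   fixed pool cycled via the m_inUse flags, roughly:
        //
        //     int32_t i;
        //     for ( i = 0 ; i < MAX_MSG20_OUTSTANDING ; i++ )
        //             if ( ! m_inUse[i] ) break; // grab a free slot
        //     // launch m_msg20s[i], set m_inUse[i] = 1, and clear the
        //     // flag in the reply callback so the slot can be reused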
        // make this dynamic to avoid wasting so much space when most pages
        // have *very* few inlinkers. make it point to m_dbuf by default.
        //int64_t m_docIds [ MAX_DOCIDS_TO_SAMPLE ];
        //char m_hasLinkText [ MAX_LINKERS ];

        // make this dynamic as well! (see m_docIds comment above)
        //char m_scores [ MAX_DOCIDS_TO_SAMPLE ];

        int32_t m_numDocIds;

        int32_t m_cblocks;
        int32_t m_uniqueIps;

        // new stuff for getting term freqs for really huge link: termlists
        //int64_t m_termId;
        //Msg42 m_msg42;
        int32_t m_minRecSizes;
        //int32_t m_termFreq;

        // Msg20 is for getting the LinkInfo class from this same url's
        // titleRec from another (usually much larger) gigablast
        // cluster/network
        Msg20 m_msg20;

        // how many msg20s have we sent/recvd?
        int32_t m_numRequests;
        int32_t m_numReplies;
        int32_t m_linkSpamOut;

        // have we had an error for any transaction?
        int32_t m_errno;

        // this is used for link ban checks
        //Msg18 m_msg18;

        SafeBuf  m_tmp;
        SafeBuf *m_pbuf; // will point to m_tmp if m_printDebugMsgs

        // for holding the final linkinfo output
        //SafeBuf m_linkInfoBuf;

        // copied from CollectionRec
        bool    m_oneVotePerIpDom ;
        bool    m_doLinkSpamCheck ;
        bool    m_isInjecting ;
        char    m_canBeCancelled ;
        int32_t m_lastUpdateTime ;

        Multicast m_mcast;

        //char **m_statusPtr;

        int32_t m_good;
        int32_t m_errors;
        int32_t m_noText;
        int32_t m_reciprocal;

        bool m_spideringEnabled;

        //TermTable m_ipTable;
        //int32_t m_ipdups;

        int32_t m_dupCount;
        int32_t m_vectorDups;
        int32_t m_spamLinks;
        int32_t m_niceness;
        int32_t m_numFromSameIp;
        int32_t m_sameMidDomain;

        // stats for allowing some link spam inlinks to vote
        int32_t m_spamCount;
        int32_t m_spamWeight;
        int32_t m_maxSpam;

        char    m_siteQuality;
        int32_t m_siteNumFreshInlinks;

        // this is used for the linkdb list
        //HashTableT m_ipTable;
        HashTableX m_ipTable;
        HashTableX m_fullIpTable;
        HashTableX m_firstIpTable;

        // this is for deduping docids because we now combine the linkdb
        // list of docids with the old inlinks in the old link info
        //HashTableT m_docIdTable;
        HashTableX m_docIdTable;

        // special counts
        int32_t m_ipDupsLinkdb;
        int32_t m_docIdDupsLinkdb;
        int32_t m_linkSpamLinkdb;
        int32_t m_lostLinks;
        int32_t m_ipDups;

        uint32_t m_groupId;
        int64_t  m_probDocId;

        LinkInfo *m_oldLinkInfo;

        char  m_buf [ MAX_NOTE_BUF_LEN ];
        char *m_bufPtr;
        char *m_bufEnd;
        HashTableX m_table;

        char    m_request [ MSG25_MAX_REQUEST_SIZE ];
        int32_t m_requestSize;
        //char m_replyBuf [ MSG25_MAX_REQUEST_SIZE ];

        // hop count got from linkdb
        //char m_minInlinkerHopCount;

        HashTableX m_adBanTable;

        // for setting or determining if a search result's
        // inlinkers also have the query terms. buzz.
        char    *m_qbuf;
        int32_t  m_qbufSize;
};

// used by Msg25::addNote()
#define MAX_ENTRY_DOCIDS 10
class NoteEntry {
public:
        int32_t  m_count;
        char    *m_note;
        int64_t  m_docIds[MAX_ENTRY_DOCIDS];
};
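// . illustrative note (not in the original file), a plausible reading of
//   the fields only, not verified against Linkdb.cpp: addNote() appears
//   to keep one NoteEntry per distinct note string, bumping m_count per
//   offending linker and remembering at most MAX_ENTRY_DOCIDS example
//   docids, e.g.:
//
//     // 37 linkers flagged "same ip"; first 10 docids kept as samples
//     // NoteEntry { m_count=37 , m_note="same ip" , m_docIds={...} }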
// . takes a bunch of Msg20Replies and makes a serialized buffer, LinkInfo
// . LinkInfo's buffer consists of a bunch of serialized "Inlinks" as
//   defined below
// . THINK OF THIS CLASS as a Msg25 reply ("Msg25Reply") class

#include "Xml.h"

// how big can the rss item we store in the Inlink::ptr_rssItem be?
#define MAX_RSSITEM_SIZE 30000

class LinkInfo {
public:
        int32_t getStoredSize ( ) { return m_lisize; };
        int32_t getSize       ( ) { return m_lisize; };

        time_t getLastUpdated ( ) { return (time_t)m_lastUpdated; };

        //int32_t getNumTotalInlinks ( ) {
        //      if ( this == NULL ) return 0; return m_numTotalInlinks; };

        int32_t getNumLinkTexts ( ) {
                if ( this == NULL ) return 0; return m_numStoredInlinks; };

        int32_t getNumGoodInlinks ( ) {
                if ( this == NULL ) return 0; return m_numGoodInlinks; };

        // how many of the inlinks are from the same ip top?
        //int32_t getNumInternalInlinks( ) {
        //      if ( this == NULL ) return 0; return m_numInlinksInternal; };
        // how many inlinks are from a different ip top?
        //int32_t getNumExternalInlinks( ) {
        //      if ( this == NULL ) return 0;
        //      return m_numInlinks - m_numInlinksInternal; };

        //int32_t getNumInlinksExtrapolated ( ) {
        //      if ( this == NULL ) return 0;
        //      return m_numInlinksExtrapolated; };

        // update the string ptrs. called for each Inlink.
        void updateStringPtrs ( );

        // this returns a ptr to a static Inlink in some cases, so beware
        //class Inlink *getNextInlink ( class Inlink *k ) ;
        class Inlink *getNextInlink ( class Inlink *k ) ;

        bool getItemXml ( Xml *xml , int32_t niceness ) ;

        bool hasLinkText ( );

        /*
        bool hash ( TermTable *table ,
                    int32_t    externalLinkTextWeight ,
                    int32_t    internalLinkTextWeight ,
                    int32_t    ip ,
                    int32_t    version ,
                    int32_t    siteNumInlinks ,
                    TermTable *countTable ,
                    char      *note ,
                    int32_t    niceness ) ;
        */

        // for PageTitledb
        bool print ( class SafeBuf *sb , char *coll );

        // adds up the page pops of the inlinkers as long as they are from
        // a different site than "u" is
        //int32_t computePagePop ( class Url *u , char *coll ) ;

        bool hasRSSItem();

        // a small header, followed by the buf of "Inlinks", m_buf[]
        char m_version;
        // we usually keep no more than 10 or so internal guys, so this
        // can be a single byte
        char m_numInlinksInternal;
        char m_reserved1; // was m_siteRootQuality
        char m_reserved2;

        // includes Inlinks in m_buf[] below
        int32_t m_lisize;

        // this is really a time_t but that changes and this can't change!
        int32_t m_lastUpdated;

        // this is precisely how many inlinks we stored in m_buf[] below
        int32_t m_numStoredInlinks; //m_numTotalInlinks;

        // . only valid if titleRec version >= 119, otherwise it's always 0
        // . this count includes internal as well as external links, i.e.
        //   just the total inlinks we got, counting at most one inlink per
        //   page. it is not very useful i guess, but steve wants it.
        int32_t m_totalInlinkingDocIds; //reserved3;

        // . how many inlinks did we have that were "good"?
        // . this is typically less than the # of Inlinks stored in m_buf
        //   below because it does not include internal cblock inlinks
        int32_t m_numGoodInlinks;

        // . # of c blocks linking to this page/site
        // . only valid if titlerec version >= 119
        // . includes your own internal cblock
        int32_t m_numUniqueCBlocks; //m_pagePop;

        // . # of IPs linking to this page/site
        // . only valid if titlerec version >= 119
        // . includes your own internal ip
        int32_t m_numUniqueIps; //numInlinksFresh; // was m_reserved3;

        //int32_t m_sitePop;
        //int32_t m_siteNumInlinks;

        // serialize "Inlinks" into this buffer, m_buf[]
        char m_buf[0];
} __attribute__((packed, aligned(4)));
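// . illustrative usage sketch (not in the original file): walking every
//   serialized Inlink in a LinkInfo with the accessor above; passing NULL
//   is assumed to return the first Inlink ("info" is a LinkInfo from a
//   title rec):
//
//     for ( Inlink *k = NULL ; ( k = info->getNextInlink ( k ) ) ; ) {
//             char *u = k->getUrl();
//             char *t = k->getLinkText();
//             // ...
//     }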
#define MAXINLINKSTRINGBUFSIZE 2048

class Inlink { // : public Msg {
public:
        //int32_t *getFirstSizeParm () { return &size_urlBuf; };
        //int32_t *getLastSizeParm  () { return &size_rssItem; };
        //int32_t *getFirstOffPtr   () { return &off_urlBuf; };
        //int32_t  getBaseSize      () { return sizeof(Inlink); };
        //char    *getStringBuf     () { return m_buf; };
        //int32_t  getBaseNumStrings() {
        //      return (char **)&size_urlBuf - (char **)&off_urlBuf; };

        // zero ourselves out
        void reset() ;

        void set ( class Msg20Reply *reply );

        // set ourselves from a serialized older-versioned Inlink
        void set2 ( class Inlink *old );

        bool setXmlFromRSS ( Xml *xml , int32_t niceness ) ;
        //bool setXmlFromLinkText ( Xml *xml ) ;

        // . set a Msg20Reply from ourselves
        // . Msg25 uses this to recycle old inlinks that are now gone
        // . allows us to preserve ptr_rssInfo, etc.
        void setMsg20Reply ( class Msg20Reply *r ) ;

        int32_t getStoredSize ( ) ;

        // . return ptr to the buffer we serialize into
        // . return NULL and set g_errno on error
        char *serialize ( int32_t *retSize ,
                          char    *userBuf ,
                          int32_t  userBufSize ,
                          bool     makePtrsRefNewBuf ) ;

        //int32_t updateStringPtrs ( char *buf );

        // returns a ptr into a static buffer
        char *getLinkTextAsUtf8 ( int32_t *len = NULL ) ;

        int32_t m_ip ;               // 0
        int64_t m_docId ;            // 4
        int32_t m_firstSpidered ;    // 12
        int32_t m_lastSpidered ;     // 16
        int32_t m_nextSpiderDate ;   // 20

        // like in the titleRec, the lower 2 bits of the datedbDate have
        // special meaning:
        // 0x00 --> datedb date extracted from content (pubdate)
        // 0x01 --> datedb date based on estimated "modified" time (moddate)
        // 0x10 --> datedb date is when same-site root was estimated to have
        //          first added that url as an outlink (discoverdate) (TODO)
        int32_t m_datedbDate ;       // 24

        // this date is used as the discovery date for purposes of computing
        // LinkInfo::m_numInlinksFresh
        int32_t m_firstIndexedDate ; // 28
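        // . illustrative sketch (not in the original file): the two flag
        //   values above read as binary bit patterns, so a caller would
        //   split the field roughly like this:
        //
        //     int32_t dateType = m_datedbDate &  0x03; // 0=pub,1=mod,
        //                                              // 2=discover
        //     int32_t dateOnly = m_datedbDate & ~0x03; // timestamp with
        //                                              // flag bits masked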
        //int32_t m_baseScore ;
        int32_t m_pageNumInlinks ;   // 32
        int32_t m_siteNumInlinks ;   // 36

        // record the word position we hashed this link text with
        // so we can match it to the DocIdScoringInfo stuff
        int32_t m_wordPosStart; //reservedc; //pagePop             // 40
        int32_t m_firstIp;      //wordPosEnd; //reservedd; //sitePop // 44

        // . int32_t m_reserved1 ;

        // . how many strings do we have?
        // . makes it easy to add new strings later
        uint16_t m_reserved_NumStrings ;        // 48
        // . and where our first string ptr starts
        // . allows us to set ourselves from an "old" Inlink
        uint16_t m_reserved_FirstStrPtrOffset ; // 50

        uint16_t m_numOutlinks ;                // 52

        // i guess no need to store this stuff if we are storing the url
        // in ptr_urlBuf below. we can call Url::set() then Url::getHostHash().
        // NO, because the site is now only contained in the TagRec, and
        // we compute the site in SiteGetter.cpp, so it is more complicated!!!
        // we get the tag rec of each outlink, get the site from that,
        // hash that and store it here.

        // we got a 2-byte padding before this PADPADPADPADP
        int16_t m_pad0;

        int32_t m_siteHash ;   // www.homepages.com/~fred/ // 56
        //int32_t m_hostHash ;   // www.ibm.com
        //int32_t m_midDomHash ; // the ibm in ibm.com

        // single bit flags
        uint16_t m_isPermalink      : 1 ; // 60
        uint16_t m_outlinkInContent : 1 ;
        uint16_t m_outlinkInComment : 1 ;
        uint16_t m_isReserved       : 1 ; // was u-n-i-c-o-d-e- bit
        uint16_t m_isLinkSpam       : 1 ;
        //uint16_t m_isAnomaly      : 1 ;
        // when Msg20Request::ptr_qbuf is set and
        // Msg20Request::m_computeLinkInfo is true, Msg20 calls Msg25, which
        // in turn calls one Msg20 for each inlinker the doc has, thereby
        // passing the ptr_qbuf into each of those Msg20s. if the inlinker
        // matches the query then it sets m_hasAllQueryTerms to true and
        // returns the Msg20Reply to Msg25. when Msg25 is done it calls
        // makeLinkInfo() to make a LinkInfo out of all those Msg20Replies.
        // we use m_hasAllQueryTerms to display the absScore2 of each
        // inlinker in the raw xml search results feed for buzz.
        uint16_t m_hasAllQueryTerms : 1 ;
        // if we imported it from the old LinkInfo. helps us preserve
        // rssInfo, hopcounts, etc.
        uint16_t m_recycled  : 1 ;
        uint16_t m_reserved4 : 1 ;
        uint16_t m_reserved5 : 1 ;
        uint16_t m_reserved6 : 1 ;
        uint16_t m_reserved7 : 1 ;
        uint16_t m_reserved8 : 1 ;
        uint16_t m_reserved9 : 1 ;
        uint16_t m_reserveda : 1 ;
        uint16_t m_reservedb : 1 ;

        uint16_t m_country ;  // 62
        uint8_t  m_language ; // 64

        //char m_docQuality ;
        char m_siteRank;  // 65
        //char m_ruleset ;
        char m_hopcount ; // 66

        char m_linkTextScoreWeight ; // 0-100% (was m_inlinkWeight) // 67

        char *getUrl ( ) {
                if ( size_urlBuf == 0 ) return NULL;
                return m_buf ; //+ off_urlBuf;
        };
        char *getLinkText ( ) {
                if ( size_linkText == 0 ) return NULL;
                //return m_buf + off_linkText;
                return m_buf + size_urlBuf;
        };
        char *getSurroundingText ( ) {
                if ( size_surroundingText == 0 ) return NULL;
                //return m_buf + off_surroundingText;
                return m_buf + size_urlBuf + size_linkText;
        };
        char *getRSSItem ( ) {
                if ( size_rssItem == 0 ) return NULL;
                //return m_buf + off_rssItem;
                return m_buf + size_urlBuf + size_linkText +
                       size_surroundingText;
        };
        char *getCategories ( ) {
                if ( size_categories == 0 ) return NULL;
                //return m_buf + off_categories;
                return m_buf + size_urlBuf + size_linkText +
                       size_surroundingText + size_rssItem;
        };
        char *getGigabitQuery ( ) {
                if ( size_gigabitQuery == 0 ) return NULL;
                //return m_buf + off_gigabitQuery;
                return m_buf + size_urlBuf + size_linkText +
                       size_surroundingText + size_rssItem +
                       size_categories;
        };
        char *getTemplateVector ( ) {
                if ( size_templateVector == 0 ) return NULL;
                //return m_buf + off_templateVector;
                return m_buf + size_urlBuf + size_linkText +
                       size_surroundingText + size_rssItem +
                       size_categories + size_gigabitQuery;
        };

        //
        // add new non-strings right above this line
        //

        // . the url, link text and neighborhoods are stored in here
        // . no need to store the vector for vote deduping in here because
        //   that uses MsgE's Msg20Replies directly
        // . this is just stuff we want in the title rec
        int32_t off_urlBuf ; // 68
        int32_t off_linkText ;
        int32_t off_surroundingText ; // neighborhoods

        // . this is the rss item that links to us
        // . if calling Msg25::getLinkInfo() with getLinkerTitles set to
        //   true then this is the title!
        int32_t off_rssItem ;

        // . zakbot and the turk categorize site roots, and kids inherit
        //   the categories from their parent inlinkers
        // . we can't really use tagdb cuz that operates on subdirectories,
        //   which may not be upheld for some sites. (like cnn.com!, the
        //   stories are not proper subdirectories...)
        // . so inherit the category from our inlinkers. "sports", "world",...
        // . comma-separated (in ascii)
        int32_t off_categories ;

        // . augments our own gigabits vector, used for finding related docs
        // . used along with the template vector for deduping pgs at index
        //   time
        // . now we use it for finding similar docs AND categorizing
        // . comma-separated
        // . each gigabit has a count in []'s. score in body x1, title x5,
        //   and inlink text x5. i.e. "News[10],blue devils[5],..."
        // . always in UTF-8
        int32_t off_gigabitQuery ;

        // . the html tag vector
        // . used for deduping voters (anti-spam tech)
        // . used along with the gigabit vector for deduping pgs at index
        //   time
        // . now we use it for finding similar docs and for categorizing
        //   (spam)
        int32_t off_templateVector ;

        //
        // add new strings right above this line
        //

        int32_t size_urlBuf ;
        int32_t size_linkText ;
        int32_t size_surroundingText ;
        int32_t size_rssItem ;
        int32_t size_categories ;
        int32_t size_gigabitQuery ;
        int32_t size_templateVector ;

        char m_buf[MAXINLINKSTRINGBUFSIZE] ;
} __attribute__((packed, aligned(4)));
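// . illustrative note (not in the original file): the off_* fields are the
//   legacy layout; the accessors above recompute each string's start by
//   summing the size_* fields that precede it, since the strings sit
//   back-to-back (each NUL-terminated) in m_buf:
//
//     [url][linkText][surroundingText][rssItem][categories]...
//
//   so, e.g., getSurroundingText() == m_buf + size_urlBuf + size_linkText.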
// . this function is normally called like "info = makeLinkInfo()"
//   to create a new LinkInfo based on a bunch of Msg20 replies
// . returns NULL and sets g_errno on error
LinkInfo *makeLinkInfo ( char         *coll ,
                         int32_t       ip ,
                         //char        siteRootQuality ,
                         //int32_t     sitePop ,
                         int32_t       siteNumInlinks ,
                         Msg20Reply  **replies ,
                         int32_t       numReplies ,
                         //int32_t     extrapolated ,
                         //int32_t     xfactor ,
                         // if link spam give this weight
                         int32_t       spamWeight ,
                         bool          oneVotePerIpTop ,
                         int64_t       linkeeDocId ,
                         int32_t       lastUpdateTime ,
                         bool          onlyNeedGoodInlinks ,
                         int32_t       niceness ,
                         class Msg25  *msg25 ,
                         SafeBuf      *linkInfoBuf ) ;

// . set from the Msg20 replies in MsgE
// . Msg20 uses this to set the LinkInfo class for the "outlinks"
// . if an outlink has no docid, it is not stored, because it was
//   therefore not in the index
LinkInfo *makeLinkInfo ( class MsgE *m , int32_t niceness ) ;

////////
//
// LINKS CLASS
//
////////

//typedef int16_t linkflags_t;
typedef int32_t linkflags_t;

// all the links (urls), separated by \0's, are put into a buf of this size
#define LINK_BUF_SIZE (100*1024)

// we allow up to this many links to be put into m_buf
//#define MAX_LINKS 10000

//#define MSR_HAD_REC 0x80
//#define NUM_TYPES_IN_MSR 2
//class MiniSiteRec {
//public:
//      bool hadRec() { return m_flags & MSR_HAD_REC; };
//      int16_t  m_siteOffset;
//      int16_t  m_siteLen;
//      int32_t  m_filenum;
//      uint8_t  m_flags;
//      char     m_siteQuality;
//      SiteType m_types[NUM_TYPES_IN_MSR];
//      SiteType m_lang;
//};

// Link Flags
#define LF_SAMEHOST      0x0001 // same hostname
#define LF_SAMEDOM       0x0002 // same domain
#define LF_SITEROOT      0x0004 // for blogrolls
#define LF_SAMESITE      0x0008 // only get offsite outlink info in Msg20.cpp
#define LF_OLDLINK       0x0010 // set if it was on the pg last spider time
#define LF_RSS           0x0020 // is it from an rss <link> tag?
#define LF_PERMALINK     0x0040 // a probable permalink? of permalink format?
#define LF_SUBDIR        0x0080 // is the outlink in a subdir of parent?
#define LF_AHREFTAG      0x0100 // an <a href> outlink
#define LF_LINKTAG       0x0200 // a <link> outlink
#define LF_FBTAG         0x0400 // a feed burner original outlink
#define LF_SELFLINK      0x0800 // links to self
#define LF_SELFPERMALINK 0x1000 // has "permalink" link text or attribute
#define LF_STRONGPERM    0x2000 // is permalink of /yyyy/mm/dd/ format
#define LF_EDUTLD        0x4000
#define LF_GOVTLD        0x8000
#define LF_NOFOLLOW      0x10000

bool isPermalink ( //char *coll ,
                   class Links    *links ,
                   class Url      *u ,
                   char            contentType ,
                   class LinkInfo *linkInfo ,
                   bool            isRSS ,
                   char          **note         = NULL ,
                   char           *pathOverride = NULL ,
                   bool            ignoreCgi    = false ,
                   linkflags_t    *extraFlags   = NULL ) ;
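// . illustrative sketch (not in the original file): linkflags_t is a plain
//   bitmask, so flags combine with | and test with &, e.g.:
//
//     linkflags_t f = LF_AHREFTAG | LF_SAMEDOM | LF_NOFOLLOW;
//     bool offsite  = ! ( f & (LF_SAMEHOST|LF_SAMEDOM|LF_SAMESITE) );
//     // LF_NOFOLLOW is why linkflags_t grew from int16_t to int32_t:
//     // 0x10000 no longer fits in 16 bits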
class Links {
public:
        Links();
        ~Links();
        void reset();

        // call this before calling hash() and write()
        bool set ( bool     useRelNoFollow ,
                   Xml     *xml ,
                   Url     *parentUrl ,
                   bool     setLinkHashes ,
                   // use NULL for this if you do not have a baseUrl
                   Url     *baseUrl ,
                   int32_t  version ,
                   int32_t  niceness ,
                   //bool   addSiteRootFlag = false ,
                   //char  *coll = NULL ,
                   bool     parentIsPermalink , // = false ,
                   Links   *oldLinks , // for LF_OLDLINK flag
                   // this is used by Msg13.cpp to quickly get ptrs
                   // to the links in the document, no normalization!
                   bool     doQuickSet = false ,
                   class SafeBuf *diffbotReply = NULL );

        // set from a simple text buffer
        bool set ( char *buf , int32_t niceness ) ;

        bool print ( SafeBuf *sb ) ; // Links in ascii text

        bool addLink ( char       *link ,
                       int32_t     linkLen ,
                       int32_t     nodeNum ,
                       bool        setLinkHashes ,
                       int32_t     titleRecVersion ,
                       int32_t     niceness ,
                       bool        isRSS ,
                       int32_t     tagId ,
                       linkflags_t flagsArg );

        // . link spam functions. used by linkspam.cpp's setLinkSpam().
        // . also used by Linkdb.cpp to create a linkdb list to add to rdb
        // . we do not add outlinks to linkdb if they are "link spam"
        bool setAllSpamBits ( char *note ) { m_spamNote = note; return true; }
        void setSpamBit  ( char *note , int32_t i ) { m_spamNotes[i] = note; }
        void setSpamBits ( char *note , int32_t i ) {
                for ( int32_t j = i ; j < m_numLinks ; j++ )
                        m_spamNotes[j] = note; }

        //bool ... ( HashTableX<...> *dt ,
        //           class XmlDoc *xd ,
        //           int32_t niceness );

        // . does link #n have link text that has at least 1 alnum char in
        //   it?
        // . used for scoring link: terms to make link-text adds more
        //   efficient
        bool hasLinkText ( int32_t n , int32_t version );

        // . returns false on error and sets g_errno
        // . get our outgoing link text for this url
        // . store it into "buf"
        int32_t getLinkText ( char     *linkee ,
                              bool      getSiteLinkInfo ,
                              char     *buf ,
                              int32_t   maxBufLen ,
                              //bool    filter ,
                              char    **itemPtr ,
                              int32_t  *itemLen ,
                              int32_t  *retNode1 , // = NULL ,
                              int32_t  *retLinkNum ,
                              int32_t   niceness );

        int32_t getLinkText2 ( int32_t   i ,
                               char     *buf ,
                               int32_t   maxBufLen ,
                               //bool    filter ,
                               char    **itemPtr ,
                               int32_t  *itemLen ,
                               int32_t  *retNode1 , // = NULL ,
                               int32_t   niceness );

        // quick n dirty check for substrings in link text
        char *linkTextSubstr ( int32_t linkNum , char *string ,
                               int32_t niceness );

        // returns list of \0 terminated, normalized links
        char *getLinkBuf() { return m_allocBuf; };
        int32_t getLinkBufLen() {
                if ( m_allocBuf ) return m_bufPtr - m_allocBuf;
                return 0;
                //return m_allocBuf ? m_bufPtr - m_allocBuf : 0;
        };

        //uint32_t *getLinkHashes () { return m_linkHashes; };

        int32_t getNumLinks() { return m_numLinks; };

        // was there a link to gigablast.com or www.gigablast.com?
        bool linksToGigablast() { return m_linksToGigablast; };

        int32_t  getLinkLen ( int32_t i ) { return m_linkLens [i]; };
        char    *getLink    ( int32_t i ) { return m_linkPtrs [i]; };
        char    *getLinkPtr ( int32_t i ) { return m_linkPtrs [i]; };
        uint32_t getLinkHash32 ( int32_t i ) {
                return (uint32_t)m_linkHashes[i]; };
        uint64_t getLinkHash64 ( int32_t i ) { return m_linkHashes[i]; };
        uint64_t getHostHash64 ( int32_t i ) { return m_hostHashes[i]; };
        int32_t  getDomHash32  ( int32_t i ) { return m_domHashes[i]; };
        int32_t  getNodeNum    ( int32_t i ) { return m_linkNodes[i]; };

        bool hasRelNoFollow() { return m_hasRelNoFollow; };

        char *getLinkHost ( int32_t i , int32_t *hostLen ) ;

        int32_t findLinkNum ( char *url , int32_t urlLen );

        int32_t getMemUsed() { return m_allocSize; };

        bool hasSelfPermalink ( ) { return m_hasSelfPermalink; };
        bool hasRSSOutlink    ( ) { return m_hasRSSOutlink; };
        bool hasSubdirOutlink ( ) { return m_hasSubdirOutlink; };

        // . make an RdbList for adding to spiderdb
        // . returns -1 and sets g_errno on error
        // . otherwise returns # of outlinks added to "list"
        // . used by Msg14.cpp for adding outlinks to spiderdb
        /*
        char *addToMetaList ( char  *p ,    // metalist start
                              char  *pend , // metalist end
                              class TitleRec *tr ,
                              class XmlDoc   *old ,
                              char  *coll ,
                              class MsgE     *msge ,
                              int32_t niceness ,
                              Url   *quickLink           = NULL ,
                              linkflags_t quickLinkFlags = 0 ,
                              bool   isAddUrl            = false ,
                              bool   forceAll            = false ,
                              bool   skipExternalLinks   = false ,
                              bool   unforceAll          = false ,
                              int32_t explicitPriority   = -1 );
        */
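        // . illustrative usage sketch (not in the original file): the link
        //   buffer holds each normalized url NUL-terminated, back-to-back,
        //   so a caller can walk it like this, or just index m_linkPtrs[]:
        //
        //     char *p    = links.getLinkBuf();
        //     char *pend = p + links.getLinkBufLen();
        //     for ( ; p < pend ; p += strlen(p) + 1 ) {
        //             // p is one \0-terminated, normalized url
        //     }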
        // private:

        Xml *m_xml;

        Url  *m_baseUrl;
        Url  *m_parentUrl;
        bool  m_parentIsPermalink;

        char    *m_baseSite;
        int32_t  m_baseSiteLen;

        // set, if any, into m_tmpUrl so m_baseUrl can point to it
        Url m_tmpUrl;

        // . we store all links in this buf
        // . each link ends in a \0
        // . convenient for passing to Msg10
        // . each link is in complete http:// format with base url, etc.
        char *m_buf; // [LINK_BUF_SIZE];

        // pointer to the end of the buffer
        char *m_bufPtr;

        // running count of the bufsize, including static and dynamic
        //int32_t m_bufSize;

        // this is non-NULL if all outlinks are considered link spam;
        // otherwise individual outlinks will have their m_spamNotes[i] be
        // non-NULL, pointing to the string that describes why they are
        // link spam.
        char *m_spamNote;

        char        **m_linkPtrs;   // [MAX_LINKS]
        int32_t      *m_linkLens;   // [MAX_LINKS]
        int32_t      *m_linkNodes;  // [MAX_LINKS]
        uint64_t     *m_linkHashes; // [MAX_LINKS]
        uint64_t     *m_hostHashes; // [MAX_LINKS]
        int32_t      *m_domHashes;  // [MAX_LINKS]
        linkflags_t  *m_linkFlags;
        char         *m_linkContactyTypes; // for XmlDoc's isContacty() algo
        char        **m_spamNotes;

        bool m_doQuickSet;

        // do we have an rss link? i.e. are we an RSS feed?
        bool m_hasRSS;
        bool m_isFeedBurner;

        char   *m_linkBuf;
        int32_t m_allocLinks;
        int32_t m_numLinks;
        int32_t m_numNodes;

        // . should we extract redirects from links? (like for yahoo's links)
        // . this is set based on the SiteRec
        //bool m_extractRedirects;

        bool m_linksToGigablast;
        bool m_hasRelNoFollow;
        bool m_stripIds;

        uint32_t m_allocSize;
        char    *m_allocBuf;

        // queue the blog roll links into the turk for voting
        bool queueBlogRoll ( class TagRec **tagRecPtrs , int32_t niceness ) ;

        bool  m_addSiteRootFlags;
        char *m_coll;

        char m_flagged;
        char m_hasSelfPermalink;
        char m_hasRSSOutlink;
        char m_hasSubdirOutlink;

        char    *m_rssOutlinkPtr;
        int32_t  m_rssOutlinkLen;

        // . returns 0 if probably not a permalink
        // . returns 1 if probably is a permalink
        // . returns -1 if not enough information to make a decision
        char isPermalink ( char **note ) { return -1; };

        int32_t m_numOutlinksAdded;
};

int32_t getRegExpNumOfOutlink ( Url           *up ,
                                linkflags_t    linkFlags ,
                                TagRec        *tagRec ,
                                int32_t        quality ,
                                int32_t        ip ,
                                CollectionRec *cr ,
                                Url           *parentUrl ,
                                int32_t        sourceHostHash ,
                                int32_t        parentHopCount ,
                                int32_t        parentPriority ,
                                int32_t        hopCount , // our hop count
                                int32_t        h ,        // hostHash
                                bool           newOutlink , // are we new?
                                bool           isAddUrl ,   // from addUrl?
                                // use -1 if unknown for these 3 values
                                char           isParentRSS ,
                                char           parentIsNew ,
                                char           parentIsPermalink ,
                                char           isIndexed ); // -1 --> unknown

#endif