// open-source-search-engine/Linkdb.h
// Gigablast, Inc. Copyright April 2007
// Linkdb - stores link information
// . Format of a 28-byte key in linkdb
// . used by Msg25::getPageLinkInfo()
// .
// . HHHHHHHH HHHHHHHH HHHHHHHH HHHHHHHH H = sitehash32 of linkEE
// . pppppppp pppppppp pppppppp pppppppp p = linkEEHash, q = ~linkerSiteRank
// . pppppppp pppppppS qqqqqqqq cccccccc c = lower ip byte, S = isLinkSpam?
// . IIIIIIII IIIIIIII IIIIIIII dddddddd I = upper 3 bytes of ip
// . dddddddd dddddddd dddddddd dddddd00 d = linkerdocid, h = halfbit, Z = delbit
// . mmmmmmmm mmmmmm0N xxxxxxxx xxxxxxss N = 1 if it was added to existing page
// . ssssssss ssssssss ssssssss sssssshZ s = sitehash32 of linker
// m = discovery date in days since the linkdb epoch (see LINKDBEPOCH)
// x = estimated date it was lost, in the same units (0 if not yet lost)
//
// NOTE: the "c" bits were the hopcount of the inlinker, but we changed
// them to the lower ip byte so steve can show the # of unique ips linking
// to your page or site.
#ifndef _LINKDB_H_
#define _LINKDB_H_
#define LDBKS sizeof(key224_t)
#define LDB_MAXSITERANK 0xff
#define LDB_MAXHOPCOUNT 0xff
#define LDB_MAXURLHASH 0x00007fffffffffffLL
#define LINKDBEPOCH (1325376000-365*86400*4)
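// (1325376000 is jan 1 2012 utc; backing up 4*365 days puts this epoch in
//  early january 2008, so the 14-bit day fields below can span ~45 years)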
#include "Conf.h"
#include "Rdb.h"
#include "DiskPageCache.h"
#include "Titledb.h"
void handleRequest25 ( UdpSlot *slot , long netnice ) ;
// . get the inlinkers to this SITE (any page on this site)
// . use that to compute a site quality
// . also get the inlinkers sorted by date and see how many good inlinkers
// we had since X days ago. (each inlinker needs a pub/birth date)
class Msg25Request {
public:
// either MODE_PAGELINKINFO or MODE_SITELINKINFO
char m_mode; // bool m_isSiteLinkInfo ;
long m_ip ;
long long m_docId ;
collnum_t m_collnum ;
bool m_isInjecting ;
bool m_printInXml ;
// when we get a reply we call this
void *m_state ;
void (* m_callback)(void *state) ;
// server-side parms so it doesn't have to allocate a state
//SafeBuf m_pbuf ;
//SafeBuf m_linkInfoBuf ;
//char *coll ;
//char *qbuf ;
//long qbufSize ;
//XmlDoc *xd ;
long m_siteNumInlinks ;
class LinkInfo *m_oldLinkInfo ;
long m_niceness ;
bool m_doLinkSpamCheck ;
bool m_oneVotePerIpDom ;
bool m_canBeCancelled ;
long m_lastUpdateTime ;
bool m_onlyNeedGoodInlinks ;
bool m_getLinkerTitles ;
long m_ourHostHash32 ;
long m_ourDomHash32 ;
// new stuff
long m_siteHash32;
long long m_siteHash64;
long long m_linkHash64;
// for linked list of these guys in g_lineTable in Linkdb.cpp
// but only used on the server end, not client end
class Msg25Request *m_next;
// the multicast we use
class Multicast *m_mcast;
UdpSlot *m_udpSlot;
bool m_printDebugMsgs;
// store final LinkInfo reply in here
SafeBuf *m_linkInfoBuf;
char *ptr_site;
char *ptr_url;
char *ptr_oldLinkInfo;
long size_site;
long size_url;
long size_oldLinkInfo;
char m_buf[0];
long getStoredSize();
void serialize();
void deserialize();
};
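// A minimal sketch (not the real Msg25Request::serialize()) of how the
// variable-length tail is meant to work: the fixed-size header is followed
// by the strings in m_buf[0], and each ptr_*/size_* pair describes one of
// them. "reqBuf" is a hypothetical caller buffer assumed to be at least
// sizeof(Msg25Request)+siteLen+urlLen bytes; memset/memcpy are assumed to
// be pulled in by the includes above.
static inline Msg25Request *packMsg25Request_sketch ( char *reqBuf ,
						       char *site , long siteLen ,
						       char *url  , long urlLen ) {
	Msg25Request *r = (Msg25Request *)reqBuf;
	memset ( r , 0 , sizeof(Msg25Request) );
	char *p = r->m_buf;
	memcpy ( p , site , siteLen );
	r->ptr_site  = p;
	r->size_site = siteLen;
	p += siteLen;
	memcpy ( p , url , urlLen );
	r->ptr_url  = p;
	r->size_url = urlLen;
	return r;
}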
// . returns false if blocked, true otherwise
// . sets errno on error
// . your req->m_callback will be called with the Msg25Reply
bool getLinkInfo ( SafeBuf *reqBuf , // store msg25 request in here
Multicast *mcast , // use this to send msg 0x25 request
char *site ,
char *url ,
bool isSiteLinkInfo ,
long ip ,
long long docId ,
collnum_t collnum ,
char *qbuf ,
long qbufSize ,
void *state ,
void (* callback)(void *state) ,
bool isInjecting ,
SafeBuf *pbuf ,
//class XmlDoc *xd ,
bool printInXml ,
long siteNumInlinks ,
//long sitePop ,
LinkInfo *oldLinkInfo ,
long niceness ,
bool doLinkSpamCheck ,
bool oneVotePerIpDom ,
bool canBeCancelled ,
long lastUpdateTime ,
bool onlyNeedGoodInlinks ,
bool getLinkerTitles , //= false ,
// if an inlinking document has an outlink
// of one of these hashes then we set
// Msg20Reply::m_hadLinkToOurDomOrHost.
// it is used to remove an inlinker to a related
// docid, which also links to our main seo url
// being processed. so we do not recommend
// such links since they already link to a page
// on your domain or hostname. set BOTH to zero
// to not perform this algo in handleRequest20()'s
// call to XmlDoc::getMsg20Reply().
long ourHostHash32 , // = 0 ,
long ourDomHash32 , // = 0 );
SafeBuf *myLinkInfoBuf );
void handleRequest25 ( UdpSlot *slot , long netnice ) ;
long getSiteRank ( long sni ) ;
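// A minimal sketch of the non-blocking calling convention for getLinkInfo().
// All names and literal values below are hypothetical; the argument order
// just mirrors the declaration above. A false return means the request
// blocked and "callback(state)" fires later, once "myLinkInfoBuf" holds the
// serialized LinkInfo reply (or g_errno is set).
static void gotSiteLinkInfo_sketch ( void *state ) {
	// the caller-owned reply SafeBuf is filled in at this point
	(void)state;
}

static inline bool requestSiteLinkInfo_sketch ( SafeBuf   *reqBuf ,
						Multicast *mcast ,
						SafeBuf   *linkInfoBuf ,
						char *site , char *url ,
						long ip , long long docId ,
						collnum_t collnum ) {
	return getLinkInfo ( reqBuf ,
			     mcast ,
			     site ,
			     url ,
			     true ,                   // isSiteLinkInfo
			     ip ,
			     docId ,
			     collnum ,
			     NULL , 0 ,               // qbuf , qbufSize
			     NULL ,                   // state passed to callback
			     gotSiteLinkInfo_sketch ,
			     false ,                  // isInjecting
			     NULL ,                   // pbuf (debug output)
			     false ,                  // printInXml
			     0 ,                      // siteNumInlinks
			     NULL ,                   // oldLinkInfo
			     1 ,                      // niceness
			     true ,                   // doLinkSpamCheck
			     true ,                   // oneVotePerIpDom
			     false ,                  // canBeCancelled
			     0 ,                      // lastUpdateTime
			     true ,                   // onlyNeedGoodInlinks
			     false ,                  // getLinkerTitles
			     0 , 0 ,                  // ourHostHash32/ourDomHash32 (0 = skip)
			     linkInfoBuf );
}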
class Linkdb {
public:
void reset();
bool init ( );
bool init2 ( long treeMem );
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
bool setMetaList ( char *metaList ,
char *metaListEnd ,
class XmlDoc *oldDoc ,
class XmlDoc *newDoc ,
long niceness ,
long *numBytesWritten ) ;
// this makes a "url" key
key224_t makeKey_uk ( uint32_t linkeeSiteHash32 ,
uint64_t linkeeUrlHash64 ,
bool isLinkSpam ,
unsigned char linkerSiteRank , // 0-15 i guess
unsigned char linkerHopCount ,
uint32_t linkerIp ,
long long linkerDocId ,
unsigned long discoveryDate ,
unsigned long lostDate ,
bool newAddToOldPage ,
uint32_t linkerSiteHash32 ,
bool isDelete );
key224_t makeStartKey_uk ( uint32_t linkeeSiteHash32 ,
uint64_t linkeeUrlHash64 = 0LL ) {
return makeKey_uk ( linkeeSiteHash32,
linkeeUrlHash64,
false, // linkspam?
255, // 15, // ~siterank
0, // hopcount
0, // ip
0, // docid
0, //discovery date
0, // lostdate
false, // newaddtopage
0, // linkersitehash
true); // is delete?
}
key224_t makeEndKey_uk ( uint32_t linkeeSiteHash32 ,
uint64_t linkeeUrlHash64 =
0xffffffffffffffffLL ) {
return makeKey_uk ( linkeeSiteHash32,
linkeeUrlHash64,
true, // linkspam?
0, // ~siterank
0xff, // hopcount
0xffffffff, // ip
MAX_DOCID, // docid
0xffffffff, //discovery date
0xffffffff, // lostdate
true, // newaddtopage
0xffffffff, // linkersitehash
false); // is delete?
}
/*
long long getUrlHash ( Url *url ) {
// . now use the probable docid (UNMASKED)
// . this makes it so we reside on the same host as the
// titleRec and spiderRec of this url. that way Msg14
// is assured of adding the Linkdb rec before the Spiderdb
// rec and therefore of getting its parent as an inlinker.
long long h = g_titledb.getProbableDocId(url,false);
// now it is the lower 47 bits since we added the spam bit
return h & 0x00007fffffffffffLL; }
*/
//
// accessors for "url" keys in linkdb
//
unsigned long getLinkeeSiteHash32_uk ( key224_t *key ) {
return (key->n3) >> 32; }
unsigned long long getLinkeeUrlHash64_uk ( key224_t *key ) {
unsigned long long h = key->n3;
h &= 0x00000000ffffffffLL;
h <<= 15;
h |= key->n2 >> 49;
return h;
}
char isLinkSpam_uk (key224_t *key ) {
if ((key->n2) & 0x1000000000000LL) return true;
return false;
}
unsigned char getLinkerSiteRank_uk ( key224_t *k ) {
unsigned char rank = (k->n2 >> 40) & 0xff;
// complement it back
rank = (unsigned char)~rank;//LDB_MAXSITERANK - rank;
return rank;
}
//unsigned char getLinkerHopCount_uk ( key224_t *k ) {
// return (k->n2 >> 32) & 0xff;
//}
long getLinkerIp_uk ( key224_t *k ) {
unsigned long ip ;
// the most significant part of the ip is the lower byte!!!
ip = (unsigned long)((k->n2>>8)&0x00ffffff);
ip |= ((k->n2>>8) & 0xff000000);
return ip;
}
void setIp32_uk ( void *k , unsigned long ip ) {
char *ips = (char *)&ip;
char *ks = (char *)k;
ks[16] = ips[3];
ks[15] = ips[2];
ks[14] = ips[1];
ks[13] = ips[0];
}
// we are missing the lower byte, it will be zero
long getLinkerIp24_uk ( key224_t *k ) {
return (long)((k->n2>>8)&0x00ffffff);
}
long long getLinkerDocId_uk( key224_t *k ) {
unsigned long long d = k->n2 & 0xff;
d <<= 30;
d |= k->n1 >>34;
return d;
}
// . stored in the key as days since LINKDBEPOCH (a 14-bit field)
// . returns a unix timestamp in seconds, rounded down to the day (0 if unset)
long getDiscoveryDate_uk ( void *k ) {
uint32_t date = ((key224_t *)k)->n1 >> 18;
date &= 0x00003fff;
// if 0 return that
if ( date == 0 ) return 0;
// multiply by seconds in days then
date *= 86400;
// add OUR epoch
date += LINKDBEPOCH;
// and use that
return date;
}
// . takes a unix timestamp in seconds
// . stores it in the key as days since LINKDBEPOCH (a 14-bit field)
void setDiscoveryDate_uk ( void *k , long date ) {
// subtract jan 1 2012
date -= LINKDBEPOCH;
// convert into days
date /= 86400;
// sanity
if ( date > 0x3fff || date < 0 ) { char *xx=NULL;*xx=0; }
// clear old bits
((key224_t *)k)->n1 &= 0xffffffff0003ffffLL;
// scale us into it
((key224_t *)k)->n1 |= ((unsigned long long)date) << 18;
}
long getLostDate_uk ( void *k ) {
uint32_t date = ((key224_t *)k)->n1 >> 2;
date &= 0x00003fff;
// if 0 return that
if ( date == 0 ) return 0;
// multiply by seconds in days then
date *= 86400;
// add OUR epoch
date += LINKDBEPOCH;
// and use that
return date;
}
// . takes a unix timestamp in seconds
// . stores it in the key as days since LINKDBEPOCH (a 14-bit field)
void setLostDate_uk ( void *k , long date ) {
// subtract jan 1 2012
date -= LINKDBEPOCH;
// convert into days
date /= 86400;
// sanity
if ( date > 0x3fff || date < 0 ) { char *xx=NULL;*xx=0; }
// clear old bits
((key224_t *)k)->n1 &= 0xffffffffffff0003LL;
// scale us into it
((key224_t *)k)->n1 |= ((unsigned long long)date) << 2;
}
uint32_t getLinkerSiteHash32_uk( void *k ) {
uint32_t sh32 = ((key224_t *)k)->n1 & 0x00000003;
sh32 <<= 30;
sh32 |= ((key224_t *)k)->n0 >> 2;
return sh32;
}
Rdb *getRdb() { return &m_rdb; };
DiskPageCache *getDiskPageCache () { return &m_pc; };
DiskPageCache m_pc;
private:
Rdb m_rdb;
};
extern class Linkdb g_linkdb;
extern class Linkdb g_linkdb2;
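// A minimal sketch of reading linkdb "url" keys back out. makeStartKey_uk()
// and makeEndKey_uk() bracket every inlink record for one linkee url, e.g.:
//   key224_t sk = g_linkdb.makeStartKey_uk ( linkeeSiteHash32 , linkeeUrlHash64 );
//   key224_t ek = g_linkdb.makeEndKey_uk   ( linkeeSiteHash32 , linkeeUrlHash64 );
// and each 28-byte record in that range decodes with the accessors above.
// "recs"/"numRecs" are assumed to already hold keys pulled from linkdb.
static inline void dumpLinkdbRecs_sketch ( key224_t *recs , long numRecs ) {
	for ( long i = 0 ; i < numRecs ; i++ ) {
		key224_t *k = &recs[i];
		long          ip    = g_linkdb.getLinkerIp_uk       ( k );
		long long     docId = g_linkdb.getLinkerDocId_uk    ( k );
		unsigned char rank  = g_linkdb.getLinkerSiteRank_uk ( k );
		// discovery/lost dates come back as unix timestamps rounded
		// down to the day; 0 means unknown / not yet lost
		long born = g_linkdb.getDiscoveryDate_uk ( k );
		long lost = g_linkdb.getLostDate_uk      ( k );
		char spam = g_linkdb.isLinkSpam_uk       ( k );
		// (a real caller would do something with these)
		(void)ip; (void)docId; (void)rank;
		(void)born; (void)lost; (void)spam;
	}
}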
// . get ALL the linkText classes for a url and merge 'em into a LinkInfo class
// . also gets the link-adjusted quality of our site's url (root url)
// . first gets all docIds of docs that link to that url via an link: search
// . gets the LinkText, customized for our url, from each docId in that list
// . merge them into a final LinkInfo class for your url
//#include "LinkText.h"
#include "Msg2.h" // for getting IndexLists from Indexdb
#include "Msg20.h" // for getting this url's LinkInfo from another cluster
#include "SafeBuf.h"
#include "HashTableX.h"
#include "Msg22.h"
#include "CatRec.h"
#define MAX_LINKERS 3000
// if a linker is a "title rec not found" or is log spam, then we get another
// linker's titleRec. churn through up to this many titleRecs in an attempt
// to get MAX_LINKERS good titleRecs before giving up.
//#define MAX_DOCIDS_TO_SAMPLE 25000
// on news.google.com, 22393 of the 25000 are link spam, and we only end
// up getting 508 good inlinks, so raise from 25000 to 50000
//#define MAX_DOCIDS_TO_SAMPLE 50000
// try a ton of lookups so we can ditch xfactor and keep posdb key as
// simple as possible. just make sure we recycle link info a lot!
#define MAX_DOCIDS_TO_SAMPLE 1000000
// go down from 300 to 100 so XmlDoc::getRecommendLinksBuf() can launch
// like 5 msg25s and have no fear of having >500 msg20 requests outstanding
// which clogs things up
// crap, no, on gk144 we got 128 hosts now, so put back to 300...
// if we have fewer hosts then limit this proportionately in Linkdb.cpp
#define MAX_MSG20_OUTSTANDING 300
#define MAX_NOTE_BUF_LEN 20000
#define MSG25_MAX_REQUEST_SIZE (MAX_URL_LEN+MAX_COLL_LEN+64)
//#define MSG25_MAX_REPLY_SIZE 1024
void handleRequest25 ( UdpSlot *slot , long netnice ) ;
class Msg25 {
public:
// . returns false if blocked, true otherwise
// . sets errno on error
// . this sets Msg25::m_siteRootQuality and Msg25::m_linkInfo
// . "url/coll" should NOT be on stack in case weBlock
// . if "reallyGetLinkInfo" is false we don't actually try to fetch
// any link text and return true right away, really saves a bunch
// of disk seeks when spidering small collections that don't need
// link text/info indexing/analysis
bool getLinkInfo2 (char *site ,
char *url ,
bool isSiteLinkInfo ,
long ip ,
long long docId ,
//char *coll ,
collnum_t collnum,
char *qbuf ,
long qbufSize ,
void *state ,
void (* callback)(void *state) ,
bool isInjecting ,
//SafeBuf *pbuf ,
bool printDebugMsgs , // into "Msg25::m_pbuf"
//class XmlDoc *xd ,
bool printInXml ,
long siteNumInlinks ,
//long sitePop ,
LinkInfo *oldLinkInfo ,
long niceness ,
bool doLinkSpamCheck ,
bool oneVotePerIpDom ,
bool canBeCancelled ,
long lastUpdateTime ,
bool onlyNeedGoodInlinks ,
bool getLinkerTitles , //= false ,
// if an inlinking document has an outlink
// of one of these hashes then we set
// Msg20Reply::m_hadLinkToOurDomOrHost.
// it is used to remove an inlinker to a related
// docid, which also links to our main seo url
// being processed. so we do not recommend
// such links since they already link to a page
// on your domain or hostname. set BOTH to zero
// to not perform this algo in handleRequest20()'s
// call to XmlDoc::getMsg20Reply().
long ourHostHash32 , // = 0 ,
long ourDomHash32 , // = 0 );
SafeBuf *myLinkInfoBuf );
Msg25();
~Msg25();
void reset();
// . based on the linkInfo, are we banned/unbanned/clean/dirty
// . the linking url may set these bits in its link: term score
// . these bits are set based on the linker's siteRec
//bool isBanned () {return (m_msg18.isBanned () || m_isBanned ); };
//bool isUnbanned () {return (m_msg18.isUnbanned() || m_isUnbanned); };
//bool isDirty () {return (m_msg18.isDirty () || m_isDirty ); };
//bool isClean () {return (m_msg18.isClean () || m_isClean ); };
// we also set these bits from looking at url's link scores
//bool m_isBanned;
//bool m_isUnbanned;
//bool m_isDirty;
//bool m_isClean;
//char getMinInlinkerHopCount () { return m_minInlinkerHopCount; };
// a new parm referencing the request we got over the network
class Msg25Request * m_req25;
class Msg20Reply *getLoser (class Msg20Reply *r, class Msg20Reply *p);
char *isDup (class Msg20Reply *r, class Msg20Reply *p);
bool addNote ( char *note , long noteLen , long long docId );
//class LinkInfo *getLinkInfo () { return m_linkInfo; };
// m_linkInfo ptr references into here. provided by caller.
SafeBuf *m_linkInfoBuf;
SafeBuf m_realBuf;
// private:
// these need to be public for wrappers to call:
bool gotTermFreq ( bool msg42Called ) ;
bool getRootTitleRec ( ) ;
bool gotRootTitleRec ( );
bool gotDocId ( ) ;
//bool gotRootQuality2 ( ) ;
bool gotRootLinkText ( ) ;
bool gotRootLinkText2 ( ) ;
bool getLinkingDocIds ( ) ;
bool gotList ( ) ;
bool gotClusterRecs ( ) ;
bool sendRequests ( );
bool gotLinkText ( class Msg20Request *req ) ; //long j );
bool gotMsg25Reply ( ) ;
bool doReadLoop ( );
// input vars
//Url *m_url;
//Url m_tmpUrl;
char *m_url;
char *m_site;
long m_ourHostHash32;
long m_ourDomHash32;
long m_round;
uint64_t m_linkHash64;
key224_t m_nextKey;
bool m_retried;
bool m_prependWWW;
bool m_onlyNeedGoodInlinks;
bool m_getLinkerTitles;
long long m_docId;
//char *m_coll;
collnum_t m_collnum;
//long m_collLen;
//LinkInfo *m_linkInfo;
void *m_state;
void (* m_callback) ( void *state );
long m_siteNumInlinks;
//long m_sitePop;
long m_mode;
bool m_printInXml;
//class XmlDoc *m_xd;
// private:
// url info
long m_ip;
long m_top;
long m_midDomHash;
bool m_gettingList;
// hack for seo pipeline in xmldoc.cpp
long m_hackrd;
// . we use Msg0 to get an indexList for href: terms
// . the href: IndexList's docIds are docs that link to us
// . we now use Msg2 since it has "restrictIndexdb" support to limit
// indexdb searches to just the root file to decrease disk seeks
//Msg0 m_msg0;
Msg5 m_msg5;
RdbList m_list;
class Inlink *m_k;
// for getting the root title rec so we can share its pwids
Msg22 m_msg22;
long m_maxNumLinkers;
// should we free the m_replyPtrs on destruction? default=true
bool m_ownReplies;
// Now we just save the replies we get back from Msg20::getSummary()
// We point to them with a LinkTextReply, which is just a pointer
// and some access functions.
Msg20Reply *m_replyPtrs [ MAX_LINKERS ];
long m_replySizes [ MAX_LINKERS ];
long m_numReplyPtrs;
//LinkText *m_linkTexts [ MAX_LINKERS ];
Msg20 m_msg20s [ MAX_MSG20_OUTSTANDING ];
Msg20Request m_msg20Requests [ MAX_MSG20_OUTSTANDING ];
char m_inUse [ MAX_MSG20_OUTSTANDING ];
// for "fake" replies
Msg20Reply m_msg20Replies [ MAX_MSG20_OUTSTANDING ];
// make this dynamic to avoid wasting so much space when most pages
// have *very* few inlinkers. make it point to m_dbuf by default.
//long long m_docIds [ MAX_DOCIDS_TO_SAMPLE ];
//char m_hasLinkText [ MAX_LINKERS ];
// make this dynamic as well! (see m_docIds comment above)
//char m_scores [ MAX_DOCIDS_TO_SAMPLE ];
long m_numDocIds;
long m_cblocks;
long m_uniqueIps;
// new stuff for getting term freqs for really huge links: termlists
//long long m_termId;
//Msg42 m_msg42;
long m_minRecSizes;
//long m_termFreq;
// Msg20 is for getting the LinkInfo class from this same url's
// titleRec from another (usually much larger) gigablast cluster/network
Msg20 m_msg20;
// how many msg20s have we sent/recvd?
long m_numRequests;
long m_numReplies;
long m_linkSpamOut;
// have we had an error for any transaction?
long m_errno;
// this is used for link ban checks
//Msg18 m_msg18;
SafeBuf m_tmp;
SafeBuf *m_pbuf; // will point to m_tmp if m_printDebugMsgs
// for holding the final linkinfo output
//SafeBuf m_linkInfoBuf;
// copied from CollectionRec
bool m_oneVotePerIpDom ;
bool m_doLinkSpamCheck ;
bool m_isInjecting ;
char m_canBeCancelled ;
long m_lastUpdateTime ;
Multicast m_mcast;
//char **m_statusPtr;
long m_good;
long m_errors;
long m_noText;
long m_reciprocal;
bool m_spideringEnabled;
//TermTable m_ipTable;
//long m_ipdups;
long m_dupCount;
long m_vectorDups;
long m_spamLinks;
long m_niceness;
long m_numFromSameIp;
long m_sameMidDomain;
// stats for allowing some link spam inlinks to vote
long m_spamCount;
long m_spamWeight;
long m_maxSpam;
char m_siteQuality;
long m_siteNumFreshInlinks;
// this is used for the linkdb list
//HashTableT <long, char> m_ipTable;
HashTableX m_ipTable;
HashTableX m_fullIpTable;
HashTableX m_firstIpTable;
// this is for deduping docids because we now combine the linkdb
// list of docids with the old inlinks in the old link info
//HashTableT <long long, char> m_docIdTable;
HashTableX m_docIdTable;
// special counts
long m_ipDupsLinkdb;
long m_docIdDupsLinkdb;
long m_linkSpamLinkdb;
long m_lostLinks;
long m_ipDups;
unsigned long m_groupId;
long long m_probDocId;
LinkInfo *m_oldLinkInfo;
char m_buf [ MAX_NOTE_BUF_LEN ];
char *m_bufPtr;
char *m_bufEnd;
HashTableX m_table;
char m_request [ MSG25_MAX_REQUEST_SIZE ];
long m_requestSize;
//char m_replyBuf [ MSG25_MAX_REQUEST_SIZE ];
// hop count got from linkdb
//char m_minInlinkerHopCount;
HashTableX m_adBanTable;
// for setting <absScore2> or determining if a search result's
// inlinkers also have the query terms. buzz.
char *m_qbuf;
long m_qbufSize;
};
// used by Msg25::addNote()
#define MAX_ENTRY_DOCIDS 10
class NoteEntry {
public:
long m_count;
char *m_note;
long long m_docIds[MAX_ENTRY_DOCIDS];
};
// . takes a bunch of Msg20Replies and makes a serialized buffer, LinkInfo
// . LinkInfo's buffer consists of a bunch of serialized "Inlinks" as defined
// below
// . THINK OF THIS CLASS as a Msg25 reply ("Msg25Reply") class
#include "Xml.h"
// how big can the rss item we store in the Inlink::ptr_rssItem be?
#define MAX_RSSITEM_SIZE 30000
class LinkInfo {
public:
long getStoredSize ( ) { return m_size; };
long getSize ( ) { return m_size; };
time_t getLastUpdated ( ) { return m_lastUpdated; };
//long getNumTotalInlinks ( ) {
// if ( this == NULL ) return 0; return m_numTotalInlinks; };
long getNumLinkTexts ( ) {
if ( this == NULL ) return 0; return m_numStoredInlinks; };
long getNumGoodInlinks ( ) {
if ( this == NULL ) return 0; return m_numGoodInlinks; };
// how many of the inlinks are from the same ip top?
//long getNumInternalInlinks( ) {
// if ( this == NULL ) return 0; return m_numInlinksInternal; };
// how many inlinks are from a different ip top?
//long getNumExternalInlinks( ) {
// if ( this == NULL ) return 0;
// return m_numInlinks - m_numInlinksInternal; };
//long getNumInlinksExtrapolated ( ){
// if ( this == NULL ) return 0;return m_numInlinksExtrapolated;};
// update them for each Inlink. calls for each Inlink.
void updateStringPtrs ( );
// this returns a ptr to a static Inlink in some cases, so beware
class Inlink *getNextInlink ( class Inlink *k ) ;
// do not call this one
class Inlink *getNextInlink2 ( class Inlink *k ) ;
bool getItemXml ( Xml *xml , long niceness ) ;
bool hasLinkText ( );
/*
bool hash ( TermTable *table ,
long externalLinkTextWeight ,
long internalLinkTextWeight ,
long ip ,
long version ,
long siteNumInlinks ,
TermTable *countTable ,
char *note ,
long niceness ) ;
*/
// for PageTitledb
bool print ( class SafeBuf *sb , char *coll );
// adds up the page pops of the inlinkers as long as they are from
// a different site than "u" is
//long computePagePop ( class Url *u , char *coll ) ;
bool hasRSSItem();
// a small header, followed by the buf of "Inlinks", m_buf[]
char m_version;
// we usually keep no more than 10 or so internal guys, so this
// can be a single byte
char m_numInlinksInternal;
char m_reserved1; // was m_siteRootQuality
char m_reserved2;
long m_size;
time_t m_lastUpdated;
// this is precisely how many inlinks we stored in m_buf[] below
long m_numStoredInlinks;//m_numTotalInlinks;
// . only valid if titleRec version >= 119, otherwise it's always 0
// . this count includes internal as well as external links, i.e. just
// the total inlinks we got, counting at most one inlink per page.
// it is not very useful i guess, but steve wants it.
long m_totalInlinkingDocIds;//reserved3;
// . how many inlinks did we have that were "good"?
// . this is typically less than the # of Inlinks stored in m_buf below
// because it does not include internal cblock inlinks
long m_numGoodInlinks;
// . # of c blocks linking to this page/site
// . only valid if titlerecversion >= 119
// . includes your own internal cblock
long m_numUniqueCBlocks;//m_pagePop;
// . # of IPs linking to this page/site
// . only valid if titlerecversion >= 119
// . includes your own internal ip
long m_numUniqueIps;//numInlinksFresh; // was m_reserved3;
//long m_sitePop;
//long m_siteNumInlinks;
// serialize "Inlinks" into this buffer, m_buf[]
char m_buf[0];
};
class Inlink { // : public Msg {
public:
long *getFirstSizeParm () { return &size_urlBuf; };
long *getLastSizeParm () { return &size_rssItem; };
char **getFirstStrPtr () { return &ptr_urlBuf; };
long getBaseSize () { return sizeof(Inlink);};
char *getStringBuf () { return m_buf; };
long getBaseNumStrings() {
return (char **)&size_urlBuf - (char **)&ptr_urlBuf; };
// zero ourselves out
void reset() ;
void set ( class Msg20Reply *reply );
// set ourselves from a serialized older-versioned Inlink
void set2 ( class Inlink *old );
bool setXmlFromRSS ( Xml *xml , long niceness ) ;
//bool setXmlFromLinkText ( Xml *xml ) ;
// . set a Msg20Reply from ourselves
// . Msg25 uses this to recycle old inlinks that are now gone
// . allows us to preserve ptr_rssInfo, etc.
void setMsg20Reply ( class Msg20Reply *r ) ;
long getStoredSize ( ) ;
// . return ptr to the buffer we serialize into
// . return NULL and set g_errno on error
char *serialize ( long *retSize ,
char *userBuf ,
long userBufSize ,
bool makePtrsRefNewBuf ) ;
long updateStringPtrs ( char *buf );
// returns a ptr into a static buffer
char *getLinkTextAsUtf8 ( long *len = NULL ) ;
long m_ip ;
long long m_docId ;
long m_firstSpidered ;
long m_lastSpidered ;
long m_nextSpiderDate ;
// like in the titleRec, the lower 2 bits of the datedbDate have
// special meaning.
// 0x00 --> datedb date extracted from content (pubdate)
// 0x01 --> datedb date based on estimated "modified" time (moddate)
// 0x10 --> datedb date is when same-site root was estimated to have
// first added that url as an outlink (discoverdate) (TODO)
long m_datedbDate ;
// this date is used as the discovery date for purposes of computing
// LinkInfo::m_numInlinksFresh
long m_firstIndexedDate ;
//long m_baseScore ;
long m_pageNumInlinks ;
long m_siteNumInlinks ;
// record the word position we hashed this link text with
// so we can match it to the DocIdScoringInfo stuff
long m_wordPosStart;//reservedc;//pagePop ;
long m_firstIp;//wordPosEnd;//reservedd;//sitePop ;
// . long m_reserved1 ;
// . how many strings do we have?
// . makes it easy to add new strings later
uint16_t m_numStrings ;
// . and were our first string ptrs starts
// . allows us to set ourselves from an "old" Inlink
uint16_t m_firstStrPtrOffset ;
uint16_t m_numOutlinks ;
// i guess no need to store this stuff if we are storing the url
// in ptr_urlBuf below. we can call Url::set() then Url::getHostHash()
// NO, because the site is now only contained in the TagRec now and
// we compute the site in SiteGetter.cpp, so it is more complicated!!!
// we get the tag rec of each outlink, and get the site from that
// and hash that and store it here
long m_siteHash ; // www.homepages.com/~fred/
//long m_hostHash ; // www.ibm.com
//long m_midDomHash ; // the ibm in ibm.com
// single bit flags
uint16_t m_isPermalink : 1 ;
uint16_t m_outlinkInContent : 1 ;
uint16_t m_outlinkInComment : 1 ;
uint16_t m_isReserved : 1 ; // was u-n-i-c-o-d-e- bit
uint16_t m_isLinkSpam : 1 ;
//uint16_t m_isAnomaly : 1 ;
// when Msg20Request::ptr_qbuf is set and
// Msg20Request::m_computeLinkInfo is true, Msg20 calls Msg25, which
// in turn calls one Msg20 for each inlinker the doc has, thereby
// passing the ptr_qbuf into each of those Msg20s. if the inlinker
// matches the query then it sets m_hasAllQueryTerms to true and
// returns the Msg20Reply to Msg25. When Msg25 is done it calls
// makeLinkInfo() to make a LinkInfo out of all those Msg20Replies.
// We use m_hasAllQueryTerms to display the absScore2 of each inlinker
// in the raw xml search results feed for buzz.
uint16_t m_hasAllQueryTerms : 1 ;
// if we imported it from the old LinkInfo. helps us preserve rssInfo,
// hopcounts, etc.
uint16_t m_recycled : 1 ;
uint16_t m_reserved4 : 1 ;
uint16_t m_reserved5 : 1 ;
uint16_t m_reserved6 : 1 ;
uint16_t m_reserved7 : 1 ;
uint16_t m_reserved8 : 1 ;
uint16_t m_reserved9 : 1 ;
uint16_t m_reserveda : 1 ;
uint16_t m_reservedb : 1 ;
uint16_t m_country ;
uint8_t m_language ;
//char m_docQuality ;
char m_siteRank;
//char m_ruleset ;
char m_hopcount ;
char m_linkTextScoreWeight ; // 0-100% (was m_inlinkWeight)
//
// add new non-strings right above this line
//
// . the url, link text and neighborhoods are stored in here
// . no need to store a vector for voting deduping in here because
// that uses MsgE's Msg20Replies directly
// . this is just stuff we want in the title rec
char *ptr_urlBuf ;
char *ptr_linkText ;
char *ptr_surroundingText ; // neighborhoods
// . this is the rss item that links to us
// . if calling Msg25::getLinkInfo() with getLinkerTitles set to
// true then this is the title!
char *ptr_rssItem ;
// . zakbot and the turk categorize site roots, and kids inherit
// the categories from their parent inlinkers
// . we can't really use tagdb cuz that operates on subdirectories
// which may not be upheld for some sites. (like cnn.com!, the
// stories are not proper subdirectories...)
// . so inherit the category from our inlinkers. "sports", "world", ...
// . comma-separated (in ascii)
char *ptr_categories ;
// . augments our own gigabits vector, used for finding related docs
// . used along with the template vector for deduping pgs at index time
// . now we used for finding similar docs AND categorizing
// . comma-separated
// . each gigabit has a count in []'s. score in body x1, title x5,
// and inlink text x5. i.e. "News[10],blue devils[5],...
// . always in UTF-8
char *ptr_gigabitQuery ;
// . the html tag vector.
// . used for deduping voters (anti-spam tech)
// . used along with the gigabit vector for deduping pgs at index time
// . now we used for finding similar docs and for categorizing (spam)
char *ptr_templateVector ;
//
// add new strings right above this line
//
long size_urlBuf ;
long size_linkText ;
long size_surroundingText ;
long size_rssItem ;
long size_categories ;
long size_gigabitQuery ;
long size_templateVector ;
char m_buf[0] ;
};
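// A minimal sketch of walking the Inlinks serialized into a LinkInfo. Passing
// NULL is assumed to return the first stored Inlink; per the comment on
// getNextInlink() the returned pointer may reference a static buffer, so it
// should not be held across iterations.
static inline long countInlinksWithText_sketch ( LinkInfo *info ) {
	long count = 0;
	for ( Inlink *k = info->getNextInlink(NULL); k; k = info->getNextInlink(k) ) {
		long  tlen = 0;
		char *text = k->getLinkTextAsUtf8 ( &tlen );
		if ( text && tlen > 0 ) count++;
	}
	return count;
}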
// . this function is normally called like "info = makeLinkInfo()"
// to create a new LinkInfo based on a bunch of Msg20 replies
// . returns NULL and sets g_errno on error
LinkInfo *makeLinkInfo ( char *coll ,
long ip ,
//char siteRootQuality ,
//long sitePop ,
long siteNumInlinks ,
Msg20Reply **replies ,
long numReplies ,
//long extrapolated ,
//long xfactor ,
// if link spam give this weight
long spamWeight ,
bool oneVotePerIpTop ,
long long linkeeDocId ,
long lastUpdateTime ,
bool onlyNeedGoodInlinks ,
long niceness ,
class Msg25 *msg25 ,
SafeBuf *linkInfoBuf ) ;
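// A minimal sketch of turning a batch of Msg20 replies into a serialized
// LinkInfo; the argument order mirrors the declaration above and the literal
// values here are hypothetical. On success the returned pointer references
// the LinkInfo serialized via "linkInfoBuf"; on error it is NULL and g_errno
// is set.
static inline LinkInfo *buildLinkInfo_sketch ( char *coll ,
					       long ip ,
					       long siteNumInlinks ,
					       Msg20Reply **replies ,
					       long numReplies ,
					       long long linkeeDocId ,
					       class Msg25 *msg25 ,
					       SafeBuf *linkInfoBuf ) {
	return makeLinkInfo ( coll ,
			      ip ,
			      siteNumInlinks ,
			      replies ,
			      numReplies ,
			      0 ,      // spamWeight for link-spam voters
			      true ,   // oneVotePerIpTop
			      linkeeDocId ,
			      0 ,      // lastUpdateTime
			      true ,   // onlyNeedGoodInlinks
			      1 ,      // niceness
			      msg25 ,
			      linkInfoBuf );
}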
// . set from the Msg20 replies in MsgE
// . Msg20 uses this to set the LinkInfo class to the "outlinks"
// . if an outlink has no docid it is not stored, because that means it
// was not in the index.
LinkInfo *makeLinkInfo ( class MsgE *m , long niceness ) ;
////////
//
// LINKS CLASS
//
////////
//typedef short linkflags_t;
typedef long linkflags_t;
// all the links (urls), separated by \0's, are put into a buf of this size
#define LINK_BUF_SIZE (100*1024)
// we allow up to this many links to be put into m_buf
//#define MAX_LINKS 10000
//#define MSR_HAD_REC 0x80
//#define NUM_TYPES_IN_MSR 2
//class MiniSiteRec {
//public:
// bool hadRec() { return m_flags & MSR_HAD_REC; };
// short m_siteOffset;
// short m_siteLen;
// long m_filenum;
// uint8_t m_flags;
// char m_siteQuality;
// SiteType m_types[NUM_TYPES_IN_MSR];
// SiteType m_lang;
//};
// Link Flags
#define LF_SAMEHOST 0x0001 // same hostname
#define LF_SAMEDOM 0x0002 // same domain
#define LF_SITEROOT 0x0004 // for blogrolls
#define LF_SAMESITE 0x0008 // only get offsite outlink info in Msg20.cpp
#define LF_OLDLINK 0x0010 // set this if it was on the pg last spider time
#define LF_RSS 0x0020 // is it from an rss <link href=> tag?
#define LF_PERMALINK 0x0040 // a probable permalink? of permalink format?
#define LF_SUBDIR 0x0080 // is the outlink in a subdir of parent?
#define LF_AHREFTAG 0x0100 // an <a href=> outlink
#define LF_LINKTAG 0x0200 // a <link> outlink
#define LF_FBTAG 0x0400 // a feed burner original outlink
#define LF_SELFLINK 0x0800 // links to self
#define LF_SELFPERMALINK 0x1000 // has "permalink" "link text" or attribute
#define LF_STRONGPERM 0x2000 // is permalink of /yyyy/mm/dd/ format
#define LF_EDUTLD 0x4000
#define LF_GOVTLD 0x8000
#define LF_NOFOLLOW 0x10000
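// A minimal sketch of how the LF_* bits combine on a linkflags_t. Note that
// LF_NOFOLLOW is 0x10000 and no longer fits in 16 bits, which is presumably
// why the typedef above was widened from short to long.
static inline bool isOffsiteFollowedLink_sketch ( linkflags_t flags ) {
	// offsite means neither same host nor same domain as the linker
	if ( flags & (LF_SAMEHOST|LF_SAMEDOM) ) return false;
	// and it must not be a rel=nofollow outlink
	if ( flags & LF_NOFOLLOW ) return false;
	return true;
}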
bool isPermalink ( //char *coll ,
class Links *links ,
class Url *u ,
char contentType ,
class LinkInfo *linkInfo ,
bool isRSS ,
char **note = NULL ,
char *pathOverride= NULL ,
bool ignoreCgi = false ,
linkflags_t *extraFlags = NULL ) ;
class Links {
public:
Links();
~Links();
void reset();
// call this before calling hash() and write()
bool set ( bool useRelNoFollow ,
Xml *xml,
Url *parentUrl ,
bool setLinkHashes ,
// use NULL for this if you do not have a baseUrl
Url *baseUrl ,
long version,
long niceness ,
//bool addSiteRootFlag = false ,
//char *coll = NULL ,
bool parentIsPermalink , // = false ,
Links *oldLinks , // for LF_OLDLINKS flag
// this is used by Msg13.cpp to quickly get ptrs
// to the links in the document, no normalization!
bool doQuickSet = false );
// set from a simple text buffer
bool set ( char *buf , long niceness ) ;
// Link in ascii text
bool addLink(char *link,long linkLen,long nodeNum,bool setLinkHashes,
long titleRecVersion, long niceness , bool isRSS ,
long tagId , linkflags_t flagsArg );
// . link spam functions. used by linkspam.cpp's setLinkSpam().
// . also used by Linkdb.cpp to create a linkdb list to add to rdb
// . we do not add outlinks to linkdb if they are "link spam"
bool setAllSpamBits ( char *note ) { m_spamNote = note; return true; }
void setSpamBit ( char *note , long i ) { m_spamNotes[i] = note; }
void setSpamBits ( char *note , long i ) {
for (long j=i ; j<m_numLinks ; j++) m_spamNotes[j] = note;};
// . m_spamNote is set if it is ALL link spam... set above
// . internal outlinks are never considered link spam since we "dedup"
// them by ip in Msg25/LinkInfo::merge() anyway
bool isLinkSpam ( long i ) {
if ( isInternalDom(i) ) return false;
if ( m_spamNote ) return true;
return m_spamNotes[i];
}
const char *getSpamNote ( long i ) {
if ( isInternalDom(i) ) return "good";
if ( m_spamNote ) return m_spamNote;
if ( m_spamNotes[i] ) return m_spamNotes[i];
return "good";
}
// for spidering links purposes, we consider "internal" to be same
// hostname
bool isInternal ( long i ) {return (m_linkFlags[i] & LF_SAMEHOST);}
bool isInternalHost ( long i ) {return (m_linkFlags[i] & LF_SAMEHOST);}
// we do not subject same-domain links to link spam detection in
// linkspam.cpp::setLinkSpam()
bool isInternalDom ( long i ) { return (m_linkFlags[i] & LF_SAMEDOM);}
bool isOld ( long i ) { return m_linkFlags[i] & LF_OLDLINK; };
// remove all links from the link buf that do not have the same
// hostname as "url". Used in Msg14 to avoid adding such links and
// avoid getting link info for such links.
//void removeExternalLinks ( );
// . returns false and sets g_errno on error
// . remove links from our m_linkPtrs[] if they are in "old"
bool flagOldLinks ( class Links *old ) ;
// does this page have a substantial number of links with naughty words
// in their hostnames?
//bool isPageDirty ( );
// . hash the link: and href: terms
// . we need SiteRec of the url that supplied these links so we
// might set hi bits in the link: terms scores to represent:
// banned, unbanned, clean, dirty links if the SiteRec says so
// . returns false and set errno on error
/*
bool hash ( TermTable *table , // SiteRec *sr ,
Url *url ,
Url *redirUrl ,
long version,
long niceness ,
bool isRSSFeed );
*/
// hash for Linkdb keys
//bool hash ( class HashTableX *dt , // <key128_t,char> *dt ,
// class XmlDoc *xd ,
// long niceness );
// . does link #n have link text that has at least 1 alnum char in it?
// . used for scoring link: terms to make link-text adds more efficient
bool hasLinkText ( long n, long version );
// . returns false on error and sets errno
// . get our outgoing link text for this url
// . store it into "buf"
long getLinkText ( char *linkee ,
bool getSiteLinkInfo ,
char *buf ,
long maxBufLen ,
//bool filter ,
char **itemPtr ,
long *itemLen ,
long *retNode1 , // = NULL ,
long *retLinkNum ,
long niceness );
long getLinkText2 ( long i,
char *buf ,
long maxBufLen ,
//bool filter ,
char **itemPtr ,
long *itemLen ,
long *retNode1 , // = NULL ,
long niceness );
// quick n dirty check for substrings in linktext
char *linkTextSubstr(long linkNum, char *string, long niceness);
// returns list of \0 terminated, normalized links
char *getLinkBuf () {
return m_allocBuf;
};
long getLinkBufLen () {
if ( m_allocBuf ) return m_bufPtr - m_allocBuf;
return 0;
//return m_allocBuf?m_bufPtr-m_allocBuf:0;
};
//unsigned long *getLinkHashes () { return m_linkHashes; };
long getNumLinks () { return m_numLinks; };
// was there a link to gigablast.com or www.gigablast.com?
bool linksToGigablast() { return m_linksToGigablast; };
long getLinkLen ( long i ) { return m_linkLens [i]; };
char *getLink ( long i ) { return m_linkPtrs [i]; };
char *getLinkPtr ( long i ) { return m_linkPtrs [i]; };
uint32_t getLinkHash32 ( long i ) {
return (uint32_t)m_linkHashes[i]; };
uint64_t getLinkHash64 ( long i ) { return m_linkHashes[i]; };
uint64_t getHostHash64 ( long i ) { return m_hostHashes[i]; };
long getDomHash32 ( long i ) { return m_domHashes[i]; };
long getNodeNum ( long i ) { return m_linkNodes[i]; };
bool hasRelNoFollow() { return m_hasRelNoFollow; };
char *getLinkHost ( long i , long *hostLen ) ;
long findLinkNum(char* url, long urlLen);
long getMemUsed () { return m_allocSize; };
bool hasSelfPermalink ( ) { return m_hasSelfPermalink; };
bool hasRSSOutlink ( ) { return m_hasRSSOutlink; };
bool hasSubdirOutlink ( ) { return m_hasSubdirOutlink; };
// . make an RdbList for adding to spiderdb
// . returns -1 and sets g_errno on error
// . otherwise returns # of outlinks added to "list"
// . used by Msg14.cpp for adding outlinks to spiderdb
/*
char *addToMetaList ( char *p , // metalist start
char *pend , // metalist end
class TitleRec *tr ,
class XmlDoc *old ,
char *coll ,
class MsgE *msge ,
long niceness ,
Url *quickLink = NULL ,
linkflags_t quickLinkFlags = 0 ,
bool isAddUrl = false ,
bool forceAll = false ,
bool skipExternalLinks = false ,
bool unforceAll = false ,
long explicitPriority = -1 );
*/
// private:
Xml *m_xml;
Url *m_baseUrl;
Url *m_parentUrl;
bool m_parentIsPermalink;
char *m_baseSite;
long m_baseSiteLen;
// set <base href>, if any, into m_tmpUrl so m_baseUrl can point to it
Url m_tmpUrl;
// . we store all links in this buf
// . each link ends in a \0
// . convenient for passing to Msg10
// . each link is in complete http:// format with base url, etc.
char *m_buf;// [LINK_BUF_SIZE];
// pointer to the end of the buffer
char *m_bufPtr;
// running count of the bufsize, including static and dynamic
// long m_bufSize;
// this is non-NULL if all outlinks are considered link spam,
// otherwise, individual outlinks will have their m_spamNotes[i] be
// non-NULL, and point to the string that describes why they are
// link spam.
char *m_spamNote;
char **m_linkPtrs;// [MAX_LINKS];
long *m_linkLens;// [MAX_LINKS];
long *m_linkNodes;// [MAX_LINKS];
uint64_t *m_linkHashes;// [MAX_LINKS];
uint64_t *m_hostHashes;// [MAX_LINKS];
long *m_domHashes;// [MAX_LINKS];
linkflags_t *m_linkFlags;
char *m_linkContactyTypes; // for XmlDoc's isContacty() algo
char **m_spamNotes;
bool m_doQuickSet;
// do we have an rss link? i.e. are we an RSS feed
bool m_hasRSS;
bool m_isFeedBurner;
char *m_linkBuf;
long m_allocLinks;
long m_numLinks;
long m_numNodes;
// . should we extract redirects from links? (like for yahoo's links)
// . this is set based on the SiteRec
//bool m_extractRedirects;
bool m_linksToGigablast;
bool m_hasRelNoFollow;
bool m_stripIds;
unsigned long m_allocSize;
char *m_allocBuf;
// queue the blog roll links into the turk for voting
bool queueBlogRoll ( class TagRec **tagRecPtrs , long niceness ) ;
bool m_addSiteRootFlags;
char *m_coll;
char m_flagged;
char m_hasSelfPermalink;
char m_hasRSSOutlink;
char m_hasSubdirOutlink;
char *m_rssOutlinkPtr;
long m_rssOutlinkLen;
// . returns 0 if probably not a permalink
// . returns 1 if probably is a permalink
// . returns -1 if not enough information to make a decision
char isPermalink ( char **note ) { return -1; };
long m_numOutlinksAdded;
};
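// A minimal sketch of iterating the outlinks held by a Links object after a
// successful set() call, using only the accessors declared above.
static inline long countOffsiteOutlinks_sketch ( Links *links ) {
	long offsite = 0;
	long n = links->getNumLinks();
	for ( long i = 0 ; i < n ; i++ ) {
		// skip outlinks flagged as link spam
		if ( links->isLinkSpam ( i ) ) continue;
		// "internal" means same hostname as the parent url
		if ( links->isInternalHost ( i ) ) continue;
		offsite++;
	}
	return offsite;
}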
long getRegExpNumOfOutlink ( Url *up ,
linkflags_t linkFlags ,
TagRec *tagRec ,
long quality ,
long ip ,
CollectionRec *cr ,
Url *parentUrl ,
long sourceHostHash ,
long parentHopCount ,
long parentPriority ,
long hopCount , // our hop count
long h , // hostHash
bool newOutlink , // are we new?
bool isAddUrl , // from addUrl?
// use -1 if unknown for these 3 values
char isParentRSS ,
char parentIsNew ,
char parentIsPermalink ,
char isIndexed ); // -1--> unknown
#endif