open-source-search-engine/Linkdb.cpp

#include "Linkdb.h"
#include "Threads.h"
#include "Titledb.h"
#include "linkspam.h"
#include "sort.h"
#include "XmlDoc.h" // score32to8()
Linkdb g_linkdb;
Linkdb g_linkdb2;
void Linkdb::reset() {
m_rdb.reset();
}
bool Linkdb::init ( ) {
key224_t k;
// sanity tests
uint32_t linkeeSiteHash32 = (uint32_t)rand();
uint32_t linkerSiteHash32 = (uint32_t)rand();
uint64_t linkeeUrlHash64 = (uint64_t)rand() << 32LL | rand();
// mask it to 32+15 bits
linkeeUrlHash64 &= 0x00007fffffffffffLL;
unsigned char linkerSiteRank = 13;
unsigned char hopCount = 7;
long ip = rand();
long ipdom3 = ipdom(ip);
long long docId = ((uint64_t)rand() << 32 | rand()) & DOCID_MASK;
long discoveryDate = 1339784732;
long lostDate = discoveryDate + 86400*23;
char linkSpam = 1;
k = makeKey_uk ( linkeeSiteHash32 ,
linkeeUrlHash64 ,
linkSpam , // islinkspam?
linkerSiteRank ,
hopCount ,
ip ,
docId ,
discoveryDate ,
lostDate ,
false , // newaddtooldpage?
linkerSiteHash32 ,
false ); // is del?
// jan 1 2008
unsigned long epoch = LINKDBEPOCH;
long dd2 = (discoveryDate - epoch) / 86400;
if ( discoveryDate == 0 ) dd2 = 0;
dd2 = dd2 * 86400 + epoch;
long ld2 = (lostDate - epoch) / 86400;
if ( lostDate == 0 ) ld2 = 0;
ld2 = ld2 * 86400 + epoch;
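// note: the key stores dates at day granularity as days since
// LINKDBEPOCH (see makeKey_uk() below), so dd2/ld2 above round the test
// dates down to the start of their day before comparing against what
// the getters return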
// try this
setLostDate_uk(&k,ld2 );
// now test it
if(getLinkeeSiteHash32_uk(&k)!=linkeeSiteHash32){char *xx=NULL;*xx=0;}
if(getLinkeeUrlHash64_uk(&k)!=linkeeUrlHash64){char *xx=NULL;*xx=0;}
if ( isLinkSpam_uk ( &k ) != linkSpam ) {char *xx=NULL;*xx=0;}
if (getLinkerSiteHash32_uk(&k)!=linkerSiteHash32){char *xx=NULL;*xx=0;}
if ( getLinkerSiteRank_uk(&k) != linkerSiteRank){char *xx=NULL;*xx=0;}
//if (getLinkerHopCount_uk (&k ) != hopCount ) {char *xx=NULL;*xx=0;}
if ( getLinkerIp24_uk ( &k ) != ipdom3 ) {char *xx=NULL;*xx=0;}
if ( getLinkerIp_uk ( &k ) != ip ) {char *xx=NULL;*xx=0;}
if ( getLinkerDocId_uk( &k ) != docId ) {char *xx=NULL;*xx=0;}
if ( getDiscoveryDate_uk(&k) != dd2 ) {char *xx=NULL;*xx=0;}
if ( getLostDate_uk(&k) != ld2 ) {char *xx=NULL;*xx=0;}
// more tests
setDiscoveryDate_uk (&k,discoveryDate);
setLostDate_uk (&k,lostDate);
if ( getDiscoveryDate_uk(&k) != dd2 ) {char *xx=NULL;*xx=0;}
if ( getLostDate_uk(&k) != ld2 ) {char *xx=NULL;*xx=0;}
long ip3 = 0xabcdef12;
setIp32_uk ( &k , ip3 );
long ip4 = getLinkerIp_uk ( &k );
if ( ip3 != ip4 ) { char *xx=NULL;*xx=0; }
/*
// test similarity
long v1[] = {86845183, 126041601, 193138017, 194832692, 209041345, 237913907,
253753116, 420176029, 425806029, 469664463, 474491119, 486025959, 524746875,
565034969, 651889954, 723451712, 735373612, 740115430, 889005385,
1104585188, 1180264907, 1190905206, 1555245401, 1585281138, 1775919002,
1780336562, 1784029178, 1799261433, 2013337516, 2095261394, 2137774538, 0};
long v2[] = {51207128, 126041601, 237913907, 253753116, 315255440, 394767298,
420176029, 435382723, 469664463, 486025959, 536944585, 556667308, 565034969,
615792190, 624608202, 629600018, 807226240, 1107373572, 1113238204,
1134807359, 1135960080, 1200900964, 1527062593, 1585281138, 1634165777,
1694464250, 1802457437, 1943916889, 1960218442, 2058631149, -2130866760, 0};
long nv1 = sizeof(v1)/4;
long nv2 = sizeof(v2)/4;
if ( isSimilar_sorted (v1,v2,nv1,nv2,80,0) ) {
char *xx=NULL;*xx=0;
}
*/
// we use the same disk page size as indexdb (for rdbmap.cpp)
long pageSize = GB_INDEXDB_PAGE_SIZE;
// set this for debugging
//long long maxTreeMem = 1000000;
long long maxTreeMem = 40000000; // 40MB
// . what's max # of tree nodes?
// . each tree node is key + left + right + parent + dataPtr =
//   sizeof(key224_t) + 4+4+4+4 bytes per record when in the tree
long maxTreeNodes = maxTreeMem /(sizeof(key224_t)+16);
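// e.g. assuming a 28-byte key224_t, each node is 44 bytes, so the 40MB
// default above allows roughly 900,000 tree nodes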
// disk page cache mem, 100MB on gk0 now
long pcmem = 0; // g_conf.m_linkdbMaxDiskPageCacheMem;
// give it a little
pcmem = 10000000; // 10MB
// keep this low if we are the tmp cluster
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// TODO: would be nice to just do page caching on the satellite files;
// look into "minimizeDiskSeeks" at some point...
if ( ! m_pc.init ( "linkdb" ,
RDB_LINKDB,
pcmem ,
pageSize ,
true , // use shared mem?
false )) // minimizeDiskSeeks?
return log("db: Linkdb init failed.");
// init the rdb
return m_rdb.init ( g_hostdb.m_dir ,
"linkdb" ,
true , // dedup
0 , // fixeddatasize is 0 since no data
3,//g_conf.m_linkdbMinFilesToMerge ,
// fix this to 15 and rely on the page cache of
// just the satellite files and the daily merge to
// keep things fast.
//15 ,
maxTreeMem ,
maxTreeNodes ,
true , //isTreeBalanced
0 , // cache mem
0 , // cache nodes
false, // true , // use half keys
false , // load cache from disk
&m_pc ,
false , // false
false , // preload page cache
sizeof(key224_t) ,
true ); // bias page cache? (true!)
}
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool Linkdb::init2 ( long treeMem ) {
// . what's max # of tree nodes?
// . each tree node is key + tree overhead + dataPtr + collnum =
//   sizeof(key224_t) + 12 + 4 + sizeof(collnum_t) bytes per record
//   when in the tree
long nodeSize = ( sizeof(key224_t) + 12 + 4 ) + sizeof(collnum_t);
long maxTreeNodes = treeMem / nodeSize;
// initialize our own internal rdb
return m_rdb.init ( g_hostdb.m_dir ,
"linkdbRebuild" ,
true , // dedup
0 , // no data now! just docid/s/c
50 , // m_clusterdbMinFilesToMerge,
treeMem , // g_conf.m_clusterdbMaxTreeMem,
maxTreeNodes ,
true , // balance tree?
0 , // maxCacheMem ,
0 , // maxCacheNodes ,
false, // true , // half keys?
false , // g_conf.m_clusterdbSaveCache,
NULL , // &m_pc ,
false , // is titledb
false , // preload disk page cache
sizeof(key224_t), // key size
true );// bias disk page cache
}
bool Linkdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;
// if not allowing scale, return false
if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
bool Linkdb::verify ( char *coll ) {
log ( LOG_DEBUG, "db: Verifying Linkdb for coll %s...", coll );
g_threads.disableThreads();
Msg5 msg5;
Msg5 msg5b;
RdbList list;
key224_t startKey;
key224_t endKey;
startKey.setMin();
endKey.setMax();
long minRecSizes = 64000;
if ( ! msg5.getList ( RDB_LINKDB ,
coll ,
&list ,
(char*)&startKey ,
(char*)&endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
&msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
}
long count = 0;
long got = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key224_t k;
list.getCurrentKey((char*)&k);
count++;
//uint32_t shardNum = getShardNum ( RDB_LINKDB , &k );
//if ( groupId == g_hostdb.m_groupId ) got++;
unsigned long shardNum = getShardNum( RDB_LINKDB , &k );
if ( shardNum == getMyShardNum() ) got++;
}
if ( got != count ) {
log ("db: Out of first %li records in Linkdb , "
"only %li belong to our group.",count,got);
/*
// repeat with log
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key224_t k;
list.getCurrentKey((char*)&k);
uint32_t shardNum = getShardNum ( RDB_LINKDB , &k );
long groupNum = g_hostdb.getGroupNum(groupId);
unsigned long sh32 ;
sh32 = g_linkdb.getLinkeeSiteHash32_uk(&k);
uint16_t sh16 = sh32 >> 19;
log("db: sh16=0x%lx group=%li",
(long)sh16,groupNum);
}
*/
// exit if NONE, we probably got the wrong data
if ( got == 0 ) log("db: Are you sure you have the "
"right "
"data in the right directory? "
"Exiting.");
log ( "db: Exiting due to inconsistency.");
g_threads.enableThreads();
return g_conf.m_bypassValidation;
}
log ( LOG_DEBUG, "db: Linkdb passed verification successfully for "
"%li recs.", count );
// DONE
g_threads.enableThreads();
return true;
}
// make a "url" key
key224_t Linkdb::makeKey_uk ( uint32_t linkeeSiteHash32 ,
uint64_t linkeeUrlHash64 ,
bool isLinkSpam ,
unsigned char linkerSiteRank ,
unsigned char linkerHopCount ,
uint32_t linkerIp ,
long long linkerDocId ,
unsigned long discoveryDate ,
unsigned long lostDate ,
bool newAddToOldPage ,
uint32_t linkerSiteHash32 ,
bool isDelete ) {
//if ( linkerSiteRank > LDB_MAXSITERANK ) { char *xx=NULL;*xx=0; }
//if ( linkerHopCount > LDB_MAXHOPCOUNT ) { char *xx=NULL;*xx=0; }
// mask it
linkeeUrlHash64 &= LDB_MAXURLHASH;
key224_t k;
k.n3 = linkeeSiteHash32;
k.n3 <<= 32;
k.n3 |= (linkeeUrlHash64>>15) & 0xffffffff;
// finish the url hash
k.n2 = linkeeUrlHash64 & 0x7fff;
k.n2 <<= 1;
if ( isLinkSpam ) k.n2 |= 0x01;
// make it 8 bits for now even though it only needs 4
k.n2 <<= 8;
k.n2 |= (unsigned char)~linkerSiteRank;
k.n2 <<= 8;
//k.n2 |= linkerHopCount;
// this is now part of the linkerip, steve wants the full ip
k.n2 |= (linkerIp >> 24);
//uint32_t id = ipdom(linkerIp);
//if ( id > 0xffffff ) { char *xx=NULL;*xx=0; }
k.n2 <<= 24;
k.n2 |= (linkerIp & 0x00ffffff);
k.n2 <<= 8;
k.n2 |= (((unsigned long long)linkerDocId) >> 30);
k.n1 = (((unsigned long long)linkerDocId) & 0x3fffffffLL);
// two reserved bits
k.n1 <<= 2;
// sanity checks
//if(discoveryDate && discoveryDate < 1025376000){char *xx=NULL;*xx=0;}
if ( lostDate && lostDate < LINKDBEPOCH){char *xx=NULL;*xx=0;}
// . convert discovery date from utc into days since jan 2008 epoch
// . the number is for jan 2012, so subtract 4 years to do 2008
unsigned long epoch = LINKDBEPOCH;
if ( discoveryDate && discoveryDate < epoch ) { char *xx=NULL;*xx=0; }
unsigned long nd = (discoveryDate - epoch) / 86400;
if ( discoveryDate == 0 ) nd = 0;
// makeEndKey_uk() maxes this out!
if ( nd > 0x3fff ) nd = 0x3fff;
k.n1 <<= 14;
k.n1 |= nd;
// one reserved bit
k.n1 <<= 1;
k.n1 <<= 1;
if ( newAddToOldPage ) k.n1 |= 0x01;
// the "lost" date. 0 if not yet lost.
unsigned long od = (lostDate - LINKDBEPOCH) / 86400;
if ( lostDate == 0 ) od = 0;
// makeEndKey_uk() maxes this out!
if ( od > 0x3fff ) od = 0x3fff;
k.n1 <<= 14;
k.n1 |= od;
// 2 bits of linker site hash
k.n1 <<= 2;
k.n1 |= linkerSiteHash32 >> 30;
// rest of linker site hash
k.n0 = linkerSiteHash32;
// halfbit - unused now!
k.n0 <<= 1;
// delbit
k.n0 <<= 1;
if ( ! isDelete ) k.n0 |= 0x01;
return k;
}
/////////
//
// MSG25 :: getLinkInfo()
//
/////////
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "matches2.h"
// 1MB read size for now
#define READSIZE 1000000
#define MAX_INTERNAL_INLINKS 10
//static void gotRootTitleRecWrapper25 ( void *state ) ;
//static void gotTermFreqWrapper ( void *state ) ;
static void gotListWrapper ( void *state );//, RdbList *list );
static bool gotLinkTextWrapper ( void *state );
//static void sendLinkInfoReplyWrapper ( void *state );//, LinkInfo *info ) ;
//static void gotReplyWrapper25 ( void *state , void *state2 ) ;
Msg25::Msg25() {
m_numRequests = 0;
m_linkSpamOut = 0;
// set minhopcount to unknown
//m_minInlinkerHopCount = -1;
m_numReplyPtrs = 0;
m_linkInfo = NULL;
m_ownReplies = true;
}
Msg25::~Msg25 ( ) {
reset();
}
void Msg25::reset() {
if ( ! m_ownReplies ) m_numReplyPtrs = 0;
for ( long i = 0 ; i < m_numReplyPtrs ; i++ )
mfree ( m_replyPtrs[i], m_replySizes[i], "msg25r");
// reset array count to 0
m_numReplyPtrs = 0;
// . free the linkinfo if we are responsible for it
// . if someone "steals" it from us, they should set this to NULL
//if ( m_linkInfo )
// mfree ( m_linkInfo , m_linkInfo->getStoredSize(),"msg25s");
// this now points into m_linkInfoBuf safebuf, just NULL it
m_linkInfo = NULL;
m_table.reset();
m_ipTable.reset();
m_fullIpTable.reset();
m_firstIpTable.reset();
m_docIdTable.reset();
}
#define MODE_PAGELINKINFO 1
#define MODE_SITELINKINFO 2
// . get the inlinkers to this SITE (any page on this site)
// . use that to compute a site quality
// . also get the inlinkers sorted by date and see how many good inlinkers
// we had since X days ago. (each inlinker needs a pub/birth date)
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . we need the siteRec of the url for merging the linkInfo's
// . NOTE: make sure no input vars are on the stack in case we block
// . reallyGetLinkInfo is set to false if caller does not want it but calls
// us anyway for some reason forgotten...
bool Msg25::getLinkInfo ( char *site ,
char *url ,
// either MODE_PAGELINKINFO or MODE_SITELINKINFO
bool isSiteLinkInfo ,
long ip ,
long long docId ,
char *coll ,
char *qbuf ,
long qbufSize ,
void *state ,
void (* callback)(void *state) ,
bool isInjecting ,
SafeBuf *pbuf ,
XmlDoc *xd ,
//bool printInXml ,
long siteNumInlinks ,
//long sitePop ,
LinkInfo *oldLinkInfo ,
long niceness ,
bool doLinkSpamCheck ,
bool oneVotePerIpDom ,
bool canBeCancelled ,
long lastUpdateTime ,
bool onlyNeedGoodInlinks ,
bool getLinkerTitles ,
long ourHostHash32 ,
long ourDomHash32 ,
SafeBuf *linkInfoBuf ) {
// reset the ip table
reset();
//long mode = MODE_PAGELINKINFO;
//m_printInXml = printInXml;
if ( isSiteLinkInfo ) m_mode = MODE_SITELINKINFO;
else m_mode = MODE_PAGELINKINFO;
m_xd = xd;
m_printInXml = false;
if ( m_xd ) m_printInXml = m_xd->m_printInXml;
// sanity check
if ( ! coll ) { char *xx=NULL; *xx=0; }
m_onlyNeedGoodInlinks = onlyNeedGoodInlinks;
m_getLinkerTitles = getLinkerTitles;
// save safebuf ptr, where we store the link info
m_linkInfoBuf = linkInfoBuf;
if ( ! linkInfoBuf ) { char *xx=NULL;*xx=0; }
// sanity check
if ( m_mode == MODE_PAGELINKINFO && ! docId ) {char *xx=NULL; *xx=0; }
// must have a valid ip
//if ( ! ip || ip == -1 ) { char *xx = NULL; *xx = 0; }
// get collection rec for our collection
CollectionRec *cr = g_collectiondb.getRec ( coll );//, collLen );
// bail if NULL
if ( ! cr ) {
g_errno = ENOTFOUND;
log("build: No collection record found when getting "
"link info.");
return true;
}
m_gettingList = false;
// record this in case we were called by Msg3b with the spiders off
m_spideringEnabled = g_conf.m_spideringEnabled;
m_ourHostHash32 = ourHostHash32;
m_ourDomHash32 = ourDomHash32;
//m_minInlinkerHopCount = -1; // -1 -->unknown
m_niceness = niceness;
m_maxNumLinkers = MAX_LINKERS;
m_errno = 0;
m_numReplyPtrs = 0;
m_bufPtr = m_buf;
m_bufEnd = m_buf + MAX_NOTE_BUF_LEN;
m_dupCount = 0;
m_vectorDups = 0;
m_spamLinks = 0;
m_errors = 0;
m_noText = 0;
m_reciprocal = 0;
m_ipDupsLinkdb = 0;
m_docIdDupsLinkdb = 0;
m_lostLinks = 0;
m_ipDups = 0;
m_linkSpamLinkdb = 0;
//m_url = url;
m_docId = docId;
m_coll = coll;
//m_collLen = collLen;
m_callback = callback;
m_state = state;
m_oneVotePerIpDom = oneVotePerIpDom;
m_doLinkSpamCheck = doLinkSpamCheck;
m_canBeCancelled = canBeCancelled;
m_siteNumInlinks = siteNumInlinks; // -1 --> unknown
//m_sitePop = sitePop; // -1 --> unknown
m_qbuf = qbuf;
m_qbufSize = qbufSize;
m_isInjecting = isInjecting;
m_oldLinkInfo = oldLinkInfo;
m_pbuf = pbuf;
m_ip = ip;
m_top = iptop(m_ip);
m_lastUpdateTime = lastUpdateTime;
m_nextKey.setMin();
m_adBanTable.reset();
m_adBanTable.set(4,0,0,NULL,0,false,m_niceness,"adbans");
m_table.set (4,4,0,NULL,0,false,m_niceness,"msg25tab");
QUICKPOLL(m_niceness);
m_url = url;
m_site = site;
// and the "mid domain hash" so that ibm.com and ibm.ru cannot both
// vote even if from totally different ips
Url u; u.set(url);
char *m = u.getMidDomain();
long mlen = u.getMidDomainLen();
m_midDomHash = hash32 ( m , mlen );
// do not prepend "www." to the root url
m_prependWWW = false;
// we have not done a retry yet
m_retried = false;
// change status
//if ( m_statusPtr ) *m_statusPtr = "consulting linkdb";
// . add a "www" to our url
// . we do this when indexing link: terms as well
// . this allows www.xyz.com & xyz.com to both get the same link text
// . we only index one of those if they both have the same content
// . the problem is that Linkdb::getUrlHash() is what we set
// Links::m_linkHashes[i] to, and that does NOT add "www"
// . ultimately i would say it should add the "www" but only for
// computing the m_linkHashes[i], not for indexing links:?
// . MDW: doesn't seem like we do this anymore...
//Url u2;
//u2.set ( m_url->getUrl() , m_url->getUrlLen() , false/*addWWW?*/);
//log("debug: entering getlinkinfo this=%lx",(long)this);
// then the url/site hash
//uint64_t linkHash64 = (uint64_t) u.getUrlHash64();
m_linkHash64 = (uint64_t) u.getUrlHash64();
//uint32_t hostHash32 = (uint32_t)m_url->getHostHash32();
m_round = 0;
// must have a valid ip
if ( ! ip || ip == -1 ) { //char *xx = NULL; *xx = 0; }
log("linkdb: no inlinks because ip is invalid");
return true;
}
return doReadLoop();
}
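// illustrative sketch (not part of the build): how a caller might drive
// getLinkInfo(). the MyState/gotLinkInfoWrapper names are hypothetical;
// real callers (e.g. XmlDoc.cpp) pass their own state and callback.
// getLinkInfo() returns false if it blocked (the callback fires later)
// and true otherwise, with g_errno set on error.
//
//	static void gotLinkInfoWrapper ( void *state ) {
//		MyState *st = (MyState *)state;
//		// st->m_linkInfoBuf now holds the serialized LinkInfo
//	}
//
//	bool getSiteInlinks ( MyState *st ) {
//		if ( ! st->m_msg25.getLinkInfo (
//			     st->m_site , st->m_url ,
//			     true , // isSiteLinkInfo
//			     st->m_ip , st->m_docId , st->m_coll ,
//			     NULL , 0 , // qbuf, qbufSize
//			     st , gotLinkInfoWrapper ,
//			     false , // isInjecting
//			     NULL , // pbuf
//			     NULL , // xd
//			     st->m_siteNumInlinks ,
//			     NULL , // oldLinkInfo
//			     1 , // niceness
//			     true , // doLinkSpamCheck
//			     true , // oneVotePerIpDom
//			     true , // canBeCancelled
//			     0 , // lastUpdateTime
//			     false , // onlyNeedGoodInlinks
//			     false , // getLinkerTitles
//			     0 , 0 , // ourHostHash32, ourDomHash32
//			     &st->m_linkInfoBuf ) )
//			return false; // blocked, wrapper is called later
//		return true; // did not block
//	}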
bool Msg25::doReadLoop ( ) {
//log("debug: entering doReadLoop this=%lx",(long)this);
// sanity. no double entry.
if ( m_gettingList ) { char *xx=NULL;*xx=0; }
// . get the top X results from this termlist
// . but skip link: terms with a 1 (no link text) for a score
// . these keys are ordered from lowest to highest
key224_t startKey ;
key224_t endKey ;
long siteHash32 = hash32n ( m_site );
// access different parts of linkdb depending on the "mode"
if ( m_mode == MODE_SITELINKINFO ) {
startKey = g_linkdb.makeStartKey_uk ( siteHash32 );
endKey = g_linkdb.makeEndKey_uk ( siteHash32 );
//log("linkdb: getlinkinfo: "
// "site=%s sitehash32=%lu",site,siteHash32);
}
else {
startKey = g_linkdb.makeStartKey_uk (siteHash32,m_linkHash64 );
endKey = g_linkdb.makeEndKey_uk (siteHash32,m_linkHash64 );
}
// resume from where we left off?
if ( m_round > 0 )
//startKey = m_nextKey;
memcpy ( &startKey , &m_nextKey , LDBKS );
// but new links: algo does not need internal links with no link text
// see Links.cpp::hash() for score table
QUICKPOLL(m_niceness);
m_minRecSizes = READSIZE; // MAX_LINKERS_IN_TERMLIST * 10 + 6;
long numFiles = -1;
// NO, DON't restrict because it will mess up the hopcount.
bool includeTree = true;
// what group has this linkdb list?
//unsigned long groupId = getGroupId ( RDB_LINKDB , &startKey );
unsigned long shardNum = getShardNum ( RDB_LINKDB, &startKey );
// use a biased lookup
long numTwins = g_hostdb.getNumHostsPerShard();
long long sectionWidth = (0xffffffff/(long long)numTwins) + 1;
// these are 192 bit keys, top 32 bits are a hash of the url
unsigned long x = siteHash32;//(startKey.n1 >> 32);
long hostNum = x / sectionWidth;
long numHosts = g_hostdb.getNumHostsPerShard();
Host *hosts = g_hostdb.getShard ( shardNum); // Group ( groupId );
if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
long hostId = hosts [ hostNum ].m_hostId ;
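// i.e. the same siteHash32 always maps to the same twin within the
// shard, so that site's linkdb pages stay hot in that host's disk page
// cache (the "bias page cache" option passed to init() above)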
// debug log
if ( g_conf.m_logDebugLinkInfo ) {
char *ms = "page";
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
log("msg25: getting full linkinfo mode=%s site=%s url=%s "
"docid=%lli",
ms,m_site,m_url,m_docId);
}
m_gettingList = true;
// . get the linkdb list
// . we now get the WHOLE list so we can see how many linkers there are
// . we need a high timeout because udp server was getting suspended
// before for 30 seconds and this was timing out and yahoo.com
// was getting spidered w/o any link text -- that's bad.
// Now we hang indefinitely. We also fixed UdpServer to resend
// requests after 30 seconds even though it was fully acked in case
// the receiving host went down and is now back up.
if ( ! m_msg0.getList ( -1 , // hostId, -1 if none
0 , // hostId ip
0 , // hostId port
0 , // max cache age in seconds
false , // addToCache?
RDB_LINKDB ,
m_coll ,
&m_list ,
(char*)&startKey,
(char*)&endKey ,
m_minRecSizes ,
this ,
gotListWrapper ,
m_niceness ,
true , // error correct?
includeTree ,
true , // do merge
hostId , // firstHostId
0 , // startFileNum
numFiles ,
60*60*24*365 )){// timeout of one year
//log("debug: msg0 blocked this=%lx",(long)this);
return false;
}
// all done
m_gettingList = false;
// debug log
if ( g_conf.m_logDebugBuild )
log("build: msg25 call to msg0 did not block");
// return true on error
if ( g_errno ) {
log("build: Had error getting linkers to url %s : %s.",
m_url,mstrerror(g_errno));
return true;
}
// . this returns false if blocked, true otherwise
// . sets g_errno on error
return gotList();
}
void gotListWrapper ( void *state ) { // , RdbList *list ) {
Msg25 *THIS = (Msg25 *) state;
//log("debug: entering gotlistwrapper this=%lx",(long)THIS);
// return if it blocked
// . this calls sendRequests()
// . which can call gotLinkText(NULL) if none sent
// . which can call doReadLoop() if list was not empty (lost linker)
// . which can block on msg0
if ( ! THIS->gotList() ) return;
// error? wait for all replies to come in...
if ( THIS->m_numRequests > THIS->m_numReplies ) {
log("msg25: had error %s numreplies=%li numrequests=%li "
"round=%li",
mstrerror(g_errno),THIS->m_numReplies,THIS->m_numRequests,
THIS->m_round);
return;
}
// the call to gotList() may have launched another msg0 even
// though it did not return false???
// THIS FIXED a double entry core!!!! msg0 would return after
// we had destroyed the xmldoc class behind this!!
if ( THIS->m_gettingList ) {
//log("debug: waiting for msg0 1");
return;
}
//log("debug: calling final callback 1");
// otherwise call callback, g_errno probably is set
THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
}
// . this returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg25::gotList() {
// all done
m_gettingList = false;
// reset # of docIds linking to us
//m_numDocIds = 0;
//log("debug: entering gotlist this=%lx",(long)this);
// return true on error
if ( g_errno ) {
log("build: Had error getting linkers to url %s : %s.",
m_url,mstrerror(g_errno));
return true;
}
// this counts all the inlinks... not inlinking pages/docids
//m_numDocIds = (m_list.getListSize() - 6)/10;
//if ( m_numDocIds < 0 ) m_numDocIds = 0;
// . record the # of hits we got for weighting the score of the
// link text iff it's truncated by MAX_LINKERS
// . TODO: if url is really popular, like yahoo, we should use the
// termFreq of the link: term!
// . TODO: we now only read in first 50k linkers so Msg0::getList()
// doesn't waste space through its stupid buffer pre-allocation.
// it should not preallocate for us since our niceness is over 1
// cuz we don't require a real-time signal handler to read our reply.
m_list.resetListPtr();
// clear this too
m_k = (Inlink *)-1;
// MATT: fix this later maybe....
/*
// all done?
if ( m_list.getListSize() < m_minRecSizes ) return gotTermFreq (false);
// change status
//if ( m_statusPtr ) *m_statusPtr = "getting linkdb term freq";
// returns false if blocked, returns true and sets g_errno
// on error
if ( ! m_msg42.getTermFreq ( m_coll ,
3600 , // maxAge
m_termId ,
this , // state
gotTermFreqWrapper ,
m_niceness ) )
return false;
return gotTermFreq(true);
}
void gotTermFreqWrapper ( void *state ) {
Msg25 *THIS = (Msg25 *)state;
// if blocked, just return
if ( ! THIS->gotTermFreq(true) ) return;
// otherwise call callback, g_errno is probably set
THIS->m_callback ( THIS->m_state , THIS->m_linkInfo );
}
bool Msg25::gotTermFreq ( bool msg42Called ) {
// error?
if ( g_errno ) {
log("build: Msg25 had error getting term freq.");
return true;
}
// was msg42 called?
if ( msg42Called ) {
// set the new one
long long tf = m_msg42.getTermFreq();
logf(LOG_DEBUG,"build: Upping linkers from %li to %lli",
m_numDocIds,tf);
if ( tf > m_numDocIds ) m_numDocIds = tf;
}
*/
// set m_nextKey in case we need to re-call doReadLoop()
//m_list.getLastKey ( (char *)&m_nextKey );
// inc by 1
//m_nextKey += 1;
// we haven't got any responses as of yet or sent any requests
m_numReplies = 0;
m_numRequests = 0;
if ( m_round == 0 ) {
m_linkSpamOut = 0;
m_numFromSameIp = 0;
memset ( m_inUse , 0 , MAX_MSG20_OUTSTANDING );
// use this to dedup ips in linkdb to avoid looking up their
// title recs... saves a lot of lookups
//m_ipTable.set(256);
if (!m_ipTable.set(4,0,256,NULL,0,false,m_niceness,"msg25ips"))
return true;
long long needSlots = m_list.getListSize() / LDBKS;
// wtf?
if ( m_list.getListSize() > READSIZE + 10000 ) {
//char *xx=NULL;*xx=0; }
log("linkdb: read very big linkdb list %li bytes "
"bigger than needed",
m_list.getListSize() - READSIZE );
}
// triple for hash table speed
needSlots *= 3;
// ensure 256 min
if ( needSlots < 256 ) needSlots = 256;
if ( ! m_fullIpTable.set(4,0,needSlots,NULL,0,false,
m_niceness,"msg25ip32") )
return true;
if ( ! m_firstIpTable.set(4,0,needSlots,NULL,0,false,
m_niceness,"msg25fip32") )
return true;
// this too
//m_docIdTable.set(256);
if ( ! m_docIdTable.set(8,0,needSlots,
NULL,0,false,m_niceness,"msg25docid") )
return true;
// . how many link spam inlinks can we accept?
// . they do not contribute to quality
// . they only contribute to link text
// . their number and weights depend on our ROOT QUALITY!
// . we need this because our filters are too stringent!
m_spamCount = 0;
m_spamWeight = 0;
m_maxSpam = 0;
m_numDocIds = 0;
m_cblocks = 0;
m_uniqueIps = 0;
}
// if we are doing site linkinfo, bail now
if ( m_mode == MODE_SITELINKINFO ) return sendRequests();
// when MODE_PAGELINKINFO we must have a site quality for that site
if ( m_siteNumInlinks < 0 ) {char *xx=NULL;*xx=0; }
// shortcut
long n = m_siteNumInlinks;
if ( n >= 1000 ) {m_spamWeight = 90; m_maxSpam = 4000;}
else if ( n >= 900 ) {m_spamWeight = 80; m_maxSpam = 3000;}
else if ( n >= 800 ) {m_spamWeight = 70; m_maxSpam = 2000;}
else if ( n >= 700 ) {m_spamWeight = 55; m_maxSpam = 1000;}
else if ( n >= 600 ) {m_spamWeight = 50; m_maxSpam = 100;}
else if ( n >= 500 ) {m_spamWeight = 15; m_maxSpam = 20;}
else if ( n >= 200 ) {m_spamWeight = 10; m_maxSpam = 15;}
else if ( n >= 70 ) {m_spamWeight = 07; m_maxSpam = 10;}
else if ( n >= 20 ) {m_spamWeight = 05; m_maxSpam = 7;}
/*
for steve i took this out of the key and put in the lower ip byte
// scan list for the minimum hop count of the inlinkers
m_list.resetListPtr();
long minhc = -1;
for ( ; ! m_list.isExhausted() ; m_list.skipCurrentRecord() ) {
// get the key/rec
key224_t key;
m_list.getCurrentKey( (char*)&key );
char hc = g_linkdb.getLinkerHopCount_uk ( &key );
if ( hc >= 0 && hc < minhc ) minhc = hc;
}
// now set our hopcount based on that
if ( minhc >= 0 ) m_minInlinkerHopCount = minhc;
*/
// now send the requests
m_list.resetListPtr();
return sendRequests();
}
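// per-record filtering order in sendRequests() below: once the linkdb
// list is exhausted on round 0 we recycle Inlinks from the old LinkInfo;
// records with a lostDate are dropped; consecutive duplicate docids are
// deduped; unique ips are counted; external inlinks are deduped by ip
// C-block; internal (same C-block) inlinks are capped at
// MAX_INTERNAL_INLINKS; survivors get a Msg20 to fetch their link text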
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg25::sendRequests ( ) {
// . stop if cancelled
// . a niceness of 0 implies we are called from Msg3b, the docid
// reranking tool
// . no, now that can be niceness 0, so use the m_spideringEnabled flag
//if ( ! g_conf.m_spideringEnabled &&
// ! m_isInjecting &&
// ! m_pbuf &&
// m_canBeCancelled ) {
// g_errno = ECANCELLED;
// return true;
//}
unsigned long long lastDocId = 0LL;
//log("debug: entering sendrequests this=%lx",(long)this);
// change status
//if ( m_statusPtr ) *m_statusPtr = "getting link texts";
// smaller clusters cannot afford to launch the full 300 msg20s
// because it can clog up one host!
float ratio = (float)g_hostdb.getNumHosts() / 128.0;
long ourMax = (long)(ratio * (float)MAX_MSG20_OUTSTANDING);
if ( ourMax > MAX_MSG20_OUTSTANDING )
ourMax = MAX_MSG20_OUTSTANDING;
// keep sending requests
while ( 1 == 1 ) {
// breathe
QUICKPOLL ( m_niceness );
// if we still haven't gotten enough good inlinks, quit after
// looking up this many titlerecs
if ( m_numRequests >= MAX_DOCIDS_TO_SAMPLE ) break;
// . we only need at most MAX_LINKERS in our sample
// . but we do keep "losers" until the very end so we can
// remove them in an order-independent fashion to guarantee
// consistency. otherwise, "losers" will depend on the order
// in which the Msg20Replies are received to some degree
if ( m_numReplyPtrs >= MAX_LINKERS ) break;
// do not have more than this many outstanding Msg20s
if ( m_numRequests-m_numReplies >= ourMax ) break;
// we may have pre-allocated the LinkText classes for
// use by currently outstanding Msg20 requests, therefore
// they are not available at this time... m_numReplyPtrs is
// how many replies we have kept.
if ( m_numReplyPtrs+m_numRequests-m_numReplies>=MAX_LINKERS)
break;
// reset g_errno just in case
g_errno = 0;
char isLinkSpam = 0;
//char hc = -1;
long itop ;
unsigned long ip32;
uint64_t docId ;
long discovered = 0;
// was the link lost?
long lostDate = 0;
// . recycle inlinks from the old link info guy
// . this keeps our inlinks persistent!!! very nice...
// . useful for when they disappear on an aggregator site
if ( m_list.isExhausted() && m_round == 0 ) {
// recycle old inlinks at this point
if ( m_k == (Inlink *)-1 ) m_k = NULL;
// get it
m_k = m_oldLinkInfo->getNextInlink ( m_k );
// if none left, we really are done
if ( ! m_k ) break;
// set these
//hc = m_k->m_hopcount;
itop = m_k->m_ip & 0x0000ffff;
ip32 = m_k->m_ip;
isLinkSpam = m_k->m_isLinkSpam;
docId = m_k->m_docId;
discovered = m_k->m_firstIndexedDate;
}
else if ( m_list.isExhausted() && m_round != 0 )
break;
// is this a "url" key?
else if ( m_mode == MODE_PAGELINKINFO ) {
// get the current key if list has more left
key224_t key; m_list.getCurrentKey( &key );
// skip if the bit is not set right
//if ( ! g_linkdb.isUrlKey(&key) ) continue;
//hc = g_linkdb.getLinkerHopCount_uk ( &key );
itop = g_linkdb.getLinkerIp24_uk ( &key );
ip32 = g_linkdb.getLinkerIp_uk ( &key );
isLinkSpam = g_linkdb.isLinkSpam_uk ( &key );
docId = g_linkdb.getLinkerDocId_uk ( &key );
discovered = g_linkdb.getDiscoveryDate_uk(&key);
// is it expired?
lostDate = g_linkdb.getLostDate_uk(&key);
// update this
memcpy ( &m_nextKey , &key , LDBKS );
//if ( ip32+1 < ip32 ) { char *xx=NULL;*xx=0; }
// skip to next ip!
//g_linkdb.setIp32_uk ( &m_nextKey , ip32+1 );
m_nextKey += 1;
}
// otherwise this is a "site" key. we are getting all the
// inlinks to any page on the site...
else {
// get the current key if list has more left
key224_t key; m_list.getCurrentKey( &key );
// show it for debug
//log("key: %s",KEYSTR(&key,LDBKS));
// skip if the bit is not set right
//if ( ! g_linkdb.isSiteKey(&key) ) continue;
//hc = -1;
itop = g_linkdb.getLinkerIp24_uk ( &key );
ip32 = g_linkdb.getLinkerIp_uk ( &key );
//isLinkSpam=g_linkdb.isLinkSpam_sk ( &key );
isLinkSpam = false;
docId = g_linkdb.getLinkerDocId_uk ( &key );
//linkDate = g_linkdb.getDate_sk ( &key );
discovered = g_linkdb.getDiscoveryDate_uk(&key);
// is it expired?
lostDate = g_linkdb.getLostDate_uk(&key);
// update this
memcpy ( &m_nextKey , &key , LDBKS );
//if ( ip32+1 < ip32 ) { char *xx=NULL;*xx=0; }
// skip to next ip!
//g_linkdb.setIp32_uk ( &m_nextKey , ip32+1 );
m_nextKey += 1;
}
// advance to next rec if the list has more to go
if ( ! m_list.isExhausted() ) m_list.skipCurrentRecord();
// clear this if we should
if ( ! m_doLinkSpamCheck ) isLinkSpam = false;
// mangle it so hashtable does not collide so much
//long long dh = hash64h ( docId , docId );
// dedup docid, since we now try to keep old Inlinks from
// the previous LinkInfo. this allows us to preserve RSS
// info, good hopcounts, etc.
// on round 487 this is going OOM at 300MB, so take it out
//if ( m_docIdTable.getSlot ( &dh ) >= 0 ) {
// m_docIdDupsLinkdb++;
// continue;
//}
// add it. TODO: what if this fails?
//if ( ! m_docIdTable.addKey ( &dh ) )
// return true;
// if it is no longer there, just ignore
if ( lostDate ) {
m_lostLinks++;
continue;
}
// try using this to save mem then
if ( docId == lastDocId ) {
m_docIdDupsLinkdb++;
continue;
}
// update this then
lastDocId = docId;
// count unique docids
m_numDocIds++;
//
// get the next docId
//
// . now the 4 hi bits of the score represent special things
// . see Msg18.cpp:125 where we repeat this
// once we got this many "good" spam inlinkers, we should
// get no more! assume the outstanding link spams will be
// successful. they may not, so we must wait on them.
// MDW: take this out
//if(isLinkSpam && m_spamCount + m_linkSpamOut >= m_maxSpam ) {
// // actually, assume they will be successful,
// // and just skip ourselves
// m_linkSpamLinkdb++;
// continue;
//}
// is it the minimum thus far?
// MDW: i saw a bunch of corrupt linkdb recs with hc<0. check.
//if ( hc>=0 &&
// (m_minInlinkerHopCount==-1 || hc<m_minInlinkerHopCount))
// m_minInlinkerHopCount = hc;
// count unique ips for steve's stats
if ( ! m_fullIpTable.isInTable(&ip32) ) {
// return true on error
if ( ! m_fullIpTable.addKey(&ip32) )
return true;
// count it
m_uniqueIps++;
}
// TODO: if inlinker is internal by having the same DOMAIN
// even though a different ip, we should adjust this logic!!
if ( itop != m_top ) {
long slot = m_ipTable.getSlot ( &itop );
if ( slot != -1 ) {m_ipDupsLinkdb++;continue;}
// store it
if ( ! m_ipTable.addKey ( &itop ) )
return true;
// count unique cblock inlinks
m_cblocks++;
}
// if we are local... allow up to 5 votes, weight is diminished
else {
// count your own, only once!
if ( m_numFromSameIp == 0 ) m_cblocks++;
// count it as internal
m_numFromSameIp++;
// only get link text from first 5 internal linkers,
// they will all count as one external linker
if ( m_numFromSameIp > MAX_INTERNAL_INLINKS ) {
m_ipDupsLinkdb++; continue; }
}
// count this request as launched
m_numRequests++;
// if linkspam, count this
if ( isLinkSpam ) m_linkSpamOut++;
// find a msg20 we can use
long j ;
for (j=0 ;j<MAX_MSG20_OUTSTANDING;j++) if (!m_inUse[j]) break;
// sanity check
if ( j >= MAX_MSG20_OUTSTANDING ) { char *xx = NULL; *xx = 0; }
// "claim" it
m_inUse [j] = 1;
// . this will return false if blocks
// . we EXPECT these recs to be there...
// . now pass in the score for the newAlgo
Msg20Request *r = &m_msg20Requests[j];
// clear it. reset to defaults.
r->reset();
// set the request
r->m_getLinkText = true;
r->m_onlyNeedGoodInlinks = m_onlyNeedGoodInlinks;
// is linkee a site? then we will try to find link text
// to any page on that site...
// if we are in site mode, then m_url should be m_site!
if ( m_mode == MODE_PAGELINKINFO ) {
r->m_isSiteLinkInfo = false;
r-> ptr_linkee = m_url;
r->size_linkee = gbstrlen(m_url)+1; // include \0
}
else {
r->m_isSiteLinkInfo = true;
r-> ptr_linkee = m_site;
r->size_linkee = gbstrlen(m_site)+1; // include \0
}
r-> ptr_coll = m_coll;
r->size_coll = gbstrlen(m_coll) + 1; // include \0
r->m_docId = docId;
r->m_expected = true; // false;
r->m_niceness = m_niceness;
r->m_state = r;
r->m_state2 = this;
r->m_j = j;
r->m_callback = gotLinkTextWrapper;
// do NOT get summary stuff!! slows us down...
r->m_maxNumCharsPerLine = 0;
r->m_numSummaryLines = 0;
// get title now for steve
r->m_titleMaxLen = 300;
r->m_summaryMaxLen = 0;
r->m_discoveryDate = discovered;
// buzz sets the query to see if inlinker has the query terms
// so we can set <absScore2>
r->m_langId = langUnknown; // no synonyms i guess
r->ptr_qbuf = m_qbuf;
r->size_qbuf = m_qbufSize;
// These parms should maybe be passed from the calling msg20,
// But, at least for now, buzz always wants these values.
//if (m_qbufSize > 1){
// r->m_hackFixWords = true;
// r->m_hackFixPhrases = true;
// r->m_excludeLinkText = true;
// r->m_excludeMetaText = true;
//}
// place holder used below
r->m_isLinkSpam = isLinkSpam;
// buzz may not want link spam checks! they are pretty clean.
r->m_doLinkSpamCheck = m_doLinkSpamCheck;
// this just gets the LinkInfo class from the TITLEREC
// so that Msg25 should not be called. thus avoiding an
// infinite loop!
// we need the LinkInfo of each inlinker to get the inlinker's
// sitePop and numInlinksToSite, which is needed to call
// makeLinkInfo() below.
r->m_getLinkInfo = true;
r->m_ourHostHash32 = m_ourHostHash32;
r->m_ourDomHash32 = m_ourDomHash32;
// . MAKE A FAKE MSG20REPLY for pre-existing Inlinks
// . the opposite of Inlink::set(Msg20Reply *)
// . we used that as a reference
// . ISSUES:
// . 1. if inlinker gets banned, we still recycle it
// . 2. if ad id gets banned, we still recycle it
// . 3. we cannot dedup by the vectors, because we do not
// store those in the Inlink class (Msg25::isDup())
if ( m_k && m_k != (Inlink *)-1 ) {
Msg20Reply *rep = &m_msg20Replies[j];
rep->reset();
m_k->setMsg20Reply ( rep );
// let receiver know we are a recycle
rep->m_recycled = 1;
// . this returns true if we are done
// . g_errno is set on error, and true is returned
if ( gotLinkText ( r ) ) return true;
// keep going
continue;
}
// debug log
if ( g_conf.m_logDebugLinkInfo ) {
char *ms = "page";
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
log("msg25: getting single link mode=%s site=%s "
"url=%s docid=%lli request=%li",
ms,m_site,m_url,docId,m_numRequests-1);
}
// returns false if blocks, true otherwise
bool status = m_msg20s[j].getSummary ( r ) ;
// breathe
QUICKPOLL(m_niceness);
// if blocked launch another
if ( ! status ) continue;
// . this returns true if we are done
// . g_errno is set on error, and true is returned
if ( gotLinkText ( r ) ) return true;
}
// we may still be waiting on some replies to come in
if ( m_numRequests > m_numReplies ) return false;
// if the list had linkdb recs in it, but we launched no msg20s
// because they were "lost" then we end up here.
/*
if ( m_url )
log("linkdb: encountered %li lost links for url %s "
"rnd=%li",
m_lostLinks,m_url,m_round);
else
log("linkdb: encountered %li lost links for docid %lli "
"rnd=%li",
m_lostLinks,m_docId,m_round);
*/
// . otherwise, we got everyone, so go right to the merge routine
// . returns false if not all replies have been received
// . returns true if done
// . sets g_errno on error
// . if all replies are in then this can call doReadLoop() and
// return false!
return gotLinkText ( NULL );
}
bool gotLinkTextWrapper ( void *state ) { // , LinkTextReply *linkText ) {
Msg20Request *req = (Msg20Request *)state;
// get our Msg25
Msg25 *THIS = (Msg25 *)req->m_state2;
//log("debug: entering gotlinktextwrapper this=%lx",(long)THIS);
// . this returns false if we're still awaiting replies
// . returns true if all replies have been received and processed
if ( THIS->gotLinkText ( req ) ) {
//log("debug: calling final callback 2");
if ( THIS->m_gettingList ) return false;
// . now call callback, we're done
// . g_errno will be set on critical error
THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
return true;
}
// if gotLinkText() called doReadLoop() it blocked calling msg0
if ( THIS->m_gettingList ) return false;
// . try to send more requests
// . return if it blocked
// . shit, this could call doReadLoop() now that we have added
// the lostdate filter, because there will end up being no requests
// sent out even though the list was of positive size, so it
// will try to read the next round of msg0.
if ( ! THIS->sendRequests ( ) ) return false;
// . shit, therefore, return false if we did launch a msg0 after this
if ( THIS->m_gettingList ) return false;
//log("debug: calling final callback 3");
// otherwise we're done
THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
return true;
}
char *getExplanation ( char *note ) {
if ( ! note ) return NULL;
if ( strcmp(note,"good")==0) return NULL;
static char *s_notes[] = {
"same mid domain",
"inlinker's domain, excluding TLD, is same as page it "
"links to",
"linker banned or filtered",
"inlinker's domain has been manually banned",
"no link text",
"inlink contains no text, probably an image",
"banned by ad id",
"inlinking page contains a google ad id that has been "
"manually banned",
"ip dup",
"inlinker is from the same C Block as another inlinker",
"first ip dup",
"first recorded C Block of inlinker matches another inlinker",
"post page",
"inlink is from a page that contains a form tag whose "
"submit url contains character sequences that are indicative "
"of posting a comment, thereby indicating that the inlink "
"could be in a comment section",
"path is cgi",
"inlinker url contains a question mark",
"similar link desc",
"the text surrounding the anchor text of this inlink is "
"too similar to that of another processed inlink",
"similar content",
"the inlinker's page content is "
"too similar to that of another processed inlink",
"doc too big",
"inlinker's page is too large, and was truncated, and "
"might have lost some indicators",
"link chain middle",
"inlink is in the middle of a list of inlinks, without "
"any non-link text separating it, indicative of text ads",
"link chain right",
"inlink is at the end of a list of inlinks, without "
"any non-link text separating it, indicative of text ads",
"link chain left",
"inlink is at the beginning of a list of inlinks, without "
"any non-link text separating it, indicative of text ads",
"near sporny outlink",
"inlink is near another outlink on that page which contains "
"porn words in its domain or url",
"70.8*.",
"inlinker is from an IP address that is of the form "
"70.8*.*.* which is a notorious block of spam",
".info tld",
"inlinker's tld is .info, indicative of spam",
".biz tld",
"inlinker's tld is .biz, indicative of spam",
"textarea tag",
"inlinker page contains a textarea html tag, indicative "
"of being in a comment section",
"stats page",
"inlinker is from a web access stats page",
"has dmoz path",
"inlinker url looks like a dmoz mirror url",
"guestbook in hostname",
"inlinker is from a guestbook site",
"ad table",
"inlink appears to be in a table of ad links",
"duplicateIPCClass",
};
long n = sizeof(s_notes)/ sizeof(char *);
// stop at n-1 since the last entry ("duplicateIPCClass") has no paired
// explanation; this avoids reading past the end of s_notes
for ( long i = 0 ; i + 1 < n ; i += 2 ) {
if ( strcmp(note,s_notes[i]) ) continue;
return s_notes[i+1];
}
if ( strncmp(note,"path has",8) == 0 )
return "inlinker's url contains keywords indicative "
"of a comment page, guestbook page or "
"link exchange";
return
"inlinker's page contains the described text, indicative of "
"being a link exchange or being in a comment section or "
"being an otherwise spammy page";
}
// . returns false if not all replies have been received (or timed out / errored out)
// . returns true if done
// . sets g_errno on error
bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
//log("debug: entering gotlinktext this=%lx",(long)this);
long j = -1;
if ( req ) j = req->m_j;
// get it
Msg20 *m = NULL;
// the reply
Msg20Reply *r = NULL;
// the alloc size of the reply
long rsize = 0;
// the original request
// set the reply
if ( j >= 0 ) {
// get the msg20
m = &m_msg20s[j];
// set the reply
r = m->m_r;
// the reply size
rsize = m->m_replyMaxSize;
// inc # of replies
m_numReplies++;
// get the request
Msg20Request *req = &m_msg20Requests[j];
// discount this if was linkspam
if ( req->m_isLinkSpam ) m_linkSpamOut--;
// "make available" msg20 and msg20Request #j for re-use
m_inUse [ j ] = 0;
// . propagate internal error to g_errno
// . if g_errno was set then the reply will be empty
if ( r && r->m_errno && ! g_errno ) g_errno = r->m_errno;
// if it had an error print it for now
if ( r && r->m_errno )
log("query: msg25: msg20 had error for docid %lli : "
"%s",r->m_docId, mstrerror(r->m_errno));
}
// what is the reason it cannot vote...?
char *note = NULL;
long noteLen = 0;
// assume it CAN VOTE for now
bool good = true;
// just log then reset g_errno if it's set
if ( g_errno ) {
// a dummy docid
long long docId = -1LL;
// set it right
if ( r ) docId = r->m_docId;
// we often restrict link: termlist lookup to indexdb root
// file, so we end up including terms from deleted docs...
// thus we get a lot of ENOTFOUND errors.
// MDW: we no longer do this restriction...
log(LOG_DEBUG,
"build: Got error getting link text from one document: "
"%s. Will have to restart later. docid=%lli.",
mstrerror(g_errno),docId);
// this is a special case
if ( g_errno == ECANCELLED ||
g_errno == ENOMEM ||
g_errno == ENOSLOTS ) {
m_errors++;
if ( m_numReplies < m_numRequests ) return false;
return true;
}
// otherwise, keep going, but this reply can not vote
good = false;
m_errors++;
note = mstrerror ( g_errno );
// reset g_errno
g_errno = 0;
}
// are we an "internal" inlink?
bool internal = false;
if ( r && iptop(r->m_firstIp) == m_top )
internal = true;
if ( r && iptop(r->m_ip) == m_top )
internal = true;
// . if the mid domain hash of the inlinker matches ours, no voting
// . this is set to 0 for recycles
if ( r && good && r->m_midDomHash == m_midDomHash && ! internal ) {
good = false;
m_sameMidDomain++;
note = "same mid domain";
}
// is the inlinker banned?
if ( r && good && r->m_isBanned ) {
// it is no longer good
good = false;
// inc the general count, too
m_spamLinks++;
// add his ad id hash to the table, so if any
// other linker has it, it will be banned by ad id!
if ( r->m_adIdHash ) m_adBanTable.addKey ( &r->m_adIdHash );
// count each *type* of "link spam". the type is given
// by linkText->m_note and is a string...
note = "linker banned or filtered";
}
// breathe
QUICKPOLL(m_niceness);
// get the linker url
Url linker;
if ( r ) linker.set ( r->ptr_ubuf , r->size_ubuf );
// sanity check, Xml::set() requires this...
if ( r&&r->size_rssItem > 0 && r->ptr_rssItem[r->size_rssItem-1]!=0 ) {
char *xx=NULL;*xx=0; }
// . if no link text, count as error
// . linkText->getLinkTextLen()
if ( r && good &&
r->size_linkText <= 0 &&
r->size_rssItem <= 0 &&
// allow if from a ping server because like
// rpc.weblogs.com/shortChanges.xml so we can use
// "inlink==xxx" in the url filters to assign any page linked
// to by a pingserver into a special spider queue. then we can
// spider that page quickly and get its xml feed url, and then
// spider that to get new outlinks of permalinks.
// Well now we use "inpingserver" instead of having to specify
// the "inlink==xxx" expression for every ping server we know.
! linker.isPingServer() ) {
good = false;
m_noText++;
note = "no link text";
}
// banned by way of ad id?
if (r && good&& r->m_adIdHash&&m_adBanTable.getSlot(&r->m_adIdHash)>0){
// it is no longer good
good = false;
// inc the general count, too
m_spamLinks++;
// count each *type* of "link spam". the type is given
// by linkText->m_note and is a string...
note = "banned by ad id";
}
// . if we are linked to by a page on the same ip as the linkee
// then call it a reciprocal link
// . only check if our root quality is < 45%
// . MDW: i disabled this until it can prove more useful
/*
Vector *v5 = NULL;
if ( r ) v5 = r->ptr_vectorBuf5; // linkText->getVector4();
if ( r && good && ! internal && m_rootQuality < 45 ) {
// these are the IPs of the linker's incoming linkers
long numIps = v5->getNumPairHashes();
long ourIp = m_url->getIp();
for ( long i = 0 ; i < numIps ; i++ ) {
long ip = v5->m_pairHashes[i];
if ( ip != ourIp ) continue;
good = false;
m_reciprocal++;
note = "reciprocal link";
break;
}
}
*/
QUICKPOLL(m_niceness);
// discount if LinkText::isLinkSpam() or isLinkSpam2() said it
// should not vote
if ( r && good && ! internal && r->m_isLinkSpam &&
// we can only allow link spam if it is below the max!
++m_spamCount >= m_maxSpam ) {
// it is no longer good
good = false;
// inc the general count, too
m_spamLinks++;
// count each *type* of "link spam". the type is given
// by linkText->m_note and is a string...
note = r-> ptr_note;
noteLen = r->size_note - 1; // includes \0
}
// loop over all the replies we got so far to see if "r" is a dup
// or if another reply is a dup of "r"
long n = m_numReplyPtrs;
// do not do the deduping if no reply given
if ( ! r ) n = 0;
// do not do this if "r" already considered bad
if ( ! good ) n = 0;
// this is the "dup"
Msg20Reply *dup = NULL;
long dupi = -1;
// . we do not actually remove the Msg20Replies at this point because
// this filter is dependent on the order in which we receive the
// Msg20Replies. we do the removal below after all replies are in.
// . NO! not anymore, i don't want to hit MAX_LINKERS and end up
// removing all the dups below and end up with hardly any inlinkers
for ( long i = 0 ; ! internal && i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// get the reply in a ptr
Msg20Reply *p = m_replyPtrs[i];
// is it internal
bool pinternal = false;
if ( iptop(p->m_ip) == m_top ) pinternal = true;
if ( iptop(p->m_firstIp) == m_top ) pinternal = true;
// allow internal inlinks to match
if ( pinternal ) continue;
// is "p" a dup of us? (or we of it?)
char *dupNote = isDup ( r , p ) ;
// if it is not a dup, keep going
if ( ! dupNote ) continue;
// getLoser() returns the lowest-scoring reply of "r" and "p"
Msg20Reply *tmp = getLoser ( r , p );
// is it worse than the current "dup"? if so, update "dup"
dup = getLoser ( tmp , dup );
// get the "i" value
if ( dup == r ) dupi = j;
if ( dup == p ) dupi = i;
// we got a dup
good = false;
note = dupNote;
}
// inc this count
if ( dup ) m_dupCount++;
// if "p" is the lower-scoring dup, put "r" in its place, and then
// set "r" to "p" doing a swap operation
if ( dup && dup != r ) {
// sanity check
if ( dupi < 0 ) { char *xx=NULL;*xx=0; }
// HACK: swap them
Msg20Reply *tmp = m_replyPtrs [dupi];
long tmpSize = m_replySizes[dupi];
m_replyPtrs [dupi] = r;
m_replySizes[dupi] = rsize;
r = tmp;
rsize = tmpSize;
// make Msg20 point to that old "dup" reply
m->m_r = r;
m->m_replyMaxSize = rsize;
}
// breathe
QUICKPOLL(m_niceness);
if ( r && good ) {
long iptop1 = iptop(r->m_ip);
long iptop2 = iptop(r->m_firstIp);
if ( m_firstIpTable.isInTable ( &iptop1 ) ||
m_firstIpTable.isInTable ( &iptop2 ) ) {
good = false;
m_ipDups++;
note = "first ip dup";
}
// add to table. return true with g_errno set on error
if ( ! m_firstIpTable.addKey(&iptop1) ) return true;
if ( ! m_firstIpTable.addKey(&iptop2) ) return true;
}
// BUT do not set "good" to false; we still store it so we can look
// at the link text for indexing purposes anyway. we just do not
// want to count it towards the # of good site inlinks because
// it's internal
if ( internal && ! note ) {
m_ipDups++;
note = "ip dup";
}
if ( r && ! good && ! note )
note = "unknown reason";
// compile the reason it could not vote
if ( r && ! good ) {
// set "noteLen" if not yet set
if ( note && noteLen == 0 ) noteLen = gbstrlen ( note );
// add it to our table
addNote ( note , noteLen , r->m_docId );
// . free the reply since it cannot vote
// . no, it should be auto-freed when the msg20 is re-used
}
bool store = true;
if ( ! good ) store = false;
//if ( ! m_onlyNeedGoodInlinks ) store = true;
// . if doing for display, show good
// . steve needs to see the bad guys as well ppl can learn
if ( m_pbuf ) store = true;
// now for showing recommended link sources let's include the
// bad boys because we might have mislabelled them as bad, b/c maybe
// google thinks they are good! fix for
// XmlDoc::getRecommendedLinksBuf()
if ( ! m_onlyNeedGoodInlinks ) store = true;
// how is this NULL?
if ( ! r ) store = false;
if ( store ) {
// save the reply
m_replyPtrs [m_numReplyPtrs] = r;
m_replySizes[m_numReplyPtrs] = rsize;
// why we do this?
if ( note && ! r->ptr_note ) {
r->ptr_note = note;
r->size_note = noteLen+1;
}
// store this in the reply for convenience
r->m_discoveryDate = req->m_discoveryDate;
m_numReplyPtrs++;
// debug note
//log("linkdb: stored %li msg20replies",m_numReplyPtrs);
// do not allow Msg20 to free it
m->m_r = NULL;
}
// free the reply buf of this msg20 now to save mem because
// we can't send out like 100,000 of these for yahoo.com to find
// less than 1000 good ones!
// tell msg20 to free the reply if not null
if ( m ) m->reset();
// wait for all replies to come in
if ( m_numReplies < m_numRequests ) return false;
//
//
// READ MORE FROM LINKDB to avoid truncation
//
//
// youtube is doing like 180,000 rounds! wtf! limit to 10000
if ( m_list.m_listSize > 0 && // && m_round < 10000 ) {
// no! now we shrink a list by removing dup docids from it
// in Msg0.cpp before sending back to save memory and cpu and
// network. so it can be well below m_minRecSizes and still need
// to go on to the next round
//m_list.m_listSize >= m_minRecSizes &&
m_numReplyPtrs < MAX_LINKERS ) {
// count it
m_round++;
// note it
char *ms = "page";
char *id = m_url;
if ( m_mode == MODE_SITELINKINFO ) {
ms = "site";
id = m_site;
}
// debug
if ( g_conf.m_logDebugLinkInfo ) {
log("linkdb: recalling round=%li for %s=%s",
m_round,ms,m_site);
}
// and re-call
if ( ! doReadLoop() ) return false;
// it did not block!! wtf? i guess it read no more or
// launched no more requests.
//log("linkdb: doreadloop did not block");
}
//
//
// process all "good" Msg20Replies
//
//
/*
// since we may not have called a Msg20 for every docid in the
// linkdb list, extroplate "m_numReplyPtrs" to what it probably should
// have been. it is a non-linear. we could go out to more derivatives,
// m_deltaDiff2, etc. if necessary.
long long extrapolated = m_numReplyPtrs;
long long bonus = m_numReplyPtrs;
long long step = (long)MAX_DOCIDS_TO_SAMPLE * 2 ;
// add in "bonus" X docids sampled
long nd;
for ( nd = m_numReplies ; nd + step <= m_numDocIds ; nd += step ) {
QUICKPOLL(m_niceness);
extrapolated += bonus;
step *= 2;
}
*/
// . do linear estimation of remainder however.
// . hey i don't want to get into crazy logs...
/*
long long rem = m_numDocIds - nd;
if ( step > 0 && rem > 0 ) extrapolated += (bonus * rem) / step;
// sanity check
if ( rem > step ) { char *xx = NULL; *xx = 0; }
// log build msg
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"build: msg25: %s extrapolated=%li "
"goodReplies=%li "
"allReplies=%li",
m_url->getUrl(), (long)extrapolated, (long)m_numReplyPtrs,
(long)m_numReplies);
// sanity check
if ( extrapolated < 0 ) {
if ( g_conf.m_logDebugSpider )
log("build: msg25: extrapolated = %li < 0. Resetting "
"to 0.",(long)extrapolated);
extrapolated = 0;
}
// the x factor
long x = 100;
if ( m_numReplyPtrs > 0 )
x = ((long long)extrapolated * 100LL) / m_numReplyPtrs;
*/
// skip making link info?
//if ( ! m_onlyNeedGoodInlinks ) return true;
// breathe
QUICKPOLL(m_niceness);
// debug log
if ( g_conf.m_logDebugLinkInfo ) {
char *ms = "page";
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
log("msg25: making final linkinfo mode=%s site=%s url=%s "
"docid=%lli",
ms,m_site,m_url,m_docId);
}
// . this returns NULL and sets g_errno on error
// . returns an allocated ptr to a LinkInfo class
// . we are responsible for freeing
// . LinkInfo::getSize() returns the allocated size
m_linkInfo = makeLinkInfo ( m_coll ,
m_ip ,
m_siteNumInlinks ,
//m_sitePop ,
m_replyPtrs ,
m_numReplyPtrs ,
//m_numReplyPtrs, // extrapolated ,
//100, // x ,
m_spamWeight ,
m_oneVotePerIpDom ,
m_docId , // linkee docid
m_lastUpdateTime ,
m_onlyNeedGoodInlinks ,
m_niceness ,
this ,
m_linkInfoBuf );
// return true with g_errno set on error
if ( ! m_linkInfo ) {
log("build: msg25 linkinfo set: %s",mstrerror(g_errno));
return true;
}
// if nothing to print out, be on our way
if ( ! m_pbuf ) return true;
/////////////////////////////////////////
//
// print out for PageParser.cpp
//
/////////////////////////////////////////
// sort by docid so we get consistent output on PageParser.cpp
char sflag = 1;
while ( sflag ) {
sflag = 0;
for ( long i = 1 ; i < m_numReplyPtrs ; i++ ) {
// sort by quality first
char q1 = m_replyPtrs[i-1]->m_siteRank;//docQuality;
char q2 = m_replyPtrs[i ]->m_siteRank;//docQuality;
if ( q1 > q2 ) continue;
// if tied, check docids
long long d1 = m_replyPtrs[i-1]->m_docId;
long long d2 = m_replyPtrs[i ]->m_docId;
if ( d1 == d2 )
log("build: got same docid in msg25 "
"d=%lli url=%s",d1,
m_replyPtrs[i]->ptr_ubuf);
if ( q1 == q2 && d1 <= d2 ) continue;
// swap them
Msg20Reply *tmp = m_replyPtrs [i-1];
long size = m_replySizes [i-1];
m_replyPtrs [i-1] = m_replyPtrs [i ];
m_replySizes[i-1] = m_replySizes [i ];
m_replyPtrs [i ] = tmp;
m_replySizes[i ] = size;
sflag = 1;
}
}
// LinkInfo::set() probably filtered out even more!
//long ng = m_inlinkingDocIdsRead - inlinkDocIdsFiltered;
// how many were filtered by LinkInfo::set()?
//long inlinkingDocIdsFiltered2 = ng - m_linkInfo->getNumInlinks();
struct tm *timeStruct = localtime ( &m_lastUpdateTime );
char buf[64];
strftime ( buf, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
char *ss = "site";
if ( m_mode == MODE_PAGELINKINFO ) ss = "page";
long siteRank = ::getSiteRank ( m_linkInfo->m_numGoodInlinks );
if ( m_printInXml ) { // && m_xd ) {
m_pbuf->safePrintf("\t<desc>inlinks to %s</desc>\n",ss);
m_pbuf->safePrintf("\t<sampleCreatedUTC>%lu"
"</sampleCreatedUTC>\n"
, m_lastUpdateTime
);
char *u = NULL;
if ( m_xd ) u = m_xd->ptr_firstUrl;
if ( u )
m_pbuf->safePrintf("\t<url><![CDATA[%s]]></url>\n",u);
char *site = NULL;
if ( m_xd ) site = m_xd->ptr_site;
if ( site )
m_pbuf->safePrintf("\t<site><![CDATA[%s]]></site>\n",
site);
long long d = 0LL;
if ( m_xd ) d = m_xd->m_docId;
if ( d && d != -1LL )
m_pbuf->safePrintf("\t<docId>%lli</docId>\n",d);
m_pbuf->safePrintf(
"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
"\t<totalSiteInlinksProcessed>%li"
"</totalSiteInlinksProcessed>\n"
"\t<totalGoodSiteInlinksProcessed>%li"
"</totalGoodSiteInlinksProcessed>\n"
"\t<numUniqueCBlocksLinkingToPage>%li"
"</numUniqueCBlocksLinkingToPage>\n"
"\t<numUniqueIpsLinkingToPage>%li"
"</numUniqueIpsLinkingToPage>\n"
, iptoa(m_ip)
// the total # of inlinkers. we may not have
// read all of them from disk though.
, m_numDocIds
, m_linkInfo->m_numGoodInlinks
, m_cblocks
, m_uniqueIps
);
}
else
m_pbuf->safePrintf( "<table width=100%%>"
"<td bgcolor=lightyellow>\n"
"<b>Summary of inlinks to %s "
"%s</b>\n"
"<br><br>"
"<table cellpadding=3 "
"border=1 width=100%%>\n"
"<tr>"
"<td>sample created</td>"
"<td>%s</td>"
"<td>when this info was last "
"computed</td>"
"</tr>\n"
"<tr>"
"<td>IP address</td>"
"<td>%s</td>"
"<td>&nbsp;"
"</td>"
"<td> &nbsp; </td>"
"</tr>\n"
"<tr>"
"<td>total inlinkers</td>"
"<td>%li</td>"
"<td>how many inlinks we have total. "
"Max: %li."
//" Bad docids are removed so may be "
//"less than that max."
"</td>"
"<td> &nbsp; </td>"
"</tr>\n"
"<tr>"
"<td>unique cblocks</td>"
"<td>%li</td>"
"<td>unique EXTERNAL cblock inlinks</td>"
"<td> &nbsp; </td>"
"</tr>\n"
"<tr>"
"<td>unique ips</td>"
"<td>%li</td>"
"<td>unique EXTERNAL IP inlinks</td>"
"<td> &nbsp; </td>"
"</tr>\n"
//"<tr>"
//"<td>sampled inlinkers</td>"
//"<td>%li</td>"
//"<td>how many docs "
//"we sampled for inlink text. "
//"Limited to %li docs.</td>"
//"<td> &nbsp; </td>"
//"</tr>\n"
,
ss,
m_url,
buf, //m_lastUpdateTime,
iptoa(m_ip),
// the total # of inlinkers. we may not
// have read all of them from disk though
m_numDocIds ,
(long)READSIZE/(long)LDBKS,
//(long)MAX_LINKERS_IN_TERMLIST,
// how many docids we read from disk
//(m_list.getListSize()-6)/12 ,
//(long)MAX_DOCIDS_TO_SAMPLE);
m_cblocks,
m_uniqueIps
);
if ( m_mode == MODE_SITELINKINFO && m_printInXml )
m_pbuf->safePrintf("\t<siteRank>%li</siteRank>\n" , siteRank );
// print link spam types
long ns = m_table.getNumSlots();
for ( long i = 0 ; i < ns ; i++ ) {
// skip empty slots
if ( m_table.isEmpty(i) ) continue;
// who is in this slot
NoteEntry *e = *(NoteEntry **)m_table.getValueFromSlot(i);
char *exp = getExplanation ( e->m_note );
// show it
if ( m_printInXml ) {
m_pbuf->safePrintf ( "\t<inlinkStat>\n");
m_pbuf->safePrintf ( "\t\t<name><![CDATA[" );
//m_pbuf->htmlEncode(e->m_note,gbstrlen(e->m_note),0);
m_pbuf->safeStrcpy ( e->m_note );
m_pbuf->safePrintf ( "]]></name>\n");
if ( exp )
m_pbuf->safePrintf ( "\t\t<explanation>"
"<![CDATA[%s]]>"
"</explanation>\n",
exp);
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
e->m_count );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
else {
m_pbuf->safePrintf ( "<tr><td>%s", e->m_note );
//if ( exp )
// m_pbuf->safePrintf ( " - %s", exp );
m_pbuf->safePrintf("</td>");
m_pbuf->safePrintf (
"<td><font color=red>%li</font>"
"</td>"
"<td>reason could not vote</td>"
"<td>"
, e->m_count );
}
// print some docids that had this problem
for ( long j = 0 ; j < MAX_ENTRY_DOCIDS ; j++ ) {
if ( e->m_docIds[j] == -1LL ) break;
if ( ! m_printInXml )
m_pbuf->safePrintf ("<a href=\"/master/titledb"
"?c=%s&d=%lli\">"
"%li</a> ",
m_coll,e->m_docIds[j],j);
}
if ( ! m_printInXml )
m_pbuf->safePrintf ( "&nbsp; </td></tr>\n" );
}
if ( ! m_printInXml )
m_pbuf->safePrintf(
"<tr>"
"<td>ip dup</td>"
"<td><font color=red>%li</font></td>"
"<td>"
"basically the same ip, but we looked "
"it up anyway."
"</td>"
"</tr>\n"
"<tr>"
"<td>ip dup linkdb</td>"
"<td>%li</td>"
"<td>"
"linkdb saved us from having to "
"look up this many title recs from the "
"same ip C block.</td>"
"</tr>\n"
"<tr>"
"<td>docid dup linkdb</td>"
"<td>%li</td>"
"<td>"
"linkdb saved us from having to "
"look up this many title recs from the "
"same docid.</td>"
"</tr>\n"
"<tr>"
"<td>link spam linkdb</td>"
"<td>%li</td>"
"<td>"
"linkdb saved us from having to "
"look up this many title recs because "
"they were pre-identified as link "
"spam.</td>"
"</tr>\n"
"<tr>"
"<td><b>good</b></td>"
"<td><b>%li</b></td>"
"<td>"
//"may include anomalies and some "
//"link farms discovered later. "
"# inlinkers with positive weight"
//"limited to MAX_LINKERS = %li"
"</td>"
"<td> &nbsp; </td>"
"</tr>\n"
/*
"<tr>"
"<td>good extrapolated</td>"
"<td>%li</td>"
"<td>extrapolate the good links to get "
"around the MAX_LINKERS limitation</td>"
"<td> &nbsp; </td>"
"</tr>\n"
"<tr>"
"<td>X factor (not linear!)</td>"
"<td>%li%%</td>"
"<td>~ good extrapolated / good = "
"%li / %li</td>"
"<td> &nbsp; </td>"
"</tr>\n"
*/
,
(long)m_ipDups ,
(long)m_ipDupsLinkdb ,
(long)m_docIdDupsLinkdb ,
(long)m_linkSpamLinkdb ,
m_linkInfo->m_numGoodInlinks
// good and max
//(long)m_linkInfo->getNumInlinks() ,
);
if ( m_mode == MODE_SITELINKINFO && ! m_printInXml )
m_pbuf->safePrintf(
"<tr><td><b>siterank</b></td>"
"<td><b>%li</b></td>"
"<td>based on # of good inlinks</td>"
"</tr>",
siteRank
);
if ( ! m_printInXml )
m_pbuf->safePrintf("</table>"
"<br>"
"<br>" );
// xml?
if ( m_printInXml && m_ipDups ) {
// ip dups
m_pbuf->safePrintf ( "\t<inlinkStat>\n"
"\t\t<name><![CDATA[");
m_pbuf->safePrintf ( "duplicateIPCClass" );
m_pbuf->safePrintf ( "]]></name>\n");
m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
m_pbuf->safePrintf ( "inlinker is form the same C Block "
"as another inlinker we processed");
m_pbuf->safePrintf ( "]]></explanation>\n");
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
m_ipDups );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
if ( m_printInXml && m_ipDupsLinkdb ) {
// ip dups
m_pbuf->safePrintf ( "\t<inlinkStat>\n"
"\t\t<name><![CDATA[");
m_pbuf->safePrintf ( "duplicateIPCClass" );
m_pbuf->safePrintf ( "]]></name>\n");
m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
m_pbuf->safePrintf ( "inlinker is form the same C Block "
"as another inlinker we processed");
m_pbuf->safePrintf ( "]]></explanation>\n");
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
m_ipDupsLinkdb );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
if ( m_printInXml && m_docIdDupsLinkdb ) {
		// docid dups
m_pbuf->safePrintf ( "\t<inlinkStat>\n"
"\t\t<name><![CDATA[");
m_pbuf->safePrintf ( "duplicateDocId" );
m_pbuf->safePrintf ( "]]></name>\n");
m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
m_pbuf->safePrintf ( "inlinker is on the same page "
"as another inlinker we processed");
m_pbuf->safePrintf ( "]]></explanation>\n");
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
				     m_docIdDupsLinkdb );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
if ( m_printInXml && m_linkSpamLinkdb ) {
// link spam
m_pbuf->safePrintf ( "\t<inlinkStat>\n"
"\t\t<name><![CDATA[");
m_pbuf->safePrintf ( "generalLinkSpam" );
m_pbuf->safePrintf ( "]]></name>\n");
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
m_linkSpamLinkdb );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
char *tt = "";
if ( m_mode == MODE_SITELINKINFO ) tt = "site ";
if ( ! m_printInXml ) {
m_pbuf->safePrintf( "<table cellpadding=3 border=1>"
"<tr>"
"<td colspan=20>Inlinks "
"to %s%s &nbsp; (IP=%s) " // pagePop=%li "
//"sitePop=%li numLinksToSite=%li)"
"</td>"
"</tr>\n"
"<tr>"
"<td>#</td>"
"<td>docId</td>"
"<td>note</td>"
"<td>url</td>"
"<td>site</td>"
"<td>title</td>"
//"<td>reason</td>"
"<td>IP</td>"
"<td>firstIP</td>"
//"<td>external</td>"
"<td>lang</td>"
"<td>discovered</td>"
"<td>pubdate</td>"
"<td>hop count</td>"
"<td>site rank</td>"
"<td># words in link text</td>"
"<td>link text bytes</td>"
"<td>link text</td>",
tt,m_url,iptoa(m_ip));
if ( m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("<td>link url</td>");
m_pbuf->safePrintf("<td>neighborhood</td>"
"</tr>\n" );
}
//CollectionRec *cr = g_collectiondb.getRec ( m_coll );
// print out each Inlink/Msg20Reply
for ( long i = 0 ; i < m_numReplyPtrs ; i++ ) {
// point to a reply
Msg20Reply *r = m_replyPtrs[i];
// are we internal
bool internal = false;
if ( (r->m_ip&0x0000ffff) == (m_ip&0x0000ffff) )
internal = true;
if ( (r->m_firstIp&0x0000ffff) == (m_ip&0x0000ffff))
internal = true;
// the "external" string
//char *ext = "Y"; if ( internal ) ext = "N";
// are we an "anomaly"?
char *note = r->ptr_note;
if ( r->m_isLinkSpam && !note )
note = "unknown";
// get our ip as a string
//char *ips = iptoa(r->m_ip);
// print the link text itself
char *txt = r->ptr_linkText;
// get length of link text
long tlen = r->size_linkText;
if ( tlen > 0 ) tlen--;
//float weight = 1.0;
//if ( note ) weight = 0.0;
//if ( internal ) weight = 0.0;
// datedb date
char dbuf[128];
if ( r->m_datedbDate > 1 )
sprintf(dbuf,"%s UTC",
asctime ( gmtime(&r->m_datedbDate )));
else
sprintf(dbuf,"---");
char discBuf[128];
long dd = r->m_discoveryDate;
if ( dd ) {
struct tm *timeStruct = gmtime ( &dd );
strftime ( discBuf, 128 , "<nobr>%b %d %Y</nobr>" ,
timeStruct);
}
else
sprintf(discBuf,"---");
char *title = r->ptr_tbuf;
if ( ! title ) title = "";
		// show the linking docid, then its weight
if ( m_printInXml ) {
char *ns = note;
if ( ! note ) ns = "good";
//if ( internal ) note = "internal";
m_pbuf->safePrintf("\t<inlink>\n"
"\t\t<docId>%lli</docId>\n"
"\t\t<url><![CDATA[%s]]></url>\n"
"\t\t<site><![CDATA[%s]]></site>\n"
"\t\t<title><![CDATA[%s]]></title>\n"
//"\t\t<weight>%.01f</weight>\n"
"\t\t<note><![CDATA[%s]]>"
"</note>\n"
, r->m_docId
, r->ptr_ubuf
, r->ptr_site
, title
//, weight
, ns
);
// get explanation of note
char *exp = getExplanation ( ns );
if ( exp )
m_pbuf->safePrintf("\t\t<explanation>"
"<![CDATA[%s]]>"
"</explanation>\n"
, exp );
m_pbuf->safePrintf("\t\t<ipAddress>"
"<![CDATA[%s]]>"
"</ipAddress>\n"
,iptoa(r->m_ip) );
m_pbuf->safePrintf("\t\t<firstIpAddress>"
"<![CDATA[%s]]>"
"</firstIpAddress>\n"
,iptoa(r->m_firstIp) );
m_pbuf->safePrintf(
"\t\t<onSite>%li</onSite>\n"
"\t\t<discoveryDateUTC>%lu"
"</discoveryDateUTC>\n"
"\t\t<language><![CDATA[%s]]>"
"</language>\n"
"\t\t<siteRank>%li</siteRank>\n"
, (long)internal
, dd
, getLanguageString(r->m_language)
, (long)r->m_siteRank
);
m_pbuf->safePrintf("\t\t<linkText><![CDATA[");
m_pbuf->htmlEncode ( txt,tlen,0);
m_pbuf->safePrintf("]]>"
"</linkText>\n"
);
if ( m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("\t<linkUrl><![CDATA[%s]]>"
"</linkUrl>\n",
r->ptr_linkUrl);
m_pbuf->safePrintf(
"\t</inlink>\n"
);
continue;
}
m_pbuf->safePrintf( "<tr>"
"<td>%li</td>" // #i
"<td>"
"<a href=\"/print?page=1&d=%lli\">"
"%lli</a></td>" // docid
"<td><nobr>"//%.1f"
,i+1
,r->m_docId
,r->m_docId
//,weight
);
if ( note )
m_pbuf->safePrintf("%s", note );
else
m_pbuf->safePrintf("<b>good</b>");
m_pbuf->safePrintf( "</nobr></td>"//wghtnte
"<td>%s</td>" // url
"<td>%s</td>" // site
"<td>%s</td>", // title
r->ptr_ubuf,
r->ptr_site,
title);
m_pbuf->safePrintf("<td><a href=\"/search?q=ip%%3A"
"%s&c=%s&n=200\">%s</a></td>" // ip
, iptoa(r->m_ip)
, m_coll
, iptoa(r->m_ip)
);
m_pbuf->safePrintf("<td>%s</td>"
, iptoa(r->m_firstIp)
);
m_pbuf->safePrintf( //"<td>%s</td>" // external
"<td>%s</td>" // language
"<td>%s</td>" // discoverydate
"<td>%s</td>" // datedbdate
"<td>%li</td>" // hopcount
"<td><font color=red><b>%li"
"</b></font></td>" // site rank
"<td>%li</td>" // nw
"<td>%li</td>" // textLen
"<td><nobr>", // text
//ext,
getLanguageString(r->m_language),
discBuf,
dbuf,//r->m_datedbDate,
(long)r->m_hopcount,
(long)r->m_siteRank, // docQuality,
(long)r->m_linkTextNumWords ,
tlen );
// only bold if good
if ( ! note )
m_pbuf->safePrintf("<b>");
// this is in utf8 already
m_pbuf->safeMemcpy ( txt , tlen );
// only bold if good
if ( ! note )
m_pbuf->safePrintf("</b>");
// wrap it up
m_pbuf->safePrintf("</nobr></td>");
// print url that is linked to in the case of site inlinks
if ( m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("<td>%s</td>",r->ptr_linkUrl);
// print the neighborhood
m_pbuf->safePrintf("<td>");
txt = r->ptr_surroundingText;
tlen = r->size_surroundingText - 1;
if(!txt) {
m_pbuf->safePrintf("--\n");
m_pbuf->safePrintf("</td></tr>\n");
continue;
}
// this is utf8 already
m_pbuf->safeMemcpy ( txt , tlen );
m_pbuf->safePrintf("</td></tr>\n");
}
if ( ! m_printInXml ) {
m_pbuf->safePrintf( "</table>\n<br>\n" );
m_pbuf->safePrintf( "</td></table>\n<br><br>\n" );
}
// print site rank
if ( m_printInXml && m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("\t<siteRankTable>\n");
if ( ! m_printInXml && m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("<table border=1>"
"<tr><td colspan=2>"
"<center>siteRankTable</center>"
"</td></tr>"
"<tr><td># good inlinks</td>"
"<td>siteRank</td></tr>"
);
// print site rank table
long lastsr = -1;
for ( long i = 0 ; i < 11000 && m_mode == MODE_SITELINKINFO ; i++ ) {
long sr = ::getSiteRank ( i );
if ( sr == lastsr ) continue;
lastsr = sr;
if ( m_printInXml )
m_pbuf->safePrintf("\t\t<row>"
"\t\t\t<numInlinks>%li"
"</numInlinks>\n"
"\t\t\t<siteRank>%li"
"</siteRank>\n"
"\t\t</row>\n"
,i,sr);
else
m_pbuf->safePrintf("<tr><td>%li</td><td>%li</td></tr>"
,i,sr);
}
if ( m_printInXml && m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("\t</siteRankTable>\n");
else if ( m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("</table>"
"<br>" );
//m_pbuf->safePrintf("<b>*</b><i>The maximum of these two weights "
// "is used.</i><br><br>");
return true;
}
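// . illustrative sketch only (not part of the build): isDup() and
//   getLoser() below are meant to be used together when deduping two
//   inlinker replies; the exact call site and names here are hypothetical:
//
//       char *note = isDup ( r , p );       // NULL means "not a dup"
//       if ( note ) {
//               Msg20Reply *loser = getLoser ( r , p );
//               loser->ptr_note = note;     // record why it cannot vote
//       }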
// . return the "worst" of the two Msg20Replies
// . in the case of a dup, we use this to determine which one is kicked out
// . if one is NULL, return the other
Msg20Reply *Msg25::getLoser ( Msg20Reply *r , Msg20Reply *p ) {
if ( ! p ) return r;
if ( ! r ) return p;
// if "r" is internal, but p is not, r is the loser
bool rinternal = false;
bool pinternal = false;
if ( iptop(r->m_ip ) == m_top ) rinternal = true;
if ( iptop(r->m_firstIp) == m_top ) rinternal = true;
if ( iptop(p->m_ip ) == m_top ) pinternal = true;
if ( iptop(p->m_firstIp) == m_top ) pinternal = true;
if ( rinternal && ! pinternal )
return r;
// and vice versa
if ( pinternal && ! rinternal )
return p;
// then resort to doc quality
//if ( p->m_docQuality < r->m_docQuality ) return p;
//if ( r->m_docQuality < p->m_docQuality ) return r;
if ( p->m_siteRank < r->m_siteRank ) return p;
if ( r->m_siteRank < p->m_siteRank ) return r;
// . if they had the same quality... check docid
// . the lower the docid, the "better" it is. this behavior
// is opposite of the quality behavior.
if ( p->m_docId > r->m_docId ) return p;
// fall back to r then
return r;
}
// . is "p" a dup of "r"?
// . we will kick out the worst one so it cannot vote
// . returns NULL if not a dup
// . returns NULL with g_errno set on error
char *Msg25::isDup ( Msg20Reply *r , Msg20Reply *p ) {
// reset this
g_errno = 0;
// do ip tops match? (top 2 bytes of ip addresses)
bool internal = false;
if ( iptop(p->m_ip) == m_top ) internal = true;
if ( iptop(p->m_firstIp) == m_top ) internal = true;
if ( internal && m_oneVotePerIpDom )
return "ip dup";
// only one ad id
if ( r->m_adIdHash && r->m_adIdHash == p->m_adIdHash )
return "same ad id";
/*
// see if he is too similar to another, if so he is not a good voter
Vector *v1 = (Vector *)r->ptr_vector1;
Vector *v2 = (Vector *)r->ptr_vector2;
Vector *v3 = (Vector *)r->ptr_vector3;
// get vectors for Msg20Reply "p"
Vector *x1 = (Vector *)p->ptr_vector1;
Vector *x2 = (Vector *)p->ptr_vector2;
Vector *x3 = (Vector *)p->ptr_vector3;
// doc j is 0% to 100% similar to doc i
// . but we need to remove the wordpairs found
// in common so they aren't used against
// another doc, so we say 'true' here
// . returns -1 and sets g_errno on error
// . vX vectors can be NULL if the linker was "linkSpam" because
// Msg20.cpp's handler does not set them in that case
long p1 = 0;
long p2 = 0;
long p3 = 0;
if ( v1 && x1 ) p1 = v1->getLinkBrotherProbability ( x1 , false);
// only consider p2 if each vector is beefy. these
// can be small because these vectors represent the
// word pairs just to the right of the link in the
// content, and before any "breaking" tag thereafter.
	// no, there are too many little ads, so disregard the beefy
// requirement.
if ( v2 && x2 && v2->m_numPairHashes >= 1 && x2->m_numPairHashes >= 1 )
p2 = v2->getLinkBrotherProbability(x2,false);
// compare tag id pair vectors
if ( v3 && x3 && v3->m_numPairHashes >= 2 && x3->m_numPairHashes >= 2 )
p3 = v3->getLinkBrotherProbability(x3,false);
*/
// see if he is too similar to another, if so he is not a good voter
long *v1 = (long *)r->ptr_vector1;
long *v2 = (long *)r->ptr_vector2;
long *v3 = (long *)r->ptr_vector3;
long nv1 = r->size_vector1 / 4;
long nv2 = r->size_vector2 / 4;
long nv3 = r->size_vector3 / 4;
// get vectors for Msg20Reply "p"
long *x1 = (long *)p->ptr_vector1;
long *x2 = (long *)p->ptr_vector2;
long *x3 = (long *)p->ptr_vector3;
long nx1 = p->size_vector1 / 4;
long nx2 = p->size_vector2 / 4;
long nx3 = p->size_vector3 / 4;
// doc j is 0% to 100% similar to doc i
// . but we need to remove the wordpairs found
// in common so they aren't used against
// another doc, so we say 'true' here
// . returns -1 and sets g_errno on error
// . vX vectors can be NULL if the linker was "linkSpam" because
// Msg20.cpp's handler does not set them in that case
// compare now for sanity!
//long p1 = 0;
//long p2 = 0;
//long p3 = 0;
// short cut
//long ni = m_niceness;
//if ( v1 && x1 && nv1 >= 2 && nx1 >= 2 )
// p1 = (long)computeSimilarity (v1,x1,NULL,NULL,NULL,ni);
// only consider p2 if each vector is beefy. these
// can be small because these vectors represent the
// word pairs just to the right of the link in the
// content, and before any "breaking" tag thereafter.
	// no, there are too many little ads, so disregard the beefy
// requirement.
//if ( v2 && x2 && nv2 >= 2 && nx2 >= 2 )
// p2 = (long)computeSimilarity (v2,x2,NULL,NULL,NULL,ni);
// compare tag id pair vectors
//if ( v3 && x3 && nv3 >= 2 && nx3 >= 2 )
// p3 = (long)computeSimilarity (v3,x3,NULL,NULL,NULL,ni);
//if ( p1 >= 80 ) return "similar content";
//if ( p2 >= 80 ) return "similar link desc";
//if ( p3 >= 100 ) return "similar tag template";
// these count the terminating 0 long as a component
if ( v1 && x1 && nv1 >= 2 && nx1 >= 2 ) {
//p1 = (long)computeSimilarity (v1,x1,NULL,NULL,NULL,ni);
// are these two vecs 80% or more similar?
if ( isSimilar_sorted (v1,x1,nv1,nx1,80,m_niceness) ) {
//if ( p1 < 80 )
// log("test p1 failed");
return "similar content";
}
}
//if ( p1 >= 80 )
// log("test p1 failed1");
// only consider p2 if each vector is beefy. these
// can be small because these vectors represent the
// word pairs just to the right of the link in the
// content, and before any "breaking" tag thereafter.
	// no, there are too many little ads, so disregard the beefy
// requirement.
if ( v2 && x2 && nv2 >= 2 && nx2 >= 2 ) {
//p2 = (long)computeSimilarity (v2,x2,NULL,NULL,NULL,ni);
// are these two vecs 80% or more similar?
if ( isSimilar_sorted (v2,x2,nv2,nx2,80,m_niceness) ) {
//if ( p2 < 80 )
// log("test p2 failed");
return "similar link desc";
}
}
//if ( p2 >= 80 )
// log("test p2 failed2");
// compare tag id pair vectors
if ( v3 && x3 && nv3 >= 2 && nx3 >= 2 ) {
//p3 = (long)computeSimilarity (v3,x3,NULL,NULL,NULL,ni);
// are these two vecs 80% or more similar?
if ( isSimilar_sorted (v3,x3,nv3,nx3,100,m_niceness) ) {
//if ( p3 < 100 )
// log("test p3 failed");
return "similar tag template";
}
}
//if ( p3 >= 100 )
// log("test p3 failed2");
return NULL;
// compare the ip tops (ip tops of the inlinks)
//long sum4 = v4->m_numPairHashes + x4->m_numPairHashes;
// each must have at least this many
//sum4 /= 3;
// at least 4 inlinking ips each...
//if ( sum4 < 4 ) sum4 = 4;
// check it
//if ( v4->m_numPairHashes >= sum4 &&
// x4->m_numPairHashes >= sum4 )
// p4 = v4->getLinkBrotherProbability(x4,false);
/*
// sensitivity settings
if ( p1 >= 80 ) return "similar content";
if ( p2 >= 80 ) return "similar link desc";
if ( p3 >= 100 ) return "similar tag template";
//if ( p4 >= 80 ) return "similar incoming ip shingle";
//if ( p4 >= 80 ) {
// // get the link text that is residing
// logf(LOG_DEBUG,"build: ip shingle sim2 of %li%% for "
// "nhi=%li nhj=%li di=%llu dj=%llu",
// (long)p4,
// v4->m_numPairHashes,x4->m_numPairHashes,
// rold->getDocId(),rnew->getDocId());
//}
// if no memory pN will be -1
if ( p1 < 0 || p2 < 0 || p3 < 0 )
log("build: Msg25: Could not perform link spam "
"removal: %s.",mstrerror(g_errno));
// all done, it was not a dup, but g_errno MAY be set
return NULL;
*/
}
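// . tally the reason an inlinker could not vote (e.g. "ip dup",
//   "similar content") in m_table, keyed by a hash of the note string
// . each NoteEntry keeps a count and up to MAX_ENTRY_DOCIDS sample docids
//   so the tables printed above can show example linkers for each reason
// . returns false if the note could not be added, true otherwise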
bool Msg25::addNote ( char *note , long noteLen , long long docId ) {
// return right away if no note
if ( ! note || noteLen <= 0 ) return true;
// get hash
long h = hash32 ( note , noteLen );
NoteEntry **pentry = (NoteEntry **)m_table.getValue ( &h );
// if this "type" has not been recorded, add it
if ( ! pentry && h ) {
char *p = m_bufPtr;
if ( p + sizeof(NoteEntry) + noteLen + 1 >= m_bufEnd ) {
log("build: increase buf size in Msg25.");
char *xx = NULL; *xx = 0;
}
// store the entry
NoteEntry *e = (NoteEntry *)p;
p += sizeof(NoteEntry);
// init that entry
e->m_count = 1;
e->m_note = p;
e->m_docIds[0] = docId;
e->m_docIds[1] = -1LL;
// store note into the buffer, NULL terminated
memcpy ( p , note , noteLen ); p += noteLen;
*p++ = '\0';
// add to the table
long slot = -1;
if ( ! m_table.addKey(&h,&e,&slot))
log("build: Msg25 could not add note.");
// did we add successfully?
if ( slot < 0 ) return false;
// get the ptr to the stuff
//val = (char *)m_table.getValueFromSlot(slot);
// advance to next spot
m_bufPtr = p;
return true;
}
// cast it
NoteEntry *entry = *pentry;
// get the count
entry->m_count++;
// add docids to the list
for ( long i = 0 ; i < MAX_ENTRY_DOCIDS ; i++ ) {
// skip if not empty
if ( entry->m_docIds[i] != -1LL ) continue;
// take it over, its empty if it is -1LL
entry->m_docIds[i] = docId;
// next one should be -1 now
if ( i + 1 < MAX_ENTRY_DOCIDS ) entry->m_docIds[i+1] = -1LL;
// all done
break;
}
// increase the count
//if ( val ) *(long *)val = *(long *)val + 1;
return true;
}
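// (typical call, sketched here for illustration only: when a reply is
//  rejected we record the reason with something like
//  addNote ( note , gbstrlen(note) , r->m_docId ) so it shows up in the
//  stats tables printed above; the actual call sites are elsewhere in
//  the Msg25 code)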
/*
// . the ip to which we send this request must allow us to make udp requests
// . let's also have a separate list of allowable ips (not just admin ips)
// for which gigablast will accept udp requests
bool Msg25::getPageLinkInfo2 ( Url *url ,
char *coll ,
char *remoteColl ,
void *state ,
void (* callback)(void *state),
bool doLinkSpamCheck ,
bool oneVotePerIpDom ,
bool canBeCancelled ) {
//linkInfo->reset();
// sanity check
if ( ! coll ) { char *xx = NULL; *xx = 0; }
// get hostdb to use
//long collLen = gbstrlen(coll);
CollectionRec *cr = g_collectiondb.getRec ( coll );//, collLen );
Hostdb *hostdb = &g_hostdb;
if ( cr->m_importFromHosts2Conf ) hostdb = &g_hostdb2;
// sanity check
//if ( g_hostdb2.m_numHosts == 0 ) {
if ( hostdb->m_numHosts == 0 ) {
if ( m_linkInfo )
mfree(m_linkInfo,m_linkInfo->getStoredSize(),"msg25s");
m_linkInfo = NULL;
//g_errno = EBADENGINEER;
static bool s_printed = false;
if ( s_printed ) return true;
s_printed = true;
log("build: No hostdb2.conf to get secondary link info from.");
return true;
}
// watch out for bogus urls
if ( url->getHostLen() <= 0 ) {
g_errno = EBADENGINEER;
log("build: Url %s has no hostname.",url->getUrl());
return true;
}
// save callback info
m_state = state;
m_callback = callback;
m_url = url;
//m_linkInfo = linkInfo;
long remoteCollLen = 0;
if ( remoteColl ) remoteCollLen = gbstrlen ( remoteColl );
// assign it in case somebody uses it
m_coll = coll;
//m_collLen = collLen;
// make a Msg25 request for fresh link info
char *p = m_request;
// store url
memcpy ( p , url->getUrl() , url->getUrlLen() );
// skip over url
p += url->getUrlLen();
// store \0
*p++ = '\0';
// store remote coll
memcpy ( p , remoteColl , remoteCollLen );
// skip over it
p += remoteCollLen;
// store \0
*p++ = '\0';
// store ip
*(long *)p = m_ip; p += 4; // url->getIp(); p += 4;
// siteNumInlinks (bogus)
*(long *)p = 0; p += 4;
// sitePop (bogus)
*(long *)p = 0; p += 4;
// the last update time
*(long *)p = m_lastUpdateTime; p += 4;
// . store a BOGUS root quality now so gk cluster won't force a core
// because the rootQuality < 0
// . older clusters like gk will use it to compute quality, but we
	// discard that info now, we just want the number of extrapolated
// inlinks to use in Msg16.cpp's computeQuality() function.
// *(char *)p = 0; p += 1;
// store flags
*p = 0;
if ( doLinkSpamCheck ) *p |= 0x02;
if ( oneVotePerIpDom ) *p |= 0x08;
if ( canBeCancelled ) *p |= 0x40;
p++;
// get size of request
m_requestSize = p - m_request;
// sanity check
if ( m_requestSize > MSG25_MAX_REQUEST_SIZE ) {
char *xx = NULL; *xx = 0; }
// use the group that has this url's title rec local, if it exists
//m_groupId = g_titledb.getGroupIdForDatil(url,hostdb);//&g_hostdb2);
//m_probDocId = g_titledb.getProbableDocIdForDatil ( url );
m_probDocId = g_titledb.getProbableDocId ( url );
m_groupId = getGroupIdFromDocId ( m_probDocId );
// . send that request
// . returns false and sets g_errno on error, otherwise it will block
// and return true
if ( ! m_mcast.send ( m_request ,
m_requestSize ,
0x25 , // msgType 0x25
false , // m_mcast own m_request?
m_groupId , // send to group(groupKey)
false , // send to whole group?
m_probDocId , // probDocId (key)
this , // state data
NULL , // state data
gotReplyWrapper25 ,
3600*24*360 , // block forever for this!
MAX_NICENESS , // niceness
false , // real time?
-1 , // firstHostId
NULL , // m_replyBuf ,
0 , // MSG25_MAX_REPLY_SIZE ,
false , // free reply buf?
false , // do disk load balancing?
-1 , // max cache age
0 , // cacheKey
0 , // bogus rdbId
-1 , // minRecSizes
true , // sendToSelf
true , // retry forever
//&g_hostdb2 ))// send to 2ndary cluster
hostdb ))// send to 2ndary cluster
return true;
// we blocked, wait for callback to be called
return false;
}
void gotReplyWrapper25 ( void *state , void *state2 ) {
Msg25 *THIS = (Msg25 *)state;
THIS->gotMsg25Reply ( );
THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
}
bool Msg25::gotMsg25Reply ( ) {
// ENOTFOUND errors are very common
//if ( g_errno == ENOTFOUND ) g_errno = 0;
// error?
if ( g_errno ) {
log("build: Failed to get external link info for %s.",
m_url->getUrl());
return true;
}
// grab it
bool freeit;
long replySize;
long replyMaxSize;
char *reply = m_mcast.getBestReply(&replySize,&replyMaxSize,&freeit);
// relabel it if different
//if( reply != m_replyBuf )
relabel( reply, replyMaxSize, "Msg25-mcastGBR" );
// sanity check - find that mem leak
if ( m_linkInfo ) { char *xx=NULL;*xx=0; }
// . deserialize the reply here (copied from Msg20.cpp)
// . m_linkInfo will own it
m_linkInfo = (LinkInfo *)reply;
// sanity check
if ( m_linkInfo->getStoredSize() != replySize ) { char*xx=NULL;*xx=0;}
// fix our string ptrs
//m_linkInfo->updateStringPtrs();
// sanity check
//if ( reply == m_replyBuf ) { char *xx=NULL;*xx=0;}
return true;
}
class State25 {
public:
LinkInfo m_linkInfo;
Msg25 m_msg25;
UdpSlot *m_slot;
//char *m_statusPtr;
Url m_url;
//SiteRec m_siteRec;
};
// TODO: add in neighborhood flag, do not look at g_conf for anything!!
// likewise, cr->* is not right for cr->m_indexInlinkNeighborhoods and
// cr->doLinkSpamDetection in LinkText.cpp...
// AND, all the cr->m_* in Msg25.cpp should be flags!!!
void handleRequest25 ( UdpSlot *slot , long netnice ) {
char *p = slot->m_readBuf;
// deserialize url
char *url = p; p += gbstrlen(p) + 1;
// deserialize coll
char *coll = p; p += gbstrlen(p) + 1;
//long collLen = gbstrlen(coll);
long ip = *(long *)p; p += 4;
long siteNumInlinks = *(long *)p; p += 4;
long sitePop = *(long *)p; p += 4;
long lastUpdateTime = *(long *)p; p += 4;
// sanity check
if ( lastUpdateTime == 0 || lastUpdateTime == -1){char *xx=NULL;*xx=0;}
// sanity check
//if ( rootQuality < 0 || rootQuality > 100 ) { char *xx=NULL; *xx=0;}
// get flags
char doLinkSpamCheck = *p & 0x02;
char oneVotePerIpDom = *p & 0x08;
char canBeCancelled = *p & 0x40;
p++;
// make a new Msg25
State25 *st ;
try { st = new ( State25 ); }
catch ( ... ) {
g_errno = ENOMEM;
log("build: msg25: new(%i): %s",
sizeof(State25),mstrerror(g_errno));
g_udpServer.sendErrorReply ( slot , g_errno );
return;
}
mnew ( st , sizeof(State25) , "Msg25" );
// set url class
st->m_url.set ( url , gbstrlen(url) );
// save socket
st->m_slot = slot;
// call it
Msg25 *mm = &st->m_msg25;
if ( ! mm->getPageLinkInfo ( &st->m_url ,
ip ,
-1 , // docId
coll ,
NULL , // qbuf
0 , // qbufSize
st , // state
sendLinkInfoReplyWrapper , // callback
false , // isInjecting?
NULL , // pbuf
NULL , // xd
siteNumInlinks ,
sitePop ,
NULL , // oldLinkInfo
MAX_NICENESS ,
doLinkSpamCheck ,
oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ))
return;
// return the reply
sendLinkInfoReplyWrapper ( st );//, &st->m_linkInfo );
}
void sendLinkInfoReplyWrapper ( void *state ) { // , LinkInfo *infoArg ) {
State25 *st = (State25 *)state;
// get our state
UdpSlot *slot = st->m_slot;
// did it have an error?
if ( g_errno ) {
mdelete ( st , sizeof(st) , "Msg25" );
delete ( st );
g_udpServer.sendErrorReply ( slot , g_errno );
return;
}
Msg25 *m = &st->m_msg25;
// get the link info ptr
LinkInfo *info = m->m_linkInfo;
// sanity test
//if ( info != infoArg ) { char *xx=NULL; *xx=0; }
// grab it
char *reply = (char *)info;
// get the size
long need = 0;
if ( info ) need = info->getStoredSize();
// don't let Msg25 free it
m->m_linkInfo = NULL;
// free the state
mdelete ( st , sizeof(st) , "Msg25" );
delete ( st );
// send it away
g_udpServer.sendReply_ass ( reply , need , reply , need , slot );
}
*/
//////////
//
// LINKINFO
//
//////////
#include "HashTableX.h"
#include "Words.h"
#include "Titledb.h"
#include "Msge0.h"
#include "IndexList.h"
#include "XmlDoc.h" // score8to32()
#define MAX_LINKERS 3000
#define MAX_INTERNAL_INLINKS 10
// . after Msg25 calls LinkInfo::set() (was merge()) make it call
// Msg25::print(SafeBuf *pbuf) to print the stats. it can access
// LinkInfo's Inlinks to get their weights, etc.
// . returns the LinkInfo on success
// . returns NULL and sets g_errno on error
LinkInfo *makeLinkInfo ( char *coll ,
long ip ,
long siteNumInlinks ,
//long sitePop ,
Msg20Reply **replies ,
long numReplies ,
//long extrapolated ,
//long xfactor ,
// if link spam give this weight
long spamWeight ,
bool oneVotePerIpTop ,
long long linkeeDocId ,
long lastUpdateTime ,
bool onlyNeedGoodInlinks ,
long niceness ,
Msg25 *msg25 ,
SafeBuf *linkInfoBuf ) {
// for parsing the link text
Words words;
// a table for counting words per link text
HashTableX tt;
// buf for tt
char ttbuf[2048];
// must init it!
tt.set ( 8 ,4,128,ttbuf,2048,false,niceness,"linknfo");
// how many internal linkers do we have?
long icount = 0;
// we are currently only sampling from 10
for ( long i = 0 ; i < numReplies ; i++ ) {
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! replies[i] ) continue;
//if ( texts[i]->m_errno ) continue;
bool internal = false;
if ( (replies[i]->m_ip&0x0000ffff) == (ip&0x0000ffff) )
internal = true;
if ( (replies[i]->m_firstIp&0x0000ffff) == (ip&0x0000ffff) )
internal = true;
if ( internal )
icount++;
}
/*
// limit
if ( icount > MAX_INTERNAL_INLINKS ) icount = MAX_INTERNAL_INLINKS;
// count external now too
// *ecount = numReplies - icount;
// for counting internal links again
long icount2 = 0;
// . only allow 1 vote per ip domain OR mid domain
// . assign weights of -1 to links to ignore (from same ip top domain)
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// . for setting "outlinks", skip those not in the index
// . this will cause them not to be stored
if ( r->m_docId == 0LL ) {
r->m_linkTextScoreWeight = 0;
continue;
}
// are we internal?
bool internal = ((r->m_ip&0x0000ffff) == (ip & 0x0000ffff));
// . if he's probably a guestbook, message board page or other
// cgi/dynamic page, don't let him vote because he can be
// easily subverted by a clever link spammer
	// . this now includes any url with the string "link" in it, too
if ( r->m_isLinkSpam && ! internal ) {
r->m_linkTextScoreWeight = spamWeight;
continue;
}
// if we are external
if ( ! internal ) {
r->m_linkTextScoreWeight = 100;
continue;
}
// . allow first 100 internal links, if any, but together
// they cannot count more than one external link can
// . see Links.cpp::hash() for values of "scores"
// . TODO: if he's an rss or atom doc then let him thru
// . only allow first 10 internal linkers to vote
if ( ++icount2 > MAX_INTERNAL_INLINKS ) {
r->m_linkTextScoreWeight = -1; // should this be 0?
continue;
}
long kc = r->m_pageNumInlinks + r->m_siteNumInlinks;
long long b2 ;
if ( kc >= 5000 ) b2 = 100;
else if ( kc >= 2500 ) b2 = 95;
else if ( kc >= 1000 ) b2 = 80;
else if ( kc >= 500 ) b2 = 70;
else if ( kc >= 100 ) b2 = 60;
else if ( kc >= 50 ) b2 = 50;
else if ( kc >= 10 ) b2 = 40;
else b2 = 10;
r->m_linkTextScoreWeight = b2 / icount;
}
*/
// sum up all weights into "total"
//long total = 0;
/*
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// skip links from the same 2 byte ip as another link
//if ( r->m_linkTextScoreWeight <= 0 ) continue;
// ignore if spam
if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
// add up weights
//total += r->m_linkTextScoreWeight;
char *txt = r->ptr_linkText;
long txtLen = r->size_linkText;
if ( txtLen > 0 ) txtLen--;
// this can be empty if we just had an item
if ( ! txt || txtLen <= 0 ) continue;
// otherwise, hash each word with weight of 1
//words.set ( false , txt , txtLen );
words.set ( txt, txtLen,TITLEREC_CURRENT_VERSION,true, true );
// get # of words
long nw = words.getNumWords();
// loop over each on in this link text
for ( long k = 0 ; k < nw; k++ ) {
// don't count punct
if ( words.isPunct(k) ) continue;
// get the word Id of the ith word
long long wid = words.getWordId(k);
// does it match a word in this same link text?
long j;
for ( j = 0 ; j < k ; j++ ) {
// don't consider the wordId of punct "words"
// because it is a bogus value that may
// ultimately cause us to segfault below
if ( words.isPunct (j) ) continue ;
if ( words.getWordId(j) == wid ) break;
}
// if it does then skip it
if ( j < k ) continue;
// otherwise, hash it so we can count word occurences
if ( ! tt.addTerm ( &wid , 1 ) ) {
log("build: Failed to add word to table.");
return NULL;
}
}
}
// always return >= 0, -1 means error
//if ( total < 0 ) total = 0;
// at least .3% of the good linkers need to have the word so when
// if numReplies is 1000, minCount is 3, if it is 100, this is .3
unsigned long minCount = (3 * ((long)numReplies)) / 1000;
// and at least 3 docs need to have the word...
if ( minCount < 3 ) minCount = 3;
// and if 7 have it you are always golden
if ( minCount > 7 ) minCount = 7;
long nn = numReplies;
// skip this part if table empty
if ( tt.getNumSlotsUsed() <= 0 ) nn = 0;
// set the m_isAnomaly bits
for ( long i = 0 ; i < nn ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// skip weights 0 or less
if ( r->m_linkTextScoreWeight <= 0 ) continue;
if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
// point to link text itself
char *txt = r->ptr_linkText;
long txtLen = r->size_linkText;
if ( txtLen > 0 ) txtLen--;
// this can be empty if we just had an item
if ( ! txt || txtLen <= 0 ) continue;
// reset it now to prevent core
words.reset();
// set the words class again from this link text
words.set ( txt , txtLen,
TITLEREC_CURRENT_VERSION,
true, true );
// get # of words in this link text
long nw = words.getNumWords();
// loop over each word
long k ;
for ( k = 0 ; k < nw; k++ ) {
// don't count punct
if ( words.isPunct(k) ) continue;
// do not count stop words (uncapitalized)
if ( words.isStopWord(k) ) continue;
// get the word Id of the ith word
long long wid = words.getWordId(k);
// filter out this LinkText if has an anomalous word
if ( tt.getScore ( &wid ) < minCount )
break;
}
// continue if not anomalous
if ( k == nw ) continue;
// remove this weight from the total
total -= r->m_linkTextScoreWeight;
// set the anomaly bit
r->m_isAnomaly = true;
}
// always return >= 0, -1 means error
if ( total < 0 ) total = 0;
*/
	// count good (external, non-spam) inlinks; we can estimate our
	// quality from this count
long numGoodInlinks = 0;
// set m_linkTextNumWords
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// get the weight
//long w = r->m_linkTextScoreWeight;
// skip weights 0 or less
//if ( w <= 0 ) continue;
// get the link text itself
char *txt = r->ptr_linkText;
long txtLen = r->size_linkText;
// discount terminating \0
if ( txtLen > 0 ) txtLen--;
// get approx # of words in link text
long nw = 0;
if ( txtLen > 0 )
nw = getNumWords(txt,txtLen,TITLEREC_CURRENT_VERSION);
// store it
r->m_linkTextNumWords = nw;
// linkspam?
if ( r->ptr_note ) {
r->m_isLinkSpam = true;
continue;
}
bool internal = false;
if ((r->m_ip&0x0000ffff) == (ip & 0x0000ffff))
internal = true;
if ((r->m_firstIp&0x0000ffff) == (ip & 0x0000ffff))
internal = true;
// if its internal do not count towards good, but do
// indeed store it!
if ( internal ) continue;
// otherwise count as good
numGoodInlinks++;
}
long count = 0;
// . now just store the Inlinks whose weights are > 0
// . how much space do we need?
long need = 0;
// how much space do we need?
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// ignore if spam
//if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
if ( r->m_isLinkSpam ) {
// linkdb debug
if ( g_conf.m_logDebugLinkInfo )
log("linkdb: inlink #%li is link spam: %s",
i,r->ptr_note);
if ( onlyNeedGoodInlinks )
continue;
}
// do a quick set
Inlink k; k.set ( r );
// get space
need += k.getStoredSize ( );
// count it
count++;
}
// we need space for our header
need += sizeof(LinkInfo);
// alloc the buffer
//char *buf = (char *)mmalloc ( need,"LinkInfo");
//if ( ! buf ) return NULL;
if ( ! linkInfoBuf->reserve ( need , "LinkInfo" ) ) return NULL;
// set ourselves to this new buffer
LinkInfo *info = (LinkInfo *)(linkInfoBuf->getBufStart());
// set our header
info->m_version = 0;
info->m_size = need;
info->m_lastUpdated = lastUpdateTime;//getTimeGlobal();
// how many Inlinks we stored in info->m_buf[]
info->m_numStoredInlinks = count;
// the gross total of inlinks we got, both internal and external
info->m_totalInlinkingDocIds = msg25->m_numDocIds;
// . only valid if titlerec version >= 119
// . how many unique c blocks link to us?
// . includes your own internal c block
info->m_numUniqueCBlocks = msg25->m_cblocks;
// . only valid if titlerec version >= 119
// . how many unique ips link to us?
// . this count includes internal IPs as well
info->m_numUniqueIps = msg25->m_uniqueIps;
// keep things consistent for the "test" coll
info->m_reserved1 = 0;
info->m_reserved2 = 0;
// how many total GOOD inlinks we got. does not include internal cblock
info->m_numGoodInlinks = numGoodInlinks;
//info->m_siteRootQuality = siteRootQuality; // bye-bye
// these two guys will be 0 if we called Msg25::getSiteInfo()
//info->m_sitePop = sitePop;
//info->m_siteNumInlinks = siteNumInlinks;
// if we do not read all the inlinkers on disk because there are
	// more than MAX_LINKERS_IN_TERMLIST inlinkers, then we must
// extrapolate the total # of inlinkers we have. Msg25 does this
// and passes it in to us.
//info->m_numInlinksExtrapolated = extrapolated;
// point to our buf
char *p = info->m_buf;
char *pend = linkInfoBuf->getBufStart() + need;
// count the ones we store that are internal
long icount3 = 0;
// now set each inlink
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// skip weights 0 or less
//if ( r->m_linkTextScoreWeight <= 0 ) continue;
// ignore if spam
//if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
if ( r->m_isLinkSpam && onlyNeedGoodInlinks ) continue;
// are we internal?
bool internal = false;
if ( (r->m_ip&0x0000ffff) == (ip & 0x0000ffff) )
internal = true;
if ( (r->m_firstIp&0x0000ffff) == (ip & 0x0000ffff) )
internal = true;
if ( internal ) icount3++;
// set the Inlink
Inlink k;
// store it. our ptrs will reference into the Msg20Reply buf
k.set ( r );
// . this will copy itself into "p"
// . "true" --> makePtrsRefNewBuf
long wrote;
char *s = k.serialize ( &wrote , p , pend - p , true );
// sanity check
if ( s != p ) { char *xx=NULL;*xx=0; }
// sanity check
if ( k.getStoredSize() != wrote ) { char *xx=NULL;*xx=0;}
// note it if recycled
if ( k.m_recycled )
logf(LOG_DEBUG,"build: recycling Inlink %s for linkee "
"%lli", k.ptr_urlBuf,linkeeDocId);
// advance
p += wrote;
}
// . sanity check, should have used up all the buf exactly
// . so we can free the buf with k->getStoredSize() being the allocSize
if ( p != pend ) { char *xx=NULL;*xx=0; }
// how many guys that we stored were internal?
info->m_numInlinksInternal = (char)icount3;
// sanity parse it
//long ss = 0;
//for ( Inlink *k =NULL; (k=info->getNextInlink(k)) ; )
// ss += k->getStoredSize();
//if ( info->m_buf + ss != pend ) { char *xx=NULL;*xx=0;}
// success
return info;
}
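// . scratch Inlink used by getNextInlink() below to upgrade Inlinks that
//   were serialized by an older version to the current layout
// . s_orig remembers the original (old-format) Inlink so the next call
//   can resume the walk from the right place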
static Inlink s_inlink;
static Inlink *s_orig;
// if we are an old version, we have to set s_inlink and return
// a ptr to that
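// . typical iteration over a LinkInfo's inlinks:
//       for ( Inlink *k = NULL ; (k = info->getNextInlink(k)) ; ) { ... }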
Inlink *LinkInfo::getNextInlink ( Inlink *k ) {
// switch back
if ( k == &s_inlink ) k = s_orig;
// get it as the latest versioned inlink
Inlink *p = getNextInlink2 ( k );
// if none, we are done
if ( ! p ) return p;
// sanity checks
if (p->m_numStrings==0 && p->m_firstStrPtrOffset){char *xx=NULL;*xx=0;}
if (p->m_numStrings && p->m_firstStrPtrOffset==0){char *xx=NULL;*xx=0;}
// fix this for the really old guy. we did not store these two
// things initially, but they should have been set to this...
// luckily, we had a "reserved1" long...
if ( p->m_numStrings == 0 ) {
		// urlBuf,linkText,surroundingText,rssItem
p->m_numStrings = 4;
p->m_firstStrPtrOffset = 64;
}
// if latest, return that
if ( p->m_numStrings == p->getBaseNumStrings() &&
p->m_firstStrPtrOffset == (char *)&p->ptr_urlBuf - (char *)p ) {
p->updateStringPtrs(NULL);
return p;
}
// otherwise, set s_inlink to it
s_inlink.set2 ( (Inlink *)p );
// preserve p though for next call
s_orig = (Inlink *)p;
// and return that
return &s_inlink;
}
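// . raw walk over the Inlinks serialized in m_buf[], with no version
//   fix-ups; getNextInlink() above wraps this and handles old formats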
Inlink *LinkInfo::getNextInlink2 ( Inlink *k ) {
if ( this == NULL ) return NULL;
// if none, return NULL
if ( m_numStoredInlinks == 0 ) return NULL;
// if k is NULL, return the first
if ( ! k ) {
// set it to the first one
k = (Inlink *)m_buf;
// done
return k;
}
// point to next
long size = k->getStoredSize();
// get the inlink to return
Inlink *next = (Inlink *)((char *)k + size);
// return NULL if breached
if ( (char *)next >= ((char *)this)+m_size ) return NULL;
// otherwise, we are still good
return next;
}
// . returns false and sets g_errno on error
// . returns true if no error was encountered
// . call xml->isEmpty() to see if you got anything
bool LinkInfo::getItemXml ( Xml *xml , long niceness ) {
// reset it
xml->reset();
// loop through the Inlinks
Inlink *k = NULL;
for ( ; (k = getNextInlink(k)) ; ) {
// does it have an xml item? skip if not.
if ( k->size_rssItem <= 1 ) continue;
// got it
break;
}
// return if nada
if ( ! k ) return true;
// set the xml
return k->setXmlFromRSS ( xml , niceness );
}
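// . parse our stored rss item (ptr_rssItem) into "xml"
// . returns false and sets g_errno on error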
bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
// compute the length (excludes the \0's)
long len = size_rssItem - 1;
// return false and set g_errno if this fails
return xml->set ( ptr_rssItem ,
len ,
false , // own data?
0 , // allocSize
true , // pure xml?
TITLEREC_CURRENT_VERSION ,
false , // no need to now
niceness );
}
// only Title.cpp uses this right now
/*
bool Inlink::setXmlFromLinkText ( Xml *xml ) {
// compute the length (excludes the \0's)
long len = size_linkText - 1;
// bitch
if ( ptr_linkText[len] )
log("linknfo: bad link text, no NULL termination. truncing.");
// if not null terminated make it so!
ptr_linkText[len] = '\0';
// for some reason the link text is not DOUBLE NULL TERMINATED
*/
/*
// . copy into buf to ensure NULL termination
// . older versions were not null terminated or doubly null terminated
// as Xml::set() requires for uni
char buf[1000];
// sanity check
if ( len > 900 ) { char *xx=NULL;*xx=0; }
// copy
memcpy ( buf , ptr_linkText , size_linkText );
// ensure null termination
buf [ size_linkText ] = '\0';
buf [ size_linkText + 1 ] = '\0';
// return false and set g_errno if this fails
return xml->set ( csUTF8 ,
ptr_linkText ,
len ,
false , // own data?
0 , // allocSize
true , // pure xml?
TITLEREC_CURRENT_VERSION ,
false ); // no need to now
}
*/
// . update them for each Inlink
// . same as calling deserialize()
//void LinkInfo::updateStringPtrs ( ) {
// // loop through the Inlinks and update them
// for ( Inlink *k = NULL; (k = getNextInlink(k)) ; )
// k->updateStringPtrs();
//}
bool LinkInfo::hasLinkText ( ) {
// loop through the Inlinks
for ( Inlink *k = NULL; (k = getNextInlink(k)) ; )
if ( k->size_linkText > 1 ) return true;
return false;
}
/*
bool LinkInfo::hash ( TermTable *table ,
long externalLinkTextWeight ,
long internalLinkTextWeight ,
//TitleRec *tr ,
long ip ,
long version ,
long siteNumInlinks ,
TermTable *countTable ,
char *note ,
long niceness ) {
long noteLen = 0;
if ( note ) noteLen = gbstrlen ( note );
// count "external" inlinkers
long ecount = 0;
// loop through the link texts and hash them
for ( Inlink *k = NULL; (k = getNextInlink(k)) ; ) {
// is this inlinker internal?
bool internal=((ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// count external inlinks we have for indexing gbmininlinks:
if ( ! internal ) ecount++;
// get score
long long baseScore = k->m_baseScore;
// get the weight
long long ww ;
if ( internal ) ww = internalLinkTextWeight;
else ww = externalLinkTextWeight;
// modify the baseScore
long long final = (baseScore * ww) / 100LL;
// get length of link text
long tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// get the text
char *txt = k->ptr_linkText;
// if it is anomalous, set this, we don't
//if ( k->m_isAnomalous )
// table->m_hashIffNotUnique = true;
// . hash the link text into the table
// . returns false and sets g_errno on error
// . do NOT hash singletons if they are in a phrase already!
// this way "yahoo groups" won't come up for a "yahoo" query
// and "new york times" won't come up for a "york" query
// . actually, this may backfire on us, so i reverted back!
// . we still have the score punish from # of words though!
if ( ! table->hash ( version ,
note ,
noteLen ,
NULL ,
0 ,
txt ,
tlen ,
final , // modified score
0x7fffffff , // maxScore
true , // doSpamDetection?
true , // hashSingleWords? ok.
true , // hashPhrases?
false , // hashAsWhole?
false , // useStems?
true , // useStopWords?
false , // hashIffUnique?
false , // hashWordIffNotInPhrase
// FIXME: need normaliztion Info (partap)
4 , // mp
NULL , // wwptr ,
NULL , // pptr ,
NULL , // bptr ,
NULL , // wgptr ,
true , // isLinkText?
countTable ,
NULL , // scoresPtr
20 , // numRepeatWords
siteNumInlinks,
niceness ))
return false;
*/
// turn this back off in case enabled
//table->m_hashIffNotUnique = false;
/*
if( !hasLinkText ) continue;
if ( ! table->hash ( titleRecVersion ,
"hash incoming link text for this field" ,
0 , // prefixLen1
field , // "linktextincoming:"
gbstrlen(field), // length of field
txt ,
txtLen ,
docQuality ,
TERMTABLE_MAXSCORE, // maxScore
true , // doSpamDetection?
true , // hashSingleWords? ok.
true , // hashPhrases?
false , // hashAsWhole?
false , // useStems?
true , // useStopWords?
false , // hashIffUnique?
false , // hashWordIffNotInPhrase
return false;
*/
/*
}
// . hash gbkeyword:numinlinks where score is # of inlinks from 1-255
// . do not hash gbkeyword:numinlinks if we don't got any
if ( ecount <= 0 ) return true;
// limit it since our score can't be more than 255 (8-bits)
if ( ecount > 255 ) ecount = 255;
// IndexList::set() converts our 32 bit score to 8-bits so we trick it!
long score = score8to32 ( (uint8_t)ecount );
// watch out for wrap
if ( score < 0 ) score = 0x7fffffff;
if ( ! table->hash ( version ,
"hash numinlinks" ,
15 ,
"gbkeyword" ,
9 ,
"numinlinks" ,
10 ,
score ,
TERMTABLE_MAXSCORE, // maxScore
false , // doSpamDetection?
true , // hashSingleWords? ok.
false , // hashPhrases?
false , // hashAsWhole?
false , // useStems?
false , // useStopWords?
false , // hashIffUnique?
false )) // hashWordIffNotInPhrase
return false;
return true;
}
*/
/*
long long getBoostFromLinkeeQuality ( char docQuality ) {
// hard code this
float fboost = 1.0;
float myQuality = (float)docQuality;
// do it different over 50, that is very nice quality...
while ( myQuality >= 50.0 ) {
myQuality--;
fboost *= 1.10;
}
// . every 14 pts over 30 doubles our boost
// . 44 --> x2
// . 58 --> x4
// . 72 --> x8
// . 86 --> x16
// . 100 --> x32
while ( myQuality >= 30.0 ) {
//myQuality *= 0.9;
myQuality--;
fboost *= 1.05;
}
// assign
return (long long )(fboost * 100.0);
}
*/
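// . set this Inlink from a Msg20Reply
// . the string ptrs below reference the reply's buffers, so the reply
//   must stay alive until we are serialize()'d into our own buffer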
void Inlink::set ( Msg20Reply *r ) {
// . these two things are used for version-based deserializing
	// . our current version stores getBaseNumStrings() strings
m_numStrings = getBaseNumStrings();
// and our current string offset
m_firstStrPtrOffset = (char *)getFirstStrPtr() - (char *)this;
// set ourselves now
m_ip = r->m_ip;
m_firstIp = r->m_firstIp;
m_wordPosStart = r->m_wordPosStart;
m_docId = r->m_docId;
m_firstSpidered = r->m_firstSpidered;
m_lastSpidered = r->m_lastSpidered;
//m_nextSpiderDate = r->m_nextSpiderTime;
m_datedbDate = r->m_datedbDate;
m_firstIndexedDate = r->m_firstIndexedDate;
m_numOutlinks = r->m_numOutlinks;
//m_baseScore = r->m_linkTextBaseScore;
//m_pagePop = r->m_pagePop;
//m_sitePop = r->m_sitePop;
//m_siteNumInlinks = r->m_siteNumInlinks;
//m_reserved1 = 0;
m_isPermalink = r->m_isPermalink;
m_outlinkInContent = r->m_outlinkInContent;
m_outlinkInComment = r->m_outlinkInComment;
m_isLinkSpam = r->m_isLinkSpam;
//m_isAnomaly = r->m_isAnomaly;
m_hasAllQueryTerms = r->m_hasAllQueryTerms;
m_recycled = r->m_recycled;
// usually the datedb date is a publication date, but it can also
// be a "modified" date. when the document was last modified. That
// is indicated by the last bit of the datedb date. it is clear if it
// is a Modified date, and it is set if it is a Publish date.
//m_datedbModified = r->m_datedbModified;
m_country = r->m_country;
m_language = r->m_language;
//m_docQuality = r->m_docQuality;
m_siteRank = r->m_siteRank;
//m_ruleset = r->m_ruleset;
m_hopcount = r->m_hopcount;
//m_linkTextScoreWeight = r->m_linkTextScoreWeight;
ptr_urlBuf = r->ptr_ubuf;
ptr_linkText = r->ptr_linkText;
ptr_surroundingText = r->ptr_surroundingText;
ptr_rssItem = r->ptr_rssItem;
ptr_categories = r->ptr_categories;
ptr_gigabitQuery = r->ptr_gigabitQuery;
ptr_templateVector = r->ptr_templateVector;
size_urlBuf = r->size_ubuf;
size_linkText = r->size_linkText;
size_surroundingText = r->size_surroundingText;
size_rssItem = r->size_rssItem;
size_categories = r->size_categories;
size_gigabitQuery = r->size_gigabitQuery;
size_templateVector = r->size_templateVector;
}
// Msg25 calls this to make a "fake" msg20 reply for recycling Inlinks
// that are no longer there... preserves rssInfo, etc.
void Inlink::setMsg20Reply ( Msg20Reply *r ) {
r->m_ip = m_ip;
r->m_firstIp = m_firstIp;
r->m_wordPosStart = m_wordPosStart;
r->m_docId = m_docId;
r->m_firstSpidered = m_firstSpidered;
r->m_lastSpidered = m_lastSpidered;
//r->m_nextSpiderTime = m_nextSpiderDate;
r->m_datedbDate = m_datedbDate;
r->m_firstIndexedDate = m_firstIndexedDate;
r->m_numOutlinks = m_numOutlinks;
//r->m_linkTextBaseScore = m_baseScore;
//r->m_pagePop = m_pagePop;
//r->m_sitePop = m_sitePop;
//r->m_siteNumInlinks = m_siteNumInlinks;
r->m_isPermalink = m_isPermalink;
r->m_outlinkInContent = m_outlinkInContent;
r->m_outlinkInComment = m_outlinkInComment;
r->m_isLinkSpam = m_isLinkSpam;
//r->m_isAnomaly = m_isAnomaly;
r->m_hasAllQueryTerms = m_hasAllQueryTerms;
r->m_country = m_country;
r->m_language = m_language;
//r->m_docQuality = m_docQuality;
r->m_siteRank = m_siteRank;
//r->m_ruleset = m_ruleset;
r->m_hopcount = m_hopcount;
//r->m_linkTextScoreWeight = m_linkTextScoreWeight;
r->ptr_ubuf = ptr_urlBuf;
r->ptr_linkText = ptr_linkText;
r->ptr_surroundingText = ptr_surroundingText;
r->ptr_rssItem = ptr_rssItem;
r->ptr_categories = ptr_categories;
r->ptr_gigabitQuery = ptr_gigabitQuery;
r->ptr_templateVector = ptr_templateVector;
r->size_ubuf = size_urlBuf;
r->size_linkText = size_linkText;
r->size_surroundingText = size_surroundingText;
r->size_rssItem = size_rssItem;
r->size_categories = size_categories;
r->size_gigabitQuery = size_gigabitQuery;
r->size_templateVector = size_templateVector;
}
// . set our string ptrs to point into the string buffer, walking the
//   stored sizes (uses our own trailing buffer if "buf" is NULL)
long Inlink::updateStringPtrs ( char *buf ) {
// point to our string buffer
char *p = buf;
// use our buf if none supplied
if ( ! p ) p = getStringBuf(); // m_buf;
// then store the strings!
long *sizePtr = getFirstSizeParm(); // &size_qbuf;
long *sizeEnd = getLastSizeParm (); // &size_displayMetas;
char **strPtr = getFirstStrPtr (); // &ptr_qbuf;
for ( ; sizePtr <= sizeEnd ; ) {
// convert the offset to a ptr
*strPtr = p;
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
sizePtr++;
strPtr++;
}
// return how many bytes we processed
return getBaseSize() + (p - getStringBuf());
}
void Inlink::reset ( ) {
// clear ourselves out
memset ( (char *)this,0,sizeof(Inlink) );
}
// . set a new Inlink from an older versioned Inlink
// . this is how we handle versioning
void Inlink::set2 ( Inlink *old ) {
	// clear ourselves
reset();
// copy what is legit to us
memcpy ( (char *)this , (char *)old , old->m_firstStrPtrOffset );
// set our offset to the string ptrs
m_firstStrPtrOffset = (char *)&ptr_urlBuf - (char *)this;
// and our base
m_numStrings = getBaseNumStrings();
// now copy over string ptrs
char *dst = (char *)this + m_firstStrPtrOffset;
char *src = (char *)old + old->m_firstStrPtrOffset;
memcpy ( dst , src , old->m_numStrings * 4 );
// and the sizes
dst += 4 * m_numStrings ;
src += 4 * old->m_numStrings ;
memcpy ( dst , src , old->m_numStrings * 4 );
// sanity tests. make sure they match up
//if ( old->ptr_urlBuf != ptr_urlBuf ) { char *xx=NULL;*xx=0; }
//if ( old->ptr_rssItem != ptr_rssItem ) { char *xx=NULL;*xx=0; }
// point to the old buf now, OldInlink::m_buf[]
src += 4 * old->m_numStrings ;
// update our string ptrs to reference into "old's" m_buf[]
updateStringPtrs ( src );
// log it
//logf(LOG_DEBUG,"build: setting new Inlink from old.");
// we can't do this sanity check because we cast "old" as an Inlink
// whereas before it was an older version of "Inlink"
//if ( old->size_urlBuf != size_urlBuf ) { char *xx=NULL;*xx=0; }
}
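// . a serialized Inlink is laid out as: the fixed fields (the first
//   m_firstStrPtrOffset bytes), then m_numStrings string ptrs (4 bytes
//   each), then m_numStrings string sizes (4 bytes each), then the
//   string data itself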
long Inlink::getStoredSize ( ) {
//long size = (long)sizeof(Msg);
//long size = getBaseSize();
long size = m_firstStrPtrOffset;
// add in string AND size ptrs
size += 8 * m_numStrings;
// add up string buffer sizes
//long *sizePtr = getFirstSizeParm(); // &size_qbuf;
//long *sizeEnd = getLastSizeParm (); // &size_displayMetas;
long *sizePtr =
(long *)((char *)this + m_firstStrPtrOffset + 4*m_numStrings);
long *sizeEnd = sizePtr + m_numStrings;
for ( ; sizePtr < sizeEnd ; sizePtr++ )
size += *sizePtr;
return size;
}
// . return ptr to the buffer we serialize into
// . return NULL and set g_errno on error
char *Inlink::serialize ( long *retSize ,
char *userBuf ,
long userBufSize ,
bool makePtrsRefNewBuf ) {
// make a buffer to serialize into
char *buf = NULL;
long need = getStoredSize();
// big enough?
if ( need <= userBufSize ) buf = userBuf;
// alloc if we should
if ( ! buf ) buf = (char *)mmalloc ( need , "Ra" );
// bail on error, g_errno should be set
if ( ! buf ) return NULL;
// set how many bytes we will serialize into
*retSize = need;
// copy the easy stuff
char *p = buf;
memcpy ( p , (char *)this , getBaseSize() );
p += getBaseSize();
// then store the strings!
long *sizePtr = getFirstSizeParm(); // &size_qbuf;
long *sizeEnd = getLastSizeParm (); // &size_displayMetas;
char **strPtr = getFirstStrPtr (); // &ptr_qbuf;
for ( ; sizePtr <= sizeEnd ; ) {
// if we are NULL, we are a "bookmark", so
// we alloc'd space for it, but don't copy into
		// the space until after this call to serialize()
if ( ! *strPtr ) goto skip;
// sanity check -- cannot copy onto ourselves
if ( p > *strPtr && p < *strPtr + *sizePtr ) {
char *xx = NULL; *xx = 0; }
// copy the string into the buffer
memcpy ( p , *strPtr , *sizePtr );
skip:
// . make it point into the buffer now
// . MDW: why? that is causing problems for the re-call in
// Msg3a, it calls this twice with the same "m_r"
if ( makePtrsRefNewBuf ) *strPtr = p;
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
sizePtr++;
strPtr++;
}
return buf;
}
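// . sketch of how makeLinkInfo() above serializes each Inlink into a
//   caller-owned buffer (names as used there):
//
//       long wrote;
//       char *s = k.serialize ( &wrote , p , pend - p , true );
//       if ( s != p ) { ... }  // serialize() had to malloc; buf too small
//       p += wrote;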
// used by PageTitledb.cpp
bool LinkInfo::print ( SafeBuf *sb , char *coll ) {
//char buf [1024];
//char buf2[1024];
//char buf3[MAX_RSSITEM_SIZE]; // 30000?
//char buf4[1024];
long count = 1;
// loop through the link texts
for ( Inlink *k = NULL; (k = getNextInlink(k)) ; count++ ) {
char *s = k->ptr_linkText;
long slen = k->size_linkText - 1;
char *d = k->ptr_surroundingText;
long dlen = k->size_surroundingText - 1;
char *r = k->ptr_rssItem;
long rlen = k->size_rssItem - 1;
char *g = k->ptr_gigabitQuery;
long glen = k->size_gigabitQuery - 1;
char *c = k->ptr_categories;
long clen = k->size_categories - 1;
if ( slen < 0 ) slen = 0;
if ( dlen < 0 ) dlen = 0;
if ( rlen < 0 ) rlen = 0;
if ( glen < 0 ) glen = 0;
if ( clen < 0 ) clen = 0;
if ( ! c || clen <= 0 ) c = "";
// encode the rss item further
char buf3b[MAX_RSSITEM_SIZE*2];
buf3b[0] = 0;
if ( rlen > 0 )
htmlEncode ( buf3b ,
buf3b + MAX_RSSITEM_SIZE*2 ,
r , // buf3 ,
r + rlen , // buf3 + gbstrlen(buf3) ,
true );
// . print link text and score into table
// . there is a ton more info in Inlink class to print if
// you want to throw it in here...
sb->safePrintf(
"<tr><td colspan=2>link #%04li "
"("
//"baseScore=%010li, "
"d=<a href=\"/master/titledb?c=%s&"
"d=%lli\">%016lli</a>, "
"siterank=%li, "
"hopcount=%03li "
"outlinks=%05li, "
"ip=%s "
//"pagepop=%li "
//"sitepop=%li "
"numLinksToSite=%li "
//"anomaly=%li "
"<b>url</b>=\"%s\" "
"<b>txt=</b>\""
"%s\" "
"<b>neigh=</b>\"%s\" "
"<b>rssItem</b>=\"%s\" "
"<b>gigabits</b>=\"%s\" "
"<b>categories</b>=\"%s\" "
//"<b>templateVec=\"</b>%s\" "
"</td></tr>\n",
count ,
//(long)k->m_baseScore ,
coll ,
k->m_docId,
k->m_docId,
//(long)k->m_docQuality,
(long)k->m_siteRank,
(long)k->m_hopcount,
(long)k->m_numOutlinks ,
iptoa(k->m_ip),
//(long)k->m_pagePop,
//(long)k->m_sitePop,
(long)k->m_siteNumInlinks,
//(long)k->m_isAnomaly,
k->ptr_urlBuf, // the linker url
s, // buf,
d, // buf2,
buf3b,
g, // buf4,
c
);
}
return true;
}
//long getNumLinksToSite ( long q ) {
// if ( q <= 20 ) return 0; return (1 << ((q - 20)/5)); };
//long getSitePop ( long q ) { return getNumLinksToSite ( q ) * 5; };
// . total pop of all the inlinkers to the page
// . assume 20 times less than site pop
//long getPagePop ( long q ) { return getSitePop ( q ) / 20; }
/*
long LinkInfo::computePagePop ( Url *u , char *coll ) {
// get our site hash. use domain for now
long dh = u->getDomainHash32 ( );
// store pagepop into "sum"
long sum = 0;
for ( Inlink *k=NULL;(k=getNextInlink(k));) {
// get the url
Url u2; u2.set ( k->ptr_urlBuf , k->size_urlBuf - 1);
// get the site hash
long dh2 = u2.getDomainHash32 ( );
// skip if it is from our same site
if ( dh2 == dh ) continue;
// one point for having him
sum++;
// and we inherit his points as well
sum += k->m_pagePop;
}
return sum;
}
*/
bool LinkInfo::hasRSSItem() {
for ( Inlink *k=NULL;(k=getNextInlink(k));)
// rss item?
if ( k->size_rssItem > 10 ) return true;
return false;
}
///////////////
//
// LINKS CLASS
//
///////////////
// . use siteRec to set m_extractRedirects from <extractRedirectsFromLinks>
// . this is used because yahoo has links like:
// yahoo.com/drst/pop/2/10/576995/37640326/*http://www.m.com/
// . we need yahoo's link: terms to be more precise for our ban/unban algorithm
static long getLinkBufferSize(long numLinks);
Links::Links(){
m_allocBuf = NULL;
m_allocSize = 0;
m_linkBuf = NULL;
m_allocLinks = 0;
m_spamNote = NULL;
m_numLinks = 0;
m_baseUrl = NULL;
m_numOutlinksAdded = 0;
}
Links::~Links() {
reset();
}
void Links::reset() {
if (m_allocBuf) mfree(m_allocBuf, m_allocSize, "Links");
m_allocBuf = NULL;
m_allocSize = 0;
if (m_linkBuf) mfree(m_linkBuf, getLinkBufferSize(m_allocLinks),
"Links");
m_linkBuf = NULL;
m_allocLinks = 0;
m_spamNote = NULL;
m_numLinks = 0;
m_flagged = false;
m_hasRSS = false;
m_isFeedBurner = false;
m_hasSelfPermalink = false;
m_hasRSSOutlink = false;
m_hasSubdirOutlink = false;
m_rssOutlinkPtr = NULL;
}
bool Links::set ( bool useRelNoFollow ,
Xml *xml , Url *parentUrl , bool setLinkHash ,
//bool useBaseHref ,
// use null for this if you do not want to use it
Url *baseUrl ,
long version ,
long niceness ,
//bool addSiteRootFlags ,
//char *coll ,
bool parentIsPermalink ,
Links *oldLinks ,
bool doQuickSet ) {
reset();
// always force this to true now since we need them for linkdb
setLinkHash = true;
if ( doQuickSet ) setLinkHash = false;
m_xml = xml;
m_baseUrl = parentUrl;
m_parentUrl = parentUrl;
m_doQuickSet = doQuickSet;
m_parentIsPermalink = parentIsPermalink;
m_baseSite = NULL;
m_baseSiteLen = 0;
// default to domain, not hostname
//if ( m_baseUrl && addSiteRootFlags )
// m_baseSite = m_baseUrl->getSite ( &m_baseSiteLen, coll, false);
//m_addSiteRootFlags = addSiteRootFlags;
//m_coll = coll;
// sanity check
//if ( addSiteRootFlags && ! coll ) { char *xx=NULL;*xx=0; }
m_numLinks = 0;
m_numNodes = xml->getNumNodes();
m_bufPtr = NULL;
//m_buf[0] = '\0';
//bool useRelNoFollow = true;
//if ( sx ) useRelNoFollow = sx->getBool ("useRelNoFollow",true);
m_linksToGigablast = false;
m_hasRelNoFollow = false;
// unknown if baseUrl is a permalink or not
//m_isPermalink = -1;
//char utf8Buf[MAX_URL_LEN+1];
//long utf8Len = 0;
// this is not a good thing, don't strip here, but we did it for
// version 51 so we have to keep doing it for version 51. we strip
// the session id in Msg10.cpp in the addUrlLoop().
//if ( version == 51 )
// m_stripIds = true;
//else
m_stripIds = false;
// ok, let's remove it for the links: hashing, it just makes more
// sense this way i think. we can normalize the links: terms in the
// query if you are worried about it.
//if ( version >= 54 ) m_stripIds = true;
m_stripIds = true;
// get the <base href=> tag if any (12)
if ( baseUrl ) m_baseUrl = baseUrl;
// count the dirty links
//m_numDirtyLinks = 0;
// get this from the xml of the siteRec
//m_extractRedirects = sx->getBool ("extractRedirectsFromLinks",false);
//bool gotIt = false;
//for ( long i=0; i < m_numNodes ; i++ ) {
// if ( xml->getNodeId ( i ) != TAG_FBORIGLINK ) continue;
// gotIt = true;
// break;
//}
// visit each node in the xml tree. a node can be a tag or a non-tag.
char *urlattr = NULL;
for ( long i=0; i < m_numNodes ; i++ ) {
QUICKPOLL(niceness);
// . continue if this tag ain't an <a href> tag
// . atom feeds have a <link href=""> field in them
long id = xml->getNodeId ( i );
if ( id != TAG_A &&
id != TAG_LINK &&
id != TAG_AREA &&
id != TAG_ENCLOSURE &&
id != TAG_WEBLOG &&
id != TAG_URLFROM && // <UrlFrom> for ahrefs.com
id != TAG_FBORIGLINK )
continue;
urlattr = "href";
if ( id == TAG_WEBLOG ) urlattr ="url";
if ( id == TAG_FBORIGLINK ) m_isFeedBurner = true;
// if it's a back tag continue
if ( xml->isBackTag ( i ) ) continue;
// reset
linkflags_t flags = 0;
// . if it has rel=nofollow then ignore it
// . for old titleRecs we should skip this part so that the
// link: terms are indexed/hashed the same way in XmlDoc.cpp
long slen;
char *s ;
if ( useRelNoFollow ) s = xml->getString ( i , "rel", &slen ) ;
if ( useRelNoFollow &&
slen==8 && // ASCII
strncasecmp ( s,"nofollow", 8 ) == 0) {
// if this flag is set then::hasSpamLinks() will always
// return false. the site owner is taking the necessary
// precautions to prevent link spam.
m_hasRelNoFollow = true;
// . do not ignore it now, just flag it
// . fandango has its ContactUs with a nofollow!
flags |= LF_NOFOLLOW;
}
// get the href field of this anchor tag
long linkLen;
char *link = (char *) xml->getString ( i, urlattr, &linkLen );
// does it have the link after the tag?
//long tagId = xml->getNodeId(i);
// skip the block below if we got one in the tag itself
//if ( linkLen ) tagId = 0;
// if no href, but we are a <link> tag then the url may
// follow, like in an rss feed.
if ( linkLen==0 &&
(id == TAG_LINK ||
id == TAG_URLFROM ||
id == TAG_FBORIGLINK) ) {
// get the <link> node
char *node = xml->getNode(i);
long nodeLen = xml->getNodeLen(i);
// but must NOT end in "/>" then
if ( node[nodeLen-2] == '/' ) continue;
// expect the url like <link> url </link> then
if ( i+2 >= m_numNodes ) continue;
if ( xml->getNodeId(i+2) != id ) continue;
if ( ! xml->isBackTag(i+2) ) continue;
// ok assume url is next node
link = xml->getNode(i+1);
linkLen = xml->getNodeLen(i+1);
// watch out for CDATA
if ( linkLen > 12 &&
strncasecmp(link, "<![CDATA[", 9) == 0 ) {
link += 9;
linkLen -= 12;
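// (the 12 bytes trimmed here cover the 9-byte "<![CDATA[" prefix
// plus the 3-byte "]]>" suffix, leaving just the url text)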
}
}
// was it an enclosure?
//if ( linkLen == 0 && xml->getNodeId( i ) == TAG_XMLTAG )
// link = (char *) xml->getString ( i, "url", &linkLen );
// . it doesn't have an "href" field (could be "name" field)
// . "link" may not be NULL if empty, so use linkLen
if ( linkLen == 0 )
continue;
// skip spaces in the front (should be utf8 compatible)
while ( linkLen > 0 && is_wspace_a(*link) ) {
link++;
linkLen--;
}
// don't add this link if it begins with javascript:
if ( linkLen >= 11 && strncasecmp (link,"javascript:",11) ==0){
// well... a lot of times the provided function has
// the url as an arg to a popup window
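// e.g. onclick="window.open('/photos/123.html')" (a hypothetical
// example); the scan below just grabs the first single-quoted
// string that starts with a '/'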
long oclen = 0;
char *oc = xml->getString(i,"onclick",&oclen);
// if none, bail
if ( ! oc ) continue;
// set end
char *ocend = oc + oclen - 2;
char *ocurl = NULL;
// scan for "'/" which should indicate the url
for ( ; oc < ocend ; oc++ ) {
if ( *oc !='\'' ) continue;
if ( oc[1]!='/' ) continue;
// set the start
ocurl = oc + 1;
// and stop the scan
break;
}
// if none, bail
if ( ! ocurl ) continue;
// now find the end of the url
char *ocurlend = ocurl + 1;
for ( ; ocurlend < ocend ; ocurlend++ )
if ( *ocurlend == '\'' ) break;
// assign it now
link = ocurl;
linkLen = ocurlend - ocurl;
// and continue
}
if ( linkLen == 0 )
continue;
// it's a page-relative link
if ( link[0]=='#' ) continue;
// ignore mailto: links
if ( linkLen >= 7 && strncasecmp( link , "mailto:" , 7 ) == 0 )
continue;
// make sure not too many links already
//if ( version < 72 && m_numLinks >= MAX_LINKS ) break;
QUICKPOLL(niceness);
// if we have a sequence of alnum chars (or hyphens) followed
// by a ':' then that is a protocol. we only support http and
// https protocols right now. let "p" point to the ':'.
char *p = link;
long pmaxLen = linkLen;
if ( pmaxLen > 20 ) pmaxLen = 20;
char *pend = link + pmaxLen;
while ( p < pend && (is_alnum_a(*p) || *p=='-') ) p++;
// is the protocol, if it exists, a valid one like http or
// https? if not, ignore it. we only support FQDNs
// (fully qualified domain names) here really. so if you
// have something like mylocalhostname:8000/ it is not going
// to work anymore. you would need "use /etc/hosts" enabled
// for that to work, too.
bool proto = true;
if ( p < pend && *p == ':' ) { // && version >= 62 ) {
proto = false;
long plen = p - link;
if ( plen == 4 && strncasecmp(link,"http" ,plen) == 0 )
proto = true;
if ( plen == 5 && strncasecmp(link,"https",plen) == 0 )
proto = true;
}
// skip if proto invalid like callto:+4355645998 or
// mailto:jimbob@hoho.com
if ( ! proto ) continue;
// add it
char ptmp [ MAX_URL_LEN + 1 + 1 ];
// keep an underpad of 1 byte in case we need to prepend a /
char *tmp = ptmp + 1;
if ( linkLen > MAX_URL_LEN ) {
// only log this once just so people know, don't spam
// the log with it.
static bool s_flag = 1;
if ( s_flag ) {
s_flag = 0;
log(LOG_INFO, "build: Link len %li is longer "
"than max of %li. Link will not "
"be added to spider queue or "
"indexed for link: search.",
linkLen,(long)MAX_URL_LEN);
}
continue;
}
// see if the <link> tag has a "type" field
bool isRSS = false;
long typeLen;
char *type =(char *)xml->getString(i, "type", &typeLen );
// . MDW: imported from Xml.cpp:
// . check for valid type:
// type="application/atom+xml" (atom)
// type="application/rss+xml" (RSS 1.0/2.0)
// type="application/rdf+xml" (RDF)
// type="text/xml" (RSS .92) support?
// compare
if ( type ) {
if (strncasecmp(type,"application/atom+xml",20)==0)
isRSS=true;
if (strncasecmp(type,"application/rss+xml" ,19)==0)
isRSS=true;
// doesn't seem like good rss
//if (strncasecmp(type,"application/rdf+xml" ,19)==0)
// isRSS=true;
if (strncasecmp(type,"text/xml",8)==0)
isRSS=true;
}
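// e.g. the usual autodiscovery form is (hypothetical markup):
// <link rel="alternate" type="application/rss+xml" href="/feed">
// the rel="alternate" requirement is checked just below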
long relLen = 0;
char *rel = NULL;
// make sure we got rel='alternate' or rel="alternate", etc.
if ( isRSS ) rel = xml->getString(i,"rel",&relLen);
// compare
if ( rel && strncasecmp(rel,"alternate",9) != 0 )
isRSS = false;
// skip if a reply! rss feeds have these links to comments
// and just ignore them for now
if ( rel && strncasecmp(rel,"replies",7)==0 )
continue;
// http://dancleary.blogspot.com/feeds/posts/default uses edit:
if ( rel && strncasecmp(rel,"edit",4)==0 )
continue;
// . if type exists but is not rss/xml, skip it. probably
// javascript, css, etc.
// . NO! i've seen this to be type="text/html"!
//if ( ! isRSS && type ) continue;
// store it
if ( isRSS ) m_hasRSS = true;
// JAB: warning abatement
//unsigned char flags = 0;
//TODO: should we urlEncode here?
// i didn't know this, but links can have encoded html entities
// like &amp; and &gt; etc. in them and we have to decode
// assign the new decoded length.
// this is not compatible with m_doQuickSet because we store
// the "link" ptr into the array of link ptrs, and this uses
// the "tmp" buf.
// nono, need this now otherwise it hits that linkNode<0
// error msg in XmlDoc.cpp. but for Msg13 spider compression
// you might want to do something else then i guess...
//if ( ! m_doQuickSet ) {
linkLen = htmlDecode ( tmp ,
link ,
linkLen,
false,
niceness );
// use tmp buf
link = tmp;
//}
if (!addLink ( link , linkLen , i , setLinkHash ,
version , niceness , isRSS , id , flags ))
return false;
// get the xml node
//XmlNode *node = m_xml->getNodePtr(i);
// set this special member
//node->m_linkNum = m_numLinks - 1;
// set the flag if it is an RSS link
}
// . flag the links we have that are old (spidered last time)
// . set LF_OLDLINK flag
if ( ! flagOldLinks ( oldLinks ) ) return false;
return true;
}
// just a NULL-terminated text buffer/file of links to add
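// the buffer is just whitespace-separated urls, e.g. (hypothetical):
// "http://example.com/\nhttp://example.org/page.html\n"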
bool Links::set ( char *buf , long niceness ) { //char *coll,long niceness ) {
reset();
// need "coll" for Url::isSiteRoot(), etc.
//m_coll = coll;
m_parentUrl = NULL;
m_baseUrl = NULL;
m_addSiteRootFlags = false;
m_xml = NULL;
char *p = buf;
while ( *p ) {
// skip spaces
while ( *p && is_wspace_a(*p) ) p++;
// get the length of the link
char *q = p;
while ( *q && ! is_wspace_a(*q) ) q++;
long len = q - p;
// add the link
if ( ! addLink ( p , len , -1 , false ,
TITLEREC_CURRENT_VERSION , niceness, false,
TAG_A , 0 ) )
return false;
// advance
p = q;
}
// assume none are flagged as old, LF_OLDLINK
m_flagged = true;
return true;
}
// . the blogroll must consist of 2 outlinks to two different external blogs
// in order to be a valid blogroll
// . add all the site root outlinks in the valid blogroll into the
// turk queue so they can be manually reviewed
// . use ruleset "66" to indicate not a blog? that will help us identify
// false blogrolls.
bool Links::queueBlogRoll ( TagRec **tagRecPtrs , long niceness ) {
// sanity check, we must have set Links with this as true!
if ( ! m_addSiteRootFlags ) { char *xx=NULL;*xx=0; }
// do not add blogroll if any of our outlinks are banned
for ( long i = 0 ; i < m_numLinks ; i++ ) {
// skip if none
if ( ! tagRecPtrs[i] ) continue;
// return if any one outlink is banned
if ( tagRecPtrs[i]->getLong("manualban",0) ) return true;
}
// how many nodes do we have?
long nn = m_xml->getNumNodes();
// count the link #
long j = 0;
// the loop over the nodes
long i = 0;
// come back up here if the evaluated blogroll was no good
loop:
// we record the depth of the first valid blog outlink
// and as soon as another link is at a different depth, we terminate
// that particular blogroll, and break out of the loop to evaluate it
long validDepth = -1;
// start of the blog roll (only one allowed)
long start = -1;
// domain hash of the first valid blog outlink
uint32_t dh = 0;
// saved depth of the section
long sd = -1;
// is it NOT a blog roll
char notBlogRoll = 0;
// count the valid blog outlinks
long count = 0;
// loop over every node
for ( ; i < nn ; i++ ) {
// take a breath
QUICKPOLL(niceness);
// skip if we are not a link, otherwise, we are link #j
if ( i != m_linkNodes[j] ) continue;
// inc j so we keep our scan in sync
j++;
// get the xml node
XmlNode *node = m_xml->getNodePtr(i);
// get the "depth" of this link
long depth = node->m_depth;
// if we had encountered a valid blog outlink, and this
// particular outlink is NOT at the same depth, then assume
// that our blogroll is over... and break out to evaluate it
if ( start >= 0 && depth != validDepth ) {
// if we did not have 2+ valid blog outlinks,
// reset and keep going
if ( count < 2 ) goto loop;
// if we had a blogroll, it's over now. break out
// and we can see if it was a valid blogroll.
break;
}
// skip if not a blog roll cuz we got a non-blog outlink
if ( notBlogRoll ) continue;
// skip if from the same domain as this page is
if ( m_linkFlags[j] & LF_SAMEDOM ) continue;
// skip if not a site root or a root url
if ( ! (m_linkFlags[j] & LF_SITEROOT) ) continue;
// is this outlink a blog?
long b = tagRecPtrs[j]->getLong("blog",-1);
// skip if NOT a blog for sure
if ( b == 0 ) {
// set the flag to indicate not a blogroll
notBlogRoll = 1;
continue;
}
// skip if unknown whether it is a blog
if ( b == -1 ) continue;
// normalize it
Url link; link.set ( m_linkPtrs[j] , m_linkLens[j] );
// get domain
char *dom = link.getDomain ();
long dlen = link.getDomainLen();
// must have domain, a valid TLD, otherwise this is NULL
if ( ! dom || dlen == 0 ) continue;
// . save the spot of the first valid blog outlink
// . save his depth too
if ( start < 0 ) { start = j; sd = depth; }
// get domain hash
uint32_t linkDomHash = hash32 ( dom , dlen );
// if we are NOT the start of a blogroll, make sure we got
// a distinct domain...
if ( linkDomHash == dh ) continue;
// count it as a prospect
count++;
// save his depth
//sd = depth;
// store the domain hash, so that we can make sure to get
// to outlinks from two different domains
dh = linkDomHash;
}
// bail if no valid blog roll found
if ( start < 0 ) return true;
// point to the link right before our valid blog outlink
long k = start - 1;
// backup to find the start of the blogroll, keep depth the same
for ( ; k >= 0 ; k-- ) {
// get link node #
i = m_linkNodes[k];
// get the xml node
XmlNode *node = m_xml->getNodePtr(i);
// back up until depth !=
if ( node->m_depth != sd ) break;
}
// . now link #k+1 is the first link in the blogroll
// . and link #j-1 is the last link in the blogroll
//
// TODO: classify as "newsy" if site has a lot of pgs w/ pub dates
//
// index the outlinks in the blogroll, which are roots of sites
// as gbblogroll:<link> and make sure we don't split it.
for ( long i = k + 1 ; i < j ; i++ ) {
// skip if from the same domain
if ( m_linkFlags[i] & LF_SAMEDOM ) continue;
// it must be the root of a "site" too
if ( ! (m_linkFlags[i] & LF_SITEROOT) ) continue;
// must be unidentified
long rs = tagRecPtrs[i]->getLong("ruleset",-1);
// skip if it is already classified in a ruleset
if ( rs != -1 ) continue;
// the link
char *p = m_linkPtrs[i];
long plen = m_linkLens[i];
// tmp NULL
char c = p[plen]; p[plen] = '\0';
// vote on these tagids
//long tagIds[] = { ST_BLOG , ST_NEWS };
// . add it to turk
// . buzz has mostly blogs and mainstream media news sites
/*
g_turk.addUrl ( p ,
0LL , // docid
NULL , // TagRec
tagIds , // tagIds to test for
2 );
*/
// put it back
p[plen] = c;
// now make the score
//unsigned char score = QUALITY;
// . now hash with our score
// . this should only be called by XmlDoc::hashNoSplit()
//if ( ! link.hashAsLink ( version ,
// table ,
// NULL ,
// 0 ,
// score ,
// false , // internal?
// "gbblogroll:" ,
// indexSiteLinks ) )
// return false;
}
// success
return true;
}
// . should this page be considered dirty based on its dirty links
// . ?we should avoid adding dirty pages to the index?
// . we should not add ANY of links if enough are dirty
// . we should not hash link: terms for dirty pages
//bool Links::isPageDirty ( ) {
// if ( m_numLinks < 5 ) return ( m_numDirtyLinks >= 2 ) ;
// // get percent dirty
// long percent = ( m_numDirtyLinks * 100 ) / m_numLinks ;
// return ( percent >= 10 );
//}
bool Links::addLink ( char *link , long linkLen , long nodeNum ,
bool setLinkHash , long titleRecVersion ,
long niceness , bool isRSS , long tagId ,
long flagsArg ){
// don't add 0 length links
if ( linkLen <= 0 ) return true;
// ensure buf has enough room
// if (titleRecVersion < 72){
// if ( m_bufPtr-m_buf + linkLen + 1 > LINK_BUF_SIZE ){
// return true;
// }
// }
// do we need to alloc more link space?
if (m_numLinks >= m_allocLinks) {
long newAllocLinks;
// older titlerecs can't hold more than 10000 links
//if(titleRecVersion<72 && m_allocLinks >= 10000) return true;
if (!m_allocLinks) newAllocLinks =10000;
else if (m_allocLinks<100000) newAllocLinks =m_allocLinks*2;
else newAllocLinks =m_allocLinks+100000;
// how much mem do we need for newAllocLinks links?
long newAllocSize = getLinkBufferSize(newAllocLinks);
QUICKPOLL(niceness);
char *newBuf = (char*)mmalloc(newAllocSize, "Links");
if (!newBuf) return false;
QUICKPOLL(niceness);
// a ptr to it
char *p = newBuf;
// debug msg
log(LOG_DEBUG, "build: resizing Links ptr buffer to %ld",
newAllocSize);
char **newLinkPtrs = (char**)p;
p += newAllocLinks * sizeof(char *) ;
long *newLinkLens = (long*)p;
p += newAllocLinks * sizeof(long) ;
long *newLinkNodes = (long*)p;
p += newAllocLinks * sizeof(long) ;
uint64_t *newLinkHashes = (uint64_t *)p;
p += newAllocLinks * sizeof(uint64_t) ;
uint64_t *newHostHashes = (uint64_t *)p;
p += newAllocLinks * sizeof(uint64_t) ;
long *newDomHashes = (long *)p;
p += newAllocLinks * sizeof(long);
linkflags_t *newLinkFlags = (linkflags_t *)p;
p += newAllocLinks * sizeof(linkflags_t) ;
char **newSpamNotes = (char **)p;
p += newAllocLinks * sizeof(char **);
// sanity check -- check for breach
if ( p > newBuf + newAllocSize ) { char *xx = NULL; *xx = 0; }
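// note: all of the parallel arrays carved out above live in this single
// newBuf allocation, in the same order and with the same sizes that
// getLinkBufferSize() adds up, so they are grown and freed as one block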
if (m_linkBuf){
memcpy(newLinkPtrs, m_linkPtrs,
m_numLinks * sizeof(char*));
QUICKPOLL(niceness);
memcpy(newLinkLens, m_linkLens,
m_numLinks * sizeof(long));
QUICKPOLL(niceness);
memcpy(newLinkNodes, m_linkNodes,
m_numLinks * sizeof(long));
QUICKPOLL(niceness);
memcpy(newLinkHashes, m_linkHashes,
m_numLinks * sizeof(uint64_t));
QUICKPOLL(niceness);
memcpy(newHostHashes, m_hostHashes,
m_numLinks * sizeof(uint64_t));
QUICKPOLL(niceness);
memcpy(newDomHashes, m_domHashes,
m_numLinks * sizeof(long));
QUICKPOLL(niceness);
memcpy(newLinkFlags, m_linkFlags,
m_numLinks * sizeof(linkflags_t));
QUICKPOLL(niceness);
memcpy(newSpamNotes,m_spamNotes,
m_numLinks * sizeof(char *));
long oldSize = getLinkBufferSize(m_allocLinks);
mfree(m_linkBuf, oldSize, "Links");
QUICKPOLL(niceness);
}
m_allocLinks = newAllocLinks;
m_linkBuf = newBuf;
m_linkPtrs = newLinkPtrs;
m_linkLens = newLinkLens;
m_linkNodes = newLinkNodes;
m_linkHashes = newLinkHashes;
m_hostHashes = newHostHashes;
m_domHashes = newDomHashes;
m_linkFlags = newLinkFlags;
m_spamNotes = newSpamNotes;
}
// quickset?
/*
if ( m_doQuickSet ) {
m_linkPtrs [ m_numLinks ] = link;
m_linkLens [ m_numLinks ] = linkLen;
m_linkNodes [ m_numLinks ] = nodeNum;
m_numLinks++;
return true;
}
*/
// normalize the link and prepend base url if needed
Url url;
// if our base url is http://poo.com/user/register/user/register/...
// and this link is "user/register/", for instance, we got a link loop.
// the "webmaster" really meant "/user/register" to be the link.
/*
char fix = false;
//if ( titleRecVersion >= 79 && link[0] != '/' ) {
if ( && link[0] != '/' ) {
// temporarily NULL terminate the link
char c = link[linkLen];
link[linkLen] = '\0';
// get the base url path
char *path = m_baseUrl->getPath();
long plen = m_baseUrl->getPathLen();
char *pend = path + plen;
// is this relative path is repeated in the base url?
char *p = strnstr ( path , // haystack
link , // needle
plen ); // haystackSize
//char *p2 = strnstr (
if ( p )
log("hey");
// advance over the needle in the haystack
if ( p )
p += linkLen;
// but if the relative url contains a / it only needs to
// be repeated once
if ( strchr ( link , '/' ) )
fix = true;
// is it repeated again after that? making it twice repeated?
if ( p && ! fix && strnstr ( p , // haystack
link , // needle
pend - p )) // haystackSize
fix = true;
// put the char back
link[linkLen] = c;
}
char c;
if ( fix ) {
link--;
c = *link;
*link = '/';
linkLen++;
}
*/
// . let's force the www (back to the old ways)
// . before i wasn't, but root urls w or w/o the www can be dynamic
// pages that really are the same, but had a different ad or whatever
//url.set ( m_baseUrl , link , linkLen , true/*addWWW?*/, m_stripIds ,
// MDW: let's try turning it off. slashdot.org's outlinkers are all
// seen as external because we force the 'www' on them here and the
// m_baseUrl does NOT have the 'www'. we now check for "www dups" in
// Msg16.cpp so that should alleviate the issue discussed right above.
//bool addWWW = true;
//if ( titleRecVersion >= 99 ) addWWW = false;
//bool addWWW = false;
bool addWWW = true;
url.set ( m_baseUrl ,
link ,
linkLen ,
addWWW , // addWWW?
m_stripIds ,
// now i strip this thang because the rss
// feeds have a link to every comment but it is
// really the same url...
true , // stripPound?
// convert /index.html to /
// turned this back on per john's request
// will cause undeletable data in existing indexes.
true , // stripCommonFile?
titleRecVersion );// used for removing session ids
// refix the link
//if ( fix ) *link = c;
// sometimes there's links like:
// http://'+ycso[8]+ \n'commentsn?blog_id=... which is within
// <script></script> tags
if ( url.getDomainLen() <= 0 || url.getHostLen() <= 0 ) return true;
// count dirty links
//if ( url.isDirty() ) m_numDirtyLinks++;
// . a lot of links are redirect to other places
// . yahoo.com/drst/pop/2/10/576995/37640326/*http://www.m.com/
// . find the asterix before the url to redirect to if we should
// . if we had a known redirect url in the link, set url class to it
// . this makes the link: search work much better
// . that means we can use yahoo to tell us banned/unbanned sites
//char *s ;
//if ( m_extractRedirects && (s=strchr(url.getUrl(),'*')) ) {
// // . this is really just for yahoo, but we could eventually
// // use an arbitrary delimiter in the site file
// // . skip the *http:/
// s += 7;
// char buf[MAX_URL_LEN];
// strcpy ( buf , s );
// long blen = gbstrlen(buf);
// // . this was causing problems!
// // . sometimes yahoo has nothing after the '*'
// if ( blen == 0 ) return;
// // it must start with http:
// url.set ( buf, blen , true, m_stripIds);
//}
// debug TODO: fix a href=\"http://www.thecounter.com"\ thangs
// if ( url.getUrl()[0] !='h' ) sleep(10);
// ensure buf has enough room
// if (titleRecVersion < 72) {
// if ( m_bufPtr + url.getUrlLen() + 1 >= m_buf+LINK_BUF_SIZE )
// return true;
// }
// this is now set to 0 in XmlNode.cpp
// make sure it is valid
//if ( nodeNum >= 0 )
// // reset this
// m_xml->m_nodes[nodeNum].m_isSelfLink = 0;
// Allocate more link buffer space?
//long bufSize = m_allocSize+LINK_BUF_SIZE;
//long bufSpace = m_allocBuf?m_allocSize - (m_bufPtr-m_allocBuf):0;
long bufSpace ;
if ( m_allocBuf ) bufSpace = m_allocSize - (m_bufPtr-m_allocBuf);
else bufSpace = 0;
// allocate dynamic buffer for lotsa links
if ( url.getUrlLen() + 1 > bufSpace ) {
//if (titleRecVersion < 72 && m_allocSize >= LINK_BUF_SIZE)
// return true;
// grow by 100K
long newAllocSize;// = m_allocSize+LINK_BUF_SIZE;
if ( ! m_allocSize ) newAllocSize = LINK_BUF_SIZE;
else if (m_allocSize < 1024*1024) newAllocSize = m_allocSize*2;
else newAllocSize = m_allocSize + 1024*1024;
// MDW: a realloc would be more efficient here.
QUICKPOLL(niceness);
char *newBuf = (char*)mmalloc(newAllocSize, "Links");
if ( ! newBuf ) return log("build: Links failed to realloc.");
log(LOG_DEBUG, "build: resizing Links text buffer to %ld",
newAllocSize);
QUICKPOLL(niceness);
if ( m_allocBuf ) {
QUICKPOLL(niceness);
memcpy ( newBuf , m_allocBuf , m_allocSize );
QUICKPOLL(niceness);
// update pointers to previous buffer
long offset = newBuf - m_allocBuf;
char *allocEnd = m_allocBuf + m_allocSize;
for (long i = 0 ; i < m_numLinks ; i++ ) {
QUICKPOLL(niceness);
if ( m_linkPtrs[i] < m_allocBuf ) continue;
if ( m_linkPtrs[i] >= allocEnd ) continue;
m_linkPtrs[i] += offset;
}
m_bufPtr += offset;
QUICKPOLL(niceness);
mfree ( m_allocBuf , m_allocSize , "Links");
QUICKPOLL(niceness);
}
else m_bufPtr = newBuf;
m_allocBuf = newBuf;
m_allocSize = newAllocSize;
}
// . is hostname gigablast.com or www.gigablast.com?
// . must be in the top 100k of link text, too!
long hlen = url.getHostLen();
char *h = url.getHost ();
if ( hlen == 13 && strncmp ( h , "gigablast.com" , 13 ) == 0 )
m_linksToGigablast = true;
if ( hlen == 17 && strncmp ( h , "www.gigablast.com" , 17 ) == 0 )
m_linksToGigablast = true;
// add some info
m_linkPtrs [ m_numLinks ] = m_bufPtr;
m_linkLens [ m_numLinks ] = url.getUrlLen();
m_linkNodes [ m_numLinks ] = nodeNum;
// serialize the normalized link into the buffer
memcpy ( m_bufPtr , url.getUrl(), url.getUrlLen() );
m_bufPtr += url.getUrlLen();
QUICKPOLL(niceness);
// and NULL terminate it
*m_bufPtr++ = '\0';
/*
// do permalink detection here
char *d = url.getDomain();
long dlen = url.getDomainLen();
// is the baseurl contained in a link to reddit?
if ( dlen == 10 && m_baseUrl && strncmp ( d , "reddit.com" ) == 0 ) {
// get the baseurl without the http://
char *bh = m_baseUrl->getHost();
char *cgi = url.getCgi();
// our base url is a permalink then
if ( strstr ( cgi , bh ) ) m_isPermalink = 1;
// otherwise, if it has a link elsewhere it is an index page
//else if ( strstr ( cgi, "diggthis.php?") ) m_isPermalink = 0;
}
*/
// . set link hash if we need to
// . the Vector class uses these link hashes for determining similarity
// of this document to another for purposes of fighting link spam
// . we essentially compare the linking web pages against one another
// and if we find one that is similar to another we weight its
// link text down. The more similar the more the penalty. We just
// see what links it has in common with the others for now...
if ( setLinkHash ) {
// sanity
if ( m_doQuickSet ) { char *xx=NULL;*xx=0; }
// get url length
long ulen = url.getUrlLen();
// subtract the cgi length
if ( url.isCgi() ) ulen -= 1 + url.getQueryLen();
// store its hash
m_linkHashes [ m_numLinks ] = url.getUrlHash64();
m_hostHashes [ m_numLinks ] = url.getHostHash64();
m_domHashes [ m_numLinks ] = url.getDomainHash32();
}
// set the bits in the flags byte
linkflags_t flags = flagsArg; // 0;
// set flag bit #0 if it is an "internal" link -- from same hostname
if ( m_baseUrl && url.getHostLen() == m_baseUrl->getHostLen() &&
strncmp(url.getHost(),m_baseUrl->getHost(),url.getHostLen())==0){
flags |= LF_SAMEHOST; //0x01;
flags |= LF_SAMEDOM; //0x02
}
else if (m_baseUrl &&url.getDomainLen() == m_baseUrl->getDomainLen() &&
strncmp(url.getDomain(),m_baseUrl->getDomain(),
url.getDomainLen())==0) {
flags |= LF_SAMEDOM;
// . memoori was adding www.construction.com which redirected
// to construction.com/index.asp and it did not add the
// outlinks because "spider internal links only" was true.
// i.e. Msg16::m_sameHostLinks was true
// . if not same host but domains match, consider it internal
// if hosts only differ by a www. this should fix that.
if ( m_baseUrl->isHostWWW() && !url .hasSubdomain() )
flags |= LF_SAMEHOST;
if ( url. isHostWWW() && !m_baseUrl->hasSubdomain() )
flags |= LF_SAMEHOST;
}
char *tld = url.getTLD();
long tlen = url.getTLDLen();
if ( tlen == 3 && ! strncmp(tld,"edu",3) ) flags |= LF_EDUTLD;
if ( tlen == 3 && ! strncmp(tld,"gov",3) ) flags |= LF_GOVTLD;
//if ( m_addSiteRootFlags ) {
// char *site = NULL;
// long siteLen = 0;
// // i guess TagRec is NULL here. we really should have
// // the tag recs of all the outlinks at this point
// if ( url.isSiteRoot(m_coll,NULL,&site,&siteLen) )
// flags |= LF_SITEROOT;
// // same site flag?
// if ( site&&siteLen==m_baseSiteLen&&
// strncmp(site,m_baseSite,siteLen)==0)
// flags |= LF_SAMESITE;
//}
// rss?
if ( isRSS ) {
// flag it
flags |= LF_RSS;
// we had one
m_hasRSSOutlink = true;
// store the first one
if ( ! m_rssOutlinkPtr ) {
m_rssOutlinkPtr = m_linkPtrs[m_numLinks];
m_rssOutlinkLen = m_linkLens[m_numLinks];
// logit
//char c = link[linkLen];
//link[linkLen]=0;
//logf(LOG_DEBUG,"gb: parent=%s rssoutlink= %s",
// m_parentUrl->m_url,link);
//link[linkLen]=c;
}
}
if ( tagId == TAG_A ) flags |= LF_AHREFTAG;
else if ( tagId == TAG_LINK ) flags |= LF_LINKTAG;
else if ( tagId == TAG_FBORIGLINK ) flags |= LF_FBTAG;
// a self link?
if ( m_parentUrl &&
// MUST be a PROPER subset, links to itself do not count!
url.getUrlLen() == m_parentUrl->getUrlLen() &&
strncmp(url.getUrl(), m_parentUrl->getUrl(),
m_parentUrl->getUrlLen())==0) {
flags |= LF_SELFLINK;
// turn this flag on
if ( nodeNum >= 0 ) m_xml->m_nodes[nodeNum].m_isSelfLink = 1;
}
// now check for the "permalink" key word or "permanent link" keyphrase
// TEST CASES:
//http://www.celebritybabyscoop.com/2008/12/28/jennifer-garner-is-still-pregnant/ + fp_1765421_garner_jennifer_znk_122808jpg/
//http://www.celebritybabyscoop.com/2008/12/27/gwen-stefani-family-spread-holiday-cheer/
// http://www.thetrendwatch.com/2008/12/22/how-big-shows-are-becoming-utterly-blaze/ + events-are-boring/
// http://thinkprogress.org/2008/12/26/bush-pardon-campaign/
if ( ( flags & LF_SELFLINK ) && ( flags & LF_AHREFTAG ) &&
// must be valid
nodeNum >= 0 ) {
XmlNode *nodes = m_xml->getNodes();
// get back tag
long max = nodeNum + 20;
if ( max > m_xml->getNumNodes() ) max = m_xml->getNumNodes();
long nn = nodeNum + 1;
while ( nn < max && nodes[nn].m_nodeId != TAG_A ) nn++;
if ( nn < max ) {
char *s = nodes[nodeNum].m_node;
char *send = nodes[nn].m_node;
for ( ; s < send ; s++ ) {
if ( *s != 'p' && *s != 'P' ) continue;
if ( ! strncasecmp(s,"permalink",9) )
break;
if ( ! strncasecmp(s,"permanent link",14) )
break;
}
if ( s < send ) {
flags |= LF_SELFPERMALINK;
m_hasSelfPermalink = true;
}
}
}
// get each url length without the cgi
long len1 = url.getUrlLen() - url.getQueryLen();
long len2 = 0;
if ( m_parentUrl )
len2 = m_parentUrl->getUrlLen() - m_parentUrl->getQueryLen();
// discount the '?' cuz it is not included in the queryLen right now
if ( url.getQueryLen() ) len1--;
if ( m_parentUrl && m_parentUrl->getQueryLen() ) len2--;
// . is it in a subdir of us?
// TEST CASES:
// http://joedecie.livejournal.com/28834.html?thread=167074#t167074
if ( m_parentUrl &&
// MUST be a PROPER subset, links to itself do not count!
len1 > len2 &&
strncmp(url.getUrl(), m_parentUrl->getUrl(),len2)==0) {
flags |= LF_SUBDIR;
m_hasSubdirOutlink = true;
}
// FIXME:
// http://www.packers.com/news/releases/2008/12/24/1/email_to_a_friend/
// FIXME:
// only has one hyphen but is indeed a permalink!
// http://marccooper.com/xmas-vacation/
// TEST CASES:
// href="...ami.php?url=http://lewebpedagogique.com/blog/2008/11/16/
// la-seconde-guerre-mondiale-cours ... will have its cgi ignored so
// such links as this one will not be considered permalinks
char *pathOverride = NULL;
bool ignoreCgi = false;
if ( (flags & LF_SUBDIR) && m_parentIsPermalink ) {
pathOverride = url.getUrl() + m_parentUrl->getUrlLen();
// must be same host
if ( m_parentUrl->getHostLen() != url.getHostLen() )
pathOverride = NULL;
// same host str check
else if ( strncmp( m_parentUrl->getHost() ,
url.getHost() ,
url.getHostLen()) )
pathOverride = NULL;
// must be in bounds
else if ( url.getUrlLen() <= m_parentUrl->getUrlLen() )
pathOverride = NULL;
// if we are a permalink, ignore cgi for seeing if they are
if ( pathOverride ) ignoreCgi = true;
}
// if it is a subset of a permalink parent, it is not a "true"
// permalink if it concatenates the word "comment" onto the parent url,
// it is likely a permalink for a comment, which does not really count
// TEST CASES:
// www.flickr.com/photos/korayem/2947977582/comment72157608088269210/
// robertsquier.blogspot.com/2008/10/drawing-tv.html?showComment=.2..
// profootballtalk.com/2008/12/28/vikings-win-nfc-north/comment-page-1/
bool permCheck = true;
if ( m_doQuickSet ) permCheck = false;
if ( permCheck && ignoreCgi && strstr(pathOverride,"comment") )
permCheck = false;
if ( permCheck && ignoreCgi && strstr(pathOverride,"Comment") )
permCheck = false;
if ( permCheck && ignoreCgi && strstr(pathOverride,"COMMENT") )
permCheck = false;
// . are we probably a permalink?
// . we do not have a TagRec at this point so we just can not
// tell whether we are siteRoot, and therefore NOT a permalink
linkflags_t extraFlags = 0;
if ( permCheck &&
::isPermalink ( //m_coll ,
NULL , // Links ptr
&url , // the url
CT_HTML , // contentType
NULL , // LinkInfo ptr
isRSS ,
NULL , // note ptr
pathOverride ,
ignoreCgi ,
// might include LF_STRONGPERM
&extraFlags ) ) {
flags |= LF_PERMALINK;
flags |= extraFlags;
}
// set in flag array
m_linkFlags [ m_numLinks ] = flags;
// set to NULL for now -- call setLinkSpam() later...
m_spamNotes [ m_numLinks ] = NULL;
QUICKPOLL(niceness);
// inc the count
m_numLinks++;
return true;
}
// . does link #i have link text?
// . link text must have at least one alnum in it
bool Links::hasLinkText ( long n, long version ) {
// return 0 if no link to our "url"
if ( n >= m_numLinks ) return false;
// get the node range so we can call Xml::getText()
long node1 = m_linkNodes [ n ];
// back-dating this change to version 75, since it happened
// sometime right before that version bump; this way the fewest
// docs end up being indexed wrong
// only for <a> tags
if (node1 >= m_xml->getNumNodes()) return false;
if (m_xml->getNodeId(node1) != TAG_A) return false;
// find the </a> to this <a href> tag, or next <a href> tag
long node2 = m_xml->getNodeNum ( node1+1,9999999,"a",1);
// if not found use the last node in the document
if ( node2 < 0 ) node2 = m_xml->getNumNodes();
// check for text node in (node1,node2) range
for ( long i = node1+1 ; i < node2 ; i++ ) {
// continue if a tag
if ( m_xml->isTag(i) ) continue;
// otherwise, it's text
char *s = m_xml->getNode (i);
char *send = s + m_xml->getNodeLen(i);
// . does it have any alnums in it?
// . may be tricked by html entities like #187; or something
for ( ; s < send ; s += getUtf8CharSize(s) )
if ( is_alnum_utf8 ( s ) ) return true;
}
// otherwise, we found no text node with an alnum
return false;
}
// . stores link text into "buf" and returns the length
// . TODO: speed up so we don't have to set Url for every link in doc
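// . returns 0 if "linkee" is not linked to, if the matching link has no
//   alnum link text, or if the doc is an rss/atom feed -- in the feed case
//   the whole <item>/<entry> blurb is passed back via itemPtr/itemLen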
long Links::getLinkText ( char *linkee ,
bool getSiteLinkInfo ,
char *buf ,
long bufMaxLen ,
//bool filter ,
char **itemPtr ,
long *itemLen ,
long *retNode1 ,
long *retLinkNum ,
long niceness ) {
// assume none
if ( retNode1 ) *retNode1 = -1;
// assume no link text
buf[0] = '\0';
// assume no item
if ( itemPtr ) *itemPtr = NULL;
if ( itemLen ) *itemLen = 0;
// if it is site based, skip the protocol because the site might
// be just a domain and not a subdomain
if ( getSiteLinkInfo ) {
char *pp = strstr ( linkee, "://");
if ( pp ) linkee = pp + 3;
}
long linkeeLen = gbstrlen(linkee);
// find the link point to our url
long i;
for ( i = 0 ; i < m_numLinks ; i++ ) {
QUICKPOLL(niceness);
char *link = getLinkPtr(i);
long linkLen = getLinkLen(i);
// now see if its a full match
// special case if site
if ( getSiteLinkInfo ) {
if ( strstr ( link, linkee ) ) break;
continue;
}
// continue if lengths don't match
if ( linkLen != linkeeLen ) continue;
// continue if the urls don't match
if ( strcmp ( link , linkee ) != 0 ) continue;
// otherwise it's a hit
break;
}
// return 0 if no link to our "url"
if ( i >= m_numLinks ) return 0;
*retLinkNum = i;
return getLinkText2(i,buf,bufMaxLen,itemPtr,itemLen,retNode1,niceness);
}
long Links::getLinkText2 ( long i ,
char *buf ,
long bufMaxLen ,
//bool filter ,
char **itemPtr ,
long *itemLen ,
long *retNode1 ,
long niceness ) {
// get the node range so we can call Xml::getText()
long node1 = m_linkNodes [ i ];
// . <area href=> tags have no link text
// . fix for http://www.cs.umass.edu/%7Everts/index.html 's
// link to phdcomics.com . it was picking up bogus link text
// from page tail.
XmlNode *xmlNodes = m_xml->getNodes();
if ( xmlNodes[node1].m_nodeId == TAG_AREA ) return 0;
// what delimiter are we using? this only applies to rss/atom feeds.
//char *del = NULL;
char del[16];
long dlen = 0;
long rss = m_xml->isRSSFeed();
if ( rss == 1 ) {
//del = "item";
memcpy(del, "item\0", 5);
dlen = 4;
}
else if ( rss == 2 ) {
//del = "entry";
memcpy(del, "entry\0", 6);
dlen = 5;
}
// if rss or atom page, return the whole xml <item> or <entry>
//if ( itemBuf && del ) {
if ( dlen > 0 ) {
// bail if not wanted
if ( ! itemPtr ) return 0;
//log ( LOG_INFO, "Links: Getting Link Item For Url" );
long xmlNumNodes = m_xml->getNumNodes();
// . must come from a <link> node, not a <a>
// . can also be an <enclosure> tag now too
if ( xmlNodes[node1].m_nodeId == 2 ) goto skipItem;
// get item delimiter length
//long dlen = gbstrlen(del);
// back pedal node until we hit <item> or <entry> tag
long j ;
for ( j = node1 ; j > 0 ; j-- ) {
QUICKPOLL(niceness);
// skip text nodes
if ( xmlNodes[j].m_nodeId == 0 ) continue;
// check the tag
if(xmlNodes[j].m_tagNameLen != dlen) continue;
if(strncasecmp(xmlNodes[j].m_tagName,del,dlen))
continue;
break;
}
// . if j is 0 we never found the <item> or <entry> tag
// because rss and atom feeds never start with such a tag
// . but we could be in the <channel> section, which is ok
// so i commented this out
//if ( j == 0 ) return 0;
// ptr to the start of it
char *s = xmlNodes[j].m_node;
// save this
if ( retNode1 ) *retNode1 = j;
// the end ptr
//char *send = s + xmlNodes[j].m_nodeLen;
char *send = m_xml->getContent() + m_xml->getContentLen();
// . start at the first tag in this element/item
// . we will copy the blurb on the interval [j,k)
for ( long k = j+1 ; k < xmlNumNodes ; k++ ) {
QUICKPOLL(niceness);
// get the next node in line
XmlNode *nn = &xmlNodes[k];
// . break out if would be too long
// . save room for terminating \0
//if (nn->m_node+nn->m_nodeLen-s > itemBufSize-1)break;
// break out if done
if ( k >= xmlNumNodes ) break;
// skip text nodes
if ( nn->m_nodeId == 0 ) continue;
// skip script sections, inside script tags
if ( nn->m_nodeId == TAG_SCRIPTTEXT ) continue;
if(nn->m_tagNameLen != dlen) continue;
if(strncasecmp(nn->m_tagName,del,dlen)) continue;
//if ( nn->m_tagNameLen != dlen ) continue;
//if ( strncasecmp(nn->m_tagName,del,dlen)) continue;
// we got the end of the item, set "send"
send = nn->m_node + nn->m_nodeLen;
// and we're done, break out
break;
}
// . if no closing tag was found then "send" just stays at the end of
//   the content, so the blurb runs to the end of the doc
// . that also covers feeds that just have a <channel> section and no
//   items -- we use the whole thing
//if ( ! send ) return 0;
// this is a blurb, send it back as such
*itemPtr = s;
*itemLen = send - s;
// rss feeds do not have conventional link text
return 0;
}
skipItem:
// find the </a> to this <a href> tag, or next <a href> tag
long node2 = m_xml->getNodeNum ( node1+1,9999999,"a",1);
// if not found use the last node in the document
if ( node2 < 0 ) node2 = 99999999;
// get the back tag for node #n0
//long n1 = m_xml->getEndNode ( i );
// now we can call Xml::getText()
long bufLen = m_xml->getText ( buf ,
bufMaxLen ,
node1 , // get kid text of this node
node2 ,
false , // include tags?
true , // visible text only?
//true , // filter? (entities to iso)
false , // filter? (entities to iso)
false , // filter spaces?
false ); // exclude if in <stop index>?
// set it
if ( retNode1 ) *retNode1 = node1;
// hunt for an alnum in the link text
char *p = buf;
char *pend = buf + bufLen;
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
QUICKPOLL ( niceness );
if ( is_alnum_utf8(p) ) break;
}
// if no alnum then return 0 as the link text len
if ( p >= pend ) return 0;
// find last non-space char
char *q = p;
char *last = NULL;
for ( ; q < pend ; q += getUtf8CharSize(q) ) {
QUICKPOLL ( niceness );
if ( ! is_wspace_utf8(q) ) last = q;
}
// hack off trailing spaces
if ( last ) pend = last + getUtf8CharSize(last); // +1;
// shift left if we expunged some leading non-alnums
memmove ( buf , p , pend - p );
// reset buflen
bufLen = pend - p;
// null terminate
buf [ bufLen ] = '\0';
// return length
return bufLen;
}
// find an ascii subtring in linktext for this link and return a pointer
// to it, or NULL if not present
char *Links::linkTextSubstr(long linkNum, char *string, long niceness) {
if (linkNum >= m_numLinks) return NULL;
long nodeNum = getNodeNum(linkNum);
if (nodeNum >= m_xml->getNumNodes()-1) return NULL;
for (long i=nodeNum+1 ; i < m_xml->getNumNodes() ; i++ ) {
XmlNode *node = m_xml->getNodePtr(i);
if (node->getNodeId() == TAG_A) return NULL;
if (node->getNodeId() != TAG_TEXTNODE) continue;
// quickpoll, this is prone to blocking
QUICKPOLL(niceness);
// maybe handle img alt text here someday, too
char *ptr;
if ((ptr = strncasestr(node->getNode(),
node->getNodeLen(), string)))
return ptr;
}
return NULL;
}
// . hash the link: terms
// . ensure that more useful linkers are scored higher
// . useful for computing offsite link text for qdb-ish algorithm
// . NOTE: for now i do not hash links to the same domain in order to
// hopefully save 10%-25% index space
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
// different site links with no link text will be ranked behind them
// . the 8-bit bitmap of the score of a link: term:
// . 00ubdcss u = link is Unbanned? b = link isBanned?
// d = link dirty? c = link clean?
// s = 01 if no link text, 10 if link text
// . NOTE: this is used in Msg18.cpp for extraction
// . CAUTION: IndexList::score32to8() will warp our score if its >= 128
// so i moved the bits down
/*
bool Links::hash ( TermTable *table,
Url *url ,
Url *redirUrl ,
long version,
long niceness ,
// can cal Xml::isRSSFeed() on the content to check
// for the special identifying tag to be a feed
bool isRSSFeed ) {
// we mask in some bits to the score sometimes
//unsigned char mask = 0;
// see if our links are all banned or all unbanned via siteRec
// Xml *sx = sr->getXml();
// later, the score can contain a ruleset that should be used to parse
// the document that is linked to, but just support unbanning of
// soft banned sites for now
//if ( sx->getBool("linksUnbanned", false ) ) mask |= 0x20;
// let's phase these guys out, they aren't really used anyway
//if ( version < 21 ) {
// if ( sx->getBool("linksBanned" , false ) ) mask |= 0x10;
// if ( sx->getBool("linksDirty" , false ) ) mask |= 0x08;
// if ( sx->getBool("linksClean" , false ) ) mask |= 0x04;
//}
// decide if we will index sitelink terms
bool indexSiteLinks = false;
if (version >= 71) {
//if (sx->getBool("indexSiteLinks", true)) indexSiteLinks=true;
indexSiteLinks = true;
}
// see ../url/Url2.cpp for hashAsLink() algorithm
for ( long i = 0 ; i < m_numLinks ; i++ ) {
// skip links with zero 0 length
if ( m_linkLens[i] == 0 ) continue;
// . skip if we are rss page and this link is an <a href> link
// . we only harvest/index <link> urls from rss feeds
// . or in the case of feedburner, those orig tags
if ( isRSSFeed && (m_linkFlags[i] & LF_AHREFTAG) )
continue;
// if we have a <feedburner:origLink> tag, then ignore <link>
// tags and only get the links from the original links
if ( m_isFeedBurner && !(m_linkFlags[i] & LF_FBTAG) )
continue;
// normalize the link
Url2 link;
// now we always add "www" to these links so that any link
// to cnn.com is same as link to www.cnn.com, because either
// we index cnn.com or www.cnn.com but not both providing
// their content is identical (deduping). This way whichever
// one we index, we can take advantage of all link text whether
// it's to cnn.com or www.cnn.com.
// Every now and then we add new session ids to our list in
// Url.cpp, too, so we have to version that.
link.set ( m_linkPtrs[i] ,
m_linkLens[i] ,
true , // addWWW?
m_stripIds ,
false , // stripPound?
false , // stripCommonFile?
version );// used for new session id stripping
QUICKPOLL(niceness);
// . the score depends on some factors:
// . NOTE: these are no longer valid! (see score bitmap above)
// . 4 --> if link has different domain AND has link text
// . 3 --> if link has same domain AND has link text
// . 2 --> if link has different domain AND no link text
// . 1 --> if link has same domain AND no link text
// . is domain the same as ours?
// . NOTE: ideally, using the IP domain would be better, but
// we do not know the ip of the linker right now... so scores
// may be topped with a bunch of same-ip domain links so that
// we may not get as much link text as we'd like, since we
// only sample from one link text per ip domain
// . now we also just use the mid domain! (excludes TLD)
//bool sameMidDomain = false;
bool internal = false;
long mdlen = url->getMidDomainLen();
if ( mdlen == link.getMidDomainLen() &&
strncmp(url->getMidDomain(),link.getMidDomain(),mdlen)==0)
//continue; // sameMidDomain = true;
internal = true;
// also check the redir url
if ( redirUrl ) {
mdlen = redirUrl->getMidDomainLen();
if ( mdlen == link.getMidDomainLen() &&
strncmp(redirUrl->getMidDomain(),
link.getMidDomain(),mdlen)==0)
//continue; // sameMidDomain = true;
internal = true;
}
// select prefix
char *prefix = "link:";
// let's use a different termlist for version 21 and up since
// we include internal links and we do scoring differently.
// once those new termlists get beefed up we can switch over
// to them exclusively.
if ( version >= 21 ) prefix = "links";
// older versions used ilink:
if ( version < 21 && internal ) prefix = "ilink:";
// for now, don't hash same-mid-domain links at all (save disk)
//if ( sameMidDomain ) continue;
// now make the score
unsigned char score ;
// . TODO: consider not hashing link w/o text!
// . otherwise, give it a higher score if it's got link TEXT
bool gotLinkText = hasLinkText ( i, version );
//if ( ! sameMidDomain ) {
// support the old scores for backwards compatibility
if ( version < 21 ) {
if ( gotLinkText ) score = 2; // has link text
else score = 1; // no link text
}
// otherwise, beginning with version 21, allow internal links,
// but with lower scores
else {
// score
// internal, no link text: 2
// internal, w/ link text: 4
// external, no link text: 6
// external, w/ link text: 8
if ( internal ) {
if ( ! gotLinkText ) score = 0x02;
else score = 0x04;
}
else {
if ( ! gotLinkText ) score = 0x06;
else score = 0x08;
}
}
//}
//else if ( gotLinkText ) score = 3;
// set upper 2 bits to indicate of link is banned/unbanned
//score |= mask;
// now hash with our score
if ( ! link.hashAsLink ( version, table , NULL , 0 , score ,
internal , prefix, indexSiteLinks ) )
return false;
QUICKPOLL(niceness);
}
return true;
}
*/
long Links::findLinkNum(char* url, long urlLen) {
for(long i = 0;i< m_numLinks; i++) {
if(m_linkLens[i] == urlLen &&
strncmp(url, m_linkPtrs[i], urlLen) == 0)
return i;
}
return -1;
}
// helper function for shared link ptr buffer
static long getLinkBufferSize(long numLinks){
return numLinks *
(sizeof(char* ) + // linkPtrs
sizeof(long ) + // linkLens
sizeof(long ) + // linkNodes
sizeof(uint64_t ) + // linkHashes
sizeof(uint64_t ) + // hostHashes
sizeof(long ) + // domHashes
sizeof(linkflags_t ) + // linkFlags
sizeof(char* ) // spamNotes
);
}
/*
void Links::removeExternalLinks ( ) {
long j = 0;
char *p = m_allocBuf;
for ( long i = 0 ; i < m_numLinks ; i++ ) {
// skip if not internal (by hostname)
if ( ! isInternalHost(i) ) continue;
// copy it over
memcpy ( p , m_linkPtrs[i] , m_linkLens[i] );
// add it back
m_linkPtrs [j] = p;
m_linkLens [j] = m_linkLens [i];
m_linkNodes [j] = m_linkNodes [i];
m_linkHashes[j] = m_linkHashes[i];
m_hostHashes[j] = m_hostHashes[i];
m_linkFlags [j] = m_linkFlags [i];
// skip it
p += m_linkLens[i];
// NULL
*p++ = '\0';
// inc count of links
j++;
}
// . update m_bufPtr cuz that is what getLinkBufLen() returns!
// . Msg10 uses that to know when to stop adding urls
m_bufPtr = p;
// update count
m_numLinks = j;
}
*/
// returns false and sets g_errno on error
bool Links::flagOldLinks ( Links *old ) {
// do not double call
if ( m_flagged ) return true;
// only call once
m_flagged = true;
// skip if null
if ( ! old ) return true;
// hash the old links into a table
HashTable ht;
for ( long i = 0 ; i < old->m_numLinks ; i++ ) {
// get the url
char *u = old->m_linkPtrs[i];
long ulen = old->m_linkLens[i];
// hash it
long long uh = hash32 ( u , ulen );
// it does not like keys of 0, that means empty slot
if ( uh == 0 ) uh = 1;
// add to hash table
if ( ! ht.addKey ( uh , 1 ) ) return false;
}
// set the flags
for ( long i = 0 ; i < m_numLinks ; i++ ) {
// get the url
char *u = m_linkPtrs[i];
long ulen = m_linkLens[i];
// get our hash
long long uh = hash32 ( u , ulen );
// it does not like keys of 0, that means empty slot
if ( uh == 0 ) uh = 1;
// check if our hash is in this hash table, if not, then
// it is a new link, skip this
if ( ht.getSlot ( uh ) < 0 ) continue;
// otherwise it was spidered last time, so flag it as old
m_linkFlags[i] |= LF_OLDLINK;
}
return true;
}
//static char s_isLinkRSS;
//static char s_permalink;
//static long s_age;
// . are we a permalink?
// . this registers as a permalink which it is not:
// http://www.dawn.com/2009/01/04/rss.htm
// http://www.msnbc.msn.com/id/3032072
bool isPermalink ( //char *coll ,
Links *links ,
Url *u ,
char contentType ,
LinkInfo *linkInfo ,
bool isRSS ,
char **note ,
char *pathOverride ,
bool ignoreCgi ,
linkflags_t *retFlags ) {
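// rough summary of the heuristics below: reject feeds, site roots and
// non-html; accept known feedproxy/feedburner url forms; otherwise
// require a ?p=<digit> cgi, 3+ digits in a row, or 2+ hyphens/underscores
// in one path component, then veto based on a ?m=yyyymm cgi or on
// same-host outlinks that look like stronger permalinks than we do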
// reset. caller will OR these into its flags
if ( retFlags ) *retFlags = 0;
// how can this happen?
if ( ! u ) return false;
// rss feeds cannot be permalinks
if ( isRSS ) { if ( note ) *note = "url is rss feed."; return false; }
// root pages don't get to be permalinks
if ( u->isRoot() ) {
if ( note ) *note = "url is a site root"; return false; }
// are we a "site root" i.e. hometown.com/users/fred/ etc.
//if ( u->isSiteRoot ( coll ) ) {
// if ( note ) *note = "url is a site root"; return false; }
// only html (atom feeds link to themselves)
if ( contentType != CT_HTML) {
if ( note ) *note = "content is not html"; return false;}
// techcrunch has links like this in the rss:
// http://feedproxy.google.com/~r/Techcrunch/~3/pMaRh78u1W8/
if ( strncmp(u->getHost(),"feedproxy.",10)==0 ) {
if ( note ) *note = "from feedproxy host"; return true; }
// might want to get <feedburner:origLink> instead of <link> if
// we can. that would save a redirect through evil g
if ( strncmp(u->getHost(),"feeds.feedburner.com/~",22)==0 ) {
if ( note ) *note = "feedburner tilde url"; return true; }
// . BUT if it has a link to itself on digg, reddit, etc. then it
// doesn't need to have the digits or the underscores...
// . this helps us distinguish between
// science.howstuffworks.com/fantasy-football.html (permalink) and
// science.howstuffworks.com/space-channel.htm (NOT a permalink)
// . i guess this includes "post a comment" links that are just
// anchor links to the textarea at the page bottom so that will fix:
// http://workandplay.vox.com/library/post/running-again.html
// . includes trackbacks, comments feed, etc.
// . returns -1 if unknown whether it is a permalink or not
char status = -1;
if ( links ) status = links->isPermalink ( note );
if ( status == 1 ) return true;
if ( status == 0 ) return false;
char *pathStart = u->getPath();
// a hack by Links.cpp after setting LF_SUBDIR
if ( pathOverride ) pathStart = pathOverride;
// compute these
linkflags_t extraFlags = 0;
// we must have a sequence of 3 or more digits in the path
char *p = pathStart;
long plen = u->getPathLen();
char *pend = u->getPath() + plen;
long dcount = 0;
// now we scan the cgi stuff too!!
// http://www.rocklintoday.com/news/templates/sierra_college.asp?articleid=6848&zoneid=51
// http://www.freemarketnews.com/WorldNews.asp?nid=57373
char *uend = u->getUrl() + u->getUrlLen();
// halt at path if we should
if ( ignoreCgi ) uend -= u->getQueryLen(); // CgiLen();
// see if we find the digits in the cgi part
bool digitsInCgi = false;
// start scanning at the path
for ( ; p < uend ; p++ ) {
if ( *p == '?' ) digitsInCgi = true;
// if not a digit, reset count
if ( ! is_digit(*p) ) { dcount = 0; continue; }
// . check if it is a "strong permalink"
// . i.e. contains /yyyy/mm/?? in PATH (not cgi)
if ( p + 9 < pend &&
*(p-1)=='/' &&
is_digit(p[0]) &&
is_digit(p[1]) &&
is_digit(p[2]) &&
is_digit(p[3]) &&
p[4] == '/' &&
is_digit(p[5]) &&
is_digit(p[6]) &&
p[7] == '/' ) {
//is_digit(p[8]) &&
//is_digit(p[9]) &&
//p[10] == '/' )
// http://www.it.com.cn/f/office/091/4/722111.htm
// was thought to have strong outlinks, but they were
// not! this should fix it...
long y = atoi(p+0);
long m = atoi(p+5);
// make sure the year and month are in range
if ( y >= 1990 && y <= 2050 && m >= 1 && m <= 31 )
extraFlags |= LF_STRONGPERM;
}
// count it if a digit
if ( ++dcount == 3 ) break;
}
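// e.g. a path like /2008/12/28/some-title/ (a hypothetical path in the
// style of the test-case urls earlier in this file) trips both the
// /yyyy/mm/ check (LF_STRONGPERM) and the 3-digits-in-a-row count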
// it can also have 2+ hyphens or 2+ underscores in a single
// path component to be a permalink
long hcount = 0;
p = pathStart;
for ( ; p < pend ; p++ ) {
// if not a digit, reset count
if ( *p == '/' ) { hcount = 0; continue; }
// is it a thing?
if ( *p != '_' && *p != '-' ) continue;
// count it
if ( ++hcount == 2 ) break;
}
// we can have a cgi of "?p=<digit>" and be a permalink
p = u->m_query;
bool hasp = ( p && p[0]=='p' && p[1]=='=' && is_digit(p[2]) ) ;
// fix for http://proglobalbusiness.org/?m=200806 being detected as
// a permalink... it has ?p=xxx outlinks.
if ( hasp ) extraFlags |= LF_STRONGPERM;
// return these if the caller wants them
if ( retFlags ) *retFlags = extraFlags;
	// . if we have none of those then we're not a permalink
// . THIS STILL FAILS on stuff like:
// BUT we can fix that by doing url pattern analysis? yeah,
// each domain can have a tag that is the permalink subdir, so
// that any url in that subdir is a permalink.
if ( ! hasp && dcount < 3 && hcount < 2 ) {
if ( note )
*note = "path has no digits, underscores or hyphens";
return false;
}
// if self link check for link text "permalink" then we are
// probably very strongly a permalink
// http://www.5minutesformom.com/5225/wordless-wednesday-angel/
	// has a /promote-your-site tack-on which causes the LF_SUBDIR
	// algo to call the parent a NON-permalink. this should fix that
// because it has a link to itself with the word "permalink"
if ( links && links->hasSelfPermalink() ) {
if ( note ) *note = "has permalink text to itself";
return true;
}
// http://proglobalbusiness.org/?m=200806 is never a permalink
p = u->m_query;
if ( p && p[0]=='m' && p[1]=='=' && is_digit(p[2]) ) {
long n = atoi(p+2);
if ( n > 199000 && n < 205000 ) {
if ( note ) *note = "has ?m=<year><month> cgi";
return false;
}
}
// . if we have an internal outlink that is a permalink and is
// in a subdirectory of us, THEN we are not a permalink
// . fixes andrewsullivan.atlanticmonthly.com/the_daily_dish/
linkflags_t mf = (LF_PERMALINK | LF_SAMEHOST | LF_SUBDIR );
// loop over all outlinks
long no = 0;
// make sure we got them
if ( links ) no = links->m_numLinks;
	// practically all internal outlinks have LF_SUBDIR set for permalinks
	// of a url like http://www.breitbart.tv/?p=249453 so do not run this
	// outlink algorithm on such urls! basically, anytime our only
	// permalink indicator is in the cgi portion of the url, skip the
	// subdir algorithm.
if ( hcount < 2 && dcount < 3 && hasp ) no = 0;
// or if we only got digits and they were in the cgi
if ( hcount < 2 && ! hasp && digitsInCgi ) no = 0;
// do the outlink loop
for ( long i = 0 ; i < no ; i++ ) {
// get the flags
linkflags_t flags = links->m_linkFlags[i];
// skip if not a match. match the match flags = "mf"
if ( (flags & mf) != mf ) continue;
// allow /print/ "printer view" pages
//if ( strstr ( links->m_linkPtrs[i],"/print" ) ) continue;
if ( note ) *note = "has subdir permalink outlink";
// ok, we are not a permalink now
return false;
}
// now check for strong outlinks on same host when we are not strong
if ( links ) no = links->m_numLinks;
// if we are strong, forget it
if ( extraFlags & LF_STRONGPERM ) no = 0;
// look for strong permalink outlinks
mf = (LF_STRONGPERM| LF_SAMEHOST );
// loop over all outlinks we have
for ( long i = 0 ; i < no ; i++ ) {
// get the flags
linkflags_t flags = links->m_linkFlags[i];
// . if we are NOT a "strong permalink" but we have a same host
// outlink that is, then we are not a permalink
// . fixes: http://blog.makezine.com/archive/kids/
// ?CMP=OTC-0D6B48984890
if ( (flags & mf) != mf ) continue;
// allow /print/ "printer view" pages
//if ( strstr ( links->m_linkPtrs[i],"/print" ) ) continue;
if ( note ) *note = "has strong permalink outlink";
// ok, we are not a permalink now
return false;
}
// no permalinks for archive directories
if ( (gb_strcasestr(u->getPath(),"/archive")||
u->getPathDepth(false)==0) &&
gb_strcasestr(u->getPath(), "/index.") &&
!u->isCgi()){
if ( note ) *note = "has /archive and /index. and not cgi";
return false;}
	// at first /tag/ seemed ok --> http://www.makeuseof.com/
// BUT technorati.com/tag/search-engine-optimization is not a
// permalink!!! i took technorati.com|jp out of ruleset 36 for now
// ah, but a ton of the urls have /tags/ and are NOT permalinks!!!
if (gb_strcasestr(u->getPath(), "/tag/")){
if ( note ) *note = "has /tag/"; return false;}
// no forums or category indexes
if (gb_strcasestr(u->getPath(), "/category")){
if ( note ) *note = "has /category"; return false;}
if (gb_strcasestr(u->getPath(), "/cat_")){
if ( note ) *note = "has /cat_"; return false;}
// http://www.retailerdaily.com/cat/search-engine-marketing/
if (gb_strcasestr(u->getPath(), "/cat/")){
if ( note ) *note = "has /cat/"; return false;}
if (gb_strcasestr(u->getPath(), "/comment.html")){
if ( note ) *note = "has /comment.html"; return false;}
if (gb_strcasestr(u->getPath(), "/comments/")){
if ( note ) *note = "has /comments/"; return false;}
char *pos;
// category or tag page detection
pos = gb_strcasestr(u->getUrl(), "cat=");
if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
if ( note ) *note = "has [A-z]cat="; return false;}
pos = gb_strcasestr(u->getUrl(), "tag=");
if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
if ( note ) *note = "has [A-z]tag="; return false;}
pos = gb_strcasestr(u->getUrl(), "tags=");
if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
if ( note ) *note = "has [A-z]tags="; return false;}
// more forum detection
if (gb_strcasestr(u->getUrl(), "forum")){
if ( note ) *note = "has forum"; return false;}
if (gb_strcasestr(u->getPath(), "thread")){
if ( note ) *note = "has thread"; return false;}
if (gb_strcasestr(u->getPath(), "topic") &&
!gb_strcasestr(u->getPath(), "/topics/")){
if ( note ) *note = "has /topics/"; return false;}
// more index page detection
if (gb_strcasestr(u->getPath(), "/default.")){
if ( note ) *note = "has /default."; return false;}
if (gb_strcasestr(u->getPath(), "/profile.")){
if ( note ) *note = "has /profile."; return false;}
if (gb_strcasestr(u->getPath(), "/archives.")){
if ( note ) *note = "has /archives."; return false;}
if (gb_strcasestr(u->getPath(), "_archive.")){
if ( note ) *note = "has _archive."; return false;}
if (gb_strcasestr(u->getPath(), "/search.")){
if ( note ) *note = "has /search."; return false; }
if (gb_strcasestr(u->getPath(), "/search/")){
if ( note ) *note = "has /search/"; return false; }
// get path end
p = u->getPath() + u->getPathLen();
plen = u->getPathLen();
// back up over index.html
if ( plen > 10 && strncmp(p-10,"index.html",10)==0 ) {
plen -= 10; p -= 10; }
// hack off the /
if ( p[-1]=='/' ) { plen--; p--; }
// ends in /trackback means not a permalink
if ( plen >= 10 && strncasecmp(p-10,"/trackback",10)==0) {
if ( note ) *note = "ends in /trackback";
return false;
}
// ends in /dddd/dd means usually an archive date
if ( plen >= 8 &&
is_digit(p[-1]) &&
is_digit(p[-2]) &&
p[-3] == '/' &&
is_digit(p[-4]) &&
is_digit(p[-5]) &&
is_digit(p[-6]) &&
is_digit(p[-7]) &&
p[-8] == '/' ) {
// ensure the numbers are in range for a date
long year = atoi(p-7);
long month = atoi(p-2);
if ( year > 1990 && year <= 2015 &&
month > 0 && month <= 12 ) {
if ( note ) *note = "ends in /dddd/dd/";
return false;
}
}
// /2008 is usually not permalink
if ( plen >= 5 &&
p[-5] == '/' &&
p[-4] == '2' &&
p[-3] == '0' &&
atoi(p-2) < 50 ) {
if ( note ) *note = "ends in year /20xx";
return false;
}
// /199? too
if ( plen >= 5 &&
p[-5] == '/' &&
p[-4] == '1' &&
p[-3] == '9' &&
atoi(p-2) > 90 ) {
if ( note ) *note = "ends in year /19xx";
return false;
}
// . look for a repetitive sequence of html tags
// . each must contain an outlink! there are some blog entries that
// have excerpts of an email chain.
// . if the repetition intersects the main content section,
// then it is an index page
// . make sure that SCORES can detect comment sections. very often
// the comment is bigger than the main section!!!!
// . or we can subtract the repetitive sections, and see if we have
// any beefy content left over... then we don't have to worry
// about comment identification
// . index tag pairs
// . look at header tags, div, p, index level # before the pair
	// . find the delimiter between the blurbs
	// . delimiter must touch the beefy content section
	// . go by "strings" of tagids, a tagid of 0 means text i think
	// but eliminate it if pure punctuation
	// . and have a subtagid field, which is the hash of a tag's attributes
	// BUT in the case of a text tag, a hash of the alpha chars
	// . how many delimiters can we find that start at level X.
// . now we are determining
if ( note ) *note = "is permalink";
return true;
}
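
// a minimal sketch (hypothetical helper, not referenced anywhere in this
// file) of the trailing "/dddd/dd" archive-date test used above; assumes
// "p" points one byte past the end of the path and "plen" is its length.
static bool endsInArchiveDate ( char *p , long plen ) {
	if ( plen < 8 ) return false;
	// expect ".../dddd/dd" at the end of the path
	if ( p[-8] != '/' || p[-3] != '/' ) return false;
	for ( long i = -7 ; i <= -1 ; i++ )
		if ( i != -3 && ! is_digit(p[i]) ) return false;
	// sanity check the year and month ranges, like the code above
	long year  = atoi ( p - 7 );
	long month = atoi ( p - 2 );
	return ( year > 1990 && year <= 2015 && month > 0 && month <= 12 );
}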
long getSiteRank ( long sni ) {
if ( sni <= 0 ) return 0;
if ( sni <= 1 ) return 1;
if ( sni <= 2 ) return 2;
if ( sni <= 3 ) return 3;
if ( sni <= 4 ) return 4;
if ( sni <= 5 ) return 5;
if ( sni <= 9 ) return 6;
if ( sni <= 19 ) return 7;
if ( sni <= 39 ) return 8;
if ( sni <= 79 ) return 9;
if ( sni <= 200-1 ) return 10;
if ( sni <= 500-1 ) return 11;
if ( sni <= 2000-1 ) return 12;
if ( sni <= 5000-1 ) return 13;
if ( sni <= 10000-1 ) return 14;
//if ( sni <= 3120 ) return 15;
return 15;
}
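
// usage sketch for getSiteRank(): sni is presumably the site's inlink
// count; sample values mapped through the thresholds above:
//   getSiteRank(0)    -> 0     getSiteRank(10)    -> 7
//   getSiteRank(5)    -> 5     getSiteRank(100)   -> 10
//   getSiteRank(1500) -> 12    getSiteRank(50000) -> 15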