open-source-search-engine/Linkdb.cpp

#include "Linkdb.h"
#include "Threads.h"
#include "Titledb.h"
#include "linkspam.h"
#include "sort.h"
#include "XmlDoc.h" // score32to8()
Linkdb g_linkdb;
Linkdb g_linkdb2;
void Linkdb::reset() {
m_rdb.reset();
}
bool Linkdb::init ( ) {
key224_t k;
// sanity tests
uint32_t linkeeSiteHash32 = (uint32_t)rand();
uint32_t linkerSiteHash32 = (uint32_t)rand();
uint64_t linkeeUrlHash64 = (uint64_t)rand() << 32LL | rand();
// mask it to 32+15 bits
linkeeUrlHash64 &= 0x00007fffffffffffLL;
unsigned char linkerSiteRank = 13;
unsigned char hopCount = 7;
long ip = rand();
long ipdom3 = ipdom(ip);
long long docId = ((uint64_t)rand() << 32 | rand()) & DOCID_MASK;
long discoveryDate = 1339784732;
long lostDate = discoveryDate + 86400*23;
char linkSpam = 1;
k = makeKey_uk ( linkeeSiteHash32 ,
linkeeUrlHash64 ,
linkSpam , // islinkspam?
linkerSiteRank ,
hopCount ,
ip ,
docId ,
discoveryDate ,
lostDate ,
false , // newaddtooldpage?
linkerSiteHash32 ,
false ); // is del?
// jan 1 2008
unsigned long epoch = LINKDBEPOCH;
long dd2 = (discoveryDate - epoch) / 86400;
if ( discoveryDate == 0 ) dd2 = 0;
dd2 = dd2 * 86400 + epoch;
long ld2 = (lostDate - epoch) / 86400;
if ( lostDate == 0 ) ld2 = 0;
ld2 = ld2 * 86400 + epoch;
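// note: the key stores dates at day granularity as days since
// LINKDBEPOCH (see makeKey_uk() below), so dd2/ld2 above round the test
// dates down to the start of their day before comparing against what
// the getters return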
// try this
setLostDate_uk(&k,ld2 );
// now test it
if(getLinkeeSiteHash32_uk(&k)!=linkeeSiteHash32){char *xx=NULL;*xx=0;}
if(getLinkeeUrlHash64_uk(&k)!=linkeeUrlHash64){char *xx=NULL;*xx=0;}
if ( isLinkSpam_uk ( &k ) != linkSpam ) {char *xx=NULL;*xx=0;}
if (getLinkerSiteHash32_uk(&k)!=linkerSiteHash32){char *xx=NULL;*xx=0;}
if ( getLinkerSiteRank_uk(&k) != linkerSiteRank){char *xx=NULL;*xx=0;}
//if (getLinkerHopCount_uk (&k ) != hopCount ) {char *xx=NULL;*xx=0;}
if ( getLinkerIp24_uk ( &k ) != ipdom3 ) {char *xx=NULL;*xx=0;}
if ( getLinkerIp_uk ( &k ) != ip ) {char *xx=NULL;*xx=0;}
if ( getLinkerDocId_uk( &k ) != docId ) {char *xx=NULL;*xx=0;}
if ( getDiscoveryDate_uk(&k) != dd2 ) {char *xx=NULL;*xx=0;}
if ( getLostDate_uk(&k) != ld2 ) {char *xx=NULL;*xx=0;}
// more tests
setDiscoveryDate_uk (&k,discoveryDate);
setLostDate_uk (&k,lostDate);
if ( getDiscoveryDate_uk(&k) != dd2 ) {char *xx=NULL;*xx=0;}
if ( getLostDate_uk(&k) != ld2 ) {char *xx=NULL;*xx=0;}
long ip3 = 0xabcdef12;
setIp32_uk ( &k , ip3 );
long ip4 = getLinkerIp_uk ( &k );
if ( ip3 != ip4 ) { char *xx=NULL;*xx=0; }
/*
// test similarity
long v1[] = {86845183, 126041601, 193138017, 194832692, 209041345, 237913907,
253753116, 420176029, 425806029, 469664463, 474491119, 486025959, 524746875,
565034969, 651889954, 723451712, 735373612, 740115430, 889005385,
1104585188, 1180264907, 1190905206, 1555245401, 1585281138, 1775919002,
1780336562, 1784029178, 1799261433, 2013337516, 2095261394, 2137774538, 0};
long v2[] = {51207128, 126041601, 237913907, 253753116, 315255440, 394767298,
420176029, 435382723, 469664463, 486025959, 536944585, 556667308, 565034969,
615792190, 624608202, 629600018, 807226240, 1107373572, 1113238204,
1134807359, 1135960080, 1200900964, 1527062593, 1585281138, 1634165777,
1694464250, 1802457437, 1943916889, 1960218442, 2058631149, -2130866760, 0};
long nv1 = sizeof(v1)/4;
long nv2 = sizeof(v2)/4;
if ( isSimilar_sorted (v1,v2,nv1,nv2,80,0) ) {
char *xx=NULL;*xx=0;
}
*/
// we use the same disk page size as indexdb (for rdbmap.cpp)
long pageSize = GB_INDEXDB_PAGE_SIZE;
// set this for debugging
//long long maxTreeMem = 1000000;
long long maxTreeMem = 40000000; // 40MB
// . what's max # of tree nodes?
// . each tree node is key + left + right + parent + dataPtr =
//   sizeof(key224_t) + 4+4+4+4 bytes per record when in the tree
long maxTreeNodes = maxTreeMem /(sizeof(key224_t)+16);
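// e.g. assuming a 28-byte key224_t, each node is 44 bytes, so the 40MB
// default above allows roughly 900,000 tree nodes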
// disk page cache mem, 100MB on gk0 now
long pcmem = 0; // g_conf.m_linkdbMaxDiskPageCacheMem;
// give it a little
pcmem = 10000000; // 10MB
// keep this low if we are the tmp cluster
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// TODO: would be nice to just do page caching on the satellite files;
// look into "minimizeDiskSeeks" at some point...
if ( ! m_pc.init ( "linkdb" ,
RDB_LINKDB,
pcmem ,
pageSize ,
true , // use shared mem?
false )) // minimizeDiskSeeks?
return log("db: Linkdb init failed.");
// init the rdb
return m_rdb.init ( g_hostdb.m_dir ,
"linkdb" ,
true , // dedup
0 , // fixeddatasize is 0 since no data
3,//g_conf.m_linkdbMinFilesToMerge ,
// fix this to 15 and rely on the page cache of
// just the satellite files and the daily merge to
// keep things fast.
//15 ,
maxTreeMem ,
maxTreeNodes ,
true , //isTreeBalanced
0 , // cache mem
0 , // cache nodes
false, // true , // use half keys
false , // load cache from disk
&m_pc ,
false , // false
false , // preload page cache
sizeof(key224_t) ,
true ); // bias page cache? (true!)
}
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool Linkdb::init2 ( long treeMem ) {
// . what's max # of tree nodes?
// . each tree node is key + tree overhead + dataPtr + collnum =
//   sizeof(key224_t) + 12 + 4 + sizeof(collnum_t) bytes per record
//   when in the tree
long nodeSize = ( sizeof(key224_t) + 12 + 4 ) + sizeof(collnum_t);
long maxTreeNodes = treeMem / nodeSize;
// initialize our own internal rdb
return m_rdb.init ( g_hostdb.m_dir ,
"linkdbRebuild" ,
true , // dedup
0 , // no data now! just docid/s/c
50 , // m_clusterdbMinFilesToMerge,
treeMem , // g_conf.m_clusterdbMaxTreeMem,
maxTreeNodes ,
true , // balance tree?
0 , // maxCacheMem ,
0 , // maxCacheNodes ,
false, // true , // half keys?
false , // g_conf.m_clusterdbSaveCache,
NULL , // &m_pc ,
false , // is titledb
false , // preload disk page cache
sizeof(key224_t), // key size
true );// bias disk page cache
}
bool Linkdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;
// if not allowing scale, return false
if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
bool Linkdb::verify ( char *coll ) {
log ( LOG_DEBUG, "db: Verifying Linkdb for coll %s...", coll );
g_threads.disableThreads();
Msg5 msg5;
Msg5 msg5b;
RdbList list;
key224_t startKey;
key224_t endKey;
startKey.setMin();
endKey.setMax();
long minRecSizes = 64000;
if ( ! msg5.getList ( RDB_LINKDB ,
coll ,
&list ,
(char*)&startKey ,
(char*)&endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
&msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
}
long count = 0;
long got = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key224_t k;
list.getCurrentKey((char*)&k);
count++;
//uint32_t shardNum = getShardNum ( RDB_LINKDB , &k );
//if ( groupId == g_hostdb.m_groupId ) got++;
unsigned long shardNum = getShardNum( RDB_LINKDB , &k );
if ( shardNum == getMyShardNum() ) got++;
}
if ( got != count ) {
log ("db: Out of first %li records in Linkdb , "
"only %li belong to our group.",count,got);
/*
// repeat with log
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key224_t k;
list.getCurrentKey((char*)&k);
uint32_t shardNum = getShardNum ( RDB_LINKDB , &k );
long groupNum = g_hostdb.getGroupNum(groupId);
unsigned long sh32 ;
sh32 = g_linkdb.getLinkeeSiteHash32_uk(&k);
uint16_t sh16 = sh32 >> 19;
log("db: sh16=0x%lx group=%li",
(long)sh16,groupNum);
}
*/
// exit if NONE, we probably got the wrong data
if ( got == 0 ) log("db: Are you sure you have the "
"right "
"data in the right directory? "
"Exiting.");
log ( "db: Exiting due to inconsistency.");
g_threads.enableThreads();
return g_conf.m_bypassValidation;
}
log ( LOG_DEBUG, "db: Linkdb passed verification successfully for "
"%li recs.", count );
// DONE
g_threads.enableThreads();
return true;
}
// make a "url" key
key224_t Linkdb::makeKey_uk ( uint32_t linkeeSiteHash32 ,
uint64_t linkeeUrlHash64 ,
bool isLinkSpam ,
unsigned char linkerSiteRank ,
unsigned char linkerHopCount ,
uint32_t linkerIp ,
long long linkerDocId ,
unsigned long discoveryDate ,
unsigned long lostDate ,
bool newAddToOldPage ,
uint32_t linkerSiteHash32 ,
bool isDelete ) {
//if ( linkerSiteRank > LDB_MAXSITERANK ) { char *xx=NULL;*xx=0; }
//if ( linkerHopCount > LDB_MAXHOPCOUNT ) { char *xx=NULL;*xx=0; }
// mask it
linkeeUrlHash64 &= LDB_MAXURLHASH;
key224_t k;
k.n3 = linkeeSiteHash32;
k.n3 <<= 32;
k.n3 |= (linkeeUrlHash64>>15) & 0xffffffff;
// finish the url hash
k.n2 = linkeeUrlHash64 & 0x7fff;
k.n2 <<= 1;
if ( isLinkSpam ) k.n2 |= 0x01;
// make it 8 bits for now even though it only needs 4
k.n2 <<= 8;
k.n2 |= (unsigned char)~linkerSiteRank;
k.n2 <<= 8;
//k.n2 |= linkerHopCount;
// this is now part of the linkerip, steve wants the full ip
k.n2 |= (linkerIp >> 24);
//uint32_t id = ipdom(linkerIp);
//if ( id > 0xffffff ) { char *xx=NULL;*xx=0; }
k.n2 <<= 24;
k.n2 |= (linkerIp & 0x00ffffff);
k.n2 <<= 8;
k.n2 |= (((unsigned long long)linkerDocId) >> 30);
k.n1 = (((unsigned long long)linkerDocId) & 0x3fffffffLL);
// two reserved bits
k.n1 <<= 2;
// sanity checks
//if(discoveryDate && discoveryDate < 1025376000){char *xx=NULL;*xx=0;}
if ( lostDate && lostDate < LINKDBEPOCH){char *xx=NULL;*xx=0;}
// . convert discovery date from utc into days since jan 2008 epoch
// . the number is for jan 2012, so subtract 4 years to do 2008
unsigned long epoch = LINKDBEPOCH;
if ( discoveryDate && discoveryDate < epoch ) { char *xx=NULL;*xx=0; }
unsigned long nd = (discoveryDate - epoch) / 86400;
if ( discoveryDate == 0 ) nd = 0;
// makeEndKey_uk() maxes this out!
if ( nd > 0x3fff ) nd = 0x3fff;
k.n1 <<= 14;
k.n1 |= nd;
// one reserved bit
k.n1 <<= 1;
k.n1 <<= 1;
if ( newAddToOldPage ) k.n1 |= 0x01;
// the "lost" date. 0 if not yet lost.
unsigned long od = (lostDate - LINKDBEPOCH) / 86400;
if ( lostDate == 0 ) od = 0;
// makeEndKey_uk() maxes this out!
if ( od > 0x3fff ) od = 0x3fff;
k.n1 <<= 14;
k.n1 |= od;
// 2 bits of linker site hash
k.n1 <<= 2;
k.n1 |= linkerSiteHash32 >> 30;
// rest of linker site hash
k.n0 = linkerSiteHash32;
// halfbit - unused now!
k.n0 <<= 1;
// delbit
k.n0 <<= 1;
if ( ! isDelete ) k.n0 |= 0x01;
return k;
}
/////////
//
// MSG25 :: getLinkInfo()
//
/////////
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "matches2.h"
// 1MB read size for now
#define READSIZE 1000000
#define MAX_INTERNAL_INLINKS 10
//static void gotRootTitleRecWrapper25 ( void *state ) ;
//static void gotTermFreqWrapper ( void *state ) ;
static void gotListWrapper ( void *state );//, RdbList *list );
static bool gotLinkTextWrapper ( void *state );
//static void sendLinkInfoReplyWrapper ( void *state );//, LinkInfo *info ) ;
//static void gotReplyWrapper25 ( void *state , void *state2 ) ;
Msg25::Msg25() {
m_numRequests = 0;
m_linkSpamOut = 0;
// set minhopcount to unknown
//m_minInlinkerHopCount = -1;
m_numReplyPtrs = 0;
m_linkInfo = NULL;
m_ownReplies = true;
}
Msg25::~Msg25 ( ) {
reset();
}
void Msg25::reset() {
if ( ! m_ownReplies ) m_numReplyPtrs = 0;
for ( long i = 0 ; i < m_numReplyPtrs ; i++ )
mfree ( m_replyPtrs[i], m_replySizes[i], "msg25r");
// reset array count to 0
m_numReplyPtrs = 0;
// . free the linkinfo if we are responsible for it
// . if someone "steals" it from us, they should set this to NULL
//if ( m_linkInfo )
// mfree ( m_linkInfo , m_linkInfo->getStoredSize(),"msg25s");
// this now points into m_linkInfoBuf safebuf, just NULL it
m_linkInfo = NULL;
m_table.reset();
m_ipTable.reset();
m_fullIpTable.reset();
m_firstIpTable.reset();
m_docIdTable.reset();
}
#define MODE_PAGELINKINFO 1
#define MODE_SITELINKINFO 2
// . get the inlinkers to this SITE (any page on this site)
// . use that to compute a site quality
// . also get the inlinkers sorted by date and see how many good inlinkers
// we had since X days ago. (each inlinker needs a pub/birth date)
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . we need the siteRec of the url for merging the linkInfo's
// . NOTE: make sure no input vars are on the stack in case we block
// . reallyGetLinkInfo is set to false if caller does not want it but calls
// us anyway for some reason forgotten...
bool Msg25::getLinkInfo ( char *site ,
char *url ,
// either MODE_PAGELINKINFO or MODE_SITELINKINFO
bool isSiteLinkInfo ,
long ip ,
long long docId ,
char *coll ,
char *qbuf ,
long qbufSize ,
void *state ,
void (* callback)(void *state) ,
bool isInjecting ,
SafeBuf *pbuf ,
XmlDoc *xd ,
//bool printInXml ,
long siteNumInlinks ,
//long sitePop ,
LinkInfo *oldLinkInfo ,
long niceness ,
bool doLinkSpamCheck ,
bool oneVotePerIpDom ,
bool canBeCancelled ,
long lastUpdateTime ,
bool onlyNeedGoodInlinks ,
bool getLinkerTitles ,
long ourHostHash32 ,
long ourDomHash32 ,
SafeBuf *linkInfoBuf ) {
// reset the ip table
reset();
//long mode = MODE_PAGELINKINFO;
//m_printInXml = printInXml;
if ( isSiteLinkInfo ) m_mode = MODE_SITELINKINFO;
else m_mode = MODE_PAGELINKINFO;
m_xd = xd;
m_printInXml = false;
if ( m_xd ) m_printInXml = m_xd->m_printInXml;
// sanity check
if ( ! coll ) { char *xx=NULL; *xx=0; }
m_onlyNeedGoodInlinks = onlyNeedGoodInlinks;
m_getLinkerTitles = getLinkerTitles;
// save safebuf ptr, where we store the link info
m_linkInfoBuf = linkInfoBuf;
if ( ! linkInfoBuf ) { char *xx=NULL;*xx=0; }
// sanity check
if ( m_mode == MODE_PAGELINKINFO && ! docId ) {char *xx=NULL; *xx=0; }
// must have a valid ip
//if ( ! ip || ip == -1 ) { char *xx = NULL; *xx = 0; }
// get collection rec for our collection
CollectionRec *cr = g_collectiondb.getRec ( coll );//, collLen );
// bail if NULL
if ( ! cr ) {
g_errno = ENOTFOUND;
log("build: No collection record found when getting "
"link info.");
return true;
}
m_gettingList = false;
// record this in case we were called by Msg3b with the spiders off
m_spideringEnabled = g_conf.m_spideringEnabled;
m_ourHostHash32 = ourHostHash32;
m_ourDomHash32 = ourDomHash32;
//m_minInlinkerHopCount = -1; // -1 -->unknown
m_niceness = niceness;
m_maxNumLinkers = MAX_LINKERS;
m_errno = 0;
m_numReplyPtrs = 0;
m_bufPtr = m_buf;
m_bufEnd = m_buf + MAX_NOTE_BUF_LEN;
m_dupCount = 0;
m_vectorDups = 0;
m_spamLinks = 0;
m_errors = 0;
m_noText = 0;
m_reciprocal = 0;
m_ipDupsLinkdb = 0;
m_docIdDupsLinkdb = 0;
m_lostLinks = 0;
m_ipDups = 0;
m_linkSpamLinkdb = 0;
//m_url = url;
m_docId = docId;
m_coll = coll;
//m_collLen = collLen;
m_callback = callback;
m_state = state;
m_oneVotePerIpDom = oneVotePerIpDom;
m_doLinkSpamCheck = doLinkSpamCheck;
m_canBeCancelled = canBeCancelled;
m_siteNumInlinks = siteNumInlinks; // -1 --> unknown
//m_sitePop = sitePop; // -1 --> unknown
m_qbuf = qbuf;
m_qbufSize = qbufSize;
m_isInjecting = isInjecting;
m_oldLinkInfo = oldLinkInfo;
m_pbuf = pbuf;
m_ip = ip;
m_top = iptop(m_ip);
m_lastUpdateTime = lastUpdateTime;
m_nextKey.setMin();
m_adBanTable.reset();
m_adBanTable.set(4,0,0,NULL,0,false,m_niceness,"adbans");
m_table.set (4,4,0,NULL,0,false,m_niceness,"msg25tab");
QUICKPOLL(m_niceness);
m_url = url;
m_site = site;
// and the "mid domain hash" so that ibm.com and ibm.ru cannot both
// vote even if from totally different ips
Url u; u.set(url);
char *m = u.getMidDomain();
long mlen = u.getMidDomainLen();
m_midDomHash = hash32 ( m , mlen );
// do not prepend "www." to the root url
m_prependWWW = false;
// we have not done a retry yet
m_retried = false;
// change status
//if ( m_statusPtr ) *m_statusPtr = "consulting linkdb";
// . add a "www" to our url
// . we do this when indexing link: terms as well
// . this allows www.xyz.com & xyz.com to both get the same link text
// . we only index one of those if they both have the same content
// . the problem is that Linkdb::getUrlHash() is what we set
// Links::m_linkHashes[i] to, and that does NOT add "www"
// . ultimately i would say it should add the "www" but only for
// computing the m_linkHashes[i], not for indexing links:?
// . MDW: doesn't seem like we do this anymore...
//Url u2;
//u2.set ( m_url->getUrl() , m_url->getUrlLen() , false/*addWWW?*/);
//log("debug: entering getlinkinfo this=%lx",(long)this);
// then the url/site hash
//uint64_t linkHash64 = (uint64_t) u.getUrlHash64();
m_linkHash64 = (uint64_t) u.getUrlHash64();
//uint32_t hostHash32 = (uint32_t)m_url->getHostHash32();
m_round = 0;
// must have a valid ip
if ( ! ip || ip == -1 ) { //char *xx = NULL; *xx = 0; }
log("linkdb: no inlinks because ip is invalid");
return true;
}
return doReadLoop();
}
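// illustrative sketch (not part of the build): how a caller might drive
// getLinkInfo(). the MyState/gotLinkInfoWrapper names are hypothetical;
// real callers (e.g. XmlDoc.cpp) pass their own state and callback.
// getLinkInfo() returns false if it blocked (the callback fires later)
// and true otherwise, with g_errno set on error.
//
//	static void gotLinkInfoWrapper ( void *state ) {
//		MyState *st = (MyState *)state;
//		// st->m_linkInfoBuf now holds the serialized LinkInfo
//	}
//
//	bool getSiteInlinks ( MyState *st ) {
//		if ( ! st->m_msg25.getLinkInfo (
//			     st->m_site , st->m_url ,
//			     true , // isSiteLinkInfo
//			     st->m_ip , st->m_docId , st->m_coll ,
//			     NULL , 0 , // qbuf, qbufSize
//			     st , gotLinkInfoWrapper ,
//			     false , // isInjecting
//			     NULL , // pbuf
//			     NULL , // xd
//			     st->m_siteNumInlinks ,
//			     NULL , // oldLinkInfo
//			     1 , // niceness
//			     true , // doLinkSpamCheck
//			     true , // oneVotePerIpDom
//			     true , // canBeCancelled
//			     0 , // lastUpdateTime
//			     false , // onlyNeedGoodInlinks
//			     false , // getLinkerTitles
//			     0 , 0 , // ourHostHash32, ourDomHash32
//			     &st->m_linkInfoBuf ) )
//			return false; // blocked, wrapper is called later
//		return true; // did not block
//	}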
bool Msg25::doReadLoop ( ) {
//log("debug: entering doReadLoop this=%lx",(long)this);
// sanity. no double entry.
if ( m_gettingList ) { char *xx=NULL;*xx=0; }
// . get the top X results from this termlist
// . but skip link: terms with a 1 (no link text) for a score
// . these keys are ordered from lowest to highest
key224_t startKey ;
key224_t endKey ;
long siteHash32 = hash32n ( m_site );
// access different parts of linkdb depending on the "mode"
if ( m_mode == MODE_SITELINKINFO ) {
startKey = g_linkdb.makeStartKey_uk ( siteHash32 );
endKey = g_linkdb.makeEndKey_uk ( siteHash32 );
//log("linkdb: getlinkinfo: "
// "site=%s sitehash32=%lu",site,siteHash32);
}
else {
startKey = g_linkdb.makeStartKey_uk (siteHash32,m_linkHash64 );
endKey = g_linkdb.makeEndKey_uk (siteHash32,m_linkHash64 );
}
// resume from where we left off?
if ( m_round > 0 )
//startKey = m_nextKey;
memcpy ( &startKey , &m_nextKey , LDBKS );
// but new links: algo does not need internal links with no link text
// see Links.cpp::hash() for score table
QUICKPOLL(m_niceness);
m_minRecSizes = READSIZE; // MAX_LINKERS_IN_TERMLIST * 10 + 6;
long numFiles = -1;
// NO, DON't restrict because it will mess up the hopcount.
bool includeTree = true;
// what group has this linkdb list?
//unsigned long groupId = getGroupId ( RDB_LINKDB , &startKey );
unsigned long shardNum = getShardNum ( RDB_LINKDB, &startKey );
// use a biased lookup
long numTwins = g_hostdb.getNumHostsPerShard();
long long sectionWidth = (0xffffffff/(long long)numTwins) + 1;
// these are 192 bit keys, top 32 bits are a hash of the url
unsigned long x = siteHash32;//(startKey.n1 >> 32);
long hostNum = x / sectionWidth;
long numHosts = g_hostdb.getNumHostsPerShard();
Host *hosts = g_hostdb.getShard ( shardNum); // Group ( groupId );
if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
long hostId = hosts [ hostNum ].m_hostId ;
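// i.e. the same siteHash32 always maps to the same twin within the
// shard, so that site's linkdb pages stay hot in that host's disk page
// cache (the "bias page cache" option passed to init() above)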
// debug log
if ( g_conf.m_logDebugLinkInfo ) {
char *ms = "page";
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
log("msg25: getting full linkinfo mode=%s site=%s url=%s "
"docid=%lli",
ms,m_site,m_url,m_docId);
}
m_gettingList = true;
// . get the linkdb list
// . we now get the WHOLE list so we can see how many linkers there are
// . we need a high timeout because udp server was getting suspended
// before for 30 seconds and this was timing out and yahoo.com
// was getting spidered w/o any link text -- that's bad.
// Now we hang indefinitely. We also fixed UdpServer to resend
// requests after 30 seconds even though it was fully acked in case
// the receiving host went down and is now back up.
if ( ! m_msg0.getList ( -1 , // hostId, -1 if none
0 , // hostId ip
0 , // hostId port
0 , // max cache age in seconds
false , // addToCache?
RDB_LINKDB ,
m_coll ,
&m_list ,
(char*)&startKey,
(char*)&endKey ,
m_minRecSizes ,
this ,
gotListWrapper ,
m_niceness ,
true , // error correct?
includeTree ,
true , // do merge
hostId , // firstHostId
0 , // startFileNum
numFiles ,
60*60*24*365 )){// timeout of one year
//log("debug: msg0 blocked this=%lx",(long)this);
return false;
}
// all done
m_gettingList = false;
// debug log
if ( g_conf.m_logDebugBuild )
log("build: msg25 call to msg0 did not block");
// return true on error
if ( g_errno ) {
log("build: Had error getting linkers to url %s : %s.",
m_url,mstrerror(g_errno));
return true;
}
// . this returns false if blocked, true otherwise
// . sets g_errno on error
return gotList();
}
void gotListWrapper ( void *state ) { // , RdbList *list ) {
Msg25 *THIS = (Msg25 *) state;
//log("debug: entering gotlistwrapper this=%lx",(long)THIS);
// return if it blocked
// . this calls sendRequests()
// . which can call gotLinkText(NULL) if none sent
// . which can call doReadLoop() if list was not empty (lost linker)
// . which can block on msg0
if ( ! THIS->gotList() ) return;
// error? wait for all replies to come in...
if ( THIS->m_numRequests > THIS->m_numReplies ) {
log("msg25: had error %s numreplies=%li numrequests=%li "
"round=%li",
mstrerror(g_errno),THIS->m_numReplies,THIS->m_numRequests,
THIS->m_round);
return;
}
// the call to gotList() may have launched another msg0 even
// though it did not return false???
// THIS FIXED a double entry core!!!! msg0 would return after
// we had destroyed the xmldoc class behind this!!
if ( THIS->m_gettingList ) {
//log("debug: waiting for msg0 1");
return;
}
//log("debug: calling final callback 1");
// otherwise call callback, g_errno probably is set
THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
}
// . this returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg25::gotList() {
// all done
m_gettingList = false;
// reset # of docIds linking to us
//m_numDocIds = 0;
//log("debug: entering gotlist this=%lx",(long)this);
// return true on error
if ( g_errno ) {
log("build: Had error getting linkers to url %s : %s.",
m_url,mstrerror(g_errno));
return true;
}
// this counts all the inlinks... not inlinking pages/docids
//m_numDocIds = (m_list.getListSize() - 6)/10;
//if ( m_numDocIds < 0 ) m_numDocIds = 0;
// . record the # of hits we got for weighting the score of the
// link text iff it's truncated by MAX_LINKERS
// . TODO: if url is really popular, like yahoo, we should use the
// termFreq of the link: term!
// . TODO: we now only read in first 50k linkers so Msg0::getList()
// doesn't waste space through its stupid buffer pre-allocation.
// it should not preallocate for us since our niceness is over 1
// cuz we don't require a real-time signal handler to read our reply.
m_list.resetListPtr();
// clear this too
m_k = (Inlink *)-1;
// MATT: fix this later maybe....
/*
// all done?
if ( m_list.getListSize() < m_minRecSizes ) return gotTermFreq (false);
// change status
//if ( m_statusPtr ) *m_statusPtr = "getting linkdb term freq";
// returns false if blocked, returns true and sets g_errno
// on error
if ( ! m_msg42.getTermFreq ( m_coll ,
3600 , // maxAge
m_termId ,
this , // state
gotTermFreqWrapper ,
m_niceness ) )
return false;
return gotTermFreq(true);
}
void gotTermFreqWrapper ( void *state ) {
Msg25 *THIS = (Msg25 *)state;
// if blocked, just return
if ( ! THIS->gotTermFreq(true) ) return;
// otherwise call callback, g_errno is probably set
THIS->m_callback ( THIS->m_state , THIS->m_linkInfo );
}
bool Msg25::gotTermFreq ( bool msg42Called ) {
// error?
if ( g_errno ) {
log("build: Msg25 had error getting term freq.");
return true;
}
// was msg42 called?
if ( msg42Called ) {
// set the new one
long long tf = m_msg42.getTermFreq();
logf(LOG_DEBUG,"build: Upping linkers from %li to %lli",
m_numDocIds,tf);
if ( tf > m_numDocIds ) m_numDocIds = tf;
}
*/
// set m_nextKey in case we need to re-call doReadLoop()
//m_list.getLastKey ( (char *)&m_nextKey );
// inc by 1
//m_nextKey += 1;
// we haven't got any responses as of yet or sent any requests
m_numReplies = 0;
m_numRequests = 0;
if ( m_round == 0 ) {
m_linkSpamOut = 0;
m_numFromSameIp = 0;
memset ( m_inUse , 0 , MAX_MSG20_OUTSTANDING );
// use this to dedup ips in linkdb to avoid looking up their
// title recs... saves a lot of lookups
//m_ipTable.set(256);
if (!m_ipTable.set(4,0,256,NULL,0,false,m_niceness,"msg25ips"))
return true;
long long needSlots = m_list.getListSize() / LDBKS;
// wtf?
if ( m_list.getListSize() > READSIZE + 10000 ) {
//char *xx=NULL;*xx=0; }
log("linkdb: read very big linkdb list %li bytes "
"bigger than needed",
m_list.getListSize() - READSIZE );
}
// triple for hash table speed
needSlots *= 3;
// ensure 256 min
if ( needSlots < 256 ) needSlots = 256;
if ( ! m_fullIpTable.set(4,0,needSlots,NULL,0,false,
m_niceness,"msg25ip32") )
return true;
if ( ! m_firstIpTable.set(4,0,needSlots,NULL,0,false,
m_niceness,"msg25fip32") )
return true;
// this too
//m_docIdTable.set(256);
if ( ! m_docIdTable.set(8,0,needSlots,
NULL,0,false,m_niceness,"msg25docid") )
return true;
// . how many link spam inlinks can we accept?
// . they do not contribute to quality
// . they only contribute to link text
// . their number and weights depend on our ROOT QUALITY!
// . we need this because our filters are too stringent!
m_spamCount = 0;
m_spamWeight = 0;
m_maxSpam = 0;
m_numDocIds = 0;
m_cblocks = 0;
m_uniqueIps = 0;
}
// if we are doing site linkinfo, bail now
if ( m_mode == MODE_SITELINKINFO ) return sendRequests();
// when MODE_PAGELINKINFO we must have a site quality for that site
if ( m_siteNumInlinks < 0 ) {char *xx=NULL;*xx=0; }
// shortcut
long n = m_siteNumInlinks;
if ( n >= 1000 ) {m_spamWeight = 90; m_maxSpam = 4000;}
else if ( n >= 900 ) {m_spamWeight = 80; m_maxSpam = 3000;}
else if ( n >= 800 ) {m_spamWeight = 70; m_maxSpam = 2000;}
else if ( n >= 700 ) {m_spamWeight = 55; m_maxSpam = 1000;}
else if ( n >= 600 ) {m_spamWeight = 50; m_maxSpam = 100;}
else if ( n >= 500 ) {m_spamWeight = 15; m_maxSpam = 20;}
else if ( n >= 200 ) {m_spamWeight = 10; m_maxSpam = 15;}
else if ( n >= 70 ) {m_spamWeight = 07; m_maxSpam = 10;}
else if ( n >= 20 ) {m_spamWeight = 05; m_maxSpam = 7;}
/*
for steve i took this out of the key and put in the lower ip byte
// scan list for the minimum hop count of the inlinkers
m_list.resetListPtr();
long minhc = -1;
for ( ; ! m_list.isExhausted() ; m_list.skipCurrentRecord() ) {
// get the key/rec
key224_t key;
m_list.getCurrentKey( (char*)&key );
char hc = g_linkdb.getLinkerHopCount_uk ( &key );
if ( hc >= 0 && hc < minhc ) minhc = hc;
}
// now set our hopcount based on that
if ( minhc >= 0 ) m_minInlinkerHopCount = minhc;
*/
// now send the requests
m_list.resetListPtr();
return sendRequests();
}
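// per-record filtering order in sendRequests() below: once the linkdb
// list is exhausted on round 0 we recycle Inlinks from the old LinkInfo;
// records with a lostDate are dropped; consecutive duplicate docids are
// deduped; unique ips are counted; external inlinks are deduped by ip
// C-block; internal (same C-block) inlinks are capped at
// MAX_INTERNAL_INLINKS; survivors get a Msg20 to fetch their link text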
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg25::sendRequests ( ) {
// . stop if cancelled
// . a niceness of 0 implies we are called from Msg3b, the docid
// reranking tool
// . no, now that can be niceness 0, so use the m_spideringEnabled flag
//if ( ! g_conf.m_spideringEnabled &&
// ! m_isInjecting &&
// ! m_pbuf &&
// m_canBeCancelled ) {
// g_errno = ECANCELLED;
// return true;
//}
unsigned long long lastDocId = 0LL;
//log("debug: entering sendrequests this=%lx",(long)this);
// change status
//if ( m_statusPtr ) *m_statusPtr = "getting link texts";
// smaller clusters cannot afford to launch the full 300 msg20s
// because it can clog up one host!
float ratio = (float)g_hostdb.getNumHosts() / 128.0;
long ourMax = (long)(ratio * (float)MAX_MSG20_OUTSTANDING);
if ( ourMax > MAX_MSG20_OUTSTANDING )
ourMax = MAX_MSG20_OUTSTANDING;
// keep sending requests
while ( 1 == 1 ) {
// breathe
QUICKPOLL ( m_niceness );
// if we still haven't gotten enough good inlinks, quit after
// looking up this many titlerecs
if ( m_numRequests >= MAX_DOCIDS_TO_SAMPLE ) break;
// . we only need at most MAX_LINKERS in our sample
// . but we do keep "losers" until the very end so we can
// remove them in an order-independent fashion to guarantee
// consistency. otherwise, "losers" will depend on the order
// in which the Msg20Replies are received to some degree
if ( m_numReplyPtrs >= MAX_LINKERS ) break;
// do not have more than this many outstanding Msg20s
if ( m_numRequests-m_numReplies >= ourMax ) break;
// we may have pre-allocated the LinkText classes for
// use by currently outstanding Msg20 requests, therefore
// they are not available at this time... m_numReplyPtrs is
// how many replies we have kept.
if ( m_numReplyPtrs+m_numRequests-m_numReplies>=MAX_LINKERS)
break;
// reset g_errno just in case
g_errno = 0;
char isLinkSpam = 0;
//char hc = -1;
long itop ;
unsigned long ip32;
uint64_t docId ;
long discovered = 0;
// was the link lost?
long lostDate = 0;
// . recycle inlinks from the old link info guy
// . this keeps our inlinks persistent!!! very nice...
// . useful for when they disappear on an aggregator site
if ( m_list.isExhausted() && m_round == 0 ) {
// recycle old inlinks at this point
if ( m_k == (Inlink *)-1 ) m_k = NULL;
// get it
m_k = m_oldLinkInfo->getNextInlink ( m_k );
// if none left, we really are done
if ( ! m_k ) break;
// set these
//hc = m_k->m_hopcount;
itop = m_k->m_ip & 0x0000ffff;
ip32 = m_k->m_ip;
isLinkSpam = m_k->m_isLinkSpam;
docId = m_k->m_docId;
discovered = m_k->m_firstIndexedDate;
}
else if ( m_list.isExhausted() && m_round != 0 )
break;
// is this a "url" key?
else if ( m_mode == MODE_PAGELINKINFO ) {
// get the current key if list has more left
key224_t key; m_list.getCurrentKey( &key );
// skip if the bit is not set right
//if ( ! g_linkdb.isUrlKey(&key) ) continue;
//hc = g_linkdb.getLinkerHopCount_uk ( &key );
itop = g_linkdb.getLinkerIp24_uk ( &key );
ip32 = g_linkdb.getLinkerIp_uk ( &key );
isLinkSpam = g_linkdb.isLinkSpam_uk ( &key );
docId = g_linkdb.getLinkerDocId_uk ( &key );
discovered = g_linkdb.getDiscoveryDate_uk(&key);
// is it expired?
lostDate = g_linkdb.getLostDate_uk(&key);
// update this
memcpy ( &m_nextKey , &key , LDBKS );
//if ( ip32+1 < ip32 ) { char *xx=NULL;*xx=0; }
// skip to next ip!
//g_linkdb.setIp32_uk ( &m_nextKey , ip32+1 );
m_nextKey += 1;
}
// otherwise this is a "site" key. we are getting all the
// inlinks to any page on the site...
else {
// get the current key if list has more left
key224_t key; m_list.getCurrentKey( &key );
// show it for debug
//log("key: %s",KEYSTR(&key,LDBKS));
// skip if the bit is not set right
//if ( ! g_linkdb.isSiteKey(&key) ) continue;
//hc = -1;
itop = g_linkdb.getLinkerIp24_uk ( &key );
ip32 = g_linkdb.getLinkerIp_uk ( &key );
//isLinkSpam=g_linkdb.isLinkSpam_sk ( &key );
isLinkSpam = false;
docId = g_linkdb.getLinkerDocId_uk ( &key );
//linkDate = g_linkdb.getDate_sk ( &key );
discovered = g_linkdb.getDiscoveryDate_uk(&key);
// is it expired?
lostDate = g_linkdb.getLostDate_uk(&key);
// update this
memcpy ( &m_nextKey , &key , LDBKS );
//if ( ip32+1 < ip32 ) { char *xx=NULL;*xx=0; }
// skip to next ip!
//g_linkdb.setIp32_uk ( &m_nextKey , ip32+1 );
m_nextKey += 1;
}
// advance to next rec if the list has more to go
if ( ! m_list.isExhausted() ) m_list.skipCurrentRecord();
// clear this if we should
if ( ! m_doLinkSpamCheck ) isLinkSpam = false;
// mangle it so hashtable does not collide so much
//long long dh = hash64h ( docId , docId );
// dedup docid, since we now try to keep old Inlinks from
// the previous LinkInfo. this allows us to preserve RSS
// info, good hopcounts, etc.
// on round 487 this is going OOM at 300MB, so take it out
//if ( m_docIdTable.getSlot ( &dh ) >= 0 ) {
// m_docIdDupsLinkdb++;
// continue;
//}
// add it. TODO: what if this fails?
//if ( ! m_docIdTable.addKey ( &dh ) )
// return true;
// if it is no longer there, just ignore
if ( lostDate ) {
m_lostLinks++;
continue;
}
// try using this to save mem then
if ( docId == lastDocId ) {
m_docIdDupsLinkdb++;
continue;
}
// update this then
lastDocId = docId;
// count unique docids
m_numDocIds++;
//
// get the next docId
//
// . now the 4 hi bits of the score represent special things
// . see Msg18.cpp:125 where we repeat this
// once we got this many "good" spam inlinkers, we should
// get no more! assume the outstanding link spams will be
// successful. they may not, so we must wait on them.
// MDW: take this out
//if(isLinkSpam && m_spamCount + m_linkSpamOut >= m_maxSpam ) {
// // actually, assume they will be successful,
// // and just skip ourselves
// m_linkSpamLinkdb++;
// continue;
//}
// is it the minimum thus far?
// MDW: i saw a bunch of corrupt linkdb recs with hc<0. check.
//if ( hc>=0 &&
// (m_minInlinkerHopCount==-1 || hc<m_minInlinkerHopCount))
// m_minInlinkerHopCount = hc;
// count unique ips for steve's stats
if ( ! m_fullIpTable.isInTable(&ip32) ) {
// return true on error
if ( ! m_fullIpTable.addKey(&ip32) )
return true;
// count it
m_uniqueIps++;
}
// TODO: if inlinker is internal by having the same DOMAIN
// even though a different ip, we should adjust this logic!!
if ( itop != m_top ) {
long slot = m_ipTable.getSlot ( &itop );
if ( slot != -1 ) {m_ipDupsLinkdb++;continue;}
// store it
if ( ! m_ipTable.addKey ( &itop ) )
return true;
// count unique cblock inlinks
m_cblocks++;
}
// if we are local... allow up to 5 votes, weight is diminished
else {
// count your own, only once!
if ( m_numFromSameIp == 0 ) m_cblocks++;
// count it as internal
m_numFromSameIp++;
// only get link text from first 5 internal linkers,
// they will all count as one external linker
if ( m_numFromSameIp > MAX_INTERNAL_INLINKS ) {
m_ipDupsLinkdb++; continue; }
}
// count this request as launched
m_numRequests++;
// if linkspam, count this
if ( isLinkSpam ) m_linkSpamOut++;
// find a msg20 we can use
long j ;
for (j=0 ;j<MAX_MSG20_OUTSTANDING;j++) if (!m_inUse[j]) break;
// sanity check
if ( j >= MAX_MSG20_OUTSTANDING ) { char *xx = NULL; *xx = 0; }
// "claim" it
m_inUse [j] = 1;
// . this will return false if blocks
// . we EXPECT these recs to be there...
// . now pass in the score for the newAlgo
Msg20Request *r = &m_msg20Requests[j];
// clear it. reset to defaults.
r->reset();
// set the request
r->m_getLinkText = true;
r->m_onlyNeedGoodInlinks = m_onlyNeedGoodInlinks;
// is linkee a site? then we will try to find link text
// to any page on that site...
// if we are in site mode, then m_url should be m_site!
if ( m_mode == MODE_PAGELINKINFO ) {
r->m_isSiteLinkInfo = false;
r-> ptr_linkee = m_url;
r->size_linkee = gbstrlen(m_url)+1; // include \0
}
else {
r->m_isSiteLinkInfo = true;
r-> ptr_linkee = m_site;
r->size_linkee = gbstrlen(m_site)+1; // include \0
}
r-> ptr_coll = m_coll;
r->size_coll = gbstrlen(m_coll) + 1; // include \0
r->m_docId = docId;
r->m_expected = true; // false;
r->m_niceness = m_niceness;
r->m_state = r;
r->m_state2 = this;
r->m_j = j;
r->m_callback = gotLinkTextWrapper;
// do NOT get summary stuff!! slows us down...
r->m_maxNumCharsPerLine = 0;
r->m_numSummaryLines = 0;
// get title now for steve
r->m_titleMaxLen = 300;
r->m_summaryMaxLen = 0;
r->m_discoveryDate = discovered;
// buzz sets the query to see if inlinker has the query terms
// so we can set <absScore2>
r->m_langId = langUnknown; // no synonyms i guess
r->ptr_qbuf = m_qbuf;
r->size_qbuf = m_qbufSize;
// These parms should maybe be passed from the calling msg20,
// But, at least for now, buzz always wants these values.
//if (m_qbufSize > 1){
// r->m_hackFixWords = true;
// r->m_hackFixPhrases = true;
// r->m_excludeLinkText = true;
// r->m_excludeMetaText = true;
//}
// place holder used below
r->m_isLinkSpam = isLinkSpam;
// buzz may not want link spam checks! they are pretty clean.
r->m_doLinkSpamCheck = m_doLinkSpamCheck;
// this just gets the LinkInfo class from the TITLEREC
// so that Msg25 should not be called. thus avoiding an
// infinite loop!
// we need the LinkInfo of each inlinker to get the inlinker's
// sitePop and numInlinksToSite, which is needed to call
// makeLinkInfo() below.
r->m_getLinkInfo = true;
r->m_ourHostHash32 = m_ourHostHash32;
r->m_ourDomHash32 = m_ourDomHash32;
// . MAKE A FAKE MSG20REPLY for pre-existing Inlinks
// . the opposite of Inlink::set(Msg20Reply *)
// . we used that as a reference
// . ISSUES:
// . 1. if inlinker gets banned, we still recycle it
// . 2. if ad id gets banned, we still recycle it
// . 3. we cannot dedup by the vectors, because we do not
// store those in the Inlink class (Msg25::isDup())
if ( m_k && m_k != (Inlink *)-1 ) {
Msg20Reply *rep = &m_msg20Replies[j];
rep->reset();
m_k->setMsg20Reply ( rep );
// let receiver know we are a recycle
rep->m_recycled = 1;
// . this returns true if we are done
// . g_errno is set on error, and true is returned
if ( gotLinkText ( r ) ) return true;
// keep going
continue;
}
// debug log
if ( g_conf.m_logDebugLinkInfo ) {
char *ms = "page";
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
log("msg25: getting single link mode=%s site=%s "
"url=%s docid=%lli request=%li",
ms,m_site,m_url,docId,m_numRequests-1);
}
// returns false if blocks, true otherwise
bool status = m_msg20s[j].getSummary ( r ) ;
// breathe
QUICKPOLL(m_niceness);
// if blocked launch another
if ( ! status ) continue;
// . this returns true if we are done
// . g_errno is set on error, and true is returned
if ( gotLinkText ( r ) ) return true;
}
// we may still be waiting on some replies to come in
if ( m_numRequests > m_numReplies ) return false;
// if the list had linkdb recs in it, but we launched no msg20s
// because they were "lost" then we end up here.
/*
if ( m_url )
log("linkdb: encountered %li lost links for url %s "
"rnd=%li",
m_lostLinks,m_url,m_round);
else
log("linkdb: encountered %li lost links for docid %lli "
"rnd=%li",
m_lostLinks,m_docId,m_round);
*/
// . otherwise, we got everyone, so go right to the merge routine
// . returns false if not all replies have been received
// . returns true if done
// . sets g_errno on error
// . if all replies are in then this can call doReadLoop() and
// return false!
return gotLinkText ( NULL );
}
bool gotLinkTextWrapper ( void *state ) { // , LinkTextReply *linkText ) {
Msg20Request *req = (Msg20Request *)state;
// get our Msg25
Msg25 *THIS = (Msg25 *)req->m_state2;
//log("debug: entering gotlinktextwrapper this=%lx",(long)THIS);
// . this returns false if we're still awaiting replies
// . returns true if all replies have been received and processed
if ( THIS->gotLinkText ( req ) ) {
//log("debug: calling final callback 2");
if ( THIS->m_gettingList ) return false;
// . now call callback, we're done
// . g_errno will be set on critical error
THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
return true;
}
// if gotLinkText() called doReadLoop() it blocked calling msg0
if ( THIS->m_gettingList ) return false;
// . try to send more requests
// . return if it blocked
// . shit, this could call doReadLoop() now that we have added
// the lostdate filter, because there will end up being no requests
// sent out even though the list was of positive size, so it
// will try to read the next round of msg0.
if ( ! THIS->sendRequests ( ) ) return false;
// . shit, therefore, return false if we did launch a msg0 after this
if ( THIS->m_gettingList ) return false;
//log("debug: calling final callback 3");
// otherwise we're done
THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
return true;
}
char *getExplanation ( char *note ) {
if ( ! note ) return NULL;
if ( strcmp(note,"good")==0) return NULL;
static char *s_notes[] = {
"same mid domain",
"inlinker's domain, excluding TLD, is same as page it "
"links to",
"linker banned or filtered",
"inlinker's domain has been manually banned",
"no link text",
"inlink contains no text, probably an image",
"banned by ad id",
"inlinking page contains a google ad id that has been "
"manually banned",
"ip dup",
"inlinker is from the same C Block as another inlinker",
"first ip dup",
"first recorded C Block of inlinker matches another inlinker",
"post page",
"inlink is from a page that contains a form tag whose "
"submit url contains character sequences that are indicative "
"of posting a comment, thereby indicating that the inlink "
"could be in a comment section",
"path is cgi",
"inlinker url contains a question mark",
"similar link desc",
"the text surrounding the anchor text of this inlink is "
"too similar to that of another processed inlink",
"similar content",
"the inlinker's page content is "
"too similar to that of another processed inlink",
"doc too big",
"inlinker's page is too large, and was truncated, and "
"might have lost some indicators",
"link chain middle",
"inlink is in the middle of a list of inlinks, without "
"any non-link text separating it, indicative of text ads",
"link chain right",
"inlink is at the end of a list of inlinks, without "
"any non-link text separating it, indicative of text ads",
"link chain left",
"inlink is at the beginning of a list of inlinks, without "
"any non-link text separating it, indicative of text ads",
"near sporny outlink",
"inlink is near another outlink on that page which contains "
"porn words in its domain or url",
"70.8*.",
"inlinker is from an IP address that is of the form "
"70.8*.*.* which is a notorious block of spam",
".info tld",
"inlinker's tld is .info, indicative of spam",
".biz tld",
"inlinker's tld is .biz, indicative of spam",
"textarea tag",
"inlinker page contains a textarea html tag, indicative "
"of being in a comment section",
"stats page",
"inlinker is from a web access stats page",
"has dmoz path",
"inlinker url looks like a dmoz mirror url",
"guestbook in hostname",
"inlinker is from a guestbook site",
"ad table",
"inlink appears to be in a table of ad links",
"duplicateIPCClass",
};
long n = sizeof(s_notes)/ sizeof(char *);
// stop at n-1 since the last entry ("duplicateIPCClass") has no paired
// explanation; this avoids reading past the end of s_notes
for ( long i = 0 ; i + 1 < n ; i += 2 ) {
if ( strcmp(note,s_notes[i]) ) continue;
return s_notes[i+1];
}
if ( strncmp(note,"path has",8) == 0 )
return "inlinker's url contains keywords indicative "
"of a comment page, guestbook page or "
"link exchange";
return
"inlinker's page contains the described text, indicative of "
"being a link exchange or being in a comment section or "
"being an otherwise spammy page";
}
// . returns false if not all replies have been received (or timed out / errored out)
// . returns true if done
// . sets g_errno on error
bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
//log("debug: entering gotlinktext this=%lx",(long)this);
long j = -1;
if ( req ) j = req->m_j;
// get it
Msg20 *m = NULL;
// the reply
Msg20Reply *r = NULL;
// the alloc size of the reply
long rsize = 0;
// the original request
// set the reply
if ( j >= 0 ) {
// get the msg20
m = &m_msg20s[j];
// set the reply
r = m->m_r;
// the reply size
rsize = m->m_replyMaxSize;
// inc # of replies
m_numReplies++;
// get the request
Msg20Request *req = &m_msg20Requests[j];
// discount this if was linkspam
if ( req->m_isLinkSpam ) m_linkSpamOut--;
// "make available" msg20 and msg20Request #j for re-use
m_inUse [ j ] = 0;
// . propagate internal error to g_errno
// . if g_errno was set then the reply will be empty
if ( r && r->m_errno && ! g_errno ) g_errno = r->m_errno;
// if it had an error print it for now
if ( r && r->m_errno )
log("query: msg25: msg20 had error for docid %lli : "
"%s",r->m_docId, mstrerror(r->m_errno));
}
// what is the reason it cannot vote...?
char *note = NULL;
long noteLen = 0;
// assume it CAN VOTE for now
bool good = true;
// just log then reset g_errno if it's set
if ( g_errno ) {
// a dummy docid
long long docId = -1LL;
// set it right
if ( r ) docId = r->m_docId;
// we often restrict link: termlist lookup to indexdb root
// file, so we end up including terms from deleted docs...
// thus we get a lot of ENOTFOUND errors.
// MDW: we no longer do this restriction...
log(LOG_DEBUG,
"build: Got error getting link text from one document: "
"%s. Will have to restart later. docid=%lli.",
mstrerror(g_errno),docId);
// this is a special case
if ( g_errno == ECANCELLED ||
g_errno == ENOMEM ||
g_errno == ENOSLOTS ) {
m_errors++;
if ( m_numReplies < m_numRequests ) return false;
return true;
}
// otherwise, keep going, but this reply can not vote
good = false;
m_errors++;
note = mstrerror ( g_errno );
// reset g_errno
g_errno = 0;
}
// are we an "internal" inlink?
bool internal = false;
if ( r && iptop(r->m_firstIp) == m_top )
internal = true;
if ( r && iptop(r->m_ip) == m_top )
internal = true;
// . if the mid domain hash of the inlinker matches ours, no voting
// . this is set to 0 for recycles
if ( r && good && r->m_midDomHash == m_midDomHash && ! internal ) {
good = false;
m_sameMidDomain++;
note = "same mid domain";
}
// is the inlinker banned?
if ( r && good && r->m_isBanned ) {
// it is no longer good
good = false;
// inc the general count, too
m_spamLinks++;
// add his ad id hash to the table, so if any
// other linker has it, it will be banned by ad id!
if ( r->m_adIdHash ) m_adBanTable.addKey ( &r->m_adIdHash );
// count each *type* of "link spam". the type is given
// by linkText->m_note and is a string...
note = "linker banned or filtered";
}
// breathe
QUICKPOLL(m_niceness);
// get the linker url
Url linker;
if ( r ) linker.set ( r->ptr_ubuf , r->size_ubuf );
// sanity check, Xml::set() requires this...
if ( r&&r->size_rssItem > 0 && r->ptr_rssItem[r->size_rssItem-1]!=0 ) {
char *xx=NULL;*xx=0; }
// . if no link text, count as error
// . linkText->getLinkTextLen()
if ( r && good &&
r->size_linkText <= 0 &&
r->size_rssItem <= 0 &&
// allow if from a ping server because like
// rpc.weblogs.com/shortChanges.xml so we can use
// "inlink==xxx" in the url filters to assign any page linked
// to by a pingserver into a special spider queue. then we can
// spider that page quickly and get its xml feed url, and then
// spider that to get new outlinks of permalinks.
// Well now we use "inpingserver" instead of having to specify
// the "inlink==xxx" expression for every ping server we know.
! linker.isPingServer() ) {
good = false;
m_noText++;
note = "no link text";
}
// banned by way of ad id?
if (r && good&& r->m_adIdHash&&m_adBanTable.getSlot(&r->m_adIdHash)>0){
// it is no longer good
good = false;
// inc the general count, too
m_spamLinks++;
// count each *type* of "link spam". the type is given
// by linkText->m_note and is a string...
note = "banned by ad id";
}
// . if we are linked to by a page on the same ip as the linkee
// then call it a reciprocal link
// . only check if our root quality is < 45%
// . MDW: i disabled this until it can prove more useful
/*
Vector *v5 = NULL;
if ( r ) v5 = r->ptr_vectorBuf5; // linkText->getVector4();
if ( r && good && ! internal && m_rootQuality < 45 ) {
// these are the IPs of the linker's incoming linkers
long numIps = v5->getNumPairHashes();
long ourIp = m_url->getIp();
for ( long i = 0 ; i < numIps ; i++ ) {
long ip = v5->m_pairHashes[i];
if ( ip != ourIp ) continue;
good = false;
m_reciprocal++;
note = "reciprocal link";
break;
}
}
*/
QUICKPOLL(m_niceness);
// discount if LinkText::isLinkSpam() or isLinkSpam2() said it
// should not vote
if ( r && good && ! internal && r->m_isLinkSpam &&
// we can only allow link spam if it is below the max!
++m_spamCount >= m_maxSpam ) {
// it is no longer good
good = false;
// inc the general count, too
m_spamLinks++;
// count each *type* of "link spam". the type is given
// by linkText->m_note and is a string...
note = r-> ptr_note;
noteLen = r->size_note - 1; // includes \0
}
// loop over all the replies we got so far to see if "r" is a dup
// or if another reply is a dup of "r"
long n = m_numReplyPtrs;
// do not do the deduping if no reply given
if ( ! r ) n = 0;
// do not do this if "r" already considered bad
if ( ! good ) n = 0;
// this is the "dup"
Msg20Reply *dup = NULL;
long dupi = -1;
// . we do not actually remove the Msg20Replies at this point because
// this filter is dependent on the order in which we receive the
// Msg20Replies. we do the removal below after all replies are in.
// . NO! not anymore, i don't want to hit MAX_LINKERS and end up
// removing all the dups below and end up with hardly any inlinkers
for ( long i = 0 ; ! internal && i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// get the reply in a ptr
Msg20Reply *p = m_replyPtrs[i];
// is it internal
bool pinternal = false;
if ( iptop(p->m_ip) == m_top ) pinternal = true;
if ( iptop(p->m_firstIp) == m_top ) pinternal = true;
// allow internal inlinks to match
if ( pinternal ) continue;
// is "p" a dup of us? (or we of it?)
char *dupNote = isDup ( r , p ) ;
// if it is not a dup, keep going
if ( ! dupNote ) continue;
// getLoser() returns the lowest-scoring reply of "r" and "p"
Msg20Reply *tmp = getLoser ( r , p );
// is it worse than the current "dup"? if so, update "dup"
dup = getLoser ( tmp , dup );
// get the "i" value
if ( dup == r ) dupi = j;
if ( dup == p ) dupi = i;
// we got a dup
good = false;
note = dupNote;
}
// inc this count
if ( dup ) m_dupCount++;
// if "p" is the lower-scoring dup, put "r" in its place, and then
// set "r" to "p" doing a swap operation
if ( dup && dup != r ) {
// sanity check
if ( dupi < 0 ) { char *xx=NULL;*xx=0; }
// HACK: swap them
Msg20Reply *tmp = m_replyPtrs [dupi];
long tmpSize = m_replySizes[dupi];
m_replyPtrs [dupi] = r;
m_replySizes[dupi] = rsize;
r = tmp;
rsize = tmpSize;
// make Msg20 point to that old "dup" reply
m->m_r = r;
m->m_replyMaxSize = rsize;
}
// breathe
QUICKPOLL(m_niceness);
if ( r && good ) {
long iptop1 = iptop(r->m_ip);
long iptop2 = iptop(r->m_firstIp);
if ( m_firstIpTable.isInTable ( &iptop1 ) ||
m_firstIpTable.isInTable ( &iptop2 ) ) {
good = false;
m_ipDups++;
note = "first ip dup";
}
// add to table. return true with g_errno set on error
if ( ! m_firstIpTable.addKey(&iptop1) ) return true;
if ( ! m_firstIpTable.addKey(&iptop2) ) return true;
}
// BUT do not set "good" to false; we still store it so we can look
// at the link text for indexing purposes anyway. we just do not
// want to count it towards the # of good site inlinks because
// it's internal
if ( internal && ! note ) {
m_ipDups++;
note = "ip dup";
}
if ( r && ! good && ! note )
note = "unknown reason";
// compile the reason it could not vote
if ( r && ! good ) {
// set "noteLen" if not yet set
if ( note && noteLen == 0 ) noteLen = gbstrlen ( note );
// add it to our table
addNote ( note , noteLen , r->m_docId );
// . free the reply since it cannot vote
// . no, it should be auto-freed when the msg20 is re-used
}
bool store = true;
if ( ! good ) store = false;
//if ( ! m_onlyNeedGoodInlinks ) store = true;
// . if doing for display, show good
// . steve needs to see the bad guys as well ppl can learn
if ( m_pbuf ) store = true;
// now for showing recommended link sources let's include the
// bad boys because we might have mislabelled them as bad, b/c maybe
// google thinks they are good! fix for
// XmlDoc::getRecommendedLinksBuf()
if ( ! m_onlyNeedGoodInlinks ) store = true;
// how is this NULL?
if ( ! r ) store = false;
if ( store ) {
// save the reply
m_replyPtrs [m_numReplyPtrs] = r;
m_replySizes[m_numReplyPtrs] = rsize;
// why we do this?
if ( note && ! r->ptr_note ) {
r->ptr_note = note;
r->size_note = noteLen+1;
}
// store this in the reply for convenience
r->m_discoveryDate = req->m_discoveryDate;
m_numReplyPtrs++;
// debug note
//log("linkdb: stored %li msg20replies",m_numReplyPtrs);
// do not allow Msg20 to free it
m->m_r = NULL;
}
// free the reply buf of this msg20 now to save mem because
// we can't send out like 100,000 of these for yahoo.com to find
// less than 1000 good ones!
// tell msg20 to free the reply if not null
if ( m ) m->reset();
// wait for all replies to come in
if ( m_numReplies < m_numRequests ) return false;
//
//
// READ MORE FROM LINKDB to avoid truncation
//
//
// youtube is doing like 180,000 rounds! wtf! limit to 10000
if ( m_list.m_listSize > 0 && // && m_round < 10000 ) {
// no! now we shrink a list by removing dup docids from it
// in Msg0.cpp before sending back to save memory and cpu and
// network. so it can be well below m_minRecSizes and still need
// to go on to the next round
//m_list.m_listSize >= m_minRecSizes &&
m_numReplyPtrs < MAX_LINKERS ) {
// count it
m_round++;
// note it
char *ms = "page";
char *id = m_url;
if ( m_mode == MODE_SITELINKINFO ) {
ms = "site";
id = m_site;
}
// debug
if ( g_conf.m_logDebugLinkInfo ) {
log("linkdb: recalling round=%li for %s=%s",
m_round,ms,m_site);
}
// and re-call
if ( ! doReadLoop() ) return false;
// it did not block!! wtf? i guess it read no more or
// launched no more requests.
//log("linkdb: doreadloop did not block");
}
//
//
// process all "good" Msg20Replies
//
//
/*
// since we may not have called a Msg20 for every docid in the
// linkdb list, extroplate "m_numReplyPtrs" to what it probably should
// have been. it is a non-linear. we could go out to more derivatives,
// m_deltaDiff2, etc. if necessary.
long long extrapolated = m_numReplyPtrs;
long long bonus = m_numReplyPtrs;
long long step = (long)MAX_DOCIDS_TO_SAMPLE * 2 ;
// add in "bonus" X docids sampled
long nd;
for ( nd = m_numReplies ; nd + step <= m_numDocIds ; nd += step ) {
QUICKPOLL(m_niceness);
extrapolated += bonus;
step *= 2;
}
*/
// . do linear estimation of remainder however.
// . hey i don't want to get into crazy logs...
/*
long long rem = m_numDocIds - nd;
if ( step > 0 && rem > 0 ) extrapolated += (bonus * rem) / step;
// sanity check
if ( rem > step ) { char *xx = NULL; *xx = 0; }
// log build msg
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"build: msg25: %s extrapolated=%li "
"goodReplies=%li "
"allReplies=%li",
m_url->getUrl(), (long)extrapolated, (long)m_numReplyPtrs,
(long)m_numReplies);
// sanity check
if ( extrapolated < 0 ) {
if ( g_conf.m_logDebugSpider )
log("build: msg25: extrapolated = %li < 0. Resetting "
"to 0.",(long)extrapolated);
extrapolated = 0;
}
// the x factor
long x = 100;
if ( m_numReplyPtrs > 0 )
x = ((long long)extrapolated * 100LL) / m_numReplyPtrs;
*/
// skip making link info?
//if ( ! m_onlyNeedGoodInlinks ) return true;
// breathe
QUICKPOLL(m_niceness);
// debug log
if ( g_conf.m_logDebugLinkInfo ) {
char *ms = "page";
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
log("msg25: making final linkinfo mode=%s site=%s url=%s "
"docid=%lli",
ms,m_site,m_url,m_docId);
}
// . this returns NULL and sets g_errno on error
// . returns an allocated ptr to a LinkInfo class
// . we are responsible for freeing
// . LinkInfo::getSize() returns the allocated size
m_linkInfo = makeLinkInfo ( m_coll ,
m_ip ,
m_siteNumInlinks ,
//m_sitePop ,
m_replyPtrs ,
m_numReplyPtrs ,
//m_numReplyPtrs, // extrapolated ,
//100, // x ,
m_spamWeight ,
m_oneVotePerIpDom ,
m_docId , // linkee docid
m_lastUpdateTime ,
m_onlyNeedGoodInlinks ,
m_niceness ,
this ,
m_linkInfoBuf );
// return true with g_errno set on error
if ( ! m_linkInfo ) {
log("build: msg25 linkinfo set: %s",mstrerror(g_errno));
return true;
}
// if nothing to print out, be on our way
if ( ! m_pbuf ) return true;
/////////////////////////////////////////
//
// print out for PageParser.cpp
//
/////////////////////////////////////////
// sort by docid so we get consistent output on PageParser.cpp
char sflag = 1;
while ( sflag ) {
sflag = 0;
for ( long i = 1 ; i < m_numReplyPtrs ; i++ ) {
// sort by quality first
char q1 = m_replyPtrs[i-1]->m_siteRank;//docQuality;
char q2 = m_replyPtrs[i ]->m_siteRank;//docQuality;
if ( q1 > q2 ) continue;
// if tied, check docids
long long d1 = m_replyPtrs[i-1]->m_docId;
long long d2 = m_replyPtrs[i ]->m_docId;
if ( d1 == d2 )
log("build: got same docid in msg25 "
"d=%lli url=%s",d1,
m_replyPtrs[i]->ptr_ubuf);
if ( q1 == q2 && d1 <= d2 ) continue;
// swap them
Msg20Reply *tmp = m_replyPtrs [i-1];
long size = m_replySizes [i-1];
m_replyPtrs [i-1] = m_replyPtrs [i ];
m_replySizes[i-1] = m_replySizes [i ];
m_replyPtrs [i ] = tmp;
m_replySizes[i ] = size;
sflag = 1;
}
}
// LinkInfo::set() probably filtered out even more!
//long ng = m_inlinkingDocIdsRead - inlinkDocIdsFiltered;
// how many were filtered by LinkInfo::set()?
//long inlinkingDocIdsFiltered2 = ng - m_linkInfo->getNumInlinks();
struct tm *timeStruct = localtime ( &m_lastUpdateTime );
char buf[64];
strftime ( buf, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
char *ss = "site";
if ( m_mode == MODE_PAGELINKINFO ) ss = "page";
long siteRank = ::getSiteRank ( m_linkInfo->m_numGoodInlinks );
if ( m_printInXml ) { // && m_xd ) {
m_pbuf->safePrintf("\t<desc>inlinks to %s</desc>\n",ss);
m_pbuf->safePrintf("\t<sampleCreatedUTC>%lu"
"</sampleCreatedUTC>\n"
, m_lastUpdateTime
);
char *u = NULL;
if ( m_xd ) u = m_xd->ptr_firstUrl;
if ( u )
m_pbuf->safePrintf("\t<url><![CDATA[%s]]></url>\n",u);
char *site = NULL;
if ( m_xd ) site = m_xd->ptr_site;
if ( site )
m_pbuf->safePrintf("\t<site><![CDATA[%s]]></site>\n",
site);
long long d = 0LL;
if ( m_xd ) d = m_xd->m_docId;
if ( d && d != -1LL )
m_pbuf->safePrintf("\t<docId>%lli</docId>\n",d);
m_pbuf->safePrintf(
"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
"\t<totalSiteInlinksProcessed>%li"
"</totalSiteInlinksProcessed>\n"
"\t<totalGoodSiteInlinksProcessed>%li"
"</totalGoodSiteInlinksProcessed>\n"
"\t<numUniqueCBlocksLinkingToPage>%li"
"</numUniqueCBlocksLinkingToPage>\n"
"\t<numUniqueIpsLinkingToPage>%li"
"</numUniqueIpsLinkingToPage>\n"
, iptoa(m_ip)
// the total # of inlinkers. we may not have
// read all of them from disk though.
, m_numDocIds
, m_linkInfo->m_numGoodInlinks
, m_cblocks
, m_uniqueIps
);
}
else
m_pbuf->safePrintf( "<table width=100%%>"
"<td bgcolor=lightyellow>\n"
"<b>Summary of inlinks to %s "
"%s</b>\n"
"<br><br>"
"<table cellpadding=3 "
"border=1 width=100%%>\n"
"<tr>"
"<td>sample created</td>"
"<td>%s</td>"
"<td>when this info was last "
"computed</td>"
"</tr>\n"
"<tr>"
"<td>IP address</td>"
"<td>%s</td>"
"<td>&nbsp;"
"</td>"
"<td> &nbsp; </td>"
"</tr>\n"
"<tr>"
"<td>total inlinkers</td>"
"<td>%li</td>"
"<td>how many inlinks we have total. "
"Max: %li."
//" Bad docids are removed so may be "
//"less than that max."
"</td>"
"<td> &nbsp; </td>"
"</tr>\n"
"<tr>"
"<td>unique cblocks</td>"
"<td>%li</td>"
"<td>unique EXTERNAL cblock inlinks</td>"
"<td> &nbsp; </td>"
"</tr>\n"
"<tr>"
"<td>unique ips</td>"
"<td>%li</td>"
"<td>unique EXTERNAL IP inlinks</td>"
"<td> &nbsp; </td>"
"</tr>\n"
//"<tr>"
//"<td>sampled inlinkers</td>"
//"<td>%li</td>"
//"<td>how many docs "
//"we sampled for inlink text. "
//"Limited to %li docs.</td>"
//"<td> &nbsp; </td>"
//"</tr>\n"
,
ss,
m_url,
buf, //m_lastUpdateTime,
iptoa(m_ip),
// the total # of inlinkers. we may not
// have read all of them from disk though
m_numDocIds ,
(long)READSIZE/(long)LDBKS,
//(long)MAX_LINKERS_IN_TERMLIST,
// how many docids we read from disk
//(m_list.getListSize()-6)/12 ,
//(long)MAX_DOCIDS_TO_SAMPLE);
m_cblocks,
m_uniqueIps
);
if ( m_mode == MODE_SITELINKINFO && m_printInXml )
m_pbuf->safePrintf("\t<siteRank>%li</siteRank>\n" , siteRank );
// print link spam types
long ns = m_table.getNumSlots();
for ( long i = 0 ; i < ns ; i++ ) {
// skip empty slots
if ( m_table.isEmpty(i) ) continue;
// who is in this slot
NoteEntry *e = *(NoteEntry **)m_table.getValueFromSlot(i);
char *exp = getExplanation ( e->m_note );
// show it
if ( m_printInXml ) {
m_pbuf->safePrintf ( "\t<inlinkStat>\n");
m_pbuf->safePrintf ( "\t\t<name><![CDATA[" );
//m_pbuf->htmlEncode(e->m_note,gbstrlen(e->m_note),0);
m_pbuf->safeStrcpy ( e->m_note );
m_pbuf->safePrintf ( "]]></name>\n");
if ( exp )
m_pbuf->safePrintf ( "\t\t<explanation>"
"<![CDATA[%s]]>"
"</explanation>\n",
exp);
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
e->m_count );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
else {
m_pbuf->safePrintf ( "<tr><td>%s", e->m_note );
//if ( exp )
// m_pbuf->safePrintf ( " - %s", exp );
m_pbuf->safePrintf("</td>");
m_pbuf->safePrintf (
"<td><font color=red>%li</font>"
"</td>"
"<td>reason could not vote</td>"
"<td>"
, e->m_count );
}
// print some docids that had this problem
for ( long j = 0 ; j < MAX_ENTRY_DOCIDS ; j++ ) {
if ( e->m_docIds[j] == -1LL ) break;
if ( ! m_printInXml )
m_pbuf->safePrintf ("<a href=\"/master/titledb"
"?c=%s&d=%lli\">"
"%li</a> ",
m_coll,e->m_docIds[j],j);
}
if ( ! m_printInXml )
m_pbuf->safePrintf ( "&nbsp; </td></tr>\n" );
}
if ( ! m_printInXml )
m_pbuf->safePrintf(
"<tr>"
"<td>ip dup</td>"
"<td><font color=red>%li</font></td>"
"<td>"
"basically the same ip, but we looked "
"it up anyway."
"</td>"
"</tr>\n"
"<tr>"
"<td>ip dup linkdb</td>"
"<td>%li</td>"
"<td>"
"linkdb saved us from having to "
"look up this many title recs from the "
"same ip C block.</td>"
"</tr>\n"
"<tr>"
"<td>docid dup linkdb</td>"
"<td>%li</td>"
"<td>"
"linkdb saved us from having to "
"look up this many title recs from the "
"same docid.</td>"
"</tr>\n"
"<tr>"
"<td>link spam linkdb</td>"
"<td>%li</td>"
"<td>"
"linkdb saved us from having to "
"look up this many title recs because "
"they were pre-identified as link "
"spam.</td>"
"</tr>\n"
"<tr>"
"<td><b>good</b></td>"
"<td><b>%li</b></td>"
"<td>"
//"may include anomalies and some "
//"link farms discovered later. "
"# inlinkers with positive weight"
//"limited to MAX_LINKERS = %li"
"</td>"
"<td> &nbsp; </td>"
"</tr>\n"
/*
"<tr>"
"<td>good extrapolated</td>"
"<td>%li</td>"
"<td>extrapolate the good links to get "
"around the MAX_LINKERS limitation</td>"
"<td> &nbsp; </td>"
"</tr>\n"
"<tr>"
"<td>X factor (not linear!)</td>"
"<td>%li%%</td>"
"<td>~ good extrapolated / good = "
"%li / %li</td>"
"<td> &nbsp; </td>"
"</tr>\n"
*/
,
(long)m_ipDups ,
(long)m_ipDupsLinkdb ,
(long)m_docIdDupsLinkdb ,
(long)m_linkSpamLinkdb ,
m_linkInfo->m_numGoodInlinks
// good and max
//(long)m_linkInfo->getNumInlinks() ,
);
if ( m_mode == MODE_SITELINKINFO && ! m_printInXml )
m_pbuf->safePrintf(
"<tr><td><b>siterank</b></td>"
"<td><b>%li</b></td>"
"<td>based on # of good inlinks</td>"
"</tr>",
siteRank
);
if ( ! m_printInXml )
m_pbuf->safePrintf("</table>"
"<br>"
"<br>" );
// xml?
if ( m_printInXml && m_ipDups ) {
// ip dups
m_pbuf->safePrintf ( "\t<inlinkStat>\n"
"\t\t<name><![CDATA[");
m_pbuf->safePrintf ( "duplicateIPCClass" );
m_pbuf->safePrintf ( "]]></name>\n");
m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
m_pbuf->safePrintf ( "inlinker is form the same C Block "
"as another inlinker we processed");
m_pbuf->safePrintf ( "]]></explanation>\n");
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
m_ipDups );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
if ( m_printInXml && m_ipDupsLinkdb ) {
// ip dups
m_pbuf->safePrintf ( "\t<inlinkStat>\n"
"\t\t<name><![CDATA[");
m_pbuf->safePrintf ( "duplicateIPCClass" );
m_pbuf->safePrintf ( "]]></name>\n");
m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
m_pbuf->safePrintf ( "inlinker is form the same C Block "
"as another inlinker we processed");
m_pbuf->safePrintf ( "]]></explanation>\n");
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
m_ipDupsLinkdb );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
if ( m_printInXml && m_docIdDupsLinkdb ) {
		// docid dups
m_pbuf->safePrintf ( "\t<inlinkStat>\n"
"\t\t<name><![CDATA[");
m_pbuf->safePrintf ( "duplicateDocId" );
m_pbuf->safePrintf ( "]]></name>\n");
m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
m_pbuf->safePrintf ( "inlinker is on the same page "
"as another inlinker we processed");
m_pbuf->safePrintf ( "]]></explanation>\n");
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
				     m_docIdDupsLinkdb );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
if ( m_printInXml && m_linkSpamLinkdb ) {
// link spam
m_pbuf->safePrintf ( "\t<inlinkStat>\n"
"\t\t<name><![CDATA[");
m_pbuf->safePrintf ( "generalLinkSpam" );
m_pbuf->safePrintf ( "]]></name>\n");
m_pbuf->safePrintf ( "\t\t<count>%li</count>\n",
m_linkSpamLinkdb );
m_pbuf->safePrintf ( "\t</inlinkStat>\n");
}
char *tt = "";
if ( m_mode == MODE_SITELINKINFO ) tt = "site ";
if ( ! m_printInXml ) {
m_pbuf->safePrintf( "<table cellpadding=3 border=1>"
"<tr>"
"<td colspan=20>Inlinks "
"to %s%s &nbsp; (IP=%s) " // pagePop=%li "
//"sitePop=%li numLinksToSite=%li)"
"</td>"
"</tr>\n"
"<tr>"
"<td>#</td>"
"<td>docId</td>"
"<td>note</td>"
"<td>url</td>"
"<td>site</td>"
"<td>title</td>"
//"<td>reason</td>"
"<td>IP</td>"
"<td>firstIP</td>"
//"<td>external</td>"
"<td>lang</td>"
"<td>discovered</td>"
"<td>pubdate</td>"
"<td>hop count</td>"
"<td>site rank</td>"
"<td># words in link text</td>"
"<td>link text bytes</td>"
"<td>link text</td>",
tt,m_url,iptoa(m_ip));
if ( m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("<td>link url</td>");
m_pbuf->safePrintf("<td>neighborhood</td>"
"</tr>\n" );
}
//CollectionRec *cr = g_collectiondb.getRec ( m_coll );
// print out each Inlink/Msg20Reply
for ( long i = 0 ; i < m_numReplyPtrs ; i++ ) {
// point to a reply
Msg20Reply *r = m_replyPtrs[i];
// are we internal
bool internal = false;
if ( (r->m_ip&0x0000ffff) == (m_ip&0x0000ffff) )
internal = true;
if ( (r->m_firstIp&0x0000ffff) == (m_ip&0x0000ffff))
internal = true;
// the "external" string
//char *ext = "Y"; if ( internal ) ext = "N";
// are we an "anomaly"?
char *note = r->ptr_note;
if ( r->m_isLinkSpam && !note )
note = "unknown";
// get our ip as a string
//char *ips = iptoa(r->m_ip);
// print the link text itself
char *txt = r->ptr_linkText;
// get length of link text
long tlen = r->size_linkText;
if ( tlen > 0 ) tlen--;
//float weight = 1.0;
//if ( note ) weight = 0.0;
//if ( internal ) weight = 0.0;
// datedb date
char dbuf[128];
if ( r->m_datedbDate > 1 )
sprintf(dbuf,"%s UTC",
asctime ( gmtime(&r->m_datedbDate )));
else
sprintf(dbuf,"---");
char discBuf[128];
long dd = r->m_discoveryDate;
if ( dd ) {
struct tm *timeStruct = gmtime ( &dd );
strftime ( discBuf, 128 , "<nobr>%b %d %Y</nobr>" ,
timeStruct);
}
else
sprintf(discBuf,"---");
char *title = r->ptr_tbuf;
if ( ! title ) title = "";
		// show the linking docid, then its weight
if ( m_printInXml ) {
char *ns = note;
if ( ! note ) ns = "good";
//if ( internal ) note = "internal";
m_pbuf->safePrintf("\t<inlink>\n"
"\t\t<docId>%lli</docId>\n"
"\t\t<url><![CDATA[%s]]></url>\n"
"\t\t<site><![CDATA[%s]]></site>\n"
"\t\t<title><![CDATA[%s]]></title>\n"
//"\t\t<weight>%.01f</weight>\n"
"\t\t<note><![CDATA[%s]]>"
"</note>\n"
, r->m_docId
, r->ptr_ubuf
, r->ptr_site
, title
//, weight
, ns
);
// get explanation of note
char *exp = getExplanation ( ns );
if ( exp )
m_pbuf->safePrintf("\t\t<explanation>"
"<![CDATA[%s]]>"
"</explanation>\n"
, exp );
m_pbuf->safePrintf("\t\t<ipAddress>"
"<![CDATA[%s]]>"
"</ipAddress>\n"
,iptoa(r->m_ip) );
m_pbuf->safePrintf("\t\t<firstIpAddress>"
"<![CDATA[%s]]>"
"</firstIpAddress>\n"
,iptoa(r->m_firstIp) );
m_pbuf->safePrintf(
"\t\t<onSite>%li</onSite>\n"
"\t\t<discoveryDateUTC>%lu"
"</discoveryDateUTC>\n"
"\t\t<language><![CDATA[%s]]>"
"</language>\n"
"\t\t<siteRank>%li</siteRank>\n"
, (long)internal
, dd
, getLanguageString(r->m_language)
, (long)r->m_siteRank
);
m_pbuf->safePrintf("\t\t<linkText><![CDATA[");
m_pbuf->htmlEncode ( txt,tlen,0);
m_pbuf->safePrintf("]]>"
"</linkText>\n"
);
if ( m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("\t<linkUrl><![CDATA[%s]]>"
"</linkUrl>\n",
r->ptr_linkUrl);
m_pbuf->safePrintf(
"\t</inlink>\n"
);
continue;
}
m_pbuf->safePrintf( "<tr>"
"<td>%li</td>" // #i
"<td>"
"<a href=\"/print?page=1&d=%lli\">"
"%lli</a></td>" // docid
"<td><nobr>"//%.1f"
,i+1
,r->m_docId
,r->m_docId
//,weight
);
if ( note )
m_pbuf->safePrintf("%s", note );
else
m_pbuf->safePrintf("<b>good</b>");
m_pbuf->safePrintf( "</nobr></td>"//wghtnte
"<td>%s</td>" // url
"<td>%s</td>" // site
"<td>%s</td>", // title
r->ptr_ubuf,
r->ptr_site,
title);
m_pbuf->safePrintf("<td><a href=\"/search?q=ip%%3A"
"%s&c=%s&n=200\">%s</a></td>" // ip
, iptoa(r->m_ip)
, m_coll
, iptoa(r->m_ip)
);
m_pbuf->safePrintf("<td>%s</td>"
, iptoa(r->m_firstIp)
);
m_pbuf->safePrintf( //"<td>%s</td>" // external
"<td>%s</td>" // language
"<td>%s</td>" // discoverydate
"<td>%s</td>" // datedbdate
"<td>%li</td>" // hopcount
"<td><font color=red><b>%li"
"</b></font></td>" // site rank
"<td>%li</td>" // nw
"<td>%li</td>" // textLen
"<td><nobr>", // text
//ext,
getLanguageString(r->m_language),
discBuf,
dbuf,//r->m_datedbDate,
(long)r->m_hopcount,
(long)r->m_siteRank, // docQuality,
(long)r->m_linkTextNumWords ,
tlen );
// only bold if good
if ( ! note )
m_pbuf->safePrintf("<b>");
// this is in utf8 already
m_pbuf->safeMemcpy ( txt , tlen );
// only bold if good
if ( ! note )
m_pbuf->safePrintf("</b>");
// wrap it up
m_pbuf->safePrintf("</nobr></td>");
// print url that is linked to in the case of site inlinks
if ( m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("<td>%s</td>",r->ptr_linkUrl);
// print the neighborhood
m_pbuf->safePrintf("<td>");
txt = r->ptr_surroundingText;
tlen = r->size_surroundingText - 1;
if(!txt) {
m_pbuf->safePrintf("--\n");
m_pbuf->safePrintf("</td></tr>\n");
continue;
}
// this is utf8 already
m_pbuf->safeMemcpy ( txt , tlen );
m_pbuf->safePrintf("</td></tr>\n");
}
if ( ! m_printInXml ) {
m_pbuf->safePrintf( "</table>\n<br>\n" );
m_pbuf->safePrintf( "</td></table>\n<br><br>\n" );
}
// print site rank
if ( m_printInXml && m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("\t<siteRankTable>\n");
if ( ! m_printInXml && m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("<table border=1>"
"<tr><td colspan=2>"
"<center>siteRankTable</center>"
"</td></tr>"
"<tr><td># good inlinks</td>"
"<td>siteRank</td></tr>"
);
// print site rank table
long lastsr = -1;
for ( long i = 0 ; i < 11000 && m_mode == MODE_SITELINKINFO ; i++ ) {
long sr = ::getSiteRank ( i );
if ( sr == lastsr ) continue;
lastsr = sr;
if ( m_printInXml )
m_pbuf->safePrintf("\t\t<row>"
"\t\t\t<numInlinks>%li"
"</numInlinks>\n"
"\t\t\t<siteRank>%li"
"</siteRank>\n"
"\t\t</row>\n"
,i,sr);
else
m_pbuf->safePrintf("<tr><td>%li</td><td>%li</td></tr>"
,i,sr);
}
if ( m_printInXml && m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("\t</siteRankTable>\n");
else if ( m_mode == MODE_SITELINKINFO )
m_pbuf->safePrintf("</table>"
"<br>" );
//m_pbuf->safePrintf("<b>*</b><i>The maximum of these two weights "
// "is used.</i><br><br>");
return true;
}
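// . illustrative sketch only (not part of the build): isDup() and
//   getLoser() below are meant to be used together when deduping two
//   inlinker replies; the exact call site and names here are hypothetical:
//
//       char *note = isDup ( r , p );       // NULL means "not a dup"
//       if ( note ) {
//               Msg20Reply *loser = getLoser ( r , p );
//               loser->ptr_note = note;     // record why it cannot vote
//       }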
// . return the "worst" of the two Msg20Replies
// . in the case of a dup, we use this to determine which one is kicked out
// . if one is NULL, return the other
Msg20Reply *Msg25::getLoser ( Msg20Reply *r , Msg20Reply *p ) {
if ( ! p ) return r;
if ( ! r ) return p;
// if "r" is internal, but p is not, r is the loser
bool rinternal = false;
bool pinternal = false;
if ( iptop(r->m_ip ) == m_top ) rinternal = true;
if ( iptop(r->m_firstIp) == m_top ) rinternal = true;
if ( iptop(p->m_ip ) == m_top ) pinternal = true;
if ( iptop(p->m_firstIp) == m_top ) pinternal = true;
if ( rinternal && ! pinternal )
return r;
// and vice versa
if ( pinternal && ! rinternal )
return p;
// then resort to doc quality
//if ( p->m_docQuality < r->m_docQuality ) return p;
//if ( r->m_docQuality < p->m_docQuality ) return r;
if ( p->m_siteRank < r->m_siteRank ) return p;
if ( r->m_siteRank < p->m_siteRank ) return r;
// . if they had the same quality... check docid
// . the lower the docid, the "better" it is. this behavior
// is opposite of the quality behavior.
if ( p->m_docId > r->m_docId ) return p;
// fall back to r then
return r;
}
// . is "p" a dup of "r"?
// . we will kick out the worst one so it cannot vote
// . returns NULL if not a dup
// . returns NULL with g_errno set on error
char *Msg25::isDup ( Msg20Reply *r , Msg20Reply *p ) {
// reset this
g_errno = 0;
// do ip tops match? (top 2 bytes of ip addresses)
bool internal = false;
if ( iptop(p->m_ip) == m_top ) internal = true;
if ( iptop(p->m_firstIp) == m_top ) internal = true;
if ( internal && m_oneVotePerIpDom )
return "ip dup";
// only one ad id
if ( r->m_adIdHash && r->m_adIdHash == p->m_adIdHash )
return "same ad id";
/*
// see if he is too similar to another, if so he is not a good voter
Vector *v1 = (Vector *)r->ptr_vector1;
Vector *v2 = (Vector *)r->ptr_vector2;
Vector *v3 = (Vector *)r->ptr_vector3;
// get vectors for Msg20Reply "p"
Vector *x1 = (Vector *)p->ptr_vector1;
Vector *x2 = (Vector *)p->ptr_vector2;
Vector *x3 = (Vector *)p->ptr_vector3;
// doc j is 0% to 100% similar to doc i
// . but we need to remove the wordpairs found
// in common so they aren't used against
// another doc, so we say 'true' here
// . returns -1 and sets g_errno on error
// . vX vectors can be NULL if the linker was "linkSpam" because
// Msg20.cpp's handler does not set them in that case
long p1 = 0;
long p2 = 0;
long p3 = 0;
if ( v1 && x1 ) p1 = v1->getLinkBrotherProbability ( x1 , false);
// only consider p2 if each vector is beefy. these
// can be small because these vectors represent the
// word pairs just to the right of the link in the
// content, and before any "breaking" tag thereafter.
	// no, there are too many little ads, so disregard the beefy
// requirement.
if ( v2 && x2 && v2->m_numPairHashes >= 1 && x2->m_numPairHashes >= 1 )
p2 = v2->getLinkBrotherProbability(x2,false);
// compare tag id pair vectors
if ( v3 && x3 && v3->m_numPairHashes >= 2 && x3->m_numPairHashes >= 2 )
p3 = v3->getLinkBrotherProbability(x3,false);
*/
// see if he is too similar to another, if so he is not a good voter
long *v1 = (long *)r->ptr_vector1;
long *v2 = (long *)r->ptr_vector2;
long *v3 = (long *)r->ptr_vector3;
long nv1 = r->size_vector1 / 4;
long nv2 = r->size_vector2 / 4;
long nv3 = r->size_vector3 / 4;
// get vectors for Msg20Reply "p"
long *x1 = (long *)p->ptr_vector1;
long *x2 = (long *)p->ptr_vector2;
long *x3 = (long *)p->ptr_vector3;
long nx1 = p->size_vector1 / 4;
long nx2 = p->size_vector2 / 4;
long nx3 = p->size_vector3 / 4;
// doc j is 0% to 100% similar to doc i
// . but we need to remove the wordpairs found
// in common so they aren't used against
// another doc, so we say 'true' here
// . returns -1 and sets g_errno on error
// . vX vectors can be NULL if the linker was "linkSpam" because
// Msg20.cpp's handler does not set them in that case
// compare now for sanity!
//long p1 = 0;
//long p2 = 0;
//long p3 = 0;
// short cut
//long ni = m_niceness;
//if ( v1 && x1 && nv1 >= 2 && nx1 >= 2 )
// p1 = (long)computeSimilarity (v1,x1,NULL,NULL,NULL,ni);
// only consider p2 if each vector is beefy. these
// can be small because these vectors represent the
// word pairs just to the right of the link in the
// content, and before any "breaking" tag thereafter.
	// no, there are too many little ads, so disregard the beefy
// requirement.
//if ( v2 && x2 && nv2 >= 2 && nx2 >= 2 )
// p2 = (long)computeSimilarity (v2,x2,NULL,NULL,NULL,ni);
// compare tag id pair vectors
//if ( v3 && x3 && nv3 >= 2 && nx3 >= 2 )
// p3 = (long)computeSimilarity (v3,x3,NULL,NULL,NULL,ni);
//if ( p1 >= 80 ) return "similar content";
//if ( p2 >= 80 ) return "similar link desc";
//if ( p3 >= 100 ) return "similar tag template";
// these count the terminating 0 long as a component
if ( v1 && x1 && nv1 >= 2 && nx1 >= 2 ) {
//p1 = (long)computeSimilarity (v1,x1,NULL,NULL,NULL,ni);
// are these two vecs 80% or more similar?
if ( isSimilar_sorted (v1,x1,nv1,nx1,80,m_niceness) ) {
//if ( p1 < 80 )
// log("test p1 failed");
return "similar content";
}
}
//if ( p1 >= 80 )
// log("test p1 failed1");
// only consider p2 if each vector is beefy. these
// can be small because these vectors represent the
// word pairs just to the right of the link in the
// content, and before any "breaking" tag thereafter.
	// no, there are too many little ads, so disregard the beefy
// requirement.
if ( v2 && x2 && nv2 >= 2 && nx2 >= 2 ) {
//p2 = (long)computeSimilarity (v2,x2,NULL,NULL,NULL,ni);
// are these two vecs 80% or more similar?
if ( isSimilar_sorted (v2,x2,nv2,nx2,80,m_niceness) ) {
//if ( p2 < 80 )
// log("test p2 failed");
return "similar link desc";
}
}
//if ( p2 >= 80 )
// log("test p2 failed2");
// compare tag id pair vectors
if ( v3 && x3 && nv3 >= 2 && nx3 >= 2 ) {
//p3 = (long)computeSimilarity (v3,x3,NULL,NULL,NULL,ni);
// are these two vecs 80% or more similar?
if ( isSimilar_sorted (v3,x3,nv3,nx3,100,m_niceness) ) {
//if ( p3 < 100 )
// log("test p3 failed");
return "similar tag template";
}
}
//if ( p3 >= 100 )
// log("test p3 failed2");
return NULL;
// compare the ip tops (ip tops of the inlinks)
//long sum4 = v4->m_numPairHashes + x4->m_numPairHashes;
// each must have at least this many
//sum4 /= 3;
// at least 4 inlinking ips each...
//if ( sum4 < 4 ) sum4 = 4;
// check it
//if ( v4->m_numPairHashes >= sum4 &&
// x4->m_numPairHashes >= sum4 )
// p4 = v4->getLinkBrotherProbability(x4,false);
/*
// sensitivity settings
if ( p1 >= 80 ) return "similar content";
if ( p2 >= 80 ) return "similar link desc";
if ( p3 >= 100 ) return "similar tag template";
//if ( p4 >= 80 ) return "similar incoming ip shingle";
//if ( p4 >= 80 ) {
// // get the link text that is residing
// logf(LOG_DEBUG,"build: ip shingle sim2 of %li%% for "
// "nhi=%li nhj=%li di=%llu dj=%llu",
// (long)p4,
// v4->m_numPairHashes,x4->m_numPairHashes,
// rold->getDocId(),rnew->getDocId());
//}
// if no memory pN will be -1
if ( p1 < 0 || p2 < 0 || p3 < 0 )
log("build: Msg25: Could not perform link spam "
"removal: %s.",mstrerror(g_errno));
// all done, it was not a dup, but g_errno MAY be set
return NULL;
*/
}
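// . tally the reason an inlinker could not vote (e.g. "ip dup",
//   "similar content") in m_table, keyed by a hash of the note string
// . each NoteEntry keeps a count and up to MAX_ENTRY_DOCIDS sample docids
//   so the tables printed above can show example linkers for each reason
// . returns false if the note could not be added, true otherwise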
bool Msg25::addNote ( char *note , long noteLen , long long docId ) {
// return right away if no note
if ( ! note || noteLen <= 0 ) return true;
// get hash
long h = hash32 ( note , noteLen );
NoteEntry **pentry = (NoteEntry **)m_table.getValue ( &h );
// if this "type" has not been recorded, add it
if ( ! pentry && h ) {
char *p = m_bufPtr;
if ( p + sizeof(NoteEntry) + noteLen + 1 >= m_bufEnd ) {
log("build: increase buf size in Msg25.");
char *xx = NULL; *xx = 0;
}
// store the entry
NoteEntry *e = (NoteEntry *)p;
p += sizeof(NoteEntry);
// init that entry
e->m_count = 1;
e->m_note = p;
e->m_docIds[0] = docId;
e->m_docIds[1] = -1LL;
// store note into the buffer, NULL terminated
memcpy ( p , note , noteLen ); p += noteLen;
*p++ = '\0';
// add to the table
long slot = -1;
if ( ! m_table.addKey(&h,&e,&slot))
log("build: Msg25 could not add note.");
// did we add successfully?
if ( slot < 0 ) return false;
// get the ptr to the stuff
//val = (char *)m_table.getValueFromSlot(slot);
// advance to next spot
m_bufPtr = p;
return true;
}
// cast it
NoteEntry *entry = *pentry;
// get the count
entry->m_count++;
// add docids to the list
for ( long i = 0 ; i < MAX_ENTRY_DOCIDS ; i++ ) {
// skip if not empty
if ( entry->m_docIds[i] != -1LL ) continue;
// take it over, its empty if it is -1LL
entry->m_docIds[i] = docId;
// next one should be -1 now
if ( i + 1 < MAX_ENTRY_DOCIDS ) entry->m_docIds[i+1] = -1LL;
// all done
break;
}
// increase the count
//if ( val ) *(long *)val = *(long *)val + 1;
return true;
}
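// (typical call, sketched here for illustration only: when a reply is
//  rejected we record the reason with something like
//  addNote ( note , gbstrlen(note) , r->m_docId ) so it shows up in the
//  stats tables printed above; the actual call sites are elsewhere in
//  the Msg25 code)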
/*
// . the ip to which we send this request must allow us to make udp requests
// . let's also have a separate list of allowable ips (not just admin ips)
// for which gigablast will accept udp requests
bool Msg25::getPageLinkInfo2 ( Url *url ,
char *coll ,
char *remoteColl ,
void *state ,
void (* callback)(void *state),
bool doLinkSpamCheck ,
bool oneVotePerIpDom ,
bool canBeCancelled ) {
//linkInfo->reset();
// sanity check
if ( ! coll ) { char *xx = NULL; *xx = 0; }
// get hostdb to use
//long collLen = gbstrlen(coll);
CollectionRec *cr = g_collectiondb.getRec ( coll );//, collLen );
Hostdb *hostdb = &g_hostdb;
if ( cr->m_importFromHosts2Conf ) hostdb = &g_hostdb2;
// sanity check
//if ( g_hostdb2.m_numHosts == 0 ) {
if ( hostdb->m_numHosts == 0 ) {
if ( m_linkInfo )
mfree(m_linkInfo,m_linkInfo->getStoredSize(),"msg25s");
m_linkInfo = NULL;
//g_errno = EBADENGINEER;
static bool s_printed = false;
if ( s_printed ) return true;
s_printed = true;
log("build: No hostdb2.conf to get secondary link info from.");
return true;
}
// watch out for bogus urls
if ( url->getHostLen() <= 0 ) {
g_errno = EBADENGINEER;
log("build: Url %s has no hostname.",url->getUrl());
return true;
}
// save callback info
m_state = state;
m_callback = callback;
m_url = url;
//m_linkInfo = linkInfo;
long remoteCollLen = 0;
if ( remoteColl ) remoteCollLen = gbstrlen ( remoteColl );
// assign it in case somebody uses it
m_coll = coll;
//m_collLen = collLen;
// make a Msg25 request for fresh link info
char *p = m_request;
// store url
memcpy ( p , url->getUrl() , url->getUrlLen() );
// skip over url
p += url->getUrlLen();
// store \0
*p++ = '\0';
// store remote coll
memcpy ( p , remoteColl , remoteCollLen );
// skip over it
p += remoteCollLen;
// store \0
*p++ = '\0';
// store ip
*(long *)p = m_ip; p += 4; // url->getIp(); p += 4;
// siteNumInlinks (bogus)
*(long *)p = 0; p += 4;
// sitePop (bogus)
*(long *)p = 0; p += 4;
// the last update time
*(long *)p = m_lastUpdateTime; p += 4;
// . store a BOGUS root quality now so gk cluster won't force a core
// because the rootQuality < 0
// . older clusters like gk will use it to compute quality, but we
	// discard that info now, we just want the number of extrapolated
// inlinks to use in Msg16.cpp's computeQuality() function.
// *(char *)p = 0; p += 1;
// store flags
*p = 0;
if ( doLinkSpamCheck ) *p |= 0x02;
if ( oneVotePerIpDom ) *p |= 0x08;
if ( canBeCancelled ) *p |= 0x40;
p++;
// get size of request
m_requestSize = p - m_request;
// sanity check
if ( m_requestSize > MSG25_MAX_REQUEST_SIZE ) {
char *xx = NULL; *xx = 0; }
// use the group that has this url's title rec local, if it exists
//m_groupId = g_titledb.getGroupIdForDatil(url,hostdb);//&g_hostdb2);
//m_probDocId = g_titledb.getProbableDocIdForDatil ( url );
m_probDocId = g_titledb.getProbableDocId ( url );
m_groupId = getGroupIdFromDocId ( m_probDocId );
// . send that request
// . returns false and sets g_errno on error, otherwise it will block
// and return true
if ( ! m_mcast.send ( m_request ,
m_requestSize ,
0x25 , // msgType 0x25
false , // m_mcast own m_request?
m_groupId , // send to group(groupKey)
false , // send to whole group?
m_probDocId , // probDocId (key)
this , // state data
NULL , // state data
gotReplyWrapper25 ,
3600*24*360 , // block forever for this!
MAX_NICENESS , // niceness
false , // real time?
-1 , // firstHostId
NULL , // m_replyBuf ,
0 , // MSG25_MAX_REPLY_SIZE ,
false , // free reply buf?
false , // do disk load balancing?
-1 , // max cache age
0 , // cacheKey
0 , // bogus rdbId
-1 , // minRecSizes
true , // sendToSelf
true , // retry forever
//&g_hostdb2 ))// send to 2ndary cluster
hostdb ))// send to 2ndary cluster
return true;
// we blocked, wait for callback to be called
return false;
}
void gotReplyWrapper25 ( void *state , void *state2 ) {
Msg25 *THIS = (Msg25 *)state;
THIS->gotMsg25Reply ( );
THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
}
bool Msg25::gotMsg25Reply ( ) {
// ENOTFOUND errors are very common
//if ( g_errno == ENOTFOUND ) g_errno = 0;
// error?
if ( g_errno ) {
log("build: Failed to get external link info for %s.",
m_url->getUrl());
return true;
}
// grab it
bool freeit;
long replySize;
long replyMaxSize;
char *reply = m_mcast.getBestReply(&replySize,&replyMaxSize,&freeit);
// relabel it if different
//if( reply != m_replyBuf )
relabel( reply, replyMaxSize, "Msg25-mcastGBR" );
// sanity check - find that mem leak
if ( m_linkInfo ) { char *xx=NULL;*xx=0; }
// . deserialize the reply here (copied from Msg20.cpp)
// . m_linkInfo will own it
m_linkInfo = (LinkInfo *)reply;
// sanity check
if ( m_linkInfo->getStoredSize() != replySize ) { char*xx=NULL;*xx=0;}
// fix our string ptrs
//m_linkInfo->updateStringPtrs();
// sanity check
//if ( reply == m_replyBuf ) { char *xx=NULL;*xx=0;}
return true;
}
class State25 {
public:
LinkInfo m_linkInfo;
Msg25 m_msg25;
UdpSlot *m_slot;
//char *m_statusPtr;
Url m_url;
//SiteRec m_siteRec;
};
// TODO: add in neighborhood flag, do not look at g_conf for anything!!
// likewise, cr->* is not right for cr->m_indexInlinkNeighborhoods and
// cr->doLinkSpamDetection in LinkText.cpp...
// AND, all the cr->m_* in Msg25.cpp should be flags!!!
void handleRequest25 ( UdpSlot *slot , long netnice ) {
char *p = slot->m_readBuf;
// deserialize url
char *url = p; p += gbstrlen(p) + 1;
// deserialize coll
char *coll = p; p += gbstrlen(p) + 1;
//long collLen = gbstrlen(coll);
long ip = *(long *)p; p += 4;
long siteNumInlinks = *(long *)p; p += 4;
long sitePop = *(long *)p; p += 4;
long lastUpdateTime = *(long *)p; p += 4;
// sanity check
if ( lastUpdateTime == 0 || lastUpdateTime == -1){char *xx=NULL;*xx=0;}
// sanity check
//if ( rootQuality < 0 || rootQuality > 100 ) { char *xx=NULL; *xx=0;}
// get flags
char doLinkSpamCheck = *p & 0x02;
char oneVotePerIpDom = *p & 0x08;
char canBeCancelled = *p & 0x40;
p++;
// make a new Msg25
State25 *st ;
try { st = new ( State25 ); }
catch ( ... ) {
g_errno = ENOMEM;
log("build: msg25: new(%i): %s",
sizeof(State25),mstrerror(g_errno));
g_udpServer.sendErrorReply ( slot , g_errno );
return;
}
mnew ( st , sizeof(State25) , "Msg25" );
// set url class
st->m_url.set ( url , gbstrlen(url) );
// save socket
st->m_slot = slot;
// call it
Msg25 *mm = &st->m_msg25;
if ( ! mm->getPageLinkInfo ( &st->m_url ,
ip ,
-1 , // docId
coll ,
NULL , // qbuf
0 , // qbufSize
st , // state
sendLinkInfoReplyWrapper , // callback
false , // isInjecting?
NULL , // pbuf
NULL , // xd
siteNumInlinks ,
sitePop ,
NULL , // oldLinkInfo
MAX_NICENESS ,
doLinkSpamCheck ,
oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ))
return;
// return the reply
sendLinkInfoReplyWrapper ( st );//, &st->m_linkInfo );
}
void sendLinkInfoReplyWrapper ( void *state ) { // , LinkInfo *infoArg ) {
State25 *st = (State25 *)state;
// get our state
UdpSlot *slot = st->m_slot;
// did it have an error?
if ( g_errno ) {
mdelete ( st , sizeof(st) , "Msg25" );
delete ( st );
g_udpServer.sendErrorReply ( slot , g_errno );
return;
}
Msg25 *m = &st->m_msg25;
// get the link info ptr
LinkInfo *info = m->m_linkInfo;
// sanity test
//if ( info != infoArg ) { char *xx=NULL; *xx=0; }
// grab it
char *reply = (char *)info;
// get the size
long need = 0;
if ( info ) need = info->getStoredSize();
// don't let Msg25 free it
m->m_linkInfo = NULL;
// free the state
mdelete ( st , sizeof(st) , "Msg25" );
delete ( st );
// send it away
g_udpServer.sendReply_ass ( reply , need , reply , need , slot );
}
*/
//////////
//
// LINKINFO
//
//////////
#include "HashTableX.h"
#include "Words.h"
#include "Titledb.h"
#include "Msge0.h"
#include "IndexList.h"
#include "XmlDoc.h" // score8to32()
#define MAX_LINKERS 3000
#define MAX_INTERNAL_INLINKS 10
// . after Msg25 calls LinkInfo::set() (was merge()) make it call
// Msg25::print(SafeBuf *pbuf) to print the stats. it can access
// LinkInfo's Inlinks to get their weights, etc.
// . returns the LinkInfo on success
// . returns NULL and sets g_errno on error
LinkInfo *makeLinkInfo ( char *coll ,
long ip ,
long siteNumInlinks ,
//long sitePop ,
Msg20Reply **replies ,
long numReplies ,
//long extrapolated ,
//long xfactor ,
// if link spam give this weight
long spamWeight ,
bool oneVotePerIpTop ,
long long linkeeDocId ,
long lastUpdateTime ,
bool onlyNeedGoodInlinks ,
long niceness ,
Msg25 *msg25 ,
SafeBuf *linkInfoBuf ) {
// for parsing the link text
Words words;
// a table for counting words per link text
HashTableX tt;
// buf for tt
char ttbuf[2048];
// must init it!
tt.set ( 8 ,4,128,ttbuf,2048,false,niceness,"linknfo");
// how many internal linkers do we have?
long icount = 0;
// we are currently only sampling from 10
for ( long i = 0 ; i < numReplies ; i++ ) {
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! replies[i] ) continue;
//if ( texts[i]->m_errno ) continue;
bool internal = false;
if ( (replies[i]->m_ip&0x0000ffff) == (ip&0x0000ffff) )
internal = true;
if ( (replies[i]->m_firstIp&0x0000ffff) == (ip&0x0000ffff) )
internal = true;
if ( internal )
icount++;
}
/*
// limit
if ( icount > MAX_INTERNAL_INLINKS ) icount = MAX_INTERNAL_INLINKS;
// count external now too
// *ecount = numReplies - icount;
// for counting internal links again
long icount2 = 0;
// . only allow 1 vote per ip domain OR mid domain
// . assign weights of -1 to links to ignore (from same ip top domain)
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// . for setting "outlinks", skip those not in the index
// . this will cause them not to be stored
if ( r->m_docId == 0LL ) {
r->m_linkTextScoreWeight = 0;
continue;
}
// are we internal?
bool internal = ((r->m_ip&0x0000ffff) == (ip & 0x0000ffff));
// . if he's probably a guestbook, message board page or other
// cgi/dynamic page, don't let him vote because he can be
// easily subverted by a clever link spammer
	// . this now includes any url with the string "link" in it, too
if ( r->m_isLinkSpam && ! internal ) {
r->m_linkTextScoreWeight = spamWeight;
continue;
}
// if we are external
if ( ! internal ) {
r->m_linkTextScoreWeight = 100;
continue;
}
// . allow first 100 internal links, if any, but together
// they cannot count more than one external link can
// . see Links.cpp::hash() for values of "scores"
// . TODO: if he's an rss or atom doc then let him thru
// . only allow first 10 internal linkers to vote
if ( ++icount2 > MAX_INTERNAL_INLINKS ) {
r->m_linkTextScoreWeight = -1; // should this be 0?
continue;
}
long kc = r->m_pageNumInlinks + r->m_siteNumInlinks;
long long b2 ;
if ( kc >= 5000 ) b2 = 100;
else if ( kc >= 2500 ) b2 = 95;
else if ( kc >= 1000 ) b2 = 80;
else if ( kc >= 500 ) b2 = 70;
else if ( kc >= 100 ) b2 = 60;
else if ( kc >= 50 ) b2 = 50;
else if ( kc >= 10 ) b2 = 40;
else b2 = 10;
r->m_linkTextScoreWeight = b2 / icount;
}
*/
// sum up all weights into "total"
//long total = 0;
/*
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// skip links from the same 2 byte ip as another link
//if ( r->m_linkTextScoreWeight <= 0 ) continue;
// ignore if spam
if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
// add up weights
//total += r->m_linkTextScoreWeight;
char *txt = r->ptr_linkText;
long txtLen = r->size_linkText;
if ( txtLen > 0 ) txtLen--;
// this can be empty if we just had an item
if ( ! txt || txtLen <= 0 ) continue;
// otherwise, hash each word with weight of 1
//words.set ( false , txt , txtLen );
words.set ( txt, txtLen,TITLEREC_CURRENT_VERSION,true, true );
// get # of words
long nw = words.getNumWords();
// loop over each on in this link text
for ( long k = 0 ; k < nw; k++ ) {
// don't count punct
if ( words.isPunct(k) ) continue;
// get the word Id of the ith word
long long wid = words.getWordId(k);
// does it match a word in this same link text?
long j;
for ( j = 0 ; j < k ; j++ ) {
// don't consider the wordId of punct "words"
// because it is a bogus value that may
// ultimately cause us to segfault below
if ( words.isPunct (j) ) continue ;
if ( words.getWordId(j) == wid ) break;
}
// if it does then skip it
if ( j < k ) continue;
// otherwise, hash it so we can count word occurences
if ( ! tt.addTerm ( &wid , 1 ) ) {
log("build: Failed to add word to table.");
return NULL;
}
}
}
// always return >= 0, -1 means error
//if ( total < 0 ) total = 0;
// at least .3% of the good linkers need to have the word so when
// if numReplies is 1000, minCount is 3, if it is 100, this is .3
unsigned long minCount = (3 * ((long)numReplies)) / 1000;
// and at least 3 docs need to have the word...
if ( minCount < 3 ) minCount = 3;
// and if 7 have it you are always golden
if ( minCount > 7 ) minCount = 7;
long nn = numReplies;
// skip this part if table empty
if ( tt.getNumSlotsUsed() <= 0 ) nn = 0;
// set the m_isAnomaly bits
for ( long i = 0 ; i < nn ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// skip weights 0 or less
if ( r->m_linkTextScoreWeight <= 0 ) continue;
if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
// point to link text itself
char *txt = r->ptr_linkText;
long txtLen = r->size_linkText;
if ( txtLen > 0 ) txtLen--;
// this can be empty if we just had an item
if ( ! txt || txtLen <= 0 ) continue;
// reset it now to prevent core
words.reset();
// set the words class again from this link text
words.set ( txt , txtLen,
TITLEREC_CURRENT_VERSION,
true, true );
// get # of words in this link text
long nw = words.getNumWords();
// loop over each word
long k ;
for ( k = 0 ; k < nw; k++ ) {
// don't count punct
if ( words.isPunct(k) ) continue;
// do not count stop words (uncapitalized)
if ( words.isStopWord(k) ) continue;
// get the word Id of the ith word
long long wid = words.getWordId(k);
// filter out this LinkText if has an anomalous word
if ( tt.getScore ( &wid ) < minCount )
break;
}
// continue if not anomalous
if ( k == nw ) continue;
// remove this weight from the total
total -= r->m_linkTextScoreWeight;
// set the anomaly bit
r->m_isAnomaly = true;
}
// always return >= 0, -1 means error
if ( total < 0 ) total = 0;
*/
	// count good (external, non-spam) inlinks; we can estimate our
	// quality from this count
long numGoodInlinks = 0;
// set m_linkTextNumWords
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// get the weight
//long w = r->m_linkTextScoreWeight;
// skip weights 0 or less
//if ( w <= 0 ) continue;
// get the link text itself
char *txt = r->ptr_linkText;
long txtLen = r->size_linkText;
// discount terminating \0
if ( txtLen > 0 ) txtLen--;
// get approx # of words in link text
long nw = 0;
if ( txtLen > 0 )
nw = getNumWords(txt,txtLen,TITLEREC_CURRENT_VERSION);
// store it
r->m_linkTextNumWords = nw;
// linkspam?
if ( r->ptr_note ) {
r->m_isLinkSpam = true;
continue;
}
bool internal = false;
if ((r->m_ip&0x0000ffff) == (ip & 0x0000ffff))
internal = true;
if ((r->m_firstIp&0x0000ffff) == (ip & 0x0000ffff))
internal = true;
// if its internal do not count towards good, but do
// indeed store it!
if ( internal ) continue;
// otherwise count as good
numGoodInlinks++;
}
long count = 0;
// . now just store the Inlinks whose weights are > 0
// . how much space do we need?
long need = 0;
// how much space do we need?
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// ignore if spam
//if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
if ( r->m_isLinkSpam ) {
// linkdb debug
if ( g_conf.m_logDebugLinkInfo )
log("linkdb: inlink #%li is link spam: %s",
i,r->ptr_note);
if ( onlyNeedGoodInlinks )
continue;
}
// do a quick set
Inlink k; k.set ( r );
// get space
need += k.getStoredSize ( );
// count it
count++;
}
// we need space for our header
need += sizeof(LinkInfo);
// alloc the buffer
//char *buf = (char *)mmalloc ( need,"LinkInfo");
//if ( ! buf ) return NULL;
if ( ! linkInfoBuf->reserve ( need , "LinkInfo" ) ) return NULL;
// set ourselves to this new buffer
LinkInfo *info = (LinkInfo *)(linkInfoBuf->getBufStart());
// set our header
info->m_version = 0;
info->m_size = need;
info->m_lastUpdated = lastUpdateTime;//getTimeGlobal();
// how many Inlinks we stored in info->m_buf[]
info->m_numStoredInlinks = count;
// the gross total of inlinks we got, both internal and external
info->m_totalInlinkingDocIds = msg25->m_numDocIds;
// . only valid if titlerec version >= 119
// . how many unique c blocks link to us?
// . includes your own internal c block
info->m_numUniqueCBlocks = msg25->m_cblocks;
// . only valid if titlerec version >= 119
// . how many unique ips link to us?
// . this count includes internal IPs as well
info->m_numUniqueIps = msg25->m_uniqueIps;
// keep things consistent for the "test" coll
info->m_reserved1 = 0;
info->m_reserved2 = 0;
// how many total GOOD inlinks we got. does not include internal cblock
info->m_numGoodInlinks = numGoodInlinks;
//info->m_siteRootQuality = siteRootQuality; // bye-bye
// these two guys will be 0 if we called Msg25::getSiteInfo()
//info->m_sitePop = sitePop;
//info->m_siteNumInlinks = siteNumInlinks;
// if we do not read all the inlinkers on disk because there are
	// more than MAX_LINKERS_IN_TERMLIST inlinkers, then we must
// extrapolate the total # of inlinkers we have. Msg25 does this
// and passes it in to us.
//info->m_numInlinksExtrapolated = extrapolated;
// point to our buf
char *p = info->m_buf;
char *pend = linkInfoBuf->getBufStart() + need;
// count the ones we store that are internal
long icount3 = 0;
// now set each inlink
for ( long i = 0 ; i < numReplies ; i++ ) {
// get the reply
Msg20Reply *r = replies[i];
// replies are NULL if MsgE had an error, like ENOTFOUND
if ( ! r ) continue;
// skip weights 0 or less
//if ( r->m_linkTextScoreWeight <= 0 ) continue;
// ignore if spam
//if ( onlyNeedGoodInlinks && r->m_isLinkSpam ) continue;
if ( r->m_isLinkSpam && onlyNeedGoodInlinks ) continue;
// are we internal?
bool internal = false;
if ( (r->m_ip&0x0000ffff) == (ip & 0x0000ffff) )
internal = true;
if ( (r->m_firstIp&0x0000ffff) == (ip & 0x0000ffff) )
internal = true;
if ( internal ) icount3++;
// set the Inlink
Inlink k;
// store it. our ptrs will reference into the Msg20Reply buf
k.set ( r );
// . this will copy itself into "p"
// . "true" --> makePtrsRefNewBuf
long wrote;
char *s = k.serialize ( &wrote , p , pend - p , true );
// sanity check
if ( s != p ) { char *xx=NULL;*xx=0; }
// sanity check
if ( k.getStoredSize() != wrote ) { char *xx=NULL;*xx=0;}
// note it if recycled
if ( k.m_recycled )
logf(LOG_DEBUG,"build: recycling Inlink %s for linkee "
"%lli", k.ptr_urlBuf,linkeeDocId);
// advance
p += wrote;
}
// . sanity check, should have used up all the buf exactly
// . so we can free the buf with k->getStoredSize() being the allocSize
if ( p != pend ) { char *xx=NULL;*xx=0; }
// how many guys that we stored were internal?
info->m_numInlinksInternal = (char)icount3;
// sanity parse it
//long ss = 0;
//for ( Inlink *k =NULL; (k=info->getNextInlink(k)) ; )
// ss += k->getStoredSize();
//if ( info->m_buf + ss != pend ) { char *xx=NULL;*xx=0;}
// success
return info;
}
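// . scratch Inlink used by getNextInlink() below to upgrade Inlinks that
//   were serialized by an older version to the current layout
// . s_orig remembers the original (old-format) Inlink so the next call
//   can resume the walk from the right place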
static Inlink s_inlink;
static Inlink *s_orig;
// if we are an old version, we have to set s_inlink and return
// a ptr to that
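// . typical iteration over a LinkInfo's inlinks:
//       for ( Inlink *k = NULL ; (k = info->getNextInlink(k)) ; ) { ... }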
Inlink *LinkInfo::getNextInlink ( Inlink *k ) {
// switch back
if ( k == &s_inlink ) k = s_orig;
// get it as the latest versioned inlink
Inlink *p = getNextInlink2 ( k );
// if none, we are done
if ( ! p ) return p;
// sanity checks
if (p->m_numStrings==0 && p->m_firstStrPtrOffset){char *xx=NULL;*xx=0;}
if (p->m_numStrings && p->m_firstStrPtrOffset==0){char *xx=NULL;*xx=0;}
// fix this for the really old guy. we did not store these two
// things initially, but they should have been set to this...
// luckily, we had a "reserved1" long...
if ( p->m_numStrings == 0 ) {
		// urlBuf,linkText,surroundingText,rssItem
p->m_numStrings = 4;
p->m_firstStrPtrOffset = 64;
}
// if latest, return that
if ( p->m_numStrings == p->getBaseNumStrings() &&
p->m_firstStrPtrOffset == (char *)&p->ptr_urlBuf - (char *)p ) {
p->updateStringPtrs(NULL);
return p;
}
// otherwise, set s_inlink to it
s_inlink.set2 ( (Inlink *)p );
// preserve p though for next call
s_orig = (Inlink *)p;
// and return that
return &s_inlink;
}
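// . raw walk over the Inlinks serialized in m_buf[], with no version
//   fix-ups; getNextInlink() above wraps this and handles old formats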
Inlink *LinkInfo::getNextInlink2 ( Inlink *k ) {
if ( this == NULL ) return NULL;
// if none, return NULL
if ( m_numStoredInlinks == 0 ) return NULL;
// if k is NULL, return the first
if ( ! k ) {
// set it to the first one
k = (Inlink *)m_buf;
// done
return k;
}
// point to next
long size = k->getStoredSize();
// get the inlink to return
Inlink *next = (Inlink *)((char *)k + size);
// return NULL if breached
if ( (char *)next >= ((char *)this)+m_size ) return NULL;
// otherwise, we are still good
return next;
}
// . returns false and sets g_errno on error
// . returns true if no error was encountered
// . call xml->isEmpty() to see if you got anything
bool LinkInfo::getItemXml ( Xml *xml , long niceness ) {
// reset it
xml->reset();
// loop through the Inlinks
Inlink *k = NULL;
for ( ; (k = getNextInlink(k)) ; ) {
// does it have an xml item? skip if not.
if ( k->size_rssItem <= 1 ) continue;
// got it
break;
}
// return if nada
if ( ! k ) return true;
// set the xml
return k->setXmlFromRSS ( xml , niceness );
}
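// . parse our stored rss item (ptr_rssItem) into "xml"
// . returns false and sets g_errno on error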
bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
// compute the length (excludes the \0's)
long len = size_rssItem - 1;
// return false and set g_errno if this fails
return xml->set ( ptr_rssItem ,
len ,
false , // own data?
0 , // allocSize
true , // pure xml?
TITLEREC_CURRENT_VERSION ,
false , // no need to now
niceness );
}
// only Title.cpp uses this right now
/*
bool Inlink::setXmlFromLinkText ( Xml *xml ) {
// compute the length (excludes the \0's)
long len = size_linkText - 1;
// bitch
if ( ptr_linkText[len] )
log("linknfo: bad link text, no NULL termination. truncing.");
// if not null terminated make it so!
ptr_linkText[len] = '\0';
// for some reason the link text is not DOUBLE NULL TERMINATED
*/
/*
// . copy into buf to ensure NULL termination
// . older versions were not null terminated or doubly null terminated
// as Xml::set() requires for uni
char buf[1000];
// sanity check
if ( len > 900 ) { char *xx=NULL;*xx=0; }
// copy
memcpy ( buf , ptr_linkText , size_linkText );
// ensure null termination
buf [ size_linkText ] = '\0';
buf [ size_linkText + 1 ] = '\0';
// return false and set g_errno if this fails
return xml->set ( csUTF8 ,
ptr_linkText ,
len ,
false , // own data?
0 , // allocSize
true , // pure xml?
TITLEREC_CURRENT_VERSION ,
false ); // no need to now
}
*/
// . update them for each Inlink
// . same as calling deserialize()
//void LinkInfo::updateStringPtrs ( ) {
// // loop through the Inlinks and update them
// for ( Inlink *k = NULL; (k = getNextInlink(k)) ; )
// k->updateStringPtrs();
//}
bool LinkInfo::hasLinkText ( ) {
// loop through the Inlinks
for ( Inlink *k = NULL; (k = getNextInlink(k)) ; )
if ( k->size_linkText > 1 ) return true;
return false;
}
/*
bool LinkInfo::hash ( TermTable *table ,
long externalLinkTextWeight ,
long internalLinkTextWeight ,
//TitleRec *tr ,
long ip ,
long version ,
long siteNumInlinks ,
TermTable *countTable ,
char *note ,
long niceness ) {
long noteLen = 0;
if ( note ) noteLen = gbstrlen ( note );
// count "external" inlinkers
long ecount = 0;
// loop through the link texts and hash them
for ( Inlink *k = NULL; (k = getNextInlink(k)) ; ) {
// is this inlinker internal?
bool internal=((ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// count external inlinks we have for indexing gbmininlinks:
if ( ! internal ) ecount++;
// get score
long long baseScore = k->m_baseScore;
// get the weight
long long ww ;
if ( internal ) ww = internalLinkTextWeight;
else ww = externalLinkTextWeight;
// modify the baseScore
long long final = (baseScore * ww) / 100LL;
// get length of link text
long tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// get the text
char *txt = k->ptr_linkText;
// if it is anomalous, set this, we don't
//if ( k->m_isAnomalous )
// table->m_hashIffNotUnique = true;
// . hash the link text into the table
// . returns false and sets g_errno on error
// . do NOT hash singletons if they are in a phrase already!
// this way "yahoo groups" won't come up for a "yahoo" query
// and "new york times" won't come up for a "york" query
// . actually, this may backfire on us, so i reverted back!
// . we still have the score punish from # of words though!
if ( ! table->hash ( version ,
note ,
noteLen ,
NULL ,
0 ,
txt ,
tlen ,
final , // modified score
0x7fffffff , // maxScore
true , // doSpamDetection?
true , // hashSingleWords? ok.
true , // hashPhrases?
false , // hashAsWhole?
false , // useStems?
true , // useStopWords?
false , // hashIffUnique?
false , // hashWordIffNotInPhrase
// FIXME: need normaliztion Info (partap)
4 , // mp
NULL , // wwptr ,
NULL , // pptr ,
NULL , // bptr ,
NULL , // wgptr ,
true , // isLinkText?
countTable ,
NULL , // scoresPtr
20 , // numRepeatWords
siteNumInlinks,
niceness ))
return false;
*/
// turn this back off in case enabled
//table->m_hashIffNotUnique = false;
/*
if( !hasLinkText ) continue;
if ( ! table->hash ( titleRecVersion ,
"hash incoming link text for this field" ,
0 , // prefixLen1
field , // "linktextincoming:"
gbstrlen(field), // length of field
txt ,
txtLen ,
docQuality ,
TERMTABLE_MAXSCORE, // maxScore
true , // doSpamDetection?
true , // hashSingleWords? ok.
true , // hashPhrases?
false , // hashAsWhole?
false , // useStems?
true , // useStopWords?
false , // hashIffUnique?
false , // hashWordIffNotInPhrase
return false;
*/
/*
}
// . hash gbkeyword:numinlinks where score is # of inlinks from 1-255
// . do not hash gbkeyword:numinlinks if we don't got any
if ( ecount <= 0 ) return true;
// limit it since our score can't be more than 255 (8-bits)
if ( ecount > 255 ) ecount = 255;
// IndexList::set() converts our 32 bit score to 8-bits so we trick it!
long score = score8to32 ( (uint8_t)ecount );
// watch out for wrap
if ( score < 0 ) score = 0x7fffffff;
if ( ! table->hash ( version ,
"hash numinlinks" ,
15 ,
"gbkeyword" ,
9 ,
"numinlinks" ,
10 ,
score ,
TERMTABLE_MAXSCORE, // maxScore
false , // doSpamDetection?
true , // hashSingleWords? ok.
false , // hashPhrases?
false , // hashAsWhole?
false , // useStems?
false , // useStopWords?
false , // hashIffUnique?
false )) // hashWordIffNotInPhrase
return false;
return true;
}
*/
/*
long long getBoostFromLinkeeQuality ( char docQuality ) {
// hard code this
float fboost = 1.0;
float myQuality = (float)docQuality;
// do it different over 50, that is very nice quality...
while ( myQuality >= 50.0 ) {
myQuality--;
fboost *= 1.10;
}
// . every 14 pts over 30 doubles our boost
// . 44 --> x2
// . 58 --> x4
// . 72 --> x8
// . 86 --> x16
// . 100 --> x32
while ( myQuality >= 30.0 ) {
//myQuality *= 0.9;
myQuality--;
fboost *= 1.05;
}
// assign
return (long long )(fboost * 100.0);
}
*/
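// . set this Inlink from a Msg20Reply
// . the string ptrs below reference the reply's buffers, so the reply
//   must stay alive until we are serialize()'d into our own buffer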
void Inlink::set ( Msg20Reply *r ) {
// . these two things are used for version-based deserializing
	// . our current version stores getBaseNumStrings() strings
m_numStrings = getBaseNumStrings();
// and our current string offset
m_firstStrPtrOffset = (char *)getFirstStrPtr() - (char *)this;
// set ourselves now
m_ip = r->m_ip;
m_firstIp = r->m_firstIp;
m_wordPosStart = r->m_wordPosStart;
m_docId = r->m_docId;
m_firstSpidered = r->m_firstSpidered;
m_lastSpidered = r->m_lastSpidered;
//m_nextSpiderDate = r->m_nextSpiderTime;
m_datedbDate = r->m_datedbDate;
m_firstIndexedDate = r->m_firstIndexedDate;
m_numOutlinks = r->m_numOutlinks;
//m_baseScore = r->m_linkTextBaseScore;
//m_pagePop = r->m_pagePop;
//m_sitePop = r->m_sitePop;
//m_siteNumInlinks = r->m_siteNumInlinks;
//m_reserved1 = 0;
m_isPermalink = r->m_isPermalink;
m_outlinkInContent = r->m_outlinkInContent;
m_outlinkInComment = r->m_outlinkInComment;
m_isLinkSpam = r->m_isLinkSpam;
//m_isAnomaly = r->m_isAnomaly;
m_hasAllQueryTerms = r->m_hasAllQueryTerms;
m_recycled = r->m_recycled;
// usually the datedb date is a publication date, but it can also
// be a "modified" date. when the document was last modified. That
// is indicated by the last bit of the datedb date. it is clear if it
// is a Modified date, and it is set if it is a Publish date.
//m_datedbModified = r->m_datedbModified;
m_country = r->m_country;
m_language = r->m_language;
//m_docQuality = r->m_docQuality;
m_siteRank = r->m_siteRank;
//m_ruleset = r->m_ruleset;
m_hopcount = r->m_hopcount;
//m_linkTextScoreWeight = r->m_linkTextScoreWeight;
ptr_urlBuf = r->ptr_ubuf;
ptr_linkText = r->ptr_linkText;
ptr_surroundingText = r->ptr_surroundingText;
ptr_rssItem = r->ptr_rssItem;
ptr_categories = r->ptr_categories;
ptr_gigabitQuery = r->ptr_gigabitQuery;
ptr_templateVector = r->ptr_templateVector;
size_urlBuf = r->size_ubuf;
size_linkText = r->size_linkText;
size_surroundingText = r->size_surroundingText;
size_rssItem = r->size_rssItem;
size_categories = r->size_categories;
size_gigabitQuery = r->size_gigabitQuery;
size_templateVector = r->size_templateVector;
}
// Msg25 calls this to make a "fake" msg20 reply for recycling Inlinks
// that are no longer there... preserves rssInfo, etc.
void Inlink::setMsg20Reply ( Msg20Reply *r ) {
r->m_ip = m_ip;
r->m_firstIp = m_firstIp;
r->m_wordPosStart = m_wordPosStart;
r->m_docId = m_docId;
r->m_firstSpidered = m_firstSpidered;
r->m_lastSpidered = m_lastSpidered;
//r->m_nextSpiderTime = m_nextSpiderDate;
r->m_datedbDate = m_datedbDate;
r->m_firstIndexedDate = m_firstIndexedDate;
r->m_numOutlinks = m_numOutlinks;
//r->m_linkTextBaseScore = m_baseScore;
//r->m_pagePop = m_pagePop;
//r->m_sitePop = m_sitePop;
//r->m_siteNumInlinks = m_siteNumInlinks;
r->m_isPermalink = m_isPermalink;
r->m_outlinkInContent = m_outlinkInContent;
r->m_outlinkInComment = m_outlinkInComment;
r->m_isLinkSpam = m_isLinkSpam;
//r->m_isAnomaly = m_isAnomaly;
r->m_hasAllQueryTerms = m_hasAllQueryTerms;
r->m_country = m_country;
r->m_language = m_language;
//r->m_docQuality = m_docQuality;
r->m_siteRank = m_siteRank;
//r->m_ruleset = m_ruleset;
r->m_hopcount = m_hopcount;
//r->m_linkTextScoreWeight = m_linkTextScoreWeight;
r->ptr_ubuf = ptr_urlBuf;
r->ptr_linkText = ptr_linkText;
r->ptr_surroundingText = ptr_surroundingText;
r->ptr_rssItem = ptr_rssItem;
r->ptr_categories = ptr_categories;
r->ptr_gigabitQuery = ptr_gigabitQuery;
r->ptr_templateVector = ptr_templateVector;
r->size_ubuf = size_urlBuf;
r->size_linkText = size_linkText;
r->size_surroundingText = size_surroundingText;
r->size_rssItem = size_rssItem;
r->size_categories = size_categories;
r->size_gigabitQuery = size_gigabitQuery;
r->size_templateVector = size_templateVector;
}
// . set our string ptrs to point into the string buffer, walking the
//   stored sizes (uses our own trailing buffer if "buf" is NULL)
long Inlink::updateStringPtrs ( char *buf ) {
// point to our string buffer
char *p = buf;
// use our buf if none supplied
if ( ! p ) p = getStringBuf(); // m_buf;
// then store the strings!
long *sizePtr = getFirstSizeParm(); // &size_qbuf;
long *sizeEnd = getLastSizeParm (); // &size_displayMetas;
char **strPtr = getFirstStrPtr (); // &ptr_qbuf;
for ( ; sizePtr <= sizeEnd ; ) {
// convert the offset to a ptr
*strPtr = p;
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
sizePtr++;
strPtr++;
}
// return how many bytes we processed
return getBaseSize() + (p - getStringBuf());
}
void Inlink::reset ( ) {
// clear ourselves out
memset ( (char *)this,0,sizeof(Inlink) );
}
// . set a new Inlink from an older versioned Inlink
// . this is how we handle versioning
void Inlink::set2 ( Inlink *old ) {
	// clear ourselves
reset();
// copy what is legit to us
memcpy ( (char *)this , (char *)old , old->m_firstStrPtrOffset );
// set our offset to the string ptrs
m_firstStrPtrOffset = (char *)&ptr_urlBuf - (char *)this;
// and our base
m_numStrings = getBaseNumStrings();
// now copy over string ptrs
char *dst = (char *)this + m_firstStrPtrOffset;
char *src = (char *)old + old->m_firstStrPtrOffset;
memcpy ( dst , src , old->m_numStrings * 4 );
// and the sizes
dst += 4 * m_numStrings ;
src += 4 * old->m_numStrings ;
memcpy ( dst , src , old->m_numStrings * 4 );
// sanity tests. make sure they match up
//if ( old->ptr_urlBuf != ptr_urlBuf ) { char *xx=NULL;*xx=0; }
//if ( old->ptr_rssItem != ptr_rssItem ) { char *xx=NULL;*xx=0; }
// point to the old buf now, OldInlink::m_buf[]
src += 4 * old->m_numStrings ;
// update our string ptrs to reference into "old's" m_buf[]
updateStringPtrs ( src );
// log it
//logf(LOG_DEBUG,"build: setting new Inlink from old.");
// we can't do this sanity check because we cast "old" as an Inlink
// whereas before it was an older version of "Inlink"
//if ( old->size_urlBuf != size_urlBuf ) { char *xx=NULL;*xx=0; }
}
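// . a serialized Inlink is laid out as: the fixed fields (the first
//   m_firstStrPtrOffset bytes), then m_numStrings string ptrs (4 bytes
//   each), then m_numStrings string sizes (4 bytes each), then the
//   string data itself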
long Inlink::getStoredSize ( ) {
//long size = (long)sizeof(Msg);
//long size = getBaseSize();
long size = m_firstStrPtrOffset;
// add in string AND size ptrs
size += 8 * m_numStrings;
// add up string buffer sizes
//long *sizePtr = getFirstSizeParm(); // &size_qbuf;
//long *sizeEnd = getLastSizeParm (); // &size_displayMetas;
long *sizePtr =
(long *)((char *)this + m_firstStrPtrOffset + 4*m_numStrings);
long *sizeEnd = sizePtr + m_numStrings;
for ( ; sizePtr < sizeEnd ; sizePtr++ )
size += *sizePtr;
return size;
}
// . return ptr to the buffer we serialize into
// . return NULL and set g_errno on error
char *Inlink::serialize ( long *retSize ,
char *userBuf ,
long userBufSize ,
bool makePtrsRefNewBuf ) {
// make a buffer to serialize into
char *buf = NULL;
long need = getStoredSize();
// big enough?
if ( need <= userBufSize ) buf = userBuf;
// alloc if we should
if ( ! buf ) buf = (char *)mmalloc ( need , "Ra" );
// bail on error, g_errno should be set
if ( ! buf ) return NULL;
// set how many bytes we will serialize into
*retSize = need;
// copy the easy stuff
char *p = buf;
memcpy ( p , (char *)this , getBaseSize() );
p += getBaseSize();
// then store the strings!
long *sizePtr = getFirstSizeParm(); // &size_qbuf;
long *sizeEnd = getLastSizeParm (); // &size_displayMetas;
char **strPtr = getFirstStrPtr (); // &ptr_qbuf;
for ( ; sizePtr <= sizeEnd ; ) {
// if we are NULL, we are a "bookmark", so
// we alloc'd space for it, but don't copy into
		// the space until after this call to serialize()
if ( ! *strPtr ) goto skip;
// sanity check -- cannot copy onto ourselves
if ( p > *strPtr && p < *strPtr + *sizePtr ) {
char *xx = NULL; *xx = 0; }
// copy the string into the buffer
memcpy ( p , *strPtr , *sizePtr );
skip:
// . make it point into the buffer now
// . MDW: why? that is causing problems for the re-call in
// Msg3a, it calls this twice with the same "m_r"
if ( makePtrsRefNewBuf ) *strPtr = p;
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
sizePtr++;
strPtr++;
}
return buf;
}
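// . sketch of how makeLinkInfo() above serializes each Inlink into a
//   caller-owned buffer (names as used there):
//
//       long wrote;
//       char *s = k.serialize ( &wrote , p , pend - p , true );
//       if ( s != p ) { ... }  // serialize() had to malloc; buf too small
//       p += wrote;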
// used by PageTitledb.cpp
bool LinkInfo::print ( SafeBuf *sb , char *coll ) {
//char buf [1024];
//char buf2[1024];
//char buf3[MAX_RSSITEM_SIZE]; // 30000?
//char buf4[1024];
long count = 1;
// loop through the link texts
for ( Inlink *k = NULL; (k = getNextInlink(k)) ; count++ ) {
char *s = k->ptr_linkText;
long slen = k->size_linkText - 1;
char *d = k->ptr_surroundingText;
long dlen = k->size_surroundingText - 1;
char *r = k->ptr_rssItem;
long rlen = k->size_rssItem - 1;
char *g = k->ptr_gigabitQuery;
long glen = k->size_gigabitQuery - 1;
char *c = k->ptr_categories;
long clen = k->size_categories - 1;
if ( slen < 0 ) slen = 0;
if ( dlen < 0 ) dlen = 0;
if ( rlen < 0 ) rlen = 0;
if ( glen < 0 ) glen = 0;
if ( clen < 0 ) clen = 0;
if ( ! c || clen <= 0 ) c = "";
// encode the rss item further
char buf3b[MAX_RSSITEM_SIZE*2];
buf3b[0] = 0;
if ( rlen > 0 )
htmlEncode ( buf3b ,
buf3b + MAX_RSSITEM_SIZE*2 ,
r , // buf3 ,
r + rlen , // buf3 + gbstrlen(buf3) ,
true );
// . print link text and score into table
// . there is a ton more info in Inlink class to print if
// you want to throw it in here...
sb->safePrintf(
"<tr><td colspan=2>link #%04li "
"("
//"baseScore=%010li, "
"d=<a href=\"/master/titledb?c=%s&"
"d=%lli\">%016lli</a>, "
"siterank=%li, "
"hopcount=%03li "
"outlinks=%05li, "
"ip=%s "
//"pagepop=%li "
//"sitepop=%li "
"numLinksToSite=%li "
//"anomaly=%li "
"<b>url</b>=\"%s\" "
"<b>txt=</b>\""
"%s\" "
"<b>neigh=</b>\"%s\" "
"<b>rssItem</b>=\"%s\" "
"<b>gigabits</b>=\"%s\" "
"<b>categories</b>=\"%s\" "
//"<b>templateVec=\"</b>%s\" "
"</td></tr>\n",
count ,
//(long)k->m_baseScore ,
coll ,
k->m_docId,
k->m_docId,
//(long)k->m_docQuality,
(long)k->m_siteRank,
(long)k->m_hopcount,
(long)k->m_numOutlinks ,
iptoa(k->m_ip),
//(long)k->m_pagePop,
//(long)k->m_sitePop,
(long)k->m_siteNumInlinks,
//(long)k->m_isAnomaly,
k->ptr_urlBuf, // the linker url
s, // buf,
d, // buf2,
buf3b,
g, // buf4,
c
);
}
return true;
}
//long getNumLinksToSite ( long q ) {
// if ( q <= 20 ) return 0; return (1 << ((q - 20)/5)); };
//long getSitePop ( long q ) { return getNumLinksToSite ( q ) * 5; };
// . total pop of all the inlinkers to the page
// . assume 20 times less than site pop
//long getPagePop ( long q ) { return getSitePop ( q ) / 20; }
/*
long LinkInfo::computePagePop ( Url *u , char *coll ) {
// get our site hash. use domain for now
long dh = u->getDomainHash32 ( );
// store pagepop into "sum"
long sum = 0;
for ( Inlink *k=NULL;(k=getNextInlink(k));) {
// get the url
Url u2; u2.set ( k->ptr_urlBuf , k->size_urlBuf - 1);
// get the site hash
long dh2 = u2.getDomainHash32 ( );
// skip if it is from our same site
if ( dh2 == dh ) continue;
// one point for having him
sum++;
// and we inherit his points as well
sum += k->m_pagePop;
}
return sum;
}
*/
bool LinkInfo::hasRSSItem() {
for ( Inlink *k=NULL;(k=getNextInlink(k));)
// rss item?
if ( k->size_rssItem > 10 ) return true;
return false;
}
///////////////
//
// LINKS CLASS
//
///////////////
// . use siteRec to set m_extractRedirects from <extractRedirectsFromLinks>
// . this is used because yahoo has links like:
// yahoo.com/drst/pop/2/10/576995/37640326/*http://www.m.com/
// . we need yahoo's link: terms to be more precise for our ban/unban algorithm
static long getLinkBufferSize(long numLinks);
Links::Links(){
m_allocBuf = NULL;
m_allocSize = 0;
m_linkBuf = NULL;
m_allocLinks = 0;
m_spamNote = NULL;
m_numLinks = 0;
m_baseUrl = NULL;
m_numOutlinksAdded = 0;
}
Links::~Links() {
reset();
}
void Links::reset() {
if (m_allocBuf) mfree(m_allocBuf, m_allocSize, "Links");
m_allocBuf = NULL;
m_allocSize = 0;
if (m_linkBuf) mfree(m_linkBuf, getLinkBufferSize(m_allocLinks),
"Links");
m_linkBuf = NULL;
m_allocLinks = 0;
m_spamNote = NULL;
m_numLinks = 0;
m_flagged = false;
m_hasRSS = false;
m_isFeedBurner = false;
m_hasSelfPermalink = false;
m_hasRSSOutlink = false;
m_hasSubdirOutlink = false;
m_rssOutlinkPtr = NULL;
}
bool Links::set ( bool useRelNoFollow ,
Xml *xml , Url *parentUrl , bool setLinkHash ,
//bool useBaseHref ,
// use null for this if you do not want to use it
Url *baseUrl ,
long version ,
long niceness ,
//bool addSiteRootFlags ,
//char *coll ,
bool parentIsPermalink ,
Links *oldLinks ,
bool doQuickSet ) {
reset();
// always force this to true now since we need them for linkdb
setLinkHash = true;
if ( doQuickSet ) setLinkHash = false;
m_xml = xml;
m_baseUrl = parentUrl;
m_parentUrl = parentUrl;
m_doQuickSet = doQuickSet;
m_parentIsPermalink = parentIsPermalink;
m_baseSite = NULL;
m_baseSiteLen = 0;
// default to domain, not hostname
//if ( m_baseUrl && addSiteRootFlags )
// m_baseSite = m_baseUrl->getSite ( &m_baseSiteLen, coll, false);
//m_addSiteRootFlags = addSiteRootFlags;
//m_coll = coll;
// sanity check
//if ( addSiteRootFlags && ! coll ) { char *xx=NULL;*xx=0; }
m_numLinks = 0;
m_numNodes = xml->getNumNodes();
m_bufPtr = NULL;
//m_buf[0] = '\0';
//bool useRelNoFollow = true;
//if ( sx ) useRelNoFollow = sx->getBool ("useRelNoFollow",true);
m_linksToGigablast = false;
m_hasRelNoFollow = false;
// unknown if baseUrl is a permalink or not
//m_isPermalink = -1;
//char utf8Buf[MAX_URL_LEN+1];
//long utf8Len = 0;
// this is not a good thing, don't strip here, but we did it for
// version 51 so we have to keep doing it for version 51. we strip
// the session id in Msg10.cpp in the addUrlLoop().
//if ( version == 51 )
// m_stripIds = true;
//else
m_stripIds = false;
// ok, let's remove it for the links: hashing, it just makes more
// sense this way i think. we can normalize the links: terms in the
// query if you are worried about it.
//if ( version >= 54 ) m_stripIds = true;
m_stripIds = true;
// get the <base href=> tag if any (12)
if ( baseUrl ) m_baseUrl = baseUrl;
// count the dirty links
//m_numDirtyLinks = 0;
// get this from the xml of the siteRec
//m_extractRedirects = sx->getBool ("extractRedirectsFromLinks",false);
//bool gotIt = false;
//for ( long i=0; i < m_numNodes ; i++ ) {
// if ( xml->getNodeId ( i ) != TAG_FBORIGLINK ) continue;
// gotIt = true;
// break;
//}
// visit each node in the xml tree. a node can be a tag or a non-tag.
char *urlattr = NULL;
for ( long i=0; i < m_numNodes ; i++ ) {
QUICKPOLL(niceness);
// . continue if this tag ain't an <a href> tag
// . atom feeds have a <link href=""> field in them
long id = xml->getNodeId ( i );
if ( id != TAG_A &&
id != TAG_LINK &&
id != TAG_AREA &&
id != TAG_ENCLOSURE &&
id != TAG_WEBLOG &&
id != TAG_URLFROM && // <UrlFrom> for ahrefs.com
id != TAG_FBORIGLINK )
continue;
urlattr = "href";
if ( id == TAG_WEBLOG ) urlattr ="url";
if ( id == TAG_FBORIGLINK ) m_isFeedBurner = true;
// if it's a back tag continue
if ( xml->isBackTag ( i ) ) continue;
// reset
linkflags_t flags = 0;
// . if it has rel=nofollow then ignore it
// . for old titleRecs we should skip this part so that the
// link: terms are indexed/hashed the same way in XmlDoc.cpp
long slen;
char *s ;
if ( useRelNoFollow ) s = xml->getString ( i , "rel", &slen ) ;
if ( useRelNoFollow &&
slen==8 && // ASCII
strncasecmp ( s,"nofollow", 8 ) == 0) {
// if this flag is set then::hasSpamLinks() will always
// return false. the site owner is taking the necessary
// precautions to prevent link spam.
m_hasRelNoFollow = true;
// . do not ignore it now, just flag it
// . fandango has its ContactUs with a nofollow!
flags |= LF_NOFOLLOW;
}
// get the href field of this anchor tag
long linkLen;
char *link = (char *) xml->getString ( i, urlattr, &linkLen );
// does it have the link after the tag?
//long tagId = xml->getNodeId(i);
// skip the block below if we got one in the tag itself
//if ( linkLen ) tagId = 0;
// if no href, but we are a <link> tag then the url may
// follow, like in an rss feed.
if ( linkLen==0 &&
(id == TAG_LINK ||
id == TAG_URLFROM ||
id == TAG_FBORIGLINK) ) {
// get the <link> node
char *node = xml->getNode(i);
long nodeLen = xml->getNodeLen(i);
// but must NOT end in "/>" then
if ( node[nodeLen-2] == '/' ) continue;
// expect the url like <link> url </link> then
if ( i+2 >= m_numNodes ) continue;
if ( xml->getNodeId(i+2) != id ) continue;
if ( ! xml->isBackTag(i+2) ) continue;
// ok assume url is next node
link = xml->getNode(i+1);
linkLen = xml->getNodeLen(i+1);
// watch out for CDATA
if ( linkLen > 12 &&
strncasecmp(link, "<![CDATA[", 9) == 0 ) {
link += 9;
linkLen -= 12;
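// (the 12 bytes trimmed here cover the 9-byte "<![CDATA[" prefix
// plus the 3-byte "]]>" suffix, leaving just the url text)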
}
}
// was it an enclosure?
//if ( linkLen == 0 && xml->getNodeId( i ) == TAG_XMLTAG )
// link = (char *) xml->getString ( i, "url", &linkLen );
// . it doesn't have an "href" field (could be "name" field)
// . "link" may not be NULL if empty, so use linkLen
if ( linkLen == 0 )
continue;
// skip spaces in the front (should be utf8 compatible)
while ( linkLen > 0 && is_wspace_a(*link) ) {
link++;
linkLen--;
}
// don't add this link if it begins with javascript:
if ( linkLen >= 11 && strncasecmp (link,"javascript:",11) ==0){
// well... a lot of times the provided function has
// the url as an arg to a popup window
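// e.g. onclick="window.open('/photos/123.html')" (a hypothetical
// example); the scan below just grabs the first single-quoted
// string that starts with a '/'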
long oclen = 0;
char *oc = xml->getString(i,"onclick",&oclen);
// if none, bail
if ( ! oc ) continue;
// set end
char *ocend = oc + oclen - 2;
char *ocurl = NULL;
// scan for "'/" which should indicate the url
for ( ; oc < ocend ; oc++ ) {
if ( *oc !='\'' ) continue;
if ( oc[1]!='/' ) continue;
// set the start
ocurl = oc + 1;
// and stop the scan
break;
}
// if none, bail
if ( ! ocurl ) continue;
// now find the end of the url
char *ocurlend = ocurl + 1;
for ( ; ocurlend < ocend ; ocurlend++ )
if ( *ocurlend == '\'' ) break;
// assign it now
link = ocurl;
linkLen = ocurlend - ocurl;
// and continue
}
if ( linkLen == 0 )
continue;
// it's a page-relative link
if ( link[0]=='#' ) continue;
// ignore mailto: links
if ( linkLen >= 7 && strncasecmp( link , "mailto:" , 7 ) == 0 )
continue;
// make sure not too many links already
//if ( version < 72 && m_numLinks >= MAX_LINKS ) break;
QUICKPOLL(niceness);
// if we have a sequence of alnum chars (or hyphens) followed
// by a ':' then that is a protocol. we only support http and
// https protocols right now. let "p" point to the ':'.
char *p = link;
long pmaxLen = linkLen;
if ( pmaxLen > 20 ) pmaxLen = 20;
char *pend = link + pmaxLen;
while ( p < pend && (is_alnum_a(*p) || *p=='-') ) p++;
// is the protocol, if it exists, a valid one like http or
// https? if not, ignore it. we only support FQDNs
// (fully qualified domain names) here really. so if you
// have something like mylocalhostname:8000/ it is not going
// to work anymore. you would need "use /etc/hosts" enabled
// for that to work, too.
bool proto = true;
if ( p < pend && *p == ':' ) { // && version >= 62 ) {
proto = false;
long plen = p - link;
if ( plen == 4 && strncasecmp(link,"http" ,plen) == 0 )
proto = true;
if ( plen == 5 && strncasecmp(link,"https",plen) == 0 )
proto = true;
}
// skip if proto invalid like callto:+4355645998 or
// mailto:jimbob@hoho.com
if ( ! proto ) continue;
// add it
char ptmp [ MAX_URL_LEN + 1 + 1 ];
// keep an underpad of 1 byte in case we need to prepend a /
char *tmp = ptmp + 1;
if ( linkLen > MAX_URL_LEN ) {
// only log this once just so people know, don't spam
// the log with it.
static bool s_flag = 1;
if ( s_flag ) {
s_flag = 0;
log(LOG_INFO, "build: Link len %li is longer "
"than max of %li. Link will not "
"be added to spider queue or "
"indexed for link: search.",
linkLen,(long)MAX_URL_LEN);
}
continue;
}
// see if the <link> tag has a "type" field
bool isRSS = false;
long typeLen;
char *type =(char *)xml->getString(i, "type", &typeLen );
// . MDW: imported from Xml.cpp:
// . check for valid type:
// type="application/atom+xml" (atom)
// type="application/rss+xml" (RSS 1.0/2.0)
// type="application/rdf+xml" (RDF)
// type="text/xml" (RSS .92) support?
// compare
if ( type ) {
if (strncasecmp(type,"application/atom+xml",20)==0)
isRSS=true;
if (strncasecmp(type,"application/rss+xml" ,19)==0)
isRSS=true;
// doesn't seem like good rss
//if (strncasecmp(type,"application/rdf+xml" ,19)==0)
// isRSS=true;
if (strncasecmp(type,"text/xml",8)==0)
isRSS=true;
}
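// e.g. the usual autodiscovery form is (hypothetical markup):
// <link rel="alternate" type="application/rss+xml" href="/feed">
// the rel="alternate" requirement is checked just below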
long relLen = 0;
char *rel = NULL;
// make sure we got rel='alternate' or rel="alternate", etc.
if ( isRSS ) rel = xml->getString(i,"rel",&relLen);
// compare
if ( rel && strncasecmp(rel,"alternate",9) != 0 )
isRSS = false;
// skip if a reply! rss feeds have these links to comments
// and just ignore them for now
if ( rel && strncasecmp(rel,"replies",7)==0 )
continue;
// http://dancleary.blogspot.com/feeds/posts/default uses edit:
if ( rel && strncasecmp(rel,"edit",4)==0 )
continue;
// . if type exists but is not rss/xml, skip it. probably
// javascript, css, etc.
// . NO! i've seen this to be type="text/html"!
//if ( ! isRSS && type ) continue;
// store it
if ( isRSS ) m_hasRSS = true;
// JAB: warning abatement
//unsigned char flags = 0;
//TODO: should we urlEncode here?
// i didn't know this, but links can have encoded html entities
// like &amp; and &gt; etc. in them and we have to decode
// assign the new decoded length.
// this is not compatible with m_doQuickSet because we store
// the "link" ptr into the array of link ptrs, and this uses
// the "tmp" buf.
// nono, need this now otherwise it hits that linkNode<0
// error msg in XmlDoc.cpp. but for Msg13 spider compression
// you might want to do something else then i guess...
//if ( ! m_doQuickSet ) {
linkLen = htmlDecode ( tmp ,
link ,
linkLen,
false,
niceness );
// use tmp buf
link = tmp;
//}
if (!addLink ( link , linkLen , i , setLinkHash ,
version , niceness , isRSS , id , flags ))
return false;
// get the xml node
//XmlNode *node = m_xml->getNodePtr(i);
// set this special member
//node->m_linkNum = m_numLinks - 1;
// set the flag if it is an RSS link
}
// . flag the links we have that are old (spidered last time)
// . set LF_OLDLINK flag
if ( ! flagOldLinks ( oldLinks ) ) return false;
return true;
}
// just a NULL-terminated text buffer/file of links to add
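// the buffer is just whitespace-separated urls, e.g. (hypothetical):
// "http://example.com/\nhttp://example.org/page.html\n"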
bool Links::set ( char *buf , long niceness ) { //char *coll,long niceness ) {
reset();
// need "coll" for Url::isSiteRoot(), etc.
//m_coll = coll;
m_parentUrl = NULL;
m_baseUrl = NULL;
m_addSiteRootFlags = false;
m_xml = NULL;
char *p = buf;
while ( *p ) {
// skip spaces
while ( *p && is_wspace_a(*p) ) p++;
// get the length of the link
char *q = p;
while ( *q && ! is_wspace_a(*q) ) q++;
long len = q - p;
// add the link
if ( ! addLink ( p , len , -1 , false ,
TITLEREC_CURRENT_VERSION , niceness, false,
TAG_A , 0 ) )
return false;
// advance
p = q;
}
// assume none are flagged as old, LF_OLDLINK
m_flagged = true;
return true;
}
// . the blogroll must consist of 2 outlinks to two different external blogs
// in order to be a valid blogroll
// . add all the site root outlinks in the valid blogroll into the
// turk queue so they can be manually reviewed
// . use ruleset "66" to indicate not a blog? that will help us identify
// false blogrolls.
bool Links::queueBlogRoll ( TagRec **tagRecPtrs , long niceness ) {
// sanity check, we must have set Links with this as true!
if ( ! m_addSiteRootFlags ) { char *xx=NULL;*xx=0; }
// do not add blogroll if any of our outlinks are banned
for ( long i = 0 ; i < m_numLinks ; i++ ) {
// skip if none
if ( ! tagRecPtrs[i] ) continue;
// return if any one outlink is banned
if ( tagRecPtrs[i]->getLong("manualban",0) ) return true;
}
// how many nodes do we have?
long nn = m_xml->getNumNodes();
// count the link #
long j = 0;
// the loop over the nodes
long i = 0;
// come back up here if the evaluated blogroll was no good
loop:
// we record the depth of the first valid blog outlink
// and as soon as another link is at a different depth, we terminate
// that particular blogroll, and break out of the loop to evaluate it
long validDepth = -1;
// start of the blog roll (only one allowed)
long start = -1;
// domain hash of the first valid blog outlink
uint32_t dh = 0;
// saved depth of the section
long sd = -1;
// is it NOT a blog roll
char notBlogRoll = 0;
// count the valid blog outlinks
long count = 0;
// loop over every node
for ( ; i < nn ; i++ ) {
// take a breath
QUICKPOLL(niceness);
// skip if we are not a link, otherwise, we are link #j
if ( i != m_linkNodes[j] ) continue;
// inc j so we keep our scan in sync
j++;
// get the xml node
XmlNode *node = m_xml->getNodePtr(i);
// get the "depth" of this link
long depth = node->m_depth;
// if we had encountered a valid blog outlink, and this
// particular outlink is NOT at the same depth, then assume
// that our blogroll is over... and break out to evaluate it
if ( start >= 0 && depth != validDepth ) {
// if we did not have 2+ valid blog outlinks,
// reset and keep going
if ( count < 2 ) goto loop;
// if we had a blogroll, it's over now. break out
// and we can see if it was a valid blogroll.
break;
}
// skip if not a blog roll cuz we got a non-blog outlink
if ( notBlogRoll ) continue;
// skip if from the same domain as this page is
if ( m_linkFlags[j] & LF_SAMEDOM ) continue;
// skip if not a site root or a root url
if ( ! (m_linkFlags[j] & LF_SITEROOT) ) continue;
// is this outlink a blog?
long b = tagRecPtrs[j]->getLong("blog",-1);
// skip if NOT a blog for sure
if ( b == 0 ) {
// set the flag to indicate not a blogroll
notBlogRoll = 1;
continue;
}
// skip if unknown whether it is a blog
if ( b == -1 ) continue;
// normalize it
Url link; link.set ( m_linkPtrs[j] , m_linkLens[j] );
// get domain
char *dom = link.getDomain ();
long dlen = link.getDomainLen();
// must have domain, a valid TLD, otherwise this is NULL
if ( ! dom || dlen == 0 ) continue;
// . save the spot of the first valid blog outlink
// . save his depth too
if ( start < 0 ) { start = j; sd = depth; }
// get domain hash
uint32_t linkDomHash = hash32 ( dom , dlen );
// if we are NOT the start of a blogroll, make sure we got
// a distinct domain...
if ( linkDomHash == dh ) continue;
// count it as a prospect
count++;
// save his depth
//sd = depth;
// store the domain hash, so that we can make sure to get
// to outlinks from two different domains
dh = linkDomHash;
}
// bail if no valid blog roll found
if ( start < 0 ) return true;
// point to the link right before our valid blog outlink
long k = start - 1;
// backup to find the start of the blogroll, keep depth the same
for ( ; k >= 0 ; k-- ) {
// get link node #
i = m_linkNodes[k];
// get the xml node
XmlNode *node = m_xml->getNodePtr(i);
// back up until depth !=
if ( node->m_depth != sd ) break;
}
// . now link #k+1 is the first link in the blogroll
// . and link #j-1 is the last link in the blogroll
//
// TODO: classify as "newsy" if site has a lot of pgs w/ pub dates
//
// index the outlinks in the blogroll, which are roots of sites
// as gbblogroll:<link> and make sure we don't split it.
for ( long i = k + 1 ; i < j ; i++ ) {
// skip if from the same domain
if ( m_linkFlags[i] & LF_SAMEDOM ) continue;
// it must be the root of a "site" too
if ( ! (m_linkFlags[i] & LF_SITEROOT) ) continue;
// must be unidentified
long rs = tagRecPtrs[i]->getLong("ruleset",-1);
// skip if it is already classified in a ruleset
if ( rs != -1 ) continue;
// the link
char *p = m_linkPtrs[i];
long plen = m_linkLens[i];
// tmp NULL
char c = p[plen]; p[plen] = '\0';
// vote on these tagids
//long tagIds[] = { ST_BLOG , ST_NEWS };
// . add it to turk
// . buzz has mostly blogs and mainstream media news sites
/*
g_turk.addUrl ( p ,
0LL , // docid
NULL , // TagRec
tagIds , // tagIds to test for
2 );
*/
// put it back
p[plen] = c;
// now make the score
//unsigned char score = QUALITY;
// . now hash with our score
// . this should only be called by XmlDoc::hashNoSplit()
//if ( ! link.hashAsLink ( version ,
// table ,
// NULL ,
// 0 ,
// score ,
// false , // internal?
// "gbblogroll:" ,
// indexSiteLinks ) )
// return false;
}
// success
return true;
}
// . should this page be considered dirty based on its dirty links
// . ?we should avoid adding dirty pages to the index?
// . we should not add ANY of links if enough are dirty
// . we should not hash link: terms for dirty pages
//bool Links::isPageDirty ( ) {
// if ( m_numLinks < 5 ) return ( m_numDirtyLinks >= 2 ) ;
// // get percent dirty
// long percent = ( m_numDirtyLinks * 100 ) / m_numLinks ;
// return ( percent >= 10 );
//}
bool Links::addLink ( char *link , long linkLen , long nodeNum ,
bool setLinkHash , long titleRecVersion ,
long niceness , bool isRSS , long tagId ,
long flagsArg ){
// don't add 0 length links
if ( linkLen <= 0 ) return true;
// ensure buf has enough room
// if (titleRecVersion < 72){
// if ( m_bufPtr-m_buf + linkLen + 1 > LINK_BUF_SIZE ){
// return true;
// }
// }
// do we need to alloc more link space?
if (m_numLinks >= m_allocLinks) {
long newAllocLinks;
// older titlerecs can't hold more than 10000 links
//if(titleRecVersion<72 && m_allocLinks >= 10000) return true;
if (!m_allocLinks) newAllocLinks =10000;
else if (m_allocLinks<100000) newAllocLinks =m_allocLinks*2;
else newAllocLinks =m_allocLinks+100000;
// how much mem do we need for newAllocLinks links?
long newAllocSize = getLinkBufferSize(newAllocLinks);
QUICKPOLL(niceness);
char *newBuf = (char*)mmalloc(newAllocSize, "Links");
if (!newBuf) return false;
QUICKPOLL(niceness);
// a ptr to it
char *p = newBuf;
// debug msg
log(LOG_DEBUG, "build: resizing Links ptr buffer to %ld",
newAllocSize);
char **newLinkPtrs = (char**)p;
p += newAllocLinks * sizeof(char *) ;
long *newLinkLens = (long*)p;
p += newAllocLinks * sizeof(long) ;
long *newLinkNodes = (long*)p;
p += newAllocLinks * sizeof(long) ;
uint64_t *newLinkHashes = (uint64_t *)p;
p += newAllocLinks * sizeof(uint64_t) ;
uint64_t *newHostHashes = (uint64_t *)p;
p += newAllocLinks * sizeof(uint64_t) ;
long *newDomHashes = (long *)p;
p += newAllocLinks * sizeof(long);
linkflags_t *newLinkFlags = (linkflags_t *)p;
p += newAllocLinks * sizeof(linkflags_t) ;
char **newSpamNotes = (char **)p;
p += newAllocLinks * sizeof(char **);
// sanity check -- check for breach
if ( p > newBuf + newAllocSize ) { char *xx = NULL; *xx = 0; }
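// note: all of the parallel arrays carved out above live in this single
// newBuf allocation, in the same order and with the same sizes that
// getLinkBufferSize() adds up, so they are grown and freed as one block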
if (m_linkBuf){
memcpy(newLinkPtrs, m_linkPtrs,
m_numLinks * sizeof(char*));
QUICKPOLL(niceness);
memcpy(newLinkLens, m_linkLens,
m_numLinks * sizeof(long));
QUICKPOLL(niceness);
memcpy(newLinkNodes, m_linkNodes,
m_numLinks * sizeof(long));
QUICKPOLL(niceness);
memcpy(newLinkHashes, m_linkHashes,
m_numLinks * sizeof(uint64_t));
QUICKPOLL(niceness);
memcpy(newHostHashes, m_hostHashes,
m_numLinks * sizeof(uint64_t));
QUICKPOLL(niceness);
memcpy(newDomHashes, m_domHashes,
m_numLinks * sizeof(long));
QUICKPOLL(niceness);
memcpy(newLinkFlags, m_linkFlags,
m_numLinks * sizeof(linkflags_t));
QUICKPOLL(niceness);
memcpy(newSpamNotes,m_spamNotes,
m_numLinks * sizeof(char *));
long oldSize = getLinkBufferSize(m_allocLinks);
mfree(m_linkBuf, oldSize, "Links");
QUICKPOLL(niceness);
}
m_allocLinks = newAllocLinks;
m_linkBuf = newBuf;
m_linkPtrs = newLinkPtrs;
m_linkLens = newLinkLens;
m_linkNodes = newLinkNodes;
m_linkHashes = newLinkHashes;
m_hostHashes = newHostHashes;
m_domHashes = newDomHashes;
m_linkFlags = newLinkFlags;
m_spamNotes = newSpamNotes;
}
// quickset?
/*
if ( m_doQuickSet ) {
m_linkPtrs [ m_numLinks ] = link;
m_linkLens [ m_numLinks ] = linkLen;
m_linkNodes [ m_numLinks ] = nodeNum;
m_numLinks++;
return true;
}
*/
// normalize the link and prepend base url if needed
Url url;
// if our base url is http://poo.com/user/register/user/register/...
// and this link is "user/register/", for instance, we got a link loop.
// the "webmaster" really meant "/user/register" to be the link.
/*
char fix = false;
//if ( titleRecVersion >= 79 && link[0] != '/' ) {
if ( && link[0] != '/' ) {
// temporarily NULL terminate the link
char c = link[linkLen];
link[linkLen] = '\0';
// get the base url path
char *path = m_baseUrl->getPath();
long plen = m_baseUrl->getPathLen();
char *pend = path + plen;
// is this relative path is repeated in the base url?
char *p = strnstr ( path , // haystack
link , // needle
plen ); // haystackSize
//char *p2 = strnstr (
if ( p )
log("hey");
// advance over the needle in the haystack
if ( p )
p += linkLen;
// but if the relative url contains a / it only needs to
// be repeated once
if ( strchr ( link , '/' ) )
fix = true;
// is it repeated again after that? making it twice repeated?
if ( p && ! fix && strnstr ( p , // haystack
link , // needle
pend - p )) // haystackSize
fix = true;
// put the char back
link[linkLen] = c;
}
char c;
if ( fix ) {
link--;
c = *link;
*link = '/';
linkLen++;
}
*/
// . let's force the www (back to the old ways)
// . before i wasn't, but root urls w or w/o the www can be dynamic
// pages that really are the same, but had a different ad or whatever
//url.set ( m_baseUrl , link , linkLen , true/*addWWW?*/, m_stripIds ,
// MDW: let's try turning it off. slashdot.org's outlinkers are all
// seen as external because we force the 'www' on them here and the
// m_baseUrl does NOT have the 'www'. we now check for "www dups" in
// Msg16.cpp so that should alleviate the issue discussed right above.
//bool addWWW = true;
//if ( titleRecVersion >= 99 ) addWWW = false;
//bool addWWW = false;
bool addWWW = true;
url.set ( m_baseUrl ,
link ,
linkLen ,
addWWW , // addWWW?
m_stripIds ,
// now i strip this thang because the rss
// feeds have a link to every comment but it is
// really the same url...
true , // stripPound?
// convert /index.html to /
// turned this back on per john's request
// will cause undeletable data in existing indexes.
true , // stripCommonFile?
titleRecVersion );// used for removing session ids
// refix the link
//if ( fix ) *link = c;
// sometimes there's links like:
// http://'+ycso[8]+ \n'commentsn?blog_id=... which is within
// <script></script> tags
if ( url.getDomainLen() <= 0 || url.getHostLen() <= 0 ) return true;
// count dirty links
//if ( url.isDirty() ) m_numDirtyLinks++;
// . a lot of links are redirect to other places
// . yahoo.com/drst/pop/2/10/576995/37640326/*http://www.m.com/
// . find the asterix before the url to redirect to if we should
// . if we had a known redirect url in the link, set url class to it
// . this makes the link: search work much better
// . that means we can use yahoo to tell us banned/unbanned sites
//char *s ;
//if ( m_extractRedirects && (s=strchr(url.getUrl(),'*')) ) {
// // . this is really just for yahoo, but we could eventually
// // use an arbitrary delimiter in the site file
// // . skip the *http:/
// s += 7;
// char buf[MAX_URL_LEN];
// strcpy ( buf , s );
// long blen = gbstrlen(buf);
// // . this was causing problems!
// // . sometimes yahoo has nothing after the '*'
// if ( blen == 0 ) return;
// // it must start with http:
// url.set ( buf, blen , true, m_stripIds);
//}
// debug TODO: fix a href=\"http://www.thecounter.com"\ thangs
// if ( url.getUrl()[0] !='h' ) sleep(10);
// ensure buf has enough room
// if (titleRecVersion < 72) {
// if ( m_bufPtr + url.getUrlLen() + 1 >= m_buf+LINK_BUF_SIZE )
// return true;
// }
// this is now set to 0 in XmlNode.cpp
// make sure it is valid
//if ( nodeNum >= 0 )
// // reset this
// m_xml->m_nodes[nodeNum].m_isSelfLink = 0;
// Allocate more link buffer space?
//long bufSize = m_allocSize+LINK_BUF_SIZE;
//long bufSpace = m_allocBuf?m_allocSize - (m_bufPtr-m_allocBuf):0;
long bufSpace ;
if ( m_allocBuf ) bufSpace = m_allocSize - (m_bufPtr-m_allocBuf);
else bufSpace = 0;
// allocate dynamic buffer for lotsa links
if ( url.getUrlLen() + 1 > bufSpace ) {
//if (titleRecVersion < 72 && m_allocSize >= LINK_BUF_SIZE)
// return true;
// grow by 100K
long newAllocSize;// = m_allocSize+LINK_BUF_SIZE;
if ( ! m_allocSize ) newAllocSize = LINK_BUF_SIZE;
else if (m_allocSize < 1024*1024) newAllocSize = m_allocSize*2;
else newAllocSize = m_allocSize + 1024*1024;
// MDW: a realloc would be more efficient here.
QUICKPOLL(niceness);
char *newBuf = (char*)mmalloc(newAllocSize, "Links");
if ( ! newBuf ) return log("build: Links failed to realloc.");
log(LOG_DEBUG, "build: resizing Links text buffer to %ld",
newAllocSize);
QUICKPOLL(niceness);
if ( m_allocBuf ) {
QUICKPOLL(niceness);
memcpy ( newBuf , m_allocBuf , m_allocSize );
QUICKPOLL(niceness);
// update pointers to previous buffer
long offset = newBuf - m_allocBuf;
char *allocEnd = m_allocBuf + m_allocSize;
for (long i = 0 ; i < m_numLinks ; i++ ) {
QUICKPOLL(niceness);
if ( m_linkPtrs[i] < m_allocBuf ) continue;
if ( m_linkPtrs[i] >= allocEnd ) continue;
m_linkPtrs[i] += offset;
}
m_bufPtr += offset;
QUICKPOLL(niceness);
mfree ( m_allocBuf , m_allocSize , "Links");
QUICKPOLL(niceness);
}
else m_bufPtr = newBuf;
m_allocBuf = newBuf;
m_allocSize = newAllocSize;
}
// . is hostname gigablast.com or www.gigablast.com?
// . must be in the top 100k of link text, too!
long hlen = url.getHostLen();
char *h = url.getHost ();
if ( hlen == 13 && strncmp ( h , "gigablast.com" , 13 ) == 0 )
m_linksToGigablast = true;
if ( hlen == 17 && strncmp ( h , "www.gigablast.com" , 17 ) == 0 )
m_linksToGigablast = true;
// add some info
m_linkPtrs [ m_numLinks ] = m_bufPtr;
m_linkLens [ m_numLinks ] = url.getUrlLen();
m_linkNodes [ m_numLinks ] = nodeNum;
// serialize the normalized link into the buffer
memcpy ( m_bufPtr , url.getUrl(), url.getUrlLen() );
m_bufPtr += url.getUrlLen();
QUICKPOLL(niceness);
// and NULL terminate it
*m_bufPtr++ = '\0';
/*
// do permalink detection here
char *d = url.getDomain();
long dlen = url.getDomainLen();
// is the baseurl contained in a link to reddit?
if ( dlen == 10 && m_baseUrl && strncmp ( d , "reddit.com" ) == 0 ) {
// get the baseurl without the http://
char *bh = m_baseUrl->getHost();
char *cgi = url.getCgi();
// our base url is a permalink then
if ( strstr ( cgi , bh ) ) m_isPermalink = 1;
// otherwise, if it has a link elsewhere it is an index page
//else if ( strstr ( cgi, "diggthis.php?") ) m_isPermalink = 0;
}
*/
// . set link hash if we need to
// . the Vector class uses these link hashes for determining similarity
// of this document to another for purposes of fighting link spam
// . we essentially compare the linking web pages against one another
// and if we find one that is similar to another we weight its
// link text down. The more similar the more the penalty. We just
// see what links it has in common with the others for now...
if ( setLinkHash ) {
// sanity
if ( m_doQuickSet ) { char *xx=NULL;*xx=0; }
// get url length
long ulen = url.getUrlLen();
// subtract the cgi length
if ( url.isCgi() ) ulen -= 1 + url.getQueryLen();
// store its hash
m_linkHashes [ m_numLinks ] = url.getUrlHash64();
m_hostHashes [ m_numLinks ] = url.getHostHash64();
m_domHashes [ m_numLinks ] = url.getDomainHash32();
}
// set the bits in the flags byte
linkflags_t flags = flagsArg; // 0;
// set flag bit #0 if it is an "internal" link -- from same hostname
if ( m_baseUrl && url.getHostLen() == m_baseUrl->getHostLen() &&
strncmp(url.getHost(),m_baseUrl->getHost(),url.getHostLen())==0){
flags |= LF_SAMEHOST; //0x01;
flags |= LF_SAMEDOM; //0x02
}
else if (m_baseUrl &&url.getDomainLen() == m_baseUrl->getDomainLen() &&
strncmp(url.getDomain(),m_baseUrl->getDomain(),
url.getDomainLen())==0) {
flags |= LF_SAMEDOM;
// . memoori was adding www.construction.com which redirected
// to construction.com/index.asp and it did not add the
// outlinks because "spider internal links only" was true.
// i.e. Msg16::m_sameHostLinks was true
// . if not same host but domains match, consider it internal
// if hosts only differ by a www. this should fix that.
if ( m_baseUrl->isHostWWW() && !url .hasSubdomain() )
flags |= LF_SAMEHOST;
if ( url. isHostWWW() && !m_baseUrl->hasSubdomain() )
flags |= LF_SAMEHOST;
}
char *tld = url.getTLD();
long tlen = url.getTLDLen();
if ( tlen == 3 && ! strncmp(tld,"edu",3) ) flags |= LF_EDUTLD;
if ( tlen == 3 && ! strncmp(tld,"gov",3) ) flags |= LF_GOVTLD;
//if ( m_addSiteRootFlags ) {
// char *site = NULL;
// long siteLen = 0;
// // i guess TagRec is NULL here. we really should have
// // the tag recs of all the outlinks at this point
// if ( url.isSiteRoot(m_coll,NULL,&site,&siteLen) )
// flags |= LF_SITEROOT;
// // same site flag?
// if ( site&&siteLen==m_baseSiteLen&&
// strncmp(site,m_baseSite,siteLen)==0)
// flags |= LF_SAMESITE;
//}
// rss?
if ( isRSS ) {
// flag it
flags |= LF_RSS;
// we had one
m_hasRSSOutlink = true;
// store the first one
if ( ! m_rssOutlinkPtr ) {
m_rssOutlinkPtr = m_linkPtrs[m_numLinks];
m_rssOutlinkLen = m_linkLens[m_numLinks];
// logit
//char c = link[linkLen];
//link[linkLen]=0;
//logf(LOG_DEBUG,"gb: parent=%s rssoutlink= %s",
// m_parentUrl->m_url,link);
//link[linkLen]=c;
}
}
if ( tagId == TAG_A ) flags |= LF_AHREFTAG;
else if ( tagId == TAG_LINK ) flags |= LF_LINKTAG;
else if ( tagId == TAG_FBORIGLINK ) flags |= LF_FBTAG;
// a self link?
if ( m_parentUrl &&
// MUST be a PROPER subset, links to itself do not count!
url.getUrlLen() == m_parentUrl->getUrlLen() &&
strncmp(url.getUrl(), m_parentUrl->getUrl(),
m_parentUrl->getUrlLen())==0) {
flags |= LF_SELFLINK;
// turn this flag on
if ( nodeNum >= 0 ) m_xml->m_nodes[nodeNum].m_isSelfLink = 1;
}
// now check for the "permalink" key word or "permanent link" keyphrase
// TEST CASES:
//http://www.celebritybabyscoop.com/2008/12/28/jennifer-garner-is-still-pregnant/ + fp_1765421_garner_jennifer_znk_122808jpg/
//http://www.celebritybabyscoop.com/2008/12/27/gwen-stefani-family-spread-holiday-cheer/
// http://www.thetrendwatch.com/2008/12/22/how-big-shows-are-becoming-utterly-blaze/ + events-are-boring/
// http://thinkprogress.org/2008/12/26/bush-pardon-campaign/
if ( ( flags & LF_SELFLINK ) && ( flags & LF_AHREFTAG ) &&
// must be valid
nodeNum >= 0 ) {
XmlNode *nodes = m_xml->getNodes();
// get back tag
long max = nodeNum + 20;
if ( max > m_xml->getNumNodes() ) max = m_xml->getNumNodes();
long nn = nodeNum + 1;
while ( nn < max && nodes[nn].m_nodeId != TAG_A ) nn++;
if ( nn < max ) {
char *s = nodes[nodeNum].m_node;
char *send = nodes[nn].m_node;
for ( ; s < send ; s++ ) {
if ( *s != 'p' && *s != 'P' ) continue;
if ( ! strncasecmp(s,"permalink",9) )
break;
if ( ! strncasecmp(s,"permanent link",14) )
break;
}
if ( s < send ) {
flags |= LF_SELFPERMALINK;
m_hasSelfPermalink = true;
}
}
}
// get each url length without the cgi
long len1 = url.getUrlLen() - url.getQueryLen();
long len2 = 0;
if ( m_parentUrl )
len2 = m_parentUrl->getUrlLen() - m_parentUrl->getQueryLen();
// discount the '?' cuz it is not included in the queryLen right now
if ( url.getQueryLen() ) len1--;
if ( m_parentUrl && m_parentUrl->getQueryLen() ) len2--;
// . is it in a subdir of us?
// TEST CASES:
// http://joedecie.livejournal.com/28834.html?thread=167074#t167074
if ( m_parentUrl &&
// MUST be a PROPER subset, links to itself do not count!
len1 > len2 &&
strncmp(url.getUrl(), m_parentUrl->getUrl(),len2)==0) {
flags |= LF_SUBDIR;
m_hasSubdirOutlink = true;
}
// FIXME:
// http://www.packers.com/news/releases/2008/12/24/1/email_to_a_friend/
// FIXME:
// only has one hyphen but is indeed a permalink!
// http://marccooper.com/xmas-vacation/
// TEST CASES:
// href="...ami.php?url=http://lewebpedagogique.com/blog/2008/11/16/
// la-seconde-guerre-mondiale-cours ... will have its cgi ignored so
// such links as this one will not be considered permalinks
char *pathOverride = NULL;
bool ignoreCgi = false;
if ( (flags & LF_SUBDIR) && m_parentIsPermalink ) {
pathOverride = url.getUrl() + m_parentUrl->getUrlLen();
// must be same host
if ( m_parentUrl->getHostLen() != url.getHostLen() )
pathOverride = NULL;
// same host str check
else if ( strncmp( m_parentUrl->getHost() ,
url.getHost() ,
url.getHostLen()) )
pathOverride = NULL;
// must be in bounds
else if ( url.getUrlLen() <= m_parentUrl->getUrlLen() )
pathOverride = NULL;
// if we are a permalink, ignore cgi for seeing if they are
if ( pathOverride ) ignoreCgi = true;
}
// if it is a subset of a permalink parent, it is not a "true"
// permalink if it concatenates the word "comment" onto the parent url,
// it is likely a permalink for a comment, which does not really count
// TEST CASES:
// www.flickr.com/photos/korayem/2947977582/comment72157608088269210/
// robertsquier.blogspot.com/2008/10/drawing-tv.html?showComment=.2..
// profootballtalk.com/2008/12/28/vikings-win-nfc-north/comment-page-1/
bool permCheck = true;
if ( m_doQuickSet ) permCheck = false;
if ( permCheck && ignoreCgi && strstr(pathOverride,"comment") )
permCheck = false;
if ( permCheck && ignoreCgi && strstr(pathOverride,"Comment") )
permCheck = false;
if ( permCheck && ignoreCgi && strstr(pathOverride,"COMMENT") )
permCheck = false;
// . are we probably a permalink?
// . we do not have a TagRec at this point so we just can not
// tell whether we are siteRoot, and therefore NOT a permalink
linkflags_t extraFlags = 0;
if ( permCheck &&
::isPermalink ( //m_coll ,
NULL , // Links ptr
&url , // the url
CT_HTML , // contentType
NULL , // LinkInfo ptr
isRSS ,
NULL , // note ptr
pathOverride ,
ignoreCgi ,
// might include LF_STRONGPERM
&extraFlags ) ) {
flags |= LF_PERMALINK;
flags |= extraFlags;
}
// set in flag array
m_linkFlags [ m_numLinks ] = flags;
// set to NULL for now -- call setLinkSpam() later...
m_spamNotes [ m_numLinks ] = NULL;
QUICKPOLL(niceness);
// inc the count
m_numLinks++;
return true;
}
// . does link #i have link text?
// . link text must have at least one alnum in it
bool Links::hasLinkText ( long n, long version ) {
// return 0 if no link to our "url"
if ( n >= m_numLinks ) return false;
// get the node range so we can call Xml::getText()
long node1 = m_linkNodes [ n ];
// back-dating this change to version 75, since it happened
// sometime right before that version bump; this way the fewest
// docs end up being indexed wrong
// only for <a> tags
if (node1 >= m_xml->getNumNodes()) return false;
if (m_xml->getNodeId(node1) != TAG_A) return false;
// find the </a> to this <a href> tag, or next <a href> tag
long node2 = m_xml->getNodeNum ( node1+1,9999999,"a",1);
// if not found use the last node in the document
if ( node2 < 0 ) node2 = m_xml->getNumNodes();
// check for text node in (node1,node2) range
for ( long i = node1+1 ; i < node2 ; i++ ) {
// continue if a tag
if ( m_xml->isTag(i) ) continue;
// otherwise, it's text
char *s = m_xml->getNode (i);
char *send = s + m_xml->getNodeLen(i);
// . does it have any alnums in it?
// . may be tricked by html entities like #187; or something
for ( ; s < send ; s += getUtf8CharSize(s) )
if ( is_alnum_utf8 ( s ) ) return true;
}
// otherwise, we found no text node with an alnum
return false;
}
// . stores link text into "buf" and returns the length
// . TODO: speed up so we don't have to set Url for every link in doc
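// . returns 0 if "linkee" is not linked to, if the matching link has no
//   alnum link text, or if the doc is an rss/atom feed -- in the feed case
//   the whole <item>/<entry> blurb is passed back via itemPtr/itemLen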
long Links::getLinkText ( char *linkee ,
bool getSiteLinkInfo ,
char *buf ,
long bufMaxLen ,
//bool filter ,
char **itemPtr ,
long *itemLen ,
long *retNode1 ,
long *retLinkNum ,
long niceness ) {
// assume none
if ( retNode1 ) *retNode1 = -1;
// assume no link text
buf[0] = '\0';
// assume no item
if ( itemPtr ) *itemPtr = NULL;
if ( itemLen ) *itemLen = 0;
// if it is site based, skip the protocol because the site might
// be just a domain and not a subdomain
if ( getSiteLinkInfo ) {
char *pp = strstr ( linkee, "://");
if ( pp ) linkee = pp + 3;
}
long linkeeLen = gbstrlen(linkee);
// find the link point to our url
long i;
for ( i = 0 ; i < m_numLinks ; i++ ) {
QUICKPOLL(niceness);
char *link = getLinkPtr(i);
long linkLen = getLinkLen(i);
// now see if its a full match
// special case if site
if ( getSiteLinkInfo ) {
if ( strstr ( link, linkee ) ) break;
continue;
}
// continue if lengths don't match
if ( linkLen != linkeeLen ) continue;
// continue if the urls don't match
if ( strcmp ( link , linkee ) != 0 ) continue;
// otherwise it's a hit
break;
}
// return 0 if no link to our "url"
if ( i >= m_numLinks ) return 0;
*retLinkNum = i;
return getLinkText2(i,buf,bufMaxLen,itemPtr,itemLen,retNode1,niceness);
}
long Links::getLinkText2 ( long i ,
char *buf ,
long bufMaxLen ,
//bool filter ,
char **itemPtr ,
long *itemLen ,
long *retNode1 ,
long niceness ) {
// get the node range so we can call Xml::getText()
long node1 = m_linkNodes [ i ];
// . <area href=> tags have no link text
// . fix for http://www.cs.umass.edu/%7Everts/index.html 's
// link to phdcomics.com . it was picking up bogus link text
// from page tail.
XmlNode *xmlNodes = m_xml->getNodes();
if ( xmlNodes[node1].m_nodeId == TAG_AREA ) return 0;
// what delimiter are we using? this only applies to rss/atom feeds.
//char *del = NULL;
char del[16];
long dlen = 0;
long rss = m_xml->isRSSFeed();
if ( rss == 1 ) {
//del = "item";
memcpy(del, "item\0", 5);
dlen = 4;
}
else if ( rss == 2 ) {
//del = "entry";
memcpy(del, "entry\0", 6);
dlen = 5;
}
// if rss or atom page, return the whole xml <item> or <entry>
//if ( itemBuf && del ) {
if ( dlen > 0 ) {
// bail if not wanted
if ( ! itemPtr ) return 0;
//log ( LOG_INFO, "Links: Getting Link Item For Url" );
long xmlNumNodes = m_xml->getNumNodes();
// . must come from a <link> node, not a <a>
// . can also be an <enclosure> tag now too
if ( xmlNodes[node1].m_nodeId == 2 ) goto skipItem;
// get item delimiter length
//long dlen = gbstrlen(del);
// back pedal node until we hit <item> or <entry> tag
long j ;
for ( j = node1 ; j > 0 ; j-- ) {
QUICKPOLL(niceness);
// skip text nodes
if ( xmlNodes[j].m_nodeId == 0 ) continue;
// check the tag
if(xmlNodes[j].m_tagNameLen != dlen) continue;
if(strncasecmp(xmlNodes[j].m_tagName,del,dlen))
continue;
break;
}
// . if j is 0 we never found the <item> or <entry> tag
// because rss and atom feeds never start with such a tag
// . but we could be in the <channel> section, which is ok
// so i commented this out
//if ( j == 0 ) return 0;
// ptr to the start of it
char *s = xmlNodes[j].m_node;
// save this
if ( retNode1 ) *retNode1 = j;
// the end ptr
//char *send = s + xmlNodes[j].m_nodeLen;
char *send = m_xml->getContent() + m_xml->getContentLen();
// . start at the first tag in this element/item
// . we will copy the blurb on the interval [j,k)
for ( long k = j+1 ; k < xmlNumNodes ; k++ ) {
QUICKPOLL(niceness);
// get the next node in line
XmlNode *nn = &xmlNodes[k];
// . break out if would be too long
// . save room for terminating \0
//if (nn->m_node+nn->m_nodeLen-s > itemBufSize-1)break;
// break out if done
if ( k >= xmlNumNodes ) break;
// skip text nodes
if ( nn->m_nodeId == 0 ) continue;
// skip script sections, inside script tags
if ( nn->m_nodeId == TAG_SCRIPTTEXT ) continue;
if(nn->m_tagNameLen != dlen) continue;
if(strncasecmp(nn->m_tagName,del,dlen)) continue;
//if ( nn->m_tagNameLen != dlen ) continue;
//if ( strncasecmp(nn->m_tagName,del,dlen)) continue;
// we got the end of the item, set "send"
send = nn->m_node + nn->m_nodeLen;
// and we're done, break out
break;
}
// . if no closing tag was found then "send" just stays at the end of
//   the content, so the blurb runs to the end of the doc
// . that also covers feeds that just have a <channel> section and no
//   items -- we use the whole thing
//if ( ! send ) return 0;
// this is a blurb, send it back as such
*itemPtr = s;
*itemLen = send - s;
// rss feeds do not have conventional link text
return 0;
}
skipItem:
// find the </a> to this <a href> tag, or next <a href> tag
long node2 = m_xml->getNodeNum ( node1+1,9999999,"a",1);
// if not found use the last node in the document
if ( node2 < 0 ) node2 = 99999999;
// get the back tag for node #n0
//long n1 = m_xml->getEndNode ( i );
// now we can call Xml::getText()
long bufLen = m_xml->getText ( buf ,
bufMaxLen ,
node1 , // get kid text of this node
node2 ,
false , // include tags?
true , // visible text only?
//true , // filter? (entities to iso)
false , // filter? (entities to iso)
false , // filter spaces?
false ); // exclude if in <stop index>?
// set it
if ( retNode1 ) *retNode1 = node1;
// hunt for an alnum in the link text
char *p = buf;
char *pend = buf + bufLen;
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
QUICKPOLL ( niceness );
if ( is_alnum_utf8(p) ) break;
}
// if no alnum then return 0 as the link text len
if ( p >= pend ) return 0;
// find last non-space char
char *q = p;
char *last = NULL;
for ( ; q < pend ; q += getUtf8CharSize(q) ) {
QUICKPOLL ( niceness );
if ( ! is_wspace_utf8(q) ) last = q;
}
// hack off trailing spaces
if ( last ) pend = last + getUtf8CharSize(last); // +1;
// shift left if we expunged some leading non-alnums
memmove ( buf , p , pend - p );
// reset buflen
bufLen = pend - p;
// null terminate
buf [ bufLen ] = '\0';
// return length
return bufLen;
}
// find an ascii subtring in linktext for this link and return a pointer
// to it, or NULL if not present
char *Links::linkTextSubstr(long linkNum, char *string, long niceness) {
if (linkNum >= m_numLinks) return NULL;
long nodeNum = getNodeNum(linkNum);
if (nodeNum >= m_xml->getNumNodes()-1) return NULL;
for (long i=nodeNum+1 ; i < m_xml->getNumNodes() ; i++ ) {
XmlNode *node = m_xml->getNodePtr(i);
if (node->getNodeId() == TAG_A) return NULL;
if (node->getNodeId() != TAG_TEXTNODE) continue;
// quickpoll, this is prone to blocking
QUICKPOLL(niceness);
// maybe handle img alt text here someday, too
char *ptr;
if ((ptr = strncasestr(node->getNode(),
node->getNodeLen(), string)))
return ptr;
}
return NULL;
}
// . hash the link: terms
// . ensure that more useful linkers are scored higher
// . useful for computing offsite link text for qdb-ish algorithm
// . NOTE: for now i do not hash links to the same domain in order to
// hopefully save 10%-25% index space
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
// different site links with no link text will be ranked behind them
// . the 8-bit bitmap of the score of a link: term:
// . 00ubdcss u = link is Unbanned? b = link isBanned?
// d = link dirty? c = link clean?
// s = 01 if no link text, 10 if link text
// . NOTE: this is used in Msg18.cpp for extraction
// . CAUTION: IndexList::score32to8() will warp our score if its >= 128
// so i moved the bits down
/*
bool Links::hash ( TermTable *table,
Url *url ,
Url *redirUrl ,
long version,
long niceness ,
// can cal Xml::isRSSFeed() on the content to check
// for the special identifying tag to be a feed
bool isRSSFeed ) {
// we mask in some bits to the score sometimes
//unsigned char mask = 0;
// see if our links are all banned or all unbanned via siteRec
// Xml *sx = sr->getXml();
// later, the score can contain a ruleset that should be used to parse
// the document that is linked to, but just support unbanning of
// soft banned sites for now
//if ( sx->getBool("linksUnbanned", false ) ) mask |= 0x20;
// let's phase these guys out, they aren't really used anyway
//if ( version < 21 ) {
// if ( sx->getBool("linksBanned" , false ) ) mask |= 0x10;
// if ( sx->getBool("linksDirty" , false ) ) mask |= 0x08;
// if ( sx->getBool("linksClean" , false ) ) mask |= 0x04;
//}
// decide if we will index sitelink terms
bool indexSiteLinks = false;
if (version >= 71) {
//if (sx->getBool("indexSiteLinks", true)) indexSiteLinks=true;
indexSiteLinks = true;
}
// see ../url/Url2.cpp for hashAsLink() algorithm
for ( long i = 0 ; i < m_numLinks ; i++ ) {
// skip links with zero 0 length
if ( m_linkLens[i] == 0 ) continue;
// . skip if we are rss page and this link is an <a href> link
// . we only harvest/index <link> urls from rss feeds
// . or in the case of feedburner, those orig tags
if ( isRSSFeed && (m_linkFlags[i] & LF_AHREFTAG) )
continue;
// if we have a <feedburner:origLink> tag, then ignore <link>
// tags and only get the links from the original links
if ( m_isFeedBurner && !(m_linkFlags[i] & LF_FBTAG) )
continue;
// normalize the link
Url2 link;
// now we always add "www" to these links so that any link
// to cnn.com is same as link to www.cnn.com, because either
// we index cnn.com or www.cnn.com but not both providing
// their content is identical (deduping). This way whichever
// one we index, we can take advantage of all link text whether
// it's to cnn.com or www.cnn.com.
// Every now and then we add new session ids to our list in
// Url.cpp, too, so we have to version that.
link.set ( m_linkPtrs[i] ,
m_linkLens[i] ,
true , // addWWW?
m_stripIds ,
false , // stripPound?
false , // stripCommonFile?
version );// used for new session id stripping
QUICKPOLL(niceness);
// . the score depends on some factors:
// . NOTE: these are no longer valid! (see score bitmap above)
// . 4 --> if link has different domain AND has link text
// . 3 --> if link has same domain AND has link text
// . 2 --> if link has different domain AND no link text
// . 1 --> if link has same domain AND no link text
// . is domain the same as ours?
// . NOTE: ideally, using the IP domain would be better, but
// we do not know the ip of the linker right now... so scores
// may be topped with a bunch of same-ip domain links so that
// we may not get as much link text as we'd like, since we
// only sample from one link text per ip domain
// . now we also just use the mid domain! (excludes TLD)
//bool sameMidDomain = false;
bool internal = false;
long mdlen = url->getMidDomainLen();
if ( mdlen == link.getMidDomainLen() &&
strncmp(url->getMidDomain(),link.getMidDomain(),mdlen)==0)
//continue; // sameMidDomain = true;
internal = true;
// also check the redir url
if ( redirUrl ) {
mdlen = redirUrl->getMidDomainLen();
if ( mdlen == link.getMidDomainLen() &&
strncmp(redirUrl->getMidDomain(),
link.getMidDomain(),mdlen)==0)
//continue; // sameMidDomain = true;
internal = true;
}
// select prefix
char *prefix = "link:";
// let's use a different termlist for version 21 and up since
// we include internal links and we do scoring differently.
// once those new termlists get beefed up we can switch over
// to them exclusively.
if ( version >= 21 ) prefix = "links";
// older versions used ilink:
if ( version < 21 && internal ) prefix = "ilink:";
// for now, don't hash same-mid-domain links at all (save disk)
//if ( sameMidDomain ) continue;
// now make the score
unsigned char score ;
// . TODO: consider not hashing link w/o text!
// . otherwise, give it a higher score if it's got link TEXT
bool gotLinkText = hasLinkText ( i, version );
//if ( ! sameMidDomain ) {
// support the old scores for backwards compatibility
if ( version < 21 ) {
if ( gotLinkText ) score = 2; // has link text
else score = 1; // no link text
}
// otherwise, beginning with version 21, allow internal links,
// but with lower scores
else {
// score
// internal, no link text: 2
// internal, w/ link text: 4
// external, no link text: 6
// external, w/ link text: 8
if ( internal ) {
if ( ! gotLinkText ) score = 0x02;
else score = 0x04;
}
else {
if ( ! gotLinkText ) score = 0x06;
else score = 0x08;
}
}
//}
//else if ( gotLinkText ) score = 3;
// set upper 2 bits to indicate of link is banned/unbanned
//score |= mask;
// now hash with our score
if ( ! link.hashAsLink ( version, table , NULL , 0 , score ,
internal , prefix, indexSiteLinks ) )
return false;
QUICKPOLL(niceness);
}
return true;
}
*/
long Links::findLinkNum(char* url, long urlLen) {
for(long i = 0;i< m_numLinks; i++) {
if(m_linkLens[i] == urlLen &&
strncmp(url, m_linkPtrs[i], urlLen) == 0)
return i;
}
return -1;
}
// helper function for shared link ptr buffer
static long getLinkBufferSize(long numLinks){
return numLinks *
(sizeof(char* ) + // linkPtrs
sizeof(long ) + // linkLens
sizeof(long ) + // linkNodes
sizeof(uint64_t ) + // linkHashes
sizeof(uint64_t ) + // hostHashes
sizeof(long ) + // domHashes
sizeof(linkflags_t ) + // linkFlags
sizeof(char* ) // spamNotes
);
}
/*
void Links::removeExternalLinks ( ) {
long j = 0;
char *p = m_allocBuf;
for ( long i = 0 ; i < m_numLinks ; i++ ) {
// skip if not internal (by hostname)
if ( ! isInternalHost(i) ) continue;
// copy it over
memcpy ( p , m_linkPtrs[i] , m_linkLens[i] );
// add it back
m_linkPtrs [j] = p;
m_linkLens [j] = m_linkLens [i];
m_linkNodes [j] = m_linkNodes [i];
m_linkHashes[j] = m_linkHashes[i];
m_hostHashes[j] = m_hostHashes[i];
m_linkFlags [j] = m_linkFlags [i];
// skip it
p += m_linkLens[i];
// NULL
*p++ = '\0';
// inc count of links
j++;
}
// . update m_bufPtr cuz that is what getLinkBufLen() returns!
// . Msg10 uses that to know when to stop adding urls
m_bufPtr = p;
// update count
m_numLinks = j;
}
*/
// returns false and sets g_errno on error
bool Links::flagOldLinks ( Links *old ) {
// do not double call
if ( m_flagged ) return true;
// only call once
m_flagged = true;
// skip if null
if ( ! old ) return true;
// hash the old links into a table
HashTable ht;
for ( long i = 0 ; i < old->m_numLinks ; i++ ) {
// get the url
char *u = old->m_linkPtrs[i];
long ulen = old->m_linkLens[i];
// hash it
long long uh = hash32 ( u , ulen );
// it does not like keys of 0, that means empty slot
if ( uh == 0 ) uh = 1;
// add to hash table
if ( ! ht.addKey ( uh , 1 ) ) return false;
}
// set the flags
for ( long i = 0 ; i < m_numLinks ; i++ ) {
// get the url
char *u = m_linkPtrs[i];
long ulen = m_linkLens[i];
// get our hash
long long uh = hash32 ( u , ulen );
// it does not like keys of 0, that means empty slot
if ( uh == 0 ) uh = 1;
// check if our hash is in this hash table, if not, then
// it is a new link, skip this
if ( ht.getSlot ( uh ) < 0 ) continue;
// otherwise it was spidered last time, so flag it as old
m_linkFlags[i] |= LF_OLDLINK;
}
return true;
}
//static char s_isLinkRSS;
//static char s_permalink;
//static long s_age;
// . are we a permalink?
// . this registers as a permalink which it is not:
// http://www.dawn.com/2009/01/04/rss.htm
// http://www.msnbc.msn.com/id/3032072
bool isPermalink ( //char *coll ,
Links *links ,
Url *u ,
char contentType ,
LinkInfo *linkInfo ,
bool isRSS ,
char **note ,
char *pathOverride ,
bool ignoreCgi ,
linkflags_t *retFlags ) {
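// rough summary of the heuristics below: reject feeds, site roots and
// non-html; accept known feedproxy/feedburner url forms; otherwise
// require a ?p=<digit> cgi, 3+ digits in a row, or 2+ hyphens/underscores
// in one path component, then veto based on a ?m=yyyymm cgi or on
// same-host outlinks that look like stronger permalinks than we do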
// reset. caller will OR these into its flags
if ( retFlags ) *retFlags = 0;
// how can this happen?
if ( ! u ) return false;
// rss feeds cannot be permalinks
if ( isRSS ) { if ( note ) *note = "url is rss feed."; return false; }
// root pages don't get to be permalinks
if ( u->isRoot() ) {
if ( note ) *note = "url is a site root"; return false; }
// are we a "site root" i.e. hometown.com/users/fred/ etc.
//if ( u->isSiteRoot ( coll ) ) {
// if ( note ) *note = "url is a site root"; return false; }
// only html (atom feeds link to themselves)
if ( contentType != CT_HTML) {
if ( note ) *note = "content is not html"; return false;}
// techcrunch has links like this in the rss:
// http://feedproxy.google.com/~r/Techcrunch/~3/pMaRh78u1W8/
if ( strncmp(u->getHost(),"feedproxy.",10)==0 ) {
if ( note ) *note = "from feedproxy host"; return true; }
// might want to get <feedburner:origLink> instead of <link> if
// we can. that would save a redirect through evil g
if ( strncmp(u->getHost(),"feeds.feedburner.com/~",22)==0 ) {
if ( note ) *note = "feedburner tilde url"; return true; }
// . BUT if it has a link to itself on digg, reddit, etc. then it
// doesn't need to have the digits or the underscores...
// . this helps us distinguish between
// science.howstuffworks.com/fantasy-football.html (permalink) and
// science.howstuffworks.com/space-channel.htm (NOT a permalink)
// . i guess this includes "post a comment" links that are just
// anchor links to the textarea at the page bottom so that will fix:
// http://workandplay.vox.com/library/post/running-again.html
// . includes trackbacks, comments feed, etc.
// . returns -1 if unknown whether it is a permalink or not
char status = -1;
if ( links ) status = links->isPermalink ( note );
if ( status == 1 ) return true;
if ( status == 0 ) return false;
char *pathStart = u->getPath();
// a hack by Links.cpp after setting LF_SUBDIR
if ( pathOverride ) pathStart = pathOverride;
// compute these
linkflags_t extraFlags = 0;
// we must have a sequence of 3 or more digits in the path
char *p = pathStart;
long plen = u->getPathLen();
char *pend = u->getPath() + plen;
long dcount = 0;
// now we scan the cgi stuff too!!
// http://www.rocklintoday.com/news/templates/sierra_college.asp?articleid=6848&zoneid=51
// http://www.freemarketnews.com/WorldNews.asp?nid=57373
char *uend = u->getUrl() + u->getUrlLen();
// halt at path if we should
if ( ignoreCgi ) uend -= u->getQueryLen(); // CgiLen();
// see if we find the digits in the cgi part
bool digitsInCgi = false;
// start scanning at the path
for ( ; p < uend ; p++ ) {
if ( *p == '?' ) digitsInCgi = true;
// if not a digit, reset count
if ( ! is_digit(*p) ) { dcount = 0; continue; }
// . check if it is a "strong permalink"
// . i.e. contains /yyyy/mm/?? in PATH (not cgi)
if ( p + 9 < pend &&
*(p-1)=='/' &&
is_digit(p[0]) &&
is_digit(p[1]) &&
is_digit(p[2]) &&
is_digit(p[3]) &&
p[4] == '/' &&
is_digit(p[5]) &&
is_digit(p[6]) &&
p[7] == '/' ) {
//is_digit(p[8]) &&
//is_digit(p[9]) &&
//p[10] == '/' )
// http://www.it.com.cn/f/office/091/4/722111.htm
// was thought to have strong outlinks, but they were
// not! this should fix it...
long y = atoi(p+0);
long m = atoi(p+5);
// make sure the year and month are in range
if ( y >= 1990 && y <= 2050 && m >= 1 && m <= 31 )
extraFlags |= LF_STRONGPERM;
}
// count it if a digit
if ( ++dcount == 3 ) break;
}
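// e.g. a path like /2008/12/28/some-title/ (a hypothetical path in the
// style of the test-case urls earlier in this file) trips both the
// /yyyy/mm/ check (LF_STRONGPERM) and the 3-digits-in-a-row count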
// it can also have 2+ hyphens or 2+ underscores in a single
// path component to be a permalink
long hcount = 0;
p = pathStart;
for ( ; p < pend ; p++ ) {
// if not a digit, reset count
if ( *p == '/' ) { hcount = 0; continue; }
// is it a thing?
if ( *p != '_' && *p != '-' ) continue;
// count it
if ( ++hcount == 2 ) break;
}
// we can have a cgi of "?p=<digit>" and be a permalink
p = u->m_query;
bool hasp = ( p && p[0]=='p' && p[1]=='=' && is_digit(p[2]) ) ;
// fix for http://proglobalbusiness.org/?m=200806 being detected as
// a permalink... it has ?p=xxx outlinks.
if ( hasp ) extraFlags |= LF_STRONGPERM;
// return these if the caller wants them
if ( retFlags ) *retFlags = extraFlags;
	// . if we have none of those then we're not a permalink
// . THIS STILL FAILS on stuff like:
// BUT we can fix that by doing url pattern analysis? yeah,
// each domain can have a tag that is the permalink subdir, so
// that any url in that subdir is a permalink.
if ( ! hasp && dcount < 3 && hcount < 2 ) {
if ( note )
*note = "path has no digits, underscores or hyphens";
return false;
}
// if self link check for link text "permalink" then we are
// probably very strongly a permalink
// http://www.5minutesformom.com/5225/wordless-wednesday-angel/
	// has a /promote-your-site tack-on which causes the LF_SUBDIR
	// algo to call the parent a NON-permalink. this should fix that
// because it has a link to itself with the word "permalink"
if ( links && links->hasSelfPermalink() ) {
if ( note ) *note = "has permalink text to itself";
return true;
}
// http://proglobalbusiness.org/?m=200806 is never a permalink
p = u->m_query;
if ( p && p[0]=='m' && p[1]=='=' && is_digit(p[2]) ) {
long n = atoi(p+2);
if ( n > 199000 && n < 205000 ) {
if ( note ) *note = "has ?m=<year><month> cgi";
return false;
}
}
// . if we have an internal outlink that is a permalink and is
// in a subdirectory of us, THEN we are not a permalink
// . fixes andrewsullivan.atlanticmonthly.com/the_daily_dish/
linkflags_t mf = (LF_PERMALINK | LF_SAMEHOST | LF_SUBDIR );
// loop over all outlinks
long no = 0;
// make sure we got them
if ( links ) no = links->m_numLinks;
	// practically all internal outlinks have LF_SUBDIR set for permalinks
	// of a url like http://www.breitbart.tv/?p=249453 so do not run this
	// outlink algorithm on such urls! basically, anytime our only
	// permalink indicator is in the cgi portion of the url, skip the
	// subdir algorithm.
if ( hcount < 2 && dcount < 3 && hasp ) no = 0;
// or if we only got digits and they were in the cgi
if ( hcount < 2 && ! hasp && digitsInCgi ) no = 0;
// do the outlink loop
for ( long i = 0 ; i < no ; i++ ) {
// get the flags
linkflags_t flags = links->m_linkFlags[i];
// skip if not a match. match the match flags = "mf"
if ( (flags & mf) != mf ) continue;
// allow /print/ "printer view" pages
//if ( strstr ( links->m_linkPtrs[i],"/print" ) ) continue;
if ( note ) *note = "has subdir permalink outlink";
// ok, we are not a permalink now
return false;
}
// now check for strong outlinks on same host when we are not strong
if ( links ) no = links->m_numLinks;
// if we are strong, forget it
if ( extraFlags & LF_STRONGPERM ) no = 0;
// look for strong permalink outlinks
mf = (LF_STRONGPERM| LF_SAMEHOST );
// loop over all outlinks we have
for ( long i = 0 ; i < no ; i++ ) {
// get the flags
linkflags_t flags = links->m_linkFlags[i];
// . if we are NOT a "strong permalink" but we have a same host
// outlink that is, then we are not a permalink
// . fixes: http://blog.makezine.com/archive/kids/
// ?CMP=OTC-0D6B48984890
if ( (flags & mf) != mf ) continue;
// allow /print/ "printer view" pages
//if ( strstr ( links->m_linkPtrs[i],"/print" ) ) continue;
if ( note ) *note = "has strong permalink outlink";
// ok, we are not a permalink now
return false;
}
// no permalinks for archive directories
if ( (gb_strcasestr(u->getPath(),"/archive")||
u->getPathDepth(false)==0) &&
gb_strcasestr(u->getPath(), "/index.") &&
!u->isCgi()){
if ( note ) *note = "has /archive and /index. and not cgi";
return false;}
	// at first /tag/ seemed ok --> http://www.makeuseof.com/
// BUT technorati.com/tag/search-engine-optimization is not a
// permalink!!! i took technorati.com|jp out of ruleset 36 for now
// ah, but a ton of the urls have /tags/ and are NOT permalinks!!!
if (gb_strcasestr(u->getPath(), "/tag/")){
if ( note ) *note = "has /tag/"; return false;}
// no forums or category indexes
if (gb_strcasestr(u->getPath(), "/category")){
if ( note ) *note = "has /category"; return false;}
if (gb_strcasestr(u->getPath(), "/cat_")){
if ( note ) *note = "has /cat_"; return false;}
// http://www.retailerdaily.com/cat/search-engine-marketing/
if (gb_strcasestr(u->getPath(), "/cat/")){
if ( note ) *note = "has /cat/"; return false;}
if (gb_strcasestr(u->getPath(), "/comment.html")){
if ( note ) *note = "has /comment.html"; return false;}
if (gb_strcasestr(u->getPath(), "/comments/")){
if ( note ) *note = "has /comments/"; return false;}
char *pos;
// category or tag page detection
pos = gb_strcasestr(u->getUrl(), "cat=");
if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
if ( note ) *note = "has [A-z]cat="; return false;}
pos = gb_strcasestr(u->getUrl(), "tag=");
if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
if ( note ) *note = "has [A-z]tag="; return false;}
pos = gb_strcasestr(u->getUrl(), "tags=");
if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
if ( note ) *note = "has [A-z]tags="; return false;}
// more forum detection
if (gb_strcasestr(u->getUrl(), "forum")){
if ( note ) *note = "has forum"; return false;}
if (gb_strcasestr(u->getPath(), "thread")){
if ( note ) *note = "has thread"; return false;}
if (gb_strcasestr(u->getPath(), "topic") &&
!gb_strcasestr(u->getPath(), "/topics/")){
if ( note ) *note = "has /topics/"; return false;}
// more index page detection
if (gb_strcasestr(u->getPath(), "/default.")){
if ( note ) *note = "has /default."; return false;}
if (gb_strcasestr(u->getPath(), "/profile.")){
if ( note ) *note = "has /profile."; return false;}
if (gb_strcasestr(u->getPath(), "/archives.")){
if ( note ) *note = "has /archives."; return false;}
if (gb_strcasestr(u->getPath(), "_archive.")){
if ( note ) *note = "has _archive."; return false;}
if (gb_strcasestr(u->getPath(), "/search.")){
if ( note ) *note = "has /search."; return false; }
if (gb_strcasestr(u->getPath(), "/search/")){
if ( note ) *note = "has /search/"; return false; }
// get path end
p = u->getPath() + u->getPathLen();
plen = u->getPathLen();
// back up over index.html
if ( plen > 10 && strncmp(p-10,"index.html",10)==0 ) {
plen -= 10; p -= 10; }
// hack off the /
if ( p[-1]=='/' ) { plen--; p--; }
// ends in /trackback means not a permalink
if ( plen >= 10 && strncasecmp(p-10,"/trackback",10)==0) {
if ( note ) *note = "ends in /trackback";
return false;
}
// ends in /dddd/dd means usually an archive date
if ( plen >= 8 &&
is_digit(p[-1]) &&
is_digit(p[-2]) &&
p[-3] == '/' &&
is_digit(p[-4]) &&
is_digit(p[-5]) &&
is_digit(p[-6]) &&
is_digit(p[-7]) &&
p[-8] == '/' ) {
// ensure the numbers are in range for a date
long year = atoi(p-7);
long month = atoi(p-2);
if ( year > 1990 && year <= 2015 &&
month > 0 && month <= 12 ) {
if ( note ) *note = "ends in /dddd/dd/";
return false;
}
}
// /2008 is usually not permalink
if ( plen >= 5 &&
p[-5] == '/' &&
p[-4] == '2' &&
p[-3] == '0' &&
atoi(p-2) < 50 ) {
if ( note ) *note = "ends in year /20xx";
return false;
}
// /199? too
if ( plen >= 5 &&
p[-5] == '/' &&
p[-4] == '1' &&
p[-3] == '9' &&
atoi(p-2) > 90 ) {
if ( note ) *note = "ends in year /19xx";
return false;
}
// . look for a repetitive sequence of html tags
// . each must contain an outlink! there are some blog entries that
// have excerpts of an email chain.
// . if the repetition intersects the main content section,
// then it is an index page
// . make sure that SCORES can detect comment sections. very often
// the comment is bigger than the main section!!!!
// . or we can subtract the repetitive sections, and see if we have
// any beefy content left over... then we don't have to worry
// about comment identification
// . index tag pairs
// . look at header tags, div, p, index level # before the pair
	// . find the delimiter between the blurbs
	// . delimiter must touch the beefy content section
	// . go by "strings" of tagids, a tagid of 0 means text i think
	// but eliminate it if pure punctuation
	// . and have a subtagid field, which is the hash of a tag's attributes
	// BUT in the case of a text tag, a hash of the alpha chars
	// . how many delimiters can we find that start at level X.
// . now we are determining
if ( note ) *note = "is permalink";
return true;
}
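
// a minimal sketch (hypothetical helper, not referenced anywhere in this
// file) of the trailing "/dddd/dd" archive-date test used above; assumes
// "p" points one byte past the end of the path and "plen" is its length.
static bool endsInArchiveDate ( char *p , long plen ) {
	if ( plen < 8 ) return false;
	// expect ".../dddd/dd" at the end of the path
	if ( p[-8] != '/' || p[-3] != '/' ) return false;
	for ( long i = -7 ; i <= -1 ; i++ )
		if ( i != -3 && ! is_digit(p[i]) ) return false;
	// sanity check the year and month ranges, like the code above
	long year  = atoi ( p - 7 );
	long month = atoi ( p - 2 );
	return ( year > 1990 && year <= 2015 && month > 0 && month <= 12 );
}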
long getSiteRank ( long sni ) {
if ( sni <= 0 ) return 0;
if ( sni <= 1 ) return 1;
if ( sni <= 2 ) return 2;
if ( sni <= 3 ) return 3;
if ( sni <= 4 ) return 4;
if ( sni <= 5 ) return 5;
if ( sni <= 9 ) return 6;
if ( sni <= 19 ) return 7;
if ( sni <= 39 ) return 8;
if ( sni <= 79 ) return 9;
if ( sni <= 200-1 ) return 10;
if ( sni <= 500-1 ) return 11;
if ( sni <= 2000-1 ) return 12;
if ( sni <= 5000-1 ) return 13;
if ( sni <= 10000-1 ) return 14;
//if ( sni <= 3120 ) return 15;
return 15;
}
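
// usage sketch for getSiteRank(): sni is presumably the site's inlink
// count; sample values mapped through the thresholds above:
//   getSiteRank(0)    -> 0     getSiteRank(10)    -> 7
//   getSiteRank(5)    -> 5     getSiteRank(100)   -> 10
//   getSiteRank(1500) -> 12    getSiteRank(50000) -> 15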