add switch to turn off site num inlink computation and just

use sitelinks.txt for speed
This commit is contained in:
Matt 2015-08-31 22:29:51 -06:00
parent 9de719b050
commit de51769e5a
4 changed files with 56 additions and 9 deletions

View File

@ -535,6 +535,7 @@ class CollectionRec {
char m_recycleContent ;
char m_recycleCatdb ;
char m_getLinkInfo ; // turn off to save seeks
char m_computeSiteNumInlinks ;
//char m_recycleLinkInfo2 ; // ALWAYS recycle linkInfo2?
//char m_useLinkInfo2ForQuality ;
char m_indexInlinkNeighborhoods;

View File

@ -714,6 +714,14 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
// used by sendReply()
req->m_udpSlot = slot;
if ( g_conf.m_logDebugLinkInfo && req->m_mode == MODE_SITELINKINFO ) {
log("linkdb: got msg25 request sitehash64=%"INT64" "
"site=%s "
,req->m_siteHash64
,req->ptr_site
);
}
// set up the hashtable if our first time
if ( ! g_lineTable.isInitialized() )
g_lineTable.set ( 8,sizeof(Msg25Request *),256,
@ -738,7 +746,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
req->m_next = head->m_next;
head->m_next = req;
// note it for debugging
log("build: msg25 request waiting in line for %s slot=0x%"PTRFMT"",
log("build: msg25 request waiting in line for %s "
"udpslot=0x%"PTRFMT"",
req->ptr_url,(PTRTYPE)slot);
// we will send a reply back for this guy when done
// getting the reply for the head msg25request
@ -1116,9 +1125,9 @@ bool Msg25::doReadLoop ( ) {
if ( g_conf.m_logDebugLinkInfo ) {
char *ms = "page";
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
log("msg25: getting full linkinfo mode=%s site=%s url=%s "
"docid=%"INT64"",
ms,m_site,m_url,m_docId);
log("msg25: reading linkdb list mode=%s site=%s url=%s "
"docid=%"INT64" linkdbstartkey=%s",
ms,m_site,m_url,m_docId,KEYSTR(&startKey,LDBKS));
}
m_gettingList = true;
@ -2308,8 +2317,9 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
}
// debug
if ( g_conf.m_logDebugLinkInfo ) {
log("linkdb: recalling round=%"INT32" for %s=%s",
m_round,ms,m_site);
log("linkdb: recalling round=%"INT32" for %s=%s "
"req=0x%"PTRFMT" numlinkerreplies=%"INT32,
m_round,ms,m_site,(PTRTYPE)m_req25,m_numReplyPtrs);
}
// and re-call. returns true if did not block.
// returns true with g_errno set on error.

View File

@ -17315,6 +17315,21 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
m->m_title = "compute site num inlinks";
m->m_desc = "If this is true Gigablast will "
"compute the number of site inlinks for the sites it "
"indexes. It will cache them in tagdb for some time. "
"The greater the number of inlinks, the longer the cached "
"time, because the site is considered more stable.";
m->m_cgi = "csni";
m->m_off = (char *)&cr.m_computeSiteNumInlinks - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_CLONE|PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "do link spam checking";
m->m_desc = "If this is true, do not allow spammy inlinks to vote. "
"This check is "

View File

@ -13801,6 +13801,30 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
// sanity check
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// hack for speed. computeSiteNumInlinks is true by default
// but if the user turns it off then just use sitelinks.txt
if ( ! cr->m_computeSiteNumInlinks ) {
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
// if still not in sitelinks.txt, just use 0
if ( min < 0 ) {
m_siteNumInlinksValid = true;
m_siteNumInlinks = 0;
return &m_siteNumInlinks;
}
m_siteNumInlinks = min;
m_siteNumInlinksValid = true;
return &m_siteNumInlinks;
}
setStatus ( "getting site num inlinks");
// get it from the tag rec if we can
@ -13838,9 +13862,6 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
if ( ! wfts ) return NULL;
if ( wfts == -1 ) return (int32_t *)-1;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
setStatus ( "getting site num inlinks");
// check the tag first
Tag *tag = gr->getTag ("sitenuminlinks");