diff --git a/Collectiondb.h b/Collectiondb.h index 75bb0858..2b3a2c2b 100644 --- a/Collectiondb.h +++ b/Collectiondb.h @@ -535,6 +535,7 @@ class CollectionRec { char m_recycleContent ; char m_recycleCatdb ; char m_getLinkInfo ; // turn off to save seeks + char m_computeSiteNumInlinks ; //char m_recycleLinkInfo2 ; // ALWAYS recycle linkInfo2? //char m_useLinkInfo2ForQuality ; char m_indexInlinkNeighborhoods; diff --git a/Linkdb.cpp b/Linkdb.cpp index df9dd43f..7331a681 100644 --- a/Linkdb.cpp +++ b/Linkdb.cpp @@ -714,6 +714,14 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) { // used by sendReply() req->m_udpSlot = slot; + if ( g_conf.m_logDebugLinkInfo && req->m_mode == MODE_SITELINKINFO ) { + log("linkdb: got msg25 request sitehash64=%"INT64" " + "site=%s " + ,req->m_siteHash64 + ,req->ptr_site + ); + } + // set up the hashtable if our first time if ( ! g_lineTable.isInitialized() ) g_lineTable.set ( 8,sizeof(Msg25Request *),256, @@ -738,7 +746,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) { req->m_next = head->m_next; head->m_next = req; // note it for debugging - log("build: msg25 request waiting in line for %s slot=0x%"PTRFMT"", + log("build: msg25 request waiting in line for %s " + "udpslot=0x%"PTRFMT"", req->ptr_url,(PTRTYPE)slot); // we will send a reply back for this guy when done // getting the reply for the head msg25request @@ -1116,9 +1125,9 @@ bool Msg25::doReadLoop ( ) { if ( g_conf.m_logDebugLinkInfo ) { char *ms = "page"; if ( m_mode == MODE_SITELINKINFO ) ms = "site"; - log("msg25: getting full linkinfo mode=%s site=%s url=%s " - "docid=%"INT64"", - ms,m_site,m_url,m_docId); + log("msg25: reading linkdb list mode=%s site=%s url=%s " + "docid=%"INT64" linkdbstartkey=%s", + ms,m_site,m_url,m_docId,KEYSTR(&startKey,LDBKS)); } m_gettingList = true; @@ -2308,8 +2317,9 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) { } // debug if ( g_conf.m_logDebugLinkInfo ) { - log("linkdb: recalling 
round=%"INT32" for %s=%s", - m_round,ms,m_site); + log("linkdb: recalling round=%"INT32" for %s=%s " + "req=0x%"PTRFMT" numlinkerreplies=%"INT32, + m_round,ms,m_site,(PTRTYPE)m_req25,m_numReplyPtrs); } // and re-call. returns true if did not block. // returns true with g_errno set on error. diff --git a/Parms.cpp b/Parms.cpp index 8311d08d..08f63da0 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -17315,6 +17315,21 @@ void Parms::init ( ) { m->m_obj = OBJ_COLL; m++; + m->m_title = "compute site num inlinks"; + m->m_desc = "If this is true Gigablast will " + "compute the number of site inlinks for the sites it " + "indexes. It will cache them in tagdb for some time. " + "The greater the number of inlinks, the longer the cached " + "time, because the site is considered more stable."; + m->m_cgi = "csni"; + m->m_off = (char *)&cr.m_computeSiteNumInlinks - x; + m->m_type = TYPE_BOOL; + m->m_def = "1"; + m->m_flags = PF_CLONE|PF_API;//PF_HIDDEN | PF_NOSAVE; + m->m_page = PAGE_SPIDER; + m->m_obj = OBJ_COLL; + m++; + m->m_title = "do link spam checking"; m->m_desc = "If this is true, do not allow spammy inlinks to vote. " "This check is " diff --git a/XmlDoc.cpp b/XmlDoc.cpp index 484defd8..5737a369 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -13801,6 +13801,30 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) { // sanity check if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;} + CollectionRec *cr = getCollRec(); + if ( ! cr ) return NULL; + + // hack for speed. computeSiteNumInlinks is true by default + // but if the user turns it off then just use sitelinks.txt + if ( ! cr->m_computeSiteNumInlinks ) { + int32_t hostHash32 = getHostHash32a(); + int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 ); + // try with www if not there + if ( min < 0 && ! 
m_firstUrl.hasSubdomain() ) { + int32_t wwwHash32 = m_firstUrl.getHash32WithWWW(); + min = g_tagdb.getMinSiteInlinks ( wwwHash32 ); + } + // if still not in sitelinks.txt, just use 0 + if ( min < 0 ) { + m_siteNumInlinksValid = true; + m_siteNumInlinks = 0; + return &m_siteNumInlinks; + } + m_siteNumInlinks = min; + m_siteNumInlinksValid = true; + return &m_siteNumInlinks; + } + setStatus ( "getting site num inlinks"); // get it from the tag rec if we can @@ -13838,9 +13862,6 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) { if ( ! wfts ) return NULL; if ( wfts == -1 ) return (int32_t *)-1; - CollectionRec *cr = getCollRec(); - if ( ! cr ) return NULL; - setStatus ( "getting site num inlinks"); // check the tag first Tag *tag = gr->getTag ("sitenuminlinks");