mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
add switch to turn off site num inlink computation and just
use sitelinks.txt for speed
This commit is contained in:
parent
9de719b050
commit
de51769e5a
@ -535,6 +535,7 @@ class CollectionRec {
|
||||
char m_recycleContent ;
|
||||
char m_recycleCatdb ;
|
||||
char m_getLinkInfo ; // turn off to save seeks
|
||||
char m_computeSiteNumInlinks ;
|
||||
//char m_recycleLinkInfo2 ; // ALWAYS recycle linkInfo2?
|
||||
//char m_useLinkInfo2ForQuality ;
|
||||
char m_indexInlinkNeighborhoods;
|
||||
|
22
Linkdb.cpp
22
Linkdb.cpp
@ -714,6 +714,14 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
|
||||
// used by sendReply()
|
||||
req->m_udpSlot = slot;
|
||||
|
||||
if ( g_conf.m_logDebugLinkInfo && req->m_mode == MODE_SITELINKINFO ) {
|
||||
log("linkdb: got msg25 request sitehash64=%"INT64" "
|
||||
"site=%s "
|
||||
,req->m_siteHash64
|
||||
,req->ptr_site
|
||||
);
|
||||
}
|
||||
|
||||
// set up the hashtable if our first time
|
||||
if ( ! g_lineTable.isInitialized() )
|
||||
g_lineTable.set ( 8,sizeof(Msg25Request *),256,
|
||||
@ -738,7 +746,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
|
||||
req->m_next = head->m_next;
|
||||
head->m_next = req;
|
||||
// note it for debugging
|
||||
log("build: msg25 request waiting in line for %s slot=0x%"PTRFMT"",
|
||||
log("build: msg25 request waiting in line for %s "
|
||||
"udpslot=0x%"PTRFMT"",
|
||||
req->ptr_url,(PTRTYPE)slot);
|
||||
// we will send a reply back for this guy when done
|
||||
// getting the reply for the head msg25request
|
||||
@ -1116,9 +1125,9 @@ bool Msg25::doReadLoop ( ) {
|
||||
if ( g_conf.m_logDebugLinkInfo ) {
|
||||
char *ms = "page";
|
||||
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
|
||||
log("msg25: getting full linkinfo mode=%s site=%s url=%s "
|
||||
"docid=%"INT64"",
|
||||
ms,m_site,m_url,m_docId);
|
||||
log("msg25: reading linkdb list mode=%s site=%s url=%s "
|
||||
"docid=%"INT64" linkdbstartkey=%s",
|
||||
ms,m_site,m_url,m_docId,KEYSTR(&startKey,LDBKS));
|
||||
}
|
||||
|
||||
m_gettingList = true;
|
||||
@ -2308,8 +2317,9 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
}
|
||||
// debug
|
||||
if ( g_conf.m_logDebugLinkInfo ) {
|
||||
log("linkdb: recalling round=%"INT32" for %s=%s",
|
||||
m_round,ms,m_site);
|
||||
log("linkdb: recalling round=%"INT32" for %s=%s "
|
||||
"req=0x%"PTRFMT" numlinkerreplies=%"INT32,
|
||||
m_round,ms,m_site,(PTRTYPE)m_req25,m_numReplyPtrs);
|
||||
}
|
||||
// and re-call. returns true if did not block.
|
||||
// returns true with g_errno set on error.
|
||||
|
15
Parms.cpp
15
Parms.cpp
@ -17315,6 +17315,21 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "compute site num inlinks";
|
||||
m->m_desc = "If this is true Gigablast will "
|
||||
"compute the number of site inlinks for the sites it "
|
||||
"indexes. It will cache them in tagdb for some time. "
|
||||
"The greater the number of inlinks, the longer the cached "
|
||||
"time, because the site is considered more stable.";
|
||||
m->m_cgi = "csni";
|
||||
m->m_off = (char *)&cr.m_computeSiteNumInlinks - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_flags = PF_CLONE|PF_API;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_SPIDER;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "do link spam checking";
|
||||
m->m_desc = "If this is true, do not allow spammy inlinks to vote. "
|
||||
"This check is "
|
||||
|
27
XmlDoc.cpp
27
XmlDoc.cpp
@ -13801,6 +13801,30 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
// sanity check
|
||||
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;}
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
// hack for speed. computeSiteNumInlinks is true by default
|
||||
// but if the user turns it off then just use sitelinks.txt
|
||||
if ( ! cr->m_computeSiteNumInlinks ) {
|
||||
int32_t hostHash32 = getHostHash32a();
|
||||
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
||||
// try with www if not there
|
||||
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
|
||||
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
|
||||
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
||||
}
|
||||
// if still not in sitelinks.txt, just use 0
|
||||
if ( min < 0 ) {
|
||||
m_siteNumInlinksValid = true;
|
||||
m_siteNumInlinks = 0;
|
||||
return &m_siteNumInlinks;
|
||||
}
|
||||
m_siteNumInlinks = min;
|
||||
m_siteNumInlinksValid = true;
|
||||
return &m_siteNumInlinks;
|
||||
}
|
||||
|
||||
setStatus ( "getting site num inlinks");
|
||||
|
||||
// get it from the tag rec if we can
|
||||
@ -13838,9 +13862,6 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
if ( ! wfts ) return NULL;
|
||||
if ( wfts == -1 ) return (int32_t *)-1;
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
setStatus ( "getting site num inlinks");
|
||||
// check the tag first
|
||||
Tag *tag = gr->getTag ("sitenuminlinks");
|
||||
|
Loading…
Reference in New Issue
Block a user