do not scan spiderdb for entries in waiting tree

when spidering is turned off, because the scan slows
injections down.
mwells 2015-01-10 09:19:14 -07:00
parent d541de8186
commit f9ccc342a7
3 changed files with 60 additions and 46 deletions


@@ -3029,6 +3029,12 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
if ( m_isPopulating ) return;
// skip if in repair mode
if ( g_repairMode ) return;
// let's skip if spiders off so we can inject/populate the index quickly
// since addSpiderRequest() calls addToWaitingTree() which then calls
// this.
if ( ! g_conf.m_spideringEnabled ) return;
// try skipping!!!!!!!!!!!
// yeah, this makes us scream. in addition to calling
// Doledb::m_rdb::addRecord() below
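
For context, the call chain the new guard cuts: an injection calls addSpiderRequest(), which calls addToWaitingTree(), which calls this function, which scans spiderdb. A minimal self-contained sketch of the resulting fast path, with hypothetical stand-ins for the real globals and members in Spider.cpp:

    #include <cstdio>

    struct Conf { bool m_spideringEnabled; };
    Conf g_conf       = { false };  // spidering turned off in the controls
    bool g_repairMode = false;

    struct SpiderColl {
        bool m_isPopulating = false;
        void populateDoledbFromWaitingTree ( ) {
            if ( m_isPopulating ) return;  // a scan is already in progress
            if ( g_repairMode   ) return;  // repair mode owns the rdbs
            // the new guard: when spidering is off, skip the spiderdb
            // scan entirely so injections stay fast
            if ( ! g_conf.m_spideringEnabled ) return;
            printf("scanning spiderdb, adding winners to doledb...\n");
        }
    };

    int main ( ) {
        SpiderColl sc;
        sc.populateDoledbFromWaitingTree(); // returns immediately: no scan
        return 0;
    }

The inject still updates the waiting tree itself, so the entries should still be there for a later scan once spidering is re-enabled.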


@@ -1873,10 +1873,10 @@ void XmlDoc::setStatus ( char *s ) {
if ( s == s_last ) return;
bool timeIt = false;
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_isPageInject )
timeIt = true;
// if ( m_sreqValid &&
// m_sreq.m_isInjecting &&
// m_sreq.m_isPageInject )
// timeIt = true;
if ( g_conf.m_logDebugBuildTime )
timeIt = true;
@@ -1885,7 +1885,7 @@ void XmlDoc::setStatus ( char *s ) {
int64_t now = gettimeofdayInMillisecondsLocal();
if ( s_lastTimeStart == 0LL ) s_lastTimeStart = now;
int32_t took = now - s_lastTimeStart;
if ( took > 100 )
//if ( took > 100 )
log("xmldoc: %s (xd=0x%"PTRFMT" "
"u=%s) took %"INT32"ms",
s_last,
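
The net effect of these two hunks: per-status timing no longer fires for every injected doc, and the 100ms threshold is commented out, so when the build-time debug flag is on every status transition gets logged. A rough self-contained sketch of the timing pattern, with hypothetical stand-ins for the conf flag and the millisecond clock:

    #include <cstdio>
    #include <cstdint>
    #include <sys/time.h>

    bool g_logDebugBuildTime = true; // stands in for g_conf.m_logDebugBuildTime

    int64_t nowMs ( ) {              // stands in for gettimeofdayInMillisecondsLocal()
        timeval tv; gettimeofday ( &tv , NULL );
        return (int64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000;
    }

    // log how long the doc spent in the previous status whenever the
    // status string changes, but only when the debug flag is on
    void setStatus ( const char *s ) {
        static const char *s_last = NULL;
        static int64_t s_lastTimeStart = 0LL;
        if ( s == s_last ) return;   // same status pointer, nothing to do
        if ( g_logDebugBuildTime ) {
            int64_t now = nowMs();
            if ( s_lastTimeStart == 0LL ) s_lastTimeStart = now;
            if ( s_last )
                printf ( "xmldoc: %s took %lldms\n" ,
                         s_last , (long long)(now - s_lastTimeStart) );
            s_lastTimeStart = now;
        }
        s_last = s;
    }

With the flag off, a status change costs only a pointer compare and an assignment, which is what the injection path wants.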
@@ -13724,6 +13724,12 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
if ( m_linkInfo1Valid && ptr_linkInfo1 )
return ptr_linkInfo1;
// at least get our firstip so if cr->m_getLinkInfo is false
// then getRevisedSpiderReq() will not core because it is invalid
int32_t *ip = getFirstIp();
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo *)ip;
// just return nothing if not doing link voting
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
@@ -13755,8 +13761,6 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo *)sni;
//int32_t *fip = getFirstIp();
//if ( ! fip || fip == (int32_t *)-1 ) return (LinkInfo *)fip;
int32_t *ip = getFirstIp();
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo *)ip;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo *)d;
// sanity check. error?
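
Two things to note in these getLinkInfo1() hunks. First, getFirstIp() now runs before the link-voting early-out, per the comment above: getRevisedSpiderReq() needs a valid first IP even when cr->m_getLinkInfo is false, and the later duplicate call is removed. Second, the sentinel convention these getters share: NULL means error, (T *)-1 means the call blocked and the state machine will re-enter, anything else is a valid result. A tiny self-contained illustration of that convention, using a hypothetical getFirstIpSketch():

    #include <cstdint>
    #include <cstdio>
    #include <cstddef>

    int32_t s_firstIp = 0; // hypothetical cached answer

    // NULL = error, (int32_t *)-1 = async lookup still in flight,
    // anything else = pointer to the resolved ip
    int32_t *getFirstIpSketch ( bool error , bool blocked ) {
        if ( error   ) return NULL;
        if ( blocked ) return (int32_t *)-1;
        s_firstIp = 0x7f000001;
        return &s_firstIp;
    }

    int main ( ) {
        int32_t *ip = getFirstIpSketch ( false , false );
        // callers propagate both sentinels unchanged, exactly as the
        // diff does with "return (LinkInfo *)ip"
        if ( ! ip || ip == (int32_t *)-1 ) return 1;
        printf ( "firstip=0x%x\n" , (unsigned)*ip );
        return 0;
    }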
@@ -30582,47 +30586,50 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// breathe
QUICKPOLL( m_niceness );
//if ( cr->m_doLinkSpamCheck ) {
// reset to NULL to avoid gbstrlen segfault
char *note = NULL;
// need this
if ( ! m_xmlValid ) { char *xx=NULL;*xx=0; }
// time it
//int64_t start = gettimeofdayInMilliseconds();
if ( ! m_req->m_doLinkSpamCheck )
reply->m_isLinkSpam = false;
Url linkeeUrl;
linkeeUrl.set ( m_req->ptr_linkee );
if ( m_req->m_doLinkSpamCheck ) {
// reset to NULL to avoid gbstrlen segfault
char *note = NULL;
// need this
if ( ! m_xmlValid ) { char *xx=NULL;*xx=0; }
// time it
//int64_t start = gettimeofdayInMilliseconds();
// get it. does not block.
reply->m_isLinkSpam = ::isLinkSpam ( linker ,
m_ip ,
ptr_indCatIds ,
size_indCatIds / 4 ,
m_siteNumInlinks,
&m_xml,
links,
MAXDOCLEN,//150000,//maxDocLen ,
&note ,
&linkeeUrl , // url ,
linkNode ,
cr->m_coll ,
m_niceness );
// store it
if ( note ) {
// include the \0
reply->ptr_note = note;
reply->size_note = gbstrlen(note)+1;
Url linkeeUrl;
linkeeUrl.set ( m_req->ptr_linkee );
// get it. does not block.
reply->m_isLinkSpam = ::isLinkSpam ( linker ,
m_ip ,
ptr_indCatIds ,
size_indCatIds / 4 ,
m_siteNumInlinks,
&m_xml,
links,
MAXDOCLEN,//150000,
&note ,
&linkeeUrl , // url ,
linkNode ,
cr->m_coll ,
m_niceness );
// store it
if ( note ) {
// include the \0
reply->ptr_note = note;
reply->size_note = gbstrlen(note)+1;
}
// log the reason why it is a link spam page
if ( reply->m_isLinkSpam )
log(LOG_DEBUG,"build: linker %s: %s.",
linker->getUrl(),note);
// sanity
if ( reply->m_isLinkSpam && ! note )
log("linkspam: missing note for d=%"INT64"!",m_docId);
// store times... nah, might have yielded cpu!
reply->m_timeLinkSpam = 0;
}
// log the reason why it is a link spam page
if ( reply->m_isLinkSpam )
log(LOG_DEBUG,"build: linker %s: %s.",
linker->getUrl(),note);
// sanity
if ( reply->m_isLinkSpam && ! note )
log("linkspam: missing note for d=%"INT64"!",m_docId);
// store times... nah, might have yielded cpu!
reply->m_timeLinkSpam = 0;
//}
// breathe
QUICKPOLL(m_niceness);
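
The substance of this restructure: the old code set reply->m_isLinkSpam to false when the check was disabled but then called ::isLinkSpam() unconditionally anyway, overwriting the flag and paying the full cost of the analysis; the new code moves the call, the note bookkeeping, and the logging inside the m_doLinkSpamCheck guard. The shape of the fix as a self-contained sketch, with hypothetical stand-ins for the real request, reply, and checker:

    #include <cstdio>
    #include <cstdint>
    #include <cstring>

    struct Req   { bool m_doLinkSpamCheck; };
    struct Reply { bool m_isLinkSpam; char *ptr_note; int32_t size_note; };

    // stands in for ::isLinkSpam(); writes a reason into *note
    bool isLinkSpamSketch ( char **note ) {
        *note = (char *)"too many outlinks";
        return true;
    }

    void fillReply ( Req *req , Reply *reply ) {
        if ( ! req->m_doLinkSpamCheck ) {
            reply->m_isLinkSpam = false; // cheap path: no check runs at all
            return;
        }
        char *note = NULL;               // reset to avoid strlen(NULL)
        reply->m_isLinkSpam = isLinkSpamSketch ( &note );
        if ( note ) {
            reply->ptr_note  = note;
            reply->size_note = (int32_t)strlen(note) + 1; // include the \0
        }
        if ( reply->m_isLinkSpam && note )
            printf ( "build: linker: %s.\n" , note );
    }

    int main ( ) {
        Req   req   = { true };
        Reply reply = { false , NULL , 0 };
        fillReply ( &req , &reply );
        return 0;
    }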


@@ -1251,9 +1251,10 @@ override that switch.
<br>
<b>Spider Optimizations:</b>
<b>Spidering and Indexing Optimizations:</b>
<ul>
<!--<li> Set <b>restrict indexdb for spidering</b> on the -->
<li> Disable <b>link voting</b> or <b>link spam checking</b> in the spider controls if you do not need them. This is also useful when doing millions of injections: you can run an indexdb rebuild with the rebuild tool afterward to pick up the link text.
<li> Disable dup checking. Gigablast will not allow any duplicate pages
from the same domain into the index when this is enabled. This means that
Gigablast must do about one disk seek for every URL indexed to verify it is