Mostly log cleanups: demote routine verification and load messages from LOG_INFO to LOG_DEBUG.

Took out the disk page cache for Posdb and Titledb because it is somewhat buggy; it needs to be fixed at some point.
2024-10-04 12:17:35 +03:00 · 2013-12-18 10:57:18 -08:00 · 2013-12-18 10:57:18 -08:00 · 1b5057ad42
commit 1b5057ad42
parent 2ffad5d835
11 changed files with 37 additions and 32 deletions
--- a/Cachedb.cpp
+++ b/Cachedb.cpp
@ -88,7 +88,7 @@ bool Cachedb::addColl ( char *coll, bool doVerify ) {

 bool Cachedb::verify ( char *coll ) {
 	// coll is NULL here methinks
-	log ( LOG_INFO, "db: Verifying %s...",m_name );
+	log ( LOG_DEBUG, "db: Verifying %s...",m_name );
 	g_threads.disableThreads();

 	Msg5 msg5;
@ -167,7 +167,7 @@ bool Cachedb::verify ( char *coll ) {
 		g_threads.enableThreads();
 		return g_conf.m_bypassValidation;
 	}
-	log ( LOG_INFO, "db: %s passed verification successfully for "
+	log ( LOG_DEBUG, "db: %s passed verification successfully for "
 	      "%li recs.", m_name,count );
 	// DONE
 	g_threads.enableThreads();
--- a/Clusterdb.cpp
+++ b/Clusterdb.cpp
@ -350,7 +350,7 @@ bool Clusterdb::addColl ( char *coll, bool doVerify ) {
 }

 bool Clusterdb::verify ( char *coll ) {
-	log ( LOG_INFO, "db: Verifying Clusterdb for coll %s...", coll );
+	log ( LOG_DEBUG, "db: Verifying Clusterdb for coll %s...", coll );
 	g_threads.disableThreads();

 	Msg5 msg5;
@ -411,7 +411,7 @@ bool Clusterdb::verify ( char *coll ) {
 		g_threads.enableThreads();
 		return g_conf.m_bypassValidation;
 	}
-	log ( LOG_INFO, "db: Clusterdb passed verification successfully for "
+	log ( LOG_DEBUG, "db: Clusterdb passed verification successfully for "
 			"%li recs.", count );
 	// DONE
 	g_threads.enableThreads();
--- a/Collectiondb.cpp
+++ b/Collectiondb.cpp
@ -527,8 +527,8 @@ bool Collectiondb::registerCollRec ( CollectionRec *cr ,
 	if ( ! g_doledb.addColl     ( coll, verify ) ) goto hadError;

 	// debug message
-	log ( LOG_INFO, "db: verified collection \"%s\" (%li).",
-	      coll,(long)cr->m_collnum);
+	//log ( LOG_INFO, "db: verified collection \"%s\" (%li).",
+	//      coll,(long)cr->m_collnum);

 	// tell SpiderCache about this collection, it will create a 
 	// SpiderCollection class for it.
@ -1383,6 +1383,8 @@ bool CollectionRec::load ( char *coll , long i ) {
 	m_collLen = gbstrlen ( coll );
 	strcpy ( m_coll , coll );

+	log(LOG_INFO,"db: loading data for %s",coll);
+
 	// collection name HACK for backwards compatibility
 	//if ( strcmp ( coll , "main" ) == 0 ) {
 	//	m_coll[0] = '\0';
@ -1432,7 +1434,7 @@ bool CollectionRec::load ( char *coll , long i ) {
 	// LOAD LOCAL
 	sprintf ( tmp1 , "%scoll.%s.%li/localcrawlinfo.dat",
 		  g_hostdb.m_dir , m_coll , (long)m_collnum );
-	log(LOG_INFO,"db: loading %s",tmp1);
+	log(LOG_DEBUG,"db: loading %s",tmp1);
 	m_localCrawlInfo.reset();
 	SafeBuf sb;
 	// fillfromfile returns 0 if does not exist, -1 on read error
@ -1443,7 +1445,7 @@ bool CollectionRec::load ( char *coll , long i ) {
 	// LOAD GLOBAL
 	sprintf ( tmp1 , "%scoll.%s.%li/globalcrawlinfo.dat",
 		  g_hostdb.m_dir , m_coll , (long)m_collnum );
-	log(LOG_INFO,"db: loading %s",tmp1);
+	log(LOG_DEBUG,"db: loading %s",tmp1);
 	m_globalCrawlInfo.reset();
 	sb.reset();
 	if ( sb.fillFromFile ( tmp1 ) > 0 )
--- a/File.cpp
+++ b/File.cpp
@ -698,8 +698,10 @@ bool File::unlink ( ) {
 	if ( status == 0 ) return true;
 	// return false and set g_errno on error
 	if ( status  < 0 ) return false;
-	// log it so we can see what happened to timedb!
-	log(LOG_INFO,"disk: unlinking %s", m_filename );
+	// . log it so we can see what happened to timedb!
+	// . don't log startup unlinks of "tmpfile"
+	if ( ! strstr(m_filename,"tmpfile") )
+		log(LOG_INFO,"disk: unlinking %s", m_filename );
 	// remove ourselves from the disk
 	if ( ::unlink ( m_filename ) == 0 ) return true;
 	// sync it to disk in case power goes out
--- a/Linkdb.cpp
+++ b/Linkdb.cpp
@ -184,7 +184,7 @@ bool Linkdb::addColl ( char *coll, bool doVerify ) {
 }

 bool Linkdb::verify ( char *coll ) {
-	log ( LOG_INFO, "db: Verifying Linkdb for coll %s...", coll );
+	log ( LOG_DEBUG, "db: Verifying Linkdb for coll %s...", coll );
 	g_threads.disableThreads();

 	Msg5 msg5;
@ -265,7 +265,7 @@ bool Linkdb::verify ( char *coll ) {
 		g_threads.enableThreads();
 		return g_conf.m_bypassValidation;
 	}
-	log ( LOG_INFO, "db: Linkdb passed verification successfully for "
+	log ( LOG_DEBUG, "db: Linkdb passed verification successfully for "
 	      "%li recs.", count );
 	// DONE
 	g_threads.enableThreads();
--- a/Posdb.cpp
+++ b/Posdb.cpp
@ -122,7 +122,7 @@ bool Posdb::init ( ) {
 	long nodeSize      = (sizeof(key144_t)+12+4) + sizeof(collnum_t);
 	long maxTreeNodes = maxTreeMem  / nodeSize ;

-	long pageSize = GB_INDEXDB_PAGE_SIZE;
+	//long pageSize = GB_INDEXDB_PAGE_SIZE;
 	// we now use a disk page cache as opposed to the
 	// old rec cache. i am trying to do away with the Rdb::m_cache rec
 	// cache in favor of cleverly used disk page caches, because
@ -141,6 +141,7 @@ bool Posdb::init ( ) {
 	//pcmem = 0;
 	// . init the page cache
 	// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
+	/*
 	if ( ! m_pc.init ( "posdb",
 			   RDB_POSDB,
 			   pcmem    ,
@ -148,6 +149,7 @@ bool Posdb::init ( ) {
 			   true     ,  // use RAM disk?
 			   false    )) // minimize disk seeks?
 		return log("db: Posdb init failed.");
+	*/

 	// . set our own internal rdb
 	// . max disk space for bin tree is same as maxTreeMem so that we
@ -169,7 +171,10 @@ bool Posdb::init ( ) {
 			   0 , // maxCacheNodes 	       ,
 			   true                        , // use half keys?
 			   false                       , // g_conf.m_posdbSav
-			   &m_pc                       ,
+			   // newer systems have tons of ram to use
+			   // for their disk page cache. it is slower than
+			   // ours but the new engine has much slower things
+			   NULL,//&m_pc                       ,
 			   false , // istitledb?
 			   false , // preloaddiskpagecache?
 			   sizeof(key144_t)
@ -235,7 +240,7 @@ bool Posdb::addColl ( char *coll, bool doVerify ) {

 bool Posdb::verify ( char *coll ) {
 	return true;
-	log ( LOG_INFO, "db: Verifying Posdb for coll %s...", coll );
+	log ( LOG_DEBUG, "db: Verifying Posdb for coll %s...", coll );
 	g_threads.disableThreads();

 	Msg5 msg5;
@ -314,7 +319,7 @@ bool Posdb::verify ( char *coll ) {
 		g_threads.enableThreads();
 		return g_conf.m_bypassValidation;
 	}
-	log ( LOG_INFO, "db: Posdb passed verification successfully for %li "
+	log ( LOG_DEBUG, "db: Posdb passed verification successfully for %li "
 			"recs.", count );
 	// DONE
 	g_threads.enableThreads();
--- a/RdbBase.cpp
+++ b/RdbBase.cpp
@ -771,7 +771,7 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
 		g_statsdb.m_disabled = false;
 		if ( ! status ) return log("db: Save failed.");
 	}
-	if ( ! isNew ) logf(LOG_INFO,"db: Added %s for collnum=%li pages=%li",
+	if ( ! isNew ) log(LOG_DEBUG,"db: Added %s for collnum=%li pages=%li",
 			    name ,(long)m_collnum,m->getNumPages());
 	// open this big data file for reading only
 	if ( ! isNew ) {
--- a/Spider.cpp
+++ b/Spider.cpp
@ -625,7 +625,7 @@ bool Spiderdb::addColl ( char *coll, bool doVerify ) {

 bool Spiderdb::verify ( char *coll ) {
 	//return true;
-	log ( LOG_INFO, "db: Verifying Spiderdb for coll %s...", coll );
+	log ( LOG_DEBUG, "db: Verifying Spiderdb for coll %s...", coll );
 	g_threads.disableThreads();

 	Msg5 msg5;
@ -688,7 +688,7 @@ bool Spiderdb::verify ( char *coll ) {
 		g_threads.enableThreads();
 		return g_conf.m_bypassValidation;
 	}
-	log ( LOG_INFO,"db: Spiderdb passed verification successfully for %li "
+	log (LOG_DEBUG,"db: Spiderdb passed verification successfully for %li "
 	      "recs.", count );
 	// DONE
 	g_threads.enableThreads();
--- a/Tagdb.cpp
+++ b/Tagdb.cpp
@ -1873,7 +1873,7 @@ bool Tagdb::verify ( char *coll ) {
 	char *rdbName = NULL;
 	rdbName = "Tagdb";
 	
-	log ( LOG_INFO, "db: Verifying %s for coll %s...", rdbName, coll );
+	log ( LOG_DEBUG, "db: Verifying %s for coll %s...", rdbName, coll );
 	
 	g_threads.disableThreads();

@ -1945,7 +1945,7 @@ bool Tagdb::verify ( char *coll ) {
 		g_threads.enableThreads();
 		return g_conf.m_bypassValidation;
 	}
-	log ( LOG_INFO, "db: %s passed verification successfully for %li "
+	log ( LOG_DEBUG, "db: %s passed verification successfully for %li "
 	      "recs.",rdbName, count );

 	// turn threads back on
--- a/Titledb.cpp
+++ b/Titledb.cpp
@ -55,13 +55,16 @@ bool Titledb::init ( ) {
 	// do not use any page cache if doing tmp cluster in order to
 	// prevent swapping
 	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
-	long pageSize = GB_INDEXDB_PAGE_SIZE;
+	//long pageSize = GB_INDEXDB_PAGE_SIZE;
 	// init the page cache
+	// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
+	/*
 	if ( ! m_pc.init ( "titledb",
 			   RDB_TITLEDB,
 			   pcmem    ,
 			   pageSize ) )
 		return log("db: Titledb init failed.");
+	*/

 	// each entry in the cache is usually just a single record, no lists
 	//long maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);
@ -87,7 +90,7 @@ bool Titledb::init ( ) {
 			    0,//maxCacheNodes               ,
 			    false                       ,// half keys?
 			    false                       ,// g_conf.m_titledbSav
-			    &m_pc                       , // page cache ptr
+			    NULL,//&m_pc               , // page cache ptr
 			    true                        ) )// is titledb?
 		return false;
 	return true;
@ -136,7 +139,7 @@ bool Titledb::addColl ( char *coll, bool doVerify ) {
 }

 bool Titledb::verify ( char *coll ) {
-	log ( LOG_INFO, "db: Verifying Titledb for coll %s...", coll );
+	log ( LOG_DEBUG, "db: Verifying Titledb for coll %s...", coll );
 	g_threads.disableThreads();

 	Msg5 msg5;
@ -209,7 +212,7 @@ bool Titledb::verify ( char *coll ) {
 		return g_conf.m_bypassValidation;
 	}

-	log ( LOG_INFO, "db: Titledb passed verification successfully for %li"
+	log ( LOG_DEBUG, "db: Titledb passed verification successfully for %li"
 			" recs.", count );
 	// DONE
 	g_threads.enableThreads();
--- a/coll.main.0/coll.conf
+++ b/coll.main.0/coll.conf
@ -968,8 +968,6 @@
 <harvestLinks>1</>
 <harvestLinks>1</>
 <harvestLinks>1</>
-<harvestLinks>1</>
-<spidersEnabled>1</>
 <spidersEnabled>1</>
 <spidersEnabled>1</>
 <spidersEnabled>1</>
@ -1004,7 +1002,6 @@
 <filterFrequency>60.000000</>
 <filterFrequency>30.000000</>
 <filterFrequency>30.000000</>
-<filterFrequency>0.000000</>

 # Do not allow more than this many outstanding spiders for all urls in this
 # priority.
@ -1025,7 +1022,6 @@
 <maxSpidersPerRule>1</>
 <maxSpidersPerRule>99</>
 <maxSpidersPerRule>99</>
-<maxSpidersPerRule>0</>

 # Allow this many spiders per IP.
 <maxSpidersPerIp>1</>
@ -1045,7 +1041,6 @@
 <maxSpidersPerIp>1</>
 <maxSpidersPerIp>1</>
 <maxSpidersPerIp>1</>
-<maxSpidersPerIp>0</>

 # Wait at least this long before downloading urls from the same IP address.
 <spiderIpWait>1000</>
@ -1065,7 +1060,6 @@
 <spiderIpWait>1000</>
 <spiderIpWait>1000</>
 <spiderIpWait>1000</>
-<spiderIpWait>0</>
 <filterPriority>80</>
 <filterPriority>-3</>
 <filterPriority>3</>
@ -1083,6 +1077,5 @@
 <filterPriority>19</>
 <filterPriority>1</>
 <filterPriority>0</>
-<filterPriority>0</>

 # Use <diffbotAPI> tag.