Quite a few fixes to the quota system, plus cleanups.

Matt Wells 2014-01-18 16:23:13 -08:00
parent f3000e2763
commit 10f4443974
11 changed files with 115 additions and 74 deletions

View File

@@ -1403,6 +1403,10 @@ void CollectionRec::setToDefaults ( ) {
void CollectionRec::reset() {
// . grows dynamically
// . setting to 0 buckets should never have error
m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
// regex_t types
if ( m_hasucr ) regfree ( &m_ucr );
if ( m_hasupr ) regfree ( &m_upr );
@@ -1421,7 +1425,6 @@ void CollectionRec::reset() {
Rdb *rdb = g_process.m_rdbs[i];
rdb->resetBase ( m_collnum );
}
}
CollectionRec *g_cr = NULL;
@@ -1500,9 +1503,6 @@ bool CollectionRec::load ( char *coll , long i ) {
// PAGE COUNT TABLE for doing quotas in url filters
//
/////////////
// . grows dynamically
// . setting to 0 buckets should never have error
m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
// load it up if it's there on disk
snprintf ( tmp1 , 1023, "/coll.%s.%li/pagecounts.dat",
m_coll , (long)m_collnum );

View File

@@ -187,6 +187,8 @@ bool HashTableX::addKey ( void *key , void *val , long *slot ) {
g_errno = ETRYAGAIN;
return false;
}
// never got initialized? call HashTableX::init()
if ( m_ks <= 0 ){ char *xx=NULL; *xx=0; }
// check to see if we should grow the table. now we grow
// when 25% full to make operations faster so getLongestString()
// doesn't return such big numbers!
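The sanity check added in this hunk (m_ks <= 0 triggers an intentional null write) crashes on purpose when addKey() runs on a table whose key size was never set. A minimal sketch of the init-before-use contract it enforces, assuming the repo's HashTableX.h and reusing the set() arguments that appear elsewhere in this commit:
#include "HashTableX.h"   // repo header, assumed available here
// Sketch only: set() must run before addKey(), otherwise m_ks stays 0
// and the sanity check added above deliberately segfaults.
static bool initAndAddExample ( ) {
	HashTableX pageCountTable;
	// 4-byte keys (an IP), 4-byte values (a count), 0 starting buckets,
	// no external buffer, no duplicate keys, niceness, allocation label
	if ( ! pageCountTable.set (4,4,0,NULL,0,false,MAX_NICENESS,"pctbl") )
		return false;
	long ip    = 0x01020304;   // hypothetical key
	long count = 1;
	return pageCountTable.addKey ( &ip , &count , NULL );
}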

View File

@@ -137,7 +137,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
if ( ! cr ) continue;
p.safePrintf (
"<tr><td>"
"<input type=checkbox name=delete value=\"%s\"> "
"<input type=checkbox name=delColl value=\"%s\"> "
"%s</td></tr>\n",cr->m_coll,cr->m_coll);
}
p.safePrintf( "</table></center></td></tr></table><br>\n" );
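The renamed checkbox matches the new "delete collection" command registered in Parms.cpp further down, whose m_cgi is "delColl" and whose handler is CommandDeleteColl2, so the admin form field and the command parm now share one name.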

View File

@@ -204,6 +204,8 @@ bool sendReply ( void *state ) {
"<b>%s</b>\n\n" // the url msg
//"<FORM method=POST action=/inject>\n\n"
"<FORM method=GET action=/inject>\n\n"
//"<input type=hidden name=pwd value=\"%s\">\n"
//"<input type=hidden name=username value=\"%s\">\n"
"<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
@@ -279,13 +281,13 @@
"</td></tr>\n\n"
"<tr><td><b>delete?</b><br>"
"<tr><td><b>delete url?</b><br>"
"<font size=1>Should this url be deleted from the index? "
"Default: no"
"</td>"
"<td>\n"
"<input type=radio name=delete value=0 checked>no &nbsp; "
"<input type=radio name=delete value=1>yes "
"<input type=radio name=deleteurl value=0 checked>no &nbsp; "
"<input type=radio name=deleteurl value=1>yes "
"</td></tr>\n\n"
@@ -463,16 +465,29 @@ bool Msg7::inject ( TcpSocket *s ,
long contentLen;
// get the junk
char *coll = r->getString ( "c" , NULL , NULL /*default*/);
//char *coll = r->getString ( "c" , NULL , NULL /*default*/);
//if ( ! coll ) coll = "main";
// sometimes crawlbot will add or reset a coll and do an inject
// in PageCrawlBot.cpp
//if ( ! coll ) coll = r->getString("addcoll");
//if ( ! coll ) coll = r->getString("resetcoll");
if ( ! coll ) coll = collOveride;
//if ( ! coll ) coll = collOveride;
// default to main
if ( ! coll || ! coll[0] ) coll = "main";
//if ( ! coll || ! coll[0] ) coll = "main";
if ( collOveride && ! collOveride[0] ) collOveride = NULL;
CollectionRec *cr = NULL;
if ( collOveride ) cr = g_collectiondb.getRec ( collOveride );
else cr = g_collectiondb.getRec ( r );
if ( ! cr ) {
g_errno = ENOCOLLREC;
return true;
}
char *coll = cr->m_coll;
bool quickReply = r->getLong ( "quick" , 0 );
//char *pwd = r->getString ( "pwd" , NULL );
@@ -490,7 +505,7 @@ bool Msg7::inject ( TcpSocket *s ,
long hopCount = r->getLong("hopcount",-1);
long newOnly = r->getLong("newonly",0);
long charset = r->getLong("charset",-1);
long deleteIt = r->getLong("delete",0);
long deleteUrl = r->getLong("deleteurl",0);
char hasMime = r->getLong("hasmime",0);
// do consistency testing?
bool doConsistencyTesting = r->getLong("dct",0);
@@ -549,7 +564,7 @@ bool Msg7::inject ( TcpSocket *s ,
newOnly,
charset,
spiderLinks,
deleteIt,
deleteUrl,
hasMime,
doConsistencyTesting);
}
@@ -573,7 +588,7 @@ bool Msg7::inject ( char *url ,
char newOnly,
short charset,
char spiderLinks,
char deleteIt,
char deleteUrl,
char hasMime,
bool doConsistencyTesting
) {
@@ -674,7 +689,7 @@ bool Msg7::inject ( char *url ,
niceness, // 1 ,
// inject this content
content ,
deleteIt, // false, // deleteFromIndex ,
deleteUrl, // false, // deleteFromIndex ,
forcedIp ,
contentType ,
lastSpidered ,
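Taken together, the inject changes above mean: the CGI parameter "delete" is now "deleteurl" all the way down into the deleteFromIndex argument, the target collection is resolved through g_collectiondb.getRec() (honoring a crawlbot collection override when present), and a missing or unknown collection now fails the request with ENOCOLLREC instead of silently falling back to "main".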

View File

@@ -1129,7 +1129,7 @@ bool gotResults ( void *state ) {
if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true;
// print "in collection ***" if we had a collection
if ( collLen > 0 && ! isMain ) // && isAdmin )
if ( collLen > 0 && ! isMain && si->m_format == FORMAT_HTML )
sb.safePrintf (" in collection <b>%s</b>",coll);

View File

@@ -271,6 +271,24 @@ bool CommandDeleteColl ( char *rec , WaitEntry *we ) {
return true;
}
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) {
char *coll = (char *)rec;
collnum_t collnum = g_collectiondb.getCollnum ( coll );
if ( collnum < 0 ) {
g_errno = ENOCOLLREC;
return true;
}
// the delete might block because the tree is saving and we can't
// remove our collnum recs from it while it is doing that
if ( ! g_collectiondb.deleteRec2 ( collnum ) )
// we blocked, we->m_callback will be called when done
return false;
// delete is successful
return true;
}
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandRestartColl ( char *rec , WaitEntry *we ) {
@@ -1306,8 +1324,9 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
"millions of search results very quickly without "
"having to lookup each of their urls. You should "
"definitely have this if you use the reindexing "
"feature. You can temporarily disabled the "
"spidering enabled checkbox for non "
"feature. "
"You can set max spiders to 0 "
"for non "
"docidbased requests while you reindex or delete "
"the results of a query for extra speed."
"</td></tr>"
@@ -9244,6 +9263,15 @@ void Parms::init ( ) {
m->m_cast = 1;
m++;
m->m_title = "delete collection";
m->m_desc = "delete the specified collection";
m->m_cgi = "delColl";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_func2 = CommandDeleteColl2;
m->m_cast = 1;
m++;
m->m_title = "add collection";
m->m_desc = "add a new collection";
m->m_cgi = "addColl";
@@ -13178,7 +13206,7 @@ void Parms::init ( ) {
m->m_group = 0;
m++;
m->m_title = "number of related topics (gigabits)";
m->m_title = "number of related topics";
m->m_desc = "What is the number of "
"related topics (gigabits) "
"displayed per query? Set to 0 to save "
@@ -13955,6 +13983,7 @@ void Parms::init ( ) {
m->m_flags = PF_REBUILDURLFILTERS;
m++;
/*
m->m_title = "spidering enabled";
m->m_cgi = "cspe";
m->m_xml = "spidersEnabled";
@@ -13966,6 +13995,7 @@ void Parms::init ( ) {
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS;
m++;
*/
m->m_title = "respider frequency (days)";
m->m_cgi = "fsf";

View File

@@ -3010,7 +3010,7 @@ struct QueryField g_fields[] = {
{"ilink", FIELD_ILINK, true,"Similar to above."},
{"sitelink", FIELD_SITELINK, true,"Matches all pages that link to the given site. Example:sitelink:www.gigablast.com matches all pages that link to some page on the www.gigablast.com site."},
{"site", FIELD_SITE, true,"Matches all pages from the given site. Example: site:www.gigablast.com will return all the pages on the gigablast site"},
{"coll", FIELD_COLL, true,"Not sure if this works."},
//{"coll", FIELD_COLL, true,"Not sure if this works."},
{"ip", FIELD_IP, true,"Matches all pages with the given ip. Example:1.2.3.4 will match all pages whose urls have that IP address."},
{"inurl", FIELD_SUBURL, true,"Matches all pages that have the given terms in the url. Example inurl:water will match all pages whose url has the word water in it, but the word must be delineated by punctuation."},
{"suburl", FIELD_SUBURL, true,"Same as inurl."},
@@ -3042,8 +3042,8 @@ struct QueryField g_fields[] = {
{"gbhasext", FIELD_GBOTHER, false,""},
{"gbsubmiturl", FIELD_GBOTHER, false,""},
{"qdom", FIELD_QUOTA, false,""},
{"qhost", FIELD_QUOTA, false,""},
//{"qdom", FIELD_QUOTA, false,""},
//{"qhost", FIELD_QUOTA, false,""},
{"gbtagvector", FIELD_GBTAGVECTOR, false,""},
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
@@ -3065,7 +3065,7 @@ struct QueryField g_fields[] = {
{"gbduphash" ,FIELD_GBOTHER,false,"Internal use only."},
{"gbsitetemplate" ,FIELD_GBOTHER,false,"Internal use only."},
{"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."},
{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
//{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
{"gbdeduped" ,FIELD_GBOTHER,false,""},
{"gbinjected", FIELD_GBOTHER,false,"Was the event injected?."},
@@ -3074,7 +3074,7 @@ struct QueryField g_fields[] = {
//{"gbendrange",FIELD_GBENDRANGE,false,""},
{"gbpermalink",FIELD_GBPERMALINK,false,""},
{"gbcsenum",FIELD_GBCSENUM,false,""},
//{"gbcsenum",FIELD_GBCSENUM,false,""},
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that were extract from this parent url. Example: gbparenturl:www.gigablast.com/addurl.htm"},
{"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"}

View File

@@ -224,8 +224,10 @@ class SearchInput {
//long m_formatStrLen;
//char *m_formatStr;
char m_formatTmp[11];
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
char m_format;
long m_format;
// this should be part of the key because it will affect the results!
char m_queryExpansion;
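Widening m_format from char to long goes with the new si->m_format == FORMAT_HTML check in gotResults() above; per the comment, 0 is FORMAT_HTML, 1 FORMAT_XML, 2 FORMAT_JSON, and 3 csv.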

View File

@@ -2861,6 +2861,42 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
if ( g_conf.m_logDebugSpider )
log("spider: back from msg5 spiderdb read2");
// ensure collection rec still there
CollectionRec *cr = g_collectiondb.getRec ( THIS->m_collnum );
if ( ! cr ) return;
// if we do not have a pg count entry for this then enter count mode
// where we just scan all the spider records for m_scanningIp
// and count how many pages are in the index for each subdomain/site
// and when it is over we re-do the scan from the top.
THIS->m_countingPagesIndexed = false;
// don't bother with this stuff though if url filters do not specify
// "pagesinip" or "pagesinsubdomain"
if ( cr->m_urlFiltersHavePageCounts &&
// and only do this if we do not have an entry for this ip yet
! cr->m_pageCountTable.isInTable ( &THIS->m_scanningIp ) ) {
// it is on
THIS->m_countingPagesIndexed = true;
// reset this
THIS->m_lastReqUh48 = 0LL;
THIS->m_lastRepUh48 = 0LL;
// and setup the LOCAL counting table if not initialized
if ( THIS->m_localTable.m_ks == 0 )
THIS->m_localTable.set (4,4,0,NULL,0,false,0,"ltpct" );
// do not recompute this in case all records for this ip
// are missing or have issues, like maybe there was only
// a spiderreply
if ( ! cr->m_pageCountTable.addScore( &THIS->m_scanningIp,1)){
log("spider: error adding to pg cnt tbl: %s",
mstrerror(g_errno));
return;
}
}
// . finish processing the list we read now
// . if that blocks, it will call doledWrapper
if ( ! THIS->scanSpiderdb ( false ) ) return;
@@ -2989,26 +3025,6 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
return true;
}
// if we do not have a pg count entry for this then enter count mode
// where we just scan all the spider records for m_scanningIp
// and count how many pages are in the index for each subdomain/site
// and when it is over we re-do the scan from the top.
m_countingPagesIndexed = false;
// don't bother with this stuff though if url filters do not specify
// "pagesinip" or "pagesinsubdomain"
if ( cr->m_urlFiltersHavePageCounts &&
// and only do this if we do not have an entry for this ip yet
! cr->m_pageCountTable.isInTable ( &m_scanningIp ) ) {
// it is on
m_countingPagesIndexed = true;
// reset this
m_lastReqUh48 = 0LL;
m_lastRepUh48 = 0LL;
// and setup the LOCAL counting table if not initialized
if ( m_localTable.m_ks == 0 )
m_localTable.set ( 4 ,4,0,NULL,0,false,0,"ltpct" );
}
// i guess we are always restricted to an ip, because
// populateWaitingTreeFromSpiderdb calls its own msg5.
long firstIp0 = g_spiderdb.getFirstIp(&m_nextKey);
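With this move, the per-IP quota bookkeeping is decided in the msg5 callback before scanSpiderdb() processes the list, and the scanning IP is registered in cr->m_pageCountTable up front so the counting pass is not restarted for IPs whose spiderdb records are incomplete. Below is an illustrative sketch, not part of this commit, of what a counting pass could do once m_countingPagesIndexed is on; hash32n() is an existing hash helper in the repo, but keying the local table by subdomain hash this way is an assumption.
#include "HashTableX.h"   // repo headers, assumed available
#include "hash.h"
// Sketch only (hypothetical helper): bump a per-site count in a local
// count table for each indexed page seen during the spiderdb scan.
static void countPageForQuota ( HashTableX *localTable , char *subdomain ) {
	long siteHash32 = (long)hash32n ( subdomain );   // assumed repo helper
	if ( ! localTable->addScore ( &siteHash32 , 1 ) )
		log("spider: error counting page for quota: %s",
		    mstrerror(g_errno));
}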

View File

@@ -23181,7 +23181,10 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
char *host = fu->getHost ();
//long hlen = fu->getHostLen ();
/*
setStatus ( "hashing no-split qdom keys" );
char *dom = fu->getDomain ();
@@ -23192,15 +23195,13 @@
if ( ! hashString ( dom,dlen,&hi ) ) return false;
setStatus ( "hashing no-split qhost keys" );
char *host = fu->getHost ();
long hlen = fu->getHostLen ();
// desc is NULL, prefix will be used as desc
hi.m_prefix = "qhost";
if ( ! hashString ( host,hlen,&hi ) ) return false;
*/
// now hash the site

View File

@@ -144,10 +144,6 @@
# search results?
<maxTitleLen>80</>
# Can Gigablast make titles from the document content? Used mostly for the
# news collection where the title tags are not very reliable.
<considerTitlesFromBody>0</>
# Should search results be site clustered by default?
<siteClusterByDefault>1</>
@@ -172,10 +168,6 @@
# enabled.
<numberOfLinesToUseInSummaryToDedup>4</>
# Use Language weights to sort query results. This will give results that
# match the specified &qlang higher ranking.
<useLanguageWeights>1</>
# Default language to use for ranking results. Value should be any language
# abbreviation, for example "en" for English.
<sortLanguagePreference><![CDATA[en]]></>
@@ -315,23 +307,6 @@
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>1.000000</>