Quite a few fixes to the quota system, plus cleanups.

Matt Wells 2014-01-18 16:23:13 -08:00
parent f3000e2763
commit 10f4443974
11 changed files with 115 additions and 74 deletions

View File

@@ -1403,6 +1403,10 @@ void CollectionRec::setToDefaults ( ) {
void CollectionRec::reset() {
// . grows dynamically
// . setting to 0 buckets should never have error
m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
// regex_t types
if ( m_hasucr ) regfree ( &m_ucr );
if ( m_hasupr ) regfree ( &m_upr );
@@ -1421,7 +1425,6 @@ void CollectionRec::reset() {
Rdb *rdb = g_process.m_rdbs[i];
rdb->resetBase ( m_collnum );
}
}
CollectionRec *g_cr = NULL;
@@ -1500,9 +1503,6 @@ bool CollectionRec::load ( char *coll , long i ) {
// PAGE COUNT TABLE for doing quotas in url filters
//
/////////////
// . grows dynamically
// . setting to 0 buckets should never have error
m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
// load it up if it's there on disk
snprintf ( tmp1 , 1023, "/coll.%s.%li/pagecounts.dat",
m_coll , (long)m_collnum );

View File

@@ -187,6 +187,8 @@ bool HashTableX::addKey ( void *key , void *val , long *slot ) {
g_errno = ETRYAGAIN;
return false;
}
// never got initialized? call HashTableX::init()
if ( m_ks <= 0 ){ char *xx=NULL; *xx=0; }
// check to see if we should grow the table. now we grow
// when 25% full to make operations faster so getLongestString()
// doesn't return such big numbers!
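The sanity check added in this hunk (m_ks <= 0 triggers an intentional null write) crashes on purpose when addKey() runs on a table whose key size was never set. A minimal sketch of the init-before-use contract it enforces, assuming the repo's HashTableX.h and reusing the set() arguments that appear elsewhere in this commit:
#include "HashTableX.h"   // repo header, assumed available here
// Sketch only: set() must run before addKey(), otherwise m_ks stays 0
// and the sanity check added above deliberately segfaults.
static bool initAndAddExample ( ) {
	HashTableX pageCountTable;
	// 4-byte keys (an IP), 4-byte values (a count), 0 starting buckets,
	// no external buffer, no duplicate keys, niceness, allocation label
	if ( ! pageCountTable.set (4,4,0,NULL,0,false,MAX_NICENESS,"pctbl") )
		return false;
	long ip    = 0x01020304;   // hypothetical key
	long count = 1;
	return pageCountTable.addKey ( &ip , &count , NULL );
}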

View File

@@ -137,7 +137,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
if ( ! cr ) continue;
p.safePrintf (
"<tr><td>"
"<input type=checkbox name=delete value=\"%s\"> "
"<input type=checkbox name=delColl value=\"%s\"> "
"%s</td></tr>\n",cr->m_coll,cr->m_coll);
}
p.safePrintf( "</table></center></td></tr></table><br>\n" );
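The renamed checkbox matches the new "delete collection" command registered in Parms.cpp further down, whose m_cgi is "delColl" and whose handler is CommandDeleteColl2, so the admin form field and the command parm now share one name.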

View File

@@ -204,6 +204,8 @@ bool sendReply ( void *state ) {
"<b>%s</b>\n\n" // the url msg
//"<FORM method=POST action=/inject>\n\n"
"<FORM method=GET action=/inject>\n\n"
//"<input type=hidden name=pwd value=\"%s\">\n"
//"<input type=hidden name=username value=\"%s\">\n"
"<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
@@ -279,13 +281,13 @@
"</td></tr>\n\n"
"<tr><td><b>delete?</b><br>"
"<tr><td><b>delete url?</b><br>"
"<font size=1>Should this url be deleted from the index? "
"Default: no"
"</td>"
"<td>\n"
"<input type=radio name=delete value=0 checked>no &nbsp; "
"<input type=radio name=delete value=1>yes "
"<input type=radio name=deleteurl value=0 checked>no &nbsp; "
"<input type=radio name=deleteurl value=1>yes "
"</td></tr>\n\n"
@@ -463,16 +465,29 @@ bool Msg7::inject ( TcpSocket *s ,
long contentLen;
// get the junk
char *coll = r->getString ( "c" , NULL , NULL /*default*/);
//char *coll = r->getString ( "c" , NULL , NULL /*default*/);
//if ( ! coll ) coll = "main";
// sometimes crawlbot will add or reset a coll and do an inject
// in PageCrawlBot.cpp
//if ( ! coll ) coll = r->getString("addcoll");
//if ( ! coll ) coll = r->getString("resetcoll");
if ( ! coll ) coll = collOveride;
//if ( ! coll ) coll = collOveride;
// default to main
if ( ! coll || ! coll[0] ) coll = "main";
//if ( ! coll || ! coll[0] ) coll = "main";
if ( collOveride && ! collOveride[0] ) collOveride = NULL;
CollectionRec *cr = NULL;
if ( collOveride ) cr = g_collectiondb.getRec ( collOveride );
else cr = g_collectiondb.getRec ( r );
if ( ! cr ) {
g_errno = ENOCOLLREC;
return true;
}
char *coll = cr->m_coll;
bool quickReply = r->getLong ( "quick" , 0 );
//char *pwd = r->getString ( "pwd" , NULL );
@@ -490,7 +505,7 @@ bool Msg7::inject ( TcpSocket *s ,
long hopCount = r->getLong("hopcount",-1);
long newOnly = r->getLong("newonly",0);
long charset = r->getLong("charset",-1);
long deleteIt = r->getLong("delete",0);
long deleteUrl = r->getLong("deleteurl",0);
char hasMime = r->getLong("hasmime",0);
// do consistency testing?
bool doConsistencyTesting = r->getLong("dct",0);
@@ -549,7 +564,7 @@ bool Msg7::inject ( TcpSocket *s ,
newOnly,
charset,
spiderLinks,
deleteIt,
deleteUrl,
hasMime,
doConsistencyTesting);
}
@@ -573,7 +588,7 @@ bool Msg7::inject ( char *url ,
char newOnly,
short charset,
char spiderLinks,
char deleteIt,
char deleteUrl,
char hasMime,
bool doConsistencyTesting
) {
@@ -674,7 +689,7 @@ bool Msg7::inject ( char *url ,
niceness, // 1 ,
// inject this content
content ,
deleteIt, // false, // deleteFromIndex ,
deleteUrl, // false, // deleteFromIndex ,
forcedIp ,
contentType ,
lastSpidered ,
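Taken together, the inject changes above mean: the CGI parameter "delete" is now "deleteurl" all the way down into the deleteFromIndex argument, the target collection is resolved through g_collectiondb.getRec() (honoring a crawlbot collection override when present), and a missing or unknown collection now fails the request with ENOCOLLREC instead of silently falling back to "main".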

View File

@@ -1129,7 +1129,7 @@ bool gotResults ( void *state ) {
if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true;
// print "in collection ***" if we had a collection
if ( collLen > 0 && ! isMain ) // && isAdmin )
if ( collLen > 0 && ! isMain && si->m_format == FORMAT_HTML )
sb.safePrintf (" in collection <b>%s</b>",coll);

View File

@@ -271,6 +271,24 @@ bool CommandDeleteColl ( char *rec , WaitEntry *we ) {
return true;
}
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) {
char *coll = (char *)rec;
collnum_t collnum = g_collectiondb.getCollnum ( coll );
if ( collnum < 0 ) {
g_errno = ENOCOLLREC;
return true;
}
// the delete might block because the tree is saving and we can't
// remove our collnum recs from it while it is doing that
if ( ! g_collectiondb.deleteRec2 ( collnum ) )
// we blocked, we->m_callback will be called when done
return false;
// delete is successful
return true;
}
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandRestartColl ( char *rec , WaitEntry *we ) {
@@ -1306,8 +1324,9 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
"millions of search results very quickly without "
"having to lookup each of their urls. You should "
"definitely have this if you use the reindexing "
"feature. You can temporarily disabled the "
"spidering enabled checkbox for non "
"feature. "
"You can set max spiders to 0 "
"for non "
"docidbased requests while you reindex or delete "
"the results of a query for extra speed."
"</td></tr>"
@@ -9244,6 +9263,15 @@ void Parms::init ( ) {
m->m_cast = 1;
m++;
m->m_title = "delete collection";
m->m_desc = "delete the specified collection";
m->m_cgi = "delColl";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_func2 = CommandDeleteColl2;
m->m_cast = 1;
m++;
m->m_title = "add collection";
m->m_desc = "add a new collection";
m->m_cgi = "addColl";
@@ -13178,7 +13206,7 @@ void Parms::init ( ) {
m->m_group = 0;
m++;
m->m_title = "number of related topics (gigabits)";
m->m_title = "number of related topics";
m->m_desc = "What is the number of "
"related topics (gigabits) "
"displayed per query? Set to 0 to save "
@@ -13955,6 +13983,7 @@ void Parms::init ( ) {
m->m_flags = PF_REBUILDURLFILTERS;
m++;
/*
m->m_title = "spidering enabled";
m->m_cgi = "cspe";
m->m_xml = "spidersEnabled";
@@ -13966,6 +13995,7 @@ void Parms::init ( ) {
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS;
m++;
*/
m->m_title = "respider frequency (days)";
m->m_cgi = "fsf";

View File

@@ -3010,7 +3010,7 @@ struct QueryField g_fields[] = {
{"ilink", FIELD_ILINK, true,"Similar to above."},
{"sitelink", FIELD_SITELINK, true,"Matches all pages that link to the given site. Example:sitelink:www.gigablast.com matches all pages that link to some page on the www.gigablast.com site."},
{"site", FIELD_SITE, true,"Matches all pages from the given site. Example: site:www.gigablast.com will return all the pages on the gigablast site"},
{"coll", FIELD_COLL, true,"Not sure if this works."},
//{"coll", FIELD_COLL, true,"Not sure if this works."},
{"ip", FIELD_IP, true,"Matches all pages with the given ip. Example:1.2.3.4 will match all pages whose urls have that IP address."},
{"inurl", FIELD_SUBURL, true,"Matches all pages that have the given terms in the url. Example inurl:water will match all pages whose url has the word water in it, but the word must be delineated by punctuation."},
{"suburl", FIELD_SUBURL, true,"Same as inurl."},
@@ -3042,8 +3042,8 @@ struct QueryField g_fields[] = {
{"gbhasext", FIELD_GBOTHER, false,""},
{"gbsubmiturl", FIELD_GBOTHER, false,""},
{"qdom", FIELD_QUOTA, false,""},
{"qhost", FIELD_QUOTA, false,""},
//{"qdom", FIELD_QUOTA, false,""},
//{"qhost", FIELD_QUOTA, false,""},
{"gbtagvector", FIELD_GBTAGVECTOR, false,""},
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
@@ -3065,7 +3065,7 @@ struct QueryField g_fields[] = {
{"gbduphash" ,FIELD_GBOTHER,false,"Internal use only."},
{"gbsitetemplate" ,FIELD_GBOTHER,false,"Internal use only."},
{"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."},
{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
//{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
{"gbdeduped" ,FIELD_GBOTHER,false,""},
{"gbinjected", FIELD_GBOTHER,false,"Was the event injected?."},
@@ -3074,7 +3074,7 @@ struct QueryField g_fields[] = {
//{"gbendrange",FIELD_GBENDRANGE,false,""},
{"gbpermalink",FIELD_GBPERMALINK,false,""},
{"gbcsenum",FIELD_GBCSENUM,false,""},
//{"gbcsenum",FIELD_GBCSENUM,false,""},
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that were extract from this parent url. Example: gbparenturl:www.gigablast.com/addurl.htm"},
{"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"}

View File

@@ -224,8 +224,10 @@ class SearchInput {
//long m_formatStrLen;
//char *m_formatStr;
char m_formatTmp[11];
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
char m_format;
long m_format;
// this should be part of the key because it will affect the results!
char m_queryExpansion;
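Widening m_format from char to long goes with the new si->m_format == FORMAT_HTML check in gotResults() above; per the comment, 0 is FORMAT_HTML, 1 FORMAT_XML, 2 FORMAT_JSON, and 3 csv.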

View File

@@ -2861,6 +2861,42 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
if ( g_conf.m_logDebugSpider )
log("spider: back from msg5 spiderdb read2");
// ensure collection rec still there
CollectionRec *cr = g_collectiondb.getRec ( THIS->m_collnum );
if ( ! cr ) return;
// if we do not have a pg count entry for this then enter count mode
// where we just scan all the spider records for m_scanningIp
// and count how many pages are in the index for each subdomain/site
// and when it is over we re-do the scan from the top.
THIS->m_countingPagesIndexed = false;
// don't bother with this stuff though if url filters do not specify
// "pagesinip" or "pagesinsubdomain"
if ( cr->m_urlFiltersHavePageCounts &&
// and only do this if we do not have an entry for this ip yet
! cr->m_pageCountTable.isInTable ( &THIS->m_scanningIp ) ) {
// it is on
THIS->m_countingPagesIndexed = true;
// reset this
THIS->m_lastReqUh48 = 0LL;
THIS->m_lastRepUh48 = 0LL;
// and setup the LOCAL counting table if not initialized
if ( THIS->m_localTable.m_ks == 0 )
THIS->m_localTable.set (4,4,0,NULL,0,false,0,"ltpct" );
// do not recompute this in case all records for this ip
// are missing or have issues, like maybe there was only
// a spiderreply
if ( ! cr->m_pageCountTable.addScore( &THIS->m_scanningIp,1)){
log("spider: error adding to pg cnt tbl: %s",
mstrerror(g_errno));
return;
}
}
// . finish processing the list we read now
// . if that blocks, it will call doledWrapper
if ( ! THIS->scanSpiderdb ( false ) ) return;
@@ -2989,26 +3025,6 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
return true;
}
// if we do not have a pg count entry for this then enter count mode
// where we just scan all the spider records for m_scanningIp
// and count how many pages are in the index for each subdomain/site
// and when it is over we re-do the scan from the top.
m_countingPagesIndexed = false;
// don't bother with this stuff though if url filters do not specify
// "pagesinip" or "pagesinsubdomain"
if ( cr->m_urlFiltersHavePageCounts &&
// and only do this if we do not have an entry for this ip yet
! cr->m_pageCountTable.isInTable ( &m_scanningIp ) ) {
// it is on
m_countingPagesIndexed = true;
// reset this
m_lastReqUh48 = 0LL;
m_lastRepUh48 = 0LL;
// and setup the LOCAL counting table if not initialized
if ( m_localTable.m_ks == 0 )
m_localTable.set ( 4 ,4,0,NULL,0,false,0,"ltpct" );
}
// i guess we are always restricted to an ip, because
// populateWaitingTreeFromSpiderdb calls its own msg5.
long firstIp0 = g_spiderdb.getFirstIp(&m_nextKey);
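With this move, the per-IP quota bookkeeping is decided in the msg5 callback before scanSpiderdb() processes the list, and the scanning IP is registered in cr->m_pageCountTable up front so the counting pass is not restarted for IPs whose spiderdb records are incomplete. Below is an illustrative sketch, not part of this commit, of what a counting pass could do once m_countingPagesIndexed is on; hash32n() is an existing hash helper in the repo, but keying the local table by subdomain hash this way is an assumption.
#include "HashTableX.h"   // repo headers, assumed available
#include "hash.h"
// Sketch only (hypothetical helper): bump a per-site count in a local
// count table for each indexed page seen during the spiderdb scan.
static void countPageForQuota ( HashTableX *localTable , char *subdomain ) {
	long siteHash32 = (long)hash32n ( subdomain );   // assumed repo helper
	if ( ! localTable->addScore ( &siteHash32 , 1 ) )
		log("spider: error counting page for quota: %s",
		    mstrerror(g_errno));
}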

View File

@@ -23181,7 +23181,10 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
char *host = fu->getHost ();
//long hlen = fu->getHostLen ();
/*
setStatus ( "hashing no-split qdom keys" );
char *dom = fu->getDomain ();
@@ -23192,15 +23195,13 @@
if ( ! hashString ( dom,dlen,&hi ) ) return false;
setStatus ( "hashing no-split qhost keys" );
char *host = fu->getHost ();
long hlen = fu->getHostLen ();
// desc is NULL, prefix will be used as desc
hi.m_prefix = "qhost";
if ( ! hashString ( host,hlen,&hi ) ) return false;
*/
// now hash the site

View File

@@ -144,10 +144,6 @@
# search results?
<maxTitleLen>80</>
# Can Gigablast make titles from the document content? Used mostly for the
# news collection where the title tags are not very reliable.
<considerTitlesFromBody>0</>
# Should search results be site clustered by default?
<siteClusterByDefault>1</>
@@ -172,10 +168,6 @@
# enabled.
<numberOfLinesToUseInSummaryToDedup>4</>
# Use Language weights to sort query results. This will give results that
# match the specified &qlang higher ranking.
<useLanguageWeights>1</>
# Default language to use for ranking results. Value should be any language
# abbreviation, for example "en" for English.
<sortLanguagePreference><![CDATA[en]]></>
@@ -315,23 +307,6 @@
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>1.000000</>