mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
quite a few fixes to the quota system, cleanups etc.
This commit is contained in:
parent
f3000e2763
commit
10f4443974
@ -1403,6 +1403,10 @@ void CollectionRec::setToDefaults ( ) {
|
||||
|
||||
void CollectionRec::reset() {
|
||||
|
||||
// . grows dynamically
|
||||
// . setting to 0 buckets should never have error
|
||||
m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
|
||||
|
||||
// regex_t types
|
||||
if ( m_hasucr ) regfree ( &m_ucr );
|
||||
if ( m_hasupr ) regfree ( &m_upr );
|
||||
@ -1421,7 +1425,6 @@ void CollectionRec::reset() {
|
||||
Rdb *rdb = g_process.m_rdbs[i];
|
||||
rdb->resetBase ( m_collnum );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
CollectionRec *g_cr = NULL;
|
||||
@ -1500,9 +1503,6 @@ bool CollectionRec::load ( char *coll , long i ) {
|
||||
// PAGE COUNT TABLE for doing quotas in url filters
|
||||
//
|
||||
/////////////
|
||||
// . grows dynamically
|
||||
// . setting to 0 buckets should never have error
|
||||
m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
|
||||
// log it up if there on disk
|
||||
snprintf ( tmp1 , 1023, "/coll.%s.%li/pagecounts.dat",
|
||||
m_coll , (long)m_collnum );
|
||||
|
@ -187,6 +187,8 @@ bool HashTableX::addKey ( void *key , void *val , long *slot ) {
|
||||
g_errno = ETRYAGAIN;
|
||||
return false;
|
||||
}
|
||||
// never got initialized? call HashTableX::init()
|
||||
if ( m_ks <= 0 ){ char *xx=NULL; *xx=0; }
|
||||
// check to see if we should grow the table. now we grow
|
||||
// when 25% full to make operations faster so getLongestString()
|
||||
// doesn't return such big numbers!
|
||||
|
@ -137,7 +137,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
|
||||
if ( ! cr ) continue;
|
||||
p.safePrintf (
|
||||
"<tr><td>"
|
||||
"<input type=checkbox name=delete value=\"%s\"> "
|
||||
"<input type=checkbox name=delColl value=\"%s\"> "
|
||||
"%s</td></tr>\n",cr->m_coll,cr->m_coll);
|
||||
}
|
||||
p.safePrintf( "</table></center></td></tr></table><br>\n" );
|
||||
|
@ -204,6 +204,8 @@ bool sendReply ( void *state ) {
|
||||
"<b>%s</b>\n\n" // the url msg
|
||||
//"<FORM method=POST action=/inject>\n\n"
|
||||
|
||||
"<FORM method=GET action=/inject>\n\n"
|
||||
|
||||
//"<input type=hidden name=pwd value=\"%s\">\n"
|
||||
//"<input type=hidden name=username value=\"%s\">\n"
|
||||
"<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
|
||||
@ -279,13 +281,13 @@ bool sendReply ( void *state ) {
|
||||
"</td></tr>\n\n"
|
||||
|
||||
|
||||
"<tr><td><b>delete?</b><br>"
|
||||
"<tr><td><b>delete url?</b><br>"
|
||||
"<font size=1>Should this url be deleted from the index? "
|
||||
"Default: no"
|
||||
"</td>"
|
||||
"<td>\n"
|
||||
"<input type=radio name=delete value=0 checked>no "
|
||||
"<input type=radio name=delete value=1>yes "
|
||||
"<input type=radio name=deleteurl value=0 checked>no "
|
||||
"<input type=radio name=deleteurl value=1>yes "
|
||||
"</td></tr>\n\n"
|
||||
|
||||
|
||||
@ -463,16 +465,29 @@ bool Msg7::inject ( TcpSocket *s ,
|
||||
long contentLen;
|
||||
|
||||
// get the junk
|
||||
char *coll = r->getString ( "c" , NULL , NULL /*default*/);
|
||||
//char *coll = r->getString ( "c" , NULL , NULL /*default*/);
|
||||
//if ( ! coll ) coll = "main";
|
||||
// sometimes crawlbot will add or reset a coll and do an inject
|
||||
// in PageCrawlBot.cpp
|
||||
//if ( ! coll ) coll = r->getString("addcoll");
|
||||
//if ( ! coll ) coll = r->getString("resetcoll");
|
||||
if ( ! coll ) coll = collOveride;
|
||||
//if ( ! coll ) coll = collOveride;
|
||||
|
||||
// default to main
|
||||
if ( ! coll || ! coll[0] ) coll = "main";
|
||||
//if ( ! coll || ! coll[0] ) coll = "main";
|
||||
|
||||
if ( collOveride && ! collOveride[0] ) collOveride = NULL;
|
||||
|
||||
CollectionRec *cr = NULL;
|
||||
if ( collOveride ) cr = g_collectiondb.getRec ( collOveride );
|
||||
else cr = g_collectiondb.getRec ( r );
|
||||
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
return true;
|
||||
}
|
||||
|
||||
char *coll = cr->m_coll;
|
||||
|
||||
bool quickReply = r->getLong ( "quick" , 0 );
|
||||
//char *pwd = r->getString ( "pwd" , NULL );
|
||||
@ -490,7 +505,7 @@ bool Msg7::inject ( TcpSocket *s ,
|
||||
long hopCount = r->getLong("hopcount",-1);
|
||||
long newOnly = r->getLong("newonly",0);
|
||||
long charset = r->getLong("charset",-1);
|
||||
long deleteIt = r->getLong("delete",0);
|
||||
long deleteUrl = r->getLong("deleteurl",0);
|
||||
char hasMime = r->getLong("hasmime",0);
|
||||
// do consistency testing?
|
||||
bool doConsistencyTesting = r->getLong("dct",0);
|
||||
@ -549,7 +564,7 @@ bool Msg7::inject ( TcpSocket *s ,
|
||||
newOnly,
|
||||
charset,
|
||||
spiderLinks,
|
||||
deleteIt,
|
||||
deleteUrl,
|
||||
hasMime,
|
||||
doConsistencyTesting);
|
||||
}
|
||||
@ -573,7 +588,7 @@ bool Msg7::inject ( char *url ,
|
||||
char newOnly,
|
||||
short charset,
|
||||
char spiderLinks,
|
||||
char deleteIt,
|
||||
char deleteUrl,
|
||||
char hasMime,
|
||||
bool doConsistencyTesting
|
||||
) {
|
||||
@ -674,7 +689,7 @@ bool Msg7::inject ( char *url ,
|
||||
niceness, // 1 ,
|
||||
// inject this content
|
||||
content ,
|
||||
deleteIt, // false, // deleteFromIndex ,
|
||||
deleteUrl, // false, // deleteFromIndex ,
|
||||
forcedIp ,
|
||||
contentType ,
|
||||
lastSpidered ,
|
||||
|
@ -1129,7 +1129,7 @@ bool gotResults ( void *state ) {
|
||||
if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true;
|
||||
|
||||
// print "in collection ***" if we had a collection
|
||||
if ( collLen > 0 && ! isMain ) // && isAdmin )
|
||||
if ( collLen > 0 && ! isMain && si->m_format == FORMAT_HTML )
|
||||
sb.safePrintf (" in collection <b>%s</b>",coll);
|
||||
|
||||
|
||||
|
36
Parms.cpp
36
Parms.cpp
@ -271,6 +271,24 @@ bool CommandDeleteColl ( char *rec , WaitEntry *we ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// . returns true and sets g_errno on error
|
||||
// . returns false if would block
|
||||
bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) {
|
||||
char *coll = (char *)rec;
|
||||
collnum_t collnum = g_collectiondb.getCollnum ( coll );
|
||||
if ( collnum < 0 ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
return true;;
|
||||
}
|
||||
// the delete might block because the tree is saving and we can't
|
||||
// remove our collnum recs from it while it is doing that
|
||||
if ( ! g_collectiondb.deleteRec2 ( collnum ) )
|
||||
// we blocked, we->m_callback will be called when done
|
||||
return false;
|
||||
// delete is successful
|
||||
return true;
|
||||
}
|
||||
|
||||
// . returns true and sets g_errno on error
|
||||
// . returns false if would block
|
||||
bool CommandRestartColl ( char *rec , WaitEntry *we ) {
|
||||
@ -1306,8 +1324,9 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
|
||||
"millions of search results very quickly without "
|
||||
"having to lookup each of their urls. You should "
|
||||
"definitely have this if you use the reindexing "
|
||||
"feature. You can temporarily disabled the "
|
||||
"spidering enabled checkbox for non "
|
||||
"feature. "
|
||||
"You can set max spiders to 0 "
|
||||
"for non "
|
||||
"docidbased requests while you reindex or delete "
|
||||
"the results of a query for extra speed."
|
||||
"</td></tr>"
|
||||
@ -9244,6 +9263,15 @@ void Parms::init ( ) {
|
||||
m->m_cast = 1;
|
||||
m++;
|
||||
|
||||
m->m_title = "delete collection";
|
||||
m->m_desc = "delete the specified collection";
|
||||
m->m_cgi = "delColl";
|
||||
m->m_type = TYPE_CMD;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_func2 = CommandDeleteColl2;
|
||||
m->m_cast = 1;
|
||||
m++;
|
||||
|
||||
m->m_title = "add collection";
|
||||
m->m_desc = "add a new collection";
|
||||
m->m_cgi = "addColl";
|
||||
@ -13178,7 +13206,7 @@ void Parms::init ( ) {
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
|
||||
m->m_title = "number of related topics (gigabits)";
|
||||
m->m_title = "number of related topics";
|
||||
m->m_desc = "What is the number of "
|
||||
"related topics (gigabits) "
|
||||
"displayed per query? Set to 0 to save "
|
||||
@ -13955,6 +13983,7 @@ void Parms::init ( ) {
|
||||
m->m_flags = PF_REBUILDURLFILTERS;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "spidering enabled";
|
||||
m->m_cgi = "cspe";
|
||||
m->m_xml = "spidersEnabled";
|
||||
@ -13966,6 +13995,7 @@ void Parms::init ( ) {
|
||||
m->m_rowid = 1;
|
||||
m->m_flags = PF_REBUILDURLFILTERS;
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "respider frequency (days)";
|
||||
m->m_cgi = "fsf";
|
||||
|
10
Query.cpp
10
Query.cpp
@ -3010,7 +3010,7 @@ struct QueryField g_fields[] = {
|
||||
{"ilink", FIELD_ILINK, true,"Similar to above."},
|
||||
{"sitelink", FIELD_SITELINK, true,"Matches all pages that link to the given site. Example:sitelink:www.gigablast.com matches all pages that link to some page on the www.gigablast.com site."},
|
||||
{"site", FIELD_SITE, true,"Matches all pages from the given site. Example: site:www.gigablast.com will return all the pages on the gigablast site"},
|
||||
{"coll", FIELD_COLL, true,"Not sure if this works."},
|
||||
//{"coll", FIELD_COLL, true,"Not sure if this works."},
|
||||
{"ip", FIELD_IP, true,"Matches all pages with the given ip. Example:1.2.3.4 will match all pages whose urls have that IP address."},
|
||||
{"inurl", FIELD_SUBURL, true,"Matches all pages that have the given terms in the url. Example inurl:water will match all pages whose url has the word water in it, but the word must be delineated by punctuation."},
|
||||
{"suburl", FIELD_SUBURL, true,"Same as inurl."},
|
||||
@ -3042,8 +3042,8 @@ struct QueryField g_fields[] = {
|
||||
{"gbhasext", FIELD_GBOTHER, false,""},
|
||||
{"gbsubmiturl", FIELD_GBOTHER, false,""},
|
||||
|
||||
{"qdom", FIELD_QUOTA, false,""},
|
||||
{"qhost", FIELD_QUOTA, false,""},
|
||||
//{"qdom", FIELD_QUOTA, false,""},
|
||||
//{"qhost", FIELD_QUOTA, false,""},
|
||||
{"gbtagvector", FIELD_GBTAGVECTOR, false,""},
|
||||
|
||||
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
|
||||
@ -3065,7 +3065,7 @@ struct QueryField g_fields[] = {
|
||||
{"gbduphash" ,FIELD_GBOTHER,false,"Internal use only."},
|
||||
{"gbsitetemplate" ,FIELD_GBOTHER,false,"Internal use only."},
|
||||
{"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."},
|
||||
{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
|
||||
//{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
|
||||
{"gbdeduped" ,FIELD_GBOTHER,false,""},
|
||||
|
||||
{"gbinjected", FIELD_GBOTHER,false,"Was the event injected?."},
|
||||
@ -3074,7 +3074,7 @@ struct QueryField g_fields[] = {
|
||||
//{"gbendrange",FIELD_GBENDRANGE,false,""},
|
||||
|
||||
{"gbpermalink",FIELD_GBPERMALINK,false,""},
|
||||
{"gbcsenum",FIELD_GBCSENUM,false,""},
|
||||
//{"gbcsenum",FIELD_GBCSENUM,false,""},
|
||||
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that were extract from this parent url. Example: gbparenturl:www.gigablast.com/addurl.htm"},
|
||||
{"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"}
|
||||
|
||||
|
@ -224,8 +224,10 @@ class SearchInput {
|
||||
//long m_formatStrLen;
|
||||
//char *m_formatStr;
|
||||
|
||||
char m_formatTmp[11];
|
||||
|
||||
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
|
||||
char m_format;
|
||||
long m_format;
|
||||
|
||||
// this should be part of the key because it will affect the results!
|
||||
char m_queryExpansion;
|
||||
|
56
Spider.cpp
56
Spider.cpp
@ -2861,6 +2861,42 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: back from msg5 spiderdb read2");
|
||||
|
||||
|
||||
// ensure collection rec still there
|
||||
CollectionRec *cr = g_collectiondb.getRec ( THIS->m_collnum );
|
||||
if ( ! cr ) return;
|
||||
|
||||
|
||||
// if we do not have a pg count entry for this then enter count mode
|
||||
// where we just scan all the spider records for m_scanningIp
|
||||
// and count how many pages are in the index for each subdomain/site
|
||||
// and when it is over we re-do the scan from the top.
|
||||
THIS->m_countingPagesIndexed = false;
|
||||
// don't bother with this stuff though if url filters do not specify
|
||||
// "pagesinip" or "pagesinsubdomain"
|
||||
if ( cr->m_urlFiltersHavePageCounts &&
|
||||
// and only do this if we do not have an entry for this ip yet
|
||||
! cr->m_pageCountTable.isInTable ( &THIS->m_scanningIp ) ) {
|
||||
// it is on
|
||||
THIS->m_countingPagesIndexed = true;
|
||||
// reset this
|
||||
THIS->m_lastReqUh48 = 0LL;
|
||||
THIS->m_lastRepUh48 = 0LL;
|
||||
// and setup the LOCAL counting table if not initialized
|
||||
if ( THIS->m_localTable.m_ks == 0 )
|
||||
THIS->m_localTable.set (4,4,0,NULL,0,false,0,"ltpct" );
|
||||
// do not recompute this in case all records for this ip
|
||||
// are missing or have issues, like maybe there was only
|
||||
// a spiderreply
|
||||
if ( ! cr->m_pageCountTable.addScore( &THIS->m_scanningIp,1)){
|
||||
log("spider: error adding to pg cnt tbl: %s",
|
||||
mstrerror(g_errno));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// . finish processing the list we read now
|
||||
// . if that blocks, it will call doledWrapper
|
||||
if ( ! THIS->scanSpiderdb ( false ) ) return;
|
||||
@ -2989,26 +3025,6 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// if we do not have a pg count entry for this then enter count mode
|
||||
// where we just scan all the spider records for m_scanningIp
|
||||
// and count how many pages are in the index for each subdomain/site
|
||||
// and when it is over we re-do the scan from the top.
|
||||
m_countingPagesIndexed = false;
|
||||
// don't bother with this stuff though if url filters do not specify
|
||||
// "pagesinip" or "pagesinsubdomain"
|
||||
if ( cr->m_urlFiltersHavePageCounts &&
|
||||
// and only do this if we do not have an entry for this ip yet
|
||||
! cr->m_pageCountTable.isInTable ( &m_scanningIp ) ) {
|
||||
// it is on
|
||||
m_countingPagesIndexed = true;
|
||||
// reset this
|
||||
m_lastReqUh48 = 0LL;
|
||||
m_lastRepUh48 = 0LL;
|
||||
// and setup the LOCAL counting table if not initialized
|
||||
if ( m_localTable.m_ks == 0 )
|
||||
m_localTable.set ( 4 ,4,0,NULL,0,false,0,"ltpct" );
|
||||
}
|
||||
|
||||
// i guess we are always restricted to an ip, because
|
||||
// populateWaitingTreeFromSpiderdb calls its own msg5.
|
||||
long firstIp0 = g_spiderdb.getFirstIp(&m_nextKey);
|
||||
|
@ -23181,7 +23181,10 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
|
||||
|
||||
|
||||
char *host = fu->getHost ();
|
||||
//long hlen = fu->getHostLen ();
|
||||
|
||||
/*
|
||||
setStatus ( "hashing no-split qdom keys" );
|
||||
|
||||
char *dom = fu->getDomain ();
|
||||
@ -23192,15 +23195,13 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
if ( ! hashString ( dom,dlen,&hi ) ) return false;
|
||||
|
||||
|
||||
|
||||
setStatus ( "hashing no-split qhost keys" );
|
||||
|
||||
char *host = fu->getHost ();
|
||||
long hlen = fu->getHostLen ();
|
||||
|
||||
// desc is NULL, prefix will be used as desc
|
||||
hi.m_prefix = "qhost";
|
||||
if ( ! hashString ( host,hlen,&hi ) ) return false;
|
||||
*/
|
||||
|
||||
|
||||
// now hash the site
|
||||
|
||||
|
@ -144,10 +144,6 @@
|
||||
# search results?
|
||||
<maxTitleLen>80</>
|
||||
|
||||
# Can Gigablast make titles from the document content? Used mostly for the
|
||||
# news collection where the title tags are not very reliable.
|
||||
<considerTitlesFromBody>0</>
|
||||
|
||||
# Should search results be site clustered by default?
|
||||
<siteClusterByDefault>1</>
|
||||
|
||||
@ -172,10 +168,6 @@
|
||||
# enabled.
|
||||
<numberOfLinesToUseInSummaryToDedup>4</>
|
||||
|
||||
# Use Language weights to sort query results. This will give results that
|
||||
# match the specified &qlang higher ranking.
|
||||
<useLanguageWeights>1</>
|
||||
|
||||
# Default language to use for ranking results. Value should be any language
|
||||
# abbreviation, for example "en" for English.
|
||||
<sortLanguagePreference><![CDATA[en]]></>
|
||||
@ -315,23 +307,6 @@
|
||||
<harvestLinks>1</>
|
||||
<harvestLinks>1</>
|
||||
<harvestLinks>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>1.000000</>
|
||||
|
Loading…
Reference in New Issue
Block a user