// open-source-search-engine/Parms.cpp
//
// Matt Wells 8a65d21371 (2016-03-15 15:54:12 -07:00):
// fix the source of lots of corruption in spiderdb and titledb.
// rdbmem.cpp was storing in secondary mem which got reset when the
// dump completed. also do not add keys that are in the collnum and
// key range of the list currently being dumped; return ETRYAGAIN.
// added a "verify writes" parm. clean titledb and spiderdb corruption
// out of the tree on startup.

#include "gb-include.h"
#include "Parms.h"
#include "File.h"
#include "Conf.h"
//#include "CollectionRec.h"
#include "TcpSocket.h"
#include "HttpRequest.h"
#include "Pages.h" // g_pages
#include "Tagdb.h" // g_tagdb
#include "Catdb.h"
#include "Collectiondb.h"
#include "HttpMime.h" // atotime()
//#include "Msg28.h"
//#include "Sync.h"
#include "Indexdb.h" // for MIN_TRUNC
#include "SearchInput.h"
#include "Unicode.h"
#include "Threads.h"
#include "Spider.h" // MAX_SPIDER_PRIORITIES
#include "Statsdb.h"
#include "Sections.h"
#include "Msg17.h"
#include "Process.h"
#include "Repair.h"
#include "Ads.h"
#include "LanguagePages.h"
#include "PingServer.h"
#include "Users.h"
#include "Proxy.h"
#include "hash.h"
#include "Test.h"
#include "Rebalance.h"
#include "SpiderProxy.h" // buildProxyTable()
#include "PageInject.h" // InjectionRequest
// width of input box in characters for url filter expression
#define REGEX_TXT_MAX 80
Parms g_parms;
//#include "Tfndb.h"
#include "Spider.h"
#include "Tagdb.h"
#include "Indexdb.h"
#include "Datedb.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Collectiondb.h"
//
// new functions to extract info from parm recs
//
int32_t getDataSizeFromParmRec ( char *rec ) {
return *(int32_t *)(rec+sizeof(key96_t));
}
char *getDataFromParmRec ( char *rec ) {
return rec+sizeof(key96_t)+4;
}
collnum_t getCollnumFromParmRec ( char *rec ) {
key96_t *k = (key96_t *)rec;
return (collnum_t)k->n1;
}
// for parms that are arrays...
int16_t getOccNumFromParmRec ( char *rec ) {
key96_t *k = (key96_t *)rec;
return (int16_t)((k->n0>>16));
}
Parm *getParmFromParmRec ( char *rec ) {
key96_t *k = (key96_t *)rec;
int32_t cgiHash32 = (k->n0 >> 32);
return g_parms.getParmFast2 ( cgiHash32 );
}
int32_t getHashFromParmRec ( char *rec ) {
key96_t *k = (key96_t *)rec;
int32_t cgiHash32 = (k->n0 >> 32);
return cgiHash32;
}
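//
// A minimal sketch (following the accessors above) of the parm rec
// layout: a 12-byte key96_t, then a 4-byte dataSize, then the data.
// Pulling a rec apart is just pointer arithmetic:
//
//   char     *rec      = ... ; // a rec from a parmdb list
//   key96_t   key      = *(key96_t *)rec;
//   int32_t   dataSize = getDataSizeFromParmRec ( rec );
//   char     *data     = getDataFromParmRec     ( rec );
//   collnum_t collnum  = getCollnumFromParmRec  ( rec );
//   Parm     *m        = getParmFromParmRec     ( rec );
//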
// . occNum is the index # for parms that are arrays; it is -1 if
//   not used, i.e. for a non-array parm
// . collnum is -1 for g_conf, which is not a collrec
key96_t makeParmKey ( collnum_t collnum , Parm *m , int16_t occNum ) {
key96_t k;
k.n1 = collnum;
k.n0 = (uint32_t)m->m_cgiHash; // 32 bit
k.n0 <<= 16;
k.n0 |= (uint16_t)occNum;
// blanks
k.n0 <<= 16;
// delbit. 1 means positive key
k.n0 |= 0x01;
// test
if ( getCollnumFromParmRec ((char *)&k)!=collnum){char*xx=NULL;*xx=0;}
if ( getOccNumFromParmRec ((char *)&k)!=occNum){char*xx=NULL;*xx=0;}
return k;
}
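// Sketch of the key layout makeParmKey() builds: k.n1 holds the
// collnum and k.n0 packs, from high bits to low,
//
//   [ 32 bits cgiHash ][ 16 bits occNum ][ 15 blank bits ][ delbit ]
//
// so with hypothetical values cgiHash=0x11223344 and occNum=2 we get
// k.n0 = 0x1122334400020001, the low 0x01 being the positive-key bit.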
bool printUrlExpressionExamples ( SafeBuf *sb ) ;
//////////////////////////////////////////////
//
// Command Functions. All return false if they block; see the
// contract spelled out below.
//
//////////////////////////////////////////////
////////
//
// . do commands this way now
// . when handleRequest4 receives a special "command" parmdb rec
//   it executes the cmd, one of the functions listed below
// . all these Command*() functions are called in updateParm() below
// . they return false if they would block, and then they'll call the
//   callback specified in your WaitEntry "we"
// . they return true with g_errno set on error, set to 0 on success
//
////////
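// A minimal command skeleton following that contract (illustrative
// only; it is not registered in the parm table):
//
//   bool CommandDoNothing ( char *rec ) {
//           collnum_t collnum = getCollnumFromParmRec ( rec );
//           if ( collnum < 0 ) { g_errno = ENOCOLLREC; return true; }
//           // do the work here. return false only if we would block
//           // and will call the WaitEntry callback when done.
//           return true;
//   }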
// from PageBasic.cpp:
bool updateSiteListBuf(collnum_t collnum,bool addSeeds,char *siteListArg);
bool CommandUpdateSiteList ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
if ( collnum < 0 ) {
log("parms: bad collnum for update site list");
g_errno = ENOCOLLREC;
return true;
}
// sanity
int32_t dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize < 0 ) {
log("parms: bad site list size = %"INT32" bad!",dataSize);
g_errno = EBADENGINEER;
return true;
}
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
log("parms: no cr for collnum %"INT32" to update",(int32_t)collnum);
return true;
}
// get the sitelist
char *data = getDataFromParmRec ( rec );
// update the table that maps site to whether we should spider it
// and also add newly introduced sites in "data" into spiderdb.
updateSiteListBuf ( collnum ,
true , // add NEW seeds?
data // entire sitelist
);
// now that we deduped the old site list with the new one for
// purposes of adding NEW seeds, we can do the final copy
cr->m_siteListBuf.set ( data );
return true;
}
// . require the user to manually execute this to prevent us from
// corrupting the data on startup because of a bad hosts.conf file!!!
// . maybe put a red 'A' in the hosts table on the web page to indicate
// we detected records that don't belong to our shard so user knows to
// rebalance?
// . we'll show it in a special msg box on all admin pages if required
bool CommandRebalance ( char *rec ) {
g_rebalance.m_userApproved = true;
// force this to on so it goes through
g_rebalance.m_numForeignRecs = 1;
g_rebalance.m_needsRebalanceValid = false;
return true;
}
bool CommandInsertUrlFiltersRow ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
if ( collnum < 0 ) {
log("parms: bad collnum for insert row");
g_errno = ENOCOLLREC;
return true;
}
// sanity
int32_t dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize <= 1 ) {
log("parms: insert row data size = %"INT32" bad!",dataSize);
g_errno = EBADENGINEER;
return true;
}
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
// get the row #
char *data = getDataFromParmRec ( rec );
int32_t rowNum = atol(data);//*(int32_t *)data;
// scan all parms for url filter parms
for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
Parm *m = &g_parms.m_parms[i];
// parm must be a url filters parm
if ( m->m_page != PAGE_FILTERS ) continue;
// must be an array!
if ( ! m->isArray() ) continue;
// sanity check
if ( m->m_obj != OBJ_COLL ) { char *xx=NULL;*xx=0; }
// . add that row
// . returns false and sets g_errno on error
if ( ! g_parms.insertParm ( i, rowNum,(char *)cr)) return true;
}
return true;
}
bool CommandRemoveConnectIpRow ( char *rec ) {
// caller must specify collnum
//collnum_t collnum = getCollnumFromParmRec ( rec );
//if ( collnum < 0 ) {
// g_errno = ENOCOLLREC;
// log("parms: bad collnum for remove row");
// return true;
//}
// sanity
int32_t dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize <= 1 ) {
log("parms: insert row data size = %"INT32" bad!",dataSize);
g_errno = EBADENGINEER;
return true;
}
// need this
//CollectionRec *cr = g_collectiondb.getRec ( collnum );
// get the row #
char *data = getDataFromParmRec ( rec );
int32_t rowNum = atol(data);
// scan all parms for url filter parms
for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
Parm *m = &g_parms.m_parms[i];
// parm must be a url filters parm
if ( m->m_page != PAGE_MASTERPASSWORDS ) continue;
// must be an array!
if ( ! m->isArray() ) continue;
// sanity check
if ( m->m_obj != OBJ_CONF ) { char *xx=NULL;*xx=0; }
// must be masterip
if ( m->m_type != TYPE_IP ) continue;
// . nuke that parm's element
// . returns false and sets g_errno on error
if (!g_parms.removeParm(i,rowNum,(char *)&g_conf))return true;
}
return true;
}
bool CommandRemovePasswordRow ( char *rec ) {
// sanity
int32_t dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize <= 1 ) {
log("parms: insert row data size = %"INT32" bad!",dataSize);
g_errno = EBADENGINEER;
return true;
}
// get the row #
char *data = getDataFromParmRec ( rec );
int32_t rowNum = atol(data);
// scan all parms for url filter parms
for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
Parm *m = &g_parms.m_parms[i];
// parm must be a url filters parm
if ( m->m_page != PAGE_MASTERPASSWORDS ) continue;
// must be an array!
if ( ! m->isArray() ) continue;
// sanity check
if ( m->m_obj != OBJ_CONF ) { char *xx=NULL;*xx=0; }
// must be master password
if ( m->m_type != TYPE_STRINGNONEMPTY ) continue;
// . nuke that parm's element
// . returns false and sets g_errno on error
if (!g_parms.removeParm(i,rowNum,(char *)&g_conf))return true;
}
return true;
}
bool CommandRemoveUrlFiltersRow ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
if ( collnum < 0 ) {
g_errno = ENOCOLLREC;
log("parms: bad collnum for remove row");
return true;
}
// sanity
int32_t dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize <= 1 ) {
log("parms: insert row data size = %"INT32" bad!",dataSize);
g_errno = EBADENGINEER;
return true;
}
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
// get the row #
char *data = getDataFromParmRec ( rec );
int32_t rowNum = atol(data);
// scan all parms for url filter parms
for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
Parm *m = &g_parms.m_parms[i];
// parm must be a url filters parm
if ( m->m_page != PAGE_FILTERS ) continue;
// must be an array!
if ( ! m->isArray() ) continue;
// sanity check
if ( m->m_obj != OBJ_COLL ) { char *xx=NULL;*xx=0; }
// . nuke that parm's element
// . returns false and sets g_errno on error
if ( ! g_parms.removeParm ( i,rowNum,(char *)cr)) return true;
}
return true;
}
// after we add a new coll, or at anytime after we can clone it
bool CommandCloneColl ( char *rec ) {
// the collnum we want to affect.
collnum_t dstCollnum = getCollnumFromParmRec ( rec );
// . data is the collnum in ascii.
// . from "&restart=467" for example
char *data = rec + sizeof(key96_t) + 4;
int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t));
//if ( dataSize < 1 ) { char *xx=NULL;*xx=0; }
// copy parm settings from this collection name
char *srcColl = data;
// return if none to clone from
if ( dataSize <= 0 ) return true;
// avoid defaulting to main collection
if ( ! data[0] ) return true;
CollectionRec *srcRec = NULL;
CollectionRec *dstRec = NULL;
srcRec = g_collectiondb.getRec ( srcColl ); // get from name
dstRec = g_collectiondb.getRec ( dstCollnum ); // get from #
if ( ! srcRec ) {
log("parms: invalid coll %s to clone from",srcColl);
// return true, not false, so caller does not wait on a callback
g_errno = ENOCOLLREC;
return true;
}
if ( ! dstRec ) {
log("parms: invalid collnum %"INT32" to clone to",
(int32_t)dstCollnum);
g_errno = ENOCOLLREC;
return true;
}
log ("parms: cloning parms from collection %s to %s",
srcRec->m_coll,dstRec->m_coll);
g_parms.cloneCollRec ( (char *)dstRec , (char *)srcRec );
return true;
}
// customCrawl:
// 0 for regular collection
// 1 for custom crawl
// 2 for bulk job
// . returns false if it blocks, true otherwise
bool CommandAddColl ( char *rec , char customCrawl ) {
// caller must specify collnum
collnum_t newCollnum = getCollnumFromParmRec ( rec );
// sanity.
if ( newCollnum < 0 ) {
g_errno = ENOCOLLREC;
log("parms: bad collnum for AddColl");
return true;
}
char *data = rec + sizeof(key96_t) + 4;
int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t));
// collection name must be at least 2 bytes (includes \0)
if ( dataSize <= 1 ) { char *xx=NULL;*xx=0; }
// then collname, \0 terminated
char *collName = data;
if ( gbstrlen(collName) > MAX_COLL_LEN ) {
log("crawlbot: collection name too long");
return true;
}
// if ( ! g_parms.m_inSyncWithHost0 ) {
// log("parms: can not add coll #%i %s until in sync with host 0",
// (int)newCollnum,collName);
// g_errno = EBADENGINEER;
// return true;
// }
// this saves it to disk! returns false and sets g_errno on error.
if ( ! g_collectiondb.addNewColl ( collName,
customCrawl ,
NULL , // copy from
0 , // copy from len
true , // save?
newCollnum
) )
// error! g_errno should be set
return true;
return true;
}
// all nodes are guaranteed to add the same collnum for the given name
bool CommandAddColl0 ( char *rec ) { // regular collection
return CommandAddColl ( rec , 0 );
}
bool CommandAddColl1 ( char *rec ) { // custom crawl
return CommandAddColl ( rec , 1 );
}
bool CommandAddColl2 ( char *rec ) { // bulk job
return CommandAddColl ( rec , 2 );
}
bool CommandResetProxyTable ( char *rec ) {
// from SpiderProxy.h
return resetProxyStats();
}
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandDeleteColl ( char *rec , WaitEntry *we ) {
collnum_t collnum = getCollnumFromParmRec ( rec );
// if ( ! g_parms.m_inSyncWithHost0 ) {
// log("parms: can not del collnum %i until in sync with host 0",
// (int)collnum);
// g_errno = EBADENGINEER;
// return true;
// }
// the delete might block because the tree is saving and we can't
// remove our collnum recs from it while it is doing that
if ( ! g_collectiondb.deleteRec2 ( collnum ) )
// we blocked, we->m_callback will be called when done
return false;
// delete is successful
return true;
}
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) {
char *data = rec + sizeof(key96_t) + 4;
char *coll = (char *)data;
collnum_t collnum = g_collectiondb.getCollnum ( coll );
// if ( ! g_parms.m_inSyncWithHost0 ) {
// log("parms: can not del collnum %i until in sync with host 0",
// (int)collnum);
// g_errno = EBADENGINEER;
// return true;
// }
if ( collnum < 0 ) {
g_errno = ENOCOLLREC;
return true;
}
// the delete might block because the tree is saving and we can't
// remove our collnum recs from it while it is doing that
if ( ! g_collectiondb.deleteRec2 ( collnum ) )
// we blocked, we->m_callback will be called when done
return false;
// delete is successful
return true;
}
bool CommandForceNextSpiderRound ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
g_errno = ENOCOLLREC;
log("parms: bad collnum %"INT32" for restart spider round",
(int32_t)collnum);
return true;
}
// seems like parmlist is an rdblist, so we have a key_t followed
// by 4 bytes of datasize then the data... which is an ascii string
// in our case...
char *data = getDataFromParmRec ( rec );
uint32_t roundStartTime;
int32_t newRoundNum;
// see the HACK: in Parms::convertHttpRequestToParmList() where we
// construct this data in response to a "roundStart" cmd. we used
// sprintf() so it's natural to use sscanf() to parse it out.
sscanf ( data , "%"UINT32",%"INT32"",
&roundStartTime,
&newRoundNum);
cr->m_spiderRoundStartTime = roundStartTime;
cr->m_spiderRoundNum = newRoundNum;
// if we don't have this it prints out "skipping0 ... " for urls
// we try to spider in Spider.cpp.
cr->m_spiderStatus = SP_INPROGRESS;
// reset the round counts. this will log a msg. resetting the
// round counts will prevent maxToProcess/maxToCrawl from holding
// us back...
spiderRoundIncremented ( cr );
// yeah, if we don't nuke doledb then it doesn't work...
cr->rebuildUrlFilters();
return true;
}
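// Example with hypothetical values: convertHttpRequestToParmList()
// builds the data with sprintf("%"UINT32",%"INT32"",...), so a rec
// carrying "1457446500,3" makes the sscanf() above set
// m_spiderRoundStartTime to 1457446500 and m_spiderRoundNum to 3.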
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandRestartColl ( char *rec , WaitEntry *we ) {
collnum_t newCollnum = getCollnumFromParmRec ( rec );
// . data is the collnum in ascii.
// . from "&restart=467" for example
char *data = rec + sizeof(key96_t) + 4;
int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t));
if ( dataSize < 1 ) { char *xx=NULL;*xx=0; }
collnum_t oldCollnum = atol(data);
if ( oldCollnum < 0 ||
oldCollnum >= g_collectiondb.m_numRecs ||
! g_collectiondb.m_recs[oldCollnum] ) {
log("parms: invalid collnum %"INT32" to restart",(int32_t)oldCollnum);
return true;
}
// this can block if tree is saving, it has to wait
// for tree save to complete before removing old
// collnum recs from tree
if ( ! g_collectiondb.resetColl2 ( oldCollnum ,
newCollnum ,
false ) ) // purgeSeeds?
// we blocked, we->m_callback will be called when done
return false;
// turn on spiders on new collrec. collname is same but collnum
// will be different.
CollectionRec *cr = g_collectiondb.getRec ( newCollnum );
// if reset from crawlbot api page then enable spiders
// to avoid user confusion
//if ( cr ) cr->m_spideringEnabled = 1;
if ( ! cr ) return true;
//
// repopulate spiderdb with the same sites
//
char *oldSiteList = cr->m_siteListBuf.getBufStart();
// do not let it have the buf any more
cr->m_siteListBuf.detachBuf();
// can't leave it NULL, safebuf parms do not like to be null
cr->m_siteListBuf.nullTerm();
// re-add the buf so it re-seeds spiderdb. it will not dedup these
// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
// "true" = addSeeds.
updateSiteListBuf ( newCollnum , true , oldSiteList );
// now put it back
if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList );
// all done
return true;
}
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandResetColl ( char *rec , WaitEntry *we ) {
collnum_t newCollnum = getCollnumFromParmRec ( rec );
// . data is the collnum in ascii.
// . from "&restart=467" for example
char *data = rec + sizeof(key96_t) + 4;
int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t));
if ( dataSize < 1 ) { char *xx=NULL;*xx=0; }
collnum_t oldCollnum = atol(data);
if ( oldCollnum < 0 ||
oldCollnum >= g_collectiondb.m_numRecs ||
! g_collectiondb.m_recs[oldCollnum] ) {
log("parms: invalid collnum %"INT32" to reset",(int32_t)oldCollnum);
return true;
}
// this will not go through if tree is saving, it has to wait
// for tree save to complete before removing old
// collnum recs from tree. so return false in that case so caller
// will know to re-call later.
if ( ! g_collectiondb.resetColl2 ( oldCollnum ,
newCollnum ,
true ) ) // purgeSeeds?
// we blocked, we->m_callback will be called when done
return false;
// turn on spiders on new collrec. collname is same but collnum
// will be different.
CollectionRec *cr = g_collectiondb.getRec ( newCollnum );
if ( ! cr ) return true;
//
// repopulate spiderdb with the same sites
//
char *oldSiteList = cr->m_siteListBuf.getBufStart();
// do not let it have the buf any more
cr->m_siteListBuf.detachBuf();
// can't leave it NULL, safebuf parms do not like to be null
cr->m_siteListBuf.nullTerm();
// re-add the buf so it re-seeds spiderdb. it will not dedup these
// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
// "true" = addSeeds.
updateSiteListBuf ( newCollnum , true , oldSiteList );
// now put it back
if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList );
// turn spiders off
//if ( cr ) cr->m_spideringEnabled = 0;
return true;
}
bool CommandParserTestInit ( char *rec ) {
// enable testing for all other hosts
g_conf.m_testParserEnabled = 1;
// reset all files
g_test.removeFiles();
// turn spiders on globally
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// tell spider loop to update active list
g_spiderLoop.m_activeListValid = false;
// if we are not host 0, turn on spiders for testing
if ( g_hostdb.m_myHost->m_hostId != 0 ) return true;
// start the test loop to inject urls for parsing/spidering
g_test.initTestRun();
// done
return true;
}
bool CommandSpiderTestInit ( char *rec ) {
// enable testing for all other hosts
g_conf.m_testSpiderEnabled = 1;
// reset all files
g_test.removeFiles();
// turn spiders on globally
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// tell spider loop to update active list
g_spiderLoop.m_activeListValid = false;
// if we are not host 0, turn on spiders for testing
if ( g_hostdb.m_myHost->m_hostId != 0 ) return true;
// start the test loop to inject urls for parsing/spidering
g_test.initTestRun();
// done
return true;
}
bool CommandSpiderTestCont ( char *rec ) {
// enable testing for all other hosts
g_conf.m_testSpiderEnabled = 1;
// turn spiders on globally
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// tell spider loop to update active list
g_spiderLoop.m_activeListValid = false;
// done
return true;
}
// some of these can block a little. if threads are off, a lot!
bool CommandMerge ( char *rec ) {
forceMergeAll ( RDB_POSDB ,1);
forceMergeAll ( RDB_TITLEDB ,1);
forceMergeAll ( RDB_TAGDB ,1);
forceMergeAll ( RDB_SPIDERDB ,1);
forceMergeAll ( RDB_LINKDB ,1);
// most of these are probably already in good shape
//g_checksumdb.getRdb()->attemptMerge (1,true);
// g_clusterdb.getRdb()->attemptMerge (1,true); // niceness, force?
// g_tagdb.getRdb()->attemptMerge (1,true);
// g_catdb.getRdb()->attemptMerge (1,true);
// //g_tfndb.getRdb()->attemptMerge (1,true);
// g_spiderdb.getRdb()->attemptMerge (1,true);
// // these 2 will probably need the merge the most
// g_indexdb.getRdb()->attemptMerge (1,true);
// g_datedb.getRdb()->attemptMerge (1,true);
// g_titledb.getRdb()->attemptMerge (1,true);
// //g_sectiondb.getRdb()->attemptMerge (1,true);
// g_statsdb.getRdb()->attemptMerge (1,true);
// g_linkdb .getRdb()->attemptMerge (1,true);
return true;
}
bool CommandMergePosdb ( char *rec ) {
forceMergeAll ( RDB_POSDB ,1);
// set this for each posdb base
return true;
}
bool CommandMergeSectiondb ( char *rec ) {
//g_sectiondb.getRdb()->attemptMerge (1,true); // nice , force
return true;
}
bool CommandMergeTitledb ( char *rec ) {
forceMergeAll ( RDB_TITLEDB ,1);
//g_titledb.getRdb()->attemptMerge (1,true);
return true;
}
bool CommandMergeSpiderdb ( char *rec ) {
forceMergeAll ( RDB_SPIDERDB ,1);
//g_spiderdb.getRdb()->attemptMerge (1,true);
return true;
}
bool CommandDiskPageCacheOff ( char *rec ) {
g_process.resetPageCaches();
return true;
}
bool CommandForceIt ( char *rec ) {
g_conf.m_forceIt = true;
return true;
}
bool CommandDiskDump ( char *rec ) {
//g_checksumdb.getRdb()->dumpTree ( 1 ); // niceness
g_clusterdb.getRdb()->dumpTree ( 1 );
g_tagdb.getRdb()->dumpTree ( 1 );
g_catdb.getRdb()->dumpTree ( 1 );
//g_tfndb.getRdb()->dumpTree ( 1 );
g_spiderdb.getRdb()->dumpTree ( 1 );
g_posdb.getRdb()->dumpTree ( 1 );
//g_datedb.getRdb()->dumpTree ( 1 );
g_titledb.getRdb()->dumpTree ( 1 );
//g_sectiondb.getRdb()->dumpTree ( 1 );
g_statsdb.getRdb()->dumpTree ( 1 );
g_linkdb.getRdb() ->dumpTree ( 1 );
g_errno = 0;
return true;
}
bool CommandJustSave ( char *rec ) {
// returns false if blocked, true otherwise
g_process.save ();
// always return true here
return true;
}
bool CommandSaveAndExit ( char *rec ) {
// this may block while saving, but we always return true here
g_process.shutdown ( false , NULL , NULL );
return true;
}
bool CommandUrgentSaveAndExit ( char *rec ) {
// "true" means urgent
g_process.shutdown ( true );
return true;
}
bool CommandReloadLanguagePages ( char *rec ) {
g_languagePages.reloadPages();
return true;
}
bool CommandClearKernelError ( char *rec ) {
g_hostdb.m_myHost->m_pingInfo.m_kernelErrors = 0;
return true;
}
bool CommandPowerNotice ( int32_t hasPower ) {
//int32_t hasPower = r->getLong("haspower",-1);
log("powermo: received haspower=%"INT32"",hasPower);
if ( hasPower != 0 && hasPower != 1 ) return true;
// did power state change? if not just return true
if ( g_process.m_powerIsOn && hasPower ) return true;
if ( ! g_process.m_powerIsOn && ! hasPower ) return true;
if ( hasPower ) {
log("powermo: power is regained");
g_process.m_powerIsOn = true;
return true;
}
// if it was on and went off...
// now it is off
log("powermo: power was lost");
// . SpiderLoop.cpp will not launch any more spiders as
//   long as the power is off
// . autosave should kick in every 30 seconds
g_process.m_powerIsOn = false;
// note the autosave
log("powermo: disabling spiders, suspending merges, disabling "
"tree writes and saving.");
// tell Process.cpp::save2() to save the blocking caches too!
//g_process.m_pleaseSaveCaches = true;
// . save everything now... this may block some when saving the
// caches... then do not do ANY writes...
// . RdbMerge suspends all merging if power is off
// . Rdb.cpp does not allow any adds if power is off. it will
// send back an ETRYAGAIN...
// . if a tree is being dumped, this will keep re-calling
// Process.cpp::save2()
g_process.save();
// also send an email if we are host #0
if ( g_hostdb.m_myHost->m_hostId != 0 ) return true;
if ( g_proxy.isProxy() ) return true;
char tmp[128];
Host *h0 = g_hostdb.getHost ( 0 );
int32_t ip0 = 0;
if ( h0 ) ip0 = h0->m_ip;
sprintf(tmp,"%s: POWER IS OFF",iptoa(ip0));
g_pingServer.sendEmail ( NULL , // Host ptr
tmp , // msg
true , // sendToAdmin
false , // oom?
false , // kernel error?
true , // parm change?
// force it? even if disabled?
false );
return true;
}
bool CommandPowerOnNotice ( char *rec ) {
return CommandPowerNotice ( 1 );
}
bool CommandPowerOffNotice ( char *rec ) {
return CommandPowerNotice ( 0 );
}
bool CommandInSync ( char *rec ) {
g_parms.m_inSyncWithHost0 = true;
return true;
}
//////////////////////
//
// end new commands
//
//////////////////////
static bool printDropDown ( int32_t n , SafeBuf* sb, char *name,
int32_t select ,
bool includeMinusOne ,
bool includeMinusTwo ) ;
extern bool closeAll ( void *state, void (* callback)(void *state) );
extern bool allExit ( ) ;
/*
class Checksum {
public:
Checksum() : m_sum1( 0xffff ), m_sum2( 0xffff ) {}
void addIn( const uint16_t *data, size_t size, FILE *f = 0 ) {
// if an odd len of data, add first byte, then do rest below
if ( size % 2 != 0 ) {
m_sum1 += (uint16_t)*(uint8_t *)data;
m_sum2 += m_sum1;
size--;
data = (uint16_t *)(((uint8_t *)data)+1);
}
size_t len = size/2;
while ( len ) {
unsigned tlen = len;
// . 360 is the largest amount of sums that can be performed
// without overflow
if ( len > 360 ) tlen = 360;
len -= tlen;
do {
m_sum1 += *data++;
m_sum2 += m_sum1;
} while ( --tlen );
m_sum1 = (m_sum1 & 0xffff) + (m_sum1 >> 16);
m_sum2 = (m_sum2 & 0xffff) + (m_sum2 >> 16);
}
}
void addInStrings( const uint16_t *data, int32_t cnt, int32_t size ) {
while ( cnt ) {
const uint16_t *origData = data;
int32_t len = gbstrlen((char *)data);
// if an odd len of data, add first byte,
// then do rest below
if ( len % 2 != 0 ) {
m_sum1 += (uint16_t)*(uint8_t *)data;
m_sum2 += m_sum1;
len--;
data = (uint16_t *)(((uint8_t *)data)+1);
}
len /= 2;
while ( len ) {
unsigned tlen = len;
// . 360 = largest amount of sums that can be
// performed without overflow
if ( len > 360 ) tlen = 360;
len -= tlen;
do {
m_sum1 += *data++;
m_sum2 += m_sum1;
} while ( --tlen );
m_sum1 = (m_sum1 & 0xffff) + (m_sum1 >> 16);
m_sum2 = (m_sum2 & 0xffff) + (m_sum2 >> 16);
}
cnt--;
data = (uint16_t *)((char *)origData + size);
}
}
void finalize() {
m_sum1 = (m_sum1 & 0xffff) + (m_sum1 >> 16);
m_sum2 = (m_sum2 & 0xffff) + (m_sum2 >> 16);
}
uint32_t getSum() const {
return ( m_sum2 << 16 | m_sum1 );
}
private:
uint32_t m_sum1;
uint32_t m_sum2;
};
*/
Parms::Parms ( ) {
m_isDefaultLoaded = false;
m_inSyncWithHost0 = false;
m_triedToSync = false;
}
void Parms::detachSafeBufs ( CollectionRec *cr ) {
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
Parm *m = &m_parms[i];
if ( m->m_type != TYPE_SAFEBUF ) continue;
if ( m->m_obj != OBJ_COLL ) continue;
if ( m->m_off < 0 ) continue;
int32_t max = 1;
// this will be zero if not an array.
// otherwise it is the # of elements in the array
if ( m->m_size > max ) max = m->m_size;
// an array of safebufs? m->m_size will be > 1 then.
for ( int32_t j = 0 ; j < max ; j++ ) {
// get it
SafeBuf *sb = (SafeBuf *)((char *)cr + m->m_off +
j*sizeof(SafeBuf));
sb->detachBuf();
}
}
}
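// Sketch of the layout the loop above walks: a TYPE_SAFEBUF array parm
// with, say, m_size == 3 (hypothetical) stores its elements
// contiguously in the CollectionRec, so element j lives at
//
//   (SafeBuf *)((char *)cr + m->m_off + j*sizeof(SafeBuf))
//
// and detachBuf() makes each SafeBuf give up ownership so it will not
// free the underlying buffer.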
/*
uint32_t Parms::calcChecksum() {
Checksum cs;
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
Parm *m = &m_parms[i];
if ( m->m_obj == OBJ_SI ) continue;
if ( m->m_off < 0 ) continue;
if ( m->m_type == TYPE_COMMENT ) continue;
if ( m->m_type == TYPE_MONOD2 ) continue;
if ( m->m_type == TYPE_MONOM2 ) continue;
if ( m->m_type == TYPE_CMD ) continue;
if ( m->m_type == TYPE_LONG_CONST ) continue;
int32_t size = 0;
if ( m->m_type == TYPE_CHECKBOX ) size = 1;
if ( m->m_type == TYPE_CHAR ) size = 1;
if ( m->m_type == TYPE_CHAR2 ) size = 1;
if ( m->m_type == TYPE_BOOL ) size = 1;
if ( m->m_type == TYPE_BOOL2 ) size = 1;
if ( m->m_type == TYPE_PRIORITY ) size = 1;
if ( m->m_type == TYPE_PRIORITY2 ) size = 1;
//if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1;
if ( m->m_type == TYPE_RETRIES ) size = 1;
if ( m->m_type == TYPE_TIME ) size = 6;
if ( m->m_type == TYPE_DATE2 ) size = 4;
if ( m->m_type == TYPE_DATE ) size = 4;
if ( m->m_type == TYPE_FLOAT ) size = 4;
if ( m->m_type == TYPE_IP ) size = 4;
if ( m->m_type == TYPE_RULESET ) size = 4;
if ( m->m_type == TYPE_LONG ) size = 4;
if ( m->m_type == TYPE_LONG_LONG ) size = 8;
if ( m->m_type == TYPE_STRING ) size = m->m_size;
if ( m->m_type == TYPE_STRINGBOX ) size = m->m_size;
if ( m->m_type == TYPE_STRINGNONEMPTY ) size = m->m_size;
if ( m->m_type == TYPE_SAFEBUF ) size = m->m_size;
if ( m->m_type == TYPE_SITERULE ) size = 4;
// if we have an array
int32_t cnt = 1;
if (m->m_fixed > 0) {
size *= m->m_fixed;
cnt = m->m_fixed;
}
else {
size *= m->m_max;
cnt = m->m_max;
}
uint16_t *p = NULL;
if ( m->m_obj == OBJ_CONF ) {
p = (uint16_t *)((char *)&g_conf + m->m_off);
if (m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_STRINGNONEMPTY ) {
cs.addInStrings( p,
cnt,
m->m_size );
}
else if ( m->m_type == TYPE_SAFEBUF ) {
uint16_t *p2;
SafeBuf *sb2 = (SafeBuf *)p;
p2 = (uint16_t *)sb2->getBufStart();
cs.addIn( p2 , sb2->length() );
}
else {
cs.addIn( p, size );
}
}
else if ( m->m_obj == OBJ_COLL ) {
collnum_t j = g_collectiondb.getFirstCollnum ();
while ( j >= 0 ) {
CollectionRec *cr = g_collectiondb.getRec( j );
p = (uint16_t *)((char *)cr + m->m_off);
if (m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_STRINGNONEMPTY ) {
cs.addInStrings( p,
cnt,
m->m_size );
}
else if ( m->m_type == TYPE_SAFEBUF ) {
uint16_t *p2;
SafeBuf *sb2 = (SafeBuf *)p;
p2 = (uint16_t *)sb2->getBufStart();
cs.addIn( p2 , sb2->length() );
}
else {
cs.addIn( p, size );
}
j = g_collectiondb.getNextCollnum ( j );
}
}
}
cs.finalize();
return cs.getSum();
}
*/
// from Pages.cpp
bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) ;
// returns false and sets g_errno on error
bool Parms::setGigablastRequest ( TcpSocket *socket ,
HttpRequest *hrArg ,
GigablastRequest *gr ) {
// get the page from the path... like /sockets --> PAGE_SOCKETS
int32_t page = g_pages.getDynamicPageNumber ( hrArg );
// generic ptr to the request object
char *THIS = (char *)gr;
// ensure valid
if ( ! THIS ) {
// it is null when no collection explicitly specified...
log("admin: THIS is null for page %"INT32".",page);
return false;
}
// just in case
memset ( gr , 0 , sizeof(GigablastRequest) );
gr->m_socket = socket;
// make a copy of the httprequest because the original is on the stack
// in HttpServer::requestHandler()
if ( ! gr->m_hr.copy ( hrArg ) ) {
log("admin: failed to copy httprequest: %s",
mstrerror(g_errno));
return false;
}
// use the one we copied which won't disappear/be freed on us
HttpRequest *hr = &gr->m_hr;
// need this
int32_t obj = OBJ_GBREQUEST;
//
// reset THIS to defaults. use NULL for cr since mostly for SearchInput
//
setToDefault ( THIS , obj , NULL);
// map PAGE_ADDURL to PAGE_ADDURL2 so
// /addurl is same as /admin/addurl as far as parms.
if ( page == PAGE_ADDURL )
page = PAGE_ADDURL2;
// loop through cgi parms
for ( int32_t i = 0 ; i < hr->getNumFields() ; i++ ) {
// get cgi parm name
char *field = hr->getField ( i );
//int32_t flen = hr->getFieldLen ( i );
// find in parms list
int32_t j;
Parm *m;
for ( j = 0 ; j < m_numParms ; j++ ) {
// get it
m = &m_parms[j];
// must be of this type
if ( m->m_obj != obj ) continue;
// page must match
if ( m->m_page != page ) continue;
// skip if no cgi parm, may not be configurable now
if ( ! m->m_cgi ) continue;
// otherwise, must match the cgi name exactly
if ( strcmp ( field,m->m_cgi ) == 0 ) break;
//if ( ! m->m_cgi2 ) continue; // alias check
//if ( strcmp ( field,m->m_cgi2 ) == 0 ) break;
//if ( ! m->m_cgi2 ) continue; // alias check
//if ( strcmp ( field,m->m_cgi3 ) == 0 ) break;
//if ( ! m->m_cgi3 ) continue; // alias check
//if ( strcmp ( field,m->m_cgi4 ) == 0 ) break;
}
// bail if the cgi field is not in the parms list
if ( j >= m_numParms ) {
//log("parms: missing cgi parm %s",field);
continue;
}
// value of cgi parm (null terminated)
char *v = hr->getValue ( i );
// . skip if no value was provided
// . unless it was a string! so we can make them empty.
if ( v[0] == '\0' &&
m->m_type != TYPE_CHARPTR &&
m->m_type != TYPE_STRING &&
m->m_type != TYPE_STRINGBOX ) continue;
// skip if offset is negative, that means none
if ( m->m_off < 0 ) continue;
// skip if no permission
//if ( (m->m_perms & user) == 0 ) continue;
// set it. now our TYPE_CHARPTR will just be set to it directly
// to save memory...
setParm ( (char *)THIS , m, j, 0, v, false,//not html enc
false ); // true );
// need to save it
//if ( THIS != (char *)&g_conf )
// ((CollectionRec *)THIS)->m_needsSave = true;
}
return true;
}
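// Sketch of the loop above with a hypothetical request
// "GET /addurl?c=main&foo=bar": the cgi field name "foo" is compared
// against m_parms[j].m_cgi for every parm whose m_obj is OBJ_GBREQUEST
// and whose m_page is PAGE_ADDURL2 (after the alias mapping above),
// and on a match setParm() writes "bar" into the GigablastRequest at
// THIS + m->m_off.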
bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr );
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . must ultimately send reply back on "s"
// . called by Pages.cpp's sendDynamicReply() when it calls pg->function()
// which is called by HttpServer::sendReply(s,r) when it gets an http request
bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r ) {
char buf [ 128000 ];
SafeBuf stackBuf(buf,128000);
SafeBuf *sb = &stackBuf;
int32_t page = g_pages.getDynamicPageNumber ( r );
char format = r->getReplyFormat();
char guide = r->getLong("guide",0);
bool isMasterAdmin = g_conf.isMasterAdmin ( s , r );
bool isCollAdmin = g_conf.isCollAdmin ( s , r );
if ( ! g_conf.m_allowCloudUsers &&
! isMasterAdmin &&
! isCollAdmin ) {
char *msg = "NO PERMISSION";
return g_httpServer.sendDynamicPage (s, msg,gbstrlen(msg));
}
//
// CLOUD SEARCH ENGINE SUPPORT
//
char *action = r->getString("action",NULL);
if ( page == PAGE_BASIC_SETTINGS &&
guide &&
// this is non-null if handling a submit request
action &&
format == FORMAT_HTML ) {
//return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
// just redirect to it
char *coll = r->getString("c",NULL);
if ( coll ) {
sb->safePrintf("<meta http-equiv=Refresh "
"content=\"0; URL=/widgets.html"
"?guide=1&c=%s\">",
coll);
return g_httpServer.sendDynamicPage (s,
sb->getBufStart(),
sb->length());
}
}
//
// some "generic" pages do additional processing on the provided input
// so we need to call those functions here...
//
// if we were an addurl page..
//if ( page == PAGE_ADDURL2 ) {
// // this returns false if blocked and it should re-call
// // sendPageGeneric when completed
// if ( ! processAddUrlRequest ( s , r ) )
// return false;
//}
char *bodyjs = NULL;
if ( page == PAGE_BASIC_SETTINGS )
bodyjs =" onload=document.getElementById('tabox').focus();";
// print standard header
if ( format != FORMAT_XML && format != FORMAT_JSON )
g_pages.printAdminTop ( sb , s , r , NULL , bodyjs );
// xml/json header
char *res = NULL;
if ( format == FORMAT_XML )
res = "<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg>Success</statusMsg>\n";
if ( format == FORMAT_JSON )
res = "{ \"response:\"{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\"\n";
if ( res )
sb->safeStrcpy ( res );
// do not show the parms and their current values unless showsettings=1
// was explicitly given for the xml/json feeds
int32_t show = 1;
if ( format != FORMAT_HTML )
show = r->getLong("show",0);
if ( show )
printParmTable ( sb , s , r );
// xml/json tail
if ( format == FORMAT_XML )
res = "</response>\n";
if ( format == FORMAT_JSON )
res = "\t}\n}\n";
if ( res )
sb->safeStrcpy ( res );
bool POSTReply = g_pages.getPage ( page )->m_usePost;
char *ct = "text/html";
if ( format == FORMAT_XML ) ct = "text/xml";
if ( format == FORMAT_JSON ) ct = "application/json";
return g_httpServer.sendDynamicPage ( s ,
sb->getBufStart() ,
sb->length() ,
-1 ,
POSTReply ,
ct , // contType
-1 , // httpstatus
NULL,//cookie ,
NULL );// charset
}
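// With show=1 output elided, a JSON reply assembled above is shaped
// like this (indentation approximate):
//
//   { "response":{
//   "statusCode":0,
//   "statusMsg":"Success"
//   ...
//   }
//   }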
bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
int32_t page = g_pages.getDynamicPageNumber ( r );
int32_t fromIp = s->m_ip;
char format = r->getReplyFormat();
/*
if ( format == FORMAT_HTML )
sb->safePrintf (
"<script type=\"text/javascript\">"
"function filterRow(str) {"
//"alert ('string: ' + str);"
"var tab = document.all ? document.all"
"['parmtable'] :"
" document.getElementById ?"
"document.getElementById('parmtable') : null;"
" for(var j = 1; j < tab.rows.length;j++) {"
" if(tab.rows[j].innerHTML.indexOf(str) < 0) {"
" tab.rows[j].style.display = 'none';"
" } else {"
" tab.rows[j].style.display = '';"
" }"
" }"
"}\n"
"function checkAll(form, name, num) {\n "
" for (var i = 0; i < num; i++) {\n"
" var nombre;\n"
" if( i > 0) nombre = name + i;\n"
" else nombre = name;\n"
" var e = document.getElementById(nombre);\n"
" e.checked = !e.checked;\n"
//" if ( e.value == 'Y' ) e.value='N';"
//" else if ( e.value == 'N' ) e.value='Y';"
" }\n"
"}\n"
"</script>");
*/
if ( page == PAGE_COLLPASSWORDS2 )
page = PAGE_COLLPASSWORDS;
// print the start of the table
char *tt = "None";
if ( page == PAGE_LOG ) tt = "Log Controls";
if ( page == PAGE_MASTER ) tt = "Master Controls";
if ( page == PAGE_INJECT ) tt = "Inject Url";
if ( page == PAGE_MASTERPASSWORDS ) tt = "Master Passwords";
if ( page == PAGE_ADDURL2 ) tt = "Add Urls";
if ( page == PAGE_SPIDER ) tt = "Spider Controls";
if ( page == PAGE_SEARCH ) tt = "Search Controls";
if ( page == PAGE_ACCESS ) tt = "Access Controls";
if ( page == PAGE_FILTERS ) tt = "Url Filters";
if ( page == PAGE_BASIC_SETTINGS ) tt = "Settings";
if ( page == PAGE_COLLPASSWORDS ) tt = "Collection Passwords";
//if ( page == PAGE_SITES ) tt = "Site List";
//if ( page == PAGE_PRIORITIES ) tt = "Priority Controls";
//if ( page == PAGE_RULES ) tt = "Site Rules";
//if ( page == PAGE_SYNC ) tt = "Sync";
if ( page == PAGE_REPAIR ) tt = "Rebuild Controls";
//if ( page == PAGE_ADFEED ) tt = "Ad Feed Controls";
// special messages for spider controls
char *e1 = "";
char *e2 = "";
if ( page == PAGE_SPIDER && ! g_conf.m_spideringEnabled )
e1 = "<tr><td colspan=20><font color=#ff0000><b><center>"
"Spidering is temporarily disabled in Master Controls."
"</font></td></tr>\n";
if ( page == PAGE_SPIDER && ! g_conf.m_addUrlEnabled )
e2 = "<tr><td colspan=20><font color=#ff0000><b><center>"
"Add url is temporarily disabled in Master Controls."
"</font></td></tr>\n";
if ( format == FORMAT_XML || format == FORMAT_JSON ) {
char *coll = g_collectiondb.getDefaultColl(r);
CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true);
bool isMasterAdmin = g_conf.isMasterAdmin ( s , r );
bool isCollAdmin = g_conf.isCollAdmin ( s , r );
g_parms.printParms2 ( sb ,
page ,
cr ,
1 , // int32_t nc , # cols?
1 , // int32_t pd , print desc?
false , // isCrawlbot
format ,
NULL , // TcpSocket *sock
isMasterAdmin ,
isCollAdmin );
return true;
}
// . page repair (PageRepair.cpp) has a status table BEFORE the parms
// iff we are doing a repair
// . only one page for all collections, we have a parm that is
// a comma-separated list of the collections to repair. leave blank
// to repair all collections.
if ( page == PAGE_REPAIR )
g_repair.printRepairStatus ( sb , fromIp );
// start the table
sb->safePrintf(
"\n"
"<table %s "
//"style=\"border-radius:15px;"
//"border:#6060f0 2px solid;"
//"\" "
//"width=100%% bgcolor=#%s "
//"bgcolor=black "
//"cellpadding=4 "
//"border=0 "//border=1 "
"id=\"parmtable\">"
"<tr><td colspan=20>"// bgcolor=#%s>"
,TABLE_STYLE
//,DARKER_BLUE
//,DARK_BLUE
);
/*
take this out since we took out a ton of parms for
simplicity's sake
if ( page != PAGE_FILTERS )
sb->safePrintf("<div style=\"float:left;\">"
"filter:<input type=\"text\" "
"onkeyup=\"filterRow(this.value)\" "
"value=\"\"></div>"
);
*/
sb->safePrintf(//"<div style=\"margin-left:45%%;\">"
//"<font size=+1>"
"<center>"
"<b>%s</b>"
//"</font>"
"</center>"
//"</div>"
"</td></tr>%s%s\n",
tt,e1,e2);
//bool isCrawlbot = false;
//if ( collOveride ) isCrawlbot = true;
// print the table(s) of controls
//p= g_parms.printParms (p, pend, page, user, THIS, coll, pwd, nc, pd);
g_parms.printParms ( sb , s , r );
// end the table
sb->safePrintf ( "</table>\n" );
// this must be outside of table, submit button follows
sb->safePrintf ( "<br>\n" );
if ( page == PAGE_SPIDERPROXIES ) {
// wrap up the form, print a submit button
g_pages.printSubmit ( sb );
printSpiderProxyTable ( sb );
// do not print another submit button
return true;
}
// url filter page has a test table
if ( page == PAGE_FILTERS ) {
// wrap up the form, print a submit button
g_pages.printSubmit ( sb );
printUrlExpressionExamples ( sb );
}
else if ( page == PAGE_BASIC_SETTINGS ) {
// wrap up the form, print a submit button
g_pages.printSubmit ( sb );
printSitePatternExamples ( sb , r );
}
else if ( page == PAGE_SPIDER ) { // PAGE_SITES
// wrap up the form, print a submit button
g_pages.printSubmit ( sb );
printSitePatternExamples ( sb , r );
}
else {
// wrap up the form, print a submit button
g_pages.printAdminBottom ( sb );
}
// extra sync table
/*
if ( page == PAGE_SYNC ) {
// a table that shows the progress of a sync process
sb.safePrintf (
"<br>"
"<table width=100%% border=1 bgcolor=#d0d0e0 "
"cellpadding=4 border=0>"
//"<tr><td colspan=2 bgcolor=#d0c0d0>"
"<tr><td colspan=2 bgcolor=#%s>"
"<center>"
//"<font size=+1>"
"<b>Sync Progress</b>"
//"</font>"
"</td></tr>\n" , DARK_BLUE);
for ( int32_t i = RDB_START ; i < RDB_END ; i++ ) {
Rdb *r = getRdbFromId ( i );
if ( ! r ) continue;
float pd = g_sync.getPercentDone ( i );
sb.safePrintf (
"<tr>"
"<td>%s</td>"
"<td>%.1f%%</td></tr>\n",
r->m_dbname , pd );
}
sb.safePrintf ( "</table>\n");
}
*/
// if just printing into a buffer, return now
//if ( pageBuf ) return true;
return true;
}
/*
char *printDropDown ( int32_t n , char *p, char *pend, char *name, int32_t select,
bool includeMinusOne ,
bool includeMinusTwo ) {
// begin the drop down menu
sprintf ( p , "<select name=%s>", name );
p += gbstrlen ( p );
char *s;
int32_t i = -1;
if ( includeMinusOne ) i = -1;
// . by default, minus 2 includes minus 3, the new "FILTERED" priority
// . it is link "BANNED" but does not mean the url is low quality necessarily
if ( includeMinusTwo ) i = -3;
for ( ; i < n ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
if ( i == -3 )
sprintf (p,"<option value=%"INT32"%s>FILTERED",i,s);
else if ( i == -2 )
sprintf (p,"<option value=%"INT32"%s>BANNED",i,s);
else if ( i == -1 )
sprintf (p,"<option value=%"INT32"%s>undefined",i,s);
else
sprintf (p,"<option value=%"INT32"%s>%"INT32"",i,s,i);
p += gbstrlen ( p );
}
sprintf ( p , "</select>" );
p += gbstrlen ( p );
return p;
}
bool printDiffbotDropDown ( SafeBuf *sb,char *name,char *THIS , SafeBuf *sx) {
//CollectionRec *cr = (CollectionRec *)THIS;
// . get the string we have selected
// . the list of available strings to select is in
// m_diffbotApiList for this collection, and that can
// be changed by john to add custom diffbot api urls.
// . should just be m_spiderDiffbotApiUrl[i] safebuf
char *usingApi = sx->getBufStart();
if ( sx->length() == 0 ) usingApi = NULL;
// now scan each item in the list. see the setting of
// "m_def" for "diffbotApiList" below to see the
// comma separated list of default strings. each item in
// this list is of the format "<title>|<urlPath>,"
//char *p = cr->m_diffbotApiList.getBufStart();
char *p =
"None|none,"
"All|http://www.diffbot.com/api/analyze?mode=auto&fields=*,"
"Article (autodetect)|http://www.diffbot.com/api/analyze?mode=article&fields=*,"
"Article (force)|http://www.diffbot.com/api/article?fields=*,"
"Product (autodetect)|http://www.diffbot.com/api/analyze?mode=product&fields=*,"
"Product (force)|http://www.diffbot.com/v2/product?fields=*,"
"Image (autodetect)|http://www.diffbot.com/api/analyze?mode=image&fields=*,"
"Image (force)|http://www.diffbot.com/api/image?fields=*,"
"FrontPage (autodetect)|http://www.diffbot.com/api/analyze?mode=frontpage&fields=*,"
"FrontPage (force)|http://www.diffbot.com/api/frontpage?fields=*"
;
// wtf?
if ( ! p ) return true;
// print out. cgi is "dapi%"INT32"".
sb->safePrintf("<select name=%s>\n",name);
// print "none" as the first option
//char *sel = "";
//if ( ! usingApi ) sel = " selected";
//sb->safePrintf("<option value=\"\"%s>None</option>",sel);
// the various "diffbot urls" are separated by commas
for ( ; *p ; ) {
// point to start of item name
char *name = p;
// p should now point to name of the item
char *end1 = p;
// point to start of url for that item
for ( ; *end1 && *end1 != '|' ;end1++);
// save that
char *url = end1;
if ( *url == '|' ) url++;
// find end of url
char *urlEnd = url;
for ( ; *urlEnd && *urlEnd != ',' ; urlEnd++ );
// do we match it?
char *sel = "";
if ( usingApi && strncmp(usingApi,url,urlEnd-url)== 0 )
sel = " selected";
if ( ! usingApi && urlEnd - url == 0 )
sel = " selected";
// advance p
p = urlEnd;
// skip over comma to get next one
if ( *p == ',' ) p++;
// use the hash as the identifier
sb->safePrintf("<option value=\"");
sb->safeMemcpy ( url, urlEnd - url );
sb->safePrintf("\"%s>",sel);
// print item name
sb->safeMemcpy ( name , end1 - name );
sb->safePrintf("</option>\n");
}
sb->safePrintf("</select>");
return true;
}
*/
bool printDropDown ( int32_t n , SafeBuf* sb, char *name, int32_t select,
bool includeMinusOne ,
bool includeMinusTwo ) {
// begin the drop down menu
sb->safePrintf ( "<select name=%s>", name );
char *s;
int32_t i = -1;
if ( includeMinusOne ) i = -1;
// . by default, minus 2 includes minus 3, the new "FILTERED" priority
// . it is link "BANNED" but does not mean the url is low quality necessarily
if ( includeMinusTwo ) i = -3;
// no more DELETE, etc.
i = 0;
if ( select < 0 ) select = 0;
for ( ; i < n ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
if ( i == -3 )
sb->safePrintf ("<option value=%"INT32"%s>DELETE",i,s);
else if ( i == -2 )
//sb->safePrintf ("<option value=%"INT32"%s>BANNED",i,s);
continue;
else if ( i == -1 )
//sb->safePrintf ("<option value=%"INT32"%s>undefined",i,s);
continue;
else
sb->safePrintf ("<option value=%"INT32"%s>%"INT32"",i,s,i);
}
sb->safePrintf ( "</select>" );
return true;
}
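// For example, n=3 and select=1 emit, for a name of "foo":
//
//   <select name=foo><option value=0>0<option value=1 selected>1
//   <option value=2>2</select>
//
// (line break added here for readability; the actual output has no
// newlines or closing </option> tags).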
class DropLangs {
public:
char *m_title;
char *m_lang;
char *m_tld;
};
DropLangs g_drops[] = {
{"custom",NULL,NULL},
{"web",NULL,NULL},
{"news",NULL,NULL},
{"english","en","com,us.gov,org"},
{"german","de","de"},
{"french","fr","fr"},
{"norweigian","nl","nl"},
{"spanish","es","es"},
{"italian","it","it"},
{"romantic","en,de,fr,nl,es,it","com,us.gov,org,de,fr,nl,es,it"}
};
// "url filters profile" values. used to set default crawl rules
// in Collectiondb.cpp's CollectionRec::setUrlFiltersToDefaults().
// for instance, UFP_NEWS spiders sites more frequently but less deep in
// order to get "news" pages and articles
bool printDropDownProfile ( SafeBuf* sb, char *name, CollectionRec *cr ) {
sb->safePrintf ( "<select name=%s>", name );
// the type of url filters profiles
//char *items[] = {"custom","web","news","chinese","shallow"};
int32_t nd = sizeof(g_drops)/sizeof(DropLangs);
for ( int32_t i = 0 ; i < nd ; i++ ) {
//if ( i == select ) s = " selected";
//else s = "";
char *x = cr->m_urlFiltersProfile.getBufStart();
char *s;
// x is NULL if the profile safebuf is empty, so check before strcmp
if ( x && strcmp(g_drops[i].m_title, x) == 0 ) s = " selected";
else s = "";
sb->safePrintf ("<option value=%s%s>%s",
g_drops[i].m_title,
s,
g_drops[i].m_title );
}
sb->safePrintf ( "</select>");
return true;
}
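// Sketch of the output when cr->m_urlFiltersProfile holds "web": the
// g_drops table above yields
//
//   <select name=...><option value=custom>custom
//   <option value=web selected>web<option value=news>news...</select>
//
// (newlines added for readability; option values are unquoted in the
// actual HTML).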
/*
char *printCheckBoxes ( int32_t n , char *p, char *pend, char *name, char *array){
for ( int32_t i = 0 ; i < n ; i++ ) {
if ( i > 0 )
sprintf (p, "<input type=checkbox value=1 name=%s%"INT32"",
name,i);
else
sprintf (p, "<input type=checkbox value=1 name=%s",
name);
p += gbstrlen ( p );
if ( array[i] ) {
sprintf ( p , " checked");
p += gbstrlen ( p );
}
sprintf ( p , ">%"INT32" &nbsp;" , i );
p += gbstrlen ( p );
//if i is single digit, add another nbsp so that everything's
//aligned
if ( i < 10 )
sprintf(p,"&nbsp;&nbsp;");
p +=gbstrlen(p);
if ( i > 0 && (i+1) % 6 == 0 )
sprintf(p,"<br>\n");
p+=gbstrlen(p);
}
return p;
}
*/
bool printCheckBoxes ( int32_t n , SafeBuf* sb, char *name, char *array){
for ( int32_t i = 0 ; i < n ; i++ ) {
if ( i > 0 )
sb->safePrintf ("<input type=checkbox value=1 name=%s%"INT32"",
name,i);
else
sb->safePrintf ("<input type=checkbox value=1 name=%s",
name);
if ( array[i] ) {
sb->safePrintf ( " checked");
}
sb->safePrintf ( ">%"INT32" &nbsp;" , i );
//if i is single digit, add another nbsp so that everything's
//aligned
if ( i < 10 )
sb->safePrintf("&nbsp;&nbsp;");
if ( i > 0 && (i+1) % 6 == 0 )
sb->safePrintf("<br>\n");
}
return true;
}
bool Parms::printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r) {
int32_t page = g_pages.getDynamicPageNumber ( r );
int32_t nc = r->getLong("nc",1);
int32_t pd = r->getLong("pd",1);
char *coll = g_collectiondb.getDefaultColl(r);
CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true);
bool isMasterAdmin = g_conf.isMasterAdmin ( s , r );
bool isCollAdmin = g_conf.isCollAdmin ( s , r );
//char *coll = r->getString ( "c" );
//if ( ! coll || ! coll[0] ) coll = "main";
//CollectionRec *cr = g_collectiondb.getRec ( coll );
// if "main" collection does not exist, try another
//if ( ! cr ) cr = getCollRecFromHttpRequest ( r );
printParms2 ( sb, page, cr, nc, pd,0,0 , s,isMasterAdmin,isCollAdmin);
return true;
}
static int32_t s_count = 0;
bool Parms::printParms2 ( SafeBuf* sb ,
int32_t page ,
CollectionRec *cr ,
int32_t nc ,
int32_t pd ,
bool isCrawlbot ,
char format , // bool isJSON ,
TcpSocket *sock ,
bool isMasterAdmin ,
bool isCollAdmin ) {
bool status = true;
s_count = 0;
// background color
char *bg1 = LIGHT_BLUE;
char *bg2 = DARK_BLUE;
// background color
char *bg = NULL;
char *coll = NULL;
if ( cr ) coll = cr->m_coll;
// page aliases
//if ( page == PAGE_COLLPASSWORDS )
// page = PAGE_MASTERPASSWORDS;
if ( page == PAGE_COLLPASSWORDS2 )
page = PAGE_COLLPASSWORDS;
GigablastRequest gr;
g_parms.setToDefault ( (char *)&gr , OBJ_GBREQUEST , NULL);
InjectionRequest ir;
g_parms.setToDefault ( (char *)&ir , OBJ_IR , NULL);
// Begin "parms":[]
if (format == FORMAT_JSON ) {
sb->safePrintf ("\"parms\":[\n");
}
// find in parms list
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
// get it
Parm *m = &m_parms[i];
// make sure we got the right parms for what we want
if ( m->m_page != page ) continue;
// and same object type. but allow OBJ_NONE for
// PageAddUrl.cpp
//if ( m->m_obj != parmObj && m->m_obj != OBJ_NONE ) continue;
// skip if offset is negative, that means none
// well then use OBJ_NONE now!!!
//if ( m->m_off < 0 &&
// m->m_type != TYPE_MONOD2 &&
// m->m_type != TYPE_MONOM2 &&
// m->m_type != TYPE_CMD ) continue;
// skip if hidden
if ( m->m_flags & PF_HIDDEN ) continue;
// or if should not show in html, like the
// name of the collection, the "c" parm we do not show
// generally on the html page even though it is a required parm
// we have it in a hidden html input tag in Pages.cpp.
if ( (m->m_flags & PF_NOHTML) &&
format != FORMAT_JSON &&
format != FORMAT_XML )
continue;
// get right ptr
char *THIS = NULL;
if ( m->m_obj == OBJ_CONF )
THIS = (char *)&g_conf;
if ( m->m_obj == OBJ_COLL ) {
THIS = (char *)cr;
if ( ! THIS ) continue;
}
if ( m->m_obj == OBJ_GBREQUEST )
THIS = (char *)&gr;
if ( m->m_obj == OBJ_IR )
THIS = (char *)&ir;
// might have an array, do not exceed the array size
int32_t jend = m->m_max;
int32_t size = jend ;
char *ss = ((char *)THIS + m->m_off - 4);
if ( m->m_type == TYPE_MONOD2 ) ss = NULL;
if ( m->m_type == TYPE_MONOM2 ) ss = NULL;
if ( m->m_max > 1 && ss ) size = *(int32_t *)ss;
if ( size < jend ) jend = size;
// toggle background color on group boundaries...
if ( m->m_group == 1 ) {
if ( bg == bg1 ) bg = bg2;
else bg = bg1;
}
//
// mdw just debug to here ... left off here
//char *xx=NULL;*xx=0;
// . do we have an array? if so print title on next row
// UNLESS these are priority checkboxes, those can all
// cluster together onto one row
// . only add if not in a row of controls
if ( m->m_max > 1 && m->m_type != TYPE_PRIORITY_BOXES &&
m->m_rowid == -1 &&
format != FORMAT_JSON &&
format != FORMAT_XML ) { // ! isJSON ) {
//
// make a separate table for array of parms
sb->safePrintf (
//"<table width=100%% bgcolor=#d0d0e0 "
//"cellpadding=4 border=1>\n"
"<tr><td colspan=20 bgcolor=#%s>"
"<center>"
//"<font size=+1>"
"<b>%s"
"</b>"
//"</font>"
"</td></tr>\n"
"<tr><td colspan=20><font size=-1>"
,DARK_BLUE,m->m_title);
// print the description
sb->safePrintf ( "%s" , m->m_desc );
// end the description
sb->safePrintf("</font></td></tr>\n");
}
// arrays always have blank line for adding stuff
if ( m->m_max > 1 )
// not for PAGE_PRIORITIES!
//m->m_page != PAGE_PRIORITIES )
size++;
// if m_rowid of consecutive parms are the same then they
// are all printed in the same row, otherwise the inner loop
// has no effect
int32_t rowid = m_parms[i].m_rowid;
// if not part of a complex row, just print this array right up
if ( rowid == -1 ) {
for ( int32_t j = 0 ; j < size ; j++ )
status &=printParm ( sb,NULL,&m_parms[i],i,
j, jend, (char *)THIS,
coll,NULL,
bg,nc,pd,
false,
isCrawlbot,
format,
isMasterAdmin,
isCollAdmin,
sock);
continue;
}
// if not first in a row, skip it, we printed it already
if ( i > 0 && m_parms[i-1].m_rowid == rowid ) continue;
// otherwise print everything in the row
for ( int32_t j = 0 ; j < size ; j++ ) {
// flip j if in this page
int32_t newj = j;
//if ( m->m_page == PAGE_PRIORITIES )
// newj = size - 1 - j;
for ( int32_t k = i ;
k < m_numParms &&
m_parms[k].m_rowid == rowid;
k++ ) {
status &=printParm(sb,NULL,&m_parms[k],k,
newj,jend,(char *)THIS,coll,NULL,
bg,nc,pd, j==size-1,
isCrawlbot,format,
isMasterAdmin,
isCollAdmin,
sock);
}
}
// end array table
//if ( m->m_max > 1 ) {
// sprintf ( p , "</table><br>\n");
// p += gbstrlen ( p );
//}
}
// end "parms":[]
if ( format == FORMAT_JSON ) {
if ( m_numParms != 0 ) sb->m_length -= 2;
sb->safePrintf("\n]\n");
}
return status;
}
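// Sketch of the rowid grouping above, with hypothetical parm names:
// three consecutive parms sharing m_rowid == 5, say "expression",
// "spidersEnabled" and "priority" on the url filters page, print as
// one table row per array element j, the inner k-loop emitting one
// cell per parm in that row. Parms with m_rowid == -1 print straight
// down, one row each.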
bool Parms::printParm ( SafeBuf* sb,
//int32_t user ,
char *username,
Parm *m ,
int32_t mm , // m = &m_parms[mm]
int32_t j ,
int32_t jend ,
char *THIS ,
char *coll ,
char *pwd ,
char *bg ,
int32_t nc , // # column?
int32_t pd , // print description
bool lastRow ,
bool isCrawlbot ,
//bool isJSON ) {
char format ,
bool isMasterAdmin ,
bool isCollAdmin ,
TcpSocket *sock ) {
bool status = true;
// do not print if no permissions
//if ( m->m_perms != 0 && !g_users.hasPermission(username,m->m_perms) )
// return status;
//if ( m->m_perms != 0 && (m->m_perms & user) == 0 ) return status;
// do not print some if #define _CLIENT_ is true
//#ifdef _GLOBALSPEC_
//if ( m->m_priv == 2 ) return status;
//if ( m->m_priv == 3 ) return status;
//#elif _CLIENT_
//if ( m->m_priv ) return status;
//#elif _METALINCS_
//if ( m->m_priv == 2 ) return status;
//if ( m->m_priv == 3 ) return status;
//#endif
// priv of 4 means do not print at all
if ( m->m_priv == 4 ) return true;
// do not print comments, those are for the xml conf file
if ( m->m_type == TYPE_COMMENT ) return true;
if ( m->m_flags & PF_HIDDEN ) return true;
CollectionRec *cr = NULL;
collnum_t collnum = -1;
if ( coll ) {
cr = g_collectiondb.getRec ( coll );
if ( cr ) collnum = cr->m_collnum;
}
if ( format == FORMAT_XML || format == FORMAT_JSON ) {
// the upload button has no val, cmds too
if ( m->m_type == TYPE_FILEUPLOADBUTTON ) return true;
}
int32_t page = m->m_page;
if ( format == FORMAT_XML ) {
sb->safePrintf ( "\t<parm>\n");
sb->safePrintf ( "\t\t<title><![CDATA[");
sb->cdataEncode ( m->m_title );
sb->safePrintf ( "]]></title>\n");
sb->safePrintf ( "\t\t<desc><![CDATA[");
sb->cdataEncode ( m->m_desc );
sb->safePrintf ( "]]></desc>\n");
if ( m->m_flags & PF_REQUIRED )
sb->safePrintf("\t\t<required>1</required>\n");
sb->safePrintf ( "\t\t<cgi>%s</cgi>\n",m->m_cgi);
// and default value if it exists
char *def = m->m_def;
if ( ! def ) def = "";
sb->safePrintf ( "\t\t<defaultValue><![CDATA[");
sb->cdataEncode ( def );
sb->safePrintf ( "]]></defaultValue>\n");
if ( page == PAGE_MASTER ||
page == PAGE_SEARCH ||
page == PAGE_SPIDER ||
page == PAGE_SPIDERPROXIES ||
page == PAGE_FILTERS ||
page == PAGE_MASTERPASSWORDS ||
page == PAGE_REPAIR ||
page == PAGE_LOG ) {
sb->safePrintf ( "\t\t<currentValue><![CDATA[");
SafeBuf xb;
m->printVal ( &xb , collnum , 0 );//occNum
sb->cdataEncode ( xb.getBufStart() );
sb->safePrintf ( "]]></currentValue>\n");
}
sb->safePrintf ( "\t</parm>\n");
return true;
}
if ( format == FORMAT_JSON ) {
sb->safePrintf ( "\t{\n");
sb->safePrintf ( "\t\t\"title\":\"%s\",\n",m->m_title);
sb->safePrintf ( "\t\t\"desc\":\"");
sb->jsonEncode ( m->m_desc );
sb->safePrintf("\",\n");
if ( m->m_flags & PF_REQUIRED )
sb->safePrintf("\t\t\"required\":1,\n");
sb->safePrintf ( "\t\t\"cgi\":\"%s\",\n",m->m_cgi);
// and default value if it exists
char *def = m->m_def;
if ( ! def ) def = "";
sb->safePrintf ( "\t\t\"defaultValue\":\"");
sb->jsonEncode(def);
sb->safePrintf("\",\n");
if ( page == PAGE_MASTER ||
page == PAGE_SEARCH ||
page == PAGE_SPIDER ||
page == PAGE_SPIDERPROXIES ||
page == PAGE_FILTERS ||
page == PAGE_MASTERPASSWORDS ||
page == PAGE_REPAIR ||
page == PAGE_LOG ) {
sb->safePrintf ( "\t\t\"currentValue\":\"");
SafeBuf js;
m->printVal ( &js , collnum , 0 );//occNum );
sb->jsonEncode(js.getBufStart());
sb->safePrintf("\",\n");
}
		sb->m_length -= 2; // hack off trailing comma
sb->safePrintf("\n\t},\n");
return true;
}
// . if printing on crawlbot page hide these
// . we repeat this logic below when printing parm titles
// for the column headers in the table
//char *vt = "";
//if ( isCrawlbot &&
// m->m_page == PAGE_FILTERS &&
// (strcmp(m->m_xml,"spidersEnabled") == 0 ||
// //strcmp(m->m_xml,"maxSpidersPerRule")==0||
// //strcmp(m->m_xml,"maxSpidersPerIp") == 0||
// strcmp(m->m_xml,"spiderIpWait") == 0
// ) )
// vt = " style=display:none;";
// what type of parameter?
char t = m->m_type;
// point to the data in THIS
char *s = THIS + m->m_off + m->m_size * j ;
// if THIS is NULL then it must be GigablastRequest or something
// and is not really a persistent thing, but a one-shot deal.
if ( ! THIS ) s = NULL;
// . if an array, passed our end, this is the blank line at the end
// . USE THIS EMPTY/DEFAULT LINE TO ADD NEW DATA TO AN ARRAY
// . make at least as big as a int64_t
if ( j >= jend ) s = "\0\0\0\0\0\0\0\0";
// delimit each cgi var if we need to
if ( m->m_cgi && gbstrlen(m->m_cgi) > 45 ) {
log(LOG_LOGIC,"admin: Cgi variable is TOO big.");
char *xx = NULL; *xx = 0;
}
char cgi[64];
if ( m->m_cgi ) {
if ( j > 0 ) sprintf ( cgi , "%s%"INT32"" , m->m_cgi , j );
else sprintf ( cgi , "%s" , m->m_cgi );
// let's try dropping the index # and just doing dup parms
//sprintf ( cgi , "%s" , m->m_cgi );
}
// . display title and description of the control/parameter
// . the input cell of some parameters are colored
char *color = "";
if ( t == TYPE_CMD || t == TYPE_BOOL2 )
color = " bgcolor=#6060ff";
if ( t == TYPE_BOOL ) {
if ( *s ) color = " bgcolor=#00ff00";
else color = " bgcolor=#ff0000";
}
if ( t == TYPE_BOOL || t == TYPE_BOOL2 ) {
// disable controls not allowed in read only mode
if ( g_conf.m_readOnlyMode && m->m_rdonly )
color = " bgcolor=#ffff00";
}
bool firstInRow = false;
if ( (s_count % nc) == 0 ) firstInRow = true;
s_count++;
if ( mm > 0 && m->m_rowid >= 0 && m_parms[mm-1].m_rowid == m->m_rowid )
firstInRow = false;
int32_t firstRow = 0;
//if ( m->m_page==PAGE_PRIORITIES ) firstRow = MAX_PRIORITY_QUEUES - 1;
// . use a separate table for arrays
// . make title and description header of that table
// . do not print all headers if not m_hdrs, a special case for the
// default line in the url filters table
if ( j == firstRow && m->m_rowid >= 0 && firstInRow && m->m_hdrs ) {
// print description as big comment
if ( m->m_desc && pd == 1 ) {
// url FILTERS table description row
sb->safePrintf ( "<td colspan=20 bgcolor=#%s>"
"<font size=-1>\n" , DARK_BLUE);
//p = htmlEncode ( p , pend , m->m_desc ,
// m->m_desc + gbstrlen ( m->m_desc ) );
sb->safePrintf ( "%s" , m->m_desc );
sb->safePrintf ( "</font></td></tr>"
// for "#,expression,harvestlinks.."
// header row in url FILTERS table
"<tr bgcolor=#%s>\n" ,DARK_BLUE);
}
// # column
// do not show this for PAGE_PRIORITIES it is confusing
if ( m->m_max > 1 ) {
//m->m_page != PAGE_PRIORITIES ) {
sb->safePrintf ( "<td><b>#</b></td>\n" );
}
// print all headers
for ( int32_t k = mm ;
k<m_numParms && m_parms[k].m_rowid==m->m_rowid; k++ ) {
			// parm shortcut
Parm *mk = &m_parms[k];
// not if printing json
//if ( format != FORMAT_HTML )continue;//isJSON )
// skip if hidden
if ( cr && ! cr->m_isCustomCrawl &&
(mk->m_flags & PF_DIFFBOT) )
continue;
// . hide table column headers that are too advanced
// . we repeat this logic above for the actual parms
//char *vt = "";
//if ( isCrawlbot &&
// m->m_page == PAGE_FILTERS &&
// (strcmp(mk->m_xml,"spidersEnabled") == 0 ||
// //strcmp(mk->m_xml,"maxSpidersPerRule")==0||
// //strcmp(mk->m_xml,"maxSpidersPerIp") == 0||
// strcmp(mk->m_xml,"spiderIpWait") == 0 ) )
// vt = " style=display:none;display:none;";
//sb->safePrintf ( "<td%s>" , vt );
sb->safePrintf ( "<td>" );
// if its of type checkbox in a table make it
// toggle them all on/off
if ( mk->m_type == TYPE_CHECKBOX &&
mk->m_page == PAGE_FILTERS ) {
sb->safePrintf("<a href=# "
"onclick=\"checkAll(this, "
"'id_%s', %"INT32");\">",
m_parms[k].m_cgi, m->m_max);
}
sb->safePrintf ( "<b>%s</b>", m_parms[k].m_title );
if ( mk->m_type == TYPE_CHECKBOX &&
mk->m_page == PAGE_FILTERS )
sb->safePrintf("</a>");
/*
if ( m->m_page == PAGE_PRIORITIES &&
m_parms[k].m_type == TYPE_CHECKBOX)
sb->safePrintf("<br><a href=# "
"onclick=\"checkAll(this, "
"'id_%s', %"INT32");\">(toggle)</a>",
m_parms[k].m_cgi, m->m_max);
*/
sb->safePrintf ("</td>\n");
}
//if ( format == FORMAT_HTML )
sb->safePrintf ( "</tr>\n" ); // mdw added
}
// skip if hidden. diffbot api url only for custom crawls.
//if(cr && ! cr->m_isCustomCrawl && (m->m_flags & PF_DIFFBOT) )
// return true;
// print row start for single parm
if ( m->m_max <= 1 && ! m->m_hdrs ) {
if ( firstInRow ) {
sb->safePrintf ( "<tr bgcolor=#%s><td>" , bg );
}
sb->safePrintf ( "<td width=%"INT32"%%>" , 100/nc/2 );
}
	// if parm value is not the default, use orange!
char rr[1024];
SafeBuf val1(rr,1024);
if ( m->m_type != TYPE_FILEUPLOADBUTTON )
m->printVal ( &val1 , collnum , j ); // occNum );
// test it
if ( m->m_def &&
m->m_obj != OBJ_NONE &&
m->m_obj != OBJ_IR && // do not do for injectionrequest
m->m_obj != OBJ_GBREQUEST && // do not do for GigablastRequest
strcmp ( val1.getBufStart() , m->m_def ) )
// put non-default valued parms in orange!
bg = "ffa500";
// print the title/description in current table for non-arrays
if ( m->m_max <= 1 && m->m_hdrs ) { // j == 0 && m->m_rowid < 0 ) {
if ( firstInRow )
sb->safePrintf ( "<tr bgcolor=#%s>",bg);
if ( t == TYPE_STRINGBOX ) {
sb->safePrintf ( "<td colspan=2><center>"
"<b>%s</b><br><font size=-1>",m->m_title );
if ( pd ) {
status &= sb->htmlEncode (m->m_desc,
gbstrlen(m->m_desc),
false);
// is it required?
if ( m->m_flags & PF_REQUIRED )
sb->safePrintf(" <b><font color=green>"
"REQUIRED</font></b>");
}
sb->safePrintf ( "</font><br>\n" );
}
if ( t != TYPE_STRINGBOX ) {
// this td will be invisible if isCrawlbot and the
// parm is too advanced to display
sb->safePrintf ( "<td " );
if ( m->m_colspan > 0 )
sb->safePrintf ( "colspan=%"INT32" ",
(int32_t)m->m_colspan);
sb->safePrintf ( "width=%"INT32"%%>"//"<td width=78%%>
"<b>%s</b><br><font size=1>",
3*100/nc/2/4, m->m_title );
// the "site list" parm has html in description
if ( pd ) {
status &= sb->safeStrcpy(m->m_desc);
//status &= sb->htmlEncode (m->m_desc,
// gbstrlen(m->m_desc),
// false);
// is it required?
if ( m->m_flags & PF_REQUIRED )
sb->safePrintf(" <b><font color=green>"
"REQUIRED</font></b>");
// print users current ip if showing the list
// of "Master IPs" for admin access
if ( ( m->m_page == PAGE_MASTERPASSWORDS ||
m->m_page == PAGE_COLLPASSWORDS ) &&
sock &&
m->m_title &&
strstr(m->m_title,"IP") )
sb->safePrintf(" <b>Your current IP "
"is %s.</b>",
iptoa(sock->m_ip));
}
// and cgi parm if it exists
//if ( m->m_def && m->m_scgi )
// sb->safePrintf(" CGI override: %s.",m->m_scgi);
// just let them see the api page for this...
//sb->safePrintf(" CGI: %s.",m->m_cgi);
// and default value if it exists
if ( m->m_def && m->m_def[0] && t != TYPE_CMD ) {
char *d = m->m_def;
if ( t == TYPE_BOOL || t == TYPE_CHECKBOX ) {
if ( d[0]=='0' ) d = "NO";
else d = "YES";
sb->safePrintf ( " <nobr>"
"Default: %s."
"</nobr>",d);
}
else {
sb->safePrintf (" Default: ");
status &= sb->htmlEncode (d,
gbstrlen(d),
false);
}
}
sb->safePrintf ( "</font></td>\n<td%s width=%"INT32"%%>" ,
color , 100/nc/2/4 );
}
}
	// . print the row number (0-based) if this parm is an array
// . used for url filters table, etc.
if ( m->m_max > 1 ) {
// bg color alternates
char *bgc = LIGHT_BLUE;
if ( j % 2 ) bgc = DARK_BLUE;
// do not print this if doing json
//if ( format != FORMAT_HTML );//isJSON ) ;
// but if it is in same row as previous, do not repeat it
// for this same row, silly
if ( firstInRow ) // && m->m_page != PAGE_PRIORITIES )
sb->safePrintf ( "<tr bgcolor=#%s>"
"<td>%"INT32"</td>\n<td>",
bgc,
j );//j+1
else if ( firstInRow )
sb->safePrintf ( "<tr><td>" );
else
//sb->safePrintf ( "<td%s>" , vt);
sb->safePrintf ( "<td>" );
}
//int32_t cast = m->m_cast;
//if ( g_proxy.isProxy() ) cast = 0;
// print the input box
if ( t == TYPE_BOOL ) {
char *tt, *v;
if ( *s ) { tt = "YES"; v = "0"; }
else { tt = "NO" ; v = "1"; }
if ( g_conf.m_readOnlyMode && m->m_rdonly )
sb->safePrintf ( "<b>read-only mode</b>" );
// if cast=1, command IS broadcast to all hosts
else
sb->safePrintf ( "<b><a href=\"/%s?c=%s&"
"%s=%s\">" // &cast=%"INT32"\">"
"<center>%s</center></a></b>",
g_pages.getPath(m->m_page),coll,
cgi,v,//cast,
tt);
}
else if ( t == TYPE_BOOL2 ) {
if ( g_conf.m_readOnlyMode && m->m_rdonly )
sb->safePrintf ( "<b><center>read-only mode"
"</center></b>");
// always use m_def as the value for TYPE_BOOL2
else
sb->safePrintf ( "<b><a href=\"/%s?c=%s&%s=%s\">"
//"cast=1\">"
"<center>%s</center></a></b>",
g_pages.getPath(m->m_page),coll,
cgi,m->m_def, m->m_title);
}
else if ( t == TYPE_CHECKBOX ) {
//char *ddd1 = "";
//char *ddd2 = "";
//if ( *s ) ddd1 = " checked";
//else ddd2 = " checked";
// just show the parm name and value if printing in json
// if ( format == FORMAT_JSON ) { // isJSON ) {
// if ( ! lastRow ) {
// int32_t val = 0;
// if ( *s ) val = 1;
// sb->safePrintf("\"%s\":%"INT32",\n",cgi,val);
// }
// }
//sb->safePrintf("<center><nobr>");
sb->safePrintf("<nobr>");
// this is part of the "HACK" fix below. you have to
// specify the cgi parm in the POST request, and
// unchecked checkboxes are not included in the POST
// request.
//if ( lastRow && m->m_page == PAGE_FILTERS )
// sb->safePrintf("<input type=hidden ");
//char *val = "Y";
//if ( ! *s ) val = "N";
char *val = "";
// "s" is invalid of parm has no "object"
if ( m->m_obj == OBJ_NONE && m->m_def[0] != '0' )
val = " checked";
if ( m->m_obj != OBJ_NONE && s && *s )
val = " checked";
// s is NULL for GigablastRequest parms
if ( ! s && m->m_def && m->m_def[0]=='1' )
val = " checked";
// in case it is not checked, submit that!
// if it gets checked this should be overridden then
sb->safePrintf("<input type=hidden name=%s value=0>"
, cgi );
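		// illustrative html this emits, assuming a cgi name of "cse":
		//   <input type=hidden name=cse value=0>
		//   <input type=checkbox value=1 name=cse checked>
		// browsers omit unchecked checkboxes from the POST, so the
		// hidden "0" always submits; when checked, the later "1"
		// overrides it because parms are set in request order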
//else
sb->safePrintf("<input type=checkbox value=1 ");
//"<nobr><input type=button ");
if ( m->m_page == PAGE_FILTERS)
sb->safePrintf("id=id_%s ",cgi);
sb->safePrintf("name=%s%s"
//" onmouseup=\""
//"if ( this.value=='N' ) {"
//"this.value='Y';"
//"} "
//"else if ( this.value=='Y' ) {"
//"this.value='N';"
//"}"
//"\" "
">"
,cgi
,val);//,ddd);
//
// repeat for off position
//
//if ( ! lastRow || m->m_page != PAGE_FILTERS ) {
// sb->safePrintf(" Off:<input type=radio ");
// if ( m->m_page == PAGE_FILTERS)
// sb->safePrintf("id=id_%s ",cgi);
// sb->safePrintf("value=0 name=%s%s>",
// cgi,ddd2);
//}
sb->safePrintf("</nobr>"
//"</center>"
);
}
else if ( t == TYPE_CHAR )
sb->safePrintf ("<input type=text name=%s value=\"%"INT32"\" "
"size=3>",cgi,(int32_t)(*s));
/* else if ( t == TYPE_CHAR2 )
sprintf (p,"<input type=text name=%s value=\"%"INT32"\" "
"size=3>",cgi,*(char*)s);*/
else if ( t == TYPE_PRIORITY )
printDropDown ( MAX_SPIDER_PRIORITIES , sb , cgi , *s ,
false , false );
else if ( t == TYPE_PRIORITY2 ) {
// just show the parm name and value if printing in json
// if ( format==FORMAT_JSON) // isJSON )
// sb->safePrintf("\"%s\":%"INT32",\n",cgi,(int32_t)*(char *)s);
// else
printDropDown ( MAX_SPIDER_PRIORITIES , sb , cgi , *s ,
true , true );
}
// this url filters parm is an array of SAFEBUFs now, so each is
// a string and that string is the diffbot api url to use.
// the string is empty or zero length to indicate none.
//else if ( t == TYPE_DIFFBOT_DROPDOWN ) {
// char *xx=NULL;*xx=0;
//}
//else if ( t == TYPE_UFP )
else if ( t == TYPE_SAFEBUF &&
strcmp(m->m_title,"url filters profile")==0)
// url filters profile drop down "ufp"
printDropDownProfile ( sb , "ufp" , cr );//*s );
// do not expose master passwords or IPs to non-root admins
else if ( ( m->m_flags & PF_PRIVATE ) &&
m->m_obj == OBJ_CONF &&
! isMasterAdmin )
return true;
// do not expose master passwords or IPs to non-root admins
else if ( ( m->m_flags & PF_PRIVATE ) &&
m->m_obj == OBJ_COLL &&
! isCollAdmin )
return true;
else if ( t == TYPE_RETRIES )
printDropDown ( 4 , sb , cgi , *s , false , false );
else if ( t == TYPE_FILEUPLOADBUTTON ) {
sb->safePrintf("<input type=file name=%s>",cgi);
}
else if ( t == TYPE_PRIORITY_BOXES ) {
// print ALL the checkboxes when we get the first parm
if ( j != 0 ) return status;
printCheckBoxes ( MAX_SPIDER_PRIORITIES , sb , cgi , s );
}
else if ( t == TYPE_CMD )
// if cast=0 it will be executed, otherwise it will be
// broadcasted with cast=1 to all hosts and they will all
// execute it
sb->safePrintf ( "<b><a href=\"/%s?c=%s&%s=1\">" // cast=%"INT32"
"<center>%s</center></a></b>",
g_pages.getPath(m->m_page),coll,
cgi,m->m_title);
else if ( t == TYPE_FLOAT ) {
// just show the parm name and value if printing in json
// if ( format == FORMAT_JSON )//isJSON )
// sb->safePrintf("\"%s\":%f,\n",cgi,*(float *)s);
// else
sb->safePrintf ("<input type=text name=%s "
"value=\"%f\" "
// 3 was ok on firefox but need 6
// on chrome
"size=7>",cgi,*(float *)s);
}
else if ( t == TYPE_IP ) {
if ( m->m_max > 0 && j == jend )
sb->safePrintf ("<input type=text name=%s value=\"\" "
"size=12>",cgi);
else
sb->safePrintf ("<input type=text name=%s value=\"%s\" "
"size=12>",cgi,iptoa(*(int32_t *)s));
}
else if ( t == TYPE_LONG ) {
// just show the parm name and value if printing in json
// if ( format == FORMAT_JSON ) // isJSON )
// sb->safePrintf("\"%s\":%"INT32",\n",cgi,*(int32_t *)s);
// else
sb->safePrintf ("<input type=text name=%s "
"value=\"%"INT32"\" "
// 3 was ok on firefox but need 6
// on chrome
"size=6>",cgi,*(int32_t *)s);
}
else if ( t == TYPE_LONG_CONST )
sb->safePrintf ("%"INT32"",*(int32_t *)s);
else if ( t == TYPE_LONG_LONG )
sb->safePrintf ("<input type=text name=%s value=\"%"INT64"\" "
"size=12>",cgi,*(int64_t *)s);
else if ( t == TYPE_STRING || t == TYPE_STRINGNONEMPTY ) {
int32_t size = m->m_size;
// give regular expression box on url filters page more room
//if ( m->m_page == PAGE_FILTERS ) {
// if ( size > REGEX_TXT_MAX ) size = REGEX_TXT_MAX;
//}
//else {
if ( size > 20 ) size = 20;
//}
sb->safePrintf ("<input type=text name=%s size=%"INT32" value=\"",
cgi,size);
		// if it has PF_COLLDEFAULT flag set then use the coll
if ( cr && (m->m_flags & PF_COLLDEFAULT) )
sb->safePrintf("%s",cr->m_coll);
else
sb->dequote ( s , gbstrlen(s) );
sb->safePrintf ("\">");
}
// HACK: print a drop down not a textbox for selecting the
// m_spiderDiffbotApiUrl[]. we can't just store this selection
// as a number because m_diffbotApiList (a string of comma separated
// items to select from) can change! it is not a typical dropdown.
// so we have to record the actual text we selected, which is
// basically the diffbot api url. this is because john can add
// custom diffbot api urls at anytime to the list.
/*
else if ( t == TYPE_SAFEBUF && strcmp(m->m_cgi,"dapi") == 0 ) {
SafeBuf *sx = (SafeBuf *)s;
// just show the parm name and value if printing in json
if ( isJSON ) {
// this can be empty for the empty row i guess
if ( sx->length() ) {
// convert diffbot # to string
sb->safePrintf("\"%s\":\"",cgi);
// this is just the url path, not the title
// of the menu option... so this would be
// like "/api/article?u="
sb->safeUtf8ToJSON (sx->getBufStart() );
sb->safePrintf("\",\n");
}
}
else
printDiffbotDropDown ( sb , cgi , THIS , sx );
}
*/
else if ( t == TYPE_CHARPTR ) {
int32_t size = m->m_size;
char *sp = NULL;
if ( s && *s ) sp = *(char **)s;
if ( ! sp ) sp = "";
if ( m->m_flags & PF_TEXTAREA ) {
sb->safePrintf ("<textarea name=%s rows=10 cols=80>",
cgi);
if ( m->m_obj != OBJ_NONE )
sb->htmlEncode(sp,gbstrlen(sp),false);
sb->safePrintf ("</textarea>");
}
else {
sb->safePrintf ("<input type=text name=%s size=%"INT32" "
"value=\"",cgi,size);
			// if it has PF_COLLDEFAULT flag set then use the coll
if ( cr && (m->m_flags & PF_COLLDEFAULT) )
sb->safePrintf("%s",cr->m_coll);
else if ( sp )
sb->dequote ( sp , gbstrlen(sp) );
sb->safePrintf ("\">");
}
}
else if ( t == TYPE_SAFEBUF ) {
int32_t size = m->m_size;
// give regular expression box on url filters page more room
if ( m->m_page == PAGE_FILTERS ) {
//if ( size > REGEX_TXT_MAX ) size = REGEX_TXT_MAX;
size = 40;
}
else {
if ( size > 20 ) size = 20;
}
SafeBuf *sx = (SafeBuf *)s;
SafeBuf tmp;
// if printing a parm in a one-shot deal like GigablastRequest
// then s and sx will always be NULL, so set to default
if ( ! sx ) {
sx = &tmp;
char *def = m->m_def;
			// if it has PF_COLLDEFAULT flag set then use the coll
if ( cr && (m->m_flags & PF_COLLDEFAULT) )
def = cr->m_coll;
tmp.safePrintf("%s",def);
}
// just show the parm name and value if printing in json
// if ( format == FORMAT_JSON ) { // isJSON ) {
// // this can be empty for the empty row i guess
// if ( sx->length() ) {
// // convert diffbot # to string
// sb->safePrintf("\"%s\":\"",cgi);
// if ( m->m_obj != OBJ_NONE )
// sb->safeUtf8ToJSON (sx->getBufStart());
// sb->safePrintf("\",\n");
// }
// }
if ( m->m_flags & PF_TEXTAREA ) {
int rows = 10;
if ( m->m_flags & PF_SMALLTEXTAREA )
rows = 4;
sb->safePrintf ("<textarea id=tabox "
"name=%s rows=%i cols=80>",
cgi,rows);
//sb->dequote ( s , gbstrlen(s) );
// note it
//log("hack: %s",sx->getBufStart());
//sb->dequote ( sx->getBufStart() , sx->length() );
if ( m->m_obj != OBJ_NONE )
sb->htmlEncode(sx->getBufStart(),
sx->length(),false);
sb->safePrintf ("</textarea>");
}
else {
sb->safePrintf ("<input type=text name=%s size=%"INT32" "
"value=\"",
cgi,size);
//sb->dequote ( s , gbstrlen(s) );
// note it
//log("hack: %s",sx->getBufStart());
if ( cr &&
(m->m_flags & PF_COLLDEFAULT) &&
sx &&
sx->length() <= 0 )
sb->dequote ( cr->m_coll,gbstrlen(cr->m_coll));
			// if parm is OBJ_NONE there is no stored value
else if ( m->m_obj != OBJ_NONE )
sb->dequote ( sx->getBufStart(), sx->length());
sb->safePrintf ("\">");
}
}
else if ( t == TYPE_STRINGBOX ) {
sb->safePrintf("<textarea id=tabox rows=10 cols=64 name=%s>",
cgi);
//p += urlEncode ( p , pend - p , s , gbstrlen(s) );
//p += htmlDecode ( p , s , gbstrlen(s) );
sb->htmlEncode ( s , gbstrlen(s), false );
//sprintf ( p , "%s" , s );
//p += gbstrlen(p);
sb->safePrintf ("</textarea>\n");
}
else if ( t == TYPE_CONSTANT )
sb->safePrintf ("%s",m->m_title);
else if ( t == TYPE_MONOD2 )
sb->safePrintf ("%"INT32"",j / 2 );
else if ( t == TYPE_MONOM2 ) {
/*
if ( m->m_page == PAGE_PRIORITIES ) {
if ( j % 2 == 0 ) sb->safePrintf ("old");
else sb->safePrintf ("new");
}
else
*/
sb->safePrintf ("%"INT32"",j % 2 );
}
else if ( t == TYPE_RULESET ) ;
// subscript is already included in "cgi"
//g_pages.printRulesetDropDown ( sb ,
// user ,
// cgi ,
// *(int32_t *)s , // selected
// -1 ); // subscript
else if ( t == TYPE_TIME ) {
		// time is stored as a string like "13:45" (hh:mm)
		// if time is not stored properly, just write 00:00
if ( s[2] != ':' )
strncpy ( s, "00:00", 5 );
char hr[3];
char min[3];
gbmemcpy ( hr, s, 2 );
gbmemcpy ( min, s + 3, 2 );
hr[2] = '\0';
min[2] = '\0';
// print the time in the input forms
sb->safePrintf("<input type=text name=%shr size=2 "
"value=%s>h "
"<input type=text name=%smin size=2 "
"value=%s>m " ,
cgi ,
hr ,
cgi ,
min );
}
else if ( t == TYPE_DATE || t == TYPE_DATE2 ) {
		// time is stored as int32_t
		int32_t ct = *(int32_t *)s;
		// widen to a real time_t first; casting &ct straight to
		// (time_t *) reads 8 bytes from a 4-byte int on 64-bit builds
		time_t ct2 = (time_t)ct;
		// get the time struct
		struct tm *tp = gmtime ( &ct2 ) ;
// set the "selected" month for the drop down
char *ss[12];
for ( int32_t i = 0 ; i < 12 ; i++ ) ss[i]="";
int32_t month = tp->tm_mon;
if ( month < 0 || month > 11 ) month = 0; // Jan
ss[month] = " selected";
// print the date in the input forms
sb->safePrintf(
"<input type=text name=%sday "
"size=2 value=%"INT32"> "
"<select name=%smon>"
"<option value=0%s>Jan"
"<option value=1%s>Feb"
"<option value=2%s>Mar"
"<option value=3%s>Apr"
"<option value=4%s>May"
"<option value=5%s>Jun"
"<option value=6%s>Jul"
"<option value=7%s>Aug"
"<option value=8%s>Sep"
"<option value=9%s>Oct"
"<option value=10%s>Nov"
"<option value=11%s>Dec"
"</select>\n"
"<input type=text name=%syr size=4 value=%"INT32">"
"<br>"
"<input type=text name=%shr size=2 "
"value=%02"INT32">h "
"<input type=text name=%smin size=2 "
"value=%02"INT32">m "
"<input type=text name=%ssec size=2 "
"value=%02"INT32">s" ,
cgi ,
(int32_t)tp->tm_mday ,
cgi ,
ss[0],ss[1],ss[2],ss[3],ss[4],ss[5],ss[6],ss[7],ss[8],
ss[9],ss[10],ss[11],
cgi ,
(int32_t)tp->tm_year + 1900 ,
cgi ,
(int32_t)tp->tm_hour ,
cgi ,
(int32_t)tp->tm_min ,
cgi ,
(int32_t)tp->tm_sec );
/*
if ( t == TYPE_DATE2 ) {
p += gbstrlen ( p );
// a int32_t after the int32_t is used for this
int32_t ct = *(int32_t *)(THIS+m->m_off+4);
char *ss = "";
if ( ct ) ss = " checked";
sprintf ( p , "<br><input type=checkbox "
"name=%sct value=1%s> use current "
"time\n",cgi,ss);
}
*/
}
else if ( t == TYPE_SITERULE ) {
// print the siterec rules as a drop down
char *ss[5];
for ( int32_t i = 0; i < 5; i++ ) ss[i] = "";
int32_t v = *(int32_t*)s;
if ( v < 0 || v > 4 ) v = 0;
ss[v] = " selected";
sb->safePrintf ( "<select name=%s>"
"<option value=0%s>Hostname"
"<option value=1%s>Path Depth 1"
"<option value=2%s>Path Depth 2"
"<option value=3%s>Path Depth 3"
"</select>\n",
cgi, ss[0], ss[1], ss[2], ss[3] );
}
// end the input cell
sb->safePrintf ( "</td>\n");
// "insert above" link? used for arrays only, where order matters
if ( m->m_addin && j < jend ) {//! isJSON ) {
sb->safePrintf ( "<td><a href=\"?c=%s&" // cast=1&"
//"ins_%s=1\">insert</td>\n",coll,cgi );
// insert=<rowNum>
// "j" is the row #
"insert=%"INT32"\">insert</td>\n",coll,j );
}
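	// (illustrative: for row 2 of a collection named "main" the link
	//  above renders as "?c=main&insert=2")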
// does next guy start a new row?
bool lastInRow = true; // assume yes
if (mm+1<m_numParms&&m->m_rowid>=0&&m_parms[mm+1].m_rowid==m->m_rowid)
lastInRow = false;
if ( ((s_count-1) % nc) != (nc-1) ) lastInRow = false;
// . display the remove link for arrays if we need to
// . but don't display if next guy does NOT start a new row
//if ( m->m_max > 1 && lastInRow && ! isJSON ) {
if ( m->m_addin && j < jend ) { //! isJSON ) {
// m->m_page != PAGE_PRIORITIES ) {
// show remove link?
bool show = true;
//if ( j >= jend ) show = false;
// get # of rows
int32_t *nr = (int32_t *)((char *)THIS + m->m_off - 4);
// are we the last row?
bool lastRow = false;
// yes, if this is true
if ( j == *nr - 1 ) lastRow = true;
// do not allow removal of last default url filters rule
//if ( lastRow && !strcmp(m->m_cgi,"fsp")) show = false;
char *suffix = "";
if ( m->m_page == PAGE_MASTERPASSWORDS &&
m->m_type == TYPE_IP )
suffix = "ip";
if ( m->m_page == PAGE_MASTERPASSWORDS &&
m->m_type == TYPE_STRINGNONEMPTY )
suffix = "pwd";
if ( show )
sb->safePrintf ("<td><a href=\"?c=%s&" // cast=1&"
//"rm_%s=1\">"
// remove=<rownum>
"remove%s=%"INT32"\">"
"remove</a></td>\n",coll,//cgi );
suffix,
j); // j is row #
else
sb->safePrintf ( "<td></td>\n");
}
if ( lastInRow ) sb->safePrintf ("</tr>\n");
return status;
}
/*
// get the object of our desire
char *Parms::getTHIS ( HttpRequest *r , int32_t page ) {
// if not master controls, must be a collection rec
//if ( page < PAGE_CGIPARMS ) return (char *)&g_conf;
char *coll = r->getString ( "c" );
// support john wanting to use "id" for the crawl id which is really
// the collection id, hopefully won't conflict with other things.
if ( ! coll ) coll = r->getString ( "id" );
if ( ! coll || ! coll[0] )
//coll = g_conf.m_defaultColl;
coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
CollectionRec *cr = g_collectiondb.getRec ( coll );
if ( ! cr ) log("admin: Collection \"%s\" not found.",
r->getString("c") );
return (char *)cr;
}
*/
// now we use this to set SearchInput and GigablastRequest
bool Parms::setFromRequest ( HttpRequest *r ,
TcpSocket* s,
CollectionRec *newcr ,
char *THIS ,
int32_t objType ) {
// get the page from the path... like /sockets --> PAGE_SOCKETS
//int32_t page = g_pages.getDynamicPageNumber ( r );
// use convertHttpRequestToParmList() for these because they
// are persistent records that are updated on every shard.
if ( objType == OBJ_COLL ) { char *xx=NULL;*xx=0; }
if ( objType == OBJ_CONF ) { char *xx=NULL;*xx=0; }
// ensure valid
if ( ! THIS ) {
// it is null when no collection explicitly specified...
log(LOG_LOGIC,"admin: THIS is null for setFromRequest");
char *xx=NULL;*xx=0;
}
// need this for searchInput which takes default from "cr"
//CollectionRec *cr = g_collectiondb.getRec ( r , true );
// no SearchInput.cpp does this and then overrides if xml feed
// to set m_docsToScanForTopics
//setToDefault ( THIS , objType , cr );
// loop through cgi parms
for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
// get cgi parm name
char *field = r->getField ( i );
// find in parms list
int32_t j;
Parm *m;
for ( j = 0 ; j < m_numParms ; j++ ) {
// get it
m = &m_parms[j];
// skip if not our type
if ( m->m_obj != objType ) continue;
// skip if offset is negative, that means none
if ( m->m_off < 0 ) continue;
// skip if no cgi parm, may not be configurable now
if ( ! m->m_cgi ) continue;
// otherwise, must match the cgi name exactly
if ( strcmp ( field,m->m_cgi ) == 0 ) break;
}
// bail if the cgi field is not in the parms list
if ( j >= m_numParms ) continue;
// get the value of cgi parm (null terminated)
char *v = r->getValue ( i );
// empty?
if ( ! v ) continue;
// . skip if no value was provided
// . unless it was a string! so we can make them empty.
if ( v[0] == '\0' &&
m->m_type != TYPE_STRING &&
m->m_type != TYPE_STRINGBOX ) continue;
// set it
setParm ( (char *)THIS , m, j, 0, v, false,//not html enc
false );//true );
}
return true;
}
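// . array parms keep their in-use element count in the 4 bytes directly
//   before the array data itself. a minimal sketch of the layout,
//   assuming a hypothetical int32_t array parm (m_size = 4):
//
//     THIS + m_off - 4  : int32_t count (# of elements in use)
//     THIS + m_off + 0  : element 0
//     THIS + m_off + 4  : element 1
//     THIS + m_off + 4j : element j, i.e. THIS + m_off + m_size * j
//
// . insertParm()/removeParm() below memmove() elements within that block
//   and bump or drop the count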
bool Parms::insertParm ( int32_t i , int32_t an , char *THIS ) {
Parm *m = &m_parms[i];
// . shift everyone above down
// . first int32_t at offset is always the count
// for arrays
char *pos = (char *)THIS + m->m_off ;
int32_t num = *(int32_t *)(pos - 4);
// ensure we are valid
if ( an >= num || an < 0 ) {
log("admin: Invalid insertion of element "
"%"INT32" in array of size %"INT32" for \"%s\".",
an,num,m->m_title);
return false;
}
// also ensure that we have space to put the parm in, because in
// case of URl filters, it is bounded by MAX_FILTERS
if ( num >= MAX_FILTERS ){
log("admin: Invalid insert of element %"INT32", array is full "
"in size %"INT32" for \"%s\".",an, num, m->m_title);
return false;
}
// point to the place where the element is to be inserted
char *src = pos + m->m_size * an;
//point to where it is to be moved
char *dst = pos + m->m_size * ( an + 1 );
// how much to move
int32_t size = ( num - an ) * m->m_size ;
// move them
memmove ( dst , src , size );
// if the src was a TYPE_SAFEBUF clear it so we don't end up doing
// a double free, etc.!
memset ( src , 0 , m->m_size );
// inc the count
*(int32_t *)(pos-4) = (*(int32_t *)(pos-4)) + 1;
// put the defaults in the inserted line
setParm ( (char *)THIS , m , i , an , m->m_def , false ,false );
return true;
}
bool Parms::removeParm ( int32_t i , int32_t an , char *THIS ) {
Parm *m = &m_parms[i];
// . shift everyone above down
// . first int32_t at offset is always the count
// for arrays
char *pos = (char *)THIS + m->m_off ;
int32_t num = *(int32_t *)(pos - 4);
// ensure we are valid
if ( an >= num || an < 0 ) {
log("admin: Invalid removal of element "
"%"INT32" in array of size %"INT32" for \"%s\".",
an,num,m->m_title);
return false;
}
// point to the element being removed
char *dst = pos + m->m_size * an;
// free memory pointed to by safebuf, if we are safebuf, before
// overwriting it... prevents a memory leak
if ( m->m_type == TYPE_SAFEBUF ) {
SafeBuf *dx = (SafeBuf *)dst;
dx->purge();
}
	// then point to the good stuff
char *src = pos + m->m_size * (an+1);
// how much to bury it with
int32_t size = (num - an - 1 ) * m->m_size ;
// bury it
gbmemcpy ( dst , src , size );
// and detach the buf on the tail so it doesn't core in Mem.cpp
// when it tries to free...
if ( m->m_type == TYPE_SAFEBUF ) {
SafeBuf *tail = (SafeBuf *)(pos + m->m_size * (num-1));
tail->detachBuf();
}
// dec the count
*(int32_t *)(pos-4) = (*(int32_t *)(pos-4)) - 1;
return true;
}
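// . a minimal usage sketch for setParm() (illustrative; "se" is the
//   spider-enable cgi name referenced elsewhere in this file):
//
//     Parm *m = g_parms.getParm ( "se" );
//     int32_t mm = m - g_parms.m_parms;     // index of m in m_parms[]
//     g_parms.setParm ( (char *)cr , m , mm , 0 , "1" , false , false );
//
// . that writes atol("1") into the collection rec at m->m_off. note the
//   sanity check below forbids fromRequest=true; live parm updates go
//   through the parm rec machinery instead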
void Parms::setParm ( char *THIS , Parm *m , int32_t mm , int32_t j , char *s ,
bool isHtmlEncoded , bool fromRequest ) {
if ( fromRequest ) { char *xx=NULL;*xx=0; }
// . this is just for setting CollectionRecs, so skip if offset < 0
// . some parms are just for SearchInput (search parms)
if ( m->m_off < 0 ) return;
if ( m->m_obj == OBJ_NONE ) return ;
float oldVal = 0;
float newVal = 0;
if ( ! s &&
m->m_type != TYPE_CHARPTR &&
m->m_type != TYPE_FILEUPLOADBUTTON &&
m->m_defOff==-1) {
s = "0";
char *tit = m->m_title;
if ( ! tit || ! tit[0] ) tit = m->m_xml;
log(LOG_LOGIC,"admin: Parm \"%s\" had NULL default value. "
"Forcing to 0.",
tit);
//char *xx = NULL; *xx = 0;
}
// sanity check
if ( &m_parms[mm] != m ) {
log(LOG_LOGIC,"admin: Not sane parameters.");
char *xx = NULL; *xx = 0;
}
// if attempting to add beyond array max, bail out
if ( j >= m->m_max && j >= m->m_fixed ) {
log ( "admin: Attempted to set parm beyond limit. Aborting." );
return;
}
// if we are setting a guy in an array AND he is NOT the first
// in his row, ensure the guy before has a count of j+1 or more.
//
// crap, on the url filters page if you do not check "spidering
// enabled" checkbox when adding a new rule at the bottom of the
// table, , then the spidering enabled parameter does not transmit so
// the "respider frequency" ends up checking the "spider enabled"
// array whose "count" was not incremented like it should have been.
// HACK: make new line at bottom always have spidering enabled
// checkbox set and make it impossible to unset.
/*
if ( m->m_max > 1 && m->m_rowid >= 0 && mm > 0 &&
m_parms[mm-1].m_rowid == m->m_rowid ) {
char *pos = (char *)THIS + m_parms[mm-1].m_off - 4 ;
int32_t maxcount = *(int32_t *)pos;
if ( j >= maxcount ) {
log("admin: parm before \"%s\" is limiting us",
m_parms[mm-1].m_title);
//log("admin: try nuking the url filters or whatever "
// "and re-adding");
return;
}
}
*/
// ensure array count at least j+1
if ( m->m_max > 1 ) {
// . is this element we're adding bumping up the count?
// . array count is 4 bytes before the array
char *pos = (char *)THIS + m->m_off - 4 ;
// set the count to it if it is bigger than current count
if ( j + 1 > *(int32_t *)pos ) *(int32_t *)pos = j + 1;
}
char t = m->m_type;
if ( t == TYPE_CHAR ||
t == TYPE_CHAR2 ||
t == TYPE_CHECKBOX ||
t == TYPE_BOOL ||
t == TYPE_BOOL2 ||
t == TYPE_PRIORITY ||
t == TYPE_PRIORITY2 ||
//t == TYPE_DIFFBOT_DROPDOWN ||
t == TYPE_UFP ||
t == TYPE_PRIORITY_BOXES ||
t == TYPE_RETRIES ||
t == TYPE_FILTER ) {
if ( fromRequest && *(char *)(THIS + m->m_off + j) == atol(s))
return;
if ( fromRequest)oldVal = (float)*(char *)(THIS + m->m_off +j);
*(char *)(THIS + m->m_off + j) = atol ( s );
newVal = (float)*(char *)(THIS + m->m_off + j);
goto changed; }
else if ( t == TYPE_CHARPTR ) {
// "s" might be NULL or m->m_def...
*(char **)(THIS + m->m_off + j) = s;
}
else if ( t == TYPE_FILEUPLOADBUTTON ) {
// "s" might be NULL or m->m_def...
*(char **)(THIS + m->m_off + j) = s;
}
else if ( t == TYPE_CMD ) {
log(LOG_LOGIC, "conf: Parms: TYPE_CMD is not a cgi var.");
return; }
else if ( t == TYPE_DATE2 || t == TYPE_DATE ) {
int32_t v = (int32_t)atotime ( s );
if ( fromRequest && *(int32_t *)(THIS + m->m_off + 4*j) == v )
return;
*(int32_t *)(THIS + m->m_off + 4*j) = v;
if ( v < 0 ) log("conf: Date for <%s> of \""
"%s\" is not in proper format like: "
"01 Jan 1980 22:45",m->m_xml,s);
goto changed; }
else if ( t == TYPE_FLOAT ) {
if( fromRequest &&
*(float *)(THIS + m->m_off + 4*j) == (float)atof ( s ) )
return;
// if changed within .00001 that is ok too, do not count
// as changed, the atof() has roundoff errors
//float curVal = *(float *)(THIS + m->m_off + 4*j);
//float newVal = atof(s);
//if ( newVal < curVal && newVal + .000001 >= curVal ) return;
//if ( newVal > curVal && newVal - .000001 <= curVal ) return;
if ( fromRequest ) oldVal = *(float *)(THIS + m->m_off + 4*j);
*(float *)(THIS + m->m_off + 4*j) = (float)atof ( s );
newVal = *(float *)(THIS + m->m_off + 4*j);
goto changed; }
	else if ( t == TYPE_DOUBLE ) {
		// doubles are 8 bytes each, so stride by 8*j, not 4*j
		if( fromRequest &&
		    *(double *)(THIS + m->m_off + 8*j) == (double)atof ( s ) )
			return;
		if ( fromRequest ) oldVal = *(double *)(THIS + m->m_off + 8*j);
		*(double *)(THIS + m->m_off + 8*j) = (double)atof ( s );
		newVal = *(double *)(THIS + m->m_off + 8*j);
		goto changed; }
else if ( t == TYPE_IP ) {
if ( fromRequest && *(int32_t *)(THIS + m->m_off + 4*j) ==
(int32_t)atoip (s,gbstrlen(s) ) )
return;
*(int32_t *)(THIS + m->m_off + 4*j) = (int32_t)atoip (s,gbstrlen(s) );
goto changed; }
else if ( t == TYPE_LONG || t == TYPE_LONG_CONST || t == TYPE_RULESET||
t == TYPE_SITERULE ) {
int32_t v = atol ( s );
// min is considered valid if >= 0
if ( m->m_min >= 0 && v < m->m_min ) v = m->m_min;
if ( fromRequest && *(int32_t *)(THIS + m->m_off + 4*j) == v )
return;
if ( fromRequest)oldVal=(float)*(int32_t *)(THIS + m->m_off +4*j);
*(int32_t *)(THIS + m->m_off + 4*j) = v;
newVal = (float)*(int32_t *)(THIS + m->m_off + 4*j);
goto changed; }
else if ( t == TYPE_LONG_LONG ) {
if ( fromRequest &&
*(uint64_t *)(THIS + m->m_off+8*j)==
strtoull(s,NULL,10))
return;
*(int64_t *)(THIS + m->m_off + 8*j) = strtoull(s,NULL,10);
goto changed; }
// like TYPE_STRING but dynamically allocates
else if ( t == TYPE_SAFEBUF ) {
int32_t len = gbstrlen(s);
// no need to truncate since safebuf is dynamic
//if ( len >= m->m_size ) len = m->m_size - 1; // truncate!!
//char *dst = THIS + m->m_off + m->m_size*j ;
// point to the safebuf, in the case of an array of
// SafeBufs "j" is the # in the array, starting at 0
SafeBuf *sb = (SafeBuf *)(THIS+m->m_off+(j*sizeof(SafeBuf)) );
int32_t oldLen = sb->length();
// why was this commented out??? we need it now that we
// send email alerts when parms change!
if ( fromRequest &&
! isHtmlEncoded && oldLen == len &&
memcmp ( sb->getBufStart() , s , len ) == 0 )
return;
// nuke it
sb->purge();
// this means that we can not use string POINTERS as parms!!
if ( ! isHtmlEncoded ) sb->safeMemcpy ( s , len );
else len = sb->htmlDecode (s,len,false,0);
// tag it
sb->setLabel ( "parm1" );
// ensure null terminated
sb->nullTerm();
// note it
//log("hack: %s",s);
// null term it all
//dst[len] = '\0';
//sb->reserve ( 1 );
// null terminate but do not include as m_length so the
// memcmp() above still works right
//sb->m_buf[sb->m_length] = '\0';
// . might have to set length
// . used for CollectionRec::m_htmlHeadLen and m_htmlTailLen
//if ( m->m_plen >= 0 )
// *(int32_t *)(THIS + m->m_plen) = len ;
goto changed;
}
else if ( t == TYPE_STRING ||
t == TYPE_STRINGBOX ||
t == TYPE_STRINGNONEMPTY ||
t == TYPE_TIME ) {
int32_t len = gbstrlen(s);
if ( len >= m->m_size ) len = m->m_size - 1; // truncate!!
char *dst = THIS + m->m_off + m->m_size*j ;
// why was this commented out??? we need it now that we
// send email alerts when parms change!
if ( fromRequest &&
! isHtmlEncoded && (int32_t)gbstrlen(dst) == len &&
memcmp ( dst , s , len ) == 0 )
return;
// this means that we can not use string POINTERS as parms!!
if ( ! isHtmlEncoded ) {gbmemcpy ( dst , s , len ); }
else len = htmlDecode (dst , s,len,false,0);
dst[len] = '\0';
// . might have to set length
// . used for CollectionRec::m_htmlHeadLen and m_htmlTailLen
if ( m->m_plen >= 0 )
*(int32_t *)(THIS + m->m_plen) = len ;
goto changed; }
changed:
// tell gigablast the value is EXPLICITLY given -- no longer based
// on default.conf
//if ( m->m_obj == OBJ_COLL ) ((CollectionRec *)THIS)->m_orig[mm] = 2;
	// we do not recognize timezones correctly when this is serialized
// into coll.conf, it says UTC, which is ignored in HttpMime.cpp's
// atotime() function. and when we submit it i think we use the
// local time zone, so the values end up changing every time we
// submit!!! i think it might read it in as UTC then write it out
// as local time, or vice versa.
if ( t == TYPE_DATE || t == TYPE_DATE2 ) return;
// do not send if setting from startup
if ( ! fromRequest ) return;
// note it in the log
log("admin: parm \"%s\" changed value",m->m_title);
int64_t nowms = gettimeofdayInMillisecondsLocal();
// . note it in statsdb
// . record what parm change and from/to what value
g_statsdb.addStat ( 0, // niceness ,
"parm_change" ,
nowms,
nowms,
0 , // value
m->m_hash , // parmHash
oldVal,
newVal);
// if they turn spiders on or off then tell spiderloop to update
// the active list
//if ( strcmp(m->m_cgi,"cse") )
// g_spiderLoop.m_activeListValid = false;
// only send email alerts if we are host 0 since everyone syncs up
// with host #0 anyway
if ( g_hostdb.m_hostId != 0 ) return;
// send an email alert notifying the admins that this parm was changed
// BUT ALWAYS send it if email alerts were just TURNED OFF
// ("sea" = Send Email Alerts)
if ( ! g_conf.m_sendEmailAlerts && strcmp(m->m_cgi,"sea") != 0 )
return;
	// if spiders were just turned on, do not send an email alert, cuz we
// turn them on when we restart the cluster
if ( strcmp(m->m_cgi,"se")==0 && g_conf.m_spideringEnabled )
return;
char tmp[1024];
Host *h0 = g_hostdb.getHost ( 0 );
int32_t ip0 = 0;
if ( h0 ) ip0 = h0->m_ip;
sprintf(tmp,"%s: parm \"%s\" changed value",iptoa(ip0),m->m_title);
g_pingServer.sendEmail ( NULL , // Host ptr
tmp , // msg
true , // sendToAdmin
false , // oom?
false , // kernel error?
true , // parm change?
true );// force it? even if disabled?
// now the spider collection can just check the collection rec
//int64_t nowms = gettimeofdayInMilliseconds();
//((CollectionRec *)THIS)->m_lastUpdateTime = nowms;
return;
}
Parm *Parms::getParmFromParmHash ( int32_t parmHash ) {
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
Parm *m = &m_parms[i];
if ( m->m_hash != parmHash ) continue;
return m;
}
return NULL;
}
void Parms::setToDefault ( char *THIS , char objType , CollectionRec *argcr ) {
// init if we should
init();
// . clear out any coll rec to get the diffbotApiNum dropdowns
// . this is a backwards-compatibility hack since this new parm
// will not be in old coll.conf files and will not be properly
	//   initialized when displaying a url filter row.
//if ( THIS != (char *)&g_conf ) {
// CollectionRec *cr = (CollectionRec *)THIS;
// memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS);
//}
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
Parm *m = &m_parms[i];
if ( m->m_obj != objType ) continue;
if ( m->m_obj == OBJ_NONE ) continue;
if ( m->m_type == TYPE_COMMENT ) continue;
// no, we gotta set GigablastRequest::m_contentFile to NULL
//if ( m->m_type == TYPE_FILEUPLOADBUTTON )
// continue;
if ( m->m_type == TYPE_MONOD2 ) continue;
if ( m->m_type == TYPE_MONOM2 ) continue;
if ( m->m_type == TYPE_CMD ) continue;
if (THIS == (char *)&g_conf && m->m_obj != OBJ_CONF ) continue;
if (THIS != (char *)&g_conf && m->m_obj == OBJ_CONF ) continue;
// what is this?
//if ( m->m_obj == OBJ_COLL ) {
// CollectionRec *cr = (CollectionRec *)THIS;
// if ( cr->m_bases[1] ) { char *xx=NULL;*xx=0; }
//}
// sanity check, make sure it does not overflow
if ( m->m_obj == OBJ_COLL &&
m->m_off > (int32_t)sizeof(CollectionRec)){
log(LOG_LOGIC,"admin: Parm in Parms.cpp should use "
"OBJ_COLL not OBJ_CONF");
char *xx = NULL; *xx = 0;
}
//if ( m->m_page == PAGE_PRIORITIES )
// log("hey");
// or
if ( m->m_page > PAGE_API && // CGIPARMS &&
m->m_page != PAGE_NONE &&
m->m_obj == OBJ_CONF ) {
log(LOG_LOGIC,"admin: Page can not reference "
"g_conf and be declared AFTER PAGE_CGIPARMS in "
"Pages.h. Title=%s",m->m_title);
char *xx = NULL; *xx = 0;
}
// if defOff >= 0 get from cr like for searchInput vals
// whose default is from the collectionRec...
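		// (illustrative: a SearchInput parm can set m_defOff to
		// the offset of its matching CollectionRec member, e.g.
		// the # of docs to scan for topics, so its default tracks
		// the collection's configured value)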
if ( m->m_defOff >= 0 && argcr ) {
if ( ! argcr ) { char *xx=NULL;*xx=0; }
char *def = m->m_defOff+(char *)argcr;
char *dst = (char *)THIS + m->m_off;
gbmemcpy ( dst , def , m->m_size );
continue;
}
// leave arrays empty, set everything else to default
if ( m->m_max <= 1 ) {
//if ( i == 282 ) // "query" parm
// log("hey");
//if ( ! m->m_def ) { char *xx=NULL;*xx=0; }
setParm ( THIS , m, i, 0, m->m_def, false/*not enc.*/,
false );
//((CollectionRec *)THIS)->m_orig[i] = 1;
//m->m_orig = 0; // set in setToDefaults()
}
// these are special, fixed size arrays
if ( m->m_fixed > 0 ) {
for ( int32_t k = 0 ; k < m->m_fixed ; k++ ) {
setParm(THIS,m,i,k,m->m_def,false/*not enc.*/,
false);
//m->m_orig = 0; // set in setToDefaults()
//((CollectionRec *)THIS)->m_orig[i] = 1;
}
continue;
}
		// skip non-arrays, they were handled above
		if ( m->m_max <= 1 ) continue;
		// otherwise, array is not fixed size: zero its count below
char *s = THIS + m->m_off ;
// set count to 1 if a default is present
//if ( m->m_def[0] ) *(int32_t *)(s-4) = 1;
//else *(int32_t *)(s-4) = 0;
*(int32_t *)(s-4) = 0;
}
}
// . returns false and sets g_errno on error
// . you should set your "THIS" to its defaults before calling this
bool Parms::setFromFile ( void *THIS ,
char *filename ,
char *filenameDef ,
char objType ) {
// make sure we're init'd
init();
// let em know
//if ( THIS == &g_conf) log (LOG_INIT,"conf: Reading %s." , filename );
// . let the log know what we are doing
// . filename is NULL if a call from CollectionRec::setToDefaults()
Xml xml;
//char buf [ MAX_XML_CONF ];
SafeBuf sb;
if ( filename&&!setXmlFromFile(&xml,filename,&sb)){//buf,MAX_XML_CONF))
log("parms: error setting from file %s: %s",filename,
mstrerror(g_errno));
return false;
}
// . all the collectionRecs have the same default file in
// the workingDir/collections/default.conf
// . so use our built in buffer for that
/*
if ( THIS != &g_conf && ! m_isDefaultLoaded ) {
m_isDefaultLoaded = true;
File f;
f.set ( filenameDef );
if ( ! f.doesExist() ) {
log(LOG_INIT,
"db: Default collection configuration file "
"%s was not found. Newly created collections "
"will use hard coded defaults.",f.getFilename());
goto skip;
}
if ( ! setXmlFromFile ( &m_xml2 ,
filenameDef ,
m_buf ,
MAX_XML_CONF ) ) return false;
}
skip:
*/
int32_t vlen;
char *v ;
//char c ;
int32_t numNodes = xml.getNumNodes();
int32_t numNodes2 = m_xml2.getNumNodes();
// now set THIS based on the parameters in the xml file
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
// get it
Parm *m = &m_parms[i];
if ( m->m_obj != objType ) continue;
if ( m->m_obj == OBJ_NONE ) continue;
//log(LOG_DEBUG, "Parms: %s: parm: %s", filename, m->m_xml);
// . there are 2 object types, coll recs and g_conf, aka
// OBJ_COLL and OBJ_CONF.
// . make sure we got the right parms for what we want
if ( THIS == &g_conf && m->m_obj != OBJ_CONF ) continue;
if ( THIS != &g_conf && m->m_obj == OBJ_CONF ) continue;
// skip comments and command
if ( m->m_type == TYPE_COMMENT ) continue;
if ( m->m_type == TYPE_FILEUPLOADBUTTON ) continue;
if ( m->m_type == TYPE_MONOD2 ) continue;
if ( m->m_type == TYPE_MONOM2 ) continue;
if ( m->m_type == TYPE_CMD ) continue;
if ( m->m_type == TYPE_CONSTANT ) continue;
// these are special commands really
if ( m->m_type == TYPE_BOOL2 ) continue;
//if ( strcmp ( m->m_xml , "forceDeleteUrls" ) == 0 )
// log("got it");
// we did not get one from first xml file yet
bool first = true;
// array count
int32_t j = 0;
// node number
int32_t nn = 0;
// a tmp thingy
char tt[1];
int32_t nb;
int32_t newnn;
loop:
if ( m->m_obj == OBJ_NONE ) { char *xx=NULL;*xx=0; }
// get xml node number of m->m_xml in the "xml" file
newnn = xml.getNodeNum(nn,1000000,m->m_xml,gbstrlen(m->m_xml));
#ifdef _GLOBALSPEC_
if ( m->m_priv == 2 ) continue;
if ( m->m_priv == 3 ) continue;
#elif _CLIENT_
// always use default value if client not allowed control of
if ( m->m_priv ) continue;
#elif _METALINCS_
if ( m->m_priv == 2 ) continue;
if ( m->m_priv == 3 ) continue;
#endif
// debug
//log("%s --> %"INT32"",m->m_xml,nn);
// try default xml file if none, but only if first try
if ( newnn < 0 && first ) goto try2;
// it is valid, use it
nn = newnn;
// set the flag, we've committed the array to the first file
first = false;
// otherwise, we had some in this file, but now we're out
if ( nn < 0 ) continue;
// . next node is the value of this tag
// . skip if none there
if ( nn + 1 >= numNodes ) continue;
// point to it
v = xml.getNode ( nn + 1 );
vlen = xml.getNodeLen ( nn + 1 );
// if a back tag... set the value to the empty string
if ( v[0] == '<' && v[1] == '/' ) vlen = 0;
// now, extricate from the <![CDATA[ ... ]]> tag if we need to
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
char *oldv = v;
int32_t oldvlen = vlen;
// if next guy is NOT a tag node, try the next one
if ( v[0] != '<' && nn + 2 < numNodes ) {
v = xml.getNode ( nn + 2 );
vlen = xml.getNodeLen ( nn + 2 );
}
// should be a <![CDATA[...]]>
if ( vlen<12 || strncasecmp(v,"<![CDATA[",9)!=0 ) {
log("conf: No <![CDATA[...]]> tag found "
"for \"<%s>\" tag. Trying without CDATA.",
m->m_xml);
v = oldv;
vlen = oldvlen;
}
// point to the nugget
else {
v += 9;
vlen -= 12;
}
}
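		// e.g. a node of "<![CDATA[foo bar]]>" spans 19 bytes; the
		// += 9 / -= 12 above leave v pointing at "foo bar" with
		// vlen = 7 (9 bytes of "<![CDATA[" up front plus 3 bytes
		// of "]]>" at the end)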
// get the value
//v = xml.getString ( nn , nn+2 , m->m_xml , &vlen );
// this only happens when tag is there, but without a value
if ( ! v || vlen == 0 ) { vlen = 0; v = tt; }
//c = v[vlen];
v[vlen]='\0';
if ( vlen == 0 ){
// . this is generally ok
// . this is spamming the log so i am commenting out! (MDW)
//log(LOG_INFO, "parms: %s: Empty value.", m->m_xml);
// Allow an empty string
//continue;
}
// now use proper cdata
// we can't do this and be backwards compatible right now
//nb = cdataDecode ( v , v , 0 );//, vlen , false ,0);
// now decode it into itself
nb = htmlDecode ( v , v , vlen , false ,0);
v[nb] = '\0';
// set our parm
setParm ( (char *)THIS, m, i, j, v, false/*is html encoded?*/,
false );
// we were set from the explicit file
//((CollectionRec *)THIS)->m_orig[i] = 2;
// go back
//v[vlen] = c;
// do not repeat same node
nn++;
// try to get the next node if we're an array
if ( ++j < m->m_max || j < m->m_fixed ) { goto loop; }
// otherwise, if not an array, go to next parm
continue;
try2:
// get xml node number of m->m_xml in the "m_xml" file
nn = m_xml2.getNodeNum(nn,1000000,m->m_xml,gbstrlen(m->m_xml));
// otherwise, we had one in file, but now we're out
if ( nn < 0 ) {
// if it was ONLY a search input parm, with no
// default value that can be changed in the
// CollectionRec then skip it
// if ( m->m_soff != -1 &&
// m->m_off == -1 &&
// m->m_smaxc == -1 )
// continue;
// . if it is a string, like <adminIp> and default is
// NULL then don't worry about reporting it
// . no, just make the default "" then
//if ( m->m_type==TYPE_STRING && ! m->m_def) continue;
// bitch that it was not found
//if ( ! m->m_def[0] )
// log("conf: %s does not have <%s> tag. "
// "Ommitting.",filename,m->m_xml);
//else
/*
if ( ! m->m_def ) //m->m_def[0] )
log("conf: %s does not have <%s> tag. Using "
"default value of \"%s\".", filename,
m->m_xml,m->m_def);
*/
continue;
}
// . next node is the value of this tag
// . skip if none there
if ( nn + 1 >= numNodes2 ) continue;
// point to it
v = m_xml2.getNode ( nn + 1 );
vlen = m_xml2.getNodeLen ( nn + 1 );
// if a back tag... set the value to the empty string
if ( v[0] == '<' && v[1] == '/' ) vlen = 0;
// now, extricate from the <![CDATA[ ... ]]> tag if we need to
		if ( m->m_type == TYPE_STRING ||
		     m->m_type == TYPE_STRINGBOX ||
		     // include TYPE_SAFEBUF to match the first-pass loop
		     m->m_type == TYPE_SAFEBUF ||
		     m->m_type == TYPE_STRINGNONEMPTY ) {
char *oldv = v;
int32_t oldvlen = vlen;
// reset if not a tag node
if ( v[0] != '<' && nn + 2 < numNodes2 ) {
v = m_xml2.getNode ( nn + 2 );
vlen = m_xml2.getNodeLen ( nn + 2 );
}
// should be a <![CDATA[...]]>
if ( vlen<12 || strncasecmp(v,"<![CDATA[",9)!=0 ) {
log("conf: No <![CDATA[...]]> tag found "
"for \"<%s>\" tag. Trying without CDATA.",
m->m_xml);
v = oldv;
vlen = oldvlen;
}
// point to the nugget
else {
v += 9;
vlen -= 12;
}
}
// get the value
//v = m_xml2.getString ( nn , nn+2 , m->m_xml , &vlen );
// this only happens when tag is there, but without a value
if ( ! v || vlen == 0 ) { vlen = 0; v = tt; }
//c = v[vlen];
v[vlen]='\0';
// now decode it into itself
nb = htmlDecode ( v , v , vlen , false,0);
v[nb] = '\0';
// set our parm
setParm ( (char *)THIS, m, i, j, v, false/*is html encoded?*/,
false );
// we were set from the backup default file
//((CollectionRec *)THIS)->m_orig[i] = 1;
// go back
//v[vlen] = c;
// do not repeat same node
nn++;
// try to get the next node if we're an array
if ( ++j < m->m_max || j < m->m_fixed ) { goto loop; }
// otherwise, if not an array, go to next parm
continue;
}
// backwards compatible hack for old <masterPassword> tags
for ( int32_t i = 1 ; i < numNodes ; i++ ) {
if ( objType != OBJ_CONF ) break;
XmlNode *pn = &xml.m_nodes[i-1];
XmlNode *xn = &xml.m_nodes[i];
// look for <masterPassword>
if ( pn->m_tagNameLen != 14 ) continue;
if ( xn->m_tagNameLen != 8 ) continue;
// if it is not the OLD supported tag then skip
if ( strncmp ( pn->m_tagName,"masterPassword",14 ) ) continue;
if ( strncmp ( xn->m_tagName,"![CDATA[",8 ) ) continue;
// otherwise append to buf
char *text = xn->m_node + 9;
int32_t tlen = xn->m_nodeLen - 12;
g_conf.m_masterPwds.safeMemcpy(text,tlen);
// a \n
g_conf.m_masterPwds.pushChar('\n');
g_conf.m_masterPwds.nullTerm();
}
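	// (illustrative old-style conf snippet the loop above accepts:
	//    <masterPassword><![CDATA[footbar23]]></>
	//  each match is appended, newline-terminated, to
	//  g_conf.m_masterPwds)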
// another backwards compatible hack for old masterIp tags
for ( int32_t i = 1 ; i < numNodes ; i++ ) {
if ( objType != OBJ_CONF ) break;
XmlNode *xn = &xml.m_nodes[i];
XmlNode *pn = &xml.m_nodes[i-1];
		// look for <masterIp>
if ( pn->m_tagNameLen != 8 ) continue;
if ( xn->m_tagNameLen != 8 ) continue;
// if it is not the OLD supported tag then skip
if ( strncmp ( pn->m_tagName,"masterIp",8 ) ) continue;
if ( strncmp ( xn->m_tagName,"![CDATA[",8 ) ) continue;
// otherwise append to buf
char *text = xn->m_node + 9;
int32_t tlen = xn->m_nodeLen - 12;
// otherwise append to buf
g_conf.m_connectIps.safeMemcpy(text,tlen);
// a \n
g_conf.m_connectIps.pushChar('\n');
g_conf.m_connectIps.nullTerm();
}
/*
// no! now we warn with a redbox alert
// always make sure we got some admin security
if ( g_conf.m_numMasterIps <= 0 && g_conf.m_numMasterPwds <= 0 ) {
//log(LOG_INFO,
// "conf: No master IP or password provided. Using default "
// "password 'footbar23'." );
//g_conf.m_masterIps[0] = atoip ( "64.139.94.202", 13 );
//g_conf.m_numMasterIps = 1;
strcpy ( g_conf.m_masterPwds[0] , "footbar23" );
g_conf.m_numMasterPwds = 1;
}
*/
return true;
}
// returns false and sets g_errno on error
bool Parms::setXmlFromFile(Xml *xml, char *filename, SafeBuf *sb ) {
// File f;
// f.set ( filename );
// is it too big?
// int32_t fsize = f.getFileSize();
// if ( fsize > bufSize ) {
// log ("conf: File size of %s is %"INT32", must be "
// "less than %"INT32".",f.getFilename(),fsize,bufSize );
// char *xx = NULL; *xx = 0;
// }
// open it for reading
// f.set ( filename );
// if ( ! f.open ( O_RDONLY ) )
// return log("conf: Could not open %s: %s.",
// filename,mstrerror(g_errno));
// // read in the file
// int32_t numRead = f.read ( buf , bufSize , 0 /*offset*/ );
// f.close ( );
// if ( numRead != fsize )
// return log ("conf: Could not read %s : %s.",
// filename,mstrerror(g_errno));
// // null terminate it
// buf [ fsize ] = '\0';
sb->load ( filename );
char *buf = sb->getBufStart();
if ( ! buf )
return log ("conf: Could not read %s : %s.",
filename,mstrerror(g_errno));
// . remove all comments in case they contain tags
// . if you have a # as part of your string, it must be html encoded,
// just like you encode < and >
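	// . e.g. (illustrative) "<someTag>1</> # a note" is compacted in
	//   place to "<someTag>1</> ", dropping everything from the '#'
	//   to the newline, while an encoded pound sign like "&#35;"
	//   survives because the '&' before and the digit after mark an
	//   html entity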
char *s = buf;
char *d = buf;
while ( *s ) {
// . skip comments
// . watch out for html encoded pound signs though
if ( *s == '#' ) {
if (s>buf && *(s-1)=='&' && is_digit(*(s+1))) goto ok;
while ( *s && *s != '\n' ) s++;
continue;
}
// otherwise, transcribe over
ok:
*d++ = *s++;
}
*d = '\0';
int32_t bufSize = d - buf;
// . set to xml
// . use version of 0
return xml->set ( buf ,
bufSize ,
false , // ownData
0 , // allocSize
false , // pureXml?
0 , // version
true , // setParents
0 , // niceness
CT_XML );
}
//#define MAX_CONF_SIZE 200000
// returns false and sets g_errno on error
bool Parms::saveToXml ( char *THIS , char *f , char objType ) {
if ( g_conf.m_readOnlyMode ) return true;
// print into buffer
// "seeds" can be pretty big so go with safebuf now
// fix so if we core in malloc/free we can still save conf
char tmpbuf[200000];
SafeBuf sb(tmpbuf,200000);
//char *p = buf;
//char *pend = buf + MAX_CONF_SIZE;
int32_t len ;
//int32_t n ;
File ff ;
int32_t j ;
int32_t count;
char *s;
CollectionRec *cr = NULL;
if ( THIS != (char *)&g_conf ) cr = (CollectionRec *)THIS;
// now set THIS based on the parameters in the xml file
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
// get it
Parm *m = &m_parms[i];
if ( m->m_obj != objType ) continue;
// . there are 2 object types, coll recs and g_conf, aka
// OBJ_COLL and OBJ_CONF.
// . make sure we got the right parms for what we want
if ( m->m_obj == OBJ_NONE ) continue;
// skip dups
if ( m->m_flags & PF_DUP ) continue;
// do not allow searchinput parms through
if ( m->m_obj == OBJ_SI ) continue;
if ( THIS == (char *)&g_conf && m->m_obj != OBJ_CONF) continue;
if ( THIS != (char *)&g_conf && m->m_obj == OBJ_CONF) continue;
if ( m->m_type == TYPE_MONOD2 ) continue;
if ( m->m_type == TYPE_MONOM2 ) continue;
if ( m->m_type == TYPE_CMD ) continue;
if ( m->m_type == TYPE_BOOL2 ) continue;
if ( m->m_type == TYPE_FILEUPLOADBUTTON ) continue;
// ignore if hidden as well! no, have to keep those separate
// since spiderroundnum/starttime is hidden but should be saved
if ( m->m_flags & PF_NOSAVE ) continue;
// ignore if diffbot and we are not a diffbot/custom crawl
if ( cr &&
! cr->m_isCustomCrawl &&
(m->m_flags & PF_DIFFBOT) ) continue;
// skip if we should not save to xml
if ( ! m->m_save ) continue;
// allow comments though
if ( m->m_type == TYPE_COMMENT ) goto skip2;
// skip if this was compiled for a client and they should not
// see this control
//#ifdef _GLOBALSPEC_
// if ( m->m_priv == 2 ) continue;
// if ( m->m_priv == 3 ) continue;
//#elif _CLIENT_
// if ( m->m_priv ) continue;
//#elif _METALINCS_
// if ( m->m_priv == 2 ) continue;
// if ( m->m_priv == 3 ) continue;
//#endif
// skip if offset is negative, that means none
s = (char *)THIS + m->m_off ;
// if array, count can be 0 or more than 1
count = 1;
if ( m->m_max > 1 ) count = *(int32_t *)(s-4);
if ( m->m_fixed > 0 ) count = m->m_fixed;
// sanity check
if ( count > 100000 ) {
log(LOG_LOGIC,"admin: Outrageous array size in for "
"parameter %s. Does the array max size int32_t "
"preceed it in the conf class?",m->m_title);
exit(-1);
}
skip2:
		// description: wrap into '#' comment lines at word boundaries
char *d = m->m_desc;
		// if empty array, modify description to include the tag name
char tmp [10*1024];
if ( m->m_max > 1 && count == 0 && gbstrlen(d) < 9000 &&
m->m_xml && m->m_xml[0] ) {
char *cc = "";
if ( d && d[0] ) cc = "\n";
sprintf ( tmp , "%s%sUse <%s> tag.",d,cc,m->m_xml);
d = tmp;
}
char *END = d + gbstrlen(d);
char *dend;
char *last;
char *start;
// just print tag if it has no description
if ( ! *d ) goto skip;
//if ( p + gbstrlen(d)+5 >= pend ) goto hadError;
//if ( p > buf ) *p++='\n';
if ( sb.length() ) sb.pushChar('\n');
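	// per-parm output this loop produces (illustrative):
	//
	//   # the parm's description, word-wrapped at roughly
	//   # 77 columns, one '#'-prefixed line each
	//   <tagName><![CDATA[string value]]></>
	//
	// non-string types omit the CDATA wrapper and array parms repeat
	// the tag once per element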
loop:
dend = d + 77;
if ( dend > END ) dend = END;
last = d;
start = d;
while ( *d && d < dend ) {
if ( *d == ' ' ) last = d;
if ( *d == '\n' ) { last = d; break; }
d++;
}
if ( ! *d ) last = d;
//gbmemcpy ( p , "# " , 2 );
//p += 2;
sb.safeMemcpy("# ",2);
//gbmemcpy ( p , start , last - start );
//p += last - start;
sb.safeMemcpy(start,last-start);
//*p++='\n';
sb.pushChar('\n');
d = last + 1;
if ( d < END && *d ) goto loop;
// bail if comment
if ( m->m_type == TYPE_COMMENT ) {
//sprintf ( p , "\n" );
//p += gbstrlen ( p );
continue;
}
if ( m->m_type == TYPE_MONOD2 ) continue;
if ( m->m_type == TYPE_MONOM2 ) continue;
skip:
/* . note: this code commented out because it was specific to
an old client
// if value is from default collection file, do not
// explicitly list it
if ( m->m_obj == OBJ_COLL &&
((CollectionRec *)THIS)->m_orig[i] == 1 ) {
sprintf ( p ,"# Value for <%s> tag taken from "
"default.conf.\n",m->m_xml );
p += gbstrlen ( p );
continue;
}
*/
// debug point
//if ( m->m_type == TYPE_SAFEBUF )
// log("hey");
// loop over all in this potential array
for ( j = 0 ; j < count ; j++ ) {
// the xml
//if ( p + gbstrlen(m->m_xml) >= pend ) goto hadError;
if ( g_errno ) goto hadError;
//sprintf ( p , "<%s>" , m->m_xml );
//p += gbstrlen ( p );
sb.safePrintf("<%s>" , m->m_xml );
// print CDATA if string
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
//sprintf ( p , "<![CDATA[" );
//p += gbstrlen ( p );
sb.safeStrcpy( "<![CDATA[" );
}
// break point
//if (strcmp ( m->m_xml , "filterRulesetDefault")==0)
// log("got it");
// . represent it in ascii form
// . this escapes out <'s and >'s
// . this ALSO encodes #'s (xml comment indicators)
//p = getParmHtmlEncoded(p,pend,m,s);
getParmHtmlEncoded(&sb,m,s);
// print CDATA if string
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
//sprintf ( p , "]]>" );
//p += gbstrlen ( p );
sb.safeStrcpy("]]>" );
}
// this is NULL if it ran out of room
//if ( ! p ) goto hadError;
if ( g_errno ) goto hadError;
// advance to next element in array, if it is one
s = s + m->m_size;
// close the xml tag
//if ( p + 4 >= pend ) goto hadError;
//sprintf ( p , "</>\n" );
//p += gbstrlen ( p );
sb.safeStrcpy("</>\n" );
if ( g_errno ) goto hadError;
}
}
//*p = '\0';
sb.nullTerm();
//ff.set ( f );
//if ( ! ff.open ( O_RDWR | O_CREAT | O_TRUNC ) )
// return log("db: Could not open %s : %s",
// ff.getFilename(),mstrerror(g_errno));
// save the parm to the file
//len = gbstrlen(buf);
len = sb.length();
// use -1 for offset so we do not use pwrite() so it will not leave
// garbage at end of file
//n = ff.write ( buf , len , -1 );
//n = ff.write ( sb.getBufStart() , len , -1 );
//ff.close();
//if ( n == len ) return true;
// save to filename "f". returns # of bytes written. -1 on error.
if ( sb.safeSave ( f ) >= 0 )
return true;
return log("admin: Could not write to file %s.",f);
hadError:
return log("admin: Error writing to %s: %s",f,mstrerror(g_errno));
//File bigger than %"INT32" bytes."
// " Please increase #define in Parms.cpp.",
// (int32_t)MAX_CONF_SIZE);
}
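// For illustration, a sketch of what the writer above emits for a
// hypothetical int32_t parm whose xml tag is <maxMem> (tag name and
// value are made up; note that tags are closed with a bare "</>"):
//
//   # Maximum bytes to use.
//   <maxMem>4000000</>
//
// String-type parms additionally wrap their value in <![CDATA[...]]>.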
Parm *Parms::getParm ( char *cgi ) {
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
if ( ! m_parms[i].m_cgi ) continue ;
if ( m_parms[i].m_cgi[0] != cgi[0] ) continue;
if ( m_parms[i].m_cgi[1] != cgi[1] ) continue;
if ( strcmp ( m_parms[i].m_cgi , cgi ) == 0 )
return &m_parms[i];
}
return NULL;
}
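// Usage sketch for getParm(), assuming the "sitelist" cgi parm that is
// defined in init() below:
//
//   Parm *m = g_parms.getParm ( "sitelist" );
//   if ( m ) log("admin: found parm '%s'",m->m_title);
//
// The explicit first-two-character compares above are just a cheap
// rejection filter run before the full strcmp().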
/*
Parm *Parms::getParm2 ( char *cgi , int32_t cgiLen ) {
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
if ( ! m_parms[i].m_cgi ) continue ;
if ( m_parms[i].m_cgi[0] != cgi[0] ) continue;
if ( cgiLen >=2 && m_parms[i].m_cgi[1] != cgi[1] ) continue;
// only compare as many letters as the cgi name has
if ( strncmp ( m_parms[i].m_cgi , cgi , cgiLen ) ) continue;
// that means we gotta check lengths next
if ( gbstrlen(m_parms[i].m_cgi) != cgiLen ) continue;
// got a match
return &m_parms[i];
}
return NULL;
}
*/
/*
#define PHTABLE_SIZE (MAX_PARMS*2)
Parm *Parms::getParm ( char *cgi ) {
// make the hash table for the first call
static int32_t s_phtable [ PHTABLE_SIZE ];
static Parm *s_phparm [ PHTABLE_SIZE ];
static bool s_init = false;
// do not re-make the table if we already did
if ( s_init ) goto skipMakeTable;
// ok, now make the table
s_init = true;
memset ( s_phparm , 0 , PHTABLE_SIZE );
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
if ( ! m_parms[i].m_cgi ) continue ;
int32_t h = hash32 ( m_parms[i].m_cgi );
int32_t n = h % PHTABLE_SIZE;
while ( s_phparm[n] ) {
// . sanity check
// . we don't have that many parms, they should never
// collide!!... but it is possible i guess.
if ( s_phtable[n] == h ) {
log(LOG_LOGIC,"Parms: collisions forbidden in "
"getParm(). Duplicate cgi name?");
char *xx = NULL; *xx = 0;
}
if (++n >= PHTABLE_SIZE) n = 0;
}
s_phtable[n] = h; // fill the bucket
s_phparm [n] = m; // the parm
}
skipMakeTable:
// look up in table
int32_t h = hash32 ( cgi );
int32_t n = h % PHTABLE_SIZE;
// while bucket is occupied and does not equal our hash... chain
while ( s_phparm[n] && s_phtable[n] != h )
if (++n >= PHTABLE_SIZE) n = 0;
// if empty, no match
return s_phparm[n];
}
*/
bool Parms::getParmHtmlEncoded ( SafeBuf *sb , Parm *m , char *s ) {
// do not breech the buffer
//if ( p + 100 >= pend ) return p;
// print it out
char t = m->m_type;
if ( t == TYPE_CHAR || t == TYPE_BOOL ||
t == TYPE_CHECKBOX ||
t == TYPE_PRIORITY || t == TYPE_PRIORITY2 ||
//t == TYPE_DIFFBOT_DROPDOWN ||
t == TYPE_UFP ||
	     t == TYPE_PRIORITY_BOXES || t == TYPE_RETRIES ||
	     t == TYPE_FILTER ||
t == TYPE_BOOL2 || t == TYPE_CHAR2 )
sb->safePrintf("%"INT32"",(int32_t)*s);
else if ( t == TYPE_FLOAT )
sb->safePrintf("%f",*(float *)s);
else if ( t == TYPE_IP )
sb->safePrintf("%s",iptoa(*(int32_t *)s));
else if ( t == TYPE_LONG || t == TYPE_LONG_CONST || t == TYPE_RULESET||
t == TYPE_SITERULE )
sb->safePrintf("%"INT32"",*(int32_t *)s);
else if ( t == TYPE_LONG_LONG )
sb->safePrintf("%"INT64"",*(int64_t *)s);
else if ( t == TYPE_SAFEBUF ) {
SafeBuf *sb2 = (SafeBuf *)s;
char *buf = sb2->getBufStart();
//int32_t blen = 0;
//if ( buf ) blen = gbstrlen(buf);
//p = htmlEncode ( p , pend , buf , buf + blen , true ); // #?*
// we can't do proper cdata and be backwards compatible
//sb->cdataEncode ( buf );//, blen );//, true ); // #?*
if ( buf ) sb->htmlEncode ( buf );
}
else if ( t == TYPE_STRING ||
t == TYPE_STRINGBOX ||
t == TYPE_STRINGNONEMPTY ||
t == TYPE_TIME) {
//int32_t slen = gbstrlen ( s );
// this returns the length of what was written, it may
// not have converted everything if pend-p was too small...
//p += saftenTags2 ( p , pend - p , s , len );
//p = htmlEncode ( p , pend , s , s + slen , true /*#?*/);
// we can't do proper cdata and be backwards compatible
//sb->cdataEncode ( s );//, slen );//, true /*#?*/);
sb->htmlEncode ( s );
}
else if ( t == TYPE_DATE || t == TYPE_DATE2 ) {
// time is stored as int32_t
int32_t ct = *(int32_t *)s;
		// get the time struct; copy into a real time_t first in
		// case time_t is wider than int32_t
		time_t tt = (time_t)ct;
		struct tm *tp = localtime ( &tt ) ;
// set the "selected" month for the drop down
char tmp[100];
strftime ( tmp , 100 , "%d %b %Y %H:%M UTC" , tp );
sb->safeStrcpy ( tmp );
sb->setLabel("parm3");
}
//p += gbstrlen ( p );
//return p;
return true;
}
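// Illustrative sketch: for a TYPE_STRING value like   a<b#c   the
// htmlEncode() call above would append something like   a&lt;b&#035;c
// (per the escaping comments above; the exact entities are up to
// SafeBuf::htmlEncode()).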
/*
// returns the size needed to serialize parms
int32_t Parms::getStoredSize() {
int32_t size = 0;
// calling serialize with no ptr gets size
serialize( NULL, &size );
return size;
}
// . serialize parms to buffer
// . accepts addr of buffer ptr and addr of buffer size
// . on entry buf can be NULL to determine required size
// . if buf is not NULL, *bufSize must specify the size of buf
// . on exit *buf is filled with serialized parms
// . on exit *bufSize is set to the actual len of *buf
bool Parms::serialize( char *buf, int32_t *bufSize ) {
g_errno = 0;
if ( ! bufSize ) {
g_errno = EBADENGINEER;
log( "admin: serialize: bad engineer: no bufSize ptr" );
*bufSize = 0;
return false;
}
bool sizeChk = false;
char *end = NULL;
if ( ! buf ) sizeChk = true; // just calc size
else end = buf + *bufSize; // for overrun checking
// serialize OBJ_CONF and OBJ_COLL parms
*bufSize = 0;
char *p = buf;
// now the parms
struct SerParm *sp = NULL;
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
Parm *m = &m_parms[i];
// ignore these:
if ( m->m_obj == OBJ_SI ) continue;
if ( m->m_off < 0 ) continue;
if ( m->m_type == TYPE_COMMENT ) continue;
if ( m->m_type == TYPE_MONOD2 ) continue;
if ( m->m_type == TYPE_MONOM2 ) continue;
if ( m->m_type == TYPE_CMD ) continue;
if ( m->m_type == TYPE_LONG_CONST ) continue;
if ( ! m->m_sync ) continue; // parm is not to be synced
// determine the size of the parm value
int32_t size = 0;
if ( m->m_type == TYPE_CHAR ) size = 1;
if ( m->m_type == TYPE_CHAR2 ) size = 1;
if ( m->m_type == TYPE_CHECKBOX ) size = 1;
if ( m->m_type == TYPE_BOOL ) size = 1;
if ( m->m_type == TYPE_BOOL2 ) size = 1;
if ( m->m_type == TYPE_PRIORITY ) size = 1;
if ( m->m_type == TYPE_PRIORITY2 ) size = 1;
//if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1;
if ( m->m_type == TYPE_RETRIES ) size = 1;
if ( m->m_type == TYPE_TIME ) size = 6;
if ( m->m_type == TYPE_DATE2 ) size = 4;
if ( m->m_type == TYPE_DATE ) size = 4;
if ( m->m_type == TYPE_FLOAT ) size = 4;
if ( m->m_type == TYPE_IP ) size = 4;
if ( m->m_type == TYPE_RULESET ) size = 4;
if ( m->m_type == TYPE_LONG ) size = 4;
if ( m->m_type == TYPE_LONG_LONG ) size = 8;
if ( m->m_type == TYPE_STRING ) size = m->m_size;
if ( m->m_type == TYPE_STRINGBOX ) size = m->m_size;
if ( m->m_type == TYPE_STRINGNONEMPTY ) size = m->m_size;
if ( m->m_type == TYPE_SAFEBUF ) size = m->m_size;
if ( m->m_type == TYPE_SITERULE ) size = 4;
// . set size to the total size of array
		// . set cnt to the number of items
int32_t cnt = 1;
if (m->m_fixed > 0) {
size *= m->m_fixed;
cnt = m->m_fixed;
}
else {
size *= m->m_max;
cnt = m->m_max;
}
if ( m->m_obj == OBJ_CONF ) {
bool overflew = serializeConfParm( m, i, &p, end,
size, cnt,
sizeChk, bufSize );
if ( overflew ) goto overflow;
}
else if ( m->m_obj == OBJ_COLL ) {
collnum_t j = g_collectiondb.getFirstCollnum ();
while ( j >= 0 ) {
CollectionRec *cr = g_collectiondb.getRec( j );
bool overflew = serializeCollParm( cr,
m, i, &p,
end,
size, cnt,
sizeChk,
bufSize );
if ( overflew ) goto overflow;
j = g_collectiondb.getNextCollnum ( j );
}
}
}
if ( ! sizeChk ) {
// set the final marker to 0s to indicate the end
sp = (struct SerParm *)p;
sp->i = 0;
sp->obj = 0;
sp->size = 0;
sp->cnt = 0;
}
*bufSize += sizeof( struct SerParm );
return true;
overflow:
g_errno = EBADENGINEER;
log(LOG_WARN, "admin: serialize: bad engineer: overflow" );
*bufSize = 0;
return false;
}
// . serialize a conf parm
// . if sizeChk is true then we do not serialize, but just get the
// bytes required if we did serialize
// . serialize parm into *p, the cursor i guess, buf end is "end"
bool Parms::serializeConfParm( Parm *m, int32_t i, char **p, char *end,
int32_t size, int32_t cnt,
bool sizeChk, int32_t *bufSz ) {
SerParm *sp = NULL;
// safebuf not supported here yet, but it for coll recs below
// so copy code from there if you need it
if ( m->m_type == TYPE_SAFEBUF ) { char *xx=NULL;*xx=0;}
if (m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_STRINGNONEMPTY ) {
char *sVal = NULL;
if ( ! sizeChk ) {
sp = (SerParm *)*p;
sp->i = i; // index of parm
sp->obj = OBJ_CONF;
sp->size = 0L; // 0 for strings
sp->cnt = cnt; // # of strings
// if an array, get num of member
if ( cnt > 1 ) {
sp->off = m->m_off - sizeof(int32_t);
sp->num = *(int32_t *)((char *)&g_conf
+ sp->off);
}
else {
sp->off = 0;
sp->num = 0;
}
sVal = sp->val;
}
char *sConf = (char *)&g_conf + m->m_off;
int32_t totLen = 0;
int32_t tcnt = cnt;
while ( tcnt ) {
int32_t len = gbstrlen( sConf );
if ( ! sizeChk ) {
// copy the parm value
if ( sVal + len > end )
return true; // overflow
strcpy( sVal, sConf );
}
totLen += len + 1; // incl the NULL
// inc conf ptr by size of strings
sConf += m->m_size;
// inc ser value by len of str + NULL
sVal += len + 1;
tcnt--;
}
if ( ! sizeChk ) {
// inc by tot len of compacted strings
*p += sizeof( *sp ) + totLen;
}
*bufSz += sizeof( SerParm ) + totLen;
}
else {
if ( ! sizeChk ) {
sp = (SerParm *)*p;
sp->i = i;
sp->obj = OBJ_CONF;
sp->size = size; // tot size if array
sp->cnt = cnt; // num of items
// if array, get num of member
if ( cnt > 1 ) {
sp->off = m->m_off - sizeof(int32_t);
sp->num = *(int32_t *)((char *)&g_conf
+ sp->off);
}
else {
sp->off = 0;
sp->num = 0;
}
// copy the parm's whole value
if ( sp->val + size > end )
return true; // overflow
gbmemcpy( sp->val,
(char *)&g_conf + m->m_off, size );
// inc by tot size if array
*p += sizeof( *sp ) + size;
}
*bufSz += sizeof( SerParm ) + size;
}
return false;
}
// . serialize a coll parm in CollectionRec.h
// . if sizeChk is true then we do not serialize, but just get the
// bytes required if we did serialize
// . serialize parm into *p, the cursor i guess, buf end is "end"
bool Parms::serializeCollParm( CollectionRec *cr,
Parm *m, int32_t i, char **p, char *end,
int32_t size, int32_t cnt,
bool sizeChk, int32_t *bufSize) {
SerParm *sp = NULL;
if (m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
char *sVal = NULL;
if ( ! sizeChk ) {
sp = (SerParm *)*p;
sp->i = i; // index of parm
sp->obj = OBJ_COLL;
sp->size = 0L; // 0 for strings
sp->cnt = cnt; // # of strings
			// is this parm an array of parms?
if ( cnt > 1 ) {
// the offset of the "count" or the
// "number of elements" in the array.
				// it precedes the value of the first
				// element, as can be seen for parms in
				// CollectionRec.h.
sp->off = m->m_off - sizeof(int32_t);
			// store the # of them into "num"
sp->num = *(int32_t *)((char *)cr + sp->off);
}
else {
sp->off = 0;
sp->num = 0;
}
sVal = sp->val;
}
// point to the actual parm itself
char *sColl = (char *)cr + m->m_off;
int32_t totLen = 0;
// "cnt" is how many elements in the array
int32_t tcnt = cnt;
while ( tcnt ) {
// the length of the string
int32_t len;
// the string
char *pstr;
// if a safebuf, point to string it has
if ( m->m_type == TYPE_SAFEBUF ) {
SafeBuf *sx = (SafeBuf *)sColl;
pstr = sx->getBuf();
len = sx->length();
if ( ! pstr ) pstr = "";
}
// get length of the string. if not a safebuf it will
// just be an outright string in CollectionRec.h
else {
pstr = sColl;
len = gbstrlen( sColl );
}
if ( ! sizeChk ) {
// copy the string
if ( sVal+len > end ) {
log("parms: buffer too small");
return true;
}
// this puts a \0 at the end
strcpy( sVal, pstr );
}
totLen += len + 1; // incl NULL
// . inc cr ptr by size of strs
// . this is the size of the SafeBuf for TYPE_SAFEBUF
sColl += m->m_size;
// . inc the write cursor by string length + the \0
sVal += len + 1;
tcnt--;
}
if ( ! sizeChk ) {
// inc by tot len of cmpctd str
*p += sizeof( *sp ) + totLen;
}
*bufSize += sizeof( SerParm ) + totLen;
}
else {
if ( ! sizeChk ) {
sp = (SerParm *)*p;
sp->i = i;
sp->obj = OBJ_COLL;
sp->size = size; // tot size
sp->cnt = cnt; // num of items
// get num of member
if ( cnt > 1 ) {
sp->off = m->m_off - sizeof(int32_t);
sp->num = *(int32_t *)((char *)cr + sp->off);
}
else {
sp->off = 0;
sp->num = 0;
}
// copy whole value
if ( sp->val + size > end )
return true;
gbmemcpy( sp->val,
(char *)cr + m->m_off,
size );
// inc by whole size of value
*p += sizeof( *sp ) + size;
}
*bufSize += sizeof( SerParm ) + size;
}
return false;
}
// deserialize parms from buffer and set our values to the new values
void Parms::deserialize( char *buf ) {
g_errno = 0;
char *p = buf;
bool confChgd = false;
SerParm *sp = (SerParm *)p;
int32_t numLooped = 0;
const int32_t MAX_LOOP = (int32_t)(MAX_PARMS*1.5);
// if one of these is non-zero, we're still working
while ( (sp->obj || sp->size || sp->cnt) &&
(sp->obj > 0 && sp->size > 0 && sp->cnt > 0) &&
numLooped < MAX_LOOP ) {
// grab the parm we're working on
if ( sp->i < 0 || sp->i >= m_numParms ) {
log( "admin: invalid parm # in Parms::deserialize" );
char *xx = NULL; *xx = 0;
}
Parm *m = &m_parms[ sp->i ];
if ( sp->obj == OBJ_CONF ) {
deserializeConfParm( m, sp, &p, &confChgd );
sp = (struct SerParm *)p;
}
else if ( sp->obj == OBJ_COLL ) {
collnum_t j = g_collectiondb.getFirstCollnum ();
//if(j <= 0) {
// log("coll: Collectiondb does not have a rec" );
// return;
//}
while ( j >= 0 ) {
CollectionRec *cr = g_collectiondb.getRec( j );
deserializeCollParm( cr,
m, sp, &p );
sp = (SerParm *)p;
j = g_collectiondb.getNextCollnum ( j );
}
}
// setup the next rec
sp = (SerParm *)p;
numLooped++;
}
if (numLooped >= MAX_LOOP) {
log( "admin: infinite loop in Parms::deserialize(). halting!");
char *xx = NULL; *xx = 0;
}
// if we changed the conf, we need to save it
if ( confChgd ) {
g_conf.save ();
}
// if we changed a CollectionRec, we need to save it
int32_t j = g_collectiondb.getFirstCollnum ();
while ( j >= 0 ) {
CollectionRec *cr = g_collectiondb.getRec( j );
if ( cr->m_needsSave ) {
cr->save ();
// so g_spiderCache can reload if sameDomainWait, etc.
// have changed
g_collectiondb.updateTime();
}
j = g_collectiondb.getNextCollnum ( j );
}
}
void Parms::deserializeConfParm( Parm *m, SerParm *sp, char **p,
bool *confChgd ) {
if ( m->m_off + sp->size > (int32_t)sizeof(g_conf) ||
m->m_off + sp->size < 0 ){
log(LOG_WARN, "admin: deserializing parm would overflow "
"the collection rec!");
char *xx =0; *xx = 0;
}
if ( sp->size == 0 ) { // string
char *sVal = sp->val;
char *sConf = (char *)&g_conf + m->m_off;
int32_t totLen = 0;
bool goodParm = true;
int32_t tcnt = sp->cnt;
while ( tcnt ) {
goodParm = (goodParm && 0 == strcmp( sVal, sConf ));
int32_t len = gbstrlen( sVal );
totLen += len + 1;
// inc ser value by len of str + NULL
sVal += len + 1;
// inc conf ptr by size of strings
sConf += m->m_size;
tcnt--;
}
if ( goodParm ) {
// . inc by sizeof rec and tot len of compacted array
*p += sizeof( *sp ) + totLen;
return;
}
// parms don't match
sVal = sp->val;
sConf = (char *)&g_conf + m->m_off;
totLen = 0;
tcnt = sp->cnt;
while ( tcnt ) {
// copy an array value to this parm
strcpy( sConf, sVal );
int32_t len = gbstrlen( sVal );
totLen += len + 1; // incl the NULL
// inc conf ptr by size of strings
sConf += m->m_size;
// inc ser value by len of str + NULL
sVal += len + 1;
tcnt--;
}
// set num of member
if ( sp->off ) {
int32_t *tmp = (int32_t *)((char *)&g_conf + sp->off);
*tmp = sp->num;
}
// log the changed parm
log( LOG_INFO, "admin: Parm "
"#%"INT32" \"%s\" (\"%s\") in conf "
"changed on sync.",
sp->i, m->m_cgi, m->m_title );
*confChgd = true;
// inc by sizeof rec and tot len of compacted array
*p += sizeof( *sp ) + totLen;
}
else {
bool goodParm = ( 0 == memcmp( sp->val,
(char *)&g_conf + m->m_off,
sp->size ) );
if ( ! goodParm ) {
// copy the new parm to m's loc
gbmemcpy( (char *)&g_conf + m->m_off, sp->val,
sp->size );
// set num of member
if ( sp->off ) {
int32_t *tmp = (int32_t *)((char *)&g_conf
+ sp->off);
*tmp = sp->num;
}
// log the changed parm
log( LOG_INFO, "admin: Parm "
"#%"INT32" \"%s\" (\"%s\") in conf "
"changed on sync.",
sp->i, m->m_cgi, m->m_title );
*confChgd = true;
}
// increase by rec size and size of parm
*p += sizeof( *sp ) + sp->size;
}
}
void Parms::deserializeCollParm( CollectionRec *cr,
Parm *m, SerParm *sp, char **p ) {
if ( m->m_off + sp->size > (int32_t)sizeof(CollectionRec) ||
m->m_off + sp->size < 0 ) {
log(LOG_WARN, "admin: deserializing parm would overflow "
"the collection rec!");
char *xx =0; *xx = 0;
}
if ( sp->size == 0 ) { // strings
char *sVal = sp->val; // the sent string buffer i guess
char *sColl = (char *)cr + m->m_off; // what we have
int32_t totLen = 0;
int32_t tcnt = sp->cnt; // # of strings
bool goodParm = true;
while ( tcnt ) {
char *pstr;
if ( m->m_type == TYPE_SAFEBUF ) {
SafeBuf *sx = (SafeBuf *)sColl;
pstr = sx->getBuf();
}
else {
pstr = sColl;
}
// set goodParm to true if unchanged
goodParm= (goodParm && 0 == strcmp(sVal, pstr));
// get length of what was sent to us
int32_t len = gbstrlen( sVal );
totLen += len + 1; //incl NULL
// this is a list of strings with \0s (sent to us)
sVal += len + 1; //incl NULL
// inc by size of strs. point to next string we have
// stored in our array of strings in CollectionRec.
// for TYPE_SAFEBUF this size is sizeof(SafeBuf).
sColl += m->m_size;
tcnt--;
}
// if parm was an exact match return now
if ( goodParm ) {
// . inc by sizeof rec and
// tot len of compacted array
// . skip the SerParm and following string buffer.
*p += sizeof( *sp ) + totLen;
return;
}
//
// if parms don't match, we need to update our stuff
//
//
// point to the sent string buffer
sVal = sp->val;
// point to the local parm, array of strings or safebufs
sColl = (char *)cr + m->m_off;
totLen = 0;
// how many strings or safebufs in there?
tcnt = sp->cnt;
// loop over each one
while ( tcnt ) {
if ( m->m_type == TYPE_SAFEBUF ) {
SafeBuf *sx = (SafeBuf *)sColl;
sx->set ( sVal );
sx->nullTerm ( );
}
else {
// copy an array value to this parm
strcpy( sColl, sVal );
}
// get length of string we copied
int32_t len = gbstrlen( sVal );
totLen += len + 1; // +the NULL
// . inc conf ptr by size
// of strings
sColl += m->m_size;
// . inc ser value by len of str + NULL
sVal += len + 1;
tcnt--;
}
// we changed the record
cr->m_needsSave = true;
// set num of member
if ( sp->off ) {
int32_t *tmp = (int32_t *)((char *)cr + sp->off);
*tmp = sp->num;
}
// log the changed parm
log( LOG_INFO, "admin: Parm "
"#%"INT32" \"%s\" (\"%s\") in "
"collection \"%s\" "
"changed on sync.",
sp->i, m->m_cgi, m->m_title,
cr->m_coll );
// . inc by sizeof rec and
// tot len of compacted array
*p += sizeof( *sp ) + totLen;
}
else {
// sanity
if ( m->m_type == TYPE_SAFEBUF ) { char *xx=NULL;*xx=0; }
if ( 0 != memcmp( sp->val, (char *)cr + m->m_off, sp->size) ) {
// copy the new value
gbmemcpy( (char *)cr + m->m_off,
sp->val,
sp->size );
// set num of member
if ( sp->off ) {
int32_t *tmp = (int32_t *)((char *)cr + sp->off);
*tmp = sp->num;
}
// log the changed parm
log( LOG_INFO, "admin: Parm "
"#%"INT32" \"%s\" (\"%s\") "
"in collection \"%s\" "
"changed on sync.",
sp->i, m->m_cgi,
m->m_title,
cr->m_coll );
// we changed the record
cr->m_needsSave = true;
}
// inc by rec size and tot len of array
*p += sizeof( *sp ) + sp->size;
}
}
*/
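// Wire-format sketch of the retired sync code above: parms were sent as
// consecutive SerParm records -- header fields i (parm index), obj,
// size, cnt, off and num, followed by the raw value bytes in val[] --
// and the stream was terminated by an all-zero record. (Layout inferred
// from the usage in the commented-out code.)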
void Parms::init ( ) {
// initialize the Parms class if we need to, only do it once
static bool s_init = false ;
if ( s_init ) return;
s_init = true ;
// default all
for ( int32_t i = 0 ; i < MAX_PARMS ; i++ ) {
m_parms[i].m_parmNum= i;
m_parms[i].m_hash = 0 ;
m_parms[i].m_title = "" ; // for detecting if not set
m_parms[i].m_desc = "" ; // for detecting if not set
m_parms[i].m_cgi = NULL ; // for detecting if not set
m_parms[i].m_off = -1 ; // for detecting if not set
		// for PAGE_FILTERS: used when printing the url filter
		// profile parm above the url filters table rows.
m_parms[i].m_colspan= -1;
m_parms[i].m_def = NULL ; // for detecting if not set
m_parms[i].m_defOff = -1; // if default pts to collrec parm
m_parms[i].m_type = TYPE_NONE ; // for detecting if not set
m_parms[i].m_page = -1 ; // for detecting if not set
m_parms[i].m_obj = -1 ; // for detecting if not set
m_parms[i].m_max = 1 ; // max elements in array
m_parms[i].m_fixed = 0 ; // size of fixed size array
m_parms[i].m_size = 0 ; // max string size
m_parms[i].m_cast = 1 ; // send to all hosts?
m_parms[i].m_rowid = -1 ; // rowid of -1 means not in row
m_parms[i].m_addin = 0 ; // add insert row command?
m_parms[i].m_rdonly = 0 ; // is command off in read-only mode?
m_parms[i].m_hdrs = 1 ; // assume to always print headers
m_parms[i].m_perms = 0 ; // same as containing WebPages perms
m_parms[i].m_plen = -1 ; // offset for strings length
m_parms[i].m_group = 1 ; // start of a new group of controls?
m_parms[i].m_priv = 0 ; // is it private?
m_parms[i].m_save = 1 ; // save to xml file?
m_parms[i].m_min = -1 ; // min value (for int32_t parms)
// search fields
//m_parms[i].m_sparm = 0;
//m_parms[i].m_scmd = NULL;//"/search";
//m_parms[i].m_scgi = NULL;// defaults to m_cgi
m_parms[i].m_flags = 0;
m_parms[i].m_icon = NULL;
m_parms[i].m_class = NULL;
m_parms[i].m_qterm = NULL;
m_parms[i].m_subMenu= 0;
m_parms[i].m_spriv = 0;
// m_sdefo = -1; // just use m_off for this!
m_parms[i].m_sminc = -1; // min in collection rec
m_parms[i].m_smaxc = -1; // max in collection rec
m_parms[i].m_smin = 0x80000000; // 0xffffffff;
m_parms[i].m_smax = 0x7fffffff;
//m_parms[i].m_soff = -1; // offset into SearchInput
m_parms[i].m_sprpg = 1; // propagate to other pages via GET
m_parms[i].m_sprpp = 1; // propagate to other pages via POST
m_parms[i].m_sync = true;
}
// inherit perms from page
//for ( int32_t i = 1 ; i < MAX_PARMS ; i++ )
// if ( m_parms[i].m_page )
// m_parms[i].m_perms = m_parms[i-1].m_perms;
Parm *m = &m_parms [ 0 ];
CollectionRec cr;
SearchInput si;
///////////////////////////////////////////
// CAN ONLY BE CHANGED IN CONF AT STARTUP (no cgi field)
///////////////////////////////////////////
char *g = (char *)&g_conf;
char *x = (char *)&cr;
char *y = (char *)&si;
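	// The m_off values assigned below are byte offsets relative to the
	// owning object's base address (g, x or y above), so a parm's value
	// can be located generically at runtime. A minimal sketch, mirroring
	// the pointer math used when saving parms above:
	//
	//   char *val = (char *)&g_conf + m->m_off; // OBJ_CONF parms
	//   char *val = (char *)cr      + m->m_off; // OBJ_COLL parms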
//////////////
//
// now for Pages.cpp printApiForPage() we need these
//
//////////////
GigablastRequest gr;
InjectionRequest ir;
/*
m->m_title = "delete collection";
m->m_desc = "A collection name to delete. You can specify multiple "
"&delColl= parms in the request to delete multiple "
"collections.";
m->m_cgi = "delColl";
m->m_page = PAGE_DELCOLL;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
m->m_flags = 0;//PF_API | PF_REQUIRED;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "delete collection";
m->m_desc = "A collection name to delete. You can specify multiple "
"&delColl= parms in the request to delete multiple "
"collections.";
// camelcase as opposed to above lowercase
m->m_cgi = "delcoll";
m->m_page = PAGE_DELCOLL;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
m->m_flags = PF_API | PF_REQUIRED;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "add collection";
m->m_desc = "A collection name to add.";
// camelcase support
m->m_cgi = "addColl";
m->m_page = PAGE_ADDCOLL;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
m->m_flags = PF_API | PF_REQUIRED;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "add collection";
m->m_desc = "A collection name to add.";
// lowercase support
m->m_cgi = "addcoll";
m->m_page = PAGE_ADDCOLL;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
m->m_flags = PF_HIDDEN;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
*/
m->m_title = "collection";
m->m_desc = "Clone settings INTO this collection.";
m->m_cgi = "c";
m->m_page = PAGE_CLONECOLL;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
m->m_flags = PF_API | PF_REQUIRED;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "collection";
m->m_desc = "Use this collection.";
m->m_cgi = "c";
m->m_page = PAGE_BASIC_STATUS;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
m->m_flags = PF_API | PF_REQUIRED;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "collection";
m->m_desc = "Use this collection.";
m->m_cgi = "c";
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
// do not show in html controls
m->m_flags = PF_API | PF_REQUIRED | PF_NOHTML;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "collection";
m->m_desc = "Use this collection.";
m->m_cgi = "c";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
// do not show in html controls
m->m_flags = PF_API | PF_REQUIRED | PF_NOHTML;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "collection";
m->m_desc = "Use this collection.";
m->m_cgi = "c";
m->m_page = PAGE_SPIDERDB;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
// do not show in html controls
m->m_flags = PF_API | PF_REQUIRED | PF_NOHTML;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "collection";
m->m_desc = "Use this collection.";
m->m_cgi = "c";
m->m_page = PAGE_SITEDB;
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
// do not show in html controls
m->m_flags = PF_API | PF_REQUIRED | PF_NOHTML;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "collection";
m->m_desc = "Inject into this collection.";
m->m_cgi = "c";
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
// PF_COLLDEFAULT: so it gets set to default coll on html page
m->m_flags = PF_API|PF_REQUIRED|PF_NOHTML;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
// //
// // more global-ish parms
// //
// m->m_title = "show settings";
// m->m_desc = "show settings or values for this page.";
// m->m_cgi = "showsettings";
// m->m_page = PAGE_MASTER;
// m->m_obj = OBJ_NONE;
// m->m_type = TYPE_BOOL;
// m->m_def = "1";
// // do not show in html controls
// m->m_flags = PF_API | PF_NOHTML;
// m->m_off = (char *)&gr.m_coll - (char *)&gr;
// m++;
////////////
//
// end stuff for printApiForPage()
//
////////////
// just a comment in the conf file
	m->m_desc =
		"All <, >, \" and # characters in the field values "
		"contained herein must be represented as "
		"&lt;, &gt;, &#34; and &#035; respectively.";
m->m_type = TYPE_COMMENT;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
	// if the next guy has no description (m_desc) he is assumed to
	// share the description of the previous parm that has one.
/*
m->m_title = "main external ip";
m->m_desc = "This is the IP and port that a user connects to in "
"order to search this Gigablast network. This should be the "
"same for all gb processes.";
m->m_off = (char *)&g_conf.m_mainExternalIp - g;
m->m_def = "127.0.0.1"; // if no default, it is required!
m->m_type = TYPE_IP;
m++;
m->m_title = "main external port";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_mainExternalPort - g;
m->m_def = "80";
m->m_type = TYPE_LONG;
m++;
*/
/*
m->m_title = "indexdb split";
m->m_desc = "Number of times to split indexdb across groups. "
"Must be a power of 2.";
m->m_off = (char *)&g_hostdb.m_indexSplits - g;
// -1 means to do a full split just based on docid, just like titledb
m->m_def = "-1"; // "1";
m->m_type = TYPE_LONG;
m++;
m->m_title = "full indexdb split";
m->m_desc = "Set to 1 (true) if indexdb is fully split. Performance "
"is much better for fully split indexes.";
m->m_off = (char *)&g_conf.m_fullSplit - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m++;
m->m_title = "legacy indexdb split";
m->m_desc = "Set to 1 (true) if using legacy indexdb splitting. For "
"data generated with farmington release.";
m->m_off = (char *)&g_conf.m_legacyIndexdbSplit - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m++;
m->m_title = "tfndb extension bits";
m->m_desc = "Number of extension bits to use in Tfndb. Increased for "
"large indexes.";
m->m_off = (char *)&g_conf.m_tfndbExtBits - g;
m->m_def = "7";
m->m_type = TYPE_LONG;
m++;
*/
/*
m->m_title = "checksumdb key size";
m->m_desc = "This determines the key size for checksums. "
"Must be set for every host.";
//m->m_cgi = "";
m->m_off = (char *)&g_conf.m_checksumdbKeySize - g;
m->m_type = TYPE_LONG;
m->m_def = "12";
m++;
*/
// just a comment in the conf file
m->m_desc =
"Below the various Gigablast databases are configured.\n"
"<*dbMaxTreeMem> - mem used for holding new recs\n"
"<*dbMaxDiskPageCacheMem> - disk page cache mem for this db\n"
"<*dbMaxCacheMem> - cache mem for holding single recs\n"
//"<*dbMinFilesToMerge> - required # files to trigger merge\n"
"<*dbSaveCache> - save the rec cache on exit?\n"
"<*dbMaxCacheAge> - max age (seconds) for recs in rec cache\n"
"See that Stats page for record counts and stats.\n";
m->m_type = TYPE_COMMENT;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns max cache mem";
m->m_desc = "How many bytes should be used for caching DNS replies?";
m->m_off = (char *)&g_conf.m_dnsMaxCacheMem - g;
m->m_def = "128000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
// g_dnsDistributed always saves now. main.cpp inits it that way.
//m->m_title = "dns save cache";
//m->m_desc = "Should the DNS reply cache be saved/loaded on "
// "exit/startup?";
//m->m_off = (char *)&g_conf.m_dnsSaveCache - g;
//m->m_def = "0";
//m->m_type = TYPE_BOOL;
//m++;
m->m_title = "tagdb max tree mem";
m->m_desc = "A tagdb record "
"assigns a url or site to a ruleset. Each tagdb record is "
"about 100 bytes or so.";
m->m_off = (char *)&g_conf.m_tagdbMaxTreeMem - g;
m->m_def = "1028000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
// m->m_title = "tagdb max page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_tagdbMaxDiskPageCacheMem - g;
// m->m_def = "200000";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;
//m->m_title = "tagdb max cache mem";
//m->m_desc = "";
//m->m_off = (char *)&g_conf.m_tagdbMaxCacheMem - g;
//m->m_def = "128000";
//m->m_type = TYPE_LONG;
//m++;
//m->m_title = "tagdb min files to merge";
//m->m_desc = "";
//m->m_off = (char *)&g_conf.m_tagdbMinFilesToMerge - g;
//m->m_def = "2";
//m->m_type = TYPE_LONG;
//m->m_save = 0;
//m++;
m->m_title = "catdb max tree mem";
m->m_desc = "A catdb record "
"assigns a url or site to DMOZ categories. Each catdb record "
"is about 100 bytes.";
m->m_off = (char *)&g_conf.m_catdbMaxTreeMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
// m->m_title = "catdb max page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_catdbMaxDiskPageCacheMem - g;
// m->m_def = "25000000";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;
m->m_title = "catdb max cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_catdbMaxCacheMem - g;
m->m_def = "0";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "catdb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_catdbMinFilesToMerge - g;
m->m_def = "2";
m->m_type = TYPE_LONG;
m->m_save = 0;
m++;
m->m_title = "revdb max tree mem";
m->m_desc = "Revdb holds the meta list we added for this doc.";
m->m_off = (char *)&g_conf.m_revdbMaxTreeMem - g;
m->m_def = "30000000";
m->m_type = TYPE_LONG;
m++;
*/
/*
m->m_title = "timedb max tree mem";
m->m_desc = "Timedb holds event time intervals";
m->m_off = (char *)&g_conf.m_timedbMaxTreeMem - g;
m->m_def = "30000000";
m->m_type = TYPE_LONG;
m++;
*/
/*
m->m_title = "titledb max tree mem";
m->m_desc = "Titledb holds the compressed documents that have been "
"indexed.";
m->m_off = (char *)&g_conf.m_titledbMaxTreeMem - g;
m->m_def = "10000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "titledb max cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_titledbMaxCacheMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "titledb max cache age";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_titledbMaxCacheAge - g;
m->m_def = "86400"; // 1 day
m->m_type = TYPE_LONG;
m++;
m->m_title = "titledb save cache";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_titledbSaveCache - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m++;
*/
m->m_title = "clusterdb max tree mem";
m->m_desc = "Clusterdb caches small records for site clustering "
"and deduping.";
m->m_off = (char *)&g_conf.m_clusterdbMaxTreeMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "clusterdb max cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_clusterdbMaxCacheMem - g;
m->m_def = "100000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "clusterdb max page cache mem";
m->m_desc = "";
m->m_off =(char *)&g_conf.m_clusterdbMaxDiskPageCacheMem - g;
m->m_def = "100000000";
m->m_type = TYPE_LONG;
m++;
*/
// this is overridden by collection
m->m_title = "clusterdb min files to merge";
m->m_desc = "";
m->m_cgi = "cmftm";
m->m_off = (char *)&g_conf.m_clusterdbMinFilesToMerge - g;
//m->m_def = "2";
m->m_def = "-1"; // -1 means to use collection rec
m->m_type = TYPE_LONG;
m->m_save = 0;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "clusterdb save cache";
m->m_desc = "";
m->m_cgi = "cdbsc";
m->m_off = (char *)&g_conf.m_clusterdbSaveCache - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "max vector cache mem";
m->m_desc = "Max memory for dup vector cache.";
m->m_off = (char *)&g_conf.m_maxVectorCacheMem - g;
m->m_def = "10000000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "checksumdb max tree mem";
m->m_desc = "Checksumdb is used for deduping same-site urls at "
"index time.";
m->m_off = (char *)&g_conf.m_checksumdbMaxTreeMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "checksumdb max cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_checksumdbMaxCacheMem - g;
m->m_def = "2000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "checksumdb max page cache mem";
m->m_desc = "";
m->m_off =(char *)&g_conf.m_checksumdbMaxDiskPageCacheMem-g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m++;
// this is overridden by collection
m->m_title = "checksumdb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_checksumdbMinFilesToMerge- g;
//m->m_def = "2";
m->m_def = "-1"; // -1 means to use collection rec
m->m_type = TYPE_LONG;
m->m_save = 0;
m++;
*/
/*
m->m_title = "tfndb max tree mem";
m->m_desc = "Tfndb holds small records for each url in Spiderdb or "
"Titledb.";
m->m_off = (char *)&g_conf.m_tfndbMaxTreeMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "tfndb max page cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_tfndbMaxDiskPageCacheMem - g;
m->m_def = "5000000";
m->m_type = TYPE_LONG;
m++;
*/
/*
// this is overridden by collection
m->m_title = "tfndb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_tfndbMinFilesToMerge - g;
m->m_def = "2";
m->m_type = TYPE_LONG;
m->m_save = 0;
m++;
*/
/*
m->m_title = "spiderdb max tree mem";
m->m_desc = "Spiderdb holds urls to be spidered.";
m->m_off = (char *)&g_conf.m_spiderdbMaxTreeMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "spiderdb max cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_spiderdbMaxCacheMem - g;
m->m_def = "0";
m->m_type = TYPE_LONG;
m++;
m->m_title = "spiderdb max page cache mem";
m->m_desc = "";
m->m_off =(char *)&g_conf.m_spiderdbMaxDiskPageCacheMem-g;
m->m_def = "500000";
m->m_type = TYPE_LONG;
m++;
// this is overridden by collection
m->m_title = "spiderdb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_spiderdbMinFilesToMerge - g;
//m->m_def = "2";
m->m_def = "-1"; // -1 means to use collection rec
m->m_type = TYPE_LONG;
m->m_save = 0;
m++;
*/
m->m_title = "robotdb max cache mem";
	m->m_desc = "Robotdb caches robots.txt files.";
m->m_off = (char *)&g_conf.m_robotdbMaxCacheMem - g;
m->m_def = "128000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "robotdb save cache";
m->m_cgi = "rdbsc";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_robotdbSaveCache - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m->m_flags = PF_NOAPI;
m++;
/*
m->m_title = "indexdb max tree mem";
m->m_desc = "Indexdb holds the terms extracted from spidered "
"documents.";
m->m_off = (char *)&g_conf.m_indexdbMaxTreeMem - g;
m->m_def = "10000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "indexdb max cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_indexdbMaxCacheMem - g;
m->m_def = "5000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "indexdb max page cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_indexdbMaxDiskPageCacheMem - g;
m->m_def = "50000000";
m->m_type = TYPE_LONG;
m++;
*/
// m->m_title = "linkdb max page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_linkdbMaxDiskPageCacheMem - g;
// m->m_def = "0";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;
/*
// this is overridden by collection
m->m_title = "indexdb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_indexdbMinFilesToMerge - g;
//m->m_def = "6";
m->m_def = "-1"; // -1 means to use collection rec
m->m_type = TYPE_LONG;
m->m_save = 0;
m++;
m->m_title = "indexdb max index list age";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_indexdbMaxIndexListAge - g;
m->m_def = "60";
m->m_type = TYPE_LONG;
m++;
//m->m_title = "indexdb truncation limit";
//m->m_desc = "";
//m->m_off = (char *)&g_conf.m_indexdbTruncationLimit - g;
//m->m_def = "50000000";
//m->m_type = TYPE_LONG;
//m++;
m->m_title = "indexdb save cache";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_indexdbSaveCache - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m++;
*/
/*
m->m_title = "datedb max tree mem";
m->m_desc = "Datedb holds the terms extracted from spidered "
"documents.";
m->m_off = (char *)&g_conf.m_datedbMaxTreeMem - g;
m->m_def = "10000000";
m->m_type = TYPE_LONG;
m++;
m->m_title = "datedb max cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_datedbMaxCacheMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m++;
// this is overridden by collection
m->m_title = "datedb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_datedbMinFilesToMerge - g;
//m->m_def = "8";
m->m_def = "-1"; // -1 means to use collection rec
m->m_type = TYPE_LONG;
m->m_save = 0;
m++;
m->m_title = "datedb max index list age";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_datedbMaxIndexListAge - g;
m->m_def = "60";
m->m_type = TYPE_LONG;
m++;
m->m_title = "datedb save cache";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_datedbSaveCache - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m++;
*/
/*
m->m_title = "linkdb max tree mem";
m->m_desc = "Linkdb stores linking information";
m->m_off = (char *)&g_conf.m_linkdbMaxTreeMem - g;
m->m_def = "20000000";
m->m_type = TYPE_LONG;
m++;
// this is overridden by collection
m->m_title = "linkdb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_linkdbMinFilesToMerge - g;
m->m_def = "-1"; // -1 means to use collection rec
m->m_type = TYPE_LONG;
//m->m_save = 0;
m++;
*/
/*
m->m_title = "quota table max mem";
m->m_desc = "For caching and keeping tabs on exact quotas per "
"domain without having to do a disk seek. If you are using "
"exact quotas and see a lot of disk seeks on Indexdb, try "
"increasing this.";
m->m_off = (char *)&g_conf.m_quotaTableMaxMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m++;
*/
m->m_title = "statsdb max tree mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_statsdbMaxTreeMem - g;
m->m_def = "5000000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "statsdb max cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_statsdbMaxCacheMem - g;
m->m_def = "0";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
// m->m_title = "statsdb max disk page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_statsdbMaxDiskPageCacheMem - g;
// m->m_def = "1000000";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;
//m->m_title = "statsdb min files to merge";
//m->m_desc = "";
//m->m_off = (char *)&g_conf.m_statsdbMinFilesToMerge - g;
//m->m_def = "5";
//m->m_type = TYPE_LONG;
//m++;
/*
m->m_title = "use buckets for in memory recs";
m->m_desc = "Use buckets for in memory recs for indexdb, datedb, "
"and linkdb.";
m->m_off = (char *)&g_conf.m_useBuckets - g;
m->m_def = "1";
m->m_type = TYPE_BOOL;
m++;
*/
m->m_title = "http max send buf size";
	m->m_desc = "Maximum bytes of a doc that can be sent before having "
		"to read more from disk.";
m->m_cgi = "hmsbs";
m->m_off = (char *)&g_conf.m_httpMaxSendBufSize - g;
m->m_def = "128000";
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "search results max cache mem";
m->m_desc = "Bytes to use for caching search result pages.";
m->m_off = (char *)&g_conf.m_searchResultsMaxCacheMem - g;
m->m_def = "100000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
//m->m_title = "search results max cache age";
//m->m_desc = "Maximum age to cache search results page in seconds.";
//m->m_off = (char *)&g_conf.m_searchResultsMaxCacheAge - g;
//m->m_def = "86400";
//m->m_type = TYPE_LONG;
//m++;
//m->m_title = "search results save cache";
//m->m_desc = "Should the search results cache be saved to disk?";
//m->m_off = (char *)&g_conf.m_searchResultsSaveCache - g;
//m->m_def = "0";
//m->m_type = TYPE_BOOL;
//m++;
//m->m_title = "site link info max cache mem";
//m->m_desc = "Bytes to use for site link info data.";
//m->m_off = (char *)&g_conf.m_siteLinkInfoMaxCacheMem - g;
//m->m_def = "100000";
//m->m_type = TYPE_LONG;
//m++;
//m->m_title = "site link info max cache age";
//m->m_desc = "Maximum age to cache site link info data in seconds.";
//m->m_off = (char *)&g_conf.m_siteLinkInfoMaxCacheAge - g;
//m->m_def = "3600";
//m->m_type = TYPE_LONG;
//m++;
//m->m_title = "site link info save cache";
//m->m_desc = "Should the site link info cache be saved to disk?";
//m->m_off = (char *)&g_conf.m_siteLinkInfoSaveCache - g;
//m->m_def = "0";
//m->m_type = TYPE_BOOL;
//m++;
//m->m_title = "site quality max cache mem";
//m->m_desc = "Bytes to use for site or root page quality.";
//m->m_off = (char *)&g_conf.m_siteQualityMaxCacheMem - g;
//m->m_def = "2000000"; // 2MB
//m->m_type = TYPE_LONG;
//m++;
//m->m_title = "site quality save cache";
//m->m_desc = "Should the site link info cache be saved to disk?";
//m->m_off = (char *)&g_conf.m_siteQualitySaveCache - g;
//m->m_def = "0";
//m->m_type = TYPE_BOOL;
//m++;
//m->m_title = "max incoming links to sample";
//m->m_desc = "Max linkers to a doc that are sampled to determine "
// "quality and for gathering link text.";
//m->m_off = (char *)&g_conf.m_maxIncomingLinksToSample - g;
//m->m_def = "100";
//m->m_type = TYPE_LONG;
//m++;
//m->m_title = "allow async signals";
//m->m_desc = "Allow software interrupts?";
//m->m_off = (char *)&g_conf.m_allowAsyncSignals - g;
//m->m_def = "1";
//m->m_type = TYPE_BOOL;
//m++;
/*
m->m_title = "qa build mode";
m->m_desc = "When on Msg13.cpp saves docs in the qatest123 coll "
"to qa/ subdir, when off "
"if downloading a doc for qatest123 coll and not in "
"qa subdir then it returns a 404.";
m->m_cgi = "qabuildmode";
m->m_off = (char *)&g_conf.m_qaBuildMode - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m->m_flags = PF_NOAPI | PF_HIDDEN;
m++;
*/
m->m_title = "read only mode";
m->m_desc = "Read only mode does not allow spidering.";
m->m_cgi = "readonlymode";
m->m_off = (char *)&g_conf.m_readOnlyMode - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m->m_flags = PF_NOAPI;
m++;
/*
Disable this until it works.
m->m_title = "use merge token";
m->m_desc = "Restrict merging to one host per token group? Hosts "
"that use the same disk and mirror hosts are generally in the "
"same token group so that only one host in the group can be "
"doing a merge at a time. This prevents query response time "
"from suffering too much.";
m->m_off = (char *)&g_conf.m_useMergeToken - g;
m->m_def = "1";
m->m_type = TYPE_BOOL;
m++;
*/
/*
m->m_title = "do spell checking";
m->m_desc = "Spell check using the dictionary. Will be available "
"again soon.";
m->m_off = (char *)&g_conf.m_doSpellChecking - g;
m->m_cgi = "dospellchecking";
m->m_def = "1";
m->m_type = TYPE_BOOL;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_CONF;
m++;
*/
m->m_title = "do narrow search";
	m->m_desc = "Give narrow search suggestions.";
m->m_off = (char *)&g_conf.m_doNarrowSearch - g;
m->m_cgi = "donarrowsearch";
m->m_def = "0";
m->m_type = TYPE_BOOL;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_CONF;
m++;
///////////////////////////////////////////
// BASIC SETTINGS
///////////////////////////////////////////
m->m_title = "spidering enabled";
	m->m_desc = "Pauses and resumes spidering for this collection.";
m->m_cgi = "bcse";
m->m_off = (char *)&cr.m_spideringEnabled - x;
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_DUP|PF_CLONE;
m++;
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"See <a href=#examples>example site list</a> below. "
"<br>"
"<br>"
"Example #1: <b>mysite.com myothersite.com</b>"
"<br>"
"<i>This will spider just those two sites.</i>"
"<br>"
"<br>"
"Example #2: <b>seed:dmoz.org</b>"
"<br>"
"<i>This will spider the whole web starting with the website "
"dmoz.org</i>"
"<br><br>"
"Gigablast uses the "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
"the <a href=/admin/filters>url filters</a> "
"page to make sure that the spider only indexes urls "
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
m->m_cgi = "sitelist";
m->m_off = (char *)&cr.m_siteListBuf - x;
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_SAFEBUF;
m->m_func = CommandUpdateSiteList;
m->m_def = "";
// rebuild urlfilters now will nuke doledb and call updateSiteList()
m->m_flags = PF_TEXTAREA | PF_DUP | PF_REBUILDURLFILTERS;
m++;
/*
m->m_title = "spider sites";
m->m_desc = "Attempt to spider and index urls in the "
"\"site patterns\" above. Saves you from having to add "
"the same list of sites on the <a href=/admin/addurl>"
"add url</a> page.";
m->m_cgi = "spiderToo";
m->m_off = (char *)&cr.m_spiderToo - x;
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_flags = PF_NOSAVE | PF_DUP;
m++;
*/
/*
// the new upload post submit button
m->m_title = "upload site list";
m->m_desc = "Upload your file of site patterns. Completely replaces "
"the site list in the text box above.";
m->m_cgi = "uploadsitelist";
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
m->m_off = 0;
m->m_def = NULL;
m->m_type = TYPE_FILEUPLOADBUTTON;
m->m_flags = PF_NOSAVE | PF_DUP;
m++;
*/
m->m_title = "restart collection";
	m->m_desc = "Remove all documents from the collection and re-add "
		"seed urls from the site list.";
// If you do this accidentally there "
//"is a <a href=/faq.html#recover>recovery procedure</a> to "
// "get back the trashed data.";
m->m_cgi = "restart";
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_CMD;
m->m_func2 = CommandRestartColl;
m++;
///////////////////////////////////////////
// SITE LIST
///////////////////////////////////////////
/*
m->m_title = "spider sites";
m->m_desc = "Attempt to spider and index urls in the "
"\"site patterns\" above. Saves you from having to add "
"the same list of sites on the <a href=/admin/addurl>"
"add url</a> page.";
m->m_cgi = "spiderToo";
m->m_off = (char *)&cr.m_spiderToo - x;
m->m_page = PAGE_SITES;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_flags = PF_NOSAVE ;
m++;
*/
///////////////////////////////////////////
// SYNC CONTROLS
///////////////////////////////////////////
/*
m->m_title = "sync enabled";
m->m_desc = "Turn data synchronization on or off. When a host comes "
"up he will perform an incremental synchronization with a "
"twin if he detects that he was unable to save his data "
"when he last exited.";
m->m_cgi = "sye";
m->m_off = (char *)&g_conf.m_syncEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SYNC;
m++;
m->m_title = "dry run";
m->m_desc = "Should Gigablast just run through and log the changes "
"it would make without actually making them?";
m->m_cgi = "sdr";
m->m_off = (char *)&g_conf.m_syncDryRun - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "sync indexdb";
m->m_desc = "Turn data synchronization on or off for indexdb. "
"Indexdb holds the index information.";
m->m_cgi = "si";
m->m_off = (char *)&g_conf.m_syncIndexdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
m->m_title = "sync logging";
m->m_desc = "Log fixes?";
m->m_cgi = "slf";
m->m_off = (char *)&g_conf.m_syncLogging - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "union titledb and spiderdb";
m->m_desc = "If a host being sync'd has a title record (cached web "
"page) that the "
"remote host does not, normally, it would be deleted. "
"But if this is true then it is kept. "
"Useful for reducing title rec not found errors.";
m->m_cgi = "sdu";
m->m_off = (char *)&g_conf.m_syncDoUnion - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "force out of sync";
m->m_desc = "Forces this host to be out of sync.";
m->m_cgi = "foos";
m->m_type = TYPE_CMD;
m->m_func = CommandForceOutOfSync;
m->m_cast = 0;
m++;
m->m_title = "bytes per second";
m->m_desc = "How many bytes to read per second for syncing. "
"Decrease to reduce impact of syncing on query "
"response time.";
m->m_cgi = "sbps";
m->m_off = (char *)&g_conf.m_syncBytesPerSecond - g;
m->m_type = TYPE_LONG;
m->m_def = "10000000";
m->m_units = "bytes";
m++;
*/
/////////////////////
//
// DIFFBOT CRAWLBOT PARMS
//
//////////////////////
///////////
//
// DO NOT INSERT parms above here, unless you set
// m_obj = OBJ_COLL !!! otherwise it thinks it belongs to
// OBJ_CONF as used in the above parms.
//
///////////
m->m_cgi = "dbtoken";
m->m_xml = "diffbotToken";
m->m_off = (char *)&cr.m_diffbotToken - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "createdtime";
m->m_xml = "collectionCreatedTime";
m->m_desc = "Time when this collection was created, or time of "
"the last reset or restart.";
m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = PF_NOAPI;//PF_DIFFBOT; no i want to saveToXml
m++;
m->m_cgi = "spiderendtime";
m->m_xml = "crawlEndTime";
	m->m_desc = "If the spider is done, the time at which it finished.";
m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = PF_NOAPI;//PF_DIFFBOT; no i want to saveToXml
m++;
m->m_cgi = "dbcrawlname";
m->m_xml = "diffbotCrawlName";
m->m_off = (char *)&cr.m_diffbotCrawlName - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "notifyEmail";
m->m_title = "notify email";
m->m_xml = "notifyEmail";
m->m_off = (char *)&cr.m_notifyEmail - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "notifyWebhook";
m->m_xml = "notifyWebhook";
m->m_title = "notify webhook";
m->m_off = (char *)&cr.m_notifyUrl - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_DIFFBOT;
m++;
// collective respider frequency (for pagecrawlbot.cpp)
m->m_title = "collective respider frequency (days)";
m->m_cgi = "repeat";
m->m_xml = "collectiveRespiderFrequency";
m->m_off = (char *)&cr.m_collectiveRespiderFrequency - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0.0"; // 0.0
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_units = "days";
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
m->m_title = "collective crawl delay (seconds)";
m->m_cgi = "crawlDelay";
m->m_xml = "collectiveCrawlDelay";
m->m_off = (char *)&cr.m_collectiveCrawlDelay - x;
m->m_type = TYPE_FLOAT;
m->m_def = ".250"; // 250 ms
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m->m_units = "seconds";
m++;
m->m_cgi = "urlCrawlPattern";
m->m_xml = "diffbotUrlCrawlPattern";
m->m_title = "url crawl pattern";
m->m_off = (char *)&cr.m_diffbotUrlCrawlPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
m->m_cgi = "urlProcessPattern";
m->m_xml = "diffbotUrlProcessPattern";
m->m_title = "url process pattern";
m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
m->m_cgi = "pageProcessPattern";
m->m_xml = "diffbotPageProcessPattern";
m->m_title = "page process pattern";
m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
m->m_cgi = "urlCrawlRegEx";
m->m_xml = "diffbotUrlCrawlRegEx";
m->m_title = "url crawl regex";
m->m_off = (char *)&cr.m_diffbotUrlCrawlRegEx - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
m->m_cgi = "urlProcessRegEx";
m->m_xml = "diffbotUrlProcessRegEx";
m->m_title = "url process regex";
m->m_off = (char *)&cr.m_diffbotUrlProcessRegEx - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
m->m_cgi = "maxHops";
m->m_xml = "diffbotHopcount";
m->m_title = "diffbot max hopcount";
m->m_off = (char *)&cr.m_diffbotMaxHops - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "-1";
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
m->m_cgi = "onlyProcessIfNew";
m->m_xml = "diffbotOnlyProcessIfNew";
m->m_title = "onlyProcessIfNew";
m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNewUrl - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "1";
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
m->m_cgi = "seeds";
m->m_xml = "diffbotSeeds";
m->m_off = (char *)&cr.m_diffbotSeeds - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_flags = PF_DIFFBOT;
m->m_def = "";
m++;
m->m_xml = "isCustomCrawl";
m->m_off = (char *)&cr.m_isCustomCrawl - x;
m->m_type = TYPE_CHAR;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_cgi = "isCustomCrawl";
m->m_def = "0";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "maxToCrawl";
m->m_title = "max to crawl";
m->m_xml = "maxToCrawl";
m->m_off = (char *)&cr.m_maxToCrawl - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "100000";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "maxToProcess";
m->m_title = "max to process";
m->m_xml = "maxToProcess";
m->m_off = (char *)&cr.m_maxToProcess - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "-1";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "maxRounds";
m->m_title = "max crawl rounds";
m->m_xml = "maxCrawlRounds";
m->m_off = (char *)&cr.m_maxCrawlRounds - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "-1";
m->m_flags = PF_DIFFBOT;
m++;
/////////////////////
//
// new cmd parms
//
/////////////////////
m->m_title = "insert parm row";
m->m_desc = "insert a row into a parm";
m->m_cgi = "insert";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_func = CommandInsertUrlFiltersRow;
m->m_cast = 1;
m->m_flags = PF_REBUILDURLFILTERS;
m++;
m->m_title = "remove parm row";
m->m_desc = "remove a row from a parm";
m->m_cgi = "remove";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_func = CommandRemoveUrlFiltersRow;
m->m_cast = 1;
m->m_flags = PF_REBUILDURLFILTERS;
m++;
m->m_title = "delete collection";
m->m_desc = "delete a collection";
m->m_cgi = "delete";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_func2 = CommandDeleteColl;
m->m_cast = 1;
m++;
m->m_title = "delete collection 2";
m->m_desc = "delete the specified collection";
m->m_cgi = "delColl";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_func2 = CommandDeleteColl2;
m->m_cast = 1;
m++;
m->m_title = "delete collection";
m->m_desc = "Delete the specified collection. You can specify "
"multiple &delcoll= parms in a single request to delete "
"multiple collections at once.";
// lowercase as opposed to camelcase above
m->m_cgi = "delcoll";
m->m_type = TYPE_CMD;
m->m_page = PAGE_DELCOLL;
m->m_obj = OBJ_COLL;
m->m_func2 = CommandDeleteColl2;
m->m_cast = 1;
m->m_flags = PF_API | PF_REQUIRED;
m++;
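// Illustrative only -- a hypothetical request deleting two
// collections in one shot via repeated &delcoll= parms (the
// /admin/delcoll path is an assumption based on the page name):
//
//   GET /admin/delcoll?delcoll=oldcoll1&delcoll=oldcoll2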
// arg is the collection # to clone from
m->m_title = "clone collection";
m->m_desc = "Clone collection settings FROM this collection.";
m->m_cgi = "clonecoll";
m->m_type = TYPE_CMD;
m->m_page = PAGE_CLONECOLL;
m->m_obj = OBJ_COLL;
m->m_func = CommandCloneColl;
m->m_cast = 1;
m->m_flags = PF_API | PF_REQUIRED;
m++;
m->m_title = "add collection";
m->m_desc = "add a new collection";
// camelcase support
m->m_cgi = "addColl";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_func = CommandAddColl0;
m->m_cast = 1;
m++;
m->m_title = "add collection";
m->m_desc = "Add a new collection with this name. No spaces "
"allowed or strange characters allowed. Max of 64 characters.";
// lower case support
m->m_cgi = "addcoll";
m->m_type = TYPE_CMD;
m->m_page = PAGE_ADDCOLL;
m->m_obj = OBJ_COLL;
m->m_func = CommandAddColl0;
m->m_cast = 1;
m->m_flags = PF_API | PF_REQUIRED;
m++;
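// Illustrative only -- a hypothetical add-collection request
// (the /admin/addcoll path is an assumption based on the page
// name; the collection name must obey the 64-char rule above):
//
//   GET /admin/addcoll?addcoll=mynewcoll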
//
// CLOUD SEARCH ENGINE SUPPORT
//
// used to prevent a guest ip adding more than one coll
m->m_title = "user ip";
m->m_desc = "IP of user adding collection.";
m->m_cgi = "userip";
m->m_xml = "userIp";
m->m_off = (char *)&cr.m_userIp - x;
m->m_type = TYPE_STRING;
m->m_size = 16;
m->m_def = "";
m->m_group = 0;
m->m_flags = PF_HIDDEN;// | PF_NOSAVE;
m->m_page = PAGE_ADDCOLL;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "add custom crawl";
m->m_desc = "add custom crawl";
m->m_cgi = "addCrawl";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_func = CommandAddColl1;
m->m_cast = 1;
m++;
m->m_title = "add bulk job";
m->m_desc = "add bulk job";
m->m_cgi = "addBulk";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_func = CommandAddColl2;
m->m_cast = 1;
m++;
m->m_title = "in sync";
m->m_desc = "signify in sync with host 0";
m->m_cgi = "insync";
m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_func = CommandInSync;
m->m_cast = 1;
m++;
///////////////////////////////////////////
// SEARCH CONTROLS
///////////////////////////////////////////
//m->m_title = "allow RAID style list intersection";
//m->m_desc = "Allow using RAID style lookup for intersecting term "
// "lists and getting docIds for queries.";
//m->m_cgi = "uraid";
//m->m_off = (char *)&cr.m_allowRaidLookup - x;
//m->m_type = TYPE_BOOL;
//m->m_def = "0";
//m++;
//m->m_title = "allow RAIDed term list read";
//m->m_desc = "Allow splitting up the term list read for large lists "
// "amongst twins.";
//m->m_cgi = "ulraid";
//m->m_off = (char *)&cr.m_allowRaidListRead - x;
//m->m_type = TYPE_BOOL;
//m->m_def = "0";
//m->m_group = 0;
//m++;
//m->m_title = "max RAID mercenaries";
//m->m_desc = "Max number of mercenaries to use in RAID lookup and "
// "intersection.";
//m->m_cgi = "raidm";
//m->m_off = (char *)&cr.m_maxRaidMercenaries - x;
//m->m_type = TYPE_LONG;
//m->m_def = "2";
//m->m_group = 0;
//m++;
//m->m_title = "min term list size to RAID";
//m->m_desc = "Term list size to begin doing term list RAID";
//m->m_cgi = "raidsz";
//m->m_off = (char *)&cr.m_minRaidListSize - x;
//m->m_type = TYPE_LONG;
//m->m_def = "1000000";
//m->m_group = 0;
//m++;
m->m_title = "restrict indexdb for queries";
m->m_desc = "If this is true Gigablast will only search the root "
"index file for docIds. Saves on disk seeks, "
"but may use older versions of indexed web pages.";
m->m_cgi = "riq";
m->m_off = (char *)&cr.m_restrictIndexdbForQuery - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m->m_def = "0";
//m->m_sparm = 1;
//m->m_scgi = "ri";
//m->m_soff = (char *)&si.m_restrictIndexdbForQuery - y;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "restrict indexdb for xml feed";
m->m_desc = "Like above, but specifically for XML feeds.";
m->m_cgi = "rix";
m->m_off = (char *)&cr.m_restrictIndexdbForXML - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
//m->m_title = "restrict indexdb for queries in xml feed";
//m->m_desc = "Same as above, but just for the XML feed.";
//m->m_cgi = "riqx";
//m->m_off = (char *)&cr.m_restrictIndexdbForQueryRaw - x;
//m->m_type = TYPE_BOOL;
//m->m_def = "1";
//m->m_group = 0;
//m++;
m->m_title = "read from cache by default";
m->m_desc = "Should we read search results from the cache? Set "
"to false to fix dmoz bug.";
m->m_cgi = "rcd";
m->m_off = (char *)&cr.m_rcache - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "fast results";
m->m_desc = "Use &fast=1 to obtain seach results from the much "
"faster Gigablast index, although the results are not "
"searched as thoroughly.";
m->m_obj = OBJ_SI;
m->m_page = PAGE_RESULTS;
m->m_off = (char *)&si.m_query - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_def = "0";
m->m_cgi = "fast";
//m->m_size = MAX_QUERY_LEN;
m->m_flags = PF_COOKIE | PF_WIDGET_PARM | PF_API;
m++;
m->m_title = "query";
m->m_desc = "The query to perform. See <a href=/help.html>help</a>. "
"See the <a href=#qops>query operators</a> below for "
"more info.";
m->m_obj = OBJ_SI;
m->m_page = PAGE_RESULTS;
m->m_off = (char *)&si.m_query - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "q";
//m->m_size = MAX_QUERY_LEN;
m->m_flags = PF_REQUIRED | PF_COOKIE | PF_WIDGET_PARM | PF_API;
m++;
// m->m_title = "query2";
// m->m_desc = "The query on which to score inlinkers.";
// m->m_obj = OBJ_SI;
// m->m_page = PAGE_NONE;
// m->m_off = (char *)&si.m_query2 - y;
// m->m_type = TYPE_CHARPTR;//STRING;
// m->m_cgi = "qq";
// m->m_size = MAX_QUERY_LEN;
// m->m_sprpg = 0; // do not store query, needs to be last so related
// m->m_sprpp = 0; // topics can append to it
// m->m_flags = PF_HIDDEN | PF_NOSAVE;
// m++;
m->m_title = "collection";
m->m_desc = "Search this collection. Use multiple collection names "
"separated by a whitespace to search multiple collections at "
"once.";
m->m_cgi = "c";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
m->m_flags = PF_API | PF_REQUIRED;
m->m_off = (char *)&si.m_coll - y;
m++;
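// Illustrative only -- a hypothetical query searching two
// collections at once; the whitespace separator must be
// url-encoded (shown here as +), and /search is an assumed
// endpoint for the results page:
//
//   GET /search?q=gigablast&c=main+news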
m->m_title = "number of results per query";
m->m_desc = "The number of results returned per page.";
// make it 25 not 50 since we only have like 26 balloons
m->m_def = "10";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_off = (char *)&si.m_docsWanted - y;
m->m_type = TYPE_LONG;
m->m_cgi = "n";
m->m_flags = PF_WIDGET_PARM | PF_API;
m->m_smin = 0;
m++;
m->m_title = "first result num";
m->m_desc = "Start displaying at search result #X. Starts at 0.";
m->m_def = "0";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_off = (char *)&si.m_firstResultNum - y;
m->m_type = TYPE_LONG;
m->m_cgi = "s";
m->m_smin = 0;
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_flags = PF_REDBOX;
m++;
m->m_title = "show errors";
m->m_desc = "Show errors from generating search result summaries "
"rather than just hide the docid. Useful for debugging.";
m->m_cgi = "showerrors";
m->m_off = (char *)&si.m_showErrors - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "site cluster";
m->m_desc = "Should search results be site clustered? This "
"limits each site to appearing at most twice in the "
"search results. Sites are subdomains for the most part, "
"like abc.xyz.com.";
m->m_cgi = "sc";
m->m_off = (char *)&si.m_doSiteClustering - y;
m->m_defOff= (char *)&cr.m_siteClusterByDefault - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "hide all clustered results";
m->m_desc = "Only display at most one result per site.";
m->m_cgi = "hacr";
m->m_off = (char *)&si.m_hideAllClustered - y;
m->m_defOff= (char *)&cr.m_hideAllClustered - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;
m++;
m->m_title = "dedup results";
m->m_desc = "Should duplicate search results be removed? This is "
"based on a content hash of the entire document. "
"So documents must be exactly the same for the most part.";
m->m_cgi = "dr"; // dedupResultsByDefault";
m->m_off = (char *)&si.m_doDupContentRemoval - y;
m->m_defOff= (char *)&cr.m_dedupResultsByDefault - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 1;
m->m_cgi = "dr";
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "percent similar dedup summary";
m->m_desc = "If document summary (and title) are "
"this percent similar "
"to a document summary above it, then remove it from the "
"search results. 100 means only to remove if exactly the "
"same. 0 means no summary deduping. You must also supply "
"dr=1 for this to work.";
m->m_cgi = "pss";
m->m_off = (char *)&si.m_percentSimilarSummary - y;
m->m_defOff= (char *)&cr.m_percentSimilarSummary - x;
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_smin = 0;
m->m_smax = 100;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
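// Illustrative only -- summary deduping requires dr=1, so a
// hypothetical query dropping results whose summary/title is
// 80% or more similar to an earlier one would look like:
//
//   GET /search?q=gigablast&dr=1&pss=80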
m->m_title = "dedup URLs";
m->m_desc = "Should we dedup URLs with case insensitivity? This is "
"mainly to correct duplicate wiki pages.";
m->m_cgi = "ddu";
m->m_off = (char *)&si.m_dedupURL - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "do spell checking";
m->m_desc = "If enabled while using the XML feed, "
"when Gigablast finds a spelling recommendation it will be "
"included in the XML <spell> tag. Default is 0 if using an "
"XML feed, 1 otherwise. Will be availble again soon.";
m->m_cgi = "spell";
m->m_off = (char *)&si.m_spellCheck - y;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = "1";
m->m_flags = PF_API;
m++;
m->m_title = "stream search results";
m->m_desc = "Stream search results back on socket as they arrive. "
"Useful when thousands/millions of search results are "
"requested. Required when doing such things otherwise "
"Gigablast could run out of memory. Only supported for "
"JSON and XML formats, not HTML.";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_off = (char *)&si.m_streamResults - y;
m->m_type = TYPE_CHAR;
m->m_def = "0";
m->m_cgi = "stream";
m->m_flags = PF_API;
m->m_sprpg = 0; // propagate to next 10
m->m_sprpp = 0;
m++;
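// Illustrative only -- a hypothetical request streaming a huge
// result set back as it is generated. The format=json parm is
// an assumption here; n and stream are defined above:
//
//   GET /search?q=gigablast&n=1000000&stream=1&format=json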
m->m_title = "seconds back";
m->m_desc = "Limit results to pages spidered this many seconds ago. "
"Use 0 to disable.";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_off = (char *)&si.m_secsBack - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_cgi = "secsback";
m->m_flags = PF_API;
m++;
m->m_title = "sort by";
m->m_desc = "Use 0 to sort results by relevance, 1 to sort by "
"most recent spider date down, and 2 to sort by oldest "
"spidered results first.";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_off = (char *)&si.m_sortBy - y;
m->m_type = TYPE_CHAR;
m->m_def = "0"; // this means relevance
m->m_cgi = "sortby";
m->m_flags = PF_API;
m++;
m->m_title = "filetype";
m->m_desc = "Restrict results to this filetype. Supported "
"filetypes are pdf, doc, html xml, json, xls.";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_off = (char *)&si.m_filetype - y;
m->m_type = TYPE_CHARPTR;
m->m_def = "";
m->m_cgi = "filetype";
m->m_flags = PF_API;
m++;
m->m_title = "get scoring info";
m->m_desc = "Get scoring information for each result so you "
"can see how each result is scored. You must explicitly "
"request this using &scores=1 for the XML feed because it "
"is not included by default.";
m->m_cgi = "scores"; // dedupResultsByDefault";
m->m_off = (char *)&si.m_getDocIdScoringInfo - y;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = NULL;
m->m_flags = PF_API;
// get default from collectionrec item
m->m_defOff= (char *)&cr.m_getDocIdScoringInfo - x;
m++;
m->m_title = "do query expansion";
m->m_desc = "If enabled, query expansion will expand your query "
"to include the various forms and "
"synonyms of the query terms.";
m->m_off = (char *)&si.m_queryExpansion - y;
m->m_defOff= (char *)&cr.m_queryExpansion - x;
m->m_type = TYPE_BOOL;
m->m_cgi = "qe";
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
// more general parameters
m->m_title = "max search results";
m->m_desc = "What is the maximum total number "
"of returned search results.";
m->m_cgi = "msr";
m->m_off = (char *)&cr.m_maxSearchResults - x;
m->m_type = TYPE_LONG;
m->m_def = "1000";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max search results per query";
m->m_desc = "What is the limit to the total number "
"of returned search results per query?";
m->m_cgi = "msrpq";
m->m_off = (char *)&cr.m_maxSearchResultsPerQuery - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_flags = 0;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max search results for paying clients";
m->m_desc = "What is the limit to the total number "
"of returned search results for clients.";
m->m_cgi = "msrfpc";
m->m_off = (char *)&cr.m_maxSearchResultsForClients - x;
m->m_type = TYPE_LONG;
m->m_def = "1000";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max search results per query for paying clients";
m->m_desc = "What is the limit to the total number "
"of returned search results per query for paying clients? "
"Auto ban must be enabled for this to work.";
m->m_cgi = "msrpqfc";
m->m_off = (char *)&cr.m_maxSearchResultsPerQueryForClients - x;
m->m_type = TYPE_LONG;
m->m_def = "1000";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "user ip";
m->m_desc = "The ip address of the searcher. We can pass back "
"for use in the autoban technology which bans abusive IPs.";
m->m_obj = OBJ_SI;
m->m_page = PAGE_RESULTS;
m->m_off = (char *)&si.m_userIpStr - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "uip";
m->m_flags = PF_COOKIE | PF_WIDGET_PARM | PF_API;
m++;
m->m_title = "use min ranking algo";
m->m_desc = "Should search results be ranked using this algo?";
//m->m_cgi = "uma";
//m->m_off = (char *)&cr.m_siteClusterByDefault - x;
m->m_off = (char *)&si.m_useMinAlgo - y;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
// seems good, default it on
m->m_def = "1";
m->m_cgi = "uma";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
// limit to this # of the top term pairs from inlink text whose
// score is accumulated
m->m_title = "real max top";
m->m_desc = "Only score up to this many inlink text term pairs";
m->m_off = (char *)&si.m_realMaxTop - y;
m->m_type = TYPE_LONG;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = "10";
m->m_cgi = "rmt";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "use new ranking algo";
m->m_desc = "Should search results be ranked using this new algo?";
m->m_off = (char *)&si.m_useNewAlgo - y;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
// seems good, default it on
m->m_def = "1";
m->m_cgi = "una";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "do max score algo";
m->m_desc = "Quickly eliminated docids using max score algo";
m->m_off = (char *)&si.m_doMaxScoreAlgo - y;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = "1";
m->m_cgi = "dmsa";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "use fast intersection algo";
m->m_desc = "Should we try to speed up search results generation?";
m->m_off = (char *)&si.m_fastIntersection - y;
m->m_type = TYPE_CHAR;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
// turn off until we debug
m->m_def = "-1";
m->m_cgi = "fi";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "max number of facets to return";
m->m_desc = "Max number of facets to return";
m->m_off = (char *)&si.m_maxFacets - y;
m->m_type = TYPE_LONG;
m->m_def = "50";
m->m_group = 1;
m->m_cgi = "nf";
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
// m->m_title = "special query";
// m->m_desc = "List of docids to restrain results to.";
// m->m_cgi = "sq";
// m->m_off = (char *)&si.m_sq - y;
// m->m_type = TYPE_CHARPTR;
// m->m_def = NULL;
// m->m_group = 0;
// m++;
// m->m_title = "negative docids";
// m->m_desc = "List of docids to ignore.";
// m->m_cgi = "nodocids";
// m->m_off = (char *)&si.m_noDocIds - y;
// m->m_type = TYPE_CHARPTR;
// m->m_def = NULL;
// m->m_group = 0;
// m++;
// m->m_title = "negative siteids";
// m->m_desc = "Whitespace-separated list of 32-bit sitehashes "
//"to ignore.";
// m->m_cgi = "nositeids";
// m->m_off = (char *)&si.m_noSiteIds - y;
// m->m_type = TYPE_CHARPTR;
// m->m_def = NULL;
// m->m_group = 0;
// m++;
m->m_title = "language weight";
m->m_desc = "Defalt language weight if document matches quer "
"language. Use this to give results that match the specified "
"the speicified &qlang higher ranking, or docs whose language "
"is unnknown. Can be override with "
"&langw in the query url.";
m->m_cgi = "langweight";
m->m_off = (char *)&cr.m_sameLangWeight - x;
m->m_type = TYPE_FLOAT;
m->m_def = "20.000000";
m->m_group = 1;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use language weights";
m->m_desc = "Use Language weights to sort query results. "
"This will give results that match the specified &qlang "
"higher ranking.";
m->m_cgi = "lsort";
m->m_off = (char *)&cr.m_enableLanguageSorting - x;
//m->m_soff = (char *)&si.m_enableLanguageSorting - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 1;
//m->m_scgi = "lsort";
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "sort language preference";
m->m_desc = "Default language to use for ranking results. "
//"This should only be used on limited collections. "
"Value should be any language abbreviation, for example "
"\"en\" for English. Use <i>xx</i> to give ranking "
"boosts to no language in particular. See the language "
"abbreviations at the bottom of the "
"<a href=/admin/filters>url filters</a> page.";
m->m_cgi = "qlang";
m->m_off = (char *)&si.m_defaultSortLang - y;
m->m_type = TYPE_CHARPTR;
//m->m_size = 6; // up to 5 chars + NULL, e.g. "en_US"
m->m_def = "";//"xx";//_US";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "language weight";
m->m_desc = "Use this to override the default language weight "
"for this collection. The default language weight can be "
"set in the search controls and is usually something like "
"20.0. Which means that we multiply a result's score by 20 "
"if from the same language as the query or the language is "
"unknown.";
m->m_off = (char *)&si.m_sameLangWeight - y;
m->m_defOff= (char *)&cr.m_sameLangWeight - x;
m->m_type = TYPE_FLOAT;
m->m_cgi = "langw";
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
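// Illustrative only -- a hypothetical query preferring English
// results but overriding the collection's default language
// weight with a milder 5x boost via the parms defined above:
//
//   GET /search?q=gigablast&qlang=en&langw=5.0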
m->m_title = "sort country preference";
m->m_desc = "Default country to use for ranking results. "
//"This should only be used on limited collections. "
"Value should be any country code abbreviation, for example "
"\"us\" for United States. This is currently not working.";
m->m_cgi = "qcountry";
m->m_off = (char *)&si.m_defaultSortCountry - y;
m->m_type = TYPE_CHARPTR;
m->m_size = 2+1;
m->m_def = "us";
m->m_group = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
/*
m->m_title = "language method weights";
m->m_desc = "Language method weights for spider language "
"detection. A string of ascii numerals that "
"should default to 895768712";
m->m_cgi = "lmweights";
m->m_off = (char *)&cr.m_languageMethodWeights - x;
m->m_type = TYPE_STRING;
m->m_size = 10; // up to 9 chars + NULL
m->m_def = "894767812";
m->m_group = 0;
// m->m_sparm = 1;
m++;
m->m_title = "language detection sensitivity";
m->m_desc = "Language detection sensitivity. Higher"
" values mean higher hitrate, but lower accuracy."
" Suggested values are from 2 to 20";
m->m_cgi = "lmbailout";
m->m_off = (char *)&cr.m_languageBailout - x;
m->m_type = TYPE_LONG;
m->m_def = "5";
m->m_group = 0;
// m->m_sparm = 1;
m++;
m->m_title = "language detection threshold";
m->m_desc = "Language detection threshold sensitivity."
" Higher values mean better accuracy, but lower hitrate."
" Suggested values are from 2 to 20";
m->m_cgi = "lmthreshold";
m->m_off = (char *)&cr.m_languageThreshold - x;
m->m_type = TYPE_LONG;
m->m_def = "3";
m->m_group = 0;
// m->m_sparm = 1;
m++;
m->m_title = "language detection samplesize";
m->m_desc = "Language detection size. Higher values"
" mean more accuracy, but longer processing time."
" Suggested values are 300-1000";
m->m_cgi = "lmsamples";
m->m_off = (char *)&cr.m_languageSamples - x;
m->m_type = TYPE_LONG;
m->m_def = "600";
m->m_group = 0;
// m->m_sparm = 1;
m++;
m->m_title = "language detection spider samplesize";
m->m_desc = "Language detection page sample size. "
"Higher values mean more accuracy, but longer "
"spider time."
" Suggested values are 3000-10000";
m->m_cgi = "lpsamples";
m->m_off = (char *)&cr.m_langPageLimit - x;
m->m_type = TYPE_LONG;
m->m_def = "6000";
m->m_group = 0;
// m->m_sparm = 1;
m++;
*/
m->m_title = "docs to check for post query";
m->m_desc = "How many search results should we "
"scan for post query demotion? "
"0 disables all post query reranking. ";
m->m_cgi = "pqrds";
m->m_off = (char *)&si.m_docsToScanForReranking - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 1;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "demotion for foreign languages";
m->m_desc = "Demotion factor of non-relevant languages. Score "
"will be penalized by this factor as a percent if "
"it's language is foreign. "
"A safe value is probably anywhere from 0.5 to 1. ";
m->m_cgi = "pqrlang";
m->m_off = (char *)&cr.m_languageWeightFactor - x;
//m->m_soff = (char *)&si.m_languageWeightFactor - y;
m->m_type = TYPE_FLOAT;
m->m_def = "0.999";
m->m_group = 0;
//m->m_scgi = "pqrlang";
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for unknown languages";
m->m_desc = "Demotion factor for unknown languages. "
"Page's score will be penalized by this factor as a percent "
"if it's language is not known. "
"A safe value is 0, as these pages will be reranked by "
"country (see below). "
"0 means no demotion.";
m->m_cgi = "pqrlangunk";
m->m_off = (char *)&cr.m_languageUnknownWeight- x;
//m->m_soff = (char *)&si.m_languageUnknownWeight- y;
m->m_type = TYPE_FLOAT;
m->m_def = "0.0";
m->m_group = 0;
//m->m_scgi = "pqrlangunk";
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages where the country of the page writes "
"in the same language as the country of the query";
m->m_desc = "Demotion for pages where the country of the page writes "
"in the same language as the country of the query. "
"If query language is the same as the language of the page, "
"then if a language written in the country of the page matches "
"a language written by the country of the query, then page's "
"score will be demoted by this factor as a percent. "
"A safe range is between 0.5 and 1. ";
m->m_cgi = "pqrcntry";
m->m_off = (char *)&cr.m_pqr_demFactCountry - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0.98";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for query terms or gigabits in url";
m->m_desc = "Demotion factor for query terms or gigabits "
"in a result's url. "
"Score will be penalized by this factor times the number "
"of query terms or gigabits in the url divided by "
"the max value below such that fewer "
"query terms or gigabits in the url causes the result "
"to be demoted more heavily, depending on the factor. "
"Higher factors demote more per query term or gigabit "
"in the page's url. "
"Generally, a page may not be demoted more than this "
"factor as a percent. Also, how it is demoted is "
"dependant on the max value. For example, "
"a factor of 0.2 will demote the page 20% if it has no "
"query terms or gigabits in its url. And if the max value is "
"10, then a page with 5 query terms or gigabits in its "
"url will be demoted 10%; and 10 or more query terms or "
"gigabits in the url will not be demoted at all. "
"0 means no demotion. "
"A safe range is from 0 to 0.35. ";
m->m_cgi = "pqrqttiu";
m->m_off = (char *)&cr.m_pqr_demFactQTTopicsInUrl - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages with query terms or gigabits "
"in url";
m->m_desc = "Max number of query terms or gigabits in a url. "
"Pages with a number of query terms or gigabits in their "
"urls greater than or equal to this value will not be "
"demoted. "
"This controls the range of values expected to represent "
"the number of query terms or gigabits in a url. It should "
"be set to or near the estimated max number of query terms "
"or topics that can be in a url. Setting to a lower value "
"increases the penalty per query term or gigabit that is "
"not in a url, but decreases the range of values that "
"will be demoted.";
m->m_cgi = "pqrqttium";
m->m_off = (char *)&cr.m_pqr_maxValQTTopicsInUrl - x;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
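// A minimal sketch (not the actual reranking code) of the
// demotion implied by pqrqttiu/pqrqttium above; all local
// names are hypothetical:
//
//   long n = numQTTopicsInUrl;
//   if ( n > maxValQTTopicsInUrl ) n = maxValQTTopicsInUrl;
//   float demote = demFactQTTopicsInUrl *
//                  (float)(maxValQTTopicsInUrl - n) /
//                  (float)maxValQTTopicsInUrl;
//   score *= ( 1.0 - demote ); // factor .2, 0 terms -> 20% off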
m->m_title = "demotion for pages that are not high quality";
m->m_desc = "Demotion factor for pages that are not high quality. "
"Score is penalized by this number as a percent times level "
"of quality. A pqge will be demoted by the formula "
"(max quality - page's quality) * this factor / the max "
"value given below. Generally, a page will not be "
"demoted more than this factor as a percent. "
"0 means no demotion. "
"A safe range is between 0 to 1. ";
m->m_cgi = "pqrqual";
m->m_off = (char *)&cr.m_pqr_demFactQual - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages that are not high quality";
m->m_desc = "Max page quality. Pages with a quality level "
"equal to or higher than this value "
"will not be demoted. ";
m->m_cgi = "pqrqualm";
m->m_off = (char *)&cr.m_pqr_maxValQual - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
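// A minimal sketch of the quality demotion formula quoted in
// pqrqual above, with hypothetical local names:
//
//   float demote = (float)(maxValQual - pageQuality) *
//                  demFactQual / (float)maxValQual;
//   score *= ( 1.0 - demote ); // quality >= max -> no demotion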
m->m_title = "demotion for pages that are not "
"root or have many paths in the url";
m->m_desc = "Demotion factor each path in the url. "
"Score will be demoted by this factor as a percent "
"multiplied by the number of paths in the url divided "
"by the max value below. "
"Generally, the page will not be demoted more than this "
"value as a percent. "
"0 means no demotion. "
"A safe range is from 0 to 0.75. ";
m->m_cgi = "pqrpaths";
m->m_off = (char *)&cr.m_pqr_demFactPaths - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages that have many paths in the url";
m->m_desc = "Max number of paths in a url. "
"This should be set to a value representing a very high "
"number of paths for a url. Lower values increase the "
"difference between how much each additional path demotes. ";
m->m_cgi = "pqrpathsm";
m->m_off = (char *)&cr.m_pqr_maxValPaths - x;
m->m_type = TYPE_LONG;
m->m_def = "16";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages that do not have a catid";
m->m_desc = "Demotion factor for pages that do not have a catid. "
"Score will be penalized by this factor as a percent. "
"A safe range is from 0 to 0.2. ";
m->m_cgi = "pqrcatid";
m->m_off = (char *)&cr.m_pqr_demFactNoCatId - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages where smallest "
"catid has a lot of super topics";
m->m_desc = "Demotion factor for pages where smallest "
"catid has a lot of super topics. "
"Page will be penalized by the number of super topics "
"multiplied by this factor divided by the max value given "
"below. "
"Generally, the page will not be demoted more than this "
"factor as a percent. "
"Note: pages with no catid are demoted by this factor as "
"a percent so as not to penalize pages with a catid. "
"0 means no demotion. "
"A safe range is between 0 and 0.25. ";
m->m_cgi = "pqrsuper";
m->m_off = (char *)&cr.m_pqr_demFactCatidHasSupers - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages where smallest catid has a lot "
"of super topics";
m->m_desc = "Max number of super topics. "
"Pages whose smallest catid that has more super "
"topics than this will be demoted by the maximum amount "
"given by the factor above as a percent. "
"This should be set to a value representing a very high "
"number of super topics for a category id. "
"Lower values increase the difference between how much each "
"additional path demotes. ";
m->m_cgi = "pqrsuperm";
m->m_off = (char *)&cr.m_pqr_maxValCatidHasSupers - x;
m->m_type = TYPE_LONG;
m->m_def = "11";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for larger pages";
m->m_desc = "Demotion factor for larger pages. "
"Page will be penalized by its size times this factor "
"divided by the max page size below. "
"Generally, a page will not be demoted more than this "
"factor as a percent. "
"0 means no demotion. "
"A safe range is between 0 and 0.25. ";
m->m_cgi = "pqrpgsz";
m->m_off = (char *)&cr.m_pqr_demFactPageSize - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for larger pages";
m->m_desc = "Max page size. "
"Pages with a size greater than or equal to this will be "
"demoted by the max amount (the factor above as a percent). ";
m->m_cgi = "pqrpgszm";
m->m_off = (char *)&cr.m_pqr_maxValPageSize - x;
m->m_type = TYPE_LONG;
m->m_def = "524288";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for non-location specific queries "
"with a location specific title";
m->m_desc = "Demotion factor for non-location specific queries "
"with a location specific title. "
"Pages which contain a location in their title which is "
"not in the query or the gigabits will be demoted by their "
"population multiplied by this factor divided by the max "
"place population specified below. "
"Generally, a page will not be demoted more than this "
"value as a percent. "
"0 means no demotion. ";
m->m_cgi = "pqrloct";
m->m_off = (char *)&cr.m_pqr_demFactLocTitle - x;
//m->m_scgi = "pqrloct";
//m->m_soff = (char *)&si.m_pqr_demFactLocTitle - y;
m->m_type = TYPE_FLOAT;
m->m_def = "0.99";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for non-location specific queries "
"with a location specific summary";
m->m_desc = "Demotion factor for non-location specific queries "
"with a location specific summary. "
"Pages which contain a location in their summary which is "
"not in the query or the gigabits will be demoted by their "
"population multiplied by this factor divided by the max "
"place population specified below. "
"Generally, a page will not be demoted more than this "
"value as a percent. "
"0 means no demotion. ";
m->m_cgi = "pqrlocs";
m->m_off = (char *)&cr.m_pqr_demFactLocSummary - x;
//m->m_scgi = "pqrlocs";
//m->m_soff = (char *)&si.m_pqr_demFactLocSummary - y;
m->m_type = TYPE_FLOAT;
m->m_def = "0.95";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for non-location specific queries "
"with a location specific dmoz category";
m->m_desc = "Demotion factor for non-location specific queries "
"with a location specific dmoz regional category. "
"Pages which contain a location in their dmoz which is "
"not in the query or the gigabits will be demoted by their "
"population multiplied by this factor divided by the max "
"place population specified below. "
"Generally, a page will not be demoted more than this "
"value as a percent. "
"0 means no demotion. ";
m->m_cgi = "pqrlocd";
m->m_off = (char *)&cr.m_pqr_demFactLocDmoz - x;
//m->m_scgi = "pqrlocd";
//m->m_soff = (char *)&si.m_pqr_demFactLocDmoz - y;
m->m_type = TYPE_FLOAT;
m->m_def = "0.95";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demote locations that appear in gigabits";
m->m_desc = "Demote locations that appear in gigabits.";
m->m_cgi = "pqrlocg";
m->m_off = (char *)&cr.m_pqr_demInTopics - x;
//m->m_scgi = "pqrlocg";
//m->m_soff = (char *)&si.m_pqr_demInTopics - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for non-location specific queries "
"with location specific results";
m->m_desc = "Max place population. "
"Places with a population greater than or equal to this "
"will be demoted to the maximum amount given by the "
"factor above as a percent. ";
m->m_cgi = "pqrlocm";
m->m_off = (char *)&cr.m_pqr_maxValLoc - x;
m->m_type = TYPE_LONG;
// charlottesville was getting missed when this was 1M
m->m_def = "100000";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for non-html";
m->m_desc = "Demotion factor for content type that is non-html. "
"Pages which do not have an html content type will be "
"demoted by this factor as a percent. "
"0 means no demotion. "
"A safe range is between 0 and 0.35. ";
m->m_cgi = "pqrhtml";
m->m_off = (char *)&cr.m_pqr_demFactNonHtml - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for xml";
m->m_desc = "Demotion factor for content type that is xml. "
"Pages which have an xml content type will be "
"demoted by this factor as a percent. "
"0 means no demotion. "
"Any value between 0 and 1 is safe if demotion for non-html "
"is set to 0. Otherwise, 0 should probably be used. ";
m->m_cgi = "pqrxml";
m->m_off = (char *)&cr.m_pqr_demFactXml - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0.95";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages with other pages from same "
"hostname";
m->m_desc = "Demotion factor for pages with fewer other pages from "
"same hostname. "
"Pages with results from the same host will be "
"demoted by this factor times each fewer host than the max "
"value given below, divided by the max value. "
"Generally, a page will not be demoted more than this "
"factor as a percent. "
"0 means no demotion. "
"A safe range is between 0 and 0.35. ";
m->m_cgi = "pqrfsd";
m->m_off = (char *)&cr.m_pqr_demFactOthFromHost - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages with other pages from same "
"domain";
m->m_desc = "Max number of pages from same domain. "
"Pages which have this many or more pages from the same "
"domain will not be demoted. ";
m->m_cgi = "pqrfsdm";
m->m_off = (char *)&cr.m_pqr_maxValOthFromHost - x;
m->m_type = TYPE_LONG;
m->m_def = "12";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "initial demotion for pages with common "
"topics in dmoz as other results";
m->m_desc = "Initial demotion factor for pages with common "
"topics in dmoz as other results. "
"Pages will be penalized by the number of common topics "
"in dmoz times this factor divided by the max value "
"given below. "
"Generally, a page will not be demoted by more than this "
"factor as a percent. "
"Note: this factor is decayed by the factor specified in "
"the parm below, decay for pages with common topics in "
"dmoz as other results, as the number of pages with "
"common topics in dmoz increases. "
"0 means no demotion. "
"A safe range is between 0 and 0.35. ";
m->m_cgi = "pqrctid";
m->m_off = (char *)&cr.m_pqr_demFactComTopicInDmoz - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "decay for pages with common topics in dmoz "
"as other results";
m->m_desc = "Decay factor for pages with common topics in "
"dmoz as other results. "
"The initial demotion factor will be decayed by this factor "
"as a percent as the number of common topics increase. "
"0 means no decay. "
"A safe range is between 0 and 0.25. ";
m->m_cgi = "pqrctidd";
m->m_off = (char *)&cr.m_pqr_decFactComTopicInDmoz - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages with common topics in dmoz "
"as other results";
m->m_desc = "Max number of common topics in dmoz as other results. "
"Pages with a number of common topics equal to or greater "
"than this value will be demoted to the maximum as given "
"by the initial factor above as a percent. ";
m->m_cgi = "pqrctidm";
m->m_off = (char *)&cr.m_pqr_maxValComTopicInDmoz - x;
m->m_type = TYPE_LONG;
m->m_def = "32";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages where dmoz category names "
"contain query terms or their synonyms";
m->m_desc = "Demotion factor for pages where dmoz category names "
"contain fewer query terms or their synonyms. "
"Pages will be penalized for each query term or synonym of "
"a query term less than the max value given below multiplied "
"by this factor, divided by the max value. "
"Generally, a page will not be demoted more than this value "
"as a percent. "
"0 means no demotion. "
"A safe range is between 0 and 0.3. ";
m->m_cgi = "pqrdcndcqt";
m->m_off = (char *)&cr.m_pqr_demFactDmozCatNmNoQT - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages where dmoz category names "
"contain query terms or their synonyms";
m->m_desc = "Max number of query terms and their synonyms "
"in a page's dmoz category name. "
"Pages with a number of query terms or their synonyms in all "
"dmoz category names greater than or equal to this value "
"will not be demoted. ";
m->m_cgi = "pqrcndcqtm";
m->m_off = (char *)&cr.m_pqr_maxValDmozCatNmNoQT - x;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages where dmoz category names "
"contain gigabits";
m->m_desc = "Demotion factor for pages where dmoz category "
"names contain fewer gigabits. "
"Pages will be penalized by the number of gigabits in all "
"dmoz category names fewer than the max value given below "
"divided by the max value. "
"Generally, a page will not be demoted more than than this "
"factor as a percent. "
"0 means no demotion. "
"A safe range is between 0 and 0.3. ";
m->m_cgi = "pqrdcndcgb";
m->m_off = (char *)&cr.m_pqr_demFactDmozCatNmNoGigabits - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages where dmoz category names "
"contain gigabits";
m->m_desc = "Max number of pages where dmoz category names "
"contain a gigabit. "
"Pages with a number of gigabits in all dmoz category names "
"greater than or equal to this value will not be demoted. ";
m->m_cgi = "pqrdcndcgbm";
m->m_off = (char *)&cr.m_pqr_maxValDmozCatNmNoGigabits - x;
m->m_type = TYPE_LONG;
m->m_def = "16";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages based on datedb date";
m->m_desc = "Demotion factor for pages based on datedb date. "
"Pages will be penalized for being published earlier than the "
"max date given below. "
"The older the page, the more it will be penalized based on "
"the time difference between the page's date and the max date, "
"divided by the max date. "
"Generally, a page will not be demoted more than this "
"value as a percent. "
"0 means no demotion. "
"A safe range is between 0 and 0.4. ";
m->m_cgi = "pqrdate";
m->m_off = (char *)&cr.m_pqr_demFactDatedbDate - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min value for demotion based on datedb date ";
m->m_desc = "Pages with a publish date equal to or earlier than "
"this date will be demoted to the max (the factor above as "
"a percent). "
"Use this parm in conjunction with the max value below "
"to specify the range of dates where demotion occurs. "
"If you set this parm near the estimated earliest publish "
"date that occurs somewhat frequently, this method can better "
"control the additional demotion per publish day. "
"This number is given as seconds since the epoch, January 1st, "
"1970 divided by 1000. "
"0 means use the epoch. ";
m->m_cgi = "pqrdatei";
m->m_off = (char *)&cr.m_pqr_minValDatedbDate - x;
m->m_type = TYPE_LONG;
m->m_def = "631177"; // Jan 01, 1990
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for demotion based on datedb date ";
m->m_desc = "Pages with a publish date greater than or equal to "
"this value divided by 1000 will not be demoted. "
"Use this parm in conjunction with the min value above "
"to specify the range of dates where demotion occurs. "
"This number is given as seconds before the current date "
"and time taken from the system clock divided by 1000. "
"0 means use the current time of the current day. ";
m->m_cgi = "pqrdatem";
m->m_off = (char *)&cr.m_pqr_maxValDatedbDate - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
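// Worked example (illustrative only): both date bounds are
// stored as seconds/1000, so the default pqrdatei of 631177
// represents roughly 631177 * 1000 = 631,177,000 seconds after
// the epoch, i.e. about Jan 01, 1990; pqrdatem=0 pins the upper
// bound to the current system time.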
m->m_title = "demotion for pages based on proximity";
m->m_desc = "Demotion factor for proximity of query terms in "
"a document. The closer together terms occur in a "
"document, the higher it will score."
"0 means no demotion. ";
m->m_cgi = "pqrprox";
m->m_off = (char *)&cr.m_pqr_demFactProximity - x;
//m->m_scgi = "pqrprox";
//m->m_soff = (char *)&si.m_pqr_demFactProximity - y;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages based on query terms section";
m->m_desc = "Demotion factor for where the query terms occur "
"in the document. If the terms only occur in a menu, "
"a link, or a list, the document will be punished."
"0 means no demotion. ";
m->m_cgi = "pqrinsec";
//m->m_scgi = "pqrinsec";
m->m_off = (char *)&cr.m_pqr_demFactInSection - x;
//m->m_soff = (char *)&si.m_pqr_demFactInSection - y;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "weight of indexed score on pqr";
m->m_desc = "The proportion that the original score affects "
"its rerank position. A factor of 1 will maintain "
"the original score, 0 will only use the indexed "
"score to break ties.";
m->m_cgi = "pqrorig";
//m->m_scgi = "pqrorig";
m->m_off = (char *)&cr.m_pqr_demFactOrigScore - x;
//m->m_soff = (char *)&si.m_pqr_demFactOrigScore - y;
m->m_type = TYPE_FLOAT;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for demotion for pages based on proximity";
m->m_desc = "Max summary score where no more demotion occurs above. "
"Pages with a summary score greater than or equal to this "
"value will not be demoted. ";
m->m_cgi = "pqrproxm";
m->m_off = (char *)&cr.m_pqr_maxValProximity - x;
m->m_type = TYPE_LONG;
m->m_def = "100000";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for query being exclusivly in a subphrase";
m->m_desc = "Search result which contains the query terms only"
" as a subphrase of a larger phrase will have its score "
" reduced by this percent.";
m->m_cgi = "pqrspd";
m->m_off = (char *)&cr.m_pqr_demFactSubPhrase - x;
//m->m_soff = (char *)&si.m_pqr_demFactSubPhrase - y;
//m->m_scgi = "pqrspd";
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion based on common inlinks";
m->m_desc = "Based on the number of inlinks a search results has "
"which are in common with another search result.";
m->m_cgi = "pqrcid";
m->m_off = (char *)&cr.m_pqr_demFactCommonInlinks - x;
//m->m_soff = (char *)&si.m_pqr_demFactCommonInlinks - y;
//m->m_scgi = "pqrcid";
m->m_type = TYPE_FLOAT;
m->m_def = ".5";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of document calls multiplier";
m->m_desc = "Allows more results to be gathered in the case of "
"an index having a high rate of duplicate results. Generally"
" expressed as 1.2";
m->m_cgi = "ndm";
m->m_off = (char *)&cr.m_numDocsMultiplier - x;
m->m_type = TYPE_FLOAT;
m->m_def = "1.2";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "max documents to compute per host";
m->m_desc = "Limit number of documents to search that do not provide"
" the required results.";
m->m_cgi = "mdi";
m->m_off = (char *)&cr.m_maxDocIdsToCompute - x;
m->m_type = TYPE_LONG;
m->m_def = "1000";
m->m_group = 0;
m++;
*/
m->m_title = "max real time inlinks";
m->m_desc = "Limit number of linksdb inlinks requested per result.";
m->m_cgi = "mrti";
m->m_off = (char *)&cr.m_maxRealTimeInlinks - x;
//m->m_soff = (char *)&si.m_maxRealTimeInlinks - y;
m->m_type = TYPE_LONG;
m->m_def = "10000";
m->m_group = 0;
//m->m_scgi = "mrti";
m->m_smin = 0;
m->m_smax = 100000;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "percent topic similar default";
m->m_desc = "Like above, but used for deciding when to cluster "
"results by topic for the news collection.";
m->m_cgi = "ptcd";
m->m_off = (char *)&cr.m_topicSimilarCutoffDefault - x;
m->m_type = TYPE_LONG;
m->m_def = "50";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max query terms";
m->m_desc = "Do not allow more than this many query terms. Helps "
"prevent big queries from resource hogging.";
m->m_cgi = "mqt";
m->m_off = (char *)&cr.m_maxQueryTerms - x;
//m->m_soff = (char *)&si.m_maxQueryTerms - y;
m->m_type = TYPE_LONG;
m->m_def = "999999"; // now we got synonyms... etc
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "dictionary site";
m->m_desc = "Where do we send requests for definitions of search "
"terms. Set to the empty string to turn this feature off.";
m->m_cgi = "dictionarySite";
m->m_off = (char *)&cr.m_dictionarySite - x;
m->m_type = TYPE_STRING;
m->m_size = SUMMARYHIGHLIGHTTAGMAXSIZE;
m->m_def = "http://www.answers.com/";
m++;
*/
/*
m->m_title = "allow links: searches";
m->m_desc = "Allows anyone access to perform links: searches on this "
"collection.";
m->m_cgi = "als";
m->m_off = (char *)&cr.m_allowLinksSearch - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
// REFERENCE PAGES CONTROLS
m->m_title = "number of reference pages to generate";
m->m_desc = "What is the number of "
"reference pages to generate per query? Set to 0 to save "
"CPU time.";
m->m_cgi = "nrp";
m->m_off = (char *)&cr.m_refs_numToGenerate - x;
//m->m_soff = (char *)&si.m_refs_numToGenerate - y;
m->m_smaxc = (char *)&cr.m_refs_numToGenerateCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_priv = 0;
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of reference pages to generate";
m->m_desc = "What is the number of "
"reference pages to generate per query? Set to 0 to save "
"CPU time.";
m->m_cgi = "snrp";
m->m_off = (char *)&si.m_refs_numToGenerate - y;
m->m_type = TYPE_LONG;
m->m_defOff =(char *)&cr.m_refs_numToGenerate - x;
m->m_priv = 0;
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "number of reference pages to display";
m->m_desc = "What is the number of "
"reference pages to display per query?";
m->m_cgi = "nrpdd";
m->m_off = (char *)&cr.m_refs_numToDisplay - x;
//m->m_soff = (char *)&si.m_refs_numToDisplay - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_priv = 0; // allow the (more) link
m->m_sprpg = 0; // do not propagate
m->m_sprpp = 0; // do not propagate
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "docs to scan for reference pages";
m->m_desc = "How many search results should we "
"scan for reference pages per query?";
m->m_cgi = "dsrp";
m->m_off = (char *)&cr.m_refs_docsToScan - x;
//m->m_soff = (char *)&si.m_refs_docsToScan - y;
m->m_smaxc = (char *)&cr.m_refs_docsToScanCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "30";
m->m_group = 0;
m->m_priv = 0;
m->m_smin = 0;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "min references quality";
m->m_desc = "References with page quality below this "
"will be excluded. (set to 101 to disable references while "
"still generating related pages.";
m->m_cgi = "mrpq";
m->m_off = (char *)&cr.m_refs_minQuality - x;
//m->m_soff = (char *)&si.m_refs_minQuality - y;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min links per references";
m->m_desc = "References need this many links to results to "
"be included.";
m->m_cgi = "mlpr";
m->m_off = (char *)&cr.m_refs_minLinksPerReference - x;
//m->m_soff = (char *)&si.m_refs_minLinksPerReference - y;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max linkers to consider for references per page";
m->m_desc = "Stop processing referencing pages after hitting this "
"limit.";
m->m_cgi = "mrpl";
m->m_off = (char *)&cr.m_refs_maxLinkers - x;
//m->m_soff = (char *)&si.m_refs_maxLinkers - y;
m->m_smaxc = (char *)&cr.m_refs_maxLinkersCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "500";
m->m_group = 0;
m->m_priv = 2;
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "page fetch multiplier for references";
m->m_desc = "Use this multiplier to fetch more than the required "
"number of reference pages. fetches N * (this parm) "
"references and displays the top scoring N.";
m->m_cgi = "ptrfr";
m->m_off = (char *)&cr.m_refs_additionalTRFetch - x;
//m->m_soff = (char *)&si.m_refs_additionalTRFetch - y;
m->m_smaxc = (char *)&cr.m_refs_additionalTRFetchCeiling - x;
m->m_type = TYPE_FLOAT;
m->m_def = "1.5";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of links coefficient";
m->m_desc = "A in A * numLinks + B * quality + C * "
"numLinks/totalLinks.";
m->m_cgi = "nlc";
m->m_off = (char *)&cr.m_refs_numLinksCoefficient - x;
//m->m_soff = (char *)&si.m_refs_numLinksCoefficient - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "quality coefficient";
m->m_desc = "B in A * numLinks + B * quality + C * "
"numLinks/totalLinks.";
m->m_cgi = "qc";
m->m_off = (char *)&cr.m_refs_qualityCoefficient - x;
//m->m_soff = (char *)&si.m_refs_qualityCoefficient - y;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "link density coefficient";
m->m_desc = "C in A * numLinks + B * quality + C * "
"numLinks/totalLinks.";
m->m_cgi = "ldc";
m->m_off = (char *)&cr.m_refs_linkDensityCoefficient - x;
//m->m_soff = (char *)&si.m_refs_linkDensityCoefficient - y;
m->m_type = TYPE_LONG;
m->m_def = "1000";
m->m_group = 0;
m->m_priv = 2;
//m->m_sparm = 1;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "add or multipy quality times link density";
m->m_desc = "[+|*] in A * numLinks + B * quality [+|*]"
" C * numLinks/totalLinks.";
m->m_cgi = "mrs";
m->m_off = (char *)&cr.m_refs_multiplyRefScore - x;
//m->m_soff = (char *)&si.m_refs_multiplyRefScore - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
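// A minimal sketch (not the actual code) of the reference score
// built from the A/B/C coefficients above; local names are
// hypothetical, and mrs picks the [+|*] combination:
//
//   float base    = A * numLinks + B * quality;
//   float density = C * (float)numLinks / (float)totalLinks;
//   float refScore = multiplyRefScore ? base * density
//                                     : base + density;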
// reference pages ceiling parameters
m->m_title = "maximum allowed value for "
"numReferences parameter";
m->m_desc = "maximum allowed value for "
"numReferences parameter";
m->m_cgi = "nrpc";
m->m_off = (char *)&cr.m_refs_numToGenerateCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "maximum allowed value for "
"docsToScanForReferences parameter";
m->m_desc = "maximum allowed value for "
"docsToScanForReferences parameter";
m->m_cgi = "dsrpc";
m->m_off = (char *)&cr.m_refs_docsToScanCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "maximum allowed value for "
"maxLinkers parameter";
m->m_desc = "maximum allowed value for "
"maxLinkers parameter";
m->m_cgi = "mrplc";
m->m_off = (char *)&cr.m_refs_maxLinkersCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "5000";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "maximum allowed value for "
"additionalTRFetch";
m->m_desc = "maximum allowed value for "
"additionalTRFetch parameter";
m->m_cgi = "ptrfrc";
m->m_off = (char *)&cr.m_refs_additionalTRFetchCeiling - x;
m->m_type = TYPE_FLOAT;
m->m_def = "10";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
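// Note on the ceiling parms above: each m_smaxc offset points at the
// matching *Ceiling field, so a submitted value is (presumably)
// clamped to the ceiling parm's current value, while m_smin = 0
// floors it at zero. e.g. mrpl is capped by mrplc (5000 by default).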
// related pages parameters
m->m_title = "number of related pages to generate";
m->m_desc = "number of related pages to generate.";
m->m_cgi = "nrpg";
m->m_off = (char *)&cr.m_rp_numToGenerate - x;
//m->m_soff = (char *)&si.m_rp_numToGenerate - y;
m->m_smaxc = (char *)&cr.m_rp_numToGenerateCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_priv = 0;
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of related pages to display";
m->m_desc = "number of related pages to display.";
m->m_cgi = "nrpd";
m->m_off = (char *)&cr.m_rp_numToDisplay - x;
//m->m_soff = (char *)&si.m_rp_numToDisplay - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_priv = 0; // allow the (more) link
m->m_sprpg = 0; // do not propagate
m->m_sprpp = 0; // do not propagate
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of links to scan for related pages";
m->m_desc = "number of links per reference page to scan for related "
"pages.";
m->m_cgi = "nlpd";
m->m_off = (char *)&cr.m_rp_numLinksPerDoc - x;
//m->m_soff = (char *)&si.m_rp_numLinksPerDoc - y;
m->m_smaxc = (char *)&cr.m_rp_numLinksPerDocCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "1024";
m->m_group = 0;
m->m_priv = 2;
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min related page quality";
m->m_desc = "related pages with a quality lower than this will be "
"ignored.";
m->m_cgi = "merpq";
m->m_off = (char *)&cr.m_rp_minQuality - x;
//m->m_soff = (char *)&si.m_rp_minQuality - y;
m->m_type = TYPE_LONG;
m->m_def = "30";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min related page score";
m->m_desc = "related pages with an adjusted score lower than this "
"will be ignored.";
m->m_cgi = "merps";
m->m_off = (char *)&cr.m_rp_minScore - x;
//m->m_soff = (char *)&si.m_rp_minScore - y;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min related page links";
m->m_desc = "related pages with less than this number of links"
" will be ignored.";
m->m_cgi = "merpl";
m->m_off = (char *)&cr.m_rp_minLinks - x;
//m->m_soff = (char *)&si.m_rp_minLinks - y;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "coefficient for number of links in related pages score "
"calculation";
m->m_desc = "A in A * numLinks + B * avgLnkrQlty + C * PgQlty"
" + D * numSRPLinks.";
m->m_cgi = "nrplc";
m->m_off = (char *)&cr.m_rp_numLinksCoeff - x;
//m->m_soff = (char *)&si.m_rp_numLinksCoeff - y;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "coefficient for average linker quality in related pages "
"score calculation";
m->m_desc = "B in A * numLinks + B * avgLnkrQlty + C * PgQlty"
" + D * numSRPLinks.";
m->m_cgi = "arplqc";
m->m_off = (char *)&cr.m_rp_avgLnkrQualCoeff - x;
//m->m_soff = (char *)&si.m_rp_avgLnkrQualCoeff - y;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "coefficient for page quality in related pages "
"score calculation";
m->m_desc = "C in A * numLinks + B * avgLnkrQlty + C * PgQlty"
" + D * numSRPLinks";
m->m_cgi = "qrpc";
m->m_off = (char *)&cr.m_rp_qualCoeff - x;
//m->m_soff = (char *)&si.m_rp_qualCoeff - y;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "coefficient for search result links in related pages "
"score calculation";
m->m_desc = "D in A * numLinks + B * avgLnkrQlty + C * PgQlty"
" + D * numSRPLinks.";
m->m_cgi = "srprpc";
m->m_off = (char *)&cr.m_rp_srpLinkCoeff - x;
//m->m_soff = (char *)&si.m_rp_srpLinkCoeff - y;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
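// A rough sketch, assumed from the parm descriptions above: with
// A = nrplc, B = arplqc, C = qrpc and D = srprpc, each related
// page candidate scores roughly as
//   score = A*numLinks + B*avgLnkrQlty + C*PgQlty + D*numSRPLinks
// subject to the merpq/merps/merpl minimums above.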
m->m_title = "number of related page summary excerpts";
m->m_desc = "What is the maximum number of "
"excerpts displayed in the summary of a related page?";
m->m_cgi = "nrps";
m->m_off = (char *)&cr.m_rp_numSummaryLines - x;
//m->m_soff = (char *)&si.m_rp_numSummaryLines - y;
m->m_smaxc = (char *)&cr.m_rp_numSummaryLinesCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_priv = 2;
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "highlight query terms in related pages summary";
m->m_desc = "Highlight query terms in related pages summary.";
m->m_cgi = "hqtirps";
m->m_off = (char *)&cr.m_rp_doRelatedPageSumHighlight - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of characters to display in title before "
"truncating";
m->m_desc = "Truncates a related page title after this many "
"charaters and adds ...";
m->m_cgi = "ttl";
m->m_off = (char *)&cr.m_rp_titleTruncateLimit - x;
//m->m_soff = (char *)&si.m_rp_titleTruncateLimit - y;
m->m_type = TYPE_LONG;
m->m_def = "50";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use results pages as references";
m->m_desc = "Use the search results' links in order to generate "
"related pages.";
m->m_cgi = "urar";
m->m_off = (char *)&cr.m_rp_useResultsAsReferences - x;
//m->m_soff = (char *)&si.m_rp_useResultsAsReferences - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "get related pages from other cluster";
m->m_desc = "Say yes here to make Gigablast check another Gigablast "
"cluster for title rec for related pages. Gigablast will "
"use the hosts2.conf file in the working directory to "
"tell it what hosts belong to the other cluster.";
m->m_cgi = "erp"; // external related pages
m->m_off = (char *)&cr.m_rp_getExternalPages - x;
//m->m_soff = (char *)&si.m_rp_getExternalPages - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "collection for other related pages cluster";
m->m_desc = "Gigablast will fetch the related pages title record "
"from this collection in the other cluster.";
m->m_cgi = "erpc"; // external related pages collection
m->m_off = (char *)&cr.m_rp_externalColl - x;
//m->m_soff = (char *)&si.m_rp_externalColl - y;
m->m_type = TYPE_STRING;
m->m_size = MAX_COLL_LEN;
m->m_def = "main";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
// relate pages ceiling parameters
m->m_title = "maximum allowed value for numToGenerate parameter";
m->m_desc = "maximum allowed value for numToGenerate parameter";
m->m_cgi = "nrpgc";
m->m_off = (char *)&cr.m_rp_numToGenerateCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "maximum allowed value for numRPLinksPerDoc parameter";
m->m_desc = "maximum allowed value for numRPLinksPerDoc parameter";
m->m_cgi = "nlpdc";
m->m_off = (char *)&cr.m_rp_numLinksPerDocCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "5000";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "maximum allowed value for numSummaryLines parameter";
m->m_desc = "maximum allowed value for numSummaryLines parameter";
m->m_cgi = "nrpsc";
m->m_off = (char *)&cr.m_rp_numSummaryLinesCeiling - x;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
// import search results controls
m->m_title = "how many imported results should we insert";
m->m_desc = "Gigablast will import X search results from the "
"external cluster given by hosts2.conf and merge those "
"search results into the current set of search results. "
"Set to 0 to disable.";
m->m_cgi = "imp";
m->m_off = (char *)&cr.m_numResultsToImport - x;
//m->m_soff = (char *)&si.m_numResultsToImport - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "imported score weight";
m->m_desc = "The score of all imported results will be multiplied "
"by this number. Since results are mostly imported from "
"a large collection they will usually have higher scores "
"because of having more link texts or whatever, so tone it "
"down a bit to put it on par with the integrating collection.";
m->m_cgi = "impw";
m->m_off = (char *)&cr.m_importWeight - x;
//m->m_soff = (char *)&si.m_importWeight - y;
m->m_type = TYPE_FLOAT;
m->m_def = ".80";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "how many linkers must each imported result have";
m->m_desc = "The urls of imported search results must be linked to "
"by at least this many documents in the primary collection.";
m->m_cgi = "impl";
m->m_off = (char *)&cr.m_minLinkersPerImportedResult - x;
//m->m_soff = (char *)&si.m_minLinkersPerImportedResult - y;
m->m_type = TYPE_LONG;
m->m_def = "3";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "num linkers weight";
m->m_desc = "The number of linkers an imported result has from "
"the base collection is multiplied by this weight and then "
"added to the final score. The higher this is the more an "
"imported result with a lot of linkers will be boosted. "
"Currently, 100 is the max number of linkers permitted.";
m->m_cgi = "impnlw";
m->m_off = (char *)&cr.m_numLinkerWeight - x;
//m->m_soff = (char *)&si.m_numLinkerWeight - y;
m->m_type = TYPE_LONG;
m->m_def = "50";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "the name of the collection to import from";
m->m_desc = "Gigablast will import X search results from this "
"external collection and merge them into the current search "
"results.";
m->m_cgi = "impc";
m->m_off = (char *)&cr.m_importColl - x;
//m->m_soff = (char *)&si.m_importColl - y;
m->m_type = TYPE_STRING;
m->m_size = MAX_COLL_LEN;
m->m_def = "main";
m->m_group = 0;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
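// Putting the import parms together, a sketch assumed from the
// descriptions above (not the actual merge code): an imported
// result is dropped unless it has at least "impl" linkers in the
// primary collection, and otherwise scores roughly as
//   final = score * impw + min(numLinkers,100) * impnlw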
m->m_title = "max similar results for cluster by topic";
m->m_desc = "Max similar results to show when clustering by topic.";
m->m_cgi = "ncbt";
m->m_off = (char *)&cr.m_maxClusterByTopicResults - x;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_group = 0;
//m->m_scgi = "ncbt";
//m->m_soff = (char *)&si.m_maxClusterByTopicResults - y;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of extra results to get for cluster by topic";
m->m_desc = "number of extra results to get for cluster by topic";
m->m_cgi = "ntwo";
m->m_off = (char *)&cr.m_numExtraClusterByTopicResults - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
//m->m_scgi = "ntwo";
//m->m_soff = (char *)&si.m_numExtraClusterByTopicResults - y;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "Minimum number of in linkers required to consider getting"
" the title from in linkers";
m->m_desc = "Minimum number of in linkers required to consider getting"
" the title from in linkers";
m->m_cgi = "mininlinkers";
m->m_off = (char *)&cr.m_minTitleInLinkers - x;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "Max number of in linkers to consider";
m->m_desc = "Max number of in linkers to consider for getting in "
"linkers titles.";
m->m_cgi = "maxinlinkers";
m->m_off = (char *)&cr.m_maxTitleInLinkers - x;
m->m_type = TYPE_LONG;
m->m_def = "128";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max title len";
m->m_desc = "What is the maximum number of "
"characters allowed in titles displayed in the search "
"results?";
m->m_cgi = "tml";
m->m_defOff= (char *)&cr.m_titleMaxLen - x;
m->m_off = (char *)&si.m_titleMaxLen - y;
m->m_type = TYPE_LONG;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
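// Note the pattern used by this and many of the parms below: m_off
// points into SearchInput (OBJ_SI, per-request) while m_defOff
// (presumably) pulls the default from the collection rec, so a
// query can override the collection-wide setting per request,
// e.g. &tml=80 to allow 80-character titles for one search.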
/*
m->m_title = "use new summary generator";
m->m_desc = "Also used for gigabits and titles.";
m->m_cgi = "uns"; // external related pages
m->m_off = (char *)&cr.m_useNewSummaries - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_sparm = 1;
m->m_scgi = "uns";
m->m_soff = (char *)&si.m_useNewSummaries - y;
m++;
*/
m->m_title = "summary mode";
m->m_desc = "0 = old compatibility mode, 1 = UTF-8 mode, "
"2 = fast ASCII mode, "
"3 = Ascii Proximity Summary, "
"4 = Utf8 Proximity Summary, "
"5 = Ascii Pre Proximity Summary, "
"6 = Utf8 Pre Proximity Summary:";
m->m_cgi = "smd";
m->m_off = (char *)&cr.m_summaryMode - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
//m->m_scgi = "smd";
//m->m_soff = (char*) &si.m_summaryMode - y;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of summary excerpts";
m->m_desc = "How many summary excerpts to display per search result?";
m->m_cgi = "ns";
m->m_type = TYPE_LONG;
m->m_defOff= (char *)&cr.m_summaryMaxNumLines - x;
m->m_group = 0;
m->m_off = (char *)&si.m_numLinesInSummary - y;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "max summary line width";
m->m_desc = "&lt;br&gt; tags are inserted to keep the number "
"of chars in the summary per line at or below this width. "
"Also affects title. "
"Strings without spaces that exceed this "
"width are not split. Has no affect on xml or json feed, "
"only works on html.";
m->m_cgi = "sw";
//m->m_off = (char *)&cr.m_summaryMaxWidth - x;
m->m_off = (char *)&si.m_summaryMaxWidth - y;
m->m_defOff= (char *)&cr.m_summaryMaxWidth - x;
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "max summary excerpt length";
m->m_desc = "What is the maximum number of "
"characters allowed per summary excerpt?";
m->m_cgi = "smxcpl";
m->m_off = (char *)&si.m_summaryMaxNumCharsPerLine - y;
m->m_defOff= (char *)&cr.m_summaryMaxNumCharsPerLine - x;
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
/*
m->m_title = "enable page turk";
m->m_desc = "If enabled, search results shall feed the page turk "
"is used to mechanically rank websites.";
m->m_cgi = "ept";
m->m_def = "0";
m->m_off = (char *)&cr.m_pageTurkEnabled - x;
m->m_type = TYPE_BOOL;
m++;
*/
m->m_title = "results to scan for gigabits generation";
m->m_desc = "How many search results should we "
"scan for gigabit (related topics) generation. Set this to "
"zero to disable gigabits!";
m->m_cgi = "dsrt";
m->m_off = (char *)&si.m_docsToScanForTopics - y;
m->m_type = TYPE_LONG;
m->m_defOff= (char *)&cr.m_docsToScanForTopics - x;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "ip restriction for gigabits";
m->m_desc = "Should Gigablast only get one document per IP domain "
"and per domain for gigabits (related topics) generation?";
m->m_cgi = "ipr";
m->m_off = (char *)&si.m_ipRestrictForTopics - y;
m->m_defOff= (char *)&cr.m_ipRestrict - x;
m->m_type = TYPE_BOOL;
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "number of gigabits to show";
m->m_desc = "What is the number of gigabits (related topics) "
"displayed per query? Set to 0 to save a little CPU time.";
m->m_cgi = "nrt";
m->m_defOff= (char *)&cr.m_numTopics - x;
m->m_off = (char *)&si.m_numTopicsToDisplay - y;
m->m_type = TYPE_LONG;
m->m_def = "11";
m->m_group = 0;
m->m_sprpg = 0; // do not propagate
m->m_sprpp = 0; // do not propagate
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "min topics score";
m->m_desc = "Gigabits (related topics) with scores below this "
"will be excluded. Scores range from 0% to over 100%.";
m->m_cgi = "mts";
m->m_defOff= (char *)&cr.m_minTopicScore - x;
m->m_off = (char *)&si.m_minTopicScore - y;
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "min gigabit doc count by default";
m->m_desc = "How many documents must contain the gigabit "
"(related topic) in order for it to be displayed.";
m->m_cgi = "mdc";
m->m_defOff= (char *)&cr.m_minDocCount - x;
m->m_off = (char *)&si.m_minDocCount - y;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "dedup doc percent for gigabits (related topics)";
m->m_desc = "If a document is this percent similar to another "
"document with a higher score, then it will not contribute "
"to the gigabit generation.";
m->m_cgi = "dsp";
m->m_defOff= (char *)&cr.m_dedupSamplePercent - x;
m->m_off = (char *)&si.m_dedupSamplePercent - y;
m->m_type = TYPE_LONG;
m->m_def = "80";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
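// Example query-string usage of the gigabit knobs above (values
// are illustrative):
//   &dsrt=30&nrt=11&mts=50&mdc=2&dsp=80
// scans 30 results and shows up to 11 gigabits scoring at least
// 50% and appearing in at least 2 documents.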
///////////////////////////////////////////
// SPIDER PROXY CONTROLS
//
///////////////////////////////////////////
m->m_title = "always use spider proxies for all collections";
m->m_desc = "ALWAYS Use the spider proxies listed below for "
"spidering. If none are "
"listed then gb will not use any. Applies to all collections. "
"If you want to regulate this on a per collection basis then "
"set this to <b>NO</b> here and adjust the "
"proxy controls on the "
"<b>spider controls</b> page. If the list of proxy IPs below "
"is empty, then of course, no proxies will be used.";
m->m_cgi = "useproxyips";
m->m_xml = "useSpiderProxies";
m->m_off = (char *)&g_conf.m_useProxyIps - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
// hide this for now. just make it a per collection parm.
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_SPIDERPROXIES;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "automatically use spider proxies for all collections";
m->m_desc = "AUTOMATICALLY use the spider proxies listed below for "
"spidering. If none are "
"listed then gb will not use any. Applies to all collections. "
"If you want to regulate this on a per collection basis then "
"set this to <b>NO</b> here and adjust the "
"proxy controls on the "
"<b>spider controls</b> page. If the list of proxy IPs below "
"is empty, then of course, no proxies will be used.";
m->m_cgi = "autouseproxyips";
m->m_xml = "automaticallyUseSpiderProxies";
m->m_off = (char *)&g_conf.m_automaticallyUseProxyIps - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
// hide this for now. just make it a per collection parm.
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_SPIDERPROXIES;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "spider proxy ips";
m->m_desc = "List of white space-separated spider proxy IPs. Put "
"in IP:port format. Example <i>1.2.3.4:80 4.5.6.7:99</i>. "
"You can also use <i>username:password@1.2.3.4:80</i>. "
"If a proxy itself times out when downloading through it "
"it will be perceived as a normal download timeout and the "
"page will be retried according to the url filters table, so "
"you might want to modify the url filters to retry network "
"errors more aggressively. Search for 'private proxies' on "
"google to find proxy providers. Try to ensure all your "
"proxies are on different class C IPs if possible. "
"That is, the first 3 numbers in the IP addresses are all "
"different.";
m->m_cgi = "proxyips";
m->m_xml = "proxyIps";
m->m_off = (char *)&g_conf.m_proxyIps - g;
m->m_type = TYPE_SAFEBUF; // TYPE_IP;
m->m_def = "";
m->m_flags = PF_TEXTAREA | PF_REBUILDPROXYTABLE;
m->m_page = PAGE_SPIDERPROXIES;
m->m_obj = OBJ_CONF;
m++;
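// Example value for "proxyips", combining the plain and
// authenticated forms from the description (the third entry is a
// made-up illustration of the username:password@IP:port form):
//   1.2.3.4:80 4.5.6.7:99 myuser:mypass@8.9.10.11:3128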
m->m_title = "spider proxy test url";
m->m_desc = "Download this url every minute through each proxy "
"listed above to ensure they are up. Typically you should "
"make this a URL you own so you do not aggravate another "
"webmaster.";
m->m_xml = "proxyTestUrl";
m->m_cgi = "proxytesturl";
m->m_off = (char *)&g_conf.m_proxyTestUrl - g;
m->m_type = TYPE_SAFEBUF;
m->m_def = "http://www.gigablast.com/";
m->m_flags = 0;
m->m_page = PAGE_SPIDERPROXIES;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "reset proxy table";
m->m_desc = "Reset the proxy statistics in the table below. Makes "
"all your proxies treated like new again.";
m->m_cgi = "resetproxytable";
m->m_type = TYPE_CMD;
m->m_func = CommandResetProxyTable;
m->m_cast = 1;
m->m_page = PAGE_SPIDERPROXIES;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "mix up user agents";
m->m_desc = "Use random user-agents when downloading through "
"a spider proxy listed above to "
"protecting gb's anonymity. The User-Agent used is a function "
"of the proxy IP/port and IP of the url being downloaded. "
"That way it is consistent when downloading the same website "
"through the same proxy.";
m->m_cgi = "userandagents";
m->m_xml = "useRandAgents";
m->m_off = (char *)&g_conf.m_useRandAgents - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = 0;
m->m_page = PAGE_SPIDERPROXIES;
m->m_obj = OBJ_CONF;
m++;
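// A minimal sketch of the deterministic pick described above
// (assumed; the real agent table and hash live in the proxy code):
//   agent = agentTable[ hash(proxyIp, proxyPort, urlIp) % numAgents ];
// so repeat downloads of one site through one proxy reuse the same
// User-Agent.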
m->m_title = "squid proxy authorized users";
m->m_desc = "Gigablast can also simulate a squid proxy, "
"complete with "
"caching. It will forward your request to the proxies you "
"list above, if any. This list consists of space-separated "
"<i>username:password</i> items. Leave this list empty "
"to disable squid caching behaviour. The default cache "
"size for this is 10MB per shard. Use item *:* to allow "
"anyone access.";
m->m_xml = "proxyAuth";
m->m_cgi = "proxyAuth";
m->m_off = (char *)&g_conf.m_proxyAuth - g;
m->m_type = TYPE_SAFEBUF;
m->m_def = "";
m->m_flags = PF_TEXTAREA;
m->m_page = PAGE_SPIDERPROXIES;
m->m_obj = OBJ_CONF;
m++;
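// Usage sketch with hypothetical credentials and port: with
// "foo:bar" in the list above, an external client could fetch
// through gb's squid-like proxy with something like
//   curl --proxy http://foo:bar@gbhost:PORT/ http://example.com/
// Leave the list empty to disable this behaviour entirely.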
m->m_title = "max words per gigabit (related topic) by default";
m->m_desc = "Maximum number of words a gigabit (related topic) "
"can have. Affects xml feeds, too.";
m->m_cgi = "mwpt";
m->m_defOff= (char *)&cr.m_maxWordsPerTopic - x;
m->m_off = (char *)&si.m_maxWordsPerTopic - y;
m->m_type = TYPE_LONG;
m->m_def = "6";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "show images";
m->m_desc = "Should we return or show the thumbnail images in the "
"search results?";
m->m_cgi = "showimages";
m->m_off = (char *)&si.m_showImages - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_flags = PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "use cache";
m->m_desc = "Use 0 if Gigablast should not read or write from "
"any caches at any level.";
m->m_def = "-1";
m->m_off = (char *)&si.m_useCache - y;
m->m_type = TYPE_CHAR;
m->m_cgi = "usecache";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "read from cache";
m->m_desc = "Should we read search results from the cache? Set "
"to false to fix dmoz bug.";
m->m_cgi = "rcache";
m->m_off = (char *)&si.m_rcache - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_flags = PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "write to cache";
m->m_desc = "Use 0 if Gigablast should not write to "
"any caches at any level.";
m->m_def = "-1";
m->m_off = (char *)&si.m_wcache - y;
m->m_type = TYPE_CHAR;
m->m_cgi = "wcache";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
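// Example: appending &usecache=0 to a query URL bypasses every
// cache level for that request, while &rcache=0 skips only cache
// reads. The -1 defaults presumably mean "unset, let the server
// decide".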
m->m_title = "max serp docid";
m->m_desc = "Start displaying results after this score/docid pair. "
"Used by widget to append results to end when index is "
"volatile.";
m->m_def = "0";
m->m_off = (char *)&si.m_minSerpDocId - y;
m->m_type = TYPE_LONG_LONG;
m->m_cgi = "minserpdocid";
m->m_flags = PF_API;
m->m_smin = 0;
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "max serp score";
m->m_desc = "Start displaying results after this score/docid pair. "
"Used by widget to append results to end when index is "
"volatile.";
m->m_def = "0";
m->m_off = (char *)&si.m_maxSerpScore - y;
m->m_type = TYPE_DOUBLE;
m->m_cgi = "maxserpscore";
m->m_flags = PF_API;
m->m_smin = 0;
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
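// Widget pagination sketch, assumed from the two descriptions
// above: to append the next batch of results, pass the score/docid
// pair of the last result already shown, e.g.
//   &maxserpscore=<lastScore>&minserpdocid=<lastDocId>
// so new results start strictly after that pair even while the
// index is changing underneath.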
m->m_title = "restrict search to this url";
m->m_desc = "Does a url: query.";
m->m_off = (char *)&si.m_url - y;
m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = MAX_URL_LEN;
m->m_cgi = "url";
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "restrict search to pages that link to this url";
m->m_desc = "The url which the pages must link to.";
m->m_off = (char *)&si.m_link - y;
m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = MAX_URL_LEN;
m->m_cgi = "link";
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "search for this phrase quoted";
m->m_desc = "The phrase which will be quoted in the query. From the "
"advanced search page, adv.html.";
m->m_off = (char *)&si.m_quote1 - y;
m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = 512;
m->m_cgi = "quotea";
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "search for this second phrase quoted";
m->m_desc = "The phrase which will be quoted in the query. From the "
"advanced search page, adv.html.";
m->m_off = (char *)&si.m_quote2 - y;
m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = 512;
m->m_cgi = "quoteb";
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
/*
m->m_title = "restrict results to this site";
m->m_desc = "Returned results will have URLs from this site, X.";
m->m_off = (char *)&si.m_site - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "site";
m->m_size = 1024; // MAX_SITE_LEN;
m->m_sprpg = 1;
m->m_sprpp = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
*/
m->m_title = "restrict results to these sites";
m->m_desc = "Returned results will have URLs from these "
"space-separated list of sites. Can have up to 200 sites. "
"A site can include sub folders. This is allows you to build "
"a <a href=\"/cts.html\">Custom Topic Search Engine</a>.";
m->m_off = (char *)&si.m_sites - y;
m->m_type = TYPE_CHARPTR;
//m->m_size = 32*1024; // MAX_SITES_LEN;
m->m_cgi = "sites";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_sprpg = 1;
m->m_sprpp = 1;
m++;
m->m_title = "require these query terms";
m->m_desc = "Returned results will have all the words in X. "
"From the advanced search page, adv.html.";
m->m_off = (char *)&si.m_plus - y;
m->m_def = NULL;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "plus";
//m->m_size = 500;
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "avoid these query terms";
m->m_desc = "Returned results will NOT have any of the words in X. "
"From the advanced search page, adv.html.";
m->m_off = (char *)&si.m_minus - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "minus";
//m->m_size = 500;
m->m_sprpg = 0;
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "format of the returned search results";
m->m_desc = "Can be html, xml or json to get results back in that "
"format.";
m->m_def = "html";
m->m_off = (char *)&si.m_formatStr - y;
m->m_type = TYPE_CHARPTR;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_cgi = "format";
m->m_flags = PF_NOAPI; // already in the api, so don't repeat
m++;
m->m_title = "family filter";
m->m_desc = "Remove objectionable results if this is enabled.";
m->m_def = "0";
m->m_off = (char *)&si.m_familyFilter - y;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_cgi = "ff";
m++;
m->m_title = "highlight query terms in summaries";
m->m_desc = "Use to disable or enable "
"highlighting of the query terms in the summaries.";
m->m_def = "1";
m->m_off = (char *)&si.m_doQueryHighlighting - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "qh";
m->m_smin = 0;
m->m_smax = 8;
m->m_sprpg = 1; // turn off for now
m->m_sprpp = 1;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "cached page highlight query";
m->m_desc = "Highlight the terms in this query instead.";
m->m_def = NULL;
m->m_off = (char *)&si.m_highlightQuery - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "hq";
//m->m_size = 1000;
m->m_sprpg = 0; // no need to propagate this one
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
/*
m->m_title = "highlight event date in summaries.";
m->m_desc = "Can be 0 or 1 to respectively disable or enable "
"highlighting of the event date terms in the summaries.";
m->m_def = "0";
m->m_off = (char *)&si.m_doDateHighlighting - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "dh";
m->m_smin = 0;
m->m_smax = 8;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
*/
/*
m->m_title = "limit search results to this ruleset";
m->m_desc = "limit search results to this ruleset";
m->m_def = "0";
m->m_off = (char *)&si.m_ruleset - y;
m->m_type = TYPE_LONG;
m->m_cgi = "ruleset";
m->m_smin = 0;
m++;
*/
m->m_title = "Query match offsets";
m->m_desc = "Return a list of the offsets of each query word "
"actually matched in the document. 1 means byte offset, "
"and 2 means word offset.";
m->m_def = "0";
m->m_off = (char *)&si.m_queryMatchOffsets - y;
m->m_type = TYPE_LONG;
m->m_cgi = "qmo";
m->m_smin = 0;
m->m_smax = 2;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "boolean status";
m->m_desc = "Can be 0 or 1 or 2. 0 means the query is NOT boolean, "
"1 means the query is boolean and 2 means to auto-detect.";
m->m_def = "2";
m->m_off = (char *)&si.m_boolFlag - y;
m->m_type = TYPE_LONG;
m->m_cgi = "bq";
m->m_smin = 0;
m->m_smax = 2;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "meta tags to display";
m->m_desc = "A space-separated string of <b>meta tag names</b>. "
"Do not forget to url-encode the spaces to +'s or %%20's. "
"Gigablast will extract the contents of these specified meta "
"tags out of the pages listed in the search results and "
"display that content after each summary. i.e. "
"<i>&dt=description</i> will display the meta description of "
"each search result. <i>&dt=description:32+keywords:64</i> "
"will display the meta description and meta keywords of each "
"search result and limit the fields to 32 and 64 characters "
"respectively. When used in an XML feed the <i>&lt;display "
"name=\"meta_tag_name\"&gt;meta_tag_content&lt;/&gt;</i> XML "
"tag will be used to convey each requested meta tag's "
"content.";
m->m_off = (char *)&si.m_displayMetas - y;
m->m_type = TYPE_CHARPTR;
m->m_cgi = "dt";
//m->m_size = 3000;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
/*
// . you can have multiple topics= parms in you query url...
// . this is used to set the TopicGroups array in SearchInput
m->m_title = "related topic parameters";
m->m_desc =
"X=<b>NUM+MAX+SCAN+MIN+MAXW+META+DEL+IDF+DEDUP</b>\n"
"<br><br>\n"
"<b>NUM</b> is how many <b>related topics</b> you want "
"returned.\n"
"<br><br>\n"
"<b>MAX</b> is the maximum number of topics to generate "
"and store in cache, so if TW is increased, but still below "
"MT, it will result in a fast cache hit.\n"
"<br><br>\n"
"<b>SCAN</b> is how many documents to scan for related "
"topics. If this is 30, for example, then Gigablast will "
"scan the first 30 search results for related topics.\n"
"<br><br>\n"
"<b>MIN</b> is the minimum score of returned topics. Ranges "
"from 0%% to over 100%%. 50%% is considered pretty good. "
"BUG: This must be at least 1 to get any topics back.\n"
"<br><br>\n"
"<b>MAXW</b> is the maximum number of words per topic.\n"
"<br><br>\n"
"<b>META</b> is the meta tag name to which Gigablast will "
"restrict the content used to generate the topics. Do not "
"specify this field to restrict the content to the body of "
"each document, that is the default.\n"
"<br><br>\n"
"<b>DEL</b> is a single character delimeter which defines "
"the topic candidates. All candidates must be separated from "
"the other candidates with the delimeter. So &lt;meta "
"name=test content=\" cat dog ; pig rabbit horse\"&gt; "
"when using the ; as a delimeter would only have two topic "
"candidates: \"cat dog\" and \"pig rabbit horse\". If no "
"delimeter is provided, default funcationality is assumed.\n"
"<br><br>\n"
"<b>IDF</b> is 1, the default, if you want Gigablast to "
"weight topic candidates by their idf, 0 otherwise."
"<br><br>\n"
"<b>DEDUP</b> is 1, the default, if the topics should be "
"deduped. This involves removing topics that are substrings "
"or superstrings of other higher-scoring topics."
"<br><br>\n"
"Example: topics=49+100+30+1+6+author+%%3B+0+0"
"<br><br>\n"
"The default values for those parameters with unspecifed "
"defaults can be defined on the \"Search Controls\" page. "
"<br><br>\n"
"XML feeds will contain the generated topics like: "
"&lt;topic&gt;&lt;name&gt;&lt;![CDATA[some topic]]&gt;&lt;"
"/name&gt;&lt;score&gt;13&lt;/score&gt;&lt;from&gt;"
"metaTagName&lt;/from&gt;&lt;/topic&gt;"
"<br><br>\n"
"Even though somewhat nonstandard, you can specify multiple "
"<i>&amp;topic=</i> parameters to get back multiple topic "
"groups."
"<br><br>\n"
"Performance will decrease if you increase the MAX, SCAN or "
"MAXW.";
m->m_type = TYPE_STRING;
m->m_size = 512;
m->m_cgi = "topics";
m->m_size = 100;
// MDW: NO NO NO... was causing a write breach!!! -- take this all out
m->m_off = -2; // bogus offset
//m->m_off = (char *)&si.m_topics - y;
m++;
*/
m->m_title = "niceness";
m->m_desc = "Can be 0 or 1. 0 is usually a faster, high-priority "
"query, 1 is a slower, lower-priority query.";
m->m_def = "0";
m->m_off = (char *)&si.m_niceness - y;
m->m_type = TYPE_LONG;
m->m_cgi = "niceness";
m->m_smin = 0;
m->m_smax = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "debug flag";
m->m_desc = "Is 1 to log debug information, 0 otherwise.";
m->m_def = "0";
m->m_off = (char *)&si.m_debug - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "debug";
//m->m_priv = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return number of docs per topic";
m->m_desc = "Use 1 if you want Gigablast to return the number of "
"documents in the search results that contained each topic "
"(gigabit).";
m->m_def = "1";
m->m_off = (char *)&si.m_returnDocIdCount - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "rdc";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return docids per topic";
m->m_desc = "Use 1 if you want Gigablast to return the list of "
"docIds from the search results that contained each topic "
"(gigabit).";
m->m_def = "0";
m->m_off = (char *)&si.m_returnDocIds - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "rd";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return popularity per topic";
m->m_desc = "Use 1 if you want Gigablast to return the popularity "
"of each topic (gigabit).";
m->m_def = "0";
m->m_off = (char *)&si.m_returnPops - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "rp";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
//m->m_title = "compound list max size";
//m->m_desc = "Is the max size in bytes of the compound termlist. "
// "Each document id is 6 bytes.";
//m->m_def = "-1";
//m->m_off = (char *)&si.m_compoundListMaxSize - y;
//m->m_type = TYPE_LONG;
//m->m_cgi = "clms";
//m->m_smin = 0;
//m->m_priv = 1;
//m++;
m->m_title = "debug gigabits flag";
m->m_desc = "Is 1 to log gigabits debug information, 0 otherwise.";
m->m_def = "0";
m->m_off = (char *)&si.m_debugGigabits - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "debuggigabits";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return docids only";
m->m_desc = "Is 1 to return only docids as query results.";
m->m_def = "0";
m->m_off = (char *)&si.m_docIdsOnly - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "dio";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "image url";
m->m_desc = "The url of an image to co-brand on the search "
"results page.";
m->m_off = (char *)&si.m_imgUrl - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_def = NULL;
//m->m_size = 512;
m->m_cgi = "iu";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "image link";
m->m_desc = "The hyperlink to use on the image to co-brand on "
"the search results page.";
m->m_off = (char *)&si.m_imgLink - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_def = NULL;
//m->m_size = 512;
m->m_cgi = "ix";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "image width";
m->m_desc = "The width of the image on the search results page.";
m->m_off = (char *)&si.m_imgWidth - y;
m->m_type = TYPE_LONG;
m->m_cgi = "iw";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = "200";
m->m_flags = PF_NOAPI;
m++;
m->m_title = "image height";
m->m_desc = "The height of the image on the search results "
"page.";
m->m_off = (char *)&si.m_imgHeight - y;
m->m_type = TYPE_LONG;
m->m_cgi = "ih";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = "200";
m->m_flags = PF_NOAPI;
m++;
// m->m_title = "password";
// m->m_desc = "The password.";
// m->m_off = (char *)&si.m_pwd - y;
// m->m_type = TYPE_CHARPTR;//STRING;
// m->m_cgi = "pwd";
// m->m_size = 32;
// m->m_flags = PF_HIDDEN | PF_NOSAVE;
// m->m_page = PAGE_RESULTS;
// m->m_obj = OBJ_SI;
// m++;
m->m_title = "admin override";
m->m_desc = "admin override";
m->m_off = (char *)&si.m_isMasterAdmin - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_cgi = "admin";
m->m_sprpg = 1; // propagate on GET request
m->m_sprpp = 1; // propagate on POST request
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
/*
m->m_title = "language";
m->m_desc = "Language code to restrict search. 0 = All. Uses "
"Clusterdb to filter languages. This is being phased out "
"please do not use much, use gblang instead.";
m->m_off = (char *)&si.m_languageCode - y;
m->m_type = TYPE_STRING;
m->m_size = 5+1;
m->m_def = "none";
// our google gadget gets &lang=en passed to it from google, so
// change this!!
m->m_cgi = "clang";
m++;
*/
/*
this should be a hash on the lang abbr line gblang:en
m->m_title = "GB language";
m->m_desc = "Language code to restrict search. 0 = All. Uses "
"the gblang: keyword to filter languages.";
m->m_off = (char *)&si.m_gblang - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_cgi = "gblang";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
*/
// prepend to query
m->m_title = "prepend";
m->m_desc = "prepend this to the supplied query followed by a |.";
m->m_off = (char *)&si.m_prepend - y;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_cgi = "prepend";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "GB Country";
m->m_desc = "Country code to restrict search";
m->m_off = (char *)&si.m_gbcountry - y;
m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = 4+1;
m->m_def = NULL;
//m->m_def = "iso-8859-1";
m->m_cgi = "gbcountry";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
/*
m->m_title = "rerank ruleset";
m->m_desc = "Use this ruleset to rerank the search results. Will "
"rerank at least the first X results specified with &amp;n=X. "
"And be sure to say &amp;recycle=0 to recompute the quality "
"of each page in the search results.";
m->m_off = (char *)&si.m_rerankRuleset - y;
m->m_type = TYPE_LONG;
m->m_def = "-1";
m->m_cgi = "rerank";
m++;
m->m_title = "apply ruleset to roots";
m->m_desc = "Recompute the quality of the root urls of each "
"search result in order to compute the quality of that "
"search result, since it depends on its root quality. This "
"can take a lot longer when enabled.";
m->m_off = (char *)&si.m_artr - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_cgi = "artr";
m++;
*/
m->m_title = "show banned pages";
m->m_desc = "show banned pages";
m->m_off = (char *)&si.m_showBanned - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_cgi = "sb";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "allow punctuation in query phrases";
m->m_desc = "allow punctuation in query phrases";
m->m_off = (char *)&si.m_allowPunctInPhrase - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_cgi = "apip";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
/*
m->m_title = "use ad feed num";
m->m_desc = "use ad feed num";
m->m_off = (char *)&si.m_useAdFeedNum - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_cgi = "uafn";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
*/
/*
m->m_title = "do bot detection";
m->m_desc = "Passed in for raw feeds that want bot detection cgi "
"parameters passed back in the XML.";
m->m_off = (char *)&si.m_doBotDetection - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_cgi = "bd";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
*/
/*
m->m_title = "bot detection query";
m->m_desc = "Passed in for raw feeds that want bot detection cgi "
"parameters passed back in the XML. Use this variable "
"when an actual query against gigablast is not needed "
"(i.e. - image/video/news searches).";
m->m_off = (char *)&si.m_botDetectionQuery - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "bdq";
m->m_def = NULL;
m->m_size = MAX_QUERY_LEN;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
*/
m->m_title = "queryCharset";
m->m_desc = "Charset in which the query is encoded";
m->m_off = (char *)&si.m_queryCharset - y;
m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = 32+1;
m->m_def = "utf-8";
//m->m_def = "iso-8859-1";
m->m_cgi = "qcs";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
// buzz
m->m_title = "display inlinks";
m->m_desc = "Display all inlinks of each result.";
m->m_off = (char *)&si.m_displayInlinks - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_cgi = "inlinks";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
// buzz
m->m_title = "display outlinks";
m->m_desc = "Display all outlinks of each result. outlinks=1 "
"displays only external outlinks. outlinks=2 displays "
"external and internal outlinks.";
m->m_off = (char *)&si.m_displayOutlinks - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_cgi = "outlinks";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
// buzz
m->m_title = "display term frequencies";
m->m_desc = "Display Terms and Frequencies in results.";
m->m_off = (char *)&si.m_displayTermFreqs - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_cgi = "tf";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
// buzz
m->m_title = "spider results";
m->m_desc = "Results of this query will be forced into the spider "
"queue for reindexing.";
m->m_off = (char *)&si.m_spiderResults - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_cgi = "spiderresults";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
// buzz
m->m_title = "spider result roots";
m->m_desc = "Root urls of the results of this query will be forced "
"into the spider queue for reindexing.";
m->m_off = (char *)&si.m_spiderResultRoots - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_cgi = "spiderresultroots";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
// buzz
m->m_title = "just mark clusterlevels";
m->m_desc = "Check for deduping, but just mark the cluster levels "
"and the doc deduped against, don't remove the result.";
m->m_off = (char *)&si.m_justMarkClusterLevels - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_cgi = "jmcl";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "include cached copy of page";
m->m_desc = "Will cause a cached copy of content to be returned "
"instead of summary.";
m->m_off = (char *)&si.m_includeCachedCopy - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_cgi = "icc";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_API;
m++;
// m->m_title = "get section voting info in json";
// m->m_desc = "Will cause section voting info to be returned.";
// m->m_off = (char *)&si.m_getSectionVotingInfo - y;
// m->m_type = TYPE_CHAR;
// m->m_def = "0";
// m->m_cgi = "sectionvotes";
// m->m_page = PAGE_RESULTS;
// m->m_obj = OBJ_SI;
// m->m_flags = PF_API;
// m++;
//////////////
// END /search
//////////////
//////////
// PAGE GET (cached web pages)
///////////
m->m_title = "docId";
m->m_desc = "The docid of the cached page to view.";
m->m_off = (char *)&gr.m_docId - (char *)&gr;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_GET;
m->m_obj = OBJ_GBREQUEST; // generic request class
m->m_def = "0";
m->m_cgi = "d";
m->m_flags = PF_API | PF_REQUIRED;
m++;
m->m_title = "url";
m->m_desc = "Instead of specifying a docid, you can get the "
"cached webpage by url as well.";
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_type = TYPE_CHARPTR; // reference into the HttpRequest
m->m_page = PAGE_GET;
m->m_obj = OBJ_GBREQUEST; // generic request class
m->m_def = NULL;
m->m_cgi = "url";
m->m_flags = PF_API | PF_REQUIRED;
m++;
m->m_title = "collection";
m->m_desc = "Get the cached page from this collection.";
m->m_cgi = "c";
m->m_page = PAGE_GET;
m->m_obj = OBJ_GBREQUEST;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m->m_type = TYPE_CHARPTR;//SAFEBUF;
m->m_def = NULL;
m->m_flags = PF_REQUIRED | PF_API;
m++;
m->m_title = "strip";
m->m_desc = "Is 1 or 2 two strip various tags from the "
"cached content.";
m->m_off = (char *)&gr.m_strip - (char *)&gr;
m->m_page = PAGE_GET;
m->m_obj = OBJ_GBREQUEST;
m->m_cgi = "strip";
m->m_def = "0";
m->m_type = TYPE_LONG;
m->m_flags = PF_API;
m++;
m->m_title = "include header";
m->m_desc = "Is 1 to include the Gigablast header at the top of "
"the cached page, 0 to exclude the header.";
m->m_def = "1";
m->m_type = TYPE_BOOL;
m->m_page = PAGE_GET;
m->m_obj = OBJ_GBREQUEST;
m->m_cgi = "ih";
m->m_off = (char *)&gr.m_includeHeader - (char *)&gr;
m->m_flags = PF_API;
m++;
m->m_title = "query";
m->m_desc = "Highlight this query in the page.";
m->m_def = "";
m->m_type = TYPE_CHARPTR;
m->m_page = PAGE_GET;
m->m_obj = OBJ_GBREQUEST;
m->m_cgi = "q";
m->m_off = (char *)&gr.m_query - (char *)&gr;
m->m_flags = PF_API;
m++;
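// Example /get requests built from the parms above:
//   /get?c=main&d=123456&strip=1&ih=0
//   /get?c=main&url=http://example.com/&q=test
// The first fetches a cached page by docid with tags stripped and
// no Gigablast header; the second looks the page up by url and
// highlights "test".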
/*
// for /get
m->m_title = "query highlighting query";
m->m_desc = "Is 1 to highlight query terms in the cached page.";
m->m_def = "1";
m->m_type = TYPE_BOOL;
m->m_cgi = "qh";
m->m_off = (char *)&si.m_queryHighlighting - y;
m++;
*/
// for /addurl
/*
m->m_title = "url to add";
m->m_desc = "Used by add url page.";
m->m_type = TYPE_STRING;
m->m_size = MAX_URL_LEN;
m->m_cgi = "u";
m->m_off = (char *)&si.m_url2 - y;
m++;
*/
// Process.cpp calls Msg28::massConfig with &haspower=[0|1] to
// indicate power loss or coming back on from a power loss
m->m_title = "power on status notificiation";
m->m_desc = "Indicates power is back on.";
m->m_cgi = "poweron";
m->m_type = TYPE_CMD;
m->m_func = CommandPowerOnNotice;
m->m_cast = 0;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "power off status notificiation";
m->m_desc = "Indicates power is off.";
m->m_cgi = "poweroff";
m->m_type = TYPE_CMD;
m->m_func = CommandPowerOffNotice;
m->m_cast = 0;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
//////////////
// END PAGE_GET
//////////////
///////////////////////////////////////////
// MASTER CONTROLS
///////////////////////////////////////////
m->m_title = "spidering enabled";
m->m_desc = "Controls all spidering for all collections";
m->m_cgi = "se";
m->m_off = (char *)&g_conf.m_spideringEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
//m->m_cast = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "injections enabled";
m->m_desc = "Controls injecting for all collections";
m->m_cgi = "injen";
m->m_off = (char *)&g_conf.m_injectionsEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "querying enabled";
m->m_desc = "Controls querying for all collections";
m->m_cgi = "qryen";
m->m_off = (char *)&g_conf.m_queryingEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "return results even if a shard is down";
m->m_desc = "If you turn this off then Gigablast will return "
"an error message if a shard was down and did not return "
"results for a query. The XML and JSON feed let's you know "
"when a shard is down and will give you the results back "
"any way, but if you would rather have just and error message "
"and no results, then set then set this to 'NO'.";
m->m_cgi = "rra";
m->m_off = (char *)&g_conf.m_returnResultsAnyway - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max mem";
m->m_desc = "Mem available to this process. May be exceeded due "
"to fragmentation.";
m->m_cgi = "maxmem";
m->m_off = (char *)&g_conf.m_maxMem - g;
m->m_def = "8000000000";
m->m_obj = OBJ_CONF;
m->m_page = PAGE_MASTER; // PAGE_NONE;
m->m_type = TYPE_LONG_LONG;
//m->m_flags = PF_NOAPI;
m++;
m->m_title = "max total spiders";
m->m_desc = "What is the maximum number of web "
"pages the spider is allowed to download "
"simultaneously for ALL collections PER HOST? Caution: "
"raising this too high could result in some Out of Memory "
"(OOM) errors. The hard limit is currently 300. Each "
"collection has its own limit in the <i>spider controls</i> "
"that you may have to increase as well.";
m->m_cgi = "mtsp";
m->m_off = (char *)&g_conf.m_maxTotalSpiders - g;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "web spidering enabled";
m->m_desc = "Spiders events on web";
m->m_cgi = "wse";
m->m_off = (char *)&g_conf.m_webSpideringEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
m->m_title = "add url enabled";
m->m_desc = "Can people use the add url interface to add urls "
"to the index?";
m->m_cgi = "ae";
m->m_off = (char *)&g_conf.m_addUrlEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
//m->m_cast = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use collection passwords";
m->m_desc = "Should collections have individual password settings "
"so different users can administrer different collections? "
"If not the only the master passwords and IPs will be able "
"to administer any collection.";
m->m_cgi = "ucp";
m->m_off = (char *)&g_conf.m_useCollectionPasswords - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "allow cloud users";
m->m_desc = "Can guest users create and administer "
"a collection? Limit: 1 "
"collection per IP address. This is mainly for doing "
"demos on the gigablast.com domain.";
m->m_cgi = "acu";
m->m_off = (char *)&g_conf.m_allowCloudUsers - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "auto save frequency";
m->m_desc = "Save data in memory to disk after this many minutes "
"have passed without the data having been dumped or saved "
"to disk. Use 0 to disable.";
m->m_cgi = "asf";
m->m_off = (char *)&g_conf.m_autoSaveFrequency - g;
m->m_type = TYPE_LONG;
m->m_def = "5";
m->m_units = "mins";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max http sockets";
m->m_desc = "Maximum sockets available to serve incoming HTTP "
"requests. Too many outstanding requests will increase "
"query latency. Excess requests will simply have their "
"sockets closed.";
m->m_cgi = "ms";
m->m_off = (char *)&g_conf.m_httpMaxSockets - g;
m->m_type = TYPE_LONG;
// up this some, am seeing sockets closed because of using gb
// as a cache...
m->m_def = "300";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max https sockets";
m->m_desc = "Maximum sockets available to serve incoming HTTPS "
"requests. Like max http sockets, but for secure sockets.";
m->m_cgi = "mss";
m->m_off = (char *)&g_conf.m_httpsMaxSockets - g;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "spider user agent";
m->m_desc = "Identification seen by web servers when "
"the Gigablast spider downloads their web pages. "
"It is polite to insert a contact email address here so "
"webmasters that experience problems from the Gigablast "
"spider have somewhere to vent.";
m->m_cgi = "sua";
m->m_off = (char *)&g_conf.m_spiderUserAgent - g;
m->m_type = TYPE_STRING;
m->m_size = USERAGENTMAXSIZE;
m->m_def = "GigablastOpenSource/1.0";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
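// Example of the polite form the description suggests (the contact
// address is hypothetical):
//   GigablastOpenSource/1.0 (crawler; admin@example.com)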
m->m_title = "use temporary cluster";
m->m_desc = "Used by proxy to point to a temporary cluster while the "
"original cluster is updated with a new binary. The "
"temporary cluster is the same as the original cluster but "
"the ports are all incremented by one from what is in "
"the hosts.conf. This should ONLY be used for the proxy.";
m->m_cgi = "aotp";
m->m_off = (char *)&g_conf.m_useTmpCluster - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "url injection enabled";
m->m_desc = "If enabled you can directly inject URLs into the index.";
m->m_cgi = "ie";
m->m_off = (char *)&g_conf.m_injectionEnabled - g;
m->m_type = TYPE_BOOL;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_def = "1";
m++;
*/
m->m_title = "init QA tests";
m->m_desc = "If initiated gb performs some integrity tests "
"to ensure injecting, spidering and searching works "
"properly. Uses ./test/ subdirectory. Injects "
"urls in ./test/inject.txt. Spiders urls "
"in ./test/spider.txt. "
"Each of those two files is essentially a simple format of "
"a url followed by the http reply received from the server "
"for that url. "
// TODO: generate these files
;
m->m_cgi = "qasptei";
m->m_type = TYPE_CMD;
m->m_func = CommandSpiderTestInit;
m->m_def = "1";
m->m_cast = 1;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "init parser test run";
m->m_desc = "If enabled gb injects the urls in the "
"./test-parser/urls.txt "
"file and outputs ./test-parser/qa.html";
m->m_cgi = "qaptei";
m->m_type = TYPE_CMD;
m->m_func = CommandParserTestInit;
m->m_def = "1";
m->m_cast = 1;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "init spider test run";
m->m_desc = "If enabled gb injects the urls in "
"./test-spider/spider.txt "
"and spiders links.";
m->m_cgi = "qasptei";
m->m_type = TYPE_CMD;
m->m_func = CommandSpiderTestInit;
m->m_def = "1";
m->m_cast = 1;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "continue spider test run";
m->m_desc = "Resumes the test.";
m->m_cgi = "qaspter";
m->m_type = TYPE_CMD;
m->m_func = CommandSpiderTestCont;
m->m_def = "1";
m->m_cast = 1;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "do docid range splitting";
m->m_desc = "Split msg39 docids into ranges to save mem?";
m->m_cgi = "ddrs";
m->m_off = (char *)&g_conf.m_doDocIdRangeSplitting - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
*/
m->m_title = "qa search test enabled";
m->m_desc = "If enabled gb does the search queries in "
"./test-search/queries.txt and compares to the last run and "
"outputs the diffs for inspection and validation.";
m->m_cgi = "qasste";
m->m_off = (char *)&g_conf.m_testSearchEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
//m->m_cast = 0;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "just save";
m->m_desc = "Copies the data in memory to disk for just this host. "
"Does Not exit.";
m->m_cgi = "js";
m->m_type = TYPE_CMD;
m->m_func = CommandJustSave;
m->m_page = PAGE_MASTER;
m->m_cast = 0;
m++;
*/
m->m_title = "save";
m->m_desc = "Saves in-memory data for ALL hosts. Does Not exit.";
m->m_cgi = "js";
m->m_type = TYPE_CMD;
m->m_func = CommandJustSave;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "all spiders on";
m->m_desc = "Enable spidering on all hosts";
m->m_cgi = "ase";
m->m_def = "1";
m->m_off = (char *)&g_conf.m_spideringEnabled - g;
m->m_type = TYPE_BOOL2; // no yes or no, just a link
m++;
m->m_title = "all spiders off";
m->m_desc = "Disable spidering on all hosts";
m->m_cgi = "ase";
m->m_def = "0";
m->m_off = (char *)&g_conf.m_spideringEnabled - g;
m->m_type = TYPE_BOOL2; // no yes or no, just a link
m++;
*/
/*
m->m_title = "save & exit";
m->m_desc = "Copies the data in memory to disk for just this host "
"and then shuts down the gb process.";
m->m_cgi = "save";
m->m_type = TYPE_CMD;
m->m_func = CommandSaveAndExit;
m->m_cast = 0;
m++;
m->m_title = "urgent save & exit";
m->m_desc = "Copies the data in memory to disk for just this host "
"and then shuts down the gb process.";
m->m_cgi = "usave";
m->m_type = TYPE_CMD;
m->m_func = CommandUrgentSaveAndExit;
m->m_cast = 0;
m->m_priv = 4;
m++;
*/
m->m_title = "save & exit";
m->m_desc = "Saves the data and exits for ALL hosts.";
m->m_cgi = "save";
m->m_type = TYPE_CMD;
m->m_func = CommandSaveAndExit;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
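// TYPE_CMD parms like this one are triggered by passing the cgi name
// in the admin url query string; a hypothetical request:
//   GET /admin/master?save=1
// (the exact path for PAGE_MASTER is defined in Pages.cpp)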
m->m_title = "rebalance shards";
m->m_desc = "Tell all hosts to scan all records in all databases, "
"and move "
"records to the shard they belong to. You only need to run "
"this if Gigablast tells you to, when you are changing "
"hosts.conf to add or remove more nodes/hosts.";
m->m_cgi = "rebalance";
m->m_type = TYPE_CMD;
m->m_func = CommandRebalance;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dump to disk";
m->m_desc = "Flushes all records in memory to the disk on all hosts.";
m->m_cgi = "dump";
m->m_type = TYPE_CMD;
m->m_func = CommandDiskDump;
m->m_cast = 1;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "force reclaim";
m->m_desc = "Force reclaim of doledb mem.";
m->m_cgi = "forceit";
m->m_type = TYPE_CMD;
m->m_func = CommandForceIt;
m->m_cast = 1;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "tight merge posdb";
m->m_desc = "Merges all outstanding posdb (index) files.";
m->m_cgi = "pmerge";
m->m_type = TYPE_CMD;
m->m_func = CommandMergePosdb;
m->m_cast = 1;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
//m->m_title = "tight merge sectiondb";
//m->m_desc = "Merges all outstanding sectiondb files.";
//m->m_cgi = "smerge";
//m->m_type = TYPE_CMD;
//m->m_func = CommandMergeSectiondb;
//m->m_cast = 1;
//m++;
m->m_title = "tight merge titledb";
m->m_desc = "Merges all outstanding titledb (web page cache) files.";
m->m_cgi = "tmerge";
m->m_type = TYPE_CMD;
m->m_func = CommandMergeTitledb;
m->m_cast = 1;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "tight merge spiderdb";
m->m_desc = "Merges all outstanding spiderdb files.";
m->m_cgi = "spmerge";
m->m_type = TYPE_CMD;
m->m_func = CommandMergeSpiderdb;
m->m_cast = 1;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "clear kernel error message";
m->m_desc = "Clears the kernel error message. You must do this "
"to stop getting email alerts for a kernel ring buffer "
"error alert.";
m->m_cgi = "clrkrnerr";
m->m_type = TYPE_CMD;
m->m_func = CommandClearKernelError;
m->m_cast = 1;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "disk page cache off";
m->m_desc = "Disable all disk page caches to save mem for "
"tmp cluster. Run "
"gb cacheoff to do for all hosts.";
m->m_cgi = "dpco";
m->m_type = TYPE_CMD;
m->m_func = CommandDiskPageCacheOff;
m->m_cast = 1;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
//m->m_title = "http server enabled";
//m->m_desc = "Disable this if you do not want anyone hitting your "
// "http server. Admin and local IPs are still permitted, "
// "however.";
//m->m_cgi = "hse";
//m->m_off = (char *)&g_conf.m_httpServerEnabled - g;
//m->m_type = TYPE_BOOL;
//m->m_def = "1";
//m++;
/*
m->m_title = "ad feed enabled";
m->m_desc = "Serves ads unless pure=1 is in cgi parms.";
m->m_cgi = "afe";
m->m_off = (char *)&g_conf.m_adFeedEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_scgi = "ads";
m->m_soff = (char *)&si.m_adFeedEnabled - y;
m->m_sparm = 1;
m->m_priv = 2;
m++;
*/
m->m_title = "do stripe balancing";
m->m_desc = "Stripe #n contains twin #n from each group. Doing "
"stripe balancing helps prevent too many query requests "
"coming into one host. This parm is only for the proxy. "
"Stripe balancing is done by default unless the parm is "
"disabled on the proxy in which case it appends a "
"&dsb=0 to the query url it sends to the host. The proxy "
"alternates to which host it forwards the incoming query "
"based on the stripe. It takes the number of query terms in "
"the query into account to make a more even balance.";
m->m_cgi = "dsb";
m->m_off = (char *)&g_conf.m_doStripeBalancing - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
//m->m_scgi = "dsb";
//m->m_soff = (char *)&si.m_doStripeBalancing - y;
//m->m_sparm = 1;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "is live cluster";
m->m_desc = "Is this cluster part of a live production cluster? "
"If this is true we make sure that elvtune is being "
"set properly for best performance, otherwise, gb will "
"not startup.";
m->m_cgi = "live";
m->m_off = (char *)&g_conf.m_isLive - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "is BuzzLogic";
m->m_desc = "Is this a BuzzLogic cluster?";
m->m_cgi = "isbuzz";
m->m_off = (char *)&g_conf.m_isBuzzLogic - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
// we use wikipedia cluster for quick categorization
m->m_title = "is wikipedia cluster";
m->m_desc = "Is this cluster just used for indexing wikipedia pages?";
m->m_cgi = "iswiki";
m->m_off = (char *)&g_conf.m_isWikipedia - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "ask for gzipped docs when downloading";
m->m_desc = "If this is true, gb will send Accept-Encoding: gzip "
"to web servers when doing http downloads. It does have "
"a tendency to cause out-of-memory errors when you enable "
"this, so until that is fixed better, it's probably a good "
"idea to leave this disabled.";
m->m_cgi = "afgdwd";
m->m_off = (char *)&g_conf.m_gzipDownloads - g;
m->m_type = TYPE_BOOL;
// keep this default off because it seems some pages are huge
// uncompressed causing OOM errors and possibly corrupting stuff?
// not sure exactly, but i don't like going OOM. so maybe until
// that is fixed leave this off.
m->m_def = "0";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
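// When enabled the spider's download request carries the header
//   Accept-Encoding: gzip
// and gb must gunzip the reply body, which is where the large-document
// out-of-memory risk mentioned above comes in.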
m->m_title = "search results cache max age";
m->m_desc = "How many seconds should we cache a search results "
"page for?";
m->m_cgi = "srcma";
m->m_off = (char *)&g_conf.m_searchResultsMaxCacheAge - g;
m->m_def = "10800"; // 3 hrs
m->m_type = TYPE_LONG;
m->m_units = "seconds";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "autoban IPs which violate the queries per day quotas";
m->m_desc = "Keep track of ips which do queries, disallow "
"non-customers from hitting us too hard.";
m->m_cgi = "ab";
m->m_off = (char *)&g_conf.m_doAutoBan - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
if ( g_isYippy ) {
m->m_title = "Max outstanding search requests out for yippy";
m->m_desc = "Max outstanding search requests out for yippy";
m->m_cgi = "ymo";
m->m_off = (char *)&g_conf.m_maxYippyOut - g;
m->m_type = TYPE_LONG;
m->m_def = "150";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
}
m->m_title = "free queries per day ";
m->m_desc = "Non-customers get this many queries per day before"
"being autobanned";
m->m_cgi = "nfqpd";
m->m_off = (char *)&g_conf.m_numFreeQueriesPerDay - g;
m->m_type = TYPE_LONG;
m->m_def = "1024";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "free queries per minute ";
m->m_desc = "Non-customers get this many queries per minute before"
"being autobanned";
m->m_cgi = "nfqpm";
m->m_off = (char *)&g_conf.m_numFreeQueriesPerMinute - g;
m->m_type = TYPE_CHAR;
m->m_def = "30";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max heartbeat delay in milliseconds";
m->m_desc = "If a heartbeat is delayed this many milliseconds "
"dump a core so we can see where the CPU was. "
"Logs 'db: missed heartbeat by %"INT64" ms'. "
"Use 0 or less to disable.";
m->m_cgi = "mhdms";
m->m_off = (char *)&g_conf.m_maxHeartbeatDelay - g;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_CLONE; // PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
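// A minimal sketch of the check this parm implies; names here are
// illustrative, the real loop lives in the heartbeat code:
//   int64_t late = nowMs - lastHeartbeatMs;
//   if ( g_conf.m_maxHeartbeatDelay > 0 &&
//        late > g_conf.m_maxHeartbeatDelay )
//        // dump core so we can see where the cpu was
//        { char *xx=NULL; *xx=0; }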
m->m_title = "max delay before logging a callback or handler";
m->m_desc = "If a call to a message callback or message handler "
"in the udp server takes more than this many milliseconds, "
"then log it. "
"Logs 'udp: Took %"INT64" ms to call callback for msgType="
"0x%hhx niceness=%"INT32"'. "
"Use -1 or less to disable the logging.";
m->m_cgi = "mdch";
m->m_off = (char *)&g_conf.m_maxCallbackDelay - g;
m->m_type = TYPE_LONG;
m->m_def = "-1";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "sendmail IP";
m->m_desc = "We send crawlbot notification emails to this sendmail "
"server which forwards them to the specified email address.";
m->m_cgi = "smip";
m->m_off = (char *)&g_conf.m_sendmailIp - g;
m->m_type = TYPE_STRING;
m->m_def = "10.5.54.47";
m->m_size = MAX_MX_LEN;
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send email alerts";
m->m_desc = "Sends emails to admin if a host goes down.";
m->m_cgi = "sea";
m->m_off = (char *)&g_conf.m_sendEmailAlerts - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "delay non critical email alerts";
m->m_desc = "Do not send email alerts about dead hosts to "
"anyone except sysadmin@gigablast.com between the times "
"given below unless all the twins of the dead host are "
"also dead. Instead, wait till after if the host "
"is still dead. ";
m->m_cgi = "dnca";
m->m_off = (char *)&g_conf.m_delayNonCriticalEmailAlerts - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
//m->m_title = "send email alerts to matt at tmobile 450-3518";
//m->m_desc = "Sends to cellphone.";
//m->m_cgi = "seatmt";
//m->m_off = (char *)&g_conf.m_sendEmailAlertsToMattTmobile - g;
//m->m_type = TYPE_BOOL;
//m->m_def = "1";
//m->m_priv = 2;
//m->m_group = 0;
//m++;
//m->m_title = "send email alerts to matt at alltel 362-6809";
/*
m->m_title = "send email alerts to matt at alltel 450-3518";
m->m_desc = "Sends to cellphone.";
m->m_cgi = "seatmv";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToMattAlltell - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_priv = 2;
m->m_group = 0;
m++;
m->m_title = "send email alerts to javier";
m->m_desc = "Sends to cellphone.";
m->m_cgi = "seatj";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToJavier - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
*/
// m->m_title = "send email alerts to melissa";
// m->m_desc = "Sends to cell phone.";
// m->m_cgi = "seatme";
// m->m_off = (char *)&g_conf.m_sendEmailAlertsToMelissa - g;
// m->m_type = TYPE_BOOL;
// m->m_def = "0";
// m->m_priv = 2;
// m->m_group = 0;
// m++;
/*
m->m_title = "send email alerts to partap";
m->m_desc = "Sends to cell phone.";
m->m_cgi = "seatp";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToPartap - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
*/
// m->m_title = "send email alerts to cinco";
// m->m_desc = "Sends to cell phone.";
// m->m_cgi = "seatc";
// m->m_off = (char *)&g_conf.m_sendEmailAlertsToCinco - g;
// m->m_type = TYPE_BOOL;
// m->m_def = "0";
// m->m_priv = 2;
// m->m_group = 0;
// m++;
/* m->m_title = "maximum hops from parent page";
m->m_desc = "Only index pages that are within a particular number "
"of hops from the parent page given in Page Add Url. -1 means "
"that max hops is infinite.";
m->m_cgi = "mnh";
m->m_off = (char *)&cr.m_maxNumHops - x;
m->m_type = TYPE_CHAR2;
m->m_def = "-1";
m->m_group = 0;
m++;*/
m->m_title = "cluster name";
m->m_desc = "Email alerts will include the cluster name";
m->m_cgi = "cn";
m->m_off = (char *)&g_conf.m_clusterName - g;
m->m_type = TYPE_STRING;
m->m_size = 32;
m->m_def = "unspecified";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_title = "spider round start time";
m->m_desc = "When the next spider round starts. If you force this to "
"zero it sets it to the current time. That way you can "
"respider all the urls that were already spidered, and urls "
"that were not yet spidered in the round will still be "
"spidered.";
m->m_cgi = "spiderRoundStart";
m->m_size = 0;
m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
m++;
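// Forcing a respider of the current round from the admin api might
// look like this; the page path is illustrative, the parm is matched
// by its cgi name:
//   GET /admin/spider?c=main&spiderRoundStart=0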
// DIFFBOT:
// this http parm actually ads the "forceround" parm to the parmlist
// below with the appropriate args.
m->m_title = "manually restart a spider round";
m->m_desc = "Updates round number and resets local processed "
"and crawled counts to 0.";
m->m_cgi = "roundStart";
m->m_type = TYPE_CMD;
m->m_func = NULL;
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN;
m++;
// DIFFBOT:
// . this is sent to each shard by issuing a "&roundStart=1" cmd
// . similar to the "addcoll" cmd we add args to it and make it
// the "forceround" cmd parm and add THAT to the parmlist.
// so "roundStart=1" is really an alias for us.
m->m_title = "manually restart a spider round on shard";
m->m_desc = "Updates round number and resets local processed "
"and crawled counts to 0.";
m->m_cgi = "forceround";
//m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
m->m_type = TYPE_CMD;
m->m_func = CommandForceNextSpiderRound;
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
m++;
m->m_title = "spider round num";
m->m_desc = "The spider round number.";
m->m_cgi = "spiderRoundNum";
m->m_off = (char *)&cr.m_spiderRoundNum - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN ;
m++;
m->m_title = "send email alerts to sysadmin";
m->m_desc = "Sends to sysadmin@gigablast.com.";
m->m_cgi = "seatsa";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToSysadmin - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_priv = 2;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "send email alerts to zak";
m->m_desc = "Sends to zak@gigablast.com.";
m->m_cgi = "seatz";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToZak - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
m->m_title = "send email alerts to sabino";
m->m_desc = "Sends to cell phone.";
m->m_cgi = "seatms";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToSabino - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
*/
m->m_title = "dead host timeout";
m->m_desc = "Consider a host in the Gigablast network to be dead if "
"it does not respond to successive pings for this number of "
"seconds. Gigablast does not send requests to dead hosts. "
"Outstanding requests may be re-routed to a twin.";
m->m_cgi = "dht";
m->m_off = (char *)&g_conf.m_deadHostTimeout - g;
m->m_type = TYPE_LONG;
m->m_def = "4000";
m->m_units = "milliseconds";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
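// Minimal sketch of the deadness test this timeout implies; the field
// name is illustrative, see PingServer.cpp for the real logic:
//   bool isDead = ( nowMs - h->m_lastPingAckMs >
//                   g_conf.m_deadHostTimeout );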
m->m_title = "send email timeout";
m->m_desc = "Send an email after a host has not responded to "
"successive pings for this many milliseconds.";
m->m_cgi = "set";
m->m_off = (char *)&g_conf.m_sendEmailTimeout - g;
m->m_type = TYPE_LONG;
m->m_def = "62000";
m->m_priv = 2;
m->m_units = "milliseconds";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "ping spacer";
m->m_desc = "Wait this many milliseconds before pinging the next "
"host. Each host pings all other hosts in the network.";
m->m_cgi = "ps";
m->m_off = (char *)&g_conf.m_pingSpacer - g;
m->m_min = 50; // i've seen values of 0 hammer the cpu
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_units = "milliseconds";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
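// Rough pacing math: with the default 100ms spacer a host pinging N-1
// peers takes about (N-1)*100 ms per full cycle, e.g. ~6.3 seconds
// for a 64-host network.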
//m->m_title = "max query time";
//m->m_desc = "When computing the avgerage query latency "
// "truncate query latency times to this so that "
// "a single insanely int32_t query latency time does "
// "not trigger the alarm. This is in seconds.";
//m->m_cgi = "mqlr";
//m->m_off = (char *)&g_conf.m_maxQueryTime - g;
//m->m_type = TYPE_FLOAT;
//m->m_def = "30.0";
//m->m_priv = 2;
//m->m_group = 0;
//m++;
m->m_title = "query success rate threshold";
m->m_desc = "Send email alerts when query success rate goes below "
"this threshold. (percent rate between 0.0 and 1.0)";
m->m_cgi = "qsrt";
m->m_off = (char *)&g_conf.m_querySuccessThreshold - g;
m->m_type = TYPE_FLOAT;
m->m_def = "0.850000";
m->m_priv = 2;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
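// Sketch of the alert condition, assuming success/total counts are
// tracked over the window set by "number of query times in average"
// below (sendEmailAlert() is a stand-in, not the real function name):
//   float rate = (float)numSuccesses / (float)numQueries;
//   if ( rate < g_conf.m_querySuccessThreshold ) sendEmailAlert(...);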
m->m_title = "average query latency threshold";
m->m_desc = "Send email alerts when average query latency goes above "
"this threshold. (in seconds)";
m->m_cgi = "aqpst";
m->m_off = (char *)&g_conf.m_avgQueryTimeThreshold - g;
m->m_type = TYPE_FLOAT;
// a titlerec fetch times out after 2 seconds and is re-routed
m->m_def = "2.000000";
m->m_priv = 2;
m->m_units = "seconds";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "number of query times in average";
m->m_desc = "Record this number of query times before calculating "
"average query latency.";
m->m_cgi = "nqt";
m->m_off = (char *)&g_conf.m_numQueryTimes - g;
m->m_type = TYPE_LONG;
m->m_def = "300";
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max corrupt index lists";
m->m_desc = "If we reach this many corrupt index lists, send "
"an admin email. Set to -1 to disable.";
m->m_cgi = "mcil";
m->m_off = (char *)&g_conf.m_maxCorruptLists - g;
m->m_type = TYPE_LONG;
m->m_def = "5";
m->m_priv = 2;
m->m_group = 0;
m->m_flags = PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max hard drive temperature";
m->m_desc = "At what temperature in Celsius should we send "
"an email alert if a hard drive reaches it?";
m->m_cgi = "mhdt";
m->m_off = (char *)&g_conf.m_maxHardDriveTemp - g;
m->m_type = TYPE_LONG;
m->m_def = "45";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "delay emails after";
m->m_desc = "If delay non critical email alerts is on, don't send "
"emails after this time. Time is hh:mm. Time is take from "
"host #0's system clock in UTC.";
m->m_cgi = "dea";
m->m_off = (char *)&g_conf.m_delayEmailsAfter - g;
m->m_type = TYPE_TIME; // time format -- very special
m->m_def = "00:00";
m->m_priv = 2;
m++;
m->m_title = "delay emails before";
m->m_desc = "If delay non critical email alerts is on, don't send "
"emails before this time. Time is hh:mm Time is take from "
"host #0's system clock in UTC.";
m->m_cgi = "deb";
m->m_off = (char *)&g_conf.m_delayEmailsBefore - g;
m->m_type = TYPE_TIME; // time format -- very special
m->m_def = "00:00";
m->m_priv = 2;
m++;
*/
/*
Disable this until it works.
m->m_title = "use merge token";
m->m_desc = "If used, prevents twins, or hosts on the same ide "
"channel, from merging simultaneously.";
m->m_cgi = "umt";
m->m_off = (char *)&g_conf.m_useMergeToken - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
*/
m->m_title = "error string 1";
m->m_desc = "Look for this string in the kernel buffer for sending "
"email alert. Useful for detecting some strange "
"hard drive failures that really slow performance.";
m->m_cgi = "errstrone";
m->m_off = (char *)&g_conf.m_errstr1 - g;
m->m_type = TYPE_STRING;
m->m_def = "I/O error";
m->m_size = MAX_URL_LEN;
m->m_priv = 2;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "error string 2";
m->m_desc = "Look for this string in the kernel buffer for sending "
"email alert. Useful for detecting some strange "
"hard drive failures that really slow performance.";
m->m_cgi = "errstrtwo";
m->m_off = (char *)&g_conf.m_errstr2 - g;
m->m_type = TYPE_STRING;
m->m_def = "";
m->m_size = MAX_URL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "error string 3";
m->m_desc = "Look for this string in the kernel buffer for sending "
"email alert. Useful for detecting some strange "
"hard drive failures that really slow performance.";
m->m_cgi = "errstrthree";
m->m_off = (char *)&g_conf.m_errstr3 - g;
m->m_type = TYPE_STRING;
m->m_def = "";
m->m_size = MAX_URL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send email alerts to email 1";
m->m_desc = "Sends to email address 1 through email server 1.";
m->m_cgi = "seatone";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToEmail1 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send parm change email alerts to email 1";
m->m_desc = "Sends to email address 1 through email server 1 if "
"any parm is changed.";
m->m_cgi = "seatonep";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail1 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "email server 1";
m->m_desc = "Connects to this IP or hostname "
"directly when sending email 1. "
"Use <i>apt-get install sendmail</i> to install sendmail "
"on that IP or hostname. Add <i>From:10.5 RELAY</i> to "
"/etc/mail/access to allow sendmail to forward email it "
"receives from gigablast if gigablast hosts are on the "
"10.5.*.* IPs. Then run <i>/etc/init.d/sendmail restart</i> "
"as root to pick up those changes so sendmail will forward "
"Gigablast's email to the email address you give below.";
m->m_cgi = "esrvone";
m->m_off = (char *)&g_conf.m_email1MX - g;
m->m_type = TYPE_STRING;
m->m_def = "127.0.0.1";
m->m_size = MAX_MX_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
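// Concrete example of the sendmail setup described above, assuming
// gigablast hosts live on the 10.5.*.* IPs:
//   # /etc/mail/access
//   From:10.5    RELAY
// then run "/etc/init.d/sendmail restart" as root to pick it up.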
m->m_title = "email address 1";
m->m_desc = "Sends to this address when sending email 1 ";
m->m_cgi = "eaddrone";
m->m_off = (char *)&g_conf.m_email1Addr - g;
m->m_type = TYPE_STRING;
m->m_def = "4081234567@vtext.com";
m->m_size = MAX_EMAIL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "from email address 1";
m->m_desc = "The from field when sending email 1 ";
m->m_cgi = "efaddrone";
m->m_off = (char *)&g_conf.m_email1From - g;
m->m_type = TYPE_STRING;
m->m_def = "sysadmin@mydomain.com";
m->m_size = MAX_EMAIL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send email alerts to email 2";
m->m_desc = "Sends to email address 2 through email server 2.";
m->m_cgi = "seattwo";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToEmail2 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send parm change email alerts to email 2";
m->m_desc = "Sends to email address 2 through email server 2 if "
"any parm is changed.";
m->m_cgi = "seattwop";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail2 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "email server 2";
m->m_desc = "Connects to this server directly when sending email 2 ";
m->m_cgi = "esrvtwo";
m->m_off = (char *)&g_conf.m_email2MX - g;
m->m_type = TYPE_STRING;
m->m_def = "mail.mydomain.com";
m->m_size = MAX_MX_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "email address 2";
m->m_desc = "Sends to this address when sending email 2 ";
m->m_cgi = "eaddrtwo";
m->m_off = (char *)&g_conf.m_email2Addr - g;
m->m_type = TYPE_STRING;
m->m_def = "";
m->m_size = MAX_EMAIL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "from email address 2";
m->m_desc = "The from field when sending email 2 ";
m->m_cgi = "efaddrtwo";
m->m_off = (char *)&g_conf.m_email2From - g;
m->m_type = TYPE_STRING;
m->m_def = "sysadmin@mydomain.com";
m->m_size = MAX_EMAIL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send email alerts to email 3";
m->m_desc = "Sends to email address 3 through email server 3.";
m->m_cgi = "seatthree";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToEmail3 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send parm change email alerts to email 3";
m->m_desc = "Sends to email address 3 through email server 3 if "
"any parm is changed.";
m->m_cgi = "seatthreep";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail3 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "email server 3";
m->m_desc = "Connects to this server directly when sending email 3 ";
m->m_cgi = "esrvthree";
m->m_off = (char *)&g_conf.m_email3MX - g;
m->m_type = TYPE_STRING;
m->m_def = "mail.mydomain.com";
m->m_size = MAX_MX_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "email address 3";
m->m_desc = "Sends to this address when sending email 3 ";
m->m_cgi = "eaddrthree";
m->m_off = (char *)&g_conf.m_email3Addr - g;
m->m_type = TYPE_STRING;
m->m_def = "";
m->m_size = MAX_EMAIL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "from email address 3";
m->m_desc = "The from field when sending email 3 ";
m->m_cgi = "efaddrthree";
m->m_off = (char *)&g_conf.m_email3From - g;
m->m_type = TYPE_STRING;
m->m_def = "sysadmin@mydomain.com";
m->m_size = MAX_EMAIL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send email alerts to email 4";
m->m_desc = "Sends to email address 4 through email server 4.";
m->m_cgi = "seatfour";
m->m_off = (char *)&g_conf.m_sendEmailAlertsToEmail4 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send parm change email alerts to email 4";
m->m_desc = "Sends to email address 4 through email server 4 if "
"any parm is changed.";
m->m_cgi = "seatfourp";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail4 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "email server 4";
m->m_desc = "Connects to this server directly when sending email 4 ";
m->m_cgi = "esrvfour";
m->m_off = (char *)&g_conf.m_email4MX - g;
m->m_type = TYPE_STRING;
m->m_def = "mail.mydomain.com";
m->m_size = MAX_MX_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "email address 4";
m->m_desc = "Sends to this address when sending email 4 ";
m->m_cgi = "eaddrfour";
m->m_off = (char *)&g_conf.m_email4Addr - g;
m->m_type = TYPE_STRING;
m->m_def = "";
m->m_size = MAX_EMAIL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "from email address 4";
m->m_desc = "The from field when sending email 4 ";
m->m_cgi = "efaddrfour";
m->m_off = (char *)&g_conf.m_email4From - g;
m->m_type = TYPE_STRING;
m->m_def = "sysadmin@mydomain.com";
m->m_size = MAX_EMAIL_LEN;
m->m_priv = 2;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "prefer local reads";
m->m_desc = "If you have scsi drives or a slow network, say yes here "
"to minimize data fetches across the network.";
m->m_cgi = "plr";
m->m_off = (char *)&g_conf.m_preferLocalReads - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "use biased tfndb";
m->m_desc = "Should we always send titledb record lookup requests "
"to a particular host in order to increase tfndb page cache "
"hits? This bypasses load balancing and may result in "
"slower hosts being more of a bottleneck. Keep this disabled "
"unless you notice tfndb disk seeks slowing things down.";
m->m_cgi = "ubu";
m->m_off = (char *)&g_conf.m_useBiasedTfndb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
*/
// this is ifdef'd out in Msg3.cpp for performance reasons,
// so do it here, too
#ifdef GBSANITYCHECK
m->m_title = "max corrupted read retries";
m->m_desc = "How many times to retry disk reads that had corrupted "
"data before requesting the list from a twin, and, if that "
"fails, removing the bad data.";
m->m_cgi = "crr";
m->m_off = (char *)&g_conf.m_corruptRetries - g;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
#endif
m->m_title = "do incremental updating";
m->m_desc = "When reindexing a document, do not re-add data "
"that should already be in index or clusterdb "
"since the last time the document was indexed. Otherwise, "
"re-add the data regardless.";
m->m_cgi = "oic";
//m->m_off = (char *)&g_conf.m_onlyAddUnchangedTermIds - g;
m->m_off = (char *)&g_conf.m_doIncrementalUpdating - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
// you can really screw up the index if this is false, so
// comment it out for now
/*
m->m_title = "index deletes";
m->m_desc = "Should we allow indexdb recs to be deleted? This is "
"always true, except in very rare indexdb rebuilds.";
m->m_cgi = "id";
m->m_off = (char *)&g_conf.m_indexDeletes - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
*/
m->m_title = "use etc hosts";
m->m_desc = "Use /etc/hosts file to resolve hostnames? the "
"/etc/host file is reloaded every minute, so if you make "
"a change to it you might have to wait one minute for the "
"change to take affect.";
m->m_cgi = "ueh";
m->m_off = (char *)&g_conf.m_useEtcHosts - g;
m->m_def = "0";
m->m_type = TYPE_BOOL;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "twins are split";
m->m_desc = "If enabled, Gigablast assumes the first half of "
"machines in hosts.conf "
"are on a different network switch than the second half, "
"and minimizes transmits between the switches.";
m->m_cgi = "stw";
m->m_off = (char *)&g_conf.m_splitTwins - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "do out of memory testing";
m->m_desc = "When enabled Gigablast will randomly fail at "
"allocating memory. Used for testing stability.";
m->m_cgi = "dot";
m->m_off = (char *)&g_conf.m_testMem - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "do consistency testing";
m->m_desc = "When enabled Gigablast will make sure it reparses "
"the document exactly the same way. It does this every "
"1000th document anyway, but enabling this makes it do it "
"for every document.";
m->m_cgi = "dct";
m->m_off = (char *)&g_conf.m_doConsistencyTesting - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use shotgun";
m->m_desc = "If enabled, all servers must have two gigabit "
"ethernet ports hooked up and Gigablast will round robin "
"packets between both ethernet ports when sending to another "
"host. Can speed up network transmissions as much as 2x.";
m->m_cgi = "usht";
m->m_off = (char *)&g_conf.m_useShotgun - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use quickpoll";
m->m_desc = "If enabled, Gigablast will use quickpoll. Significantly "
"improves performance. Only turn this off for testing.";
m->m_cgi = "uqp";
m->m_off = (char *)&g_conf.m_useQuickpoll - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
// m->m_title = "quickpoll core on error";
// m->m_desc = "If enabled, quickpoll will terminate the process and "
// "generate a core file when callbacks are called with the "
// "wrong niceness.";
// m->m_cgi = "qpoe";
// m->m_off = (char *)&g_conf.m_quickpollCoreOnError - g;
// m->m_type = TYPE_BOOL;
// m->m_def = "1";
// m++;
// . this will leak the shared mem if the process is Ctrl+C'd
// . that is expected behavior
// . you can clean up the leaks using 'gb freecache 20000000'
// and use 'ipcs -m' to see what leaks you got
// . generally, only the main gb should use shared mem, so
// keep this off for testing
m->m_title = "use shared mem";
m->m_desc = "If enabled, Gigablast will use shared memory. "
"Should really only be used on the live cluster, "
"keep this on the testing cluster since it can "
"leak easily.";
m->m_cgi = "ushm";
m->m_off = (char *)&g_conf.m_useSHM - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
// disable disk caches... for testing really
/*
m->m_title = "use disk page cache for indexdb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpci";
m->m_off = (char *)&g_conf.m_useDiskPageCacheIndexdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
*/
m->m_title = "posdb disk cache size";
m->m_desc = "How much file cache size to use in bytes? Posdb is "
"the index.";
m->m_cgi = "dpcsp";
m->m_off = (char *)&g_conf.m_posdbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "tagdb disk cache size";
m->m_desc = "How much file cache size to use in bytes? Tagdb is "
"consulted at spider time and query time to determine "
"if a url or outlink is banned or what its siterank is, etc.";
m->m_cgi = "dpcst";
m->m_off = (char *)&g_conf.m_tagdbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "clusterdb disk cache size";
m->m_desc = "How much file cache size to use in bytes? "
"Gigablast does a "
"lookup in clusterdb for each search result at query time to "
"get its site information for site clustering. If you "
"disable site clustering in the search controls then "
"clusterdb will not be consulted.";
m->m_cgi = "dpcsc";
m->m_off = (char *)&g_conf.m_clusterdbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "titledb disk cache size";
m->m_desc = "How much file cache size to use in bytes? Titledb "
"holds the cached web pages, compressed. Gigablast consults "
"it to generate a summary for a search result, or to see if "
"a url Gigablast is spidering is already in the index.";
m->m_cgi = "dpcsx";
m->m_off = (char *)&g_conf.m_titledbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "spiderdb disk cache size";
m->m_desc = "How much file cache size to use in bytes? Titledb "
"holds the cached web pages, compressed. Gigablast consults "
"it to generate a summary for a search result, or to see if "
"a url Gigablast is spidering is already in the index.";
m->m_cgi = "dpcsy";
m->m_off = (char *)&g_conf.m_spiderdbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
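// Note on the defaults above: 30000000 bytes is roughly 30MB per rdb
// file cache, so with all five caches (posdb, tagdb, clusterdb,
// titledb, spiderdb) at the default you commit about 150MB of ram.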
/*
m->m_title = "exclude link text";
m->m_desc = "Exclude search results that have one or more query "
"that only appear in the incoming link text";
m->m_cgi = "exlt";
m->m_off = (char *)&g_conf.m_excludeLinkText - g;
m->m_sparm = 1;
m->m_soff = (char *)&si.m_excludeLinkText - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_scgi = "excludelinktext";
m++;
m->m_title = "exclude meta text";
m->m_desc = "Exclude search results that have one or more query "
"that only appear in the meta text";
m->m_cgi = "exmt";
m->m_off = (char *)&g_conf.m_excludeMetaText - g;
m->m_sparm = 1;
m->m_soff = (char *)&si.m_excludeMetaText - y;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_scgi = "excludemetatext";
m++;
*/
m->m_title = "scan all if not found";
m->m_desc = "Scan all titledb files if rec not found. You should "
"keep this on to avoid corruption. Do not turn it off unless "
"you are Matt Wells.";
m->m_cgi = "sainf";
m->m_off = (char *)&g_conf.m_scanAllIfNotFound - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "interface machine";
m->m_desc = "for specifying if this is an interface machine"
"messages are rerouted from this machine to the main"
"cluster set in the hosts.conf.";
m->m_cgi = "intmch";
m->m_off = (char *)&g_conf.m_interfaceMachine - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "generate vector at query time";
m->m_desc = "At query time, should Gigablast generate content "
"vectors for title records lacking them? This is an "
"expensive operation, so is really just for testing purposes.";
m->m_cgi = "gv";
m->m_off = (char *)&g_conf.m_generateVectorAtQueryTime - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "redirect non-raw traffic";
m->m_desc = "If this is non empty, http traffic will be redirected "
"to the specified address.";
m->m_cgi = "redir";
m->m_off = (char *)&g_conf.m_redirect - g;
m->m_type = TYPE_STRING;
m->m_size = MAX_URL_LEN;
m->m_def = "";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "send requests to compression proxy";
m->m_desc = "If this is true, gb will route download requests for"
" web pages to proxies in hosts.conf. Proxies will"
" download and compress docs before sending back. ";
m->m_cgi = "srtcp";
m->m_off = (char *)&g_conf.m_useCompressionProxy - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "synchronize proxy to cluster time";
m->m_desc = "Enable/disable the ability to synchronize time between "
"the cluster and the proxy";
m->m_cgi = "sptct";
m->m_off = (char *)&g_conf.m_timeSyncProxy - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "use data feed account server";
m->m_desc = "Enable/disable the use of a remote account verification "
"for Data Feed Customers. This should ONLY be used for the "
"proxy.";
m->m_cgi = "pdfuas";
m->m_off = (char *)&g_conf.m_useDFAcctServer - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "data feed server ip";
m->m_desc = "The ip address of the Gigablast data feed server to "
"retrieve customer account information from. This should ONLY "
"be used for the proxy.";
m->m_cgi = "pdfip";
m->m_off = (char *)&g_conf.m_dfAcctIp - g;
m->m_type = TYPE_IP;
m->m_def = "2130706433";
m->m_group = 0;
m++;
m->m_title = "data feed server port";
m->m_desc = "The port of the Gigablast data feed server to retrieve "
"customer account information from. This should ONLY be used "
"for the proxy";
m->m_cgi = "pdfport";
m->m_off = (char *)&g_conf.m_dfAcctPort - g;
m->m_type = TYPE_LONG;
m->m_def = "8040";
m->m_group = 0;
m++;
m->m_title = "data feed server collection";
m->m_desc = "The collection on the Gigablast data feed server to "
"retrieve customer account information from. This should ONLY "
"be used for the proxy.";
m->m_cgi = "pdfcoll";
m->m_off = (char *)&g_conf.m_dfAcctColl - g;
m->m_type = TYPE_STRING;
m->m_size = MAX_COLL_LEN;
m->m_def = "customers";
m->m_group = 0;
m++;
*/
m->m_title = "allow scaling of hosts";
m->m_desc = "Allows scaling up of hosts by deleting recs not in "
"the correct group. This should only happen why copying "
"a set of servers to the new hosts. Otherwise corrupted "
"data will cause a halt.";
m->m_cgi = "asoh";
m->m_off = (char *)&g_conf.m_allowScale - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "allow bypass of db validation";
m->m_desc = "Allows bypass of db validation so gigablast will not "
"halt if a corrupt db is discovered durring load. Use this "
"when attempting to load with a collection that has known "
"corruption.";
m->m_cgi = "abov";
m->m_off = (char *)&g_conf.m_bypassValidation - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "reload language pages";
m->m_desc = "Reloads language specific pages.";
m->m_cgi = "rlpages";
m->m_type = TYPE_CMD;
m->m_func = CommandReloadLanguagePages;
m->m_cast = 0;
m++;
m->m_title = "proxy port";
m->m_desc = "Retrieve pages from the proxy on "
"this port.";
m->m_cgi = "proxyport";
m->m_off = (char *)&cr.m_proxyPort - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m++;
m->m_title = "all reload language pages";
m->m_desc = "Reloads language specific pages for all hosts.";
m->m_cgi = "rlpages";
m->m_type = TYPE_CMD;
m++;
*/
// do we need this any more?
/*
m->m_title = "give up on dead hosts";
m->m_desc = "Give up requests to dead hosts. Only set this when you "
"know a host is dead and will not come back online without "
"a restarting all hosts. Messages will timeout on the dead "
"host but will not error, allowing outstanding spidering to "
"finish to the twin.";
m->m_cgi = "gvup";
m->m_off = (char *)&g_conf.m_giveupOnDeadHosts - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
/*
m->m_title = "ask root name servers";
m->m_desc = "if enabled Gigablast will direct DNS requests to "
"the root DNS servers, otherwise it will continue to "
"send DNS queries to the bind9 servers defined in "
"the Master Controls.";
m->m_cgi = "bdns";
m->m_off = (char *)&g_conf.m_askRootNameservers - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
*/
/*
m->m_title = "do dig sanity checks";
m->m_desc = "call dig @nameServer hostname and on timedout lookups"
" and see if dig also timed out";
m->m_cgi = "dig";
m->m_off = (char *)&g_conf.m_useDig - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
/*
m->m_title = "dns root name server 1";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsa";
m->m_off = (char *)&g_conf.m_rnsIps[0] - g;
m->m_type = TYPE_IP;
m->m_def = "192.228.79.201";
m++;
m->m_title = "dns root name server 2";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsb";
m->m_off = (char *)&g_conf.m_rnsIps[1] - g;
m->m_type = TYPE_IP;
m->m_def = "192.33.4.12";
m++;
m->m_title = "dns root name server 3";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsc";
m->m_off = (char *)&g_conf.m_rnsIps[2] - g;
m->m_type = TYPE_IP;
m->m_def = "128.8.10.90";
m++;
m->m_title = "dns root name server 4";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsd";
m->m_off = (char *)&g_conf.m_rnsIps[3] - g;
m->m_type = TYPE_IP;
m->m_def = "192.203.230.10";
m++;
m->m_title = "dns root name server 5";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnse";
m->m_off = (char *)&g_conf.m_rnsIps[4] - g;
m->m_type = TYPE_IP;
m->m_def = "192.5.5.241";
m++;
m->m_title = "dns root name server 6";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsf";
m->m_off = (char *)&g_conf.m_rnsIps[5] - g;
m->m_type = TYPE_IP;
m->m_def = "192.112.36.4";
m++;
m->m_title = "dns root name server 7";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsg";
m->m_off = (char *)&g_conf.m_rnsIps[6] - g;
m->m_type = TYPE_IP;
m->m_def = "128.63.2.53";
m++;
m->m_title = "dns root name server 8";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsh";
m->m_off = (char *)&g_conf.m_rnsIps[7] - g;
m->m_type = TYPE_IP;
m->m_def = "192.36.148.17";
m++;
m->m_title = "dns root name server 9";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsi";
m->m_off = (char *)&g_conf.m_rnsIps[8] - g;
m->m_type = TYPE_IP;
m->m_def = "192.58.128.30";
m++;
m->m_title = "dns root name server 10";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsj";
m->m_off = (char *)&g_conf.m_rnsIps[9] - g;
m->m_type = TYPE_IP;
m->m_def = "193.0.14.129";
m++;
m->m_title = "dns root name server 11";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsk";
m->m_off = (char *)&g_conf.m_rnsIps[10] - g;
m->m_type = TYPE_IP;
m->m_def = "198.32.64.12";
m++;
m->m_title = "dns root name server 12";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsl";
m->m_off = (char *)&g_conf.m_rnsIps[11] - g;
m->m_type = TYPE_IP;
m->m_def = "202.12.27.33";
m++;
m->m_title = "dns root name server 13";
m->m_desc = "IP address of a DNS root server. Assumes UDP "
"port 53.";
m->m_cgi = "rnsm";
m->m_off = (char *)&g_conf.m_rnsIps[12] - g;
m->m_type = TYPE_IP;
m->m_def = "198.41.0.4";
m++;
*/
m->m_title = "dns 0";
m->m_desc = "IP address of the primary DNS server. Assumes UDP "
"port 53. REQUIRED FOR SPIDERING! Use Google's "
"public DNS 8.8.8.8 as default.";
m->m_cgi = "pdns";
m->m_off = (char *)&g_conf.m_dnsIps[0] - g;
m->m_type = TYPE_IP;
// default to google public dns #1
m->m_def = "8.8.8.8";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 1";
m->m_desc = "IP address of the secondary DNS server. Assumes UDP "
"port 53. Will be accessed in conjunction with the primary "
"dns, so make sure this is always up. An ip of 0 means "
"disabled. Google's secondary public DNS is 8.8.4.4.";
m->m_cgi = "sdns";
m->m_off = (char *)&g_conf.m_dnsIps[1] - g;
m->m_type = TYPE_IP;
// default to google public dns #2
m->m_def = "8.8.4.4";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 2";
m->m_desc = "All hosts send to these DNSes based on hash "
"of the subdomain to try to split DNS load evenly.";
m->m_cgi = "sdnsa";
m->m_off = (char *)&g_conf.m_dnsIps[2] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
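// A hedged sketch of the load splitting the description implies; the
// real selection lives in the dns code and names here are illustrative:
//   int32_t n = numExtraDnsIps ( );    // dns 2..15 with a nonzero ip
//   int32_t i = hash32 ( subdomain , subdomainLen ) % n;
//   ip        = g_conf.m_dnsIps [ 2 + i ];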
m->m_title = "dns 3";
m->m_desc = "";
m->m_cgi = "sdnsb";
m->m_off = (char *)&g_conf.m_dnsIps[3] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 4";
m->m_desc = "";
m->m_cgi = "sdnsc";
m->m_off = (char *)&g_conf.m_dnsIps[4] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 5";
m->m_desc = "";
m->m_cgi = "sdnsd";
m->m_off = (char *)&g_conf.m_dnsIps[5] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 6";
m->m_desc = "";
m->m_cgi = "sdnse";
m->m_off = (char *)&g_conf.m_dnsIps[6] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 7";
m->m_desc = "";
m->m_cgi = "sdnsf";
m->m_off = (char *)&g_conf.m_dnsIps[7] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 8";
m->m_desc = "";
m->m_cgi = "sdnsg";
m->m_off = (char *)&g_conf.m_dnsIps[8] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 9";
m->m_desc = "";
m->m_cgi = "sdnsh";
m->m_off = (char *)&g_conf.m_dnsIps[9] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 10";
m->m_desc = "";
m->m_cgi = "sdnsi";
m->m_off = (char *)&g_conf.m_dnsIps[10] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 11";
m->m_desc = "";
m->m_cgi = "sdnsj";
m->m_off = (char *)&g_conf.m_dnsIps[11] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 12";
m->m_desc = "";
m->m_cgi = "sdnsk";
m->m_off = (char *)&g_conf.m_dnsIps[12] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 13";
m->m_desc = "";
m->m_cgi = "sdnsl";
m->m_off = (char *)&g_conf.m_dnsIps[13] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 14";
m->m_desc = "";
m->m_cgi = "sdnsm";
m->m_off = (char *)&g_conf.m_dnsIps[14] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dns 15";
m->m_desc = "";
m->m_cgi = "sdnsn";
m->m_off = (char *)&g_conf.m_dnsIps[15] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "geocoder IP #1";
m->m_desc = "";
m->m_cgi = "gca";
m->m_off = (char *)&g_conf.m_geocoderIps[0] - g;
m->m_type = TYPE_IP;
m->m_def = "10.5.66.11"; // sp1
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "geocoder IP #2";
m->m_desc = "";
m->m_cgi = "gcb";
m->m_off = (char *)&g_conf.m_geocoderIps[1] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "geocoder IP #3";
m->m_desc = "";
m->m_cgi = "gcc";
m->m_off = (char *)&g_conf.m_geocoderIps[2] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "geocoder IP #4";
m->m_desc = "";
m->m_cgi = "gcd";
m->m_off = (char *)&g_conf.m_geocoderIps[3] - g;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "wiki proxy ip";
m->m_desc = "Access the wiki coll through this proxy ip";
m->m_cgi = "wpi";
m->m_off = (char *)&g_conf.m_wikiProxyIp - g;
m->m_type = TYPE_IP;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "wiki proxy port";
m->m_desc = "Access the wiki coll through this proxy port";
m->m_cgi = "wpp";
m->m_off = (char *)&g_conf.m_wikiProxyPort - g;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "default collection";
m->m_desc = "When no collection is explicitly specified, assume "
"this collection name.";
m->m_cgi = "dcn";
m->m_off = (char *)&g_conf.m_defaultColl - g;
m->m_type = TYPE_STRING;
m->m_size = MAX_COLL_LEN+1;
m->m_def = "";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "directory collection";
m->m_desc = "Collection to be used for directory searching and "
"display of directory topic pages.";
m->m_cgi = "dircn";
m->m_off = (char *)&g_conf.m_dirColl - g;
m->m_type = TYPE_STRING;
m->m_size = MAX_COLL_LEN+1;
m->m_def = "main";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "directory hostname";
m->m_desc = "Hostname of the server providing the directory. "
"Leave empty to use this host.";
m->m_cgi = "dirhn";
m->m_off = (char *)&g_conf.m_dirHost - g;
m->m_type = TYPE_STRING;
m->m_size = MAX_URL_LEN;
m->m_def = "";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max incoming bandwidth for spider";
m->m_desc = "Total incoming bandwidth used by all spiders should "
"not exceed this many kilobits per second. ";
m->m_cgi = "mkbps";
m->m_off = (char *)&g_conf.m_maxIncomingKbps - g;
m->m_type = TYPE_FLOAT;
m->m_def = "999999.0";
m->m_units = "Kbps";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max 1-minute sliding-window loadavg";
m->m_desc = "Spiders will shed load when their host exceeds this "
"value for the 1-minute load average in /proc/loadavg. "
"The value 0.0 disables this feature.";
m->m_cgi = "mswl";
m->m_off = (char *)&g_conf.m_maxLoadAvg - g;
m->m_type = TYPE_FLOAT;
m->m_def = "0.0";
m->m_units = "";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max pages per second";
m->m_desc = "Maximum number of pages to index or delete from index "
"per second for all hosts combined.";
m->m_cgi = "mpps";
m->m_off = (char *)&g_conf.m_maxPagesPerSecond - g;
m->m_type = TYPE_FLOAT;
m->m_def = "999999.0";
m->m_units = "pages/second";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "distributed spider balance";
m->m_desc = "Max number of ready domains a host can have distributed "
"to it by all other host. This should be some multiple of the "
"total number of hosts in the cluster.";
m->m_cgi = "dsb";
m->m_off = (char *)&g_conf.m_distributedSpiderBalance - g;
m->m_type = TYPE_LONG;
m->m_def = "1024";
m->m_units = "domains";
m++;
m->m_title = "distributed same ip wait (hack)";
m->m_desc = "Amount of time to wait if this IP is already being "
"downloaded by a host. Works only in conjunction with "
"distribute spider downloads by ip in Spider Controls.";
m->m_cgi = "dsiw";
m->m_off = (char *)&g_conf.m_distributedIpWait - g;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_units = "ms";
m->m_group = 0;
m->m_min = 0;
m++;
*/
/*
m->m_title = "root quality max cache age base";
m->m_desc = "Maximum age to cache quality of a root url in seconds. "
"Computing "
"the quality of especially root urls can be expensive. "
"This number is multiplied by (Q-30)/10 where Q is the cached "
"quality of the root url. Therefore, higher quality and more "
"stable root urls are updated less often, which is a good thing "
"since they are more expensive to recompute.";
m->m_cgi = "rqmca";
m->m_off = (char *)&g_conf.m_siteQualityMaxCacheAge - g;
m->m_type = TYPE_LONG;
m->m_def = "7257600"; // 3 months (in seconds)
m->m_units = "seconds";
m++;
*/
m->m_title = "use threads";
m->m_desc = "If enabled, Gigablast will use threads.";
m->m_cgi = "ut";
m->m_off = (char *)&g_conf.m_useThreads - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
// turn off for now. after seeing how SLOOOOOW brian's merge op was
// when all 16 shards on a 16-core machine were merging (even w/ SSDs)
// i turned threads off and it was over 100x faster. so until we have
// pooling or something turn these off
m->m_title = "use threads for disk";
m->m_desc = "If enabled, Gigablast will use threads for disk ops. "
"Now that Gigablast uses pthreads more effectively, "
"leave this enabled for optimal performance in all cases.";
//"Until pthreads is any good leave this off. If you have "
//"SSDs performance can be as much as 100x better.";
m->m_cgi = "utfd";
m->m_off = (char *)&g_conf.m_useThreadsForDisk - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = 0;//PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use threads for intersects and merges";
m->m_desc = "If enabled, Gigablast will use threads for these ops. "
"Default is now on in the event you have simultaneous queries "
"so one query does not hold back the other. There seems "
"to be a bug so leave this ON for now.";
//"Until pthreads is any good leave this off.";
m->m_cgi = "utfio";
m->m_off = (char *)&g_conf.m_useThreadsForIndexOps - g;
m->m_type = TYPE_BOOL;
// enable this in the event of multiple cores available and
// large simultaneous queries coming in
m->m_def = "1";
m->m_flags = 0;//PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "use threads for system calls";
m->m_desc = "Gigablast does not make too many system calls so "
"leave this on in case the system call is slow.";
m->m_cgi = "utfsc";
m->m_off = (char *)&g_conf.m_useThreadsForSystemCalls - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = 0;//PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "max cpu threads";
m->m_desc = "Maximum number of threads to use per Gigablast process "
"for intersecting docid lists.";
m->m_cgi = "mct";
m->m_off = (char *)&g_conf.m_maxCpuThreads - g;
m->m_type = TYPE_LONG;
// make it 3 for new gb in case one query takes way longer
// than the others
m->m_def = "6"; // "2";
m->m_units = "threads";
m->m_min = 1;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "max cpu merge threads";
m->m_desc = "Maximum number of threads to use per Gigablast process "
"for merging lists read from disk.";
m->m_cgi = "mcmt";
m->m_off = (char *)&g_conf.m_maxCpuMergeThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_units = "threads";
m->m_min = 1;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "max write threads";
m->m_desc = "Maximum number of threads to use per Gigablast process "
"for writing data to the disk. "
"Keep low to reduce file interlace effects and impact "
"on query response time.";
m->m_cgi = "mwt";
m->m_off = (char *)&g_conf.m_maxWriteThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_units = "threads";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "flush disk writes";
m->m_desc = "If enabled then all writes will be flushed to disk. "
"If not enabled, then gb uses the Linux disk write cache.";
m->m_cgi = "fw";
m->m_off = (char *)&g_conf.m_flushWrites - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
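// A minimal sketch, with hypothetical names, of what enabling this parm
// amounts to: force data out of the Linux write cache with fdatasync(2)
// after each write instead of letting the kernel flush lazily.
/*
#include <unistd.h>
static bool writeAndMaybeFlush ( int fd , const void *buf , size_t len ,
				 bool flushWrites ) {
	if ( write ( fd , buf , len ) != (ssize_t)len ) return false;
	// when disabled we just rely on the kernel write cache
	if ( flushWrites && fdatasync ( fd ) != 0 ) return false;
	return true;
}
*/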
/*
m->m_title = "files group writable";
m->m_desc = "Make all created files group writable? If you have "
"multiple user accounts starting Gigablast processes you "
"will want the files to be group writable. You will "
"need to make sure you run gigablast under the "
"primary group you want to use for gigablast administration.";
m->m_cgi = "afgw";
m->m_off = (char *)&g_conf.m_makeAllFilesGroupWritable - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
*/
m->m_title = "verify written lists";
m->m_desc = "Ensure lists being written to disk are not corrupt. "
"That title recs appear valid, etc. Helps isolate sources "
"of corruption. Used for debugging.";
m->m_cgi = "vwl";
m->m_off = (char *)&g_conf.m_verifyDumpedLists - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
"Maxtors and Western Digitals.";
m->m_cgi = "vdw";
m->m_off = (char *)&g_conf.m_verifyWrites - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
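// A minimal sketch of the read-back verification described above, with
// hypothetical names: after a write, re-read the same span and compare.
/*
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
static bool verifyWrite ( int fd , const void *buf , size_t len ,
			  off_t off ) {
	char *tmp = (char *)malloc ( len );
	if ( ! tmp ) return false;
	ssize_t n = pread ( fd , tmp , len , off );
	// the write is good only if the read-back matches byte for byte
	bool ok = ( n == (ssize_t)len && memcmp ( tmp , buf , len ) == 0 );
	free ( tmp );
	return ok;
}
*/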
m->m_title = "max spider read threads";
m->m_desc = "Maximum number of threads to use per Gigablast process "
"for accessing the disk "
"for index-building purposes. Keep low to reduce impact "
"on query response time. Increase for fast disks or when "
"preferring build speed over lower query latencies";
m->m_cgi = "smdt";
m->m_off = (char *)&g_conf.m_spiderMaxDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "20";
m->m_units = "threads";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
/*
m->m_title = "max spider big read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 1MB. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "smbdt";
m->m_off = (char *)&g_conf.m_spiderMaxBigDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max spider medium read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 100K. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "smmdt";
m->m_off = (char *)&g_conf.m_spiderMaxMedDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "4";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max spider small read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 1MB. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "smsdt";
m->m_off = (char *)&g_conf.m_spiderMaxSmaDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "15";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
*/
m->m_title = "separate disk reads";
m->m_desc = "If enabled then we will not launch a low priority "
"disk read or write while a high priority is outstanding. "
"Help improve query response time at the expense of "
"spider performance.";
m->m_cgi = "sdt";
m->m_off = (char *)&g_conf.m_separateDiskReads - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
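// A minimal sketch of the gating policy described above, assuming a
// hypothetical counter of outstanding high-priority (query) disk ops:
// low-priority (spider) ops launch only when nothing urgent is in flight.
/*
static int32_t s_highPriorityOpsOut = 0;
static bool canLaunchDiskOp ( bool isHighPriority ) {
	if ( isHighPriority ) return true;
	// hold spider reads/writes back behind query reads
	return ( s_highPriorityOpsOut == 0 );
}
*/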
/*
m->m_title = "max query read threads";
m->m_desc = "Maximum number of threads to use per Gigablast process "
"for accessing the disk "
"for querying purposes.";
//IDE systems tend to be more "
// "responsive when this is low. Increase for SCSI or RAID "
// "systems.";
m->m_cgi = "qmdt";
m->m_off = (char *)&g_conf.m_queryMaxDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_units = "threads";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
*/
/*
m->m_title = "max query big read threads";
m->m_desc = "This particular number applies to all reads above 1MB. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "qmbdt";
m->m_off = (char *)&g_conf.m_queryMaxBigDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "20"; // 1
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max query medium read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 100K. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "qmmdt";
m->m_off = (char *)&g_conf.m_queryMaxMedDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "20"; // 3
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max query small read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 1MB. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "qmsdt";
m->m_off = (char *)&g_conf.m_queryMaxSmaDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "20";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
*/
m->m_title = "min popularity for speller";
m->m_desc = "Word or phrase must be present in this percent "
"of documents in order to qualify as a spelling "
"recommendation.";
m->m_cgi = "mps";
m->m_off = (char *)&g_conf.m_minPopForSpeller - g;
m->m_type = TYPE_FLOAT;
m->m_def = ".01";
m->m_units = "%%";
m->m_priv = 2;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
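// A minimal sketch of the popularity cutoff (hypothetical names): a term
// qualifies as a spelling recommendation only if it appears in at least
// this percent of documents.
/*
static bool qualifiesForSpeller ( int64_t docsWithTerm ,
				  int64_t totalDocs ,
				  float minPopForSpeller ) {
	if ( totalDocs <= 0 ) return false;
	float pct = 100.0 * (float)docsWithTerm / (float)totalDocs;
	return ( pct >= minPopForSpeller );
}
*/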
m->m_title = "phrase weight";
m->m_desc = "Percent to weight phrases in queries.";
m->m_cgi = "qp";
m->m_off = (char *)&g_conf.m_queryPhraseWeight - g;
m->m_type = TYPE_FLOAT;
// was 350, but 'new mexico tourism' and 'boots uk'
// emphasized the phrase terms too much!!
m->m_def = "100";
m->m_units = "%%";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
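// A minimal, purely illustrative sketch of applying a percent weight like
// this one to a phrase term's score; 100% leaves the score unchanged.
/*
static float weightPhraseScore ( float score , float queryPhraseWeight ) {
	return score * ( queryPhraseWeight / 100.0 );
}
*/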
m->m_title = "weights.cpp slider parm (tmp)";
m->m_desc = "Percent of how much to use words to phrase ratio weights.";
m->m_cgi = "wsp";
m->m_off = (char *)&g_conf.m_sliderParm - g;
m->m_type = TYPE_LONG;
m->m_def = "90";
m->m_units = "%%";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "indextable intersection algo to use";
m->m_desc = "0 means adds the term scores, 1 means average them "
"and 2 means take the RMS.";
m->m_cgi = "iia";
m->m_off = (char *)&g_conf.m_indexTableIntersectionAlgo - g;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_group = 0;
m++;
*/
/*
m->m_title = "max weight";
m->m_desc = "Maximum, relative query term weight. Set to 0 or less "
"to indicate now max. 10.0 or 20.0 might be a good value.";
m->m_cgi = "qm";
m->m_off = (char *)&g_conf.m_queryMaxMultiplier - g;
m->m_type = TYPE_FLOAT;
m->m_def = "0.0";
m->m_group = 0;
m++;
*/
/*
m->m_title = "query term exponent";
m->m_desc = "Raise the weights of the query "
"terms to this power. The weight of a query term is "
"basically the log of its term frequency. Increasing "
"this will increase the effects of the term frequency "
"related to each term in the query. Term frequency is "
"also known as the term popularity. Very common words "
"typically have lower weights tied to them, but the effects "
"of such weighting will be increased if you increase this "
"exponent.";
m->m_cgi = "qte";
m->m_off = (char *)&g_conf.m_queryExp - g;
m->m_type = TYPE_FLOAT;
m->m_def = "1.1";
m->m_group = 0;
m++;
*/
/*
m->m_title = "use dynamic phrase weighting";
m->m_desc = "A new algorithm which reduces the weight on a query "
"word term if the query phrase terms it is in are of "
"similar popularity (term frequency) to that of the word "
"term.";
m->m_cgi = "udpw";
m->m_off = (char *)&g_conf.m_useDynamicPhraseWeighting - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
*/
m->m_title = "maximum serialized query size";
m->m_desc = "When passing queries around the network, send the raw "
"string instead of the serialized query if the required "
"buffer is bigger than this. Smaller values decrease network "
"traffic for large queries at the expense of processing time.";
m->m_cgi = "msqs";
m->m_off = (char *)&g_conf.m_maxSerializedQuerySize - g;
m->m_type = TYPE_LONG;
m->m_def = "8192";
m->m_units = "bytes";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
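// A minimal sketch of the size check described above (hypothetical
// names): ship the serialized query only when it fits under the cap,
// otherwise send the raw string and let the remote host re-parse it.
/*
static bool shouldSendSerialized ( int32_t serializedSize ,
				   int32_t maxSerializedQuerySize ) {
	return ( serializedSize <= maxSerializedQuerySize );
}
*/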
m->m_title = "merge buf size";
m->m_desc = "Read and write this many bytes at a time when merging "
"files. Smaller values are kinder to query performance, "
" but the merge takes longer. Use at least 1000000 for "
"fast merging.";
m->m_cgi = "mbs";
m->m_off = (char *)&g_conf.m_mergeBufSize - g;
m->m_type = TYPE_LONG;
// keep this way smaller than that 800k we had in here, 100k seems
// to be way better performance for qps
m->m_def = "500000";
m->m_units = "bytes";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
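// A minimal sketch of merging at this granularity, with hypothetical
// descriptors: each pass moves at most mergeBufSize bytes so queries can
// get disk time between chunks.
/*
#include <unistd.h>
static bool copyInChunks ( int src , int dst , char *buf ,
			   int32_t mergeBufSize ) {
	ssize_t n;
	while ( ( n = read ( src , buf , mergeBufSize ) ) > 0 )
		if ( write ( dst , buf , n ) != n ) return false;
	// n is 0 on a clean EOF, negative on a read error
	return ( n == 0 );
}
*/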
m->m_title = "catdb minRecSizes";
m->m_desc = "minRecSizes for Catdb lookups";
m->m_cgi = "catmsr";
m->m_off = (char *)&g_conf.m_catdbMinRecSizes - g;
m->m_type = TYPE_LONG;
m->m_def = "100000000"; // 100 million
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "max http download sockets";
m->m_desc = "Maximum sockets available to spiders for downloading "
"web pages.";
m->m_cgi = "mds";
m->m_off = (char *)&g_conf.m_httpMaxDownloadSockets - g;
m->m_type = TYPE_LONG;
m->m_def = "5000";
m->m_group = 0;
m++;
*/
m->m_title = "doc count adjustment";
m->m_desc = "Add this number to the total document count in the "
"index. Just used for displaying on the homepage.";
m->m_cgi = "dca";
m->m_off = (char *)&g_conf.m_docCountAdjustment - g;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "dynamic performance graph";
m->m_desc = "Generates profiling data for callbacks on page "
"performance";
m->m_cgi = "dpg";
m->m_off = (char *)&g_conf.m_dynamicPerfGraph - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "enable profiling";
m->m_desc = "Enable profiler to do accounting of time taken by "
"functions. ";
m->m_cgi = "enp";
m->m_off = (char *)&g_conf.m_profilingEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "minimum profiling threshold";
m->m_desc = "Profiler will not show functions which take less "
"than this many milliseconds "
"in the log or on the perfomance graph.";
m->m_cgi = "mpt";
m->m_off = (char *)&g_conf.m_minProfThreshold - g;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "sequential profiling.";
m->m_desc = "Produce a LOG_TIMING log message for each "
"callback called, along with the time it took. "
"Profiler must be enabled.";
m->m_cgi = "ensp";
m->m_off = (char *)&g_conf.m_sequentialProfiling - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use statsdb";
m->m_desc = "Archive system statistics information in Statsdb.";
m->m_cgi = "usdb";
m->m_off = (char *)&g_conf.m_useStatsdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "statsdb snapshots.";
m->m_desc = "Archive system statistics information in Statsdb. "
"Takes one snapshot every minute.";
m->m_cgi = "sdbss";
m->m_off = (char *)&g_conf.m_statsdbSnapshots - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "statsdb web interface.";
m->m_desc = "Enable the Statsdb page for viewing stats history.";
m->m_cgi = "sdbwi";
m->m_off = (char *)&g_conf.m_statsdbPageEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
*/
/*
m->m_title = "max synonyms";
m->m_desc = "Maximum possible synonyms to expand a word to.";
m->m_cgi = "msyn";
m->m_off = (char *)&g_conf.m_maxSynonyms - g;
m->m_def = "5";
m->m_type = TYPE_LONG;
m++;
m->m_title = "default affinity";
m->m_desc = "spelling/number synonyms get this number as their "
"affinity; negative values mean treat them as unknown, "
"values higher than 1.0 get treated as 1.0";
m->m_cgi = "daff";
m->m_off = (char *)&g_conf.m_defaultAffinity - g;
m->m_def = "0.9";
m->m_type = TYPE_FLOAT;
m++;
m->m_title = "frequency threshold";
m->m_desc = "the minimum amount a synonym term has to be in relation "
"to its master term in order to be considered as a synonym";
m->m_cgi = "fqth";
m->m_off = (char *)&g_conf.m_frequencyThreshold - g;
m->m_def = "0.25";
m->m_type = TYPE_FLOAT;
m++;
m->m_title = "maximum affinity requests";
m->m_desc = "Maximum number of outstanding requests the affinity "
"builder can generate. Keep this number at 10 or lower for "
"local servers, higher for internet servers or servers with "
"high latency.";
m->m_cgi = "mar";
m->m_off = (char *)&g_conf.m_maxAffinityRequests - g;
m->m_def = "10";
m->m_type = TYPE_LONG;
m->m_group = 0;
m++;
m->m_title = "maximum affinity errors";
m->m_desc = "Maximum number of times the affinity builder should "
"encounter an error before giving up entirely.";
m->m_cgi = "mae";
m->m_off = (char *)&g_conf.m_maxAffinityErrors - g;
m->m_def = "100";
m->m_type = TYPE_LONG;
m->m_group = 0;
m++;
m->m_title = "affinity timeout";
m->m_desc = "Amount of time in milliseconds to wait for a response to "
"an affinity query. You shouldn't have to touch this unless "
"the network is slow or overloaded.";
m->m_cgi = "ato";
m->m_off = (char *)&g_conf.m_affinityTimeout - g;
m->m_def = "30000";
m->m_type = TYPE_LONG;
m->m_group = 0;
m++;
m->m_title = "affinity rebuild server";
m->m_desc = "Use this server:port to rebuild the affinity.";
m->m_cgi = "ars";
m->m_off = (char *)&g_conf.m_affinityServer - g;
m->m_def = "localhost:8000";
m->m_type = TYPE_STRING;
m->m_size = MAX_URL_LEN;
m->m_group = 0;
m++;
m->m_title = "additional affinity parameters";
m->m_desc = "Additional parameters to pass in the query. Tweak these "
"to get better/faster responses. Don't touch the raw parameter "
"unless you know what you are doing.";
m->m_cgi = "aap";
m->m_off = (char *)&g_conf.m_affinityParms - g;
m->m_def = "&raw=5&dio=1&n=1000&code=gbmonitor";
m->m_type = TYPE_STRING;
m->m_size = MAX_URL_LEN;
m->m_group = 0;
m++;
*/
//////
// END MASTER CONTROLS
//////
///////////////////////////////////////////
// ACCESS CONTROLS
///////////////////////////////////////////
/*
// ARRAYS
// each will have its own table, title will be in first row
// of that table, 2nd row is description, then one row per
// element in the array, then a final row for adding new elements
// if not exceeding our m->m_max limit.
m->m_title = "Passwords Required to Search this Collection";
m->m_desc ="Passwords allowed to perform searches on this collection."
" If no passwords are specified, then anyone can search it.";
m->m_cgi = "searchpwd";
m->m_xml = "searchPassword";
m->m_max = MAX_SEARCH_PASSWORDS;
m->m_off = (char *)cr.m_searchPwds - x;
m->m_type = TYPE_STRINGNONEMPTY;
m->m_size = PASSWORD_MAX_LEN+1; // string size max
m->m_page = PAGE_ACCESS;
m->m_def = "";
m++;
m->m_title = "IPs Banned from Searching this Collection";
m->m_desc = "These IPs are not allowed to search this collection or "
"use add url. Useful to keep out miscreants. Use zero for the "
"last number of the IP to ban an entire IP domain.";
m->m_cgi = "bip";
m->m_xml = "bannedIp";
m->m_max = MAX_BANNED_IPS;
m->m_off = (char *)cr.m_banIps - x;
m->m_type = TYPE_IP;
m->m_def = "";
m++;
m->m_title = "Only These IPs can Search this Collection";
m->m_desc = "Only these IPs are allowed to search the collection and "
"use the add url facilities. If you'd like to make your "
"collection publically searchable then do not add any IPs "
"here.Use zero for the "
"last number of the IP to restrict to an entire "
"IP domain, i.e. 1.2.3.0.";
m->m_cgi = "searchip";
m->m_xml = "searchIp";
m->m_max = MAX_SEARCH_IPS;
m->m_off = (char *)cr.m_searchIps - x;
m->m_type = TYPE_IP;
m->m_def = "";
m++;
m->m_title = "Spam Assassin IPs";
m->m_desc = "Browsers coming from these IPs are deemed to be spam "
"assassins and have access to a subset of the controls to "
"ban and remove domains and IPs from the index.";
m->m_cgi = "assip";
m->m_xml = "assassinIp";
m->m_max = MAX_SPAM_IPS;
m->m_off = (char *)cr.m_spamIps - x;
m->m_type = TYPE_IP;
m->m_def = "";
m++;
m->m_title = "Admin Passwords";
m->m_desc = "Passwords allowed to edit this collection record. "
"First password can only be deleted by the master "
"administrator. If no password of Admin IP is given at time "
"of creation then the default password of 'footbar23' will "
"be assigned.";
m->m_cgi = "apwd";
m->m_xml = "adminPassword";
m->m_max = MAX_ADMIN_PASSWORDS;
m->m_off = (char *)cr.m_adminPwds - x;
m->m_type = TYPE_STRINGNONEMPTY;
m->m_size = PASSWORD_MAX_LEN+1;
m->m_def = "";
m++;
m->m_title = "Admin IPs";
m->m_desc = "If someone connects from one of these IPs and provides "
"a password from the table above then they will have full "
"administrative priviledges for this collection. If you "
"specified no Admin Passwords above then they need only "
"connect from an IP in this table to get the privledges. ";
m->m_cgi = "adminip";
m->m_xml = "adminIp";
m->m_max = MAX_ADMIN_IPS;
m->m_off = (char *)cr.m_adminIps - x;
m->m_type = TYPE_IP;
m->m_def = "";
m++;
*/
///////////////////////////////////////////
// URL FILTERS
///////////////////////////////////////////
//m->m_title = "Url Filters";
// this is description just for the conf file.
//m->m_cdesc = "See overview.html for a description of URL filters.";
//m->m_type = TYPE_COMMENT;
//m++;
m->m_cgi = "ufp";
m->m_title = "url filters profile";
m->m_xml = "urlFiltersProfile";
m->m_desc = "Rather than editing the table below, you can select "
"a predefined set of url instructions in this drop down menu "
"that will update the table for you. Selecting <i>custom</i> "
"allows you to make custom changes to the table. "
"Selecting <i>web</i> configures the table for spidering "
"the web in general. "
"Selcting <i>news</i> configures the table for spidering "
"new sites. "
"Selecting <i>chinese</i> makes the spider prioritize the "
"spidering of chinese pages, etc. "
"Selecting <i>shallow</i> makes the spider go deep on "
"all sites unless they are tagged <i>shallow</i> in the "
"site list. "
"Important: "
"If you select a profile other than <i>custom</i> "
"then your changes "
"to the table will be lost.";
m->m_off = (char *)&cr.m_urlFiltersProfile - x;
m->m_colspan = 3;
m->m_type = TYPE_SAFEBUF;//UFP;// 1 byte dropdown menu
m->m_def = "web"; // UFP_WEB
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m->m_page = PAGE_FILTERS;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "expression";
m->m_desc = "Before downloading the contents of a URL, Gigablast "
"first chains down this "
"list of "
"expressions</a>, "
"starting with expression #0. "
//"This table is also consulted "
//"for every outlink added to spiderdb. "
"The first expression it matches is the ONE AND ONLY "
"matching row for that url. "
"It then uses the "
//"<a href=/overview.html#spiderfreq>"
"respider frequency, "
//"<a href=/overview.html#spiderpriority>"
"spider priority, etc. on the MATCHING ROW when spidering "
//"and <a href=/overview.html#ruleset>ruleset</a> to "
"that URL. "
"If you specify the <i>expression</i> as "
"<i><b>default</b></i> then that MATCHES ALL URLs. "
"URLs with high spider priorities take spidering "
"precedence over "
"URLs with lower spider priorities. "
"The respider frequency dictates how often a URL will "
"be respidered. "
"See the help table below for examples of all the supported "
"expressions. "
"Use the <i>&&</i> operator to string multiple expressions "
"together in the same expression text box. "
"If you check the <i>delete</i> checkbox then urls matching "
"that row will be deleted if already indexed, otherwise, "
"they just won't be indexed."
//"A <i>spider priority</i> of "
//"<i>FILTERED</i> or <i>BANNED</i> "
// "<i>DELETE</i> "
// "will cause the URL to not be spidered, "
// "or if it has already "
// "been indexed, it will be deleted when it is respidered."
"<br><br>";
/*
"A URL is respidered according to the "
"spider frequency. If this is blank then Gigablast will "
"use the spider frequency explicitly dictated by the rule "
"set. If the ruleset does not contain a <spiderFrequency> "
"xml tag, then Gigablast will "
"intelligently determine the best time to respider that "
"URL.<br><br>"
"If the "
"<a href=/overview.html#spiderpriority>"
"spider priority</a> of a URL is undefined then "
"Gigablast will use the spider priority explicitly "
"dictated by the ruleset. If the ruleset does not contain "
"a <spiderPriority> xml tag, then Gigablast "
"will spider that URL with a priority of its linking parent "
"minus 1, "
"resulting in breadth first spidering. A URL of spider "
"priority X will be placed in spider priority queue #X. "
"Many spider parameters can be configured on a per "
"spider priority queue basis. For instance, spidering "
"can be toggled on a per queue basis, as can link "
"harvesting.<br><br>"
"The <b>ruleset</b> you select corresponds to a file on "
"disk named tagdb*.xml, where the '*' is a number. Each of "
"these files is a set of rules in XML that dictate how to "
"index and spider a document. "
"You can add your own ruleset file to Gigablast's working "
"directory and it will automatically be "
"included in the ruleset drop down menu. Once a document "
"has been indexed with a ruleset, then the corresponding "
"ruleset file cannot be deleted without risk of corruption."
"<br><br>"
"You can have up to 32 regular expressions. "
"Example: <b>^http://.*\\.uk/</b> would match all urls from "
"the UK. See this "
"<a href=/?redir="
"http://www.phpbuilder.com/columns/dario19990616.php3>"
"tutorial by example</a> for more information."
"<br><br>"
"Gigablast also supports the following special \"regular "
"expressions\": "
"link:gigablast and doc:quality<X and doc:quality>X.";
*/
m->m_cgi = "fe";
m->m_xml = "filterExpression";
m->m_max = MAX_FILTERS;
// array of safebufs i guess...
m->m_off = (char *)cr.m_regExs - x;
// this is a safebuf, dynamically allocated string really
m->m_type = TYPE_SAFEBUF;//STRINGNONEMPTY
// the size of each element in the array:
m->m_size = sizeof(SafeBuf);//MAX_REGEX_LEN+1;
m->m_page = PAGE_FILTERS;
m->m_rowid = 1; // if we START a new row
m->m_def = "";
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m->m_obj = OBJ_COLL;
m++;
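// A minimal sketch of the first-match semantics described above, assuming
// a hypothetical matchesExpression() helper: rows are scanned from #0 and
// the first hit is the one and only matching row, whose priority,
// respider frequency, etc. then apply to the url.
/*
#include <string.h>
static int32_t getMatchingFilterRow ( const char *url , SafeBuf *regExs ,
				      int32_t numRows ) {
	for ( int32_t i = 0 ; i < numRows ; i++ ) {
		const char *expr = regExs[i].getBufStart();
		// "default" matches every url
		if ( strcmp ( expr , "default" ) == 0 ) return i;
		if ( matchesExpression ( url , expr ) ) return i;
	}
	return -1; // no row matched
}
*/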
m->m_title = "harvest links";
m->m_cgi = "hspl";
m->m_xml = "harvestLinks";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_harvestLinks - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_page = PAGE_FILTERS;
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "spidering enabled";
m->m_cgi = "cspe";
m->m_xml = "spidersEnabled";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_spidersEnabled - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_page = PAGE_FILTERS;
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS;
m++;
*/
m->m_title = "respider frequency (days)";
m->m_cgi = "fsf";
m->m_xml = "filterFrequency";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_spiderFreqs - x;
m->m_type = TYPE_FLOAT;
// why was this default 0 days?
m->m_def = "30.0"; // 0.0
m->m_page = PAGE_FILTERS;
m->m_obj = OBJ_COLL;
m->m_units = "days";
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m++;
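// A minimal sketch (hypothetical names) of how a respider frequency in
// days maps to the next scheduled spider time:
/*
#include <time.h>
static time_t getNextSpiderTime ( time_t lastSpideredTime ,
				  float respiderFreqDays ) {
	// 86400 seconds per day
	return lastSpideredTime + (time_t)(respiderFreqDays * 86400.0);
}
*/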
m->m_title = "max spiders";
m->m_desc = "Do not allow more than this many outstanding spiders "
"for all urls in this priority."; // was "per rule"
m->m_cgi = "mspr";
m->m_xml = "maxSpidersPerRule";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_maxSpidersPerRule - x;
m->m_type = TYPE_LONG;
m->m_def = "99";
m->m_page = PAGE_FILTERS;
m->m_obj = OBJ_COLL;
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m++;
m->m_title = "max spiders per ip";
m->m_desc = "Allow this many spiders per IP.";
m->m_cgi = "mspi";
m->m_xml = "maxSpidersPerIp";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_spiderIpMaxSpiders - x;
m->m_type = TYPE_LONG;
m->m_def = "7";
m->m_page = PAGE_FILTERS;
m->m_obj = OBJ_COLL;
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m++;
m->m_title = "same ip wait (ms)";
m->m_desc = "Wait at least this int32_t before downloading urls from "
"the same IP address.";
m->m_cgi = "xg";
m->m_xml = "spiderIpWait";
m->m_max = MAX_FILTERS;
//m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_spiderIpWaits - x;
m->m_type = TYPE_LONG;
m->m_def = "1000";
m->m_page = PAGE_FILTERS;
m->m_obj = OBJ_COLL;
m->m_units = "milliseconds";
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m++;
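// A minimal sketch of enforcing the same-IP wait, assuming a hypothetical
// table of last download times in milliseconds keyed by IP:
/*
static bool canDownloadFromIp ( int64_t nowMs , int64_t lastMsForIp ,
				int32_t sameIpWaitMs ) {
	// require at least sameIpWaitMs between downloads from one IP
	return ( nowMs - lastMsForIp >= sameIpWaitMs );
}
*/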
/*
m->m_title = "page quota";
m->m_cgi = "fsq";
m->m_xml = "filterQuota";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_spiderQuotas - x;
m->m_type = TYPE_LONG_LONG;
m->m_def = "-1"; // -1 means no quota
m->m_page = PAGE_FILTERS;
m->m_units = "pages";
m->m_rowid = 1;
m++;
*/
m->m_title = "delete";
m->m_cgi = "fdu";
m->m_xml = "forceDeleteUrls";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_forceDelete - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_page = PAGE_FILTERS;
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "spider priority";
m->m_cgi = "fsp";
m->m_xml = "filterPriority";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_spiderPriorities - x;
m->m_type = TYPE_PRIORITY2; // includes UNDEFINED priority in dropdown
m->m_page = PAGE_FILTERS;
m->m_obj = OBJ_COLL;
m->m_rowid = 1;
m->m_def = "50";
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m->m_addin = 1; // "insert" follows?
m++;
/*
m->m_title = "diffbot api";
m->m_cgi = "dapi";
m->m_xml = "diffbotAPI";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_spiderDiffbotApiUrl - x;
// HACK: we print a dropdown for this but the value is a string
// because the items in the drop down can change so we can't store
// an item # here, it has to be a string, i.e. the diffbot api url.
// john might add a new custom api to m_diffbotApiList at any time.
// so we select the item in the drop down if it matches THIS string.
m->m_type = TYPE_SAFEBUF;//DIFFBOT_DROPDOWN;
m->m_def = "";
m->m_page = PAGE_FILTERS;
m->m_size = sizeof(SafeBuf);
m->m_rowid = 1;
m->m_addin = 1; // "insert" follows?
m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT;
m++;
*/
//m->m_title = "<a href=/overview.html#ruleset>ruleset</a>";
//m->m_cgi = "frs";
//m->m_xml = "filterRuleset";
//m->m_max = MAX_FILTERS;
//m->m_off = (char *)cr.m_rulesets - x;
//m->m_type = TYPE_RULESET; // int32_t with dropdown of rulesets
//m->m_page = PAGE_FILTERS;
//m->m_rowid = 1;
//m->m_addin = 1; // "insert" follows?
//m->m_def = "";
//m++;
/*
// default rule
m->m_title = "<b>DEFAULT</b>";
m->m_desc = "Use the following values by default if no ruleset in "
"tagdb matches the URL.";
m->m_type = TYPE_CONSTANT;
m->m_page = PAGE_FILTERS;
m->m_rowid = 2;
m->m_hdrs = 0;
m++;
//m->m_cdesc = "The default parameters if no reg exs above matched.";
m->m_cgi = "fsfd";
m->m_xml = "filterFrequencyDefault";
m->m_off = (char *)&cr.m_defaultSpiderFrequency - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0.0";
m->m_page = PAGE_FILTERS;
m->m_units = "days";
m->m_rowid = 2;
m->m_hdrs = 0;
m++;
m->m_cgi = "fsqd";
m->m_xml = "filterQuotaDefault";
m->m_off = (char *)&cr.m_defaultSpiderQuota - x;
m->m_type = TYPE_LONG_LONG;
m->m_def = "-1";
m->m_page = PAGE_FILTERS;
m->m_units = "pages";
m->m_rowid = 2;
m->m_hdrs = 0;
m++;
m->m_cgi = "fspd";
m->m_xml = "filterPriorityDefault";
m->m_off = (char *)&cr.m_defaultSpiderPriority - x;
m->m_type = TYPE_PRIORITY2; // includes UNDEFINED priority in dropdown
m->m_def = "4";
m->m_page = PAGE_FILTERS;
m->m_rowid = 2;
m->m_hdrs = 0;
m++;
*/
/*
m->m_cgi = "frsd";
m->m_xml = "filterRulesetDefault";
m->m_off = (char *)&cr.m_defaultSiteFileNum - x;
m->m_type = TYPE_RULESET; // int32_t with dropdown of rulesets
m->m_def = "0";
m->m_page = PAGE_FILTERS;
m->m_rowid = 2;
m->m_hdrs = 0;
m++;
*/
/*
///////////////////////////////////////////
// PRIORITY CONTROLS
///////////////////////////////////////////
// . show the priority in this column
// . a monotonic sequence repeating each number twice,
// basically, div 2 is what "D2" means
// . so we get 0,0,1,1,2,2,3,3, ...
m->m_title = "priority";
//m->m_desc = "What priority is this spdier queue?";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_type = TYPE_MONOD2;
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m++;
// . show an alternating 0 and 1 in this column
// because it is type MONOM2, a monotonic sequence
// modulus 2.
// . so we get 0,1,0,1,0,1,0,1,0,1, ...
m->m_title = "is new";
m->m_desc = "Does this priority contain new (unindexed) urls?";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_type = TYPE_MONOM2;
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m++;
m->m_title = "spidering enabled";
m->m_desc = "Are spiders enabled for this priority?";
m->m_cgi = "xa";
m->m_xml = "spiderPrioritySpideringEnabled";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_spideringEnabled - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m++;
m->m_title = "time slice weight";
m->m_desc = "What percentage of the time to draw urls from "
"this priority?";
m->m_cgi = "xb";
m->m_xml = "spiderPriotiyTimeSlice";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_timeSlice - x;
m->m_type = TYPE_FLOAT;
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3; // if we START a new row
m->m_def = "100.0";
m->m_units = "%%";
m++;
m->m_title = "spidered";
m->m_desc = "How many urls we spidered so far last 5 minutes.";
m->m_cgi = "sps";
m->m_xml = "spiderPriotiySpidered";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_spidered - x;
m->m_type = TYPE_LONG_CONST;
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3; // if we START a new row
m->m_def = "0";
m->m_sync = false; // do not sync this parm
m++;
m->m_title = "spider links";
m->m_desc = "Harvest links from the content and add to spiderdb.";
m->m_cgi = "xc";
m->m_xml = "spiderPrioritySpiderLinks";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_spiderLinks - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m++;
m->m_title = "spider same host outlinks only";
m->m_desc = "Harvest links to the same hostnames (www.xyz.com) "
"and add to spiderdb.";
m->m_cgi = "xd";
m->m_xml = "spiderPrioritySpiderSameHostnameLinks";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_spiderSameHostnameLinks - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m++;
m->m_title = "force links into queue";
m->m_desc = "If slated to be added to this queue, and link is "
"already in a non-forced queue, force it into this queue. "
"Keep a cache to reduce reptitious adds to this queue.";
m->m_cgi = "xdd";
m->m_xml = "spiderPriorityForceQueue";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_autoForceQueue - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m++;
m->m_title = "max spiders per ip";
m->m_desc = "Do not allow more than this many simultaneous "
"downloads per IP address.";
m->m_cgi = "xe";
m->m_xml = "spiderPriorityMaxSpidersPerIp";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_maxSpidersPerIp - x;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m++;
m->m_title = "max spiders per domain";
m->m_desc = "Do not allow more than this many simultaneous "
"downloads per domain.";
m->m_cgi = "xf";
m->m_xml = "spiderPriorityMaxSpidersPerDom";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_maxSpidersPerDom - x;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m++;
m->m_title = "max respider wait (days)";
m->m_desc = "Do not wait longer than this before attempting to "
"respider.";
m->m_cgi = "xr";
m->m_xml = "spiderPriorityMaxRespiderWait";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_maxRespiderWait - x;
m->m_type = TYPE_FLOAT;
m->m_def = "180.0";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m->m_units = "days";
m++;
m->m_title = "first respider wait (days)";
m->m_desc = "Reschedule a new url for respidering this many days "
"from the first time it is actually spidered.";
m->m_cgi = "xfrw";
m->m_xml = "spiderPriorityFirstRespiderWait";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_firstRespiderWait - x;
m->m_type = TYPE_FLOAT;
m->m_def = "60.0";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m->m_units = "days";
m++;
m->m_title = "same ip wait (ms)";
m->m_desc = "Wait at least this int32_t before downloading urls from "
"the same IP address.";
m->m_cgi = "xg";
m->m_xml = "spiderPrioritySameIpWait";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_sameIpWait - x;
m->m_type = TYPE_LONG;
m->m_def = "10000";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m->m_units = "milliseconds";
m++;
m->m_title = "same domain wait (ms)";
m->m_desc = "Wait at least this int32_t before downloading urls from "
"the same domain.";
m->m_cgi = "xh";
m->m_xml = "spiderPrioritySameDomainWait";
m->m_max = MAX_PRIORITY_QUEUES;
m->m_fixed = MAX_PRIORITY_QUEUES;
m->m_off = (char *)cr.m_pq_sameDomainWait - x;
m->m_type = TYPE_LONG;
m->m_def = "10000";
m->m_page = PAGE_PRIORITIES;
m->m_rowid = 3;
m->m_units = "milliseconds";
m++;
*/
///////////////////////////////////////////
// SITEDB FILTERS
///////////////////////////////////////////
/*
m->m_title = "site expression";
m->m_desc = "The site of a url is a substring of that url, which "
"defined a set of urls which are all primarily controlled "
"by the same entity. The smallest such site of a url is "
"returned, because a url can have multiple sites. Like "
"fred.blogspot.com is a site and the blogspot.com site "
"contains that site.";
m->m_cgi = "sdbfe";
m->m_xml = "siteExpression";
m->m_max = MAX_SITE_EXPRESSIONS;
m->m_off = (char *)cr.m_siteExpressions - x;
m->m_type = TYPE_STRINGNONEMPTY;
m->m_size = MAX_SITE_EXPRESSION_LEN+1;
m->m_page = PAGE_RULES;
m->m_rowid = 1; // if we START a new row
m->m_def = "";
m++;
m->m_title = "site rule";
m->m_cgi = "sdbsrs";
m->m_xml = "siteRule";
m->m_max = MAX_SITE_EXPRESSIONS;
m->m_off = (char *)cr.m_siteRules - x;
m->m_type = TYPE_SITERULE;
m->m_page = PAGE_RULES;
m->m_rowid = 1;
m->m_def = "0";
m++;
*/
/*
m->m_title = "siterec default ruleset";
m->m_cgi = "sdbfdr";
m->m_xml = "siterecDefaultRuleset";
m->m_max = MAX_SITEDB_FILTERS;
m->m_off = (char *)cr.m_sitedbFilterRulesets - x;
m->m_type = TYPE_RULESET;
m->m_page = PAGE_FILTERS2;
m->m_rowid = 1;
m->m_def = "-1";
m++;
m->m_title = "ban subdomains";
m->m_cgi = "sdbbsd";
m->m_xml = "siterecBanSubdomains";
m->m_max = MAX_SITEDB_FILTERS;
m->m_off = (char *)cr.m_sitedbFilterBanSubdomains - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_FILTERS2;
m->m_rowid = 1;
m->m_addin = 1; // "insert" follows
m->m_def = "0";
m++;
*/
// ///////////////////////////////////////////
// // SPAM CONTROLS //
// ///////////////////////////////////////////
// m->m_title = "char in url";
// m->m_desc = "url has - or _ or a digit in the domain, "
// "has a plus in the cgi part.";
// m->m_cgi = "spamctrla";
// m->m_off = (char *)&cr.m_spamTests[CHAR_IN_URL] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "20";
// //m->m_smaxc = (char *)&cr.m_spamMaxes[CHAR_IN_URL] - x;
// m->m_group = 1;
// m->m_sparm = 0;
// m++;
// m->m_title = "bad tld";
// m->m_desc = "tld is info or biz";
// m->m_cgi = "spamctrlb";
// m->m_off = (char *)&cr.m_spamTests[BAD_TLD] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "20";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "good tld";
// m->m_desc = "tld is gov, edu or mil";
// m->m_cgi = "spamctrlc";
// m->m_off = (char *)&cr.m_spamTests[GOOD_TLD] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "-20";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "title has spammy words";
// m->m_desc = "Title has spammy words, is all lower case, "
// "or has > 200 chars. ";
// m->m_cgi = "spamctrld";
// m->m_off = (char *)&cr.m_spamTests[WORD_IN_TITLE] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "20";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "img src to other domains";
// m->m_desc = "Page has img src to other domains. ";
// m->m_cgi = "spamctrle";
// m->m_off = (char *)&cr.m_spamTests[IMG_SRC_OTHER_DOMAIN] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "5";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "page has spammy words";
// m->m_desc = "Page has spammy words. ";
// m->m_cgi = "spamctrlf";
// m->m_off = (char *)&cr.m_spamTests[SPAMMY_WORDS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "5";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "consecutive link text";
// m->m_desc = "Three consecutive link texts "
// "contain the same word. ";
// m->m_cgi = "spamctrlg";
// m->m_off = (char *)&cr.m_spamTests[CONSECUTIVE_LINK_TEXT] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "10";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "affiliate company links";
// m->m_desc = "links to amazon, allposters, or zappos. ";
// m->m_cgi = "spamctrlh";
// m->m_off = (char *)&cr.m_spamTests[AFFILIATE_LINKS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "10";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "affiliate in links";
// m->m_desc = "Has string 'affiliate' in the links. ";
// m->m_cgi = "spamctrli";
// m->m_off = (char *)&cr.m_spamTests[AFFILIATE_LINKS2] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "40";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "Iframe to amazon";
// m->m_desc = "Has an iframe whose src is amazon. ";
// m->m_cgi = "spamctrlj";
// m->m_off = (char *)&cr.m_spamTests[IFRAME_TO_AMAZON] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "30";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "int32_t links";
// m->m_desc = "Links to urls which are > 128 chars. ";
// m->m_cgi = "spamctrlk";
// m->m_off = (char *)&cr.m_spamTests[LINKS_OVER_128_CHARS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "5";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "links to queries";
// m->m_desc = "links have ?q= or &q= in them. ";
// m->m_cgi = "spamctrll";
// m->m_off = (char *)&cr.m_spamTests[LINKS_HAVE_QUERIES] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "5";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "google ad client";
// m->m_desc = "Page has a google ad client. ";
// m->m_cgi = "spamctrlm";
// m->m_off = (char *)&cr.m_spamTests[GOOGLE_AD_CLIENT] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "20";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "percent text in links";
// m->m_desc = "percent of text in links (over 50 percent). ";
// m->m_cgi = "spamctrln";
// m->m_off = (char *)&cr.m_spamTests[PERCENT_IN_LINKS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "15";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "links to a url with a - or _ in the domain";
// m->m_desc = "Links to a url with a - or _ in the domain";
// m->m_cgi = "spamctrlo";
// m->m_off = (char *)&cr.m_spamTests[DASH_IN_LINK] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "2";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "links to a url which is .info or .biz";
// m->m_desc = "Links to a url which is .info or .biz.";
// m->m_cgi = "spamctrlp";
// m->m_off = (char *)&cr.m_spamTests[LINK_TO_BADTLD] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "2";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "links to a dmoz category";
// m->m_desc = "Links to a dmoz category.";
// m->m_cgi = "spamctrlq";
// m->m_off = (char *)&cr.m_spamTests[LINKS_ARE_DMOZ_CATS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "4";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "consecutive bold text";
// m->m_desc = "Three consecutive bold texts "
// "contain the same word. ";
// m->m_cgi = "spamctrlr";
// m->m_off = (char *)&cr.m_spamTests[CONSECUTIVE_BOLD_TEXT] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "10";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "link text doesn't match domain";
// m->m_desc = "Link text looks like a domain, but the link doesn't go there";
// m->m_cgi = "spamctrls";
// m->m_off = (char *)&cr.m_spamTests[LINK_TEXT_NEQ_DOMAIN] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "10";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "force multiplier";
// m->m_desc = "Multiply this by the number of spam categories "
// "that have points times the total points, for the final"
// " score. Range between 0 and 1.";
// m->m_cgi = "frcmult";
// m->m_off = (char *)&cr.m_forceMultiplier - x;
// m->m_type = TYPE_FLOAT;
// m->m_page = PAGE_SPAM;
// m->m_def = "0.01";
// m->m_group = 1;
// m->m_sparm = 0;
// m++;
// /////////////////////// MAXES FOR SPAM CONTROLS ///////////////////////
// m->m_title = "max points for char in url";
// m->m_desc = "Max points for url has - or _ or a digit in the domain";
// m->m_cgi = "spammaxa";
// m->m_off = (char *)&cr.m_spamMaxes[CHAR_IN_URL] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 1;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for bad tld";
// m->m_desc = "Max points for tld is info or biz";
// m->m_cgi = "spammaxb";
// m->m_off = (char *)&cr.m_spamMaxes[BAD_TLD] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_group = 0;
// m->m_def = "300";
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for good tld";
// m->m_desc = "Max points for tld is gov, edu or mil";
// m->m_cgi = "spammaxc";
// m->m_off = (char *)&cr.m_spamMaxes[GOOD_TLD] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for title has spammy words";
// m->m_desc = "Max points for Title has spammy words. ";
// m->m_cgi = "spammaxd";
// m->m_off = (char *)&cr.m_spamMaxes[WORD_IN_TITLE] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for img src to other domains";
// m->m_desc = "Max points for Page has img src to other domains. ";
// m->m_cgi = "spammaxe";
// m->m_off = (char *)&cr.m_spamMaxes[IMG_SRC_OTHER_DOMAIN] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for page has spammy words";
// m->m_desc = "Max points for Page has spammy words. ";
// m->m_cgi = "spammaxf";
// m->m_off = (char *)&cr.m_spamMaxes[SPAMMY_WORDS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for consecutive link text";
// m->m_desc = "Max points for three consecutive link texts"
// "contain the same word. ";
// m->m_cgi = "spammaxg";
// m->m_off = (char *)&cr.m_spamMaxes[CONSECUTIVE_LINK_TEXT] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for affiliate company links";
// m->m_desc = "Max points for links to amazon, allposters, or zappos. ";
// m->m_cgi = "spammaxh";
// m->m_off = (char *)&cr.m_spamMaxes[AFFILIATE_LINKS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for affiliate in links";
// m->m_desc = "Max points for Has string 'affiliate' in the links. ";
// m->m_cgi = "spammaxi";
// m->m_off = (char *)&cr.m_spamMaxes[AFFILIATE_LINKS2] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for Iframe to amazon";
// m->m_desc = "Max points for Has an iframe whose src is amazon. ";
// m->m_cgi = "spammaxj";
// m->m_off = (char *)&cr.m_spamMaxes[IFRAME_TO_AMAZON] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for int32_t links";
// m->m_desc = "Max points for Links to urls which are > 128 chars. ";
// m->m_cgi = "spammaxk";
// m->m_off = (char *)&cr.m_spamMaxes[LINKS_OVER_128_CHARS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for links to queries";
// m->m_desc = "Max points for links have ?q= or &q= in them. ";
// m->m_cgi = "spammaxl";
// m->m_off = (char *)&cr.m_spamMaxes[LINKS_HAVE_QUERIES] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m->m_sparm = 0;
// m++;
// m->m_title = "max points for google ad client";
// m->m_desc = "Max points for Page has a google ad client. ";
// m->m_cgi = "spammaxm";
// m->m_off = (char *)&cr.m_spamMaxes[GOOGLE_AD_CLIENT] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m++;
// m->m_title = "max points for percent text in links";
// m->m_desc = "Max points for percent of text in links (over 50 percent). ";
// m->m_cgi = "spammaxn";
// m->m_off = (char *)&cr.m_spamMaxes[PERCENT_IN_LINKS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m++;
// m->m_title = "max points for links have - or _";
// m->m_desc = "Max points for links have - or _";
// m->m_cgi = "spammaxo";
// m->m_off = (char *)&cr.m_spamMaxes[DASH_IN_LINK] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m++;
// m->m_title = "max points for links to .info or .biz";
// m->m_desc = "Max points for links to .info or .biz ";
// m->m_cgi = "spammaxp";
// m->m_off = (char *)&cr.m_spamMaxes[LINK_TO_BADTLD] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m++;
// m->m_title = "max points for links to a dmoz category";
// m->m_desc = "Max points for links to a dmoz category.";
// m->m_cgi = "spammaxq";
// m->m_off = (char *)&cr.m_spamMaxes[LINKS_ARE_DMOZ_CATS] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m++;
// m->m_title = "max points for consecutive bold text";
// m->m_desc = "Max points for three consecutive bold texts"
// "contain the same word. ";
// m->m_cgi = "spammaxr";
// m->m_off = (char *)&cr.m_spamMaxes[CONSECUTIVE_BOLD_TEXT] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m++;
// m->m_title = "max points for link text doesn't match domain";
// m->m_desc = "Max points for link text doesn't match domain";
// m->m_cgi = "spammaxs";
// m->m_off = (char *)&cr.m_spamMaxes[LINK_TEXT_NEQ_DOMAIN] - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_SPAM;
// m->m_def = "300";
// m->m_group = 0;
// m++;
// ///////////////////////////////////////////
// // END SPAM CONTROLS //
// ///////////////////////////////////////////
///////////////////////////////////////////
// QUALITY AGENT CONTROLS
///////////////////////////////////////////
/*
m->m_title = "all agents on";
m->m_desc = "Enable quality agent on all hosts for this collection";
m->m_cgi = "aqae";
m->m_obj = OBJ_COLL;
m->m_def = "1";
m->m_off = (char *)&cr.m_qualityAgentEnabled - x;
m->m_type = TYPE_BOOL2; // no yes or no, just a link
m->m_page = PAGE_QAGENT;
m++;
m->m_title = "all agents off";
m->m_desc = "Disable quality agent on all hosts for this collection";
m->m_cgi = "aqad";
m->m_def = "0";
m->m_off = (char *)&cr.m_qualityAgentEnabled - x;
m->m_type = TYPE_BOOL2; // no yes or no, just a link
m++;
m->m_title = "quality agent enabled";
m->m_desc = "If enabled, the agent will find quality modifiers for "
"all of the sites found in titledb.";
m->m_cgi = "qae";
m->m_off = (char *)&cr.m_qualityAgentEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_cast = 0;
m->m_page = PAGE_QAGENT;
m++;
m->m_title = "quality agent continuous loop";
m->m_desc = "If enabled, the agent will loop when it reaches "
"the end of titledb. Otherwise, it will disable itself.";
m->m_cgi = "qale";
m->m_off = (char *)&cr.m_qualityAgentLoop - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m++;
m->m_title = "ban subsites";
m->m_desc = "If enabled, the agent will look at the paths of"
" its titlerec sample, if the offending spam scores"
" all come from the same subsite, we just ban that one."
" Good for banning hijacked forums or spammed archives.";
m->m_cgi = "qabs";
m->m_off = (char *)&cr.m_qualityAgentBanSubSites - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m++;
m->m_title = "start document";
m->m_desc = "The agent will start at this docid when scanning "
"titledb looking for sites.";
m->m_cgi = "qasd";
m->m_off = (char *)&cr.m_qualityAgentStartDoc - x;
m->m_type = TYPE_LONG_LONG;
m->m_def = "0";
m->m_cast = 1;
m->m_page = PAGE_QAGENT;
m->m_sync = false; // do not sync this parm
m++;
m->m_title = "site quality refresh rate";
m->m_desc = "The quality agent will try to reexamine entries in "
"tagdb which were added more than this many seconds ago";
m->m_cgi = "qasqrr";
m->m_off = (char *)&cr.m_tagdbRefreshRate - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_group = 1;
m->m_cast = 1;
m->m_def = "2592000";
m++;
m->m_title = "link samples to get";
m->m_desc = "Lookup the qualities of this many links in tagdb.";
m->m_cgi = "lstg";
m->m_off = (char *)&cr.m_linkSamplesToGet - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "256";
m++;
m->m_title = "min pages to evaluate";
m->m_desc = "The quality agent will skip this site if there are"
" less than this many pages to evaluate.";
m->m_cgi = "mpte";
m->m_off = (char *)&cr.m_minPagesToEvaluate - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "1";
m++;
m->m_title = "link bonus divisor";
m->m_desc = "Decrease a page's spam score if it has a high "
"link quality. The bonus is computed by dividing the "
"page's link quality by this parm. LinkInfos older "
"than 30 days are considered stale and are not used.";
m->m_cgi = "lbd";
m->m_off = (char *)&cr.m_linkBonusDivisor - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "20";
m++;
m->m_title = "points per banned link";
m->m_desc = "Subtract x points per banned site that a site links to.";
m->m_cgi = "nppbl";
m->m_off = (char *)&cr.m_negPointsPerBannedLink - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "3";
m++;
m->m_title = "points per link to different sites on the same IP";
m->m_desc = "Subtract x points per site linked to that is on the "
"same IP as other links. Good for catching domain parking "
"lots and spammers in general, but looking up the IPs "
"slows down the agent considerably. (set to 0 to disable.)";
m->m_cgi = "pfltdssi";
m->m_off = (char *)&cr.m_penaltyForLinksToDifferentSiteSameIp - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "0";
m++;
m->m_title = "number of sites on an ip to sample";
m->m_desc = "Examine this many sites on the same ip as this site";
m->m_cgi = "nsoits";
m->m_off = (char *)&cr.m_numSitesOnIpToSample - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "100";
m++;
m->m_title = "points per banned site on ip";
m->m_desc = "Subtract x points from a site quality for each banned "
"site on the ip";
m->m_cgi = "nppbsoi";
m->m_off = (char *)&cr.m_negPointsPerBannedSiteOnIp - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "2";
m++;
m->m_title = "max penalty from being on a bad IP";
m->m_desc = "The penalty for being on a bad IP will not"
" exceed this value.";
m->m_cgi = "qampfboabi";
m->m_off = (char *)&cr.m_maxPenaltyFromIp - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "-30";
m++;
m->m_title = "max sites per second";
m->m_desc = "The agent will not process more than this many"
" sites per second. Can be less than 1.";
m->m_cgi = "msps";
m->m_off = (char *)&cr.m_maxSitesPerSecond - x;
m->m_type = TYPE_FLOAT;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "99999.0";
m++;
m->m_title = "site agent banned ruleset";
m->m_desc = "Site agent will assign this ruleset to documents "
" which are determined to be low quality.";
m->m_cgi = "";
m->m_off = (char *)&cr.m_qualityAgentBanRuleset - x;
m->m_type = TYPE_RULESET; // int32_t with dropdown of rulesets
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "30";
m++;
m->m_title = "ban quality theshold";
m->m_desc = "If the site has a spam score greater than this parm, it will"
" be inserted into the above ruleset.";
m->m_cgi = "tttsb";
m->m_off = (char *)&cr.m_siteQualityBanThreshold - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "-100";
m++;
m->m_title = "theshold to trigger site reindex";
m->m_desc = "If the site has a quality less than this parm, it will"
" be added to the spider queue for reindexing";
m->m_cgi = "tttsr";
m->m_off = (char *)&cr.m_siteQualityReindexThreshold - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_QAGENT;
m->m_cast = 1;
m->m_def = "-100";
m++;
// m->m_title = "";
// m->m_desc = "";
// m->m_cgi = "";
// m->m_off = (char *)&cr.m_ - x;
// m->m_type = TYPE_LONG;
// m->m_page = PAGE_QAGENT;
// m->m_def = "";
// m++;
*/
///////////////////////////////////////////
// END QUALITY AGENT CONTROLS
///////////////////////////////////////////
///////////////////////////////////////////
// AD FEED CONTROLS
///////////////////////////////////////////
/*
m->m_title = "num ads in paid inclusion ad feed";
m->m_desc = "The number of ads we would like returned from the ad"
" server. This applies to all paid inclusion ads below.";
m->m_cgi = "apin";
m->m_off = (char *)&cr.m_adPINumAds - x;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_page = PAGE_ADFEED;
m++;
m->m_title = "num ads in skyscraper ad feed";
m->m_desc = "The number of ads we would like returned from the ad"
" server. This applies to all skyscraper ads below.";
m->m_cgi = "assn";
m->m_off = (char *)&cr.m_adSSNumAds - x;
m->m_type = TYPE_LONG;
m->m_def = "5";
m->m_page = PAGE_ADFEED;
m++;
m->m_title = "skyscraper ad width";
m->m_desc = "The width of the skyscraper ad column in pixels";
m->m_cgi = "awd";
m->m_off = (char *)&cr.m_adWidth - x;
m->m_type = TYPE_LONG;
m->m_def = "300";
m->m_page = PAGE_ADFEED;
m++;
m->m_title = "ad feed timeout";
m->m_desc = "The time (in milliseconds) to wait for an ad list to be "
"returned before timing out and displaying the results "
"without any ads. This applies to all ads below.";
m->m_cgi = "afto";
m->m_off = (char *)&cr.m_adFeedTimeOut - x;
m->m_type = TYPE_LONG;
m->m_def = "1000";
m->m_page = PAGE_ADFEED;
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion ad enable";
m->m_desc = "Enable/Disable the paid inclusion ad.";
m->m_cgi = "apie";
m->m_off = (char *)&cr.m_adPIEnable - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_ADFEED;
m->m_def = "1";
m++;
m->m_title = "(1) paid inclusion ad feed link";
m->m_desc = "Full link with address and parameters to retrieve an ad "
"feed. To specify parameter input: %q for query, %n "
"for num results, %p for page number, %i for query ip, "
"and %% for %.";
m->m_cgi = "apicgi";
m->m_off = (char *)cr.m_adCGI[0] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_CGI_URL;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion ad feed xml result tag";
m->m_desc = "Specify the full xml path for a result.";
m->m_cgi = "apirx";
m->m_off = (char *)cr.m_adResultXml[0] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion ad feed xml title tag";
m->m_desc = "Specify the full xml path for the results title.";
m->m_cgi = "apitx";
m->m_off = (char *)cr.m_adTitleXml[0] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion ad feed xml description tag";
m->m_desc = "Specify the full xml path for the results description.";
m->m_cgi = "apidx";
m->m_off = (char *)cr.m_adDescXml[0] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion ad feed xml link tag";
m->m_desc = "Specify the full xml path for the results link. This "
"is the link that is shown as plain text, not an actual "
"link, below the ad description.";
m->m_cgi = "apilx";
m->m_off = (char *)cr.m_adLinkXml[0] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion ad feed xml url tag";
m->m_desc = "Specify the full xml path for the results url. This is "
"the link associated with the title.";
m->m_cgi = "apiux";
m->m_off = (char *)cr.m_adUrlXml[0] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion backup ad feed link";
m->m_desc = "Full link with address and parameters to retrieve an ad "
"feed. To specify parameter input: %q for query, %n "
"for num results, %p for page number, %i for query ip, "
"and %% for %.";
m->m_cgi = "apicgib";
m->m_off = (char *)cr.m_adCGI[1] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_CGI_URL;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m++;
m->m_title = "(1) paid inclusion backup ad feed xml result tag";
m->m_desc = "Specify the full xml path for a result.";
m->m_cgi = "apirxb";
m->m_off = (char *)cr.m_adResultXml[1] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion backup ad feed xml title tag";
m->m_desc = "Specify the full xml path for the results title.";
m->m_cgi = "apitxb";
m->m_off = (char *)cr.m_adTitleXml[1] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion backup ad feed xml description tag";
m->m_desc = "Specify the full xml path for the results description.";
m->m_cgi = "apidxb";
m->m_off = (char *)cr.m_adDescXml[1] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion backup ad feed xml link tag";
m->m_desc = "Specify the full xml path for the results link. This "
"is the link that is shown as plain text, not an actual "
"link, below the ad description.";
m->m_cgi = "apilxb";
m->m_off = (char *)cr.m_adLinkXml[1] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion backup ad feed xml url tag";
m->m_desc = "Specify the full xml path for the results url. This is "
"the link associated with the title.";
m->m_cgi = "apiuxb";
m->m_off = (char *)cr.m_adUrlXml[1] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) paid inclusion format text";
m->m_desc = "Specify the formatting text from the <div tag in";
m->m_cgi = "apift";
m->m_off = (char *)cr.m_adPIFormat - x;
m->m_plen = (char *)&cr.m_adPIFormatLen - x; // length of string
m->m_type = TYPE_STRINGBOX;
m->m_size = MAX_HTML_LEN + 1;
m->m_page = PAGE_ADFEED;
m->m_def = "style=\"padding: 3px;"
"text-align: left; background-color: "
"lightyellow;\"><span style=\"font-size: larger; "
"font-weight: bold;\">Sponsored Results</span>\n"
"<br><br>";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper ad enable";
m->m_desc = "Enable/Disable the skyscraper ad.";
m->m_cgi = "asse";
m->m_off = (char *)&cr.m_adSSEnable - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_ADFEED;
m->m_def = "1";
m++;
m->m_title = "(1) skyscraper ad feed same as paid inclusion";
m->m_desc = "Use the same feed CGI as used above for the paid "
"inclusion.";
m->m_cgi = "asssap";
m->m_off = (char *)&cr.m_adSSSameasPI - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_ADFEED;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper ad feed link";
m->m_desc = "Full link with address and parameters to retrieve an ad "
"feed. To specify parameter input: %q for query, %n "
"for num results, %p for page number, %i for query ip, "
"and %% for %.";
m->m_cgi = "asscgi";
m->m_off = (char *)cr.m_adCGI[2] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_CGI_URL;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper ad feed xml result tag";
m->m_desc = "Specify the full xml path for a result.";
m->m_cgi = "assrx";
m->m_off = (char *)cr.m_adResultXml[2] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper ad feed xml title tag";
m->m_desc = "Specify the full xml path for the results title.";
m->m_cgi = "asstx";
m->m_off = (char *)cr.m_adTitleXml[2] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper ad feed xml description tag";
m->m_desc = "Specify the full xml path for the results description.";
m->m_cgi = "assdx";
m->m_off = (char *)cr.m_adDescXml[2] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper ad feed xml link tag";
m->m_desc = "Specify the full xml path for the results link. This "
"is the link that is shown as plain text, not an actual "
"link, below the ad description.";
m->m_cgi = "asslx";
m->m_off = (char *)cr.m_adLinkXml[2] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper ad feed xml url tag";
m->m_desc = "Specify the full xml path for the results url. This is "
"the link associated with the title.";
m->m_cgi = "assux";
m->m_off = (char *)cr.m_adUrlXml[2] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper backup ad feed same as paid inclusion";
m->m_desc = "Use the same feed CGI as used above for the backup paid "
"inclusion.";
m->m_cgi = "asssapb";
m->m_off = (char *)&cr.m_adBSSSameasBPI - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_ADFEED;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper backup ad feed link";
m->m_desc = "Full link with address and parameters to retrieve an ad "
"feed. To specify parameter input: %q for query, %n "
"for num results, %p for page number, %i for query ip, "
"and %% for %.";
m->m_cgi = "asscgib";
m->m_off = (char *)cr.m_adCGI[3] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_CGI_URL;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper backup ad feed xml result tag";
m->m_desc = "Specify the full xml path for a result.";
m->m_cgi = "assrxb";
m->m_off = (char *)cr.m_adResultXml[3] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper backup ad feed xml title tag";
m->m_desc = "Specify the full xml path for the results title.";
m->m_cgi = "asstxb";
m->m_off = (char *)cr.m_adTitleXml[3] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper backup ad feed xml description tag";
m->m_desc = "Specify the full xml path for the results description.";
m->m_cgi = "assdxb";
m->m_off = (char *)cr.m_adDescXml[3] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper backup ad feed xml link tag";
m->m_desc = "Specify the full xml path for the results link. This "
"is the link that is shown as plain text, not an actual "
"link, below the ad description.";
m->m_cgi = "asslxb";
m->m_off = (char *)cr.m_adLinkXml[3] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper backup ad feed xml url tag";
m->m_desc = "Specify the full xml path for the results url. This is "
"the link associated with the title.";
m->m_cgi = "assuxb";
m->m_off = (char *)cr.m_adUrlXml[3] - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_XML_LEN;
m->m_page = PAGE_ADFEED;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "(1) skyscraper format text";
m->m_desc = "Specify the formatting text from the <div tag in";
m->m_cgi = "assft";
m->m_off = (char *)cr.m_adSSFormat - x;
m->m_plen = (char *)&cr.m_adSSFormatLen - x; // length of string
m->m_size = MAX_HTML_LEN + 1;
m->m_type = TYPE_STRINGBOX;
m->m_page = PAGE_ADFEED;
m->m_def = "style=\"height: 100%; padding: 3px;"
"text-align: center;background-color: "
"lightyellow;\"><span style=\""
"font-size: larger; font-weight: bold;\">"
"Sponsored Results</span><br><br> ";
m->m_group = 0;
m++;
*/
///////////////////////////////////////////
// END AD FEED CONTROLS
///////////////////////////////////////////
///////////////////////////////////////////
// SEARCH URL CONTROLS
// these are only specified in the search url when doing a search
///////////////////////////////////////////
/////
//
// OLDER SEARCH INPUTS
//
////
// when we do &qa=1 we do not show things like responseTime in
// search results so we can verify serp checksum consistency for QA
// in qa.cpp
/*
m->m_title = "quality assurance";
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_SI;
m->m_desc = "This is 1 if doing a QA test in qa.cpp";
m->m_def = "0";
m->m_soff = (char *)&si.m_qa - y;
m->m_type = TYPE_CHAR;
m->m_sparm = 1;
m->m_scgi = "qa";
m++;
*/
//m->m_title = "show turk forms";
//m->m_desc = "If enabled summaries in search results will be "
// "turkable input forms.";
//m->m_def = "0";
//m->m_soff = (char *)&si.m_getTurkForm - y;
//m->m_type = TYPE_BOOL;
//m->m_sparm = 1;
//m->m_scgi = "turk";
//m++;
// IMPORT PARMS
m->m_title = "enable document importation";
m->m_desc = "Import documents into this collection.";
m->m_cgi = "import";
m->m_page = PAGE_IMPORT;
m->m_obj = OBJ_COLL;
m->m_off = (char *)&cr.m_importEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_API;
m++;
// m->m_title = "collection";
// m->m_desc = "Collection to import documents into.";
// m->m_cgi = "c";
// m->m_page = PAGE_IMPORT;
// m->m_obj = OBJ_GBREQUEST;
// m->m_off = (char *)&cr.m_imcoll - (char *)&gr;
// m->m_type = TYPE_CHARPTR;
// m->m_def = NULL;
// // PF_COLLDEFAULT: so it gets set to default coll on html page
// m->m_flags = PF_API|PF_REQUIRED|PF_NOHTML;
// m++;
m->m_title = "directory containing titledb files";
m->m_desc = "Import documents contained in titledb files in this "
"directory. This is an ABSOLUTE directory path.";
m->m_cgi = "importdir";
m->m_xml = "importDir";
m->m_page = PAGE_IMPORT;
m->m_obj = OBJ_COLL;
m->m_off = (char *)&cr.m_importDir - x;
m->m_type = TYPE_SAFEBUF;
m->m_def = "";
m->m_flags = PF_API;
m++;
m->m_title = "number of simultaneous injections";
m->m_desc = "Typically try one or two injections per host in "
"your cluster.";
m->m_cgi = "numimportinjects";
m->m_xml = "numImportInjects";
m->m_page = PAGE_IMPORT;
m->m_obj = OBJ_COLL;
m->m_off = (char *)&cr.m_numImportInjects - x;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_flags = PF_API;
m++;
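// A hedged usage sketch of the import parms above, assuming the
// import page is served at /admin/import (host, port and collection
// name below are placeholders, not values from this file):
//
//   curl "http://127.0.0.1:8000/admin/import?c=main&import=1&importdir=/data/titledb-exports&numimportinjects=4"
//
// This would enable importation for the collection, point it at an
// absolute directory of titledb files and allow 4 simultaneous
// injections (roughly two per host on a two-host cluster).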
///////////
//
// ADD URL PARMS
//
///////////
m->m_title = "collection";
m->m_desc = "Add urls into this collection.";
m->m_cgi = "c";
m->m_page = PAGE_ADDURL2;
m->m_obj = OBJ_GBREQUEST;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
// PF_COLLDEFAULT: so it gets set to default coll on html page
m->m_flags = PF_API|PF_REQUIRED|PF_NOHTML;
m++;
m->m_title = "urls to add";
m->m_desc = "List of urls to index. One per line or space separated. "
"If your url does not index as you expect you "
"can check it's spider history by doing a url: search on it. "
"Added urls will have a "
"<a href=/admin/filters#hopcount>hopcount</a> of 0. "
"Added urls will match the <i><a href=/admin/filters#isaddurl>"
"isaddurl</a></i> directive on "
"the url filters page. "
"The add url api is described on the "
"<a href=/admin/api>api</a> page.";
m->m_cgi = "urls";
m->m_page = PAGE_ADDURL2;
m->m_obj = OBJ_GBREQUEST; // do not store in g_conf or collectionrec
m->m_off = (char *)&gr.m_urlsBuf - (char *)&gr;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_TEXTAREA | PF_NOSAVE | PF_API|PF_REQUIRED;
m++;
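// A hypothetical add url request built from the "c" and "urls" parms
// above (host, port and collection are placeholders; the /admin/addurl
// path comes from the links in the desc text):
//
//   curl "http://127.0.0.1:8000/admin/addurl?c=main&urls=http://example.com/+http://example.org/"
//
// Urls can be newline or space separated; each added url enters the
// spider queue with a hopcount of 0 and matches the isaddurl
// directive on the url filters page.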
/*
// the new upload post submit button
m->m_title = "upload urls";
m->m_desc = "Upload your file of urls.";
m->m_cgi = "urls";
m->m_page = PAGE_ADDURL2;
m->m_obj = OBJ_NONE;
m->m_def = NULL;
m->m_type = TYPE_FILEUPLOADBUTTON;
m++;
*/
m->m_title = "strip sessionids";
m->m_desc = "Strip added urls of their session ids.";
m->m_cgi = "strip";
m->m_page = PAGE_ADDURL2;
m->m_obj = OBJ_GBREQUEST;
m->m_off = (char *)&gr.m_stripBox - (char *)&gr;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_flags = PF_API;
m++;
m->m_title = "harvest links";
m->m_desc = "Harvest links of added urls so we can spider them?.";
m->m_cgi = "spiderlinks";
m->m_page = PAGE_ADDURL2;
m->m_obj = OBJ_GBREQUEST;
m->m_off = (char *)&gr.m_harvestLinks - (char *)&gr;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_flags = PF_API;
m++;
/*
m->m_title = "force respider";
m->m_desc = "Force an immediate respider even if the url "
"is already indexed.";
m->m_cgi = "force";
m->m_page = PAGE_ADDURL2;
m->m_obj = OBJ_GBREQUEST;
m->m_off = (char *)&gr.m_forceRespiderBox - (char *)&gr;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m++;
*/
////////
//
// now the new injection parms
//
////////
m->m_title = "url";
m->m_desc = "Specify the URL that will be immediately crawled "
"and indexed in real time while you wait. The browser "
"will return the "
"final index status code. Alternatively, "
"use the <a href=/admin/addurl>add url</a> page "
"to add urls individually or in bulk "
"without having to wait for the pages to be "
"actually indexed in realtime. "
"By default, injected urls "
"take precedence over the \"insitelist\" expression in the "
"<a href=/admin/filters>url filters</a> "
"so injected urls need not match the patterns in your "
"<a href=/admin/sites>site list</a>. You can "
"change that behavior in the <a href=/admin/filters>url "
"filters</a> if you want. "
"Injected urls will have a "
"<a href=/admin/filters#hopcount>hopcount</a> of 0. "
"The injection api is described on the "
"<a href=/admin/api>api</a> page. "
"Make up a fake url if you are injecting content that "
"does not have one."
"<br>"
"<br>"
"If the url ends in .warc or .arc or .warc.gz or .arc.gz "
"Gigablast will index the contained documents as individual "
"documents, using the appropriate dates and other meta "
"information contained in the containing archive file."
;
m->m_cgi = "url";
//m->m_cgi2 = "u";
//m->m_cgi3 = "seed"; // pagerawlbot
//m->m_cgi4 = "injecturl";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API | PF_REQUIRED;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
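// A minimal injection sketch using the "url" parm defined above
// (host, port and collection are placeholders; /admin/inject is the
// page this parm is bound to, and c=main is assumed to select the
// collection):
//
//   curl "http://127.0.0.1:8000/admin/inject?c=main&url=http://example.com/page.html"
//
// The request blocks until the page has been downloaded and indexed
// in realtime, then returns the final index status code.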
// alias #1
m->m_title = "url";
m->m_cgi = "u";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
// alias #2
m->m_title = "url";
m->m_cgi = "seed";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN | PF_DIFFBOT;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
// alias #3
m->m_title = "url";
m->m_cgi = "injecturl";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
m->m_title = "query to scrape";
m->m_desc = "Scrape popular search engines for this query "
"and inject their links. You are not required to supply "
"the <i>url</i> parm if you supply this parm.";
m->m_cgi = "qts";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_queryToScrape - (char *)&ir;
m++;
m->m_title = "inject links";
m->m_desc = "Should we inject the links found in the injected "
"content as well?";
m->m_cgi = "injectlinks";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_injectLinks - (char *)&ir;
m++;
m->m_title = "spider links";
m->m_desc = "Add the outlinks of the injected content into spiderdb "
"for spidering?";
m->m_cgi = "spiderlinks";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
// leave off because could start spidering whole web unintentionally
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_spiderLinks - (char *)&ir;
m++;
m->m_title = "short reply";
m->m_desc = "Should the injection response be short and simple?";
m->m_cgi = "quick";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_shortReply - (char *)&ir;
m++;
m->m_title = "only inject content if new";
m->m_desc = "If the specified url is already in the index then "
"skip the injection.";
m->m_cgi = "newonly";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_newOnly - (char *)&ir;
m++;
m->m_title = "delete from index";
m->m_desc = "Delete the specified url from the index.";
m->m_cgi = "deleteurl";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_deleteUrl - (char *)&ir;
m++;
m->m_title = "recycle content";
m->m_desc = "If the url is already in the index, then do not "
"re-download the content, just use the content that was "
"stored in the cache from last time.";
m->m_cgi = "recycle";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_recycle - (char *)&ir;
m++;
m->m_title = "dedup url";
m->m_desc = "Do not index the url if there is already another "
"url in the index with the same content.";
m->m_cgi = "dedup";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_dedup - (char *)&ir;
m++;
m->m_title = "do consistency checking";
m->m_desc = "Turn this on for debugging.";
m->m_cgi = "consist";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_doConsistencyTesting - (char *)&ir;
m++;
m->m_title = "hop count";
m->m_desc = "Use this hop count when injecting the page.";
m->m_cgi = "hopcount";
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_hopCount - (char *)&ir;
m++;
m->m_title = "url IP";
m->m_desc = "Use this IP when injecting the document. Do not use or "
"set to 0.0.0.0, if unknown. If provided, it will save an IP "
"lookup.";
m->m_cgi = "urlip";
m->m_obj = OBJ_IR;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_injectDocIp - (char *)&ir;
m++;
m->m_title = "last spider time";
m->m_desc = "Override last time spidered";
m->m_cgi = "lastspidered";
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_lastSpidered - (char *)&ir;
m++;
m->m_title = "first indexed";
m->m_desc = "Override first indexed time";
m->m_cgi = "firstindexed";
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_firstIndexed - (char *)&ir;
m++;
m->m_title = "content has mime";
m->m_desc = "If the content of the url is provided below, does "
"it begin with an HTTP mime header?";
m->m_cgi = "hasmime";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_hasMime - (char *)&ir;
m++;
m->m_title = "content delimeter";
m->m_desc = "If the content of the url is provided below, then "
"it consist of multiple documents separated by this "
"delimeter. Each such item will be injected as an "
"independent document. Some possible delimeters: "
"<i>========</i> or <i>&lt;doc&gt;</i>. If you set "
"<i>hasmime</i> above to true then Gigablast will check "
"for a url after the delimeter and use that url as the "
"injected url. Otherwise it will append numbers to the "
"url you provide above.";
m->m_cgi = "delim";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_contentDelim - (char *)&ir;
m++;
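// An illustrative payload for the "delim" parm above, assuming
// delim=%2B%2B%2B%2B (i.e. "++++") and hasmime=0; the text is made up:
//
//   ++++
//   first document body ...
//   ++++
//   second document body ...
//
// Each delimited item is injected as its own document. With hasmime=1
// Gigablast instead looks for a url after each delimiter and uses it
// as that item's injected url.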
m->m_title = "content type";
m->m_desc = "If you supply content in the text box below without "
"an HTTP mime header, "
"then you need to enter the content type. "
"Possible values: <b>text/html text/plain text/xml "
"application/json</b>";
m->m_cgi = "contenttype";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR; //text/html application/json application/xml
m->m_def = "text/html";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_contentTypeStr - (char *)&ir;
m++;
m->m_title = "content charset";
m->m_desc = "A number representing the charset of the content "
"if provided below and no HTTP mime header "
"is given. Defaults to utf8 "
"which is 106. "
"See iana_charset.h for the numeric values.";
m->m_cgi = "charset";
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "106";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_charset - (char *)&ir;
m++;
m->m_title = "upload content file";
m->m_desc = "Instead of specifying the content to be injected in "
"the text box below, upload this file for it.";
m->m_cgi = "file";
m->m_obj = OBJ_IR;
m->m_type = TYPE_FILEUPLOADBUTTON;
m->m_def = NULL;
m->m_flags = PF_NOAPI;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_contentFile - (char *)&ir;
m++;
m->m_title = "content";
m->m_desc = "If you want to supply the URL's content "
"rather than have Gigablast download it, then "
"enter the content here. "
"Enter MIME header "
"first if \"content has mime\" is set to true above. "
"Separate MIME from actual content with two returns. "
"At least put a single space in here if you want to "
"inject empty content, otherwise the content will "
"be downloaded from the url. This is because the "
"page injection form always submits the content text area "
"even if it is empty, which should signify that the "
"content should be downloaded.";
m->m_cgi = "content";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API|PF_TEXTAREA;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_content - (char *)&ir;
m++;
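// A hedged sketch combining the injection parms above to supply the
// content directly instead of downloading it. All values are
// illustrative and the command is wrapped for readability; POST is
// used since content can be large:
//
//   curl "http://127.0.0.1:8000/admin/inject"
//        --data-urlencode "c=main"
//        --data-urlencode "url=http://example.com/fake-doc"
//        --data-urlencode "hasmime=0"
//        --data-urlencode "contenttype=text/html"
//        --data-urlencode "content=<html><body>hello world</body></html>"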
m->m_title = "metadata";
m->m_desc = "Json encoded metadata to be indexed with the document.";
m->m_cgi = "metadata";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API|PF_TEXTAREA;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_metadata - (char *)&ir;
m++;
m->m_title = "get sectiondb voting info";
m->m_desc = "Return section information of injected content for "
"the injected subdomain. ";
m->m_cgi = "sections";
m->m_obj = OBJ_IR;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_API|PF_NOHTML;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_getSections - (char *)&ir;
m++;
m->m_title = "diffbot reply";
m->m_desc = "Used exclusively by diffbot. Do not use.";
m->m_cgi = "diffbotreply";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API|PF_TEXTAREA|PF_NOHTML; // do not show in our api
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.ptr_diffbotReply - (char *)&ir;
m++;
///////////////////
//
// QUERY REINDEX
//
///////////////////
m->m_title = "collection";
m->m_desc = "query reindex in this collection.";
m->m_cgi = "c";
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
// PF_COLLDEFAULT: so it gets set to default coll on html page
m->m_flags = PF_API|PF_REQUIRED|PF_NOHTML;
m->m_page = PAGE_REINDEX;
m->m_off = (char *)&gr.m_coll - (char *)&gr;
m++;
m->m_title = "query to reindex or delete";
m->m_desc = "We either reindex or delete the search results of "
"this query. Reindexing them will redownload them and "
"possible update the siterank, which is based on the "
"number of links to the site. This will add the url "
"requests to "
"the spider queue so ensure your spiders are enabled.";
m->m_cgi = "q";
m->m_off = (char *)&gr.m_query - (char *)&gr;
m->m_type = TYPE_CHARPTR;
m->m_page = PAGE_REINDEX;
m->m_obj = OBJ_GBREQUEST;
m->m_def = NULL;
m->m_flags = PF_API |PF_REQUIRED;
m++;
m->m_title = "start result number";
m->m_desc = "Starting with this result #. Starts at 0.";
m->m_cgi = "srn";
m->m_off = (char *)&gr.m_srn - (char *)&gr;
m->m_type = TYPE_LONG;
m->m_page = PAGE_REINDEX;
m->m_obj = OBJ_GBREQUEST;
m->m_def = "0";
m->m_flags = PF_API ;
m++;
m->m_title = "end result number";
m->m_desc = "Ending with this result #. 0 is the first result #.";
m->m_cgi = "ern";
m->m_off = (char *)&gr.m_ern - (char *)&gr;
m->m_type = TYPE_LONG;
m->m_page = PAGE_REINDEX;
m->m_obj = OBJ_GBREQUEST;
m->m_def = "99999999";
m->m_flags = PF_API ;
m++;
m->m_title = "query language";
m->m_desc = "The language the query is in. Used to rank results. "
"Just use xx to indicate no language in particular. But "
"you should use the same qlang value you used for doing "
"the query if you want consistency.";
m->m_cgi = "qlang";
m->m_off = (char *)&gr.m_qlang - (char *)&gr;
m->m_type = TYPE_CHARPTR;
m->m_page = PAGE_REINDEX;
m->m_obj = OBJ_GBREQUEST;
m->m_def = "en";
m->m_flags = PF_API ;
m++;
m->m_title = "recycle content";
m->m_desc = "If you check this box then Gigablast will not "
"re-download the content, but use the content that was "
"stored in the cache from last time. Useful for rebuilding "
"the index to pick up new inlink text or fresher "
"sitenuminlinks counts which influence ranking.";
m->m_cgi = "qrecycle";
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_REINDEX;
m->m_off = (char *)&gr.m_recycleContent - (char *)&gr;
m++;
m->m_title = "FORCE DELETE";
m->m_desc = "Check this checkbox to delete the results, not just "
"reindex them.";
m->m_cgi = "forcedel";
m->m_off = (char *)&gr.m_forceDel - (char *)&gr;
m->m_type = TYPE_CHECKBOX;
m->m_page = PAGE_REINDEX;
m->m_obj = OBJ_GBREQUEST;
m->m_def = "0";
m->m_flags = PF_API ;
m++;
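// A hypothetical query reindex request using the parms above,
// assuming the page is served at /admin/reindex (host, port and
// collection are placeholders):
//
//   curl "http://127.0.0.1:8000/admin/reindex?c=main&q=site%3Aexample.com&srn=0&ern=1000&qlang=en"
//
// Adding &forcedel=1 deletes results 0..1000 of that query instead of
// respidering them. Spidering must be enabled for the queued reindex
// requests to actually run.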
///////////////////
//
// SEARCH CONTROLS
//
///////////////////
m->m_title = "do spell checking by default";
m->m_desc = "If enabled while using the XML feed, "
"when Gigablast finds a spelling recommendation it will be "
"included in the XML <spell> tag. Default is 0 if using an "
"XML feed, 1 otherwise.";
m->m_cgi = "spell";
m->m_off = (char *)&cr.m_spellCheck - x;
//m->m_soff = (char *)&si.m_spellCheck - y;
//m->m_sparm = 1;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m->m_def = "1";
m->m_flags = PF_API | PF_NOSAVE | PF_CLONE;
m++;
m->m_title = "get scoring info by default";
m->m_desc = "Get scoring information for each result so you "
"can see how each result is scored. You must explicitly "
"request this using &scores=1 for the XML feed because it "
"is not included by default.";
m->m_cgi = "scores"; // dedupResultsByDefault";
m->m_off = (char *)&cr.m_getDocIdScoringInfo - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m->m_def = "1";
m->m_flags = PF_API | PF_CLONE;
m++;
m->m_title = "do query expansion by default";
m->m_desc = "If enabled, query expansion will expand your query "
"to include the various forms and "
"synonyms of the query terms.";
m->m_def = "1";
m->m_off = (char *)&cr.m_queryExpansion - x;
m->m_type = TYPE_BOOL;
m->m_cgi = "qe";
m->m_page = PAGE_SEARCH;
m->m_flags = PF_API | PF_CLONE;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "highlight query terms in summaries by default";
m->m_desc = "Use to disable or enable "
"highlighting of the query terms in the summaries.";
m->m_def = "1";
m->m_off = (char *)&cr.m_doQueryHighlighting - x;
m->m_type = TYPE_BOOL;
m->m_cgi = "qh";
m->m_smin = 0;
m->m_smax = 8;
m->m_sprpg = 1; // turn off for now
m->m_sprpp = 1;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
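// The four search defaults above (spell, scores, qe, qh) all carry
// PF_API, and the scores desc confirms per-query overrides like
// &scores=1. A hypothetical per-query override, assuming the other
// three cgi names can be supplied the same way:
//
//   /search?c=main&q=gigablast&spell=0&qe=0&qh=1&scores=1
//
// i.e. skip spell checking and query expansion for this one query but
// return scoring info and keep term highlighting.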
m->m_title = "max title len";
m->m_desc = "What is the maximum number of "
"characters allowed in titles displayed in the search "
"results?";
m->m_cgi = "tml";
m->m_off = (char *)&cr.m_titleMaxLen - x;
m->m_type = TYPE_LONG;
m->m_flags = PF_API | PF_CLONE;
m->m_def = "80";
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "consider titles from body";
m->m_desc = "Can Gigablast make titles from the document content? "
"Used mostly for the news collection where the title tags "
"are not very reliable.";
m->m_cgi = "gtfb";
m->m_off = (char *)&cr.m_considerTitlesFromBody - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
//m->m_soff = (char *)&si.m_considerTitlesFromBody - y;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "site cluster by default";
m->m_desc = "Should search results be site clustered? This "
"limits each site to appearing at most twice in the "
"search results. Sites are subdomains for the most part, "
"like abc.xyz.com.";
m->m_cgi = "scd";
m->m_off = (char *)&cr.m_siteClusterByDefault - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
// buzz
m->m_title = "hide all clustered results";
m->m_desc = "Only display at most one result per site.";
m->m_cgi = "hacr";
m->m_off = (char *)&cr.m_hideAllClustered - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m++;
m->m_title = "dedup results by default";
m->m_desc = "Should duplicate search results be removed? This is "
"based on a content hash of the entire document. "
"So documents must be exactly the same for the most part.";
m->m_cgi = "drd"; // dedupResultsByDefault";
m->m_off = (char *)&cr.m_dedupResultsByDefault - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 1;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "do tagdb lookups for queries";
m->m_desc = "For each search result a tagdb lookup is made, "
"usually across the network on distributed clusters, to "
"see if the URL's site has been manually banned in tagdb. "
"If you don't manually ban sites then turn this off for "
"extra speed.";
m->m_cgi = "stgdbl";
m->m_off = (char *)&cr.m_doTagdbLookups - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 1;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "percent similar dedup summary default value";
m->m_desc = "If document summary (and title) are "
"this percent similar "
"to a document summary above it, then remove it from the "
"search results. 100 means only to remove if exactly the "
"same. 0 means no summary deduping.";
m->m_cgi = "psds";
m->m_off = (char *)&cr.m_percentSimilarSummary - x;
m->m_type = TYPE_LONG;
m->m_def = "90";
m->m_group = 0;
m->m_smin = 0;
m->m_smax = 100;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of lines to use in summary to dedup";
m->m_desc = "Sets the number of lines to generate for summary "
"deduping. This is to help the deduping process not throw "
"out valid summaries when normally displayed summaries are "
"smaller values. Requires percent similar dedup summary to "
"be non-zero.";
m->m_cgi = "msld";
m->m_off = (char *)&cr.m_summDedupNumLines - x;
m->m_type = TYPE_LONG;
m->m_def = "4";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
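// A worked illustration of the two dedup parms above: with psds=90
// and msld=4, a 4-line summary (plus title) is generated for each
// result and compared against the results ranked above it; any
// result at least 90% similar to an earlier one is removed. psds=0
// disables summary deduping entirely.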
m->m_title = "dedup URLs by default";
m->m_desc = "Should we dedup URLs with case insensitivity? This is "
"mainly to correct duplicate wiki pages.";
m->m_cgi = "ddu";
m->m_off = (char *)&cr.m_dedupURLDefault - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use vhost language detection";
m->m_desc = "Use language specific pages for home, etc.";
m->m_cgi = "vhost";
m->m_off = (char *)&cr.m_useLanguagePages - x;
//m->m_soff = (char *)&si.m_useLanguagePages - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
//m->m_scgi = "vhost";
m->m_smin = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "sort language preference default";
m->m_desc = "Default language to use for ranking results. "
//"This should only be used on limited collections. "
"Value should be any language abbreviation, for example "
"\"en\" for English. Use <i>xx</i> to give ranking "
"boosts to no language in particular. See the language "
"abbreviations at the bottom of the "
"<a href=/admin/filters>url filters</a> page.";
m->m_cgi = "defqlang";
m->m_off = (char *)&cr.m_defaultSortLanguage2 - x;
m->m_type = TYPE_STRING;
m->m_size = 6; // up to 5 chars + NULL, e.g. "en_US"
m->m_def = "xx";//_US";
//m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "sort country preference default";
m->m_desc = "Default country to use for ranking results. "
//"This should only be used on limited collections. "
"Value should be any country code abbreviation, for example "
"\"us\" for United States. This is currently not working.";
m->m_cgi = "qcountry";
m->m_off = (char *)&cr.m_defaultSortCountry - x;
m->m_type = TYPE_STRING;
m->m_size = 2+1;
m->m_def = "us";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
// for post query reranking
m->m_title = "docs to check for post query demotion by default";
m->m_desc = "How many search results should we "
"scan for post query demotion? "
"0 disables all post query reranking. ";
m->m_cgi = "pqrds";
m->m_off = (char *)&cr.m_pqr_docsToScan - x;
//m->m_soff = (char *)&si.m_docsToScanForReranking - y;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 1;
//m->m_scgi = "pqrds";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max summary len";
m->m_desc = "What is the maximum number of "
"characters displayed in a summary for a search result?";
m->m_cgi = "sml";
m->m_off = (char *)&cr.m_summaryMaxLen - x;
m->m_type = TYPE_LONG;
m->m_def = "512";
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max summary excerpts";
m->m_desc = "What is the maximum number of "
"excerpts displayed in the summary of a search result?";
m->m_cgi = "smnl";
m->m_off = (char *)&cr.m_summaryMaxNumLines - x;
m->m_type = TYPE_LONG;
m->m_def = "4";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max summary excerpt length";
m->m_desc = "What is the maximum number of "
"characters allowed per summary excerpt?";
m->m_cgi = "smxcpl";
m->m_off = (char *)&cr.m_summaryMaxNumCharsPerLine - x;
m->m_type = TYPE_LONG;
m->m_def = "90";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "default number of summary excerpts by default";
m->m_desc = "What is the default number of "
"summary excerpts displayed per search result?";
m->m_cgi = "sdnl";
m->m_off = (char *)&cr.m_summaryDefaultNumLines - x;
m->m_type = TYPE_LONG;
m->m_def = "3";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
*/
m->m_title = "max summary line width by default";
m->m_desc = "&lt;br&gt; tags are inserted to keep the number "
"of chars in the summary per line at or below this width. "
"Also affects title. "
"Strings without spaces that exceed this "
"width are not split. Has no affect on xml or json feed, "
"only works on html.";
m->m_cgi = "smw";
m->m_off = (char *)&cr.m_summaryMaxWidth - x;
m->m_type = TYPE_LONG;
m->m_def = "80";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "bytes of doc to scan for summary generation";
m->m_desc = "Truncating this will miss out on good summaries, but "
"performance will increase.";
m->m_cgi = "clmfs";
m->m_off = (char *)&cr.m_contentLenMaxForSummary - x;
m->m_type = TYPE_LONG;
m->m_def = "70000";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "Prox summary carver radius";
m->m_desc = "Maximum number of characters to allow in between "
"search terms.";
m->m_cgi = "pscr";
m->m_off = (char *)&cr.m_proxCarveRadius - x;
m->m_type = TYPE_LONG;
m->m_def = "256";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "front highlight tag";
m->m_desc = "Front html tag used for highlightig query terms in the "
"summaries displated in the search results.";
m->m_cgi = "sfht";
m->m_off = (char *)cr.m_summaryFrontHighlightTag - x;
m->m_type = TYPE_STRING;
m->m_size = SUMMARYHIGHLIGHTTAGMAXSIZE ;
m->m_def = "<b style=\"color:black;background-color:#ffff66\">";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "back highlight tag";
m->m_desc = "Front html tag used for highlightig query terms in the "
"summaries displated in the search results.";
m->m_cgi = "sbht";
m->m_off = (char *)cr.m_summaryBackHighlightTag - x;
m->m_type = TYPE_STRING;
m->m_size = SUMMARYHIGHLIGHTTAGMAXSIZE ;
m->m_def = "</b>";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
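// Illustration of the highlight tag pair above using the defaults: a
// matched query term in a summary renders roughly as
//
//   <b style="color:black;background-color:#ffff66">term</b>
//
// i.e. the front tag is emitted before the term and the back tag
// after it.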
m->m_title = "results to scan for gigabits generation by default";
m->m_desc = "How many search results should we "
"scan for gigabit (related topics) generation. Set this to "
"zero to disable gigabits generation by default.";
m->m_cgi = "dsrt";
m->m_off = (char *)&cr.m_docsToScanForTopics - x;
m->m_type = TYPE_LONG;
m->m_def = "30";
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "ip restriction for gigabits by default";
m->m_desc = "Should Gigablast only get one document per IP domain "
"and per domain for gigabits (related topics) generation?";
m->m_cgi = "ipr";
m->m_off = (char *)&cr.m_ipRestrict - x;
m->m_type = TYPE_BOOL;
// default to 0 since newspaperarchive only has docs from same IP dom
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "remove overlapping topics";
m->m_desc = "Should Gigablast remove overlapping topics (gigabits)?";
m->m_cgi = "rot";
m->m_off = (char *)&cr.m_topicRemoveOverlaps - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of gigabits to show by default";
m->m_desc = "What is the number of "
"related topics (gigabits) "
"displayed per query? Set to 0 to save "
"CPU time.";
m->m_cgi = "nrt";
m->m_off = (char *)&cr.m_numTopics - x;
m->m_type = TYPE_LONG;
m->m_def = "11";
m->m_group = 0;
m->m_sprpg = 0; // do not propagate
m->m_sprpp = 0; // do not propagate
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min gigabit score by default";
m->m_desc = "Gigabits (related topics) with scores below this "
"will be excluded. Scores range from 0% to over 100%.";
m->m_cgi = "mts";
m->m_off = (char *)&cr.m_minTopicScore - x;
m->m_type = TYPE_LONG;
m->m_def = "5";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min gigabit doc count by default";
m->m_desc = "How many documents must contain the gigabit "
"(related topic) in order for it to be displayed.";
m->m_cgi = "mdc";
m->m_off = (char *)&cr.m_minDocCount - x;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "dedup doc percent for gigabits (related topics)";
m->m_desc = "If a document is this percent similar to another "
"document with a higher score, then it will not contribute "
"to the gigabit generation.";
m->m_cgi = "dsp";
m->m_off = (char *)&cr.m_dedupSamplePercent - x;
m->m_type = TYPE_LONG;
m->m_def = "80";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max words per gigabit (related topic) by default";
m->m_desc = "Maximum number of words a gigabit (related topic) "
"can have. Affects xml feeds, too.";
m->m_cgi = "mwpt";
m->m_off = (char *)&cr.m_maxWordsPerTopic - x;
m->m_type = TYPE_LONG;
m->m_def = "6";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "gigabit max sample size";
m->m_desc = "Max chars to sample from each doc for gigabits "
"(related topics).";
m->m_cgi = "tmss";
m->m_off = (char *)&cr.m_topicSampleSize - x;
m->m_type = TYPE_LONG;
m->m_def = "4096";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "gigabit max punct len";
m->m_desc = "Max sequential punct chars allowed in a gigabit "
"(related topic). "
" Set to 1 for speed, 5 or more for best topics but twice as "
"slow.";
m->m_cgi = "tmpl";
m->m_off = (char *)&cr.m_topicMaxPunctLen - x;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "display dmoz categories in results";
m->m_desc = "If enabled, results in dmoz will display their "
"categories on the results page.";
m->m_cgi = "ddc";
m->m_off = (char *)&cr.m_displayDmozCategories - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "display indirect dmoz categories in results";
m->m_desc = "If enabled, results in dmoz will display their "
"indirect categories on the results page.";
m->m_cgi = "didc";
m->m_off = (char *)&cr.m_displayIndirectDmozCategories - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "display Search Category link to query category of result";
m->m_desc = "If enabled, a link will appear next to each category "
"on each result allowing the user to perform their query "
"on that entire category.";
m->m_cgi = "dscl";
m->m_off = (char *)&cr.m_displaySearchCategoryLink - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use dmoz for untitled";
m->m_desc = "Yes to use DMOZ given title when a page is untitled but "
"is in DMOZ.";
m->m_cgi = "udfu";
m->m_off = (char *)&cr.m_useDmozForUntitled - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "show dmoz summaries";
m->m_desc = "Yes to always show DMOZ summaries with search results "
"that are in DMOZ.";
m->m_cgi = "udsm";
m->m_off = (char *)&cr.m_showDmozSummary - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "show adult category on top";
m->m_desc = "Yes to display the Adult category in the Top category";
m->m_cgi = "sacot";
m->m_off = (char *)&cr.m_showAdultCategoryOnTop - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "show sensitive info in xml feed";
m->m_desc = "If enabled, we show certain tagb tags for each "
"search result, allow &amp;inlinks=1 cgi parms, show "
"<docsInColl>, etc. in the xml feed. Created for buzzlogic.";
m->m_cgi = "sss";
m->m_off = (char *)&cr.m_showSensitiveStuff - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
m->m_title = "display indexed date";
m->m_desc = "Display the indexed date along with results.";
m->m_cgi = "didt";
m->m_off = (char *)&cr.m_displayIndexedDate - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "display last modified date";
m->m_desc = "Display the last modified date along with results.";
m->m_cgi = "dlmdt";
m->m_off = (char *)&cr.m_displayLastModDate - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "display published date";
m->m_desc = "Display the published date along with results.";
m->m_cgi = "dipt";
m->m_off = (char *)&cr.m_displayPublishDate - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "enable click 'n' scroll";
m->m_desc = "The [cached] link on results pages loads click n "
"scroll.";
m->m_cgi = "ecns";
m->m_off = (char *)&cr.m_clickNScrollEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use data feed account server";
m->m_desc = "Enable/disable the use of a remote account verification "
"for Data Feed Customers.";
m->m_cgi = "dfuas";
m->m_off = (char *)&cr.m_useDFAcctServer - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "data feed server ip";
m->m_desc = "The ip address of the Gigablast data feed server to "
"retrieve customer account information from.";
m->m_cgi = "dfip";
m->m_off = (char *)&cr.m_dfAcctIp - x;
m->m_type = TYPE_IP;
m->m_def = "2130706433";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "data feed server port";
m->m_desc = "The port of the Gigablast data feed server to retrieve "
"customer account information from.";
m->m_cgi = "dfport";
m->m_off = (char *)&cr.m_dfAcctPort - x;
m->m_type = TYPE_LONG;
m->m_def = "8040";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "data feed server collection";
m->m_desc = "The collection on the Gigablast data feed server to "
"retrieve customer account information from.";
m->m_cgi = "dfcoll";
m->m_off = (char *)&cr.m_dfAcctColl - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_COLL_LEN;
m->m_def = "customers";
m->m_group = 0;
m++;
*/
//
// not sure cols=x goes here or not
//
/*
m->m_title = "Number Of Columns(1-6)";
m->m_desc = "How many columns results should be shown in. (1-6)";
m->m_cgi = "cols";
m->m_smin = 1;
m->m_smax = 6;
m->m_off = (char *)&cr.m_numCols - x;
m->m_soff = (char *)&si.m_numCols - y;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_sparm = 1;
m++;
*/
//
// Gets the screen width
//
/*
m->m_title = "Screen Width";
m->m_desc = "screen size of browser window";
m->m_cgi = "ws";
m->m_smin = 600;
m->m_off = (char *)&cr.m_screenWidth - x;
m->m_soff = (char *)&si.m_screenWidth - y;
m->m_type = TYPE_LONG;
m->m_def = "1100";
m->m_group = 0;
m->m_sparm = 1;
m++;
*/
/*
m->m_title = "collection hostname";
m->m_desc = "Hostname that will default to this collection. Blank"
" for none or default collection.";
m->m_cgi = "chstn";
m->m_off = (char *)cr.m_collectionHostname - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_URL_LEN;
m->m_def = "";
m++;
m->m_title = "collection hostname (1)";
m->m_desc = "Hostname that will default to this collection. Blank"
" for none or default collection.";
m->m_cgi = "chstna";
m->m_off = (char *)cr.m_collectionHostname1 - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_URL_LEN;
m->m_def = "";
m->m_group = 0;
m++;
m->m_title = "collection hostname (2)";
m->m_desc = "Hostname that will default to this collection. Blank"
" for none or default collection.";
m->m_cgi = "chstnb";
m->m_off = (char *)cr.m_collectionHostname2 - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_URL_LEN;
m->m_def = "";
m->m_group = 0;
m++;
*/
m->m_title = "home page";
static SafeBuf s_tmpBuf;
s_tmpBuf.setLabel("stmpb1");
s_tmpBuf.safePrintf (
"Html to display for the home page. "
"Leave empty for default home page. "
"Use %%N for total "
"number of pages indexed. Use %%n for number of "
"pages indexed "
"for the current collection. "
//"Use %%H so Gigablast knows where to insert "
//"the hidden form input tags, which must be there. "
"Use %%c to insert the current collection name. "
//"Use %T to display the standard footer. "
"Use %%q to display the query in "
"a text box. "
"Use %%t to display the directory TOP. "
"Example to paste into textbox: "
"<br><i>"
);
s_tmpBuf.htmlEncode (
"<html>"
"<title>My Gigablast Search Engine</title>"
"<script>\n"
//"<!--"
"function x(){document.f.q.focus();}"
//"// -->"
"\n</script>"
"<body onload=\"x()\">"
"<br><br>"
"<center>"
"<a href=/>"
"<img border=0 width=500 height=122 "
"src=/logo-med.jpg></a>"
"<br><br>"
"<b>My Search Engine</b>"
"<br><br>"
// "<br><br><br>"
// "<b>web</b> "
// "&nbsp;&nbsp;&nbsp;&nbsp; "
// "<a href=\"/Top\">directory</a> "
// "&nbsp;&nbsp;&nbsp;&nbsp; "
// "<a href=/adv.html>advanced search</a> "
// "&nbsp;&nbsp;&nbsp;&nbsp; "
// "<a href=/addurl "
// "title=\"Instantly add your url to "
//"the index\">"
// "add url</a>"
// "<br><br>"
"<form method=get action=/search name=f>"
"<input type=hidden name=c value=\"%c\">"
"<input name=q type=text size=60 value=\"\">"
"&nbsp;"
"<input type=\"submit\" value=\"Search\">"
"</form>"
"<br>"
"<center>"
"Searching the <b>%c</b> collection of %n "
"documents."
"</center>"
"<br>"
"</body></html>") ;
s_tmpBuf.safePrintf("</i>");
m->m_desc = s_tmpBuf.getBufStart();
m->m_xml = "homePageHtml";
m->m_cgi = "hp";
m->m_off = (char *)&cr.m_htmlRoot - x;
//m->m_plen = (char *)&cr.m_htmlRootLen - x; // length of string
m->m_type = TYPE_SAFEBUF;//STRINGBOX;
//m->m_size = MAX_HTML_LEN + 1;
m->m_def = "";
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m->m_flags = PF_TEXTAREA | PF_CLONE;
m++;
m->m_title = "html head";
static SafeBuf s_tmpBuf2;
s_tmpBuf2.setLabel("stmpb2");
s_tmpBuf2.safePrintf("Html to display before the search results. ");
char *fff = "Leave empty for default. "
"Convenient "
"for changing colors and displaying logos. Use "
"the variable, "
"%q, to represent the query to display in a "
"text box. "
"Use %e to print the url encoded query. "
//"Use %e to print the page encoding. "
// i guess this is out for now
//"Use %D to "
//"print a drop down "
//"menu for the number of search results to return. "
"Use %S "
"to print sort by date or relevance link. Use "
"%L to "
"display the logo. Use %R to display radio "
"buttons for site "
"search. Use %F to begin the form. and use %H to "
"insert "
"hidden text "
"boxes of parameters like the current search result "
"page number. "
"BOTH %F and %H are necessary for the html head, but do "
"not duplicate them in the html tail. "
"Use %f to display "
"the family filter radio buttons. "
// take this out for now
//"Directory: Use %s to display the directory "
//"search type options. "
//"Use %l to specify the "
//"location of "
//"dir=rtl in the body tag for RTL pages. "
//"Use %where and %when to substitute the where "
//"and when of "
//"the query. "
//"These values may be set based on the cookie "
//"if "
//"none was explicitly given. "
//"IMPORTANT: In the xml configuration file, "
//"this html "
//"must be encoded (less thans mapped to &lt;, "
//"etc.).";
"Example to paste into textbox: <br><i>";
s_tmpBuf2.safeStrcpy(fff);
s_tmpBuf2.htmlEncode(
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 "
"Transitional//EN\">\n"
"<html>\n"
"<head>\n"
"<title>My Gigablast Search Results</title>\n"
"<meta http-equiv=\"Content-Type\" "
"content=\"text/html; charset=utf-8\">\n"
"</head>\n"
"<body%l>\n"
//"<form method=\"get\" action=\"/search\" name=\"f\">\n"
// . %F prints the <form method=...> tag
// . method will be GET or POST depending on the size of the
// input data. MSIE can't handle sending large GETs requests
// that are more than like 1k or so, which happens a lot with
// our CTS technology (the sites= cgi parm can be very large)
"%F"
"<table cellpadding=\"2\" cellspacing=\"0\" border=\"0\">\n"
"<tr>\n"
"<td valign=top>"
// this prints the Logo
"%L"
//"<a href=\"/\">"
//"<img src=\"logo2.gif\" alt=\"Gigablast Logo\" "
//"width=\"210\" height=\"25\" border=\"0\" valign=\"top\">"
//"</a>"
"</td>\n"
"<td valign=top>\n"
"<nobr>\n"
"<input type=\"text\" name=\"q\" size=\"60\" value=\"\%q\"> "
// %D is the number of results drop down menu
"\%D"
"<input type=\"submit\" value=\"Blast It!\" border=\"0\">\n"
"</nobr>\n"
// family filter
// %R radio button for site(s) search
"<br>%f %R\n"
// directory search options
// MDW: i guess this is out for now
//"</td><td>%s</td>\n"
"</tr>\n"
"</table>\n"
// %H prints the hidden for vars. Print them *after* the input
// text boxes, radio buttons, etc. so these hidden vars can be
// overridden as they should be.
"%H");
s_tmpBuf2.safePrintf("</i>");
m->m_desc = s_tmpBuf2.getBufStart();
m->m_xml = "htmlHead";
m->m_cgi = "hh";
m->m_off = (char *)&cr.m_htmlHead - x;
m->m_type = TYPE_SAFEBUF;//STRINGBOX;
m->m_def = "";
//m->m_sparm = 1;
//m->m_soff = (char *)&si.m_htmlHead - y;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m->m_flags = PF_TEXTAREA | PF_CLONE;
m++;
m->m_title = "html tail";
static SafeBuf s_tmpBuf3;
s_tmpBuf3.setLabel("stmpb3");
s_tmpBuf3.safePrintf("Html to display after the search results. ");
s_tmpBuf3.safeStrcpy(fff);
s_tmpBuf3.htmlEncode (
"<br>\n"
//"%F"
"<table cellpadding=2 cellspacing=0 border=0>\n"
"<tr><td></td>\n"
//"<td valign=top align=center>\n"
// this old query is overriding a newer query above so
// i commented out. mfd 6/2014
//"<nobr>"
//"<input type=text name=q size=60 value=\"%q\"> %D\n"
//"<input type=submit value=\"Blast It!\" border=0>\n"
//"</nobr>"
// family filter
//"<br>%f %R\n"
//"<br>"
//"%R\n"
//"</td>"
"<td>%s</td>\n"
"</tr>\n"
"</table>\n"
"Try your search on \n"
"<a href=http://www.google.com/search?q=%e>google</a> &nbsp;\n"
"<a href=http://search.yahoo.com/bin/search?p=%e>yahoo</a> "
"&nbsp;\n"
//"<a href=http://www.alltheweb.com/search?query=%e>alltheweb"
//"</a>\n"
"<a href=http://search.dmoz.org/cgi-bin/search?search=%e>"
"dmoz</a> &nbsp;\n"
//"<a href=http://search01.altavista.com/web/results?q=%e>"
//"alta vista</a>\n"
//"<a href=http://s.teoma.com/search?q=%e>teoma</a> &nbsp;\n"
//"<a href=http://wisenut.com/search/query.dll?q=%e>wisenut"
//"</a>\n"
"</font></body>\n");
s_tmpBuf3.safePrintf("</i>");
m->m_desc = s_tmpBuf3.getBufStart();
m->m_xml = "htmlTail";
m->m_cgi = "ht";
m->m_off = (char *)&cr.m_htmlTail - x;
m->m_type = TYPE_SAFEBUF;//STRINGBOX;
m->m_def = "";
//m->m_sparm = 1;
//m->m_soff = (char *)&si.m_htmlHead - y;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m->m_flags = PF_TEXTAREA | PF_CLONE;
m++;
///////////////////////////////////////////
// PAGE SPIDER CONTROLS
///////////////////////////////////////////
// just a comment in the conf file
m->m_desc =
"All <, >, \" and # characters that are values for a field "
"contained herein must be represented as "
"&lt;, &gt;, &#34; and &#035; respectively.";
m->m_type = TYPE_COMMENT;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "spidering enabled";
m->m_desc = "Controls just the spiders for this collection.";
m->m_cgi = "cse";
m->m_off = (char *)&cr.m_spideringEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
// this linked list of colls is in Spider.cpp and used to only
// poll the active spider colls for spidering. so if coll
// gets paused/unpaused we have to update it.
m->m_flags = PF_CLONE | PF_REBUILDACTIVELIST;
m++;
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"See <a href=#examples>example site list</a> below. "
"Gigablast uses the "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
"the <a href=/admin/filters>url filters</a> "
"page to make sure that the spider only indexes urls "
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>addurl"
"</a> interface.";
m->m_cgi = "sitelist";
m->m_off = (char *)&cr.m_siteListBuf - x;
m->m_page = PAGE_SPIDER;// PAGE_SITES;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_SAFEBUF;
m->m_func = CommandUpdateSiteList;
m->m_def = "";
// rebuild urlfilters now will nuke doledb and call updateSiteList()
m->m_flags = PF_TEXTAREA | PF_REBUILDURLFILTERS | PF_CLONE;
m++;
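// A hedged sketch of what site list entries can look like (one per
// line); the exact pattern syntax is handled by CommandUpdateSiteList()
// above, so treat these as illustrative, not definitive:
//
//   example.com              spider any url on this domain
//   http://example.org/blog/ only urls under this path
//
// Matching is applied via the insitelist directive on the url filters
// page, per the description above.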
m->m_title = "reset collection";
m->m_desc = "Remove all documents from the collection and turn "
"spiders off.";
m->m_cgi = "reset";
m->m_type = TYPE_CMD;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_func2 = CommandResetColl;
m->m_cast = 1;
m->m_flags = PF_HIDDEN;
m++;
m->m_title = "restart collection";
m->m_desc = "Remove all documents from the collection and re-add "
"seed urls from site list.";
m->m_cgi = "restart";
m->m_type = TYPE_CMD;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_func2 = CommandRestartColl;
m->m_cast = 1;
m++;
/*
m->m_title = "new spidering enabled";
m->m_desc = "When enabled the spider adds NEW "
"pages to your index. ";
m->m_cgi = "nse";
m->m_off = (char *)&cr.m_newSpideringEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "old spidering enabled";
m->m_desc = "When enabled the spider will re-visit "
"and update pages that are already in your index.";
m->m_cgi = "ose";
m->m_off = (char *)&cr.m_oldSpideringEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "new spider weight";
m->m_desc = "Weight time slices of new spiders in the priority "
"page by this factor relative to the old spider queues.";
m->m_cgi = "nsw";
m->m_off = (char *)&cr.m_newSpiderWeight - x;
m->m_type = TYPE_FLOAT;
m->m_def = "1.0";
m->m_group = 0;
m++;
*/
m->m_title = "max spiders";
m->m_desc = "What is the maximum number of web "
"pages the spider is allowed to download "
"simultaneously PER HOST for THIS collection? The "
"maximum number of spiders over all collections is "
"controlled in the <i>master controls</i>.";
m->m_cgi = "mns";
m->m_off = (char *)&cr.m_maxNumSpiders - x;
m->m_type = TYPE_LONG;
// make it the hard max so control is really in the master controls
m->m_def = "300";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "spider delay in milliseconds";
m->m_desc = "make each spider wait this many milliseconds before "
"getting the ip and downloading the page.";
m->m_cgi = "sdms";
m->m_off = (char *)&cr.m_spiderDelayInMilliseconds - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "obey robots.txt";
m->m_xml = "useRobotstxt";
m->m_desc = "If this is true Gigablast will respect "
"the robots.txt convention and rel no follow meta tags.";
m->m_cgi = "obeyRobots";
m->m_off = (char *)&cr.m_useRobotsTxt - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "obey rel no follow links";
m->m_desc = "If this is true Gigablast will respect "
"the rel no follow link attribute.";
m->m_cgi = "obeyRelNoFollow";
m->m_off = (char *)&cr.m_obeyRelNoFollowLinks - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "max robots.txt cache age";
m->m_desc = "How many seconds to cache a robots.txt file for. "
"86400 is 1 day. 0 means Gigablast will not read from the "
"cache at all and will download the robots.txt before every "
"page if robots.txt use is enabled above. However, if this is "
"0 then Gigablast will still store robots.txt files in the "
"cache.";
m->m_cgi = "mrca";
m->m_off = (char *)&cr.m_maxRobotsCacheAge - x;
m->m_type = TYPE_LONG;
m->m_def = "86400"; // 24*60*60 = 1day
m->m_units = "seconds";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "always use spider proxies";
m->m_desc = "If this is true Gigablast will ALWAYS use the proxies "
"listed on the <a href=/admin/proxies>proxies</a> "
"page for "
"spidering for "
"this collection."
//"regardless whether the proxies are enabled "
//"on the <a href=/admin/proxies>proxies</a> page."
;
m->m_cgi = "useproxies";
m->m_off = (char *)&cr.m_forceUseFloaters - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "automatically use spider proxies";
m->m_desc = "Use the spider proxies listed on the proxies page "
"if gb detects that "
"a webserver is throttling the spiders. This way we can "
"learn the webserver's spidering policy so that our spiders "
"can be more polite. If no proxies are listed on the "
"proxies page then this parameter will have no effect.";
m->m_cgi = "automaticallyuseproxies";
m->m_off = (char *)&cr.m_automaticallyUseProxies - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "automatically back off";
m->m_desc = "Set the crawl delay to 5 seconds if gb detects "
"that an IP is throttling or banning gigabot from crawling "
"it. The crawl delay just applies to that IP. "
"Such throttling will be logged.";
m->m_cgi = "automaticallybackoff";
m->m_xml = "automaticallyBackOff";
m->m_off = (char *)&cr.m_automaticallyBackOff - x;
m->m_type = TYPE_BOOL;
// a lot of pages have recaptcha links but they have valid content
// so leave this off for now... they have it in a hidden div which
// popups to email the article link or whatever to someone.
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "use time axis";
m->m_desc = "If this is true Gigablast will index the same "
"url multiple times if its content varies over time, "
"rather than overwriting the older version in the index. "
"Useful for archive web pages as they change over time.";
m->m_cgi = "usetimeaxis";
m->m_off = (char *)&cr.m_useTimeAxis - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "index warc or arc files";
m->m_desc = "If this is true Gigablast will index .warc and .arc "
"files by injecting the pages contained in them as if they "
"were spidered with the content in the .warc or .arc file. "
"The spidered time will be taken from the archive file "
"as well.";
m->m_cgi = "indexwarcs";
m->m_off = (char *)&cr.m_indexWarcs - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
/*
m->m_title = "add url enabled";
m->m_desc = "If this is enabled others can add "
"web pages to your index via the add url page.";
m->m_cgi = "aue";
m->m_off = (char *)&cr.m_addUrlEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
*/
m->m_title = "daily merge time";
m->m_desc = "Do a tight merge on posdb and titledb at this time "
"every day. This is expressed in MINUTES past midnight UTC. "
"UTC is 5 hours ahead "
"of EST and 7 hours ahead of MST. Leave this as -1 to "
"NOT perform a daily merge. To merge at midnight EST use "
"60*5=300 and midnight MST use 60*7=420.";
m->m_cgi = "dmt";
m->m_off = (char *)&cr.m_dailyMergeTrigger - x;
m->m_type = TYPE_LONG;
m->m_def = "-1";
m->m_units = "minutes";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
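// Worked arithmetic for the minutes-past-midnight-UTC value above
// (illustrative): midnight EST (UTC-5) is (0+5)*60 = 300; midnight
// MST (UTC-7) is (0+7)*60 = 420; a hypothetical 3:30am EST merge
// would be (3+5)*60 + 30 = 510.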
m->m_title = "daily merge days";
m->m_desc = "Comma separated list of days to merge on. Use "
"0 for Sunday, 1 for Monday, ... 6 for Saturday. Leaving "
"this parmaeter empty or without any numbers will make the "
"daily merge happen every day";
m->m_cgi = "dmdl";
m->m_off = (char *)&cr.m_dailyMergeDOWList - x;
m->m_type = TYPE_STRING;
m->m_size = 48;
// make sunday the default
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
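// Example values for the day-of-week list above: "0" (the default)
// merges only on Sundays, "0,6" merges on Sundays and Saturdays, and
// an empty string merges every day.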
m->m_title = "daily merge last started";
m->m_desc = "When the daily merge was last kicked off. Expressed in "
"UTC in seconds since the epoch.";
m->m_cgi = "dmls";
m->m_off = (char *)&cr.m_dailyMergeStarted - x;
m->m_type = TYPE_LONG_CONST;
m->m_def = "-1";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_NOAPI;
m++;
/*
m->m_title = "use datedb";
m->m_desc = "Index documents for generating results sorted by date "
"or constrained by date range. Only documents indexed while "
"this is enabled will be returned for date-related searches.";
m->m_cgi = "ud";
m->m_off = (char *)&cr.m_useDatedb - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "age cutoff for datedb";
m->m_desc = "Do not index pubdates into datedb that are more "
"than this many days old. Use -1 for no limit. A value "
"of zero essentially turns off datedb. Pre-existing pubdates "
"in datedb that fail to meet this constraint WILL BE "
"COMPLETELY ERASED when datedb is merged.";
m->m_cgi = "dbc";
m->m_off = (char *)&cr.m_datedbCutoff - x;
m->m_type = TYPE_LONG;
m->m_def = "-1";
m->m_units = "days";
m++;
m->m_title = "datedb default timezone";
m->m_desc = "Default timezone to use when none specified on parsed "
"time. Use offset from GMT, i.e 0400 (AMT) or -0700 (MST)";
m->m_cgi = "ddbdt";
m->m_off = (char *)&cr.m_datedbDefaultTimezone - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m++;
*/
//m->m_title = "days before now to index";
//m->m_desc = "Only index page if the datedb date was found to be "
// "within this many days of the current time. Use 0 to index "
// "all dates. Parm is float for fine control.";
//m->m_cgi = "ddbdbn";
//m->m_off = (char *)&cr.m_datedbDaysBeforeNow - x;
//m->m_type = TYPE_FLOAT;
//m->m_def = "0";
//m->m_group = 0;
//m++;
m->m_title = "turing test enabled";
m->m_desc = "If this is true, users will have to "
"pass a simple Turing test to add a url. This prevents "
"automated url submission.";
m->m_cgi = "dtt";
m->m_off = (char *)&cr.m_doTuringTest - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "max add urls";
m->m_desc = "Maximum number of urls that can be "
"submitted via the addurl interface, per IP domain, per "
"24 hour period. A value less than or equal to zero "
"implies no limit.";
m->m_cgi = "mau";
m->m_off = (char *)&cr.m_maxAddUrlsPerIpDomPerDay - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
// use url filters harvest links parm for this now
/*
m->m_title = "spider links";
m->m_desc = "If this is false, the spider will not "
"harvest links from web pages it visits. Links that it does "
"harvest will be attempted to be indexed at a later time. ";
m->m_cgi = "sl";
m->m_off = (char *)&cr.m_spiderLinks - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
*/
/*
MDW: use the "onsite" directive in the url filters page now...
m->m_title = "only spider links from same host";
m->m_desc = "If this is true the spider will only harvest links "
"to pages that are contained on the same host as the page "
"that is being spidered. "
"Example: When spidering a page from "
"www.gigablast.com, only links to pages that are from "
"www.gigablast.com would "
"be harvested, if this switch were enabled. This allows you "
"to seed the spider with URLs from a specific set of hosts "
"and ensure that only links to pages that are from those "
"hosts are harvested.";
m->m_cgi = "slsh";
m->m_off = (char *)&cr.m_sameHostLinks - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
*/
m->m_title = "do not re-add old outlinks more than this many days";
m->m_desc = "If less than this many days have elapsed since the "
"last time we added the outlinks to spiderdb, do not re-add "
"them to spiderdb. Saves resources.";
m->m_cgi = "slrf";
m->m_off = (char *)&cr.m_outlinksRecycleFrequencyDays - x;
m->m_type = TYPE_FLOAT;
m->m_def = "30";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "spider links by priority";
m->m_desc = "Specify priorities for which links should be spidered. "
"If the <i>spider links</i> option above is "
"disabled then these setting will have no effect.";
m->m_cgi = "slp";
m->m_xml = "spiderLinksByPriority";
m->m_off = (char *)&cr.m_spiderLinksByPriority - x;
m->m_type = TYPE_PRIORITY_BOXES; // array of numbered (0-(MAX_SPIDER_PRIORITIES-1)) checkboxes
m->m_fixed = MAX_SPIDER_PRIORITIES;
m->m_def = "1"; // default for each one is on
m->m_group = 0;
m++;
*/
/*
m->m_title = "min link priority";
m->m_desc = "Only add links to the spider "
"queue if their spider priority is this or higher. "
"This can make the spider process more efficient "
"since a lot of disk seeks are used when adding "
"links.";
m->m_cgi = "mlp";
m->m_off = (char *)&cr.m_minLinkPriority - x;
m->m_type = TYPE_PRIORITY;
m->m_def = "0";
m->m_group = 0;
m++;
*/
/* m->m_title = "maximum hops from parent page";
m->m_desc = "Only index pages that are within a particular number "
"of hops from the parent page given in Page Add Url. -1 means "
"that max hops is infinite.";
m->m_cgi = "mnh";
m->m_off = (char *)&cr.m_maxNumHops - x;
m->m_type = TYPE_CHAR2;
m->m_def = "-1";
m->m_group = 0;
m++;*/
m->m_title = "scraping enabled procog";
m->m_desc = "Do searches for queries in this hosts part of the "
"query log.";
m->m_cgi = "scrapepc";
m->m_off = (char *)&cr.m_scrapingEnabledProCog - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "scraping enabled web";
m->m_desc = "Perform random searches on googles news search engine "
"to add sites with ingoogle tags into tagdb.";
m->m_cgi = "scrapeweb";
m->m_off = (char *)&cr.m_scrapingEnabledWeb - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "scraping enabled news";
m->m_desc = "Perform random searches on googles news search engine "
"to add sites with news and goognews and ingoogle "
"tags into tagdb.";
m->m_cgi = "scrapenews";
m->m_off = (char *)&cr.m_scrapingEnabledNews - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "scraping enabled blogs";
m->m_desc = "Perform random searches on googles news search engine "
"to add sites with blogs and googblogs and ingoogle "
"tags into tagdb.";
m->m_cgi = "scrapeblogs";
m->m_off = (char *)&cr.m_scrapingEnabledBlogs - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "subsite detection enabled";
m->m_desc = "Add the \"sitepathdepth\" to tagdb if a hostname "
"is determined to have subsites at a particular depth.";
m->m_cgi = "ssd";
m->m_off = (char *)&cr.m_subsiteDetectionEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
m->m_title = "deduping enabled";
m->m_desc = "When enabled, the spider will "
"discard web pages which are identical to other web pages "
"that are already in the index. "//AND that are from the same "
//"hostname.
//"An example of a hostname is www1.ibm.com. "
"However, root urls, urls that have no path, are never "
"discarded. It most likely has to hit disk to do these "
"checks so it does cause some slow down. Only use it if you "
"need it.";
m->m_cgi = "de";
m->m_off = (char *)&cr.m_dedupingEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "deduping enabled for www";
m->m_desc = "When enabled, the spider will "
"discard web pages which, when a www is prepended to the "
"page's url, result in a url already in the index.";
m->m_cgi = "dew";
m->m_off = (char *)&cr.m_dupCheckWWW - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "detect custom error pages";
m->m_desc = "Detect and do not index pages which have a 200 status"
" code, but are likely to be error pages.";
m->m_cgi = "dcep";
m->m_off = (char *)&cr.m_detectCustomErrorPages - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "delete 404s";
m->m_desc = "Should pages be removed from the index if they are no "
"longer accessible on the web?";
m->m_cgi = "dnf";
m->m_off = (char *)&cr.m_delete404s - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN;
m++;
m->m_title = "delete timed out docs";
m->m_desc = "Should documents be deleted from the index "
"if they have been retried them enough times and the "
"last received error is a time out? "
"If your internet connection is flaky you may say "
"no here to ensure you do not lose important docs.";
m->m_cgi = "dtod";
m->m_off = (char *)&cr.m_deleteTimeouts - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use simplified redirects";
m->m_desc = "If this is true, the spider, when a url redirects "
"to a \"simpler\" url, will add that simpler url into "
"the spider queue and abandon the spidering of the current "
"url.";
m->m_cgi = "usr";
m->m_off = (char *)&cr.m_useSimplifiedRedirects - x;
m->m_type = TYPE_BOOL;
// turn off for now. spider time deduping should help any issues
// by disabling this.
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "use canonical redirects";
m->m_desc = "If page has a <link canonical> on it then treat it "
"as a redirect, add it to spiderdb for spidering "
"and abandon the indexing of the current url.";
m->m_cgi = "ucr";
m->m_off = (char *)&cr.m_useCanonicalRedirects - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m->m_group = 0;
m++;
m->m_title = "use ifModifiedSince";
m->m_desc = "If this is true, the spider, when "
"updating a web page that is already in the index, will "
"not even download the whole page if it hasn't been "
"updated since the last time Gigablast spidered it. "
"This is primarily a bandwidth saving feature. It relies on "
"the remote webserver's returned Last-Modified-Since field "
"being accurate.";
m->m_cgi = "uims";
m->m_off = (char *)&cr.m_useIfModifiedSince - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "build similarity vector from content only";
m->m_desc = "If this is true, the spider, when checking the page "
"if it has changed enough to reindex or update the "
"published date, it will build the vector only from "
"the content located on that page.";
m->m_cgi = "bvfc";
m->m_off = (char *)&cr.m_buildVecFromCont - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use content similarity to index publish date";
m->m_desc = "This requires build similarity from content only to be "
"on. This indexes the publish date (only if the content "
"has changed enough) to be between the last two spider "
"dates.";
m->m_cgi = "uspd";
m->m_off = (char *)&cr.m_useSimilarityPublishDate - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max percentage similar to update publish date";
m->m_desc = "This requires build similarity from content only and "
"use content similarity to index publish date to be "
"on. This percentage is the maximum similarity that can "
"exist between an old document and new before the publish "
"date will be updated.";
m->m_cgi = "mpspd";
m->m_off = (char *)&cr.m_maxPercentSimilarPublishDate - x;
m->m_type = TYPE_LONG;
m->m_def = "80";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
// use url filters for this. this is a crawlbot parm really.
/*
m->m_title = "restrict domain";
m->m_desc = "Keep crawler on same domain as seed urls?";
m->m_cgi = "restrictDomain";
m->m_off = (char *)&cr.m_restrictDomain - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
// we need to save this it is a diffbot parm
m->m_flags = PF_HIDDEN | PF_DIFFBOT;// | PF_NOSAVE;
m++;
*/
m->m_title = "do url sporn checking";
m->m_desc = "If this is true and the spider finds "
"lewd words in the hostname of a url it will throw "
"that url away. It will also throw away urls that have 5 or "
"more hyphens in their hostname.";
m->m_cgi = "dusc";
m->m_off = (char *)&cr.m_doUrlSpamCheck - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "hours before adding unspiderable url to spiderdb";
m->m_desc = "Hours to wait after trying to add an unspiderable url "
"to spiderdb again.";
m->m_cgi = "dwma";
m->m_off = (char *)&cr.m_deadWaitMaxAge - x;
m->m_type = TYPE_LONG;
m->m_def = "24";
m++;
*/
//m->m_title = "link text anomaly threshold";
//m->m_desc = "Prevent pages from link voting for "
// "another page if its link text has a "
// "word which doesn't occur in at least this "
// "many other link texts. (set to 1 to disable)";
//m->m_cgi = "ltat";
//m->m_off = (char *)&cr.m_linkTextAnomalyThresh - x;
//m->m_type = TYPE_LONG;
//m->m_def = "2";
//m++;
/*
m->m_title = "enforce domain quotas on new docs";
m->m_desc = "If this is true then new documents will be removed "
"from the index if the quota for their domain "
"has been breeched.";
m->m_cgi = "enq";
m->m_off = (char *)&cr.m_enforceNewQuotas - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "enforce domain quotas on indexed docs";
m->m_desc = "If this is true then indexed documents will be removed "
"from the index if the quota for their domain has been "
"breeched.";
m->m_cgi = "eoq";
m->m_off = (char *)&cr.m_enforceOldQuotas - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "use exact quotas";
m->m_desc = "Does not use approximations so will do more disk seeks "
"and may impact indexing performance significantly.";
m->m_cgi = "ueq";
m->m_off = (char *)&cr.m_exactQuotas - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "restrict indexdb for spidering";
m->m_desc = "If this is true then only the root indexb file is "
"searched for linkers. Saves on disk seeks, "
"but may use older versions of indexed web pages.";
m->m_cgi = "ris";
m->m_off = (char *)&cr.m_restrictIndexdbForSpider - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
/*
m->m_title = "indexdb max total files to merge";
m->m_desc = "Do not merge more than this many files during a single "
"merge operation. Merge does not scale well to numbers above "
"50 or so.";
m->m_cgi = "mttftm";
m->m_off = (char *)&cr.m_indexdbMinTotalFilesToMerge - x;
m->m_def = "50";
//m->m_max = 100;
m->m_type = TYPE_LONG;
m++;
m->m_title = "indexdb min files needed to trigger merge";
m->m_desc = "Merge is triggered when this many indexdb data files "
"are on disk.";
m->m_cgi = "miftm";
m->m_off = (char *)&cr.m_indexdbMinFilesToMerge - x;
m->m_def = "6"; // default to high query performance, not spider
m->m_type = TYPE_LONG;
m->m_group = 0;
m++;
m->m_title = "datedb min files needed to trigger to merge";
m->m_desc = "Merge is triggered when this many datedb data files "
"are on disk.";
m->m_cgi = "mdftm";
m->m_off = (char *)&cr.m_datedbMinFilesToMerge - x;
m->m_def = "5";
m->m_type = TYPE_LONG;
m->m_group = 0;
m++;
m->m_title = "spiderdb min files needed to trigger to merge";
m->m_desc = "Merge is triggered when this many spiderdb data files "
"are on disk.";
m->m_cgi = "msftm";
m->m_off = (char *)&cr.m_spiderdbMinFilesToMerge - x;
m->m_def = "2";
m->m_type = TYPE_LONG;
m->m_group = 0;
m++;
m->m_title = "checksumdb min files needed to trigger to merge";
m->m_desc = "Merge is triggered when this many checksumdb data files "
"are on disk.";
m->m_cgi = "mcftm";
m->m_off = (char *)&cr.m_checksumdbMinFilesToMerge - x;
m->m_def = "2";
m->m_type = TYPE_LONG;
m->m_group = 0;
m++;
m->m_title = "clusterdb min files needed to trigger to merge";
m->m_desc = "Merge is triggered when this many clusterdb data files "
"are on disk.";
m->m_cgi = "mclftm";
m->m_off = (char *)&cr.m_clusterdbMinFilesToMerge - x;
m->m_def = "2";
m->m_type = TYPE_LONG;
m->m_group = 0;
m++;
*/
m->m_title = "linkdb min files needed to trigger to merge";
m->m_desc = "Merge is triggered when this many linkdb data files "
"are on disk. Raise this when initially growing an index "
"in order to keep merging down.";
m->m_cgi = "mlkftm";
m->m_off = (char *)&cr.m_linkdbMinFilesToMerge - x;
m->m_def = "6";
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_flags = PF_CLONE;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "tagdb min files to merge";
m->m_desc = "Merge is triggered when this many linkdb data files "
"are on disk.";
m->m_cgi = "mtftgm";
m->m_off = (char *)&cr.m_tagdbMinFilesToMerge - x;
m->m_def = "2";
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_flags = PF_CLONE;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
// this is overridden by collection
m->m_title = "titledb min files needed to trigger to merge";
m->m_desc = "Merge is triggered when this many titledb data files "
"are on disk.";
m->m_cgi = "mtftm";
m->m_off = (char *)&cr.m_titledbMinFilesToMerge - x;
m->m_def = "6";
m->m_type = TYPE_LONG;
//m->m_save = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
//m->m_title = "sectiondb min files to merge";
//m->m_desc ="Merge is triggered when this many sectiondb data files "
// "are on disk.";
//m->m_cgi = "mscftm";
//m->m_off = (char *)&cr.m_sectiondbMinFilesToMerge - x;
//m->m_def = "4";
//m->m_type = TYPE_LONG;
//m->m_group = 0;
//m++;
m->m_title = "posdb min files needed to trigger to merge";
m->m_desc = "Merge is triggered when this many posdb data files "
"are on disk. Raise this while doing massive injections "
"and not doing much querying. Then when done injecting "
"keep this low to make queries fast.";
m->m_cgi = "mpftm";
m->m_off = (char *)&cr.m_posdbMinFilesToMerge - x;
m->m_def = "6";
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_flags = PF_CLONE;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "recycle content";
m->m_desc = "Rather than downloading the content again when "
"indexing old urls, use the stored content. Useful for "
"reindexing documents under a different ruleset or for "
"rebuilding an index. You usually "
"should turn off the 'use robots.txt' switch. "
"And turn on the 'use old ips' and "
"'recycle link votes' switches for speed. If rebuilding an "
"index then you should turn off the 'only index changes' "
"switches.";
m->m_cgi = "rc";
m->m_off = (char *)&cr.m_recycleContent - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "enable link voting";
m->m_desc = "If this is true Gigablast will "
"index hyper-link text and use hyper-link "
"structures to boost the quality of indexed documents. "
"You can disable this when doing a ton of injections to "
"keep things fast. Then do a posdb (index) rebuild "
"after re-enabling this when you are done injecting. Or "
"if you simply do not want link voting this will speed up"
"your injections and spidering a bit.";
m->m_cgi = "glt";
m->m_off = (char *)&cr.m_getLinkInfo - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_CLONE|PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "compute inlinks to sites";
m->m_desc = "If this is true Gigablast will "
"compute the number of site inlinks for the sites it "
"indexes. This is a measure of the sites popularity and is "
"used for ranking and some times spidering prioritzation. "
"It will cache the site information in tagdb. "
"The greater the number of inlinks, the longer the cached "
"time, because the site is considered more stable. If this "
"is NOT true then Gigablast will use the included file, "
"sitelinks.txt, which stores the site inlinks of millions "
"of the most popular sites. This is the fastest way. If you "
"notice a lot of <i>getting link info</i> requests in the "
"<i>sockets table</i> you may want to disable this "
"parm.";
m->m_cgi = "csni";
m->m_off = (char *)&cr.m_computeSiteNumInlinks - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_CLONE|PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "do link spam checking";
m->m_desc = "If this is true, do not allow spammy inlinks to vote. "
"This check is "
"too aggressive for some collections, i.e. it "
"does not allow pages with cgi in their urls to vote.";
m->m_cgi = "dlsc";
m->m_off = (char *)&cr.m_doLinkSpamCheck - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "restrict link voting by ip";
m->m_desc = "If this is true Gigablast will "
"only allow one vote per the top 2 significant bytes "
"of the IP address. Otherwise, multiple pages "
"from the same top IP can contribute to the link text and "
"link-based quality ratings of a particular URL. "
"Furthermore, no votes will be accepted from IPs that have "
"the same top 2 significant bytes as the IP of the page "
"being indexed.";
m->m_cgi = "ovpid";
m->m_off = (char *)&cr.m_oneVotePerIpDom - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
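// Illustrative application of the top-2-significant-bytes rule above:
// inlinks from 1.2.3.4 and 1.2.7.8 both fall under "1.2", so only one
// of them gets to vote; and if the page being indexed itself resolves
// to, say, 1.2.9.9, neither inlink's vote is accepted.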
m->m_title = "use new link algo";
m->m_desc = "Use the links: termlists instead of link:. Also "
"allows pages linking from the same domain or IP to all "
"count as a single link from a different IP. This is also "
"required for incorporating RSS and Atom feed information "
"when indexing a document.";
m->m_cgi = "na";
m->m_off = (char *)&cr.m_newAlgo - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "recycle link votes";
m->m_desc = "If this is true Gigablast will "
"use the old links and link text when re-indexing old urls "
"and not do any link voting when indexing new urls.";
m->m_cgi = "rv";
m->m_off = (char *)&cr.m_recycleVotes - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
*/
m->m_title = "update link info frequency";
m->m_desc = "How often should Gigablast recompute the "
"link info for a url. "
"Also applies to getting the quality of a site "
"or root url, which is based on the link info. "
"In days. Can use decimals. 0 means to update "
"the link info every time the url's content is re-indexed. "
"If the content is not reindexed because it is unchanged "
"then the link info will not be updated. When getting the "
"link info or quality of the root url from an "
"external cluster, Gigablast will tell the external cluster "
"to recompute it if its age is this or higher.";
m->m_cgi = "uvf";
m->m_off = (char *)&cr.m_updateVotesFreq - x;
m->m_type = TYPE_FLOAT;
m->m_def = "60.000000";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
/*
m->m_title = "recycle imported link info";
m->m_desc = "If true, we ALWAYS recycle the imported link info and "
"NEVER recompute it again. Otherwise, recompute it when we "
"recompute the local link info.";
m->m_cgi = "rili";
m->m_off = (char *)&cr.m_recycleLinkInfo2 - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
*/
/*
m->m_title = "use imported link info for quality";
m->m_desc = "If true, we will use the imported link info to "
"help us determine the quality of the page we are indexing.";
m->m_cgi = "uifq";
m->m_off = (char *)&cr.m_useLinkInfo2ForQuality - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
*/
// this can hurt us too much if mis-assigned, remove it
/*
m->m_title = "restrict link voting to roots";
m->m_desc = "If this is true Gigablast will "
"not perform link analysis on urls that are not "
"root urls.";
m->m_cgi = "rvr";
m->m_off = (char *)&cr.m_restrictVotesToRoots - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
*/
/*
m->m_title = "index link text";
m->m_desc = "If this is true Gigablast will "
"index both incoming and outgoing link text for the "
"appropriate documents, depending on url filters and "
"site rules, under the gbinlinktext: and gboutlinktext: "
"fields. Generally, you want this disabled, it was for "
"a client.";
m->m_cgi = "ilt";
m->m_off = (char *)&cr.m_indexLinkText - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
*/
/*
m->m_title = "index incoming link text";
m->m_desc = "If this is false no incoming link text is indexed.";
m->m_cgi = "iilt";
m->m_off = (char *)&cr.m_indexLinkText - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
*/
m->m_title = "index inlink neighborhoods";
m->m_desc = "If this is true Gigablast will "
"index the plain text surrounding the hyper-link text. The "
"score will be x times that of the hyper-link text, where x "
"is the scalar below.";
m->m_cgi = "iin";
m->m_off = (char *)&cr.m_indexInlinkNeighborhoods - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
/*
// this is now hard-coded in XmlNode.cpp, currently .8
m->m_title = "inlink neighborhoods score scalar";
m->m_desc = "Gigablast can "
"index the plain text surrounding the hyper-link text. The "
"score will be x times that of the hyper-link text, where x "
"is this number.";
m->m_cgi = "inss";
m->m_off = (char *)&cr.m_inlinkNeighborhoodsScoreScalar - x;
m->m_type = TYPE_FLOAT;
m->m_def = ".20";
m->m_group = 0;
m++;
*/
/*
m->m_title = "break web rings";
m->m_desc = "If this is true Gigablast will "
"attempt to detect link spamming rings and decrease "
"their influence on the link text for a URL.";
m->m_cgi = "bwr";
m->m_off = (char *)&cr.m_breakWebRings - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
*/
/*
m->m_title = "break log spam";
m->m_desc = "If this is true Gigablast will attempt to detect "
"dynamically generated pages and remove their voting power. "
"Additionally, pages over 100k will not be have their "
"outgoing links counted. Pages that have a form which POSTS "
"to a cgi page will not be considered either.";
m->m_cgi = "bls";
m->m_off = (char *)&cr.m_breakLogSpam - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
*/
m->m_title = "tagdb collection name";
m->m_desc = "Sometimes you want the spiders to use the tagdb of "
"another collection, like the <i>main</i> collection. "
"If this is empty it defaults to the current collection.";
m->m_cgi = "tdbc";
m->m_off = (char *)&cr.m_tagdbColl - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_COLL_LEN+1;
m->m_def = "";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "catdb lookups enabled";
m->m_desc = "Spiders will look to see if the current page is in "
"catdb. If it is, all Directory information for that page "
"will be indexed with it.";
m->m_cgi = "cdbe";
m->m_off = (char *)&cr.m_catdbEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "recycle catdb info";
m->m_desc = "Rather than requesting new info from DMOZ, like "
"titles and topic ids, grab it from old record. Increases "
"performance if you are seeing a lot of "
"\"getting catdb record\" entries in the spider queues.";
m->m_cgi = "rci";
m->m_off = (char *)&cr.m_recycleCatdb - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "allow banning of pages in catdb";
m->m_desc = "If this is 'NO' then pages that are in catdb, "
"but banned from tagdb or the url filters page, can not "
"be banned.";
m->m_cgi = "abpc";
m->m_off = (char *)&cr.m_catdbPagesCanBeBanned - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "override spider errors for catdb";
m->m_desc = "Ignore and skip spider errors if the spidered site"
" is found in Catdb (DMOZ).";
m->m_cgi = "catose";
m->m_off = (char *)&cr.m_overrideSpiderErrorsForCatdb - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
//m->m_title = "only spider root urls";
//m->m_desc = "Only spider urls that are roots.";
//m->m_cgi = "osru";
//m->m_off = (char *)&cr.m_onlySpiderRoots - x;
//m->m_type = TYPE_BOOL;
//m->m_def = "0";
//m++;
m->m_title = "allow asian docs";
m->m_desc = "If this is disabled the spider "
"will not allow any docs from the gb2312 charset "
"into the index.";
m->m_cgi = "aad";
m->m_off = (char *)&cr.m_allowAsianDocs - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "allow adult docs";
m->m_desc = "If this is disabled the spider "
"will not allow any docs which contain adult content "
"into the index (overides tagdb).";
m->m_cgi = "aprnd";
m->m_off = (char *)&cr.m_allowAdultDocs - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0 ;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "allow xml docs";
m->m_desc = "If this is disabled the spider "
"will not allow any xml "
"into the index.";
m->m_cgi = "axd";
m->m_off = (char *)&cr.m_allowXmlDocs - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "do serp detection";
m->m_desc = "If this is eabled the spider "
"will not allow any docs which are determined to "
"be serps.";
m->m_cgi = "dsd";
m->m_off = (char *)&cr.m_doSerpDetection - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "do IP lookup";
m->m_desc = "If this is disabled and the proxy "
"IP below is not zero then Gigablast will assume "
"all spidered URLs have an IP address of 1.2.3.4.";
m->m_cgi = "dil";
m->m_off = (char *)&cr.m_doIpLookups - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use old IPs";
m->m_desc = "Should the stored IP "
"of documents we are reindexing be used? Useful for "
"pages banned by IP address and then reindexed with "
"the reindexer tool.";
m->m_cgi = "useOldIps";
m->m_off = (char *)&cr.m_useOldIps - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "remove banned pages";
m->m_desc = "Remove banned pages from the index. Pages can be "
"banned using tagdb or the Url Filters table.";
m->m_cgi = "rbp";
m->m_off = (char *)&cr.m_removeBannedPages - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "ban domains of urls banned by IP";
m->m_desc = "Most urls are banned by IP "
"address. But owners often will keep the same "
"domains and change their IP address. So when "
"banning a url that was banned by IP, should its domain "
"be banned too? (obsolete)";
m->m_cgi = "banDomains";
m->m_off = (char *)&cr.m_banDomains - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
m->m_title = "allow HTTPS pages using SSL";
m->m_desc = "If this is true, spiders will read "
"HTTPS pages using SSL Protocols.";
m->m_cgi = "ahttps";
m->m_off = (char *)&cr.m_allowHttps - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "require dollar sign";
m->m_desc = "If this is YES, then do not allow document to be "
"indexed if they do not contain a dollar sign ($), but the "
"links will still be harvested. Used for building shopping "
"index.";
m->m_cgi = "nds";
m->m_off = (char *)&cr.m_needDollarSign - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
/*
m->m_title = "require numbers in url";
m->m_desc = "If this is YES, then do not allow document to be "
"indexed if they do not have two back-to-back digits in the "
"path of the url, but the links will still be harvested. Used "
"to build a news index.";
m->m_cgi = "nniu";
m->m_off = (char *)&cr.m_needNumbersInUrl - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "index news topics";
m->m_desc = "If this is YES, Gigablast will attempt to categorize "
"every page as being in particular news categories like "
"sports, business, etc. and will be searchable by doing a "
"query like \"newstopic:sports.";
m->m_cgi = "int";
m->m_off = (char *)&cr.m_getNewsTopic - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
m->m_title = "follow RSS links";
m->m_desc = "If an item on a page has an RSS feed link, add the "
"RSS link to the spider queue and index the RSS pages "
"instead of the current page.";
m->m_cgi = "frss";
m->m_off = (char *)&cr.m_followRSSLinks - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "only index articles from RSS feeds";
m->m_desc = "Only index pages that were linked to by an RSS feed. "
"Follow RSS Links must be enabled (above).";
m->m_cgi = "orss";
m->m_off = (char *)&cr.m_onlyIndexRSS - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max text doc length";
m->m_desc = "Gigablast will not download, index or "
"store more than this many bytes of an HTML or text "
"document. XML is NOT considered to be HTML or text, use "
"the rule below to control the maximum length of an XML "
"document. "
"Use -1 for no max.";
m->m_cgi = "mtdl";
m->m_off = (char *)&cr.m_maxTextDocLen - x;
m->m_type = TYPE_LONG;
m->m_def = "1048576"; // 1MB
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE|PF_API;
m++;
m->m_title = "max other doc length";
m->m_desc = "Gigablast will not download, index or "
"store more than this many bytes of a non-html, non-text "
"document. XML documents will be restricted to this "
"length. "
"Use -1 for no max.";
m->m_cgi = "modl";
m->m_off = (char *)&cr.m_maxOtherDocLen - x;
m->m_type = TYPE_LONG;
m->m_def = "1048576"; // 1MB
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE|PF_API;
m++;
//m->m_title = "indexdb truncation limit";
//m->m_cgi = "itl";
//m->m_desc = "How many documents per term? Keep this very high.";
//m->m_off = (char *)&cr.m_indexdbTruncationLimit - x;
//m->m_def = "50000000";
//m->m_type = TYPE_LONG;
//m->m_min = MIN_TRUNC; // from Indexdb.h
//m++;
m->m_title = "apply filter to text pages";
m->m_desc = "If this is false then the filter "
"will not be used on html or text pages.";
m->m_cgi = "aft";
m->m_off = (char *)&cr.m_applyFilterToText - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "filter name";
m->m_desc = "Program to spawn to filter all HTTP "
"replies the spider receives. Leave blank for none.";
m->m_cgi = "filter";
m->m_def = "";
m->m_off = (char *)&cr.m_filter - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_FILTER_LEN+1;
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "filter timeout";
m->m_desc = "Kill filter shell after this many seconds. Assume it "
"stalled permanently.";
m->m_cgi = "fto";
m->m_def = "40";
m->m_off = (char *)&cr.m_filterTimeout - x;
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
/*
m->m_title = "proxy ip";
m->m_desc = "Retrieve pages from the proxy at this IP address.";
m->m_cgi = "proxyip";
m->m_off = (char *)&cr.m_proxyIp - x;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "proxy port";
m->m_desc = "Retrieve pages from the proxy on "
"this port.";
m->m_cgi = "proxyport";
m->m_off = (char *)&cr.m_proxyPort - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
*/
m->m_title = "make image thumbnails";
m->m_desc = "Try to find the best image on each page and "
"store it as a thumbnail for presenting in the search "
"results.";
m->m_cgi = "mit";
m->m_off = (char *)&cr.m_makeImageThumbnails - x;
m->m_type = TYPE_BOOL;
// default to off since it slows things down to do this
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
m->m_title = "max thumbnail width or height";
m->m_desc = "This is in pixels and limits the size of the thumbnail. "
"Gigablast tries to make at least the width or the height "
"equal to this maximum, but, unless the thumbnail is sqaure, "
"one side will be longer than the other.";
m->m_cgi = "mtwh";
m->m_off = (char *)&cr.m_thumbnailMaxWidthHeight - x;
m->m_type = TYPE_LONG;
m->m_def = "250";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
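// Illustrative scaling with the default max of 250 (assuming aspect
// ratio is preserved, per the description above): a 1000x500 image
// yields a 250x125 thumbnail (width capped), a 400x800 image yields
// 125x250 (height capped), and only a square image comes out 250x250.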
m->m_title = "index spider status documents";
m->m_desc = "Index a spider status \"document\" "
"for every url the spider "
"attempts to spider. Search for them using special "
"query operators like type:status or gberrorstr:success or "
"stats:gberrornum to get a histogram. "
"See <a href=/syntax.html>syntax</a> page for more examples. "
"They will not otherwise "
"show up in the search results.";
// "This will not work for "
// "diffbot crawlbot collections yet until it has proven "
// "more stable.";
m->m_cgi = "isr";
m->m_off = (char *)&cr.m_indexSpiderReplies - x;
m->m_type = TYPE_BOOL;
// default off for now until we fix it better. 5/26/14 mdw
// turn back on 6/21 now that we do not index plain text terms
// and we add gbdocspidertime and gbdocindextime terms so you
// can use those to sort regular docs and not have spider reply
// status docs in the serps.
// back on 4/21/2015 seems pretty stable.
// but it uses disk space so turn off for now again. 6/16/2015
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
// i put this in here so i can save disk space for my global
// diffbot json index
m->m_title = "index body";
m->m_desc = "Index the body of the documents so you can search it. "
"Required for searching that. You wil pretty much always "
"want to keep this enabled. Does not apply to JSON "
"documents.";
m->m_cgi = "ib";
m->m_off = (char *)&cr.m_indexBody - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE ;//| PF_HIDDEN;
m++;
m->m_cgi = "apiUrl";
m->m_desc = "Send every spidered url to this url and index "
"the reply in addition to the normal indexing process. "
"Example: by specifying http://api.diffbot.com/v3/"
"analyze?mode=high-precision&token=<yourDiffbotToken> here "
"you can index the structured JSON replies from diffbot for "
"every url that is spidered. "
"Gigablast will automatically "
"append a &url=<urlBeingSpidered> to this url "
"before sending it to diffbot.";
m->m_xml = "diffbotApiUrl";
m->m_title = "diffbot api url";
m->m_off = (char *)&cr.m_diffbotApiUrl - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_SPIDER;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m->m_def = "";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
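// Illustrative request construction per the description above, with a
// hypothetical token T and assuming the url= value is appended as-is
// (any url-encoding of it is not shown here):
//   apiUrl: http://api.diffbot.com/v3/analyze?mode=high-precision&token=T
//   page:   http://example.com/a
//   fetch:  http://api.diffbot.com/v3/analyze?mode=high-precision&token=T&url=http://example.com/a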
m->m_cgi = "urlProcessPatternTwo";
m->m_desc = "Only send urls that match this simple substring "
"pattern to Diffbot. Separate substrings with two pipe "
"operators, ||. Leave empty for no restrictions.";
m->m_xml = "diffbotUrlProcessPattern";
m->m_title = "diffbot url process pattern";
m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_group = 0;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m++;
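// Example of the two-pipe substring syntax above: a hypothetical
// pattern of "/products/||/reviews/" would send only urls containing
// either substring to Diffbot; an empty pattern sends every url.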
m->m_cgi = "urlProcessRegExTwo";
m->m_desc = "Only send urls that match this regular expression "
"to Diffbot. "
"Leave empty for no restrictions.";
m->m_xml = "diffbotUrlProcessRegEx";
m->m_title = "diffbot url process regex";
m->m_off = (char *)&cr.m_diffbotUrlProcessRegEx - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_group = 0;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m++;
m->m_cgi = "pageProcessPatternTwo";
m->m_desc = "Only send urls whose content matches this simple "
"substring "
"pattern to Diffbot. Separate substrings with two pipe "
"operators, ||. "
"Leave empty for no restrictions.";
m->m_xml = "diffbotPageProcessPattern";
m->m_title = "diffbot page process pattern";
m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_def = "";
m->m_group = 0;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m++;
m->m_title = "spider start time";
m->m_desc = "Only spider URLs scheduled to be spidered "
"at this time or after. In UTC.";
m->m_cgi = "sta";
m->m_off = (char *)&cr.m_spiderTimeMin - x;
m->m_type = TYPE_DATE; // date format -- very special
m->m_def = "01 Jan 1970";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "spider end time";
m->m_desc = "Only spider URLs scheduled to be spidered "
"at this time or before. If \"use current time\" is true "
"then the current local time is used for this value instead. "
"in UTC.";
m->m_cgi = "stb";
m->m_off = (char *)&cr.m_spiderTimeMax - x;
m->m_type = TYPE_DATE2;
m->m_def = "01 Jan 2010";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "use current time";
m->m_desc = "Use the current time as the spider end time?";
m->m_cgi = "uct";
m->m_off = (char *)&cr.m_useCurrentTime - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "default ruleset site file num";
m->m_desc = "Use this as the current Sitedb file num for Sitedb "
"entries that always use the current default";
m->m_cgi = "dftsfn";
m->m_off = (char *)&cr.m_defaultSiteRec - x;
m->m_type = TYPE_LONG;
m->m_def = "16";
m++;
m->m_title = "RSS ruleset site file num";
m->m_desc = "Use this Sitedb file num ruleset for RSS feeds";
m->m_cgi = "rssrs";
m->m_off = (char *)&cr.m_rssSiteRec - x;
m->m_type = TYPE_LONG;
m->m_def = "25";
m->m_group = 0;
m++;
m->m_title = "TOC ruleset site file num";
m->m_desc = "Use this Sitedb file num ruleset "
"for Table of Contents pages";
m->m_cgi = "tocrs";
m->m_off = (char *)&cr.m_tocSiteRec - x;
m->m_type = TYPE_LONG;
m->m_def = "29";
m->m_group = 0;
m++;
*/
/*
m->m_title = "store topics vector";
m->m_desc = "Should Gigablast compute and store a topics vector "
"for every document indexed. This allows Gigablast to "
"do topic clustering without having to compute this vector "
"at query time. You can turn topic clustering on in the "
"Search Controls page.";
m->m_cgi = "utv";
m->m_off = (char *)&cr.m_useGigabitVector - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "use gigabits for vector";
m->m_desc = "For news collection. "
"Should Gigablast form the similarity vector using "
"Gigabits, as opposed to a straight out random sample. "
"This does clustering more "
"by topic rather than by explicit content in common.";
m->m_cgi = "uct";
m->m_off = (char *)&cr.m_useGigabitVector - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "max similarity to reindex";
m->m_desc = "If the url's content is over X% similar to what we "
"already "
"have indexed, then do not reindex it, and treat the content "
"as if it were unchanged for intelligent spider scheduling "
"purposes. Set to 100% to always reindex the document, "
"regardless, although the use-ifModifiedSince check "
"above may still be in affect, as well as the "
"deduping-enabled check. This will also affect the re-spider "
"time, because Gigablast spiders documents that change "
"frequently faster.";
m->m_cgi = "msti";
m->m_off = (char *)&cr.m_maxSimilarityToIndex - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_group = 0;
m++;
*/
// this is obsolete -- we can use the reg exp "isroot"
/*
m->m_title = "root url priority";
m->m_desc = "What spider priority should root urls "
"be assigned? Spider priorities range from 0 to 31. If no "
"urls are scheduled to be spidered in the priority 31 "
"bracket, the spider moves down to 30, etc., until it finds "
"a url to spider. If this priority is undefined "
"then that url's priority is determined based on the rules "
"on the URL filters page. If the priority is still "
"undefined then the priority is taken to be the priority of "
"the parent minus one, which results in a breadth first "
"spidering algorithm."; // html
m->m_cgi = "srup";
m->m_off = (char *)&cr.m_spiderdbRootUrlPriority - x;
m->m_type = TYPE_PRIORITY2;// 0-(MAX_SPIDER_PRIORITIES-1)dropdown menu
m->m_def = "15";
m++;
*/
/*
-- mdw, now in urlfilters using "isaddurl" "reg exp"
m->m_title = "add url priority";
m->m_desc = "What is the priority of a url which "
"is added to the spider queue via the "
"add url page?"; // html
m->m_cgi = "saup";
m->m_off = (char *)&cr.m_spiderdbAddUrlPriority - x;
m->m_type = TYPE_PRIORITY; // 0-(MAX_SPIDER_PRIORITIES-1)dropdown menu
m->m_def = "16";
m->m_group = 0;
m++;
*/
/*
m->m_title = "new spider by priority";
m->m_desc = "Specify priorities for which "
"new urls not yet in the index should be spidered.";
m->m_cgi = "sn";
m->m_xml = "spiderNewBits";
m->m_off = (char *)&cr.m_spiderNewBits - x;
m->m_type = TYPE_PRIORITY_BOXES; // array of numbered (0-(MAX_SPIDER_PRIORITIES-1)) checkboxes
m->m_fixed = MAX_SPIDER_PRIORITIES;
m->m_def = "1"; // default for each one is on
m++;
m->m_title = "old spider by priority";
m->m_desc = "Specify priorities for which old "
"urls already in the index should be spidered.";
m->m_cgi = "so";
m->m_xml = "spiderOldBits";
m->m_off = (char *)&cr.m_spiderOldBits - x;
m->m_type = TYPE_PRIORITY_BOXES; // array of numbered (0-(MAX_SPIDER_PRIORITIES-1)) checkboxes
m->m_fixed = MAX_SPIDER_PRIORITIES;
m->m_def = "1"; // default for each one is on
m->m_group = 0;
m++;
m->m_title = "max spiders per domain";
m->m_desc = "How many pages should the spider "
"download simultaneously from any one domain? This can "
"prevents the spider from hitting one server too hard.";
m->m_cgi = "mspd";
m->m_off = (char *)&cr.m_maxSpidersPerDomain - x;
m->m_type = TYPE_LONG;
m->m_def = "1";
m++;
m->m_title = "same domain wait";
m->m_desc = "How many milliseconds should Gigablast wait "
"between spidering a second url from the same domain. "
"This is used to prevent the spiders from hitting a "
"website too hard.";
m->m_cgi = "sdw";
m->m_off = (char *)&cr.m_sameDomainWait - x;
m->m_type = TYPE_LONG;
m->m_def = "500";
m->m_group = 0;
m++;
m->m_title = "same ip wait";
m->m_desc = "How many milliseconds should Gigablast wait "
"between spidering a second url from the same IP address. "
"This is used to prevent the spiders from hitting a "
"website too hard.";
m->m_cgi = "siw";
m->m_off = (char *)&cr.m_sameIpWait - x;
m->m_type = TYPE_LONG;
m->m_def = "10000";
m->m_group = 0;
m++;
*/
/*
m->m_title = "use distributed spider lock";
m->m_desc = "Enable distributed spider locking to strictly enforce "
"same domain waits at a global level.";
m->m_cgi = "udsl";
m->m_off = (char *)&cr.m_useSpiderLocks - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
m->m_title = "distribute spider download based on ip";
m->m_desc = "Distribute web downloads based on the ip of the host so "
"only one spider ip hits the same hosting ip. Helps "
"webmaster's logs look nicer.";
m->m_cgi = "udsd";
m->m_off = (char*)&cr.m_distributeSpiderGet - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
m->m_title = "percent of water mark to reload queues";
m->m_desc = "When a spider queue drops below this percent of its "
"max level it will reload from disk.";
m->m_cgi = "rlqp";
m->m_off = (char*)&cr.m_reloadQueuePercent - x;
m->m_type = TYPE_LONG;
m->m_def = "25";
m++;
*/
/*
m->m_title = "min respider wait";
m->m_desc = "What is the minimum number of days "
"the spider should wait before re-visiting a particular "
"web page? "
"The spiders attempts to determine the update cycle of "
"each web page and it tries to visit them as needed, but it "
"will not wait less than this number of days regardless.";
m->m_cgi = "mrw";
m->m_off = (char *)&cr.m_minRespiderWait - x;
m->m_type = TYPE_FLOAT;
m->m_def = "1.0";
m++;
m->m_title = "max respider wait";
m->m_desc = "What is the maximum number of days "
"the spider should wait before re-visiting a particular "
"web page?";
m->m_cgi = "xrw";
m->m_off = (char *)&cr.m_maxRespiderWait - x;
m->m_type = TYPE_FLOAT;
m->m_def = "90.0";
m->m_group = 0;
m++;
m->m_title = "first respider wait";
m->m_desc = "What is the number of days "
"Gigablast should wait before spidering a particular web page "
"for the second time? Tag in ruleset will override this value "
"if it is present.";
m->m_cgi = "frw";
m->m_off = (char *)&cr.m_firstRespiderWait - x;
m->m_type = TYPE_FLOAT;
m->m_def = "30.0";
m->m_group = 0;
m++;
m->m_title = "error respider wait";
m->m_desc = "If a spidered web page has a network "
"error, such as a DNS not found error, or a time out error, "
"how many days should Gigablast wait before reattempting "
"to spider that web page?";
m->m_cgi = "erw";
m->m_off = (char *)&cr.m_errorRespiderWait - x;
m->m_type = TYPE_FLOAT;
m->m_def = "2.0";
m->m_group = 0;
m++;
m->m_title = "doc not found error respider wait";
m->m_desc = "If a spidered web page has a http status "
"error, such as a 404 page not found error, "
"how many days should Gigablast wait before reattempting "
"to spider that web page?";
m->m_cgi = "dnferw";
m->m_off = (char *)&cr.m_docNotFoundErrorRespiderWait - x;
m->m_type = TYPE_FLOAT;
m->m_def = "7.0";
m->m_group = 0;
m++;
*/
/*
m->m_title = "spider max kbps";
m->m_desc = "The maximum kilobits per second "
"that the spider can download.";
m->m_cgi = "cmkbps";
m->m_off = (char *)&cr.m_maxKbps - x;
m->m_type = TYPE_FLOAT;
m->m_def = "999999.0";
m++;
m->m_title = "spider max pages per second";
m->m_desc = "The maximum number of pages per "
"second that can be indexed or deleted from the index.";
m->m_cgi = "cmpps";
m->m_off = (char *)&cr.m_maxPagesPerSecond - x;
m->m_type = TYPE_FLOAT;
m->m_def = "999999.0";
m->m_group = 0;
m++;
*/
/*
m->m_title = "spider new percent";
m->m_desc = "Approximate percentage of new vs. old docs to spider. "
"If set to a negative number, the old alternating "
"priority algorithm is used.";
m->m_cgi = "snp";
m->m_off = (char *)&cr.m_spiderNewPct - x;
m->m_type = TYPE_FLOAT;
m->m_def = "-1.0";
m->m_group = 0;
m++;
*/
/*
m->m_title = "number retries per url";
m->m_desc = "How many times should the spider be "
"allowed to fail to download a particular web page before "
"it gives up? "
"Failure may result from temporary loss of internet "
"connectivity on the remote end, dns or routing problems.";
m->m_cgi = "nr";
m->m_off = (char *)&cr.m_numRetries - x;
m->m_type = TYPE_RETRIES; // dropdown from 0 to 3
m->m_def = "1";
m++;
m->m_title = "priority of urls being retried";
m->m_desc = "Keep this pretty high so that we get problem urls "
"out of the index fast, otherwise, you might be waiting "
"months for another retry. Use <i>undefined</i> to indicate "
"no change in the priority of the url.";
m->m_cgi = "rtp";
m->m_off = (char *)&cr.m_retryPriority - x;
m->m_type = TYPE_PRIORITY2; // -1 to 31
m->m_def = "-1";
m->m_group = 0;
m++;
m->m_title = "max pages in index";
m->m_desc = "What is the maximum number of "
"pages that are permitted for this collection?";
m->m_cgi = "mnp";
m->m_off = (char *)&cr.m_maxNumPages - x;
m->m_type = TYPE_LONG_LONG;
m->m_def = "10000000000"; // 10 billion
m++;
m->m_title = "import link info"; // from other cluster";
m->m_desc = "Say yes here to make Gigablast import "
"link text from another collection into this one "
"when spidering urls. Gigablast will "
"use the hosts.conf file in the working directory to "
"tell it what hosts belong to the cluster to import from. "
"Gigablast "
"will use the \"update link votes frequency\" parm above "
"to determine if the info should be recomputed on the other "
"cluster.";
m->m_cgi = "eli"; // external link info
m->m_off = (char *)&cr.m_getExternalLinkInfo - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 2;
m++;
m->m_title = "use hosts2.conf for import cluster";
m->m_desc = "Tell Gigablast to import from the cluster defined by "
"hosts2.conf in the working directory, rather than "
"hosts.conf";
m->m_cgi = "elib"; // external link info
m->m_off = (char *)&cr.m_importFromHosts2Conf - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_priv = 2;
m->m_group = 0;
m++;
//m->m_title = "get link info from other cluster in real-time";
//m->m_desc = "Say yes here to make Gigablast tell the other "
// "cluster to compute the link info, not just return a "
// "stale copy from the last time it computed it.";
//m->m_cgi = "elif"; // external link info fresh
//m->m_off = (char *)&cr.m_getExternalLinkInfoFresh - x;
//m->m_type = TYPE_BOOL;
//m->m_def = "0";
//m->m_group = 0;
//m->m_priv = 2;
//m++;
m->m_title = "collection to import from";
m->m_desc = "Gigablast will fetch the link info from this "
"collection.";
m->m_cgi = "elic"; // external link info
m->m_off = (char *)&cr.m_externalColl - x;
m->m_type = TYPE_STRING;
m->m_size = MAX_COLL_LEN+1;
m->m_def = "";
m->m_group = 0;
m->m_priv = 2;
m++;
m->m_title = "turk tags to display";
m->m_desc = "Tell pageturk to display the tag questions "
"for the comma seperated tag names."
" no space allowed.";
m->m_cgi = "ttags";
m->m_xml = "turkTags";
m->m_type = TYPE_STRING;
m->m_size = 256;
m->m_def = "blog,spam,news";
m->m_off = (char *)&cr.m_turkTags - x;
m->m_group = 0;
m->m_priv = 2;
m++;
*/
/*
// now we store this in title recs, so we can change it on the fly
m->m_title = "title weight";
m->m_desc = "Weight title this much more or less. This units are "
"percentage. A 100 means to not give the title any special "
"weight. Generally, though, you want to give it significantly "
"more weight than that, so 2400 is the default.";
m->m_cgi = "tw";
m->m_off = (char *)&cr.m_titleWeight - x;
m->m_type = TYPE_LONG;
m->m_def = "4600";
m->m_min = 0;
m++;
// now we store this in title recs, so we can change it on the fly
m->m_title = "header weight";
m->m_desc = "Weight terms in header tags by this much more or less. "
"This units are "
"percentage. A 100 means to not give the header any special "
"weight. Generally, though, you want to give it significantly "
"more weight than that, so 600 is the default.";
m->m_cgi = "hw";
m->m_off = (char *)&cr.m_headerWeight - x;
m->m_type = TYPE_LONG;
m->m_def = "600";
m->m_min = 0;
m->m_group = 0;
m++;
// now we store this in title recs, so we can change it on the fly
m->m_title = "url path word weight";
m->m_desc = "Weight text in url path this much more. "
"The units are "
"percentage. A 100 means to not give any special "
"weight. Generally, though, you want to give it significantly "
"more weight than that, so 600 is the default.";
m->m_cgi = "upw";
m->m_off = (char *)&cr.m_urlPathWeight - x;
m->m_type = TYPE_LONG;
m->m_def = "1600";
m->m_min = 0;
m->m_group = 0;
m++;
// now we store this in title recs, so we can change it on the fly
m->m_title = "external link text weight";
m->m_desc = "Weight text in the incoming external link text this "
"much more. The units are percentage. It already receives a "
"decent amount of weight naturally.";
m->m_cgi = "eltw";
m->m_off = (char *)&cr.m_externalLinkTextWeight - x;
m->m_type = TYPE_LONG;
m->m_def = "600";
m->m_min = 0;
m->m_group = 0;
m++;
// now we store this in title recs, so we can change it on the fly
m->m_title = "internal link text weight";
m->m_desc = "Weight text in the incoming internal link text this "
"much more. The units are percentage. It already receives a "
"decent amount of weight naturally.";
m->m_cgi = "iltw";
m->m_off = (char *)&cr.m_internalLinkTextWeight - x;
m->m_type = TYPE_LONG;
m->m_def = "200";
m->m_min = 0;
m->m_group = 0;
m++;
// now we store this in title recs, so we can change it on the fly
m->m_title = "concept weight";
m->m_desc = "Weight concepts this much more. "
"The units are "
"percentage. It already receives a decent amount of weight "
"naturally. AKA: surrounding text boost.";
m->m_cgi = "cw";
m->m_off = (char *)&cr.m_conceptWeight - x;
m->m_type = TYPE_LONG;
m->m_def = "50";
m->m_min = 0;
m->m_group = 0;
m++;
*/
/*
// now we store this in title recs, so we can change it on the fly
m->m_title = "site num inlinks boost base";
m->m_desc = "Boost the score of all terms in the document using "
"this number. "
"The boost itself is expressed as a percentage. "
"The boost is B^X, where X is the number of good "
"inlinks to the document's site "
"and B is this is this boost base. "
"The score of each term in the "
"document is multiplied by the boost. That product "
"becomes the new score of that term. "
"For purposes of this calculation we limit X to 1000.";
m->m_cgi = "qbe";
m->m_off = (char *)&cr.m_siteNumInlinksBoostBase - x;
m->m_type = TYPE_FLOAT;
m->m_def = "1.005";
m->m_min = 0;
m->m_group = 0;
m++;
*/
/*
// use menu elimination technology?
m->m_title = "only index article content";
m->m_desc = "If this is true gigablast will only index the "
"article content on pages identifed as permalinks. It will "
"NOT index any page content on non-permalink pages, and it "
"will avoid indexing menu content on any page. It will not "
"index meta tags on any page. It will only index incoming "
"link text for permalink pages. Useful when "
"indexing blog or news sites.";
m->m_cgi = "met";
m->m_off = (char *)&cr.m_eliminateMenus - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
*/
// replace by lang== lang!= in url filters
//m->m_title = "collection language";
//m->m_desc = "Only spider pages determined to be in "
// "this language (see Language.h)";
//m->m_cgi = "clang";
//m->m_off = (char *)&cr.m_language - x;
//m->m_type = TYPE_LONG;
//m->m_def = "0";
//m++;
////////////////
// END PAGE SPIDER CONTROLS
////////////////
///////////////////////////////////////////
// PAGE REPAIR CONTROLS
///////////////////////////////////////////
m->m_title = "rebuild mode enabled";
m->m_desc = "If enabled, gigablast will rebuild the rdbs as "
"specified by the parameters below. When a particular "
"collection is in rebuild mode, it can not spider or merge "
"titledb files.";
m->m_cgi = "rme";
m->m_off = (char *)&g_conf.m_repairingEnabled - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m->m_sync = false; // do not sync this parm
m++;
m->m_title = "collection to rebuild";
m->m_xml = "collectionToRebuild";
m->m_desc = "Name of collection to rebuild.";
// m->m_desc = "Comma or space separated list of the collections "
// "to rebuild.";
m->m_cgi = "rctr"; // repair collections to repair
m->m_off = (char *)&g_conf.m_collsToRepair - g;
m->m_type = TYPE_SAFEBUF;//STRING;
//m->m_size = 1024;
m->m_def = "";
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m->m_flags = PF_REQUIRED;// | PF_COLLDEFAULT;//| PF_NOHTML;
m++;
m->m_title = "rebuild ALL collections";
m->m_desc = "If enabled, gigablast will rebuild all collections.";
m->m_cgi = "rac";
m->m_off = (char *)&g_conf.m_rebuildAllCollections - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "memory to use for rebuild";
m->m_desc = "In bytes.";
m->m_cgi = "rmtu"; // repair mem to use
m->m_off = (char *)&g_conf.m_repairMem - g;
m->m_type = TYPE_LONG;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "200000000";
m->m_units = "bytes";
m->m_group = 0;
m++;
m->m_title = "max rebuild injections";
m->m_desc = "Maximum number of outstanding injections for "
"rebuild.";
m->m_cgi = "mrps";
m->m_off = (char *)&g_conf.m_maxRepairSpiders - g;
m->m_type = TYPE_LONG;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "2";
m->m_group = 0;
m++;
m->m_title = "full rebuild";
m->m_desc = "If enabled, gigablast will reinject the content of "
"all title recs into a secondary rdb system. That will "
"the primary rdb system when complete.";
m->m_cgi = "rfr"; // repair full rebuild
m->m_off = (char *)&g_conf.m_fullRebuild - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "1";
m->m_group = 0;
m++;
m->m_title = "add spiderdb recs of non indexed urls";
m->m_desc = "If enabled, gigablast will add the spiderdb "
"records of unindexed urls "
"when doing the full rebuild or the spiderdb "
"rebuild. Otherwise, only the indexed urls will get "
"spiderdb records in spiderdb. This can be faster because "
"Gigablast does not have to do an IP lookup on every url "
"if its IP address is not in tagdb already.";
m->m_cgi = "rfrknsx";
m->m_off = (char *)&g_conf.m_rebuildAddOutlinks - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "recycle link text";
m->m_desc = "If enabled, gigablast will recycle the link text "
"when rebuilding titledb. "
"The siterank, which is determined by the "
"number of inlinks to a site, is stored/cached in tagdb "
"so that is a separate item. If you want to pick up new "
"link text you will want to set this to <i>NO</i> and "
"make sure to rebuild titledb, since that stores the "
"link text.";
m->m_cgi = "rrli"; // repair full rebuild
m->m_off = (char *)&g_conf.m_rebuildRecycleLinkInfo - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "1";
m->m_group = 0;
m++;
/*
m->m_title = "recycle imported link info";
m->m_desc = "If enabled, gigablast will recycle the imported "
"link info when rebuilding titledb.";
m->m_cgi = "rrlit"; // repair full rebuild
m->m_off = (char *)&g_conf.m_rebuildRecycleLinkInfo2 - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "1";
m->m_group = 0;
m++;
*/
/*
m->m_title = "remove bad pages";
m->m_desc = "If enabled, gigablast just scans the titledb recs "
"in the given collection and removes those that are "
"banned or filtered according to the url filters table. It "
"will also lookup in tagdb.";
m->m_cgi = "rbadp";
m->m_off = (char *)&g_conf.m_removeBadPages - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m++;
*/
m->m_title = "rebuild titledb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrt"; // repair rebuild titledb
m->m_off = (char *)&g_conf.m_rebuildTitledb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m++;
/*
m->m_title = "rebuild tfndb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rru"; // repair rebuild tfndb
m->m_off = (char *)&g_conf.m_rebuildTfndb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "rebuild indexdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rri";
m->m_off = (char *)&g_conf.m_rebuildIndexdb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
*/
m->m_title = "rebuild posdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rri";
m->m_off = (char *)&g_conf.m_rebuildPosdb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m->m_group = 0;
m++;
/*
m->m_title = "rebuild no splits";
m->m_desc = "If enabled, gigablast will just re-add the no split "
"lists from all the current title recs back into indexdb.";
m->m_cgi = "rns";
m->m_off = (char *)&g_conf.m_rebuildNoSplits - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "rebuild datedb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrd";
m->m_off = (char *)&g_conf.m_rebuildDatedb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "rebuild checksumdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrch";
m->m_off = (char *)&g_conf.m_rebuildChecksumdb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
*/
m->m_title = "rebuild clusterdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrcl";
m->m_off = (char *)&g_conf.m_rebuildClusterdb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "rebuild spiderdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrsp";
m->m_off = (char *)&g_conf.m_rebuildSpiderdb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m->m_group = 0;
m++;
/*
m->m_title = "rebuild tagdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrsi";
m->m_off = (char *)&g_conf.m_rebuildSitedb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
*/
m->m_title = "rebuild linkdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrld";
m->m_off = (char *)&g_conf.m_rebuildLinkdb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m->m_group = 0;
m++;
/*
m->m_title = "rebuild tagdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrtgld";
m->m_off = (char *)&g_conf.m_rebuildTagdb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "rebuild placedb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrpld";
m->m_off = (char *)&g_conf.m_rebuildPlacedb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "rebuild timedb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrtmd";
m->m_off = (char *)&g_conf.m_rebuildTimedb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "rebuild sectiondb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrsnd";
m->m_off = (char *)&g_conf.m_rebuildSectiondb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
m->m_title = "rebuild revdb";
m->m_desc = "If enabled, gigablast will rebuild this rdb";
m->m_cgi = "rrrvd";
m->m_off = (char *)&g_conf.m_rebuildRevdb - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_def = "0";
m->m_group = 0;
m++;
*/
m->m_title = "rebuild root urls";
m->m_desc = "If disabled, gigablast will skip root urls.";
m->m_cgi = "ruru";
m->m_off = (char *)&g_conf.m_rebuildRoots - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "1";
m++;
m->m_title = "rebuild non-root urls";
m->m_desc = "If disabled, gigablast will skip non-root urls.";
m->m_cgi = "runru";
m->m_off = (char *)&g_conf.m_rebuildNonRoots - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "1";
m->m_group = 0;
m++;
/*
m->m_title = "skip tagdb lookup";
m->m_desc = "When rebuilding spiderdb and scanning it for new "
"spiderdb records, should a tagdb lookup be performed? "
"Runs much much "
"faster without it. Will also keep the original doc quality "
"and "
"spider priority in tact.";
m->m_cgi = "rssl";
m->m_off = (char *)&g_conf.m_rebuildSkipSitedbLookup - g;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_REPAIR;
m->m_obj = OBJ_CONF;
m->m_def = "0";
m->m_group = 0;
m++;
*/
///////////////////////////////////////////
// END PAGE REPAIR //
///////////////////////////////////////////
///////////////////////////////////////////
// AUTOBAN CONTROLS
//
///////////////////////////////////////////
m->m_title = "ban IPs";
m->m_desc = "add Ips here to bar them from accessing this "
"gigablast server.";
m->m_cgi = "banIps";
m->m_xml = "banIps";
m->m_off = (char *)g_conf.m_banIps - g;
m->m_type = TYPE_STRINGBOX;
m->m_page = PAGE_AUTOBAN;
m->m_obj = OBJ_CONF;
m->m_size = AUTOBAN_TEXT_SIZE;
m->m_group = 1;
m->m_def = "";
m->m_plen = (char *)&g_conf.m_banIpsLen - g; // length of string
m++;
m->m_title = "allow IPs";
m->m_desc = "add Ips here to give them an infinite query quota.";
m->m_cgi = "allowIps";
m->m_xml = "allowIps";
m->m_off = (char *)g_conf.m_allowIps - g;
m->m_type = TYPE_STRINGBOX;
m->m_page = PAGE_AUTOBAN;
m->m_size = AUTOBAN_TEXT_SIZE;
m->m_group = 1;
m->m_def = "";
m->m_plen = (char *)&g_conf.m_allowIpsLen - g; // length of string
m->m_obj = OBJ_CONF;
m++;
m->m_title = "valid search codes";
m->m_desc = "Don't try to autoban queries that have one "
"of these codes. Also, the code must be valid for us "
"to use &uip=IPADDRESS as the IP address of the submitter "
"for purposes of autoban AND purposes of addurl daily quotas.";
m->m_cgi = "validCodes";
m->m_xml = "validCodes";
m->m_off = (char *)g_conf.m_validCodes - g;
m->m_type = TYPE_STRINGBOX;
m->m_page = PAGE_AUTOBAN;
m->m_size = AUTOBAN_TEXT_SIZE;
m->m_group = 1;
m->m_def = "";
m->m_plen = (char *)&g_conf.m_validCodesLen - g; // length of string
m->m_obj = OBJ_CONF;
m++;
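// example (hypothetical code value): a request like
//   /search?q=test&code=abc123&uip=4.5.6.7
// is exempt from autoban if "abc123" is listed above, and 4.5.6.7 is
// then treated as the requester's IP for autoban and addurl quotas.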
m->m_title = "Extra Parms";
m->m_desc = "Append extra default parms to queries that match "
"certain substrings. Format: text to match in url, "
"followed by a space, then the list of extra parms as "
"they would appear appended to the url. "
"One match per line.";
m->m_cgi = "extraParms";
m->m_xml = "extraParms";
m->m_off = (char *)g_conf.m_extraParms - g;
m->m_type = TYPE_STRINGBOX;
m->m_page = PAGE_AUTOBAN;
m->m_size = AUTOBAN_TEXT_SIZE;
m->m_group = 1;
m->m_def = "";
m->m_plen = (char *)&g_conf.m_extraParmsLen - g; // length of string
m->m_obj = OBJ_CONF;
m++;
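// example (hypothetical values): a line of
//   /search?q= &dr=1&sc=0
// appends "&dr=1&sc=0" to any query url containing the substring
// "/search?q=".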
m->m_title = "ban substrings";
m->m_desc = "ban any query that matches this list of "
"substrings. Must match all comma-separated strings "
"on the same line. ('\\n' = OR, ',' = AND)";
m->m_cgi = "banRegex";
m->m_xml = "banRegex";
m->m_off = (char *)g_conf.m_banRegex - g;
m->m_type = TYPE_STRINGBOX;
m->m_page = PAGE_AUTOBAN;
m->m_size = AUTOBAN_TEXT_SIZE;
m->m_group = 1;
m->m_def = "";
m->m_plen = (char *)&g_conf.m_banRegexLen - g; // length of string
m->m_obj = OBJ_CONF;
m++;
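// example (hypothetical substrings): the two lines
//   cheap,pills
//   free ringtones
// ban queries containing both "cheap" AND "pills", or queries
// containing "free ringtones".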
/////////////
// END AUTOBAN CONTROLS
/////////////
///////////////////////////////////////////
// ROOT PASSWORDS page
///////////////////////////////////////////
m->m_title = "Master Passwords";
m->m_desc = "Whitespace separated list of passwords. "
"Any matching password will have administrative access "
"to Gigablast and all collections.";
//"If no Admin Password or Admin IP is specified then "
//"Gigablast will only allow local IPs to connect to it "
//"as the master admin.";
m->m_cgi = "masterpwds";
m->m_xml = "masterPasswords";
m->m_def = "";
m->m_obj = OBJ_CONF;
m->m_off = (char *)&g_conf.m_masterPwds - g;
m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY;
m->m_page = PAGE_MASTERPASSWORDS;
//m->m_max = MAX_MASTER_PASSWORDS;
//m->m_size = PASSWORD_MAX_LEN+1;
//m->m_addin = 1; // "insert" follows?
m->m_flags = PF_PRIVATE | PF_TEXTAREA | PF_SMALLTEXTAREA;
m++;
m->m_title = "Master IPs";
//m->m_desc = "Allow UDP requests from this list of IPs. Any datagram "
// "received not coming from one of these IPs, or an IP in "
// "hosts.conf, is dropped. If another cluster is accessing this "
// "cluster for getting link text or whatever, you will need to "
// "list the IPs of the accessing machines here. These IPs are "
// "also used to allow access to the HTTP server even if it "
// "was disabled in the Master Controls. IPs that have 0 has "
// "their Least Significant Byte are treated as wildcards for "
// "IP blocks. That is, 1.2.3.0 means 1.2.3.*.";
m->m_desc = "Whitespace separated list of Ips. "
"Any IPs in this list will have administrative access "
"to Gigablast and all collections.";
m->m_cgi = "masterips";
m->m_xml = "masterIps";
m->m_page = PAGE_MASTERPASSWORDS;
m->m_off = (char *)&g_conf.m_connectIps - g;
m->m_type = TYPE_SAFEBUF;//IP;
m->m_def = "";
//m->m_max = MAX_CONNECT_IPS;
//m->m_priv = 2;
//m->m_addin = 1; // "insert" follows?
//m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_obj = OBJ_CONF;
m->m_flags = PF_PRIVATE | PF_TEXTAREA | PF_SMALLTEXTAREA;
m++;
// m->m_title = "remove connect ip";
// m->m_desc = "remove a connect ip";
// m->m_cgi = "removeip";
// m->m_type = TYPE_CMD;
// m->m_page = PAGE_NONE;
// m->m_func = CommandRemoveConnectIpRow;
// m->m_cast = 1;
// m->m_obj = OBJ_CONF;
// m++;
// m->m_title = "remove a password";
// m->m_desc = "remove a password";
// m->m_cgi = "removepwd";
// m->m_type = TYPE_CMD;
// m->m_page = PAGE_NONE;
// m->m_func = CommandRemovePasswordRow;
// m->m_cast = 1;
// m->m_obj = OBJ_CONF;
// m++;
/*
m->m_title = "Super Turks";
m->m_desc = "Add facebook user IDs here so those people can "
"turk the results. Later we may limit each person to "
"turking a geographic region.";
m->m_cgi = "supterturks";
m->m_xml = "supterturks";
m->m_def = "";
m->m_off = (char *)&g_conf.m_superTurks - g;
m->m_type = TYPE_STRINGBOX;
m->m_perms = PAGE_MASTER;
m->m_size = USERS_TEXT_SIZE;
m->m_plen = (char *)&g_conf.m_superTurksLen - g;
m->m_page = PAGE_MASTERPASSWORDS;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
*/
/*
m->m_title = "Users";
m->m_desc = "Add users here. The format is "
"collection:ip:username:password:relogin:pages:tagnames"
" Username and password cannot be blank."
" You can specify "
"* for collection to indicate all collections. "
" * can be used in IP as wildcard. "
" * in pages means user has access to all pages. Also"
" you can specify individual pages. A \'-\' sign at the"
" start of page means user is not allowed to access that"
" page. Please refer the page reference table at the bottom "
"of this page for available pages. If you want to just login "
" once and avoid relogin for gb shutdowns then set relogin=1,"
" else set it to 0. If relogin is 1 your login will never expire either."
"<br>"
" Ex: 1. master user -> *:*:master:master:1:*:english<br>"
" 2. public user -> *:*:public:1234:0:index.html"
",get,search,login,dir:english<br>"
"3. turk user -> 66.28.58.122:main:turk:1234:0:pageturkhome,"
"pageturk,pageturkget,get,login:english";
m->m_cgi = "users";
m->m_xml = "users";
m->m_off = (char *)&g_conf.m_users - g;
m->m_type = TYPE_STRINGBOX;
m->m_perms = PAGE_MASTER;
m->m_size = USERS_TEXT_SIZE;
m->m_plen = (char *)&g_conf.m_usersLen - g;
m->m_page = PAGE_MASTERPASSWORDS;
m++;
*/
/*
m->m_title = "Master IPs";
m->m_desc = "If someone connects from one of these IPs "
"then they will have full "
"master administrator priviledges. "
"If no IPs are specified, then master administrators can "
"get access for any IP. "
"Connecting from 127.0.0.1 always grants master privledges. "
"If no Master Password or Master IP is specified then "
"Gigablast will assign a default password of footbar23.";
m->m_cgi = "masterip";
m->m_xml = "masterIp";
m->m_max = MAX_MASTER_IPS;
m->m_off = (char *)g_conf.m_masterIps - g;
m->m_type = TYPE_IP;
m++;
*/
m->m_title = "Collection Passwords";
m->m_desc = "Whitespace separated list of passwords. "
"Any matching password will have administrative access "
"to the controls for just this collection. The master "
"password and IPs are controled through the "
"<i>master passwords</i> link under the ADVANCED controls "
"tab. The master passwords or IPs have administrative "
"access to all collections.";
m->m_cgi = "collpwd";
m->m_xml = "collectionPasswords";
m->m_obj = OBJ_COLL;
m->m_off = (char *)&cr.m_collectionPasswords - x;
m->m_def = "";
m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY;
m->m_page = PAGE_COLLPASSWORDS;
m->m_flags = PF_PRIVATE | PF_TEXTAREA | PF_SMALLTEXTAREA;
m++;
m->m_title = "Collection IPs";
m->m_desc = "Whitespace separated list of IPs. "
"Any matching IP will have administrative access "
"to the controls for just this collection.";
m->m_cgi = "collips";
m->m_xml = "collectionIps";
m->m_obj = OBJ_COLL;
m->m_off = (char *)&cr.m_collectionIps - x;
m->m_def = "";
m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY;
m->m_page = PAGE_COLLPASSWORDS;
m->m_flags = PF_PRIVATE | PF_TEXTAREA | PF_SMALLTEXTAREA;
m++;
//////
// END SECURITY CONTROLS
//////
///////////////////////////////////////////
// LOG CONTROLS
///////////////////////////////////////////
m->m_title = "log http requests";
m->m_desc = "Log GET and POST requests received from the "
"http server?";
m->m_cgi = "hr";
m->m_off = (char *)&g_conf.m_logHttpRequests - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log autobanned queries";
m->m_desc = "Should we log queries that are autobanned? "
"They can really fill up the log.";
m->m_cgi = "laq";
m->m_off = (char *)&g_conf.m_logAutobannedQueries - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log query time threshold";
m->m_desc = "If query took this many millliseconds or longer, then log the "
"query and the time it took to process.";
m->m_cgi = "lqtt";
m->m_off = (char *)&g_conf.m_logQueryTimeThreshold- g;
m->m_type = TYPE_LONG;
m->m_def = "5000";
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log query reply";
m->m_desc = "Log query reply in proxy, but only for those queries "
"above the time threshold above.";
m->m_cgi = "lqr";
m->m_off = (char *)&g_conf.m_logQueryReply - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log spidered urls";
m->m_desc = "Log status of spidered or injected urls?";
m->m_cgi = "lsu";
m->m_off = (char *)&g_conf.m_logSpideredUrls - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log network congestion";
m->m_desc = "Log messages if Gigablast runs out of udp sockets?";
m->m_cgi = "lnc";
m->m_off = (char *)&g_conf.m_logNetCongestion - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log informational messages";
m->m_desc = "Log messages not related to an error condition, "
"but meant more to give an idea of the state of "
"the gigablast process. These can be useful when "
"diagnosing problems.";
m->m_cgi = "li";
m->m_off = (char *)&g_conf.m_logInfo - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log limit breeches";
m->m_desc = "Log it when document not added due to quota "
"breech. Log it when url is too long and it gets "
"truncated.";
m->m_cgi = "ll";
m->m_off = (char *)&g_conf.m_logLimits - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug admin messages";
m->m_desc = "Log various debug messages.";
m->m_cgi = "lda";
m->m_off = (char *)&g_conf.m_logDebugAdmin - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug build messages";
m->m_cgi = "ldb";
m->m_off = (char *)&g_conf.m_logDebugBuild - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug build time messages";
m->m_cgi = "ldbt";
m->m_off = (char *)&g_conf.m_logDebugBuildTime - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug database messages";
m->m_cgi = "ldd";
m->m_off = (char *)&g_conf.m_logDebugDb - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug dirty messages";
m->m_cgi = "lddm";
m->m_off = (char *)&g_conf.m_logDebugDirty - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug disk messages";
m->m_cgi = "lddi";
m->m_off = (char *)&g_conf.m_logDebugDisk - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug disk page cache";
m->m_cgi = "ldpc";
m->m_off = (char *)&g_conf.m_logDebugDiskPageCache - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug dns messages";
m->m_cgi = "lddns";
m->m_off = (char *)&g_conf.m_logDebugDns - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug http messages";
m->m_cgi = "ldh";
m->m_off = (char *)&g_conf.m_logDebugHttp - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug image messages";
m->m_cgi = "ldi";
m->m_off = (char *)&g_conf.m_logDebugImage - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug loop messages";
m->m_cgi = "ldl";
m->m_off = (char *)&g_conf.m_logDebugLoop - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug language detection messages";
m->m_cgi = "ldg";
m->m_off = (char *)&g_conf.m_logDebugLang - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug link info";
m->m_cgi = "ldli";
m->m_off = (char *)&g_conf.m_logDebugLinkInfo - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug mem messages";
m->m_cgi = "ldm";
m->m_off = (char *)&g_conf.m_logDebugMem - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug mem usage messages";
m->m_cgi = "ldmu";
m->m_off = (char *)&g_conf.m_logDebugMemUsage - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug net messages";
m->m_cgi = "ldn";
m->m_off = (char *)&g_conf.m_logDebugNet - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug post query rerank messages";
m->m_cgi = "ldpqr";
m->m_off = (char *)&g_conf.m_logDebugPQR - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug query messages";
m->m_cgi = "ldq";
m->m_off = (char *)&g_conf.m_logDebugQuery - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug quota messages";
m->m_cgi = "ldqta";
m->m_off = (char *)&g_conf.m_logDebugQuota - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug robots messages";
m->m_cgi = "ldr";
m->m_off = (char *)&g_conf.m_logDebugRobots - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug spider cache messages";
m->m_cgi = "lds";
m->m_off = (char *)&g_conf.m_logDebugSpcache - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "log debug spider wait messages";
m->m_cgi = "ldspw";
m->m_off = (char *)&g_conf.m_logDebugSpiderWait - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m++;
*/
m->m_title = "log debug speller messages";
m->m_cgi = "ldsp";
m->m_off = (char *)&g_conf.m_logDebugSpeller - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug sections messages";
m->m_cgi = "ldscc";
m->m_off = (char *)&g_conf.m_logDebugSections - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug seo insert messages";
m->m_cgi = "ldsi";
m->m_off = (char *)&g_conf.m_logDebugSEOInserts - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug seo messages";
m->m_cgi = "ldseo";
m->m_off = (char *)&g_conf.m_logDebugSEO - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug stats messages";
m->m_cgi = "ldst";
m->m_off = (char *)&g_conf.m_logDebugStats - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug summary messages";
m->m_cgi = "ldsu";
m->m_off = (char *)&g_conf.m_logDebugSummary - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug spider messages";
m->m_cgi = "ldspid";
m->m_off = (char *)&g_conf.m_logDebugSpider - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug msg13 messages";
m->m_cgi = "ldspmth";
m->m_off = (char *)&g_conf.m_logDebugMsg13 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "disable host0 for msg13 reception hack";
m->m_cgi = "dmth";
m->m_off = (char *)&g_conf.m_diffbotMsg13Hack - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug spider proxies";
m->m_cgi = "ldspr";
m->m_off = (char *)&g_conf.m_logDebugProxies - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug url attempts";
m->m_cgi = "ldspua";
m->m_off = (char *)&g_conf.m_logDebugUrlAttempts - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug spider downloads";
m->m_cgi = "ldsd";
m->m_off = (char *)&g_conf.m_logDebugDownloads - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug facebook";
m->m_cgi = "ldfb";
m->m_off = (char *)&g_conf.m_logDebugFacebook - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug tagdb messages";
m->m_cgi = "ldtm";
m->m_off = (char *)&g_conf.m_logDebugTagdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug tcp messages";
m->m_cgi = "ldt";
m->m_off = (char *)&g_conf.m_logDebugTcp - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug tcp buffer messages";
m->m_cgi = "ldtb";
m->m_off = (char *)&g_conf.m_logDebugTcpBuf - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug thread messages";
m->m_cgi = "ldth";
m->m_off = (char *)&g_conf.m_logDebugThread - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug title messages";
m->m_cgi = "ldti";
m->m_off = (char *)&g_conf.m_logDebugTitle - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug timedb messages";
m->m_cgi = "ldtim";
m->m_off = (char *)&g_conf.m_logDebugTimedb - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug topic messages";
m->m_cgi = "ldto";
m->m_off = (char *)&g_conf.m_logDebugTopics - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug topDoc messages";
m->m_cgi = "ldtopd";
m->m_off = (char *)&g_conf.m_logDebugTopDocs - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug udp messages";
m->m_cgi = "ldu";
m->m_off = (char *)&g_conf.m_logDebugUdp - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug unicode messages";
m->m_cgi = "ldun";
m->m_off = (char *)&g_conf.m_logDebugUnicode - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug repair messages";
m->m_cgi = "ldre";
m->m_off = (char *)&g_conf.m_logDebugRepair - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug pub date extraction messages";
m->m_cgi = "ldpd";
m->m_off = (char *)&g_conf.m_logDebugDate - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log timing messages for build";
m->m_desc = "Log various timing related messages.";
m->m_cgi = "ltb";
m->m_off = (char *)&g_conf.m_logTimingBuild - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log timing messages for admin";
m->m_desc = "Log various timing related messages.";
m->m_cgi = "ltadm";
m->m_off = (char *)&g_conf.m_logTimingAdmin - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log timing messages for database";
m->m_cgi = "ltd";
m->m_off = (char *)&g_conf.m_logTimingDb - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log timing messages for network layer";
m->m_cgi = "ltn";
m->m_off = (char *)&g_conf.m_logTimingNet - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log timing messages for query";
m->m_cgi = "ltq";
m->m_off = (char *)&g_conf.m_logTimingQuery - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log timing messages for spcache";
m->m_desc = "Log various timing related messages.";
m->m_cgi = "ltspc";
m->m_off = (char *)&g_conf.m_logTimingSpcache - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log timing messages for related topics";
m->m_cgi = "ltt";
m->m_off = (char *)&g_conf.m_logTimingTopics - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log reminder messages";
m->m_desc = "Log reminders to the programmer. You do not need this.";
m->m_cgi = "lr";
m->m_off = (char *)&g_conf.m_logReminders - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
/////
// END PAGE LOG CONTROLS
/////
// END PARMS PARM END PARMS END
m_numParms = m - m_parms;
// sanity check
if ( m_numParms >= MAX_PARMS ) {
log("admin: Boost MAX_PARMS.");
exit(-1);
}
// make xml tag names and store in here
static char s_tbuf [ 18000 ];
char *p = s_tbuf;
char *pend = s_tbuf + 18000;
int32_t size;
char t;
// . set hashes of title
// . used by Statsdb.cpp for identifying a parm
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
if ( ! m_parms[i].m_title ) continue;
m_parms[i].m_hash = hash32n ( m_parms[i].m_title );
}
// cgi hashes
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
if ( ! m_parms[i].m_cgi ) continue;
m_parms[i].m_cgiHash = hash32n ( m_parms[i].m_cgi );
}
// sanity check: ensure all cgi parms are different
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
for ( int32_t j = 0 ; j < m_numParms ; j++ ) {
if ( j == i ) continue;
if ( m_parms[i].m_type == TYPE_BOOL2 ) continue;
if ( m_parms[j].m_type == TYPE_BOOL2 ) continue;
if ( m_parms[i].m_type == TYPE_CMD ) continue;
if ( m_parms[j].m_type == TYPE_CMD ) continue;
if ( m_parms[i].m_type == TYPE_FILEUPLOADBUTTON ) continue;
if ( m_parms[j].m_type == TYPE_FILEUPLOADBUTTON ) continue;
if ( m_parms[i].m_obj == OBJ_NONE ) continue;
if ( m_parms[j].m_obj == OBJ_NONE ) continue;
if ( m_parms[i].m_flags & PF_DUP ) continue;
if ( m_parms[j].m_flags & PF_DUP ) continue;
// hack to allow "c" for search, inject, addurls
if ( m_parms[j].m_page != m_parms[i].m_page &&
m_parms[i].m_obj != OBJ_COLL &&
m_parms[i].m_obj != OBJ_CONF )
continue;
if ( ! m_parms[i].m_cgi ) continue;
if ( ! m_parms[j].m_cgi ) continue;
// gotta be on same page now i guess
int32_t obj1 = m_parms[i].m_obj;
int32_t obj2 = m_parms[j].m_obj;
if ( obj1 != OBJ_COLL && obj1 != OBJ_CONF ) continue;
if ( obj2 != OBJ_COLL && obj2 != OBJ_CONF ) continue;
//if ( m_parms[i].m_page != m_parms[j].m_page ) continue;
// a different m_scmd means a different cgi parm really...
//if ( m_parms[i].m_sparm && m_parms[j].m_sparm &&
// strcmp ( m_parms[i].m_scmd, m_parms[j].m_scmd) != 0 )
// continue;
if ( strcmp ( m_parms[i].m_cgi , m_parms[j].m_cgi ) != 0 &&
// ensure cgi hashes are different as well!
m_parms[i].m_cgiHash != m_parms[j].m_cgiHash )
continue;
// upload file buttons are always dup of another parm
if ( m_parms[j].m_type == TYPE_FILEUPLOADBUTTON )
continue;
log(LOG_LOGIC,"conf: Cgi parm for #%"INT32" \"%s\" "
"matches #%"INT32" \"%s\". Exiting.",
i,m_parms[i].m_cgi,j,m_parms[j].m_cgi);
exit(-1);
}
}
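// e.g. if two parms reachable from the same page both declared the
// cgi name "ldb", an incoming &ldb=... could update the wrong
// variable, so we refuse to start.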
int32_t mm = (int32_t)sizeof(CollectionRec);
if ( (int32_t)sizeof(Conf) > mm ) mm = (int32_t)sizeof(Conf);
if ( (int32_t)sizeof(SearchInput) > mm ) mm = (int32_t)sizeof(SearchInput);
// . set size of each parm based on its type
// . also do page and obj inheritance
// . also do sanity checking
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
// sanity check
if ( m_parms[i].m_off > mm ||
//m_parms[i].m_soff > mm ||
m_parms[i].m_smaxc > mm ) {
log(LOG_LOGIC,"conf: Bad offset in parm #%"INT32" %s."
" (%"INT32",%"INT32",%"INT32"). Did you FORGET to include "
"an & before the cr.myVariable when setting "
"m_off for this parm? Or subtract 'x' instead "
"of 'g' or vice versa.",
i,m_parms[i].m_title,
mm,
m_parms[i].m_off,
//m_parms[i].m_soff,
m_parms[i].m_smaxc);
exit(-1);
}
// do not allow numbers in cgi parms, they are used for
// denoting array indices
int32_t j = 0;
for ( ; m_parms[i].m_cgi && m_parms[i].m_cgi[j] ; j++ ) {
if ( is_digit ( m_parms[i].m_cgi[j] ) ) {
log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has "
"number in cgi name.",
i,m_parms[i].m_title);
exit(-1);
}
}
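// array parms are submitted with a digit suffix denoting the
// occurrence number (e.g. a cgi name of "xyz" would arrive as xyz0,
// xyz1, ...), so a literal digit in a declared cgi name would make
// that suffix ambiguous.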
// inheriting these causes too many problems when moving
// parms around in the array
// inherit page
//if ( i > 0 && m_parms[i].m_page == -1 )
// m_parms[i].m_page = m_parms[i-1].m_page;
// inherit obj
//if ( i > 0 && m_parms[i].m_obj == -1 )
// m_parms[i].m_obj = m_parms[i-1].m_obj;
// sanity now
if ( m_parms[i].m_page == -1 ) {
log("parms: bad page \"%s\"",m_parms[i].m_title);
char *xx=NULL;*xx=0; }
if ( m_parms[i].m_obj == -1 ) {
log("parms: bad obj \"%s\"",m_parms[i].m_title);
char *xx=NULL;*xx=0; }
// if its a fixed size then make sure m_size is not set
if ( m_parms[i].m_fixed > 0 ) {
if ( m_parms[i].m_size != 0 ) {
log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" is "
"fixed but size is not 0.",
i,m_parms[i].m_title);
exit(-1);
}
}
// skip if already set
if ( m_parms[i].m_size ) goto skipSize;
// string sizes should already be set!
size = 0;
t = m_parms[i].m_type;
if ( t == -1 ) {
log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no type.",
i,m_parms[i].m_title);
exit(-1);
}
if ( t == TYPE_CHAR ) size = 1;
if ( t == TYPE_CHAR2 ) size = 1;
if ( t == TYPE_BOOL ) size = 1;
if ( t == TYPE_BOOL2 ) size = 1;
if ( t == TYPE_CHECKBOX ) size = 1;
if ( t == TYPE_PRIORITY ) size = 1;
if ( t == TYPE_PRIORITY2 ) size = 1;
//if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( t == TYPE_UFP ) size = 1;
if ( t == TYPE_PRIORITY_BOXES ) size = 1;
if ( t == TYPE_RETRIES ) size = 1;
if ( t == TYPE_TIME ) size = 6;
if ( t == TYPE_DATE2 ) size = 4;
if ( t == TYPE_DATE ) size = 4;
if ( t == TYPE_FLOAT ) size = 4;
if ( t == TYPE_DOUBLE ) size = 8;
if ( t == TYPE_IP ) size = 4;
if ( t == TYPE_RULESET ) size = 4;
if ( t == TYPE_LONG ) size = 4;
if ( t == TYPE_LONG_CONST ) size = 4;
if ( t == TYPE_LONG_LONG ) size = 8;
if ( t == TYPE_STRING ) size = m_parms[i].m_size;
if ( t == TYPE_STRINGBOX ) size = m_parms[i].m_size;
if ( t == TYPE_STRINGNONEMPTY ) size = m_parms[i].m_size;
if ( t == TYPE_SITERULE ) size = 4;
// comments and commands do not control underlying variables
if ( size == 0 && t != TYPE_COMMENT && t != TYPE_CMD &&
t != TYPE_SAFEBUF &&
t != TYPE_FILEUPLOADBUTTON &&
t != TYPE_CONSTANT &&
t != TYPE_CHARPTR &&
t != TYPE_MONOD2 &&
t != TYPE_MONOM2 ) {
log(LOG_LOGIC,"conf: Size of parm #%"INT32" \"%s\" "
"not set.", i,m_parms[i].m_title);
exit(-1);
}
m_parms[i].m_size = size;
skipSize:
// check offset
if ( m_parms[i].m_obj == OBJ_NONE ) continue;
if ( t == TYPE_COMMENT ) continue;
if ( t == TYPE_FILEUPLOADBUTTON ) continue;
if ( t == TYPE_CMD ) continue;
if ( t == TYPE_CONSTANT ) continue;
if ( t == TYPE_MONOD2 ) continue;
if ( t == TYPE_MONOM2 ) continue;
if ( t == TYPE_SAFEBUF ) continue;
// search parms do not need an offset
if ( m_parms[i].m_off == -1 ){//&& m_parms[i].m_sparm == 0 ) {
log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no offset.",
i,m_parms[i].m_title);
exit(-1);
}
if ( m_parms[i].m_off < -1 ) {
log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has bad offset "
"of %"INT32".", i,m_parms[i].m_title,m_parms[i].m_off);
exit(-1);
}
// note: "m" points one past the last parm here, so index by i
if ( m_parms[i].m_obj == OBJ_CONF &&
m_parms[i].m_off >= (int32_t)sizeof(Conf) ) {
log("admin: Parm %s has bad m_off value.",m_parms[i].m_title);
char *xx = NULL; *xx = 0;
}
if ( m_parms[i].m_obj == OBJ_COLL &&
m_parms[i].m_off >= (int32_t)sizeof(CollectionRec) ) {
log("admin: Parm %s has bad m_off value.",m_parms[i].m_title);
char *xx = NULL; *xx = 0;
}
if ( m_parms[i].m_off >= 0 &&
m_parms[i].m_obj == OBJ_SI &&
m_parms[i].m_off >= (int32_t)sizeof(SearchInput) ) {
log("admin: Parm %s has bad m_off value.",m_parms[i].m_title);
char *xx = NULL; *xx = 0;
}
if ( m_parms[i].m_page == -1 ) {
log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no page.",
i,m_parms[i].m_title);
exit(-1);
}
if ( m_parms[i].m_obj == -1 ) {
log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no object.",
i,m_parms[i].m_title);
exit(-1);
}
//if ( ! m_parms[i].m_title[0] ) {
// log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no title.",
// i,m_parms[i].m_cgi);
// exit(-1);
//}
// continue if already have the xml name
if ( m_parms[i].m_xml ) continue;
// set xml based on title
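// e.g. a title of "log query time threshold" yields the xml tag
// "logQueryTimeThreshold": non-alphanumerics are dropped and a
// letter that follows a space is uppercased.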
char *tt = m_parms[i].m_title;
if ( p + gbstrlen(tt) >= pend ) {
log(LOG_LOGIC,"conf: Not enough room to store xml "
"tag name in buffer.");
exit(-1);
}
m_parms[i].m_xml = p;
for ( int32_t k = 0 ; tt[k] ; k++ ) {
if ( ! is_alnum_a(tt[k]) ) continue;
if ( k > 0 && tt[k-1]==' ') *p++ = to_upper_a(tt[k]);
else *p++ = tt[k];
}
*p++ = '\0';
}
// set m_searchParms
int32_t n = 0;
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
//if ( ! m_parms[i].m_sparm ) continue;
if ( m_parms[i].m_obj != OBJ_SI ) continue;
m_searchParms[n++] = &m_parms[i];
// sanity check
if ( m_parms[i].m_off == -1 ) {
log(LOG_LOGIC,"conf: SEARCH Parm #%"INT32" \"%s\" has "
"m_off < 0 (offset into SearchInput).",
i,m_parms[i].m_title);
exit(-1);
}
}
m_numSearchParms = n;
// . sanity check
// . we should have it all covered!
si.test();
//
// parm overlap detector
//
// . fill in each parm's buffer with byte #b
// . inc b for each parm
#ifndef _VALGRIND_
overlapTest(+1);
overlapTest(-1);
#endif
}
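// . overlapTest() below paints each parm's backing bytes in a set of
//   temporary objects with a byte derived from the parm's index, then
//   re-reads every byte to verify that no other parm's paint pass
//   clobbered it. it is run with step=+1 and step=-1 so an overlap is
//   caught no matter which of the two offending parms is painted last.
// . a minimal standalone sketch of the same idea (hypothetical struct,
//   not part of gb):
//     struct Rec { int32_t a; int32_t b; };
//     Rec r;
//     memset ( &r.a , 0x01 , sizeof(r.a) ); // paint field #1
//     memset ( &r.b , 0x02 , sizeof(r.b) ); // paint field #2
//     // if any byte of r.a no longer reads 0x01 then field #2's
//     // declared offset/size overlaps field #1
//     for ( int32_t j = 0 ; j < (int32_t)sizeof(r.a) ; j++ )
//         if ( ((char *)&r.a)[j] != 0x01 ) { /* overlap */ }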
void Parms::overlapTest ( char step ) {
int32_t start = 0;
if ( step == -1 ) start = m_numParms - 1;
//log("conf: Using step=%"INT32"",(int32_t)step);
SearchInput tmpsi;
GigablastRequest tmpgr;
InjectionRequest tmpir;
CollectionRec tmpcr;
Conf tmpconf;
char b;
char *p1 , *p2;
int32_t i;
// sanity check: ensure parms do not overlap
for ( i = start ; i < m_numParms && i >= 0 ; i += step ) {
// skip comments
if ( m_parms[i].m_type == TYPE_COMMENT ) continue;
if ( m_parms[i].m_type == TYPE_FILEUPLOADBUTTON ) continue;
// skip if it is a broadcast switch, like "all spiders on"
// because that modifies another parm, "spidering enabled"
if ( m_parms[i].m_type == TYPE_BOOL2 ) continue;
if ( m_parms[i].m_type == TYPE_SAFEBUF ) continue;
// we use cr->m_spideringEnabled for PAGE_BASIC_SETTINGS too!
if ( m_parms[i].m_flags & PF_DUP ) continue;
p1 = NULL;
if ( m_parms[i].m_obj == OBJ_COLL ) p1 = (char *)&tmpcr;
if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;
if ( m_parms[i].m_obj == OBJ_SI ) p1 = (char *)&tmpsi;
if ( m_parms[i].m_obj == OBJ_GBREQUEST ) p1 = (char *)&tmpgr;
if ( m_parms[i].m_obj == OBJ_IR ) p1 = (char *)&tmpir;
if ( p1 ) p1 += m_parms[i].m_off;
p2 = NULL;
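// note: p2 is never reassigned below, so the p2 memset/verify paths
// are effectively dead; this looks like a holdover from when search
// parms kept a separate shadow buffer.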
int32_t size = m_parms[i].m_size;
// use i now
b = (char)i;
// string box type is a pointer!!
if ( p1 ) memset ( p1 , b , size );
//log("conf: setting %"INT32" bytes for %s at 0x%"XINT32" char=0x%hhx",
// size,m_parms[i].m_title,(int32_t)p1,b);
// search input uses character ptrs!!
if ( m_parms[i].m_type == TYPE_STRINGBOX ) size = 4;
if ( m_parms[i].m_type == TYPE_STRING ) size = 4;
if ( m_parms[i].m_fixed > 0 ) size *= m_parms[i].m_fixed ;
if ( p2 ) memset ( p2 , b , size );
//log("conf: setting %"INT32" bytes for %s at 0x%"XINT32" char=0x%hhx "
// "i=%"INT32"", size,m_parms[i].m_title,(int32_t)p2,b,i);
}
//
// now make sure they are the same
//
if ( step == -1 ) b--;
else b = 0;
char *objStr = "none";
int32_t obj;
char infringerB;
int32_t j;
int32_t savedi = -1;
for ( i = 0 ; i < m_numParms ; i++ ) {
// skip comments
if ( m_parms[i].m_type == TYPE_COMMENT ) continue;
if ( m_parms[i].m_type == TYPE_FILEUPLOADBUTTON ) continue;
// skip if it is a broadcast switch, like "all spiders on"
// because that modifies another parm, "spidering enabled"
if ( m_parms[i].m_type == TYPE_BOOL2 ) continue;
if ( m_parms[i].m_type == TYPE_SAFEBUF ) continue;
// we use cr->m_spideringEnabled for PAGE_BASIC_SETTINGS too!
if ( m_parms[i].m_flags & PF_DUP ) continue;
p1 = NULL;
if ( m_parms[i].m_obj == OBJ_COLL ) p1 = (char *)&tmpcr;
if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;
if ( m_parms[i].m_obj == OBJ_SI ) p1 = (char *)&tmpsi;
if ( m_parms[i].m_obj == OBJ_GBREQUEST ) p1 = (char *)&tmpgr;
if ( m_parms[i].m_obj == OBJ_IR ) p1 = (char *)&tmpir;
if ( p1 ) p1 += m_parms[i].m_off;
p2 = NULL;
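		// note: p2 is never assigned here either, so the p2 scan
		// below never runs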
int32_t size = m_parms[i].m_size;
b = (char) i;
// save it
obj = m_parms[i].m_obj;
//log("conf: testing %"INT32" bytes for %s at 0x%"XINT32" char=0x%hhx "
// "i=%"INT32"", size,m_parms[i].m_title,(int32_t)p1,b,i);
for ( j = 0 ; p1 && j < size ; j++ ) {
if ( p1[j] == b ) continue;
// this has multiple parms pointing to it!
//if ( m_parms[i].m_type == TYPE_BOOL2 ) continue;
// or special cases...
//if ( p1 == (char *)&tmpconf.m_spideringEnabled )
// continue;
// set object type
objStr = "??????";
if ( m_parms[i].m_obj == OBJ_COLL )
objStr = "CollectionRec.h";
if ( m_parms[i].m_obj == OBJ_CONF )
objStr = "Conf.h";
if ( m_parms[i].m_obj == OBJ_SI )
objStr = "SearchInput.h";
if ( m_parms[i].m_obj == OBJ_GBREQUEST )
objStr = "GigablastRequest/Parms.h";
if ( m_parms[i].m_obj == OBJ_IR )
objStr = "InjectionRequest/PageInject.h";
// save it
infringerB = p1[j];
savedi = i;
goto error;
}
// search input uses character ptrs!!
if ( m_parms[i].m_type == TYPE_STRINGBOX ) size = 4;
if ( m_parms[i].m_type == TYPE_STRING ) size = 4;
if ( m_parms[i].m_fixed > 0 ) size *= m_parms[i].m_fixed ;
objStr = "SearchInput.h";
//log("conf: testing %"INT32" bytes for %s at 0x%"XINT32" char=0x%hhx "
// "i=%"INT32"", size,m_parms[i].m_title,(int32_t)p2,b,i);
for ( j = 0 ; p2 && j < size ; j++ ) {
if ( p2[j] == b ) continue;
// save it
infringerB = p2[j];
savedi = i;
log("conf: got b=0x%hhx when it should have been "
"b=0x%hhx",p2[j],b);
goto error;
}
}
return;
error:
log("conf: Had a parm value collision. Parm #%"INT32" "
"\"%s\" (size=%"INT32") in %s has overlapped with another parm. "
"Your TYPE_* for this parm or a neighbor of it "
"does not agree with what you have declared it as in the *.h "
"file.",i,m_parms[i].m_title,m_parms[i].m_size,objStr);
if ( step == -1 ) b--;
else b = 0;
// show possible parms that could have overwritten it!
for ( i = start ; i < m_numParms && i >= 0 ; i += step ) {
//char *p1 = NULL;
//if ( m_parms[i].m_obj == OBJ_COLL ) p1 = (char *)&tmpcr;
//if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;
// skip if comment
if ( m_parms[i].m_type == TYPE_COMMENT ) continue;
if ( m_parms[i].m_type == TYPE_FILEUPLOADBUTTON ) continue;
if ( m_parms[i].m_flags & PF_DUP ) continue;
if ( m_parms[i].m_obj != m_parms[savedi].m_obj ) continue;
// skip if no match
//bool match = false;
//if ( m_parms[i].m_obj == obj ) match = true;
//if ( m_parms[i].m_sparm &&
// NOTE: these need to be fixed!!!
b = (char) i;
if ( b == infringerB )
log("conf: possible overlap with parm #%"INT32" in %s "
"\"%s\" (size=%"INT32") "
"xml=%s "
"desc=\"%s\"",
i,objStr,m_parms[i].m_title,
m_parms[i].m_size,
m_parms[i].m_xml,
m_parms[i].m_desc);
}
log("conf: try including \"m->m_obj = OBJ_COLL;\" or "
"\"m->m_obj = OBJ_CONF;\" in your parm definitions");
log("conf: failed overlap test. exiting.");
exit(-1);
}
bool Parm::getValueAsBool ( SearchInput *si ) {
if ( m_obj != OBJ_SI ) { char *xx=NULL;*xx=0; }
char *p = (char *)si + m_off;
return *(bool *)p;
}
int32_t Parm::getValueAsLong ( SearchInput *si ) {
if ( m_obj != OBJ_SI ) { char *xx=NULL;*xx=0; }
char *p = (char *)si + m_off;
return *(int32_t *)p;
}
char *Parm::getValueAsString ( SearchInput *si ) {
if ( m_obj != OBJ_SI ) { char *xx=NULL;*xx=0; }
char *p = (char *)si + m_off;
return *(char **)p;
}
/////////
//
// new functions
//
/////////
bool Parms::addNewParmToList1 ( SafeBuf *parmList ,
collnum_t collnum ,
char *parmValString ,
int32_t occNum ,
char *parmName ) {
// get the parm descriptor
Parm *m = getParmFast1 ( parmName , NULL );
if ( ! m ) return log("parms: got bogus parm2 %s",parmName );
return addNewParmToList2 ( parmList,collnum,parmValString,occNum,m );
}
// . make a parm rec using the provided string
// . used to convert http requests into a parmlist
// . string could be a float or int32_t or int64_t in ascii, as well as a string
// . returns false w/ g_errno set on error
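// . rec layout: key96_t key, then int32_t dataSize, then the data bytes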
bool Parms::addNewParmToList2 ( SafeBuf *parmList ,
collnum_t collnum ,
char *parmValString ,
int32_t occNum ,
Parm *m ) {
// get value
char *val = NULL;
int32_t valSize = 0;
//char buf[2+MAX_COLL_LEN];
int32_t val32;
int64_t val64;
char val8;
float valf;
/*
char *obj = NULL;
// we might be adding a collnum if a collection that is being
// added via the CommandAddColl0() "addColl" or "addCrawl" or
// "addBulk" commands. they will reserve the collnum, so it might
// not be ready yet.
if ( collnum != -1 ) {
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( cr ) obj = (char *)cr;
// log("parms: no coll rec for %"INT32"",(int32_t)collnum);
// return false;
//}
//obj = (char *)cr;
}
else {
obj = (char *)&g_conf;
}
*/
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
// point to string
//val = obj + m->m_off;
// Parm::m_size is the max string size
//if ( occNum > 0 ) val += occNum * m->m_size;
// stringlength + 1. no just make it the whole string in
// case it does not use the \0 protocol
//valSize = m->m_max;
val = parmValString;
// include \0
valSize = gbstrlen(val)+1;
// sanity
if ( val[valSize-1] != '\0' ) { char *xx=NULL;*xx=0; }
}
else if ( m->m_type == TYPE_LONG ) {
		// watch out for unsigned 32-bit numbers, so use atoll()
		// and copy just the low 4 bytes (assumes little-endian)
val64 = atoll(parmValString);
val = (char *)&val64;
valSize = 4;
}
else if ( m->m_type == TYPE_FLOAT ) {
valf = atof(parmValString);
val = (char *)&valf;
valSize = 4;
}
else if ( m->m_type == TYPE_LONG_LONG ) {
val64 = atoll(parmValString);
val = (char *)&val64;
valSize = 8;
}
else if ( m->m_type == TYPE_BOOL ||
m->m_type == TYPE_BOOL2 ||
m->m_type == TYPE_CHECKBOX ||
m->m_type == TYPE_PRIORITY2 ||
m->m_type == TYPE_UFP ||
m->m_type == TYPE_CHAR ) {
val8 = atol(parmValString);
//if ( parmValString && to_lower_a(parmValString[0]) == 'y' )
// val8 = 1;
//if ( parmValString && to_lower_a(parmValString[0]) == 'n' )
// val8 = 0;
val = (char *)&val8;
valSize = 1;
}
// for resetting or restarting a coll i think the ascii arg is
// the NEW reserved collnum, but for other commands then parmValString
// will be NULL
else if ( m->m_type == TYPE_CMD ) {
val = parmValString;
if ( val ) valSize = gbstrlen(val)+1;
// . addcoll collection can not be too long
// . TODO: supply a Parm::m_checkValFunc to ensure val is
// legitimate, and set g_errno on error
if ( strcmp(m->m_cgi,"addcoll") == 0 &&valSize-1>MAX_COLL_LEN){
log("admin: addcoll coll too long");
g_errno = ECOLLTOOBIG;
return false;
}
// scan for holes if we hit the limit
//if ( g_collectiondb.m_numRecs >= 1LL>>sizeof(collnum_t) )
}
else if ( m->m_type == TYPE_IP ) {
// point to string
//val = obj + m->m_off;
// Parm::m_size is the max string size
//if ( occNum > 0 ) val += occNum * m->m_size;
// stringlength + 1. no just make it the whole string in
// case it does not use the \0 protocol
val32 = atoip(parmValString);
// store ip in binary format
val = (char *)&val32;
valSize = 4;
}
else {
log("parms: shit unsupported parm type");
char *xx=NULL;*xx=0;
}
key96_t key = makeParmKey ( collnum , m , occNum );
// then key
if ( ! parmList->safeMemcpy ( &key , sizeof(key) ) )
return false;
// datasize
if ( ! parmList->pushLong ( valSize ) )
return false;
// and data
if ( val && valSize && ! parmList->safeMemcpy ( val , valSize ) )
return false;
return true;
}
// g_parms.addCurrentParmToList1 ( &parmList , cr , "spiderRoundNum" );
bool Parms::addCurrentParmToList1 ( SafeBuf *parmList ,
CollectionRec *cr ,
char *parmName ) {
collnum_t collnum = -1;
if ( cr ) collnum = cr->m_collnum;
// get the parm descriptor
int32_t occNum;
Parm *m = getParmFast1 ( parmName , &occNum );
if ( ! m ) return log("parms: got bogus parm1 %s",parmName );
return addCurrentParmToList2 ( parmList , collnum, -1 , m );
}
// . use the current value of the parm to make this record
// . parm class itself already helps us reference the binary parm value
bool Parms::addCurrentParmToList2 ( SafeBuf *parmList ,
collnum_t collnum ,
int32_t occNum ,
Parm *m ) {
char *obj = NULL;
if ( collnum != -1 ) {
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) return false;
obj = (char *)cr;
}
else {
obj = (char *)&g_conf;
}
char *data = obj + m->m_off;
// Parm::m_size is the max string size
int32_t dataSize = m->m_size;
if ( occNum > 0 ) data += occNum * m->m_size;
	if ( m->m_type == TYPE_STRING ||
	     m->m_type == TYPE_STRINGBOX ||
	     // TYPE_SAFEBUF is handled just below; here "data" points to
	     // the SafeBuf object itself, not a string, so don't gbstrlen it
	     m->m_type == TYPE_STRINGNONEMPTY )
		// include \0 in string
		dataSize = gbstrlen(data) + 1;
// if a safebuf, point to the string within
if ( m->m_type == TYPE_SAFEBUF ) {
SafeBuf *sb = (SafeBuf *)data;
data = sb->getBufStart();
dataSize = sb->length();
// sanity
if ( dataSize > 0 && !data[dataSize-1]){char *xx=NULL;*xx=0;}
// include the \0 since we do it for strings above
if ( dataSize > 0 ) dataSize++;
// empty? make it \0 then to be like strings i guess
if ( dataSize == 0 ) {
data = "\0";
dataSize = 1;
}
// sanity check
if ( dataSize > 0 && data[dataSize-1] ) {char *xx=NULL;*xx=0;}
// if just a \0 then make it empty
//if ( dataSize && !data[0] ) {
// data = NULL;
// dataSize = 0;
//}
}
//int32_t occNum = -1;
key96_t key = makeParmKey ( collnum , m , occNum );
/*
// debug it
log("parms: adding parm collnum=%i title=%s "
"key=%s datasize=%i data=%s hash=%"UINT32
,(int)collnum
,m->m_title
,KEYSTR(&key,sizeof(key))
,(int)dataSize
,data
,(uint32_t)hash32(data,dataSize));
*/
// then key
if ( ! parmList->safeMemcpy ( &key , sizeof(key) ) )
return false;
// size
if ( ! parmList->pushLong ( dataSize ) )
return false;
// and data
if ( dataSize && ! parmList->safeMemcpy ( data , dataSize ) )
return false;
return true;
}
// returns false and sets g_errno on error
bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
int32_t page , TcpSocket *sock ) {
// false = useDefaultRec?
CollectionRec *cr = g_collectiondb.getRec ( hr , false );
//if ( c ) {
// cr = g_collectiondb.getRec ( hr );
// if ( ! cr ) log("parms: coll not found");
//}
bool isMasterAdmin = g_conf.isMasterAdmin ( sock , hr );
// does this user have permission to update the parms?
bool isCollAdmin = g_conf.isCollAdmin ( sock , hr ) ;
// might be g_conf specific, not coll specific
//bool hasPerm = false;
// just knowing the collection name of a custom crawl means you
// know the token, so you have permission
//if ( cr && cr->m_isCustomCrawl ) hasPerm = true;
//if ( hr->isLocal() ) hasPerm = true;
// fix jenkins "GET /v2/crawl?token=crawlbottesting" request
char *name = hr->getString("name");
char *token = hr->getString("token");
//if ( ! cr && token ) hasPerm = true;
//if ( ! hasPerm ) {
// //log("parms: no permission to set parms");
// //g_errno = ENOPERM;
// //return false;
// // just leave the parm list empty and fail silently
// return true;
//}
// we set the parms in this collnum
collnum_t parmCollnum = -1;
if ( cr ) parmCollnum = cr->m_collnum;
// turn the collnum into an ascii string for providing as args
// when &reset=1 &restart=1 &delete=1 is given along with a
// &c= or a &name=/&token= pair.
char oldCollName[MAX_COLL_LEN+1];
oldCollName[0] = '\0';
if ( cr ) sprintf(oldCollName,"%"INT32"",(int32_t)cr->m_collnum);
////////
//
// HACK: if crawlbot user supplies a token, name, and seeds, and the
// corresponding collection does not exist then assume it is an add
//
////////
char customCrawl = 0;
char *path = hr->getPath();
// i think /crawlbot is only used by me to see PageCrawlBot.cpp
// so don't bother...
if ( strncmp(path,"/crawlbot",9) == 0 ) customCrawl = 0;
if ( strncmp(path,"/v2/crawl",9) == 0 ) customCrawl = 1;
if ( strncmp(path,"/v2/bulk" ,8) == 0 ) customCrawl = 2;
if ( strncmp(path,"/v3/crawl",9) == 0 ) customCrawl = 1;
if ( strncmp(path,"/v3/bulk" ,8) == 0 ) customCrawl = 2;
// throw error if collection record custom crawl type doesn't equal
// the crawl type of current request
if (cr && customCrawl && customCrawl != cr->m_isCustomCrawl ) {
g_errno = ECUSTOMCRAWLMISMATCH;
return false;
}
bool hasAddCrawl = hr->hasField("addCrawl");
bool hasAddBulk = hr->hasField("addBulk");
bool hasAddColl = hr->hasField("addColl");
// sometimes they try to delete a collection that is not there so do
// not apply this logic in that case!
bool hasDelete = hr->hasField("delete");
bool hasRestart = hr->hasField("restart");
bool hasReset = hr->hasField("reset");
bool hasSeeds = hr->hasField("seeds");
// check for bulk jobs as well
if ( ! hasSeeds ) hasSeeds = hr->hasField("urls");
if ( ! cr &&
token &&
name &&
customCrawl &&
hasSeeds &&
! hasDelete &&
! hasRestart &&
! hasReset &&
! hasAddCrawl &&
! hasAddBulk &&
! hasAddColl ) {
// reserve a new collnum for adding this crawl
parmCollnum = g_collectiondb.reserveCollNum();
// must be there!
if ( parmCollnum == -1 ) {
g_errno = EBADENGINEER;
return false;
}
// log it for now
log("parms: trying to add custom crawl (%"INT32")",
(int32_t)parmCollnum);
// formulate name
char newName[MAX_COLL_LEN+1];
snprintf(newName,MAX_COLL_LEN,"%s-%s",token,name);
char *cmdStr = "addCrawl";
if ( customCrawl == 2 ) cmdStr = "addBulk";
// add to parm list
if ( ! addNewParmToList1 ( parmList ,
parmCollnum ,
newName ,
-1 , // occNum
cmdStr ) )
return false;
}
// loop through cgi parms
for ( int32_t i = 0 ; i < hr->getNumFields() ; i++ ) {
// get cgi parm name
char *field = hr->getField ( i );
// get value of the cgi field
char *val = hr->getValue (i);
// convert field to parm
int32_t occNum;
// parm names can be shared across pages, like "c"
// for search, addurl, inject, etc.
Parm *m = getParmFast1 ( field , &occNum );
if ( ! m ) continue;
// skip if not a command parm, like "addcoll"
if ( m->m_type != TYPE_CMD ) continue;
if ( m->m_obj != OBJ_CONF && m->m_obj != OBJ_COLL )
continue;
//
// HACK
//
// if its a resetcoll/restartcoll/addcoll we have to
// get the next available collnum and use that for setting
// any additional parms. that is the coll it will act on.
if ( strcmp(m->m_cgi,"addColl") == 0 ||
// lowercase support. camelcase is obsolete.
strcmp(m->m_cgi,"addcoll") == 0 ||
strcmp(m->m_cgi,"addCrawl") == 0 ||
strcmp(m->m_cgi,"addBulk" ) == 0 ||
strcmp(m->m_cgi,"reset" ) == 0 ||
strcmp(m->m_cgi,"restart" ) == 0 ) {
// if we wanted to we could make the data the
// new parmCollnum since we already store the old
// collnum in the parm rec key
parmCollnum = g_collectiondb.reserveCollNum();
//
//
// NOTE: the old collnum is in the "val" already
// like "&reset=462" or "&addColl=test"
//
//
// sanity. if all are full! we hit our limit of
// 32k collections. should increase collnum_t from
// int16_t to int32_t...
if ( parmCollnum == -1 ) {
g_errno = EBADENGINEER;
return false;
}
}
// . DIFFBOT HACK: so ppl can manually restart a spider round
// . val can be 0 or 1 or anything. i.e. roundStart=0 works.
// . map this parm to another parm with the round start
// time (current time) and the new round # as the args.
// . this will call CommandForceNextSpiderRound() function
// on every shard with these args, "tmpVal".
if ( cr && strcmp(m->m_cgi,"roundStart") == 0 ) {
// use the current time so anything spidered before
// this time (the round start time) will be respidered
//sprintf(tmp,"%"UINT32"",getTimeGlobalNoCore());
//val = tmp;
char tmpVal[64];
// use the same round start time for all shards
sprintf(tmpVal,
"%"UINT32",%"INT32""
,(uint32_t)getTimeGlobalNoCore()
,cr->m_spiderRoundNum+1
);
// . also add command to reset crawl/process counts
// so if you hit maxToProcess/maxToCrawl it will
// not stop the round from restarting
// . CommandResetCrawlCounts()
if ( ! addNewParmToList1 ( parmList ,
parmCollnum ,
tmpVal, // a string
0 , // occNum (for arrays)
"forceround" ) )
return false;
// don't bother going below
continue;
}
// if a collection name was also provided, assume that is
// the target of the reset/delete/restart. we still
// need PageAddDelete.cpp to work...
if ( cr &&
( strcmp(m->m_cgi,"reset" ) == 0 ||
strcmp(m->m_cgi,"delete" ) == 0 ||
strcmp(m->m_cgi,"restart" ) == 0 ) )
// the collnum to reset/restart/del
// given as a string.
val = oldCollName;
//
// CLOUD SEARCH ENGINE SUPPORT
//
//
// if this is the "delcoll" parm then "c" may have been
// excluded from http request, therefore isCollAdmin and
// isMasterAdmin may be false, so see if they have permission
// for the "val" collection for this one...
bool hasPerm = false;
if ( m->m_page == PAGE_DELCOLL &&
strcmp(m->m_cgi,"delcoll") == 0 ) {
// permission override for /admin/delcoll cmd & parm
hasPerm = g_conf.isCollAdminForColl (sock,hr,val);
}
// if this IP c-block as already added a collection then do not
// allow it to add another.
if ( m->m_page == PAGE_ADDCOLL &&
g_conf.m_allowCloudUsers &&
! isMasterAdmin &&
strcmp(m->m_cgi,"addcoll")==0 ) {
// see if user's c block has already added a collection
int32_t numAdded = 0;
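		// TODO: numAdded is never incremented here, so the limit
		// check below never actually triggers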
if ( numAdded >= 1 ) {
g_errno = ENOPERM;
log("parms: already added a collection from "
"this cloud user's c-block.");
return false;
}
hasPerm = true;
}
// master controls require root permission
if ( m->m_obj == OBJ_CONF && ! isMasterAdmin ) {
log("parms: could not run root parm \"%s\" no perm.",
m->m_title);
continue;
}
// need to have permission for collection for collrec parms
if ( m->m_obj == OBJ_COLL && ! isCollAdmin && ! hasPerm ) {
log("parms: could not run coll parm \"%s\" no perm.",
m->m_title);
continue;
}
// add the cmd parm
if ( ! addNewParmToList2 ( parmList ,
// it might be a collection-less
// command like 'gb stop' which
// uses the "save=1" parm.
// this is the "new" collnum to
// create in the case of
// add/reset/restart, but in the
// case of delete it is -1 or old.
parmCollnum ,
// the argument to the function...
// in the case of delete, the
// collnum to delete in ascii.
// in the case of add, the name
// of the new coll. in the case
// of reset/restart the OLD
// collnum is ascii to delete.
val,
occNum ,
m ) )
return false;
}
// if we are one page url filters, turn off all checkboxes!
// html should really transmit them as =0 if they are unchecked!!
// "fe" is a url filter expression for the first row.
//if ( hr->hasField("fe") && page == PAGE_FILTERS && cr ) {
// for ( int32_t i = 0 ; i < cr->m_numRegExs ; i++ ) {
// //cr->m_harvestLinks [i] = 0;
// //cr->m_spidersEnabled[i] = 0;
// if ( ! addNewParmToList2 ( parmList ,
// cr->m_collnum,
// "0",
// i,
// }
//}
//
// CLOUD SEARCH ENGINE SUPPORT
//
// provide userip so when adding a new collection we can
// store it in the collection rec to ensure that the same
// IP address cannot add more than one collection.
//
if ( sock && page == PAGE_ADDCOLL ) {
char *ipStr = iptoa(sock->m_ip);
int32_t occNum;
Parm *um = getParmFast1 ( "userip" , &occNum); // NULL = occNum
if ( ! addNewParmToList2 ( parmList ,
// HACK! operate on the to-be-added
// collrec, if there was an addcoll
// reset or restart coll cmd...
parmCollnum ,
ipStr, // val ,
occNum ,
um ) )
return false;
}
//
// now add the parms that are NOT commands
//
// loop through cgi parms
for ( int32_t i = 0 ; i < hr->getNumFields() ; i++ ) {
// get cgi parm name
char *field = hr->getField ( i );
// get value of the cgi field
char *val = hr->getValue (i);
// get the occurence # if its regex. this is the row #
// in the url filters table, since those parms repeat names.
// url filter expression.
//if ( strcmp(field,"fe") == 0 ) occNum++;
// convert field to parm
int32_t occNum;
Parm *m = getParmFast1 ( field , &occNum );
//
// map "pause" to spidering enabled
//
if ( strcmp(field,"pause" ) == 0 ||
strcmp(field,"pauseCrawl") == 0 ) {
m = getParmFast1 ( "cse", &occNum);
if ( val && val[0] == '0' ) val = "1";
else if ( val && val[0] == '1' ) val = "0";
if ( ! m ) { char *xx=NULL;*xx=0; }
}
if ( ! m ) continue;
// skip if IS a command parm, like "addcoll", we did that above
if ( m->m_type == TYPE_CMD )
continue;
if ( m->m_obj != OBJ_CONF && m->m_obj != OBJ_COLL )
continue;
//
// CLOUD SEARCH ENGINE SUPPORT
//
// master controls require root permission. otherwise, just
// knowing the collection name is enough for a cloud user
// to change settings.
//
bool hasPerm = false;
// master controls require root permission
if ( m->m_obj == OBJ_CONF && ! isMasterAdmin ) {
log("parms: could not set root parm \"%s\" no perm.",
m->m_title);
continue;
}
// need to have permission for collection for collrec parms
if ( m->m_obj == OBJ_COLL && ! isCollAdmin && ! hasPerm ) {
log("parms: could not set coll parm \"%s\" no perm.",
m->m_title);
continue;
}
// convert spiderRoundStartTime=0 (roundStart=0 roundStart=1)
// to spiderRoundStartTime=<currenttime>+30secs
// so that will force the next spider round to kick in
/*
bool restartRound = false;
char tmp[24];
if ( strcmp(field,"roundStart")==0 &&
val && (val[0]=='0'||val[0]=='1') && val[1]==0 )
sprintf(tmp,"%"UINT32"",(int32_t)getTimeGlobalNoCore()+0);
val = tmp;
}
*/
// add it to a list now
if ( ! addNewParmToList2 ( parmList ,
// HACK! operate on the to-be-added
// collrec, if there was an addcoll
// reset or restart coll cmd...
parmCollnum ,
val ,
occNum ,
m ) )
return false;
}
return true;
}
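// . look up a parm by the 32-bit hash of its cgi name
// . on first call this builds a static cgi-hash -> Parm* table of all
//   conf/coll parms and cores if two distinct parms share a cgi hash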
Parm *Parms::getParmFast2 ( int32_t cgiHash32 ) {
static HashTableX s_pht;
static char s_phtBuf[26700];
static bool s_init = false;
if ( ! s_init ) {
// init hashtable
s_pht.set ( 4,sizeof(char *),2048,s_phtBuf,26700,
false,0,"phttab" );
// reduce hash collisions:
s_pht.m_useKeyMagic = true;
// wtf?
if ( m_numParms <= 0 ) init();
if ( m_numParms <= 0 ) { char *xx=NULL;*xx=0; }
// fill up hashtable
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
// get it
Parm *parm = &m_parms[i];
// skip parms that are not for conf or coll lest
// it bitch that "c" is duplicated...
if ( parm->m_obj != OBJ_CONF &&
parm->m_obj != OBJ_COLL )
continue;
// skip comments
if ( parm->m_type == TYPE_COMMENT ) continue;
if ( parm->m_type == TYPE_FILEUPLOADBUTTON ) continue;
// skip if no cgi
if ( ! parm->m_cgi ) continue;
// get its hash of its cgi
int32_t ph32 = parm->m_cgiHash;
// sanity!
if ( s_pht.isInTable ( &ph32 ) ) {
// get the dup guy
Parm *dup = *(Parm **)s_pht.getValue(&ph32);
// same underlying parm?
// like for "all spiders on" vs.
// "all spiders off"?
if ( dup->m_off == parm->m_off )
continue;
// otherwise bitch about it and drop core
log("parms: dup parm h32=%"INT32" "
"\"%s\" vs \"%s\"",
ph32, dup->m_title,parm->m_title);
char *xx=NULL;*xx=0;
}
// add that to hash table
s_pht.addKey ( &ph32 , &parm );
}
// do not do this again
s_init = true;
}
Parm **pp = (Parm **)s_pht.getValue ( &cgiHash32 );
if ( ! pp ) return NULL;
return *pp;
}
Parm *Parms::getParmFast1 ( char *cgi , int32_t *occNum ) {
	// strip off any trailing digits, like the 3 in 'fe3', because
	// that is the occurrence # for parm arrays (e.g. "fe3" means
	// row 3 of the "fe" url filter expression array)
int32_t clen = gbstrlen(cgi);
	char *d = NULL;
	if ( clen > 1 ) {
		d = cgi + clen - 1;
		// don't scan past the start of the string if the whole
		// field name is digits
		while ( d > cgi && is_digit(*d) ) d--;
		d++;
	}
int32_t h32;
// assume not an array
if ( occNum ) *occNum = -1;
if ( d && *d ) {
if ( occNum ) *occNum = atol(d);
h32 = hash32 ( cgi , d - cgi );
}
else
h32 = hash32n ( cgi );
Parm *m = getParmFast2 ( h32 );
if ( ! m ) return NULL;
// the first element does not have a number after it
if ( m->isArray() && occNum && *occNum == -1 )
*occNum = 0;
return m;
}
////////////
//
// functions for distributing/syncing parms to/with all hosts
//
////////////
class ParmNode {
public:
SafeBuf m_parmList;
int32_t m_numRequests;
int32_t m_numReplies;
int32_t m_numGoodReplies;
int32_t m_numHostsTotal;
class ParmNode *m_prevNode;
class ParmNode *m_nextNode;
int64_t m_parmId;
bool m_calledCallback;
int32_t m_startTime;
void *m_state;
void (* m_callback)(void *state);
bool m_sendToGrunts;
bool m_sendToProxies;
int32_t m_hostId; // -1 means send parm update to all hosts
// . if not -1 then [m_hostId,m_hostId2] is a range
// . used by main.cpp cmd line cmds like 'gb stop 3-5'
int32_t m_hostId2;
};
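// . ParmNodes form a FIFO queue (s_headNode..s_tailNode) so parm updates
//   are broadcast, and applied on each host, in the order received
// . a node is freed only once every target host has replied successfully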
static ParmNode *s_headNode = NULL;
static ParmNode *s_tailNode = NULL;
static int64_t s_parmId = 0LL;
// . will send the parm update request to each host and retry forever,
// until dead hosts come back up
// . keeps parm update requests in order received
// . returns true and sets g_errno on error
// . returns false if blocked and will call your callback
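// . typical use (a sketch): build a parmList with addNewParmToList1() or
//   convertHttpRequestToParmList(), then call this on host #0 with
//   sendToGrunts=true, as handleRequest3e() below does when syncing a host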
bool Parms::broadcastParmList ( SafeBuf *parmList ,
void *state ,
void (* callback)(void *) ,
bool sendToGrunts ,
bool sendToProxies ,
// this is -1 if sending to all hosts
int32_t hostId ,
// this is not -1 if its range [hostId,hostId2]
int32_t hostId2 ) {
// empty list?
if ( parmList->getLength() <= 0 ) return true;
// only us? no need for this then. we now do this...
//if ( g_hostdb.m_numHosts <= 1 ) return true;
// make a new parm transmit node
ParmNode *pn = (ParmNode *)mmalloc ( sizeof(ParmNode) , "parmnode" );
if ( ! pn ) return true;
pn->m_parmList.constructor();
	// update the ticket #. we use this to keep things ordered too.
	// we increment before use, so a valid parm id is never zero.
s_parmId++;
// set it
pn->m_parmList.stealBuf ( parmList );
pn->m_numRequests = 0;
pn->m_numReplies = 0;
pn->m_numGoodReplies = 0;
pn->m_numHostsTotal = 0;
pn->m_prevNode = NULL;
pn->m_nextNode = NULL;
pn->m_parmId = s_parmId; // take a ticket
pn->m_calledCallback = false;
pn->m_startTime = getTimeLocal();
pn->m_state = state;
pn->m_callback = callback;
pn->m_sendToGrunts = sendToGrunts;
pn->m_sendToProxies = sendToProxies;
pn->m_hostId = hostId;
pn->m_hostId2 = hostId2; // a range? then not -1 here.
// store it ordered in our linked list of parm transmit nodes
if ( ! s_tailNode ) {
s_headNode = pn;
s_tailNode = pn;
}
else {
// link pn at end of tail
s_tailNode->m_nextNode = pn;
pn->m_prevNode = s_tailNode;
// pn becomes the new tail
s_tailNode = pn;
}
// just the regular proxies, not compression proxies
if ( pn->m_sendToProxies )
pn->m_numHostsTotal += g_hostdb.getNumProxies();
if ( pn->m_sendToGrunts )
pn->m_numHostsTotal += g_hostdb.getNumGrunts();
if ( hostId >= 0 )
pn->m_numHostsTotal = 1;
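	// note: a hostid range [hostId,hostId2] is still counted as a
	// single host here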
// pump the parms out to other hosts in the network
doParmSendingLoop ( );
// . if waiting for more replies to come in that should be in soon
// . doParmSendingLoop() is called when a reply comes in so that
// the next requests can be sent out
//if ( waitingForLiveHostsToReply() ) return false;
// all done. how did this happen?
//return true;
// wait for replies
return false;
}
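// . fire the callback of any queued parm node whose replies are all in,
//   or that has waited 8+ seconds while some host is dead, so the page
//   handler that initiated the update is never starved by a dead host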
void tryToCallCallbacks ( ) {
ParmNode *pn = s_headNode;
int32_t now = getTimeLocal();
for ( ; pn ; pn = pn->m_nextNode ) {
// skip if already called callback
if ( pn->m_calledCallback ) continue;
// should we call the callback?
bool callIt = false;
if ( pn->m_numReplies >= pn->m_numRequests ) callIt = true;
// sometimes we don't launch any requests to update parms
// because we are jammed up. same logic as we use for
// freeing the pn below.
if ( pn->m_numGoodReplies < pn->m_numHostsTotal )
callIt = false;
// 8 seconds is enough to wait for all replies to come in.
// a host might be dead, so we need this here lest the
// underlying page handler (i.e. sendPageCrawlbot()) never
// get called if a host is dead. if you are updating some
// parms you want the page to return.
if ( now - pn->m_startTime > 8 &&
! callIt &&
g_hostdb.hasDeadHost() )
callIt = true;
if ( ! callIt ) continue;
// callback is NULL for updating parms like spiderRoundNum
// in Spider.cpp
if ( pn->m_callback ) pn->m_callback ( pn->m_state );
pn->m_calledCallback = true;
}
}
void gotParmReplyWrapper ( void *state , UdpSlot *slot ) {
	// don't let the udp server free the send buf! that's the ParmNode parmlist
slot->m_sendBufAlloc = NULL;
// in case host table is dynamically modified, go by #
Host *h = g_hostdb.getHost((int32_t)(PTRTYPE)state);
int32_t parmId = h->m_currentParmIdInProgress;
ParmNode *pn = h->m_currentNodePtr;
// inc this count
pn->m_numReplies++;
// nothing in progress now
h->m_currentParmIdInProgress = 0;
h->m_currentNodePtr = NULL;
// this is usually timeout on a dead host i guess
if ( g_errno ) {
log("parms: got parm update reply from host #%"INT32": %s",
h->m_hostId,mstrerror(g_errno));
}
// . note it so we do not retry every 1ms!
// . and only retry on time outs or no mem errors for now...
// . it'll retry once every 10 seconds using the sleep
// wrapper below
if ( g_errno != EUDPTIMEDOUT && g_errno != ENOMEM )
g_errno = 0;
if ( g_errno ) {
// remember error info for retry
h->m_lastTryError = g_errno;
h->m_lastTryTime = getTimeLocal();
// if a host timed out he could be dead, so try to call
// the callback for this "pn" anyway. if the only hosts we
// do not have replies for are dead, then we'll call the
// callback, but still keep trying to send to them.
tryToCallCallbacks ();
// try to send more i guess? i think this is right otherwise
// the callback might not ever get called
g_parms.doParmSendingLoop();
return;
}
// no error, otherwise
h->m_lastTryError = 0;
// successfully completed
h->m_lastParmIdCompleted = parmId;
// inc this count
pn->m_numGoodReplies++;
// . this will try to call any callback that can be called
// . for instances, if the "pn" has recvd all the replies
// . OR if the remaining hosts are "DEAD"
// . the callback is in the "pn"
tryToCallCallbacks ();
// nuke it?
if ( pn->m_numGoodReplies >= pn->m_numHostsTotal &&
pn->m_numReplies >= pn->m_numRequests ) {
// . we must always be the head lest we send out of order.
// . ParmNodes only destined to a specific hostid are ignored
// for this check, only look at those whose m_hostId is -1
if(pn != s_headNode && pn->m_hostId==-1){
log("parms: got parm request out of band. not head.");
}
// a new head
if ( pn == s_headNode ) {
// sanity
if ( pn->m_prevNode ) { char *xx=NULL;*xx=0; }
// the guy after us is the new head
s_headNode = pn->m_nextNode;
}
// a new tail?
if ( pn == s_tailNode ) {
// sanity
if ( pn->m_nextNode ) { char *xx=NULL;*xx=0; }
// the guy before us is the new tail
s_tailNode = pn->m_prevNode;
}
// empty?
if ( ! s_headNode ) s_tailNode = NULL;
// wtf?
if ( ! pn->m_calledCallback ) { char *xx=NULL;*xx=0; }
// do callback first before freeing pn
//if ( pn->m_callback ) pn->m_callback ( pn->m_state );
if ( pn->m_prevNode )
pn->m_prevNode->m_nextNode = pn->m_nextNode;
if ( pn->m_nextNode )
pn->m_nextNode->m_prevNode = pn->m_prevNode;
mfree ( pn , sizeof(ParmNode) , "pndfr");
}
// try to send more for him
g_parms.doParmSendingLoop();
}
void parmLoop ( int fd , void *state ) {
g_parms.doParmSendingLoop();
}
static bool s_registeredSleep = false;
static bool s_inLoop = false;
// . host #0 runs this to send out parms in the parm queue (linked list)
// to all other hosts.
// . he also sends to himself, if m_sendToGrunts is true
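// . delivery is ordered per host: h->m_lastParmIdCompleted is the last
//   parm id that host acked and h->m_currentParmIdInProgress is the one
//   in flight, so each host has at most one outstanding request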
bool Parms::doParmSendingLoop ( ) {
if ( ! s_headNode ) return true;
if ( g_isDumpingRdbFromMain ) return true;
if ( s_inLoop ) return true;
s_inLoop = true;
if ( ! s_registeredSleep &&
! g_loop.registerSleepCallback(2000,NULL,parmLoop,0) )
log("parms: failed to reg parm loop");
// do not re-register
s_registeredSleep = true;
int32_t now = getTimeLocal();
// try to send a parm update request to each host
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// get it
Host *h = g_hostdb.getHost(i);
// skip ourselves, host #0. we now send to ourselves
// so updateParm() will be called on us...
//if ( h->m_hostId == g_hostdb.m_myHostId ) continue;
// . if in progress, gotta wait for that to complete
// . 0 is not a legit parmid, it starts at 1
if ( h->m_currentParmIdInProgress ) continue;
// if his last completed parmid is the current he is uptodate
if ( h->m_lastParmIdCompleted == s_parmId ) continue;
// if last try had an error, wait 10 secs i guess
if ( h->m_lastTryError &&
h->m_lastTryError != EUDPTIMEDOUT &&
now - h->m_lastTryTime < 10 )
continue;
// otherwise get him the next to send
ParmNode *pn = s_headNode;
for ( ; pn ; pn = pn->m_nextNode ) {
// stop when we got a parmnode we have not sent to
// him yet, we'll send it now
if ( pn->m_parmId > h->m_lastParmIdCompleted ) break;
}
// nothing? strange. something is not right.
if ( ! pn ) {
log("parms: pn is null");
			break;
}
// give him a free pass? some parm updates are directed to
// a single host, we use this for syncing parms at startup.
if ( pn->m_hostId >= 0 &&
pn->m_hostId2 == -1 && // not a range
h->m_hostId != pn->m_hostId ) {
// assume we sent it to him
h->m_lastParmIdCompleted = pn->m_parmId;
h->m_currentNodePtr = NULL;
continue;
}
// range? if not in range, give free pass
if ( pn->m_hostId >= 0 &&
pn->m_hostId2 >= 0 &&
( h->m_hostId < pn->m_hostId ||
h->m_hostId > pn->m_hostId2 ) ) {
// assume we sent it to him
h->m_lastParmIdCompleted = pn->m_parmId;
h->m_currentNodePtr = NULL;
continue;
}
// force completion if we should NOT send to him
if ( (h->isProxy() && ! pn->m_sendToProxies) ||
(h->isGrunt() && ! pn->m_sendToGrunts ) ) {
h->m_lastParmIdCompleted = pn->m_parmId;
h->m_currentNodePtr = NULL;
continue;
}
// debug log
log(LOG_INFO,"parms: sending parm request id %i "
"to hostid %"INT32"",(int)pn->m_parmId,h->m_hostId);
// count it
pn->m_numRequests++;
// ok, he's available
if ( ! g_udpServer.sendRequest ( pn->m_parmList.getBufStart(),
pn->m_parmList.length() ,
// a new msgtype
0x3f,
h->m_ip, // ip
h->m_port, // port
h->m_hostId ,
NULL, // retslot
(void *)(PTRTYPE)h->m_hostId , // state
gotParmReplyWrapper ,
30 , // timeout secs
-1 , // backoff
-1 , // maxwait
NULL , // replybuf
0 , // replybufmaxsize
0 ) ) { // niceness
log("parms: faild to send: %s",mstrerror(g_errno));
continue;
}
// flag this
h->m_currentParmIdInProgress = pn->m_parmId;
h->m_currentNodePtr = pn;
}
s_inLoop = false;
return true;
}
void handleRequest3fLoop ( void *weArg ) ;
void handleRequest3fLoop2 ( void *state , UdpSlot *slot ) {
handleRequest3fLoop(state);
}
// if a tree is saving while we are trying to delete a collnum (or reset)
// then the call to updateParm() below returns false and we must re-call
// in this sleep wrapper here
void handleRequest3fLoop3 ( int fd , void *state ) {
g_loop.unregisterSleepCallback(state,handleRequest3fLoop3);
handleRequest3fLoop(state);
}
// . host #0 is requesting that we update some parms
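// . re-entrant: we->m_parmPtr advances as each parm rec is consumed, so
//   if updateParm() blocks (or we block sending the early "save" reply)
//   we re-enter here later and resume where we left off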
void handleRequest3fLoop ( void *weArg ) {
WaitEntry *we = (WaitEntry *)weArg;
	CollectionRec *cx = NULL;
	// the spider round # before we apply any updates, so we can detect
	// a round change below. set when we first look up "cx".
	int32_t oldRound = -1;
// process them
char *p = we->m_parmPtr;
for ( ; p < we->m_parmEnd ; ) {
		// shortcut
char *rec = p;
// get size
int32_t dataSize = *(int32_t *)(rec+sizeof(key96_t));
int32_t recSize = sizeof(key96_t) + 4 + dataSize;
// skip it
p += recSize;
// get the actual parm
Parm *parm = getParmFromParmRec ( rec );
if ( ! parm ) {
int32_t h32 = getHashFromParmRec(rec);
log("parms: unknown parm sent to us hash=%"INT32"",h32);
for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
Parm *x = &g_parms.m_parms[i];
if ( x->m_cgiHash != h32 ) continue;
log("parms: unknown parm=%s",x->m_title);
break;
}
continue;
}
// if was the cmd to save & exit then first send a reply back
if ( ! we->m_sentReply &&
parm->m_cgi &&
parm->m_cgi[0] == 's' &&
parm->m_cgi[1] == 'a' &&
parm->m_cgi[2] == 'v' &&
parm->m_cgi[3] == 'e' &&
parm->m_cgi[4] == '\0' ) {
// do not re-do this
we->m_sentReply = 1;
// note it
log("parms: sending early parm update reply");
// wait for reply to be sent and ack'd
g_udpServer.sendReply_ass ( NULL,0,
NULL,0,
we->m_slot,
8, // timeout in secs
// come back here when done
we ,
handleRequest3fLoop2 );
return;
}
// . determine if it alters the url filters
// . if those were changed we have to nuke doledb and
// waiting tree in Spider.cpp and rebuild them!
if ( parm->m_flags & PF_REBUILDURLFILTERS )
we->m_doRebuilds = true;
if ( parm->m_flags & PF_REBUILDPROXYTABLE )
we->m_doProxyRebuild = true;
if ( parm->m_flags & PF_REBUILDACTIVELIST )
we->m_rebuildActiveList = true;
// get collnum i guess
if ( parm->m_type != TYPE_CMD )
we->m_collnum = getCollnumFromParmRec ( rec );
		// see if our spider round changes. oldRound is declared
		// above the loop since "cx" persists across iterations.
		if ( we->m_collnum >= 0 && ! cx ) {
			cx = g_collectiondb.getRec ( we->m_collnum );
			// the coll might have gotten deleted! so check cx
			if ( cx ) oldRound = cx->m_spiderRoundNum;
		}
// . this returns false if blocked, returns true and sets
// g_errno on error
// . it'll block if trying to delete a coll when the tree
// is saving or something (CommandDeleteColl())
if ( ! g_parms.updateParm ( rec , we ) ) {
////////////
//
// . it blocked! it will call we->m_callback when done
// . we must re-call
// . try again in 100ms
//
////////////
if(!g_loop.registerSleepCallback(100,
we ,
handleRequest3fLoop3,
0 ) ){// niceness
log("parms: failed to reg sleeper");
return;
}
log("parms: updateParm blocked. waiting.");
return;
}
if ( cx && oldRound != cx->m_spiderRoundNum )
we->m_updatedRound = true;
// do the next parm
we->m_parmPtr = p;
// error?
if ( ! g_errno ) continue;
// this could mean failed to add coll b/c out of disk or
// something else that is bad
we->m_errno = g_errno;
}
// one last thing... kinda hacky. if we change certain spidering parms
// we have to do a couple rebuilds.
// reset page round counts
if ( we->m_updatedRound && cx ) {
// Spider.cpp will reset the *ThisRound page counts and
// the sent notification flag
spiderRoundIncremented ( cx );
}
// basically resetting the spider here...
if ( we->m_doRebuilds && cx ) {
// . this tells Spider.cpp to rebuild the spider queues
// . this is NULL if spider stuff never initialized yet,
// like if you just added the collection
if ( cx->m_spiderColl )
cx->m_spiderColl->m_waitingTreeNeedsRebuild = true;
// . assume we have urls ready to spider too
// . no, because if they change the filters and there are
// still no urls to spider i don't want to get another
// email alert!!
//cr->m_localCrawlInfo .m_hasUrlsReadyToSpider = true;
//cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
// . reconstruct the url filters if we were a custom crawl
// . this is used to abstract away the complexity of url
// filters in favor of simple regular expressions and
// substring matching for diffbot
cx->rebuildUrlFilters();
}
if ( we->m_rebuildActiveList && cx )
g_spiderLoop.m_activeListValid = false;
// if user changed the list of proxy ips rebuild the binary
// array representation of the proxy ips we have
if ( we->m_doProxyRebuild )
buildProxyTable();
// note it
if ( ! we->m_sentReply )
log("parms: sending parm update reply");
// send back reply now. empty reply for the most part
if ( we->m_errno && ! we->m_sentReply )
g_udpServer.sendErrorReply ( we->m_slot,we->m_errno,0 );
else if ( ! we->m_sentReply )
g_udpServer.sendReply_ass ( NULL,0,NULL,0,we->m_slot);
// all done
mfree ( we , sizeof(WaitEntry) , "weparm" );
return;
}
// . host #0 is requesting that we update some parms
// . the readbuf in the request is the list of the parms
void handleRequest3f ( UdpSlot *slot , int32_t niceness ) {
// sending to host #0 is not right...
//if ( g_hostdb.m_hostId == 0 ) { char *xx=NULL;*xx=0; }
char *parmRecs = slot->m_readBuf;
char *parmEnd = parmRecs + slot->m_readBufSize;
log("parms: got parm update request. size=%"INT32".",
(int32_t)(parmEnd-parmRecs));
// make a new waiting entry
WaitEntry *we ;
we = (WaitEntry *) mmalloc ( sizeof(WaitEntry),"weparm");
if ( ! we ) {
g_udpServer.sendErrorReply(slot,g_errno,60);
return;
}
we->m_slot = slot;
we->m_callback = handleRequest3fLoop;
we->m_parmPtr = parmRecs;
we->m_parmEnd = parmEnd;
we->m_errno = 0;
we->m_doRebuilds = false;
we->m_rebuildActiveList = false;
we->m_updatedRound = false;
we->m_doProxyRebuild = false;
we->m_collnum = -1;
we->m_sentReply = 0;
handleRequest3fLoop ( we );
}
////
//
// functions for syncing parms with host #0
//
////
// 1. we do not accept any recs into rdbs until in sync with host #0
// 2. at startup we send the hash of all parms for each collrec and
// for g_conf (collnum -1) to host #0, then he will send us all the
// parms for a collrec (or g_conf) if we are out of sync.
// 3. when host #0 changes a parm it lets everyone know via broadcastParmList()
// 4. only host #0 may initiate parm changes. so don't let that go down!
// 5. once in sync a host can drop recs for collnums that are invalid
// 6. until in parm sync with host #0 reject adds to collnums we don't
// have with ETRYAGAIN in Msg4.cpp
void tryToSyncWrapper ( int fd , void *state ) {
g_parms.syncParmsWithHost0();
}
// host #0 just sends back an empty reply, but it will hit us with
// 0x3f parmlist requests. that way it uses the same mechanism and can
// guarantee ordering of the parm update requests
void gotReplyFromHost0Wrapper ( void *state , UdpSlot *slot ) {
// ignore his reply unless error?
if ( g_errno ) {
log("parms: got error syncing with host 0: %s. Retrying.",
mstrerror(g_errno));
// re-try it!
g_parms.m_triedToSync = false;
}
else {
log("parms: synced with host #0");
// do not re-call
g_loop.unregisterSleepCallback(NULL,tryToSyncWrapper);
}
g_errno = 0;
}
// returns false and sets g_errno on error, true otherwise
bool Parms::syncParmsWithHost0 ( ) {
if ( m_triedToSync ) return true;
m_triedToSync = true;
m_inSyncWithHost0 = false;
// dont sync with ourselves
if ( g_hostdb.m_hostId == 0 ) {
m_inSyncWithHost0 = true;
return true;
}
// only grunts for now can sync, not proxies, so stop if we are proxy
if ( g_hostdb.m_myHost->m_type != HT_GRUNT ) {
m_inSyncWithHost0 = true;
return true;
}
SafeBuf hashList;
if ( ! makeSyncHashList ( &hashList ) ) return false;
// copy for sending
SafeBuf sendBuf;
if ( ! sendBuf.safeMemcpy ( &hashList ) ) return false;
if ( sendBuf.getCapacity() != hashList.length() ){char *xx=NULL;*xx=0;}
if ( sendBuf.length() != hashList.length() ){char *xx=NULL;*xx=0;}
// allow udpserver to free it
char *request = sendBuf.getBufStart();
int32_t requestLen = sendBuf.length();
sendBuf.detachBuf();
Host *h = g_hostdb.getHost(0);
log("parms: trying to sync with host #0");
// . send it off. use 3e i guess
// . host #0 will reply using msg4 really
// . msg4 guarantees ordering of requests
// . there will be a record that is CMD_INSYNC so when we get
// that we set g_parms.m_inSyncWithHost0 to true
if ( ! g_udpServer.sendRequest ( request ,//hashList.getBufStart() ,
requestLen, //hashList.length() ,
0x3e , // msgtype
h->m_ip, // ip
h->m_port, // port
h->m_hostId , // hostid , host #0!!!
NULL, // retslot
NULL , // state
gotReplyFromHost0Wrapper ,
99999999 ) ) { // timeout in secs
log("parms: error syncing with host 0: %s",mstrerror(g_errno));
return false;
}
// wait now
return true;
}
// . here host #0 is receiving a sync request from another host
// . host #0 scans this list of hashes to make sure the requesting host is
// in sync
// . host #0 will broadcast parm updates by calling broadcastParmList() which
// uses 0x3f, so this just returns an empty reply on success
// . sends CMD "addcoll" and "delcoll" cmd parms as well
// . include an "insync" command parm as last parm
void handleRequest3e ( UdpSlot *slot , int32_t niceness ) {
// right now we must be host #0
if ( g_hostdb.m_hostId != 0 ) {
g_errno = EBADENGINEER;
hadError:
g_udpServer.sendErrorReply(slot,g_errno,60);
return;
}
//
// 0. scan our collections and clear a flag
//
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
// skip if empty
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
// clear flag
cr->m_hackFlag = 0;
}
Host *host = slot->m_host;
int32_t hostId = -1;
if ( host ) hostId = host->m_hostId;
SafeBuf replyBuf;
//
// 1. update parms on collections we both have
// 2. tell him to delete collections we do not have but he does
//
SafeBuf tmp;
char *p = slot->m_readBuf;
char *pend = p + slot->m_readBufSize;
for ( ; p < pend ; ) {
// get collnum
collnum_t c = *(collnum_t *)p;
p += sizeof(collnum_t);
// then coll NAME hash
uint32_t collNameHash32 = *(int32_t *)p;
p += 4;
// sanity check. -1 means g_conf. i guess.
if ( c < -1 ) { char *xx=NULL;*xx=0; }
// and parm hash
int64_t h64 = *(int64_t *)p;
p += 8;
// if we being host #0 do not have this collnum tell
// him to delete it!
CollectionRec *cr = NULL;
if ( c >= 0 ) cr = g_collectiondb.getRec ( c );
// if collection names are different delete it
if ( cr && collNameHash32 != hash32n ( cr->m_coll ) ) {
log("sync: host had collnum %i but wrong name, "
"name not %s like it should be",(int)c,cr->m_coll);
cr = NULL;
}
if ( c >= 0 && ! cr ) {
// note in log
logf(LOG_INFO,"sync: telling host #%"INT32" to delete "
"collnum %"INT32"", hostId,(int32_t)c);
// add the parm rec as a parm cmd
if (! g_parms.addNewParmToList1( &replyBuf,
c,
NULL,
-1,
"delete"))
goto hadError;
// ok, get next collection hash
continue;
}
// set our hack flag so we know he has this collection
if ( cr ) cr->m_hackFlag = 1;
// get our parmlist for that collnum
tmp.reset();
// c is -1 for g_conf
if ( ! g_parms.addAllParmsToList ( &tmp, c ) ) goto hadError;
// get checksum of that
int64_t m64 = hash64 ( tmp.getBufStart(),tmp.length() );
// if match, keep chugging, that's in sync
if ( h64 == m64 ) continue;
// note in log
logf(LOG_INFO,"sync: sending all parms for collnum %"INT32" "
"to host #%"INT32"", (int32_t)c, hostId);
// otherwise, send him the list
if ( ! replyBuf.safeMemcpy ( &tmp ) ) goto hadError;
}
//
// 3. now if he's missing one of our collections tell him to add it
//
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
// skip if empty
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
		// skip if he already has this collection (hack flag set above)
if ( cr->m_hackFlag ) continue;
//char *cmdStr = "addColl";
// now use lowercase, not camelcase
char *cmdStr = "addcoll";
if ( cr->m_isCustomCrawl == 1 ) cmdStr = "addCrawl";
if ( cr->m_isCustomCrawl == 2 ) cmdStr = "addBulk";
// note in log
logf(LOG_INFO,"sync: telling host #%"INT32" to add "
"collnum %"INT32" coll=%s", hostId,(int32_t)cr->m_collnum,
cr->m_coll);
// add the parm rec as a parm cmd
if ( ! g_parms.addNewParmToList1 ( &replyBuf,
(collnum_t)i,
cr->m_coll, // parm val
-1,
cmdStr ) )
goto hadError;
// and the parmlist for it
if (!g_parms.addAllParmsToList (&replyBuf, i ) ) goto hadError;
}
// . final parm is the in sync stamp of approval which will set
// g_parms.m_inSyncWithHost0 to true. CommandInSync()
// . use -1 for collnum for this cmd
if ( ! g_parms.addNewParmToList1 ( &replyBuf,-1,NULL,-1,"insync"))
goto hadError;
// this should at least have the in sync command
log("parms: sending %"INT32" bytes of parms to sync to host #%"INT32"",
replyBuf.length(),hostId);
// . use the broadcast call here so things keep their order!
// . we do not need a callback when they have been completely
// broadcasted to all hosts so use NULL for that
// . crap, we only want to send this to host #x ...
g_parms.broadcastParmList ( &replyBuf , NULL , NULL ,
true , // sendToGrunts?
false , // sendToProxies?
hostId );
// but do send back an empty reply to this 0x3e request
g_udpServer.sendReply_ass ( NULL,0,NULL,0,slot);
// send that back now
//g_udpServer.sendReply_ass ( replyBuf.getBufStart() ,
// replyBuf.length() ,
// replyBuf.getBufStart() ,
// replyBuf.getCapacity() ,
// slot );
// udpserver will free it
//replyBuf.detachBuf();
}
// get the hash of every collection's parmlist
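// . each entry is: a collnum_t, then the 32-bit hash of the coll name,
//   then the 64-bit hash of that collection's serialized parmlist
// . the format must match what handleRequest3e() above reads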
bool Parms::makeSyncHashList ( SafeBuf *hashList ) {
SafeBuf tmp;
// first do g_conf, collnum -1!
for ( int32_t i = -1 ; i < g_collectiondb.m_numRecs ; i++ ) {
// shortcut
CollectionRec *cr = NULL;
if ( i >= 0 ) cr = g_collectiondb.m_recs[i];
// skip if empty
if ( i >=0 && ! cr ) continue;
// clear since last time
tmp.reset();
// g_conf? if i is -1 do g_conf
if ( ! addAllParmsToList ( &tmp , i ) )
return false;
		// store collnum first, as sizeof(collnum_t) bytes
if ( ! hashList->safeMemcpy ( &i , sizeof(collnum_t) ) )
return false;
// then store the collection name hash, 32 bit hash
uint32_t collNameHash32 = 0;
if ( cr ) collNameHash32 = hash32n ( cr->m_coll );
if ( ! hashList->safeMemcpy ( &collNameHash32, 4 ) )
return false;
// hash the parms
int64_t h64 = hash64 ( tmp.getBufStart(),tmp.length() );
// and store it
if ( ! hashList->pushLongLong ( h64 ) )
return false;
}
return true;
}
int32_t Parm::getNumInArray ( collnum_t collnum ) {
char *obj = (char *)&g_conf;
if ( m_obj == OBJ_COLL ) {
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) return -1;
obj = (char *)cr;
}
	// the element count is stored in the 4 bytes just before the array
return *(int32_t *)(obj+m_off-4);
}
// . we use this for syncing parms between hosts
// . called by convertAllCollRecsToParmList
// . returns false and sets g_errno on error
// . "rec" can be CollectionRec or g_conf ptr
bool Parms::addAllParmsToList ( SafeBuf *parmList, collnum_t collnum ) {
// loop over parms
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
// get it
Parm *parm = &m_parms[i];
// skip comments
if ( parm->m_type == TYPE_COMMENT ) continue;
if ( parm->m_type == TYPE_FILEUPLOADBUTTON ) continue;
// cmds
if ( parm->m_type == TYPE_CMD ) continue;
if ( parm->m_type == TYPE_BOOL2 ) continue;
// daily merge last started. do not sync this...
if ( parm->m_type == TYPE_LONG_CONST ) continue;
if ( collnum == -1 && parm->m_obj != OBJ_CONF ) continue;
if ( collnum >= 0 && parm->m_obj != OBJ_COLL ) continue;
if ( collnum < -1 ) { char *xx=NULL;*xx=0; }
// like 'statsdb max cache mem' etc.
if ( parm->m_flags & PF_NOSYNC ) continue;
// sanity, need cgi hash to look up the parm on the
// receiving end
if ( parm->m_cgiHash == 0 ) {
log("parms: no cgi for parm %s",parm->m_title);
char *xx=NULL; *xx=0;
}
int32_t occNum = -1;
int32_t maxOccNum = 0;
if ( parm->isArray() ) {
maxOccNum = parm->getNumInArray(collnum) ;
occNum = 0;
}
for ( ; occNum < maxOccNum ; occNum ++ ) {
// add each occ # to list
if ( ! addCurrentParmToList2 ( parmList ,
collnum ,
occNum ,
parm ) )
return false;
/*
//
// use this to debug parm list checksums being off
//
int64_t h64 ;
h64 = hash64 ( parmList->getBufStart(),
parmList->length() );
// note it for debugging hash
SafeBuf xb;
parm->printVal ( &xb ,collnum,occNum);
log("parms: adding (h=%"XINT64") parm %s = %s",
h64,parm->m_title,xb.getBufStart());
*/
}
}
return true;
}
void resetImportLoopFlag () ;
// . this adds the key if not a cmd key to parmdb rdbtree
// . this executes cmds
// . this updates the CollectionRec which may disappear later and be fully
// replaced by Parmdb, just an RdbTree really.
// . returns false if blocked
// . returns true and sets g_errno on error
bool Parms::updateParm ( char *rec , WaitEntry *we ) {
collnum_t collnum = getCollnumFromParmRec ( rec );
g_errno = 0;
Parm *parm = getParmFromParmRec ( rec );
if ( ! parm ) {
log("parmdb: could not find parm for rec");
g_errno = EBADENGINEER;
return true;
}
// cmd to execute?
if ( parm->m_type == TYPE_CMD ||
// sitelist is a safebuf but it requires special deduping
// logic to update it so it uses CommandUpdateSiteList() to
// do the updating
parm->m_func ) {
// all parm rec data for TYPE_CMD should be ascii/utf8 chars
// and should be \0 terminated
char *data = getDataFromParmRec ( rec );
int32_t dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize == 0 ) data = NULL;
log("parmdb: running function for "
"parm \"%s\" (collnum=%"INT32") args=\"%s\""
, parm->m_title
, (int32_t)collnum
, data
);
// sets g_errno on error
if ( parm->m_func ) {
parm->m_func ( rec );
return true;
}
// fix core from using "roundstart=1" on non-existent coll
if ( ! parm->m_func2 ) {
return true;
}
// . returns true and sets g_errno on error
// . returns false if blocked
// . this is for CommandDeleteColl() and CommandResetColl()
if ( parm->m_func2 ( rec , we ) ) return true;
// . it did not complete.
// . we need to re-call it using sleep wrapper above
return false;
}
// "cr" will remain null when updating g_conf and collnum -1
CollectionRec *cr = NULL;
if ( collnum >= 0 ) {
cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
char *ps = "unknown parm";
if ( parm ) ps = parm->m_title;
log("parmdb: invalid collnum %"INT32" for parm \"%s\"",
(int32_t)collnum,ps);
g_errno = ENOCOLLREC;
return true;
}
}
// what are we updating?
void *base = NULL;
// we might have a collnum specified even if parm is global,
// maybe there are some collection/local parms specified as well
// that that collnum applies to
if ( parm->m_obj == OBJ_COLL ) base = cr;
else base = &g_conf;
if ( ! base ) {
log("parms: no collrec (%"INT32") to change parm",(int32_t)collnum);
g_errno = ENOCOLLREC;
return true;
}
int32_t occNum = getOccNumFromParmRec ( rec );
// get data
int32_t dataSize = *(int32_t *)(rec+sizeof(key96_t));
char *data = rec+sizeof(key96_t)+4;
	// point to where to copy the data into the collrec (or g_conf)
char *dst = (char *)base + parm->m_off;
// point to count in case it is an array
int32_t *countPtr = NULL;
// array?
if ( parm->isArray() ) {
		if ( occNum < 0 ) {
			log("parms: bad occnum for %s",parm->m_title);
			// return true with g_errno set: returning false
			// means "blocked" and the caller would retry forever
			g_errno = EBADENGINEER;
			return true;
		}
// point to count in case it is an array
countPtr = (int32_t *)(dst - 4);
// now point "dst" to the occNum-th element
dst += parm->m_size * occNum;
}
//
// compare parm to see if it changed value
//
SafeBuf val1;
parm->printVal ( &val1 , collnum , occNum );
// if parm is a safebuf...
if ( parm->m_type == TYPE_SAFEBUF ) {
// point to it
SafeBuf *sb = (SafeBuf *)dst;
// nuke it
sb->purge();
// require that the \0 be part of the update i guess
//if ( ! data || dataSize <= 0 ) { char *xx=NULL;*xx=0; }
// check for \0
if ( data && dataSize > 0 ) {
if ( data[dataSize-1] != '\0') { char *xx=NULL;*xx=0;}
// this means that we can not use string POINTERS as
// parms!! don't include \0 as part of length
sb->safeStrcpy ( data ); // , dataSize );
// ensure null terminated
sb->nullTerm();
sb->setLabel("parm2");
}
//return true;
// sanity
// we no longer include the \0 in the dataSize...so a dataSize
// of 0 means empty string...
//if ( data[dataSize-1] != '\0' ) { char *xx=NULL;*xx=0; }
}
else {
// and copy the data into collrec or g_conf
gbmemcpy ( dst , data , dataSize );
}
	SafeBuf val2;
	parm->printVal ( &val2 , collnum , occNum );
	// did this parm change value? (an empty safebuf can have a NULL
	// buffer, so guard the strcmp)
	char *s1 = val1.getBufStart();
	char *s2 = val2.getBufStart();
	bool changed = true;
	if ( strcmp ( s1 ? s1 : "" , s2 ? s2 : "" ) == 0 )
		changed = false;
// . update array count if necessary
// . parm might not have changed value based on what was in there
// by default, but for PAGE_FILTERS the default value in the row
// for this parm might have been zero! so we gotta update its
// "count" in that scenario even though the parm val was unchanged.
if ( parm->isArray() ) {
// the int32_t before the array is the # of elements
int32_t currentCount = *countPtr;
// update our # elements in our array if this is bigger
int32_t newCount = occNum + 1;
bool updateCount = false;
if ( newCount > currentCount ) updateCount = true;
// do not update counts if we are url filters
// and we are currently >= the expression count. we have
// to have a non-empty expression at the end in order to
// add the expression. this prevents the empty line from
// being added!
if ( parm->m_page == PAGE_FILTERS &&
cr->m_regExs[occNum].getLength() == 0 )
updateCount = false;
// and for other pages, like master ips, skip if empty!
// PAGE_PASSWORDS, PAGE_MASTERPASSWORDS, ...
if ( parm->m_page != PAGE_FILTERS && ! changed )
updateCount = false;
// ok, increment the array count of items in the array
if ( updateCount )
*countPtr = newCount;
}
// all done if value was unchanged
if ( ! changed )
return true;
// show it
log("parms: updating parm \"%s\" "
"(%s[%"INT32"]) (collnum=%"INT32") from \"%s\" -> \"%s\"",
parm->m_title,
parm->m_cgi,
occNum,
(int32_t)collnum,
val1.getBufStart(),
val2.getBufStart());
if ( cr ) cr->m_needsSave = true;
// HACK #2
if ( base == cr && dst == (char *)&cr->m_importEnabled )
resetImportLoopFlag();
//
// HACK
//
// special hack. if spidering re-enabled then reset last spider
// attempt time to 0 to avoid the "has no more urls to spider"
// msg followed by the reviving url msg.
if ( base == cr && dst == (char *)&cr->m_spideringEnabled )
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
if ( base == &g_conf && dst == (char *)&g_conf.m_spideringEnabled ){
for(int32_t i = 0;i<g_collectiondb.m_numRecs;i++){
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
}
}
//
// if user changed the crawl/process max then reset here so
// spiders will resume
//
if ( base == cr &&
dst == (char *)&cr->m_maxToCrawl &&
cr->m_spiderStatus == SP_MAXTOCRAWL ) {
// reset this for rebuilding of active spider collections
// so this collection can be in the linked list again
cr->m_spiderStatus = SP_INPROGRESS;
// rebuild list of active spider collections then
g_spiderLoop.m_activeListValid = false;
}
if ( base == cr &&
dst == (char *)&cr->m_maxToProcess &&
cr->m_spiderStatus == SP_MAXTOPROCESS ) {
// reset this for rebuilding of active spider collections
// so this collection can be in the linked list again
cr->m_spiderStatus = SP_INPROGRESS;
// rebuild list of active spider collections then
g_spiderLoop.m_activeListValid = false;
}
if ( base == cr &&
dst == (char *)&cr->m_maxCrawlRounds &&
cr->m_spiderStatus == SP_MAXROUNDS ) {
// reset this for rebuilding of active spider collections
// so this collection can be in the linked list again
cr->m_spiderStatus = SP_INPROGRESS;
// rebuild list of active spider collections then
g_spiderLoop.m_activeListValid = false;
}
//
// END HACK
//
// all done
return true;
}
bool Parm::printVal ( SafeBuf *sb , collnum_t collnum , int32_t occNum ) {
CollectionRec *cr = NULL;
if ( collnum >= 0 ) cr = g_collectiondb.getRec ( collnum );
// no value if no storage record offset
//if ( m_off < 0 ) return true;
char *base;
if ( m_obj == OBJ_COLL ) base = (char *)cr;
else base = (char *)&g_conf;
if ( ! base ) {
log("parms: no collrec (%"INT32") to change parm",(int32_t)collnum);
g_errno = ENOCOLLREC;
return true;
}
	// point to the parm's value in the collrec or g_conf
char *val = (char *)base + m_off;
if ( isArray() && occNum < 0 ) {
log("parms: bad occnum for %s",m_title);
return false;
}
// add array index to ptr
if ( isArray() ) val += m_size * occNum;
if ( m_type == TYPE_SAFEBUF ) {
// point to it
SafeBuf *sb2 = (SafeBuf *)val;
return sb->safePrintf("%s",sb2->getBufStart());
}
	// note: TYPE_SAFEBUF was already handled above
	if ( m_type == TYPE_STRING ||
	     m_type == TYPE_STRINGBOX ||
	     m_type == TYPE_STRINGNONEMPTY )
return sb->safePrintf("%s",val);
if ( m_type == TYPE_LONG || m_type == TYPE_LONG_CONST )
return sb->safePrintf("%"INT32"",*(int32_t *)val);
if ( m_type == TYPE_DATE )
return sb->safePrintf("%"INT32"",*(int32_t *)val);
if ( m_type == TYPE_DATE2 )
return sb->safePrintf("%"INT32"",*(int32_t *)val);
if ( m_type == TYPE_FLOAT )
return sb->safePrintf("%f",*(float *)val);
if ( m_type == TYPE_LONG_LONG )
return sb->safePrintf("%"INT64"",*(int64_t *)val);
if ( m_type == TYPE_CHARPTR ) {
if ( val ) return sb->safePrintf("%s",val);
return true;
}
if ( m_type == TYPE_BOOL ||
m_type == TYPE_BOOL2 ||
m_type == TYPE_CHECKBOX ||
m_type == TYPE_PRIORITY2 ||
m_type == TYPE_UFP ||
m_type == TYPE_CHAR )
return sb->safePrintf("%hhx",*val);
if ( m_type == TYPE_CMD )
return sb->safePrintf("CMD");
if ( m_type == TYPE_IP )
// may print 0.0.0.0
return sb->safePrintf("%s",iptoa(*(int32_t *)val) );
log("parms: missing parm type!!");
char *xx=NULL;*xx=0;
return false;
}
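// A minimal usage sketch (assuming a valid Parm *parm, collnum and occNum):
// snapshot a parm's value as text before and after modifying its storage to
// detect a change, the same way the update path above uses val1/val2:
//
//   SafeBuf before;
//   parm->printVal ( &before , collnum , occNum );
//   // ... modify the parm's storage in the collrec or g_conf ...
//   SafeBuf after;
//   parm->printVal ( &after , collnum , occNum );
//   bool changed = strcmp ( before.getBufStart() ,
//                           after.getBufStart() ) != 0;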
bool printUrlExpressionExamples ( SafeBuf *sb ) {
/*
CollectionRec *cr = (CollectionRec *)THIS;
// if testUrl is provided, find in the table
char testUrl [ 1025 ];
char *tt = r->getString ( "qatest123" , NULL );
testUrl[0]='\0';
if ( tt ) strncpy ( testUrl , tt , 1024 );
char *tu = testUrl;
if ( ! tu ) tu = "";
char matchString[12];
matchString[0] = '\0';
if ( testUrl[0] ) {
Url u;
u.set ( testUrl , gbstrlen(testUrl) );
//since we don't know the doc's quality, sfn, or
//other stuff, just give default values
int32_t n = cr->getRegExpNum ( &u ,
false , // links2gb?
false , // searchboxToGB
false , // onsite?
-1 , // docQuality
-1 , // hopCount
false , // siteInDmoz?
//-1 , // ruleset #
-1 , // langId
-1 , // parent priority
0 , // niceness
NULL , // tagRec
false , // isRSS?
false , // isPermalink?
false , // new outlink?
-1 , // age
NULL , // LinkInfo
NULL , // parentUrl
-1 , // priority
false , // isAddUrl
false , // parentRSS?
false , // parentIsNew?
false , // parentIsPermlnk
false );// isIndexed?
if ( n == -1 ) sprintf ( matchString , "default" );
else sprintf ( matchString, "%"INT32"", n+1 );
}
// test table
sb.safePrintf (
//"</form><form method=get action=/cgi/14.cgi>"
//"<input type=hidden name="
"<table width=100%% cellpadding=4 border=1 "
"bgcolor=#%s>"
"<tr><td colspan=2 bgcolor=#%s><center>"
//"<font size=+1>"
"<b>"
"URL Filters Test</b>"
//"</font>"
"</td></tr>"
"<tr><td colspan=2>"
"<font size=1>"
"To test your URL filters simply enter a URL into "
"this box and submit it. The URL filter line number "
"that it matches will be displayed to the right."
"</font>"
"</td></tr>"
"<tr>"
"<td><b>Test URL</b></td>"
"<td><b>Matching Expression #</b></td>"
"</tr>"
"<tr>"
"<td><input type=text size=55 value=\"%s\" "
"name=test> "
"<input type=submit name=action value=test></td>"
"<td>%s</td></tr></table><br><br>\n" ,
LIGHT_BLUE , DARK_BLUE , testUrl , matchString );
*/
sb->safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
sb->safePrintf (
"<table %s>"
"<tr><td colspan=2><center>"
"<b>"
"Supported Expressions</b>"
"</td></tr>"
"<tr class=poo><td>default</td>"
"<td>Matches every url."
"</td></tr>"
"<tr class=poo><td>^http://whatever</td>"
"<td>Matches if the url begins with "
"<i>http://whatever</i>"
"</td></tr>"
"<tr class=poo><td>$.css</td>"
"<td>Matches if the url ends with \".css\"."
"</td></tr>"
"<tr class=poo><td>foobar</td>"
"<td>Matches if the url CONTAINS <i>foobar</i>."
"</td></tr>"
"<tr class=poo><td>tld==uk,jp</td>"
"<td>Matches if url's TLD ends in \"uk\" or \"jp\"."
"</td></tr>"
/*
"<tr class=poo><td>doc:quality&lt;40</td>"
"<td>Matches if document quality is "
"less than 40. Can be used for assigning to spider "
"priority.</td></tr>"
"<tr class=poo><td>doc:quality&lt;40 && tag:ruleset==22</td>"
"<td>Matches if document quality less than 40 and "
"belongs to ruleset 22. Only for assinging to "
"spider priority.</td></tr>"
"<tr class=poo><td><nobr>"
"doc:quality&lt;40 && tag:manualban==1</nobr></td>"
"<td>Matches if document quality less than 40 and "
"is has a value of \"1\" for its \"manualban\" "
"tag.</td></tr>"
"<tr class=poo><td>tag:ruleset==33 && doc:quality&lt;40</td>"
"<td>Matches if document quality less than 40 and "
"belongs to ruleset 33. Only for assigning to "
"spider priority or a banned ruleset.</td></tr>"
*/
"<tr class=poo><td><a name=hopcount></a>"
"hopcount<4 && iswww</td>"
"<td>Matches if document has a hop count of 4, and "
"is a \"www\" url (or domain-only url).</td></tr>"
"<tr class=poo><td>hopcount</td>"
"<td>All root urls, those that have only a single "
"slash for their path, and no cgi parms, have a "
"hop count of 0. Also, all RSS urls, ping "
"server urls and site roots (as defined in the "
"site rules table) have a hop count of 0. Their "
"outlinks have a hop count of 1, and the outlinks "
"of those outlinks a hop count of 2, etc."
"</td></tr>"
"<tr class=poo><td>sitepages</td>"
"<td>The number of pages that are currently indexed "
"for the subdomain of the URL. "
"Used for doing quotas."
"</td></tr>"
// MDW: 7/11/2014 take this out until it works.
// problem is that the quota table m_localTable
// in Spider.cpp gets reset for each firstIp scan,
// and we have a.walmart.com and b.walmart.com
// with different first ips even though on same
// domain. perhaps we should use the domain as the
		       // key to getting the firstip for any subdomain.
		       // but our whole selection algo in spider.cpp is
// firstIp based, so it scans all the spiderrequests
// from a single firstip to get the winner for that
// firstip.
// "<tr class=poo><td>domainpages</td>"
// "<td>The number of pages that are currently indexed "
// "for the domain of the URL. "
// "Used for doing quotas."
// "</td></tr>"
"<tr class=poo><td>siteadds</td>"
"<td>The number URLs manually added to the "
"subdomain of the URL. Used to guage a subdomain's "
"popularity."
"</td></tr>"
// taken out for the same reason as domainpages
		       // above was taken out. see explanation up there.
// "<tr class=poo><td>domainadds</td>"
// "<td>The number URLs manually added to the "
// "domain of the URL. Used to guage a domain's "
// "popularity."
// "</td></tr>"
"<tr class=poo><td>isrss | !isrss</td>"
"<td>Matches if document is an RSS feed. Will "
"only match this rule if the document has been "
"successfully spidered before, because it requires "
"downloading the document content to see if it "
"truly is an RSS feed.."
"</td></tr>"
"<tr class=poo><td>isrssext | !isrssext</td>"
"<td>Matches if url ends in .xml .rss or .atom. "
"TODO: Or if the link was in an "
"alternative link tag."
"</td></tr>"
//"<tr class=poo><td>!isrss</td>"
//"<td>Matches if document is NOT an rss feed."
//"</td></tr>"
"<tr class=poo><td>ispermalink | !ispermalink</td>"
"<td>Matches if document is a permalink. "
"When harvesting outlinks we <i>guess</i> if they "
"are a permalink by looking at the structure "
"of the url.</td></tr>"
//"<tr class=poo><td>!ispermalink</td>"
//"<td>Matches if document is NOT a permalink."
//"</td></tr>"
/*
"<tr class=poo><td>outlink | !outlink</td>"
"<td>"
"<b>This is true if url being added to spiderdb "
"is an outlink from the page being spidered. "
"Otherwise, the url being added to spiderdb "
"directly represents the page being spidered. It "
"is often VERY useful to partition the Spiderdb "
"records based on this criteria."
"</td></tr>"
*/
"<tr class=poo><td><nobr>isnewoutlink | !isnewoutlink"
"</nobr></td>"
"<td>"
"This is true since the outlink was not there "
"the last time we spidered the page we harvested "
"it from."
"</td></tr>"
"<tr class=poo><td>hasreply | !hasreply</td>"
"<td>"
"This is true if we have tried to spider "
"this url, even if we got an error while trying."
"</td></tr>"
"<tr class=poo><td>isnew | !isnew</td>"
"<td>"
"This is the opposite of hasreply above. A url "
"is new if it has no spider reply, including "
"error replies. So once a url has been attempted to "
"be spidered then this will be false even if there "
"was any kind of error."
"</td></tr>"
"<tr class=poo><td>lastspidertime >= "
"<b>{roundstart}</b></td>"
"<td>"
"This is true if the url's last spidered time "
"indicates it was spidered already for this "
"current round of spidering. When no more urls "
"are available for spidering, then gigablast "
"automatically sets {roundstart} to the current "
"time so all the urls can be spidered again. This "
"is how you do round-based spidering. "
"You have to use the respider frequency as well "
"to adjust how often you want things respidered."
"</td></tr>"
"<tr class=poo><td>urlage</td>"
"<td>"
"This is the time, in seconds, since a url was first "
"added to spiderdb to be spidered. This is "
"its discovery date. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"
//"<tr class=poo><td>!newoutlink</td>"
//"<td>Matches if document is NOT a new outlink."
//"</td></tr>"
"<tr class=poo><td>age</td>"
"<td>"
"How old is the doucment <b>in seconds</b>. "
"The age is based on the publication date of "
"the document, which could also be the "
"time that the document was last significantly "
"modified. If this date is unknown then the age "
"will be -1 and only match the expression "
"<i>age==-1</i>. "
"When harvesting links, we guess the publication "
"date of the oulink by detecting dates contained "
"in the url itself, which is popular among some "
"forms of permalinks. This allows us to put "
"older permalinks into a slower spider queue."
"</td></tr>"
"<tr class=poo><td>spiderwaited &lt; 3600</td>"
"<td>"
"<i>spiderwaited</i> is how many seconds have elapsed "
"since the last time "
"we tried to spider/download the url. "
"The constaint containing <i>spiderwaited</i> will "
"fail to be matched if the url has never been "
"attempted to be spidered/downloaded before. Therefore, "
"it will only ever match urls that have a spider reply "
"of some sort, so there is no need to add an additional "
"<i>hasreply</i>-based constraint."
"</td></tr>"
"<tr class=poo><td>"
"<a name=insitelist>"
"insitelist | !insitelist"
"</a>"
"</td>"
"<td>"
"This is true if the url matches a pattern in "
"the list of sites on the <a href=/admin/sites>"
"site list</a> page. That site list is useful for "
"adding a large number of sites that can not be "
"accomodated by the url fitlers table. Plus "
"it is higher performance and easier to use, but "
"lacks the url filter table's "
"fine level of control."
"</td></tr>"
"<tr class=poo><td>"
"<a name=isaddurl>"
"isaddurl | !isaddurl"
"</a>"
"</td>"
"<td>"
"This is true if the url was added from the add "
"url interface or API."
//"This replaces the add url priority "
//"parm."
"</td></tr>"
"<tr class=poo><td>isinjected | !isinjected</td>"
"<td>"
"This is true if the url was directly "
"injected from the "
"<a href=/admin/inject>inject page</a> or API."
"</td></tr>"
"<tr class=poo><td>isreindex | !isreindex</td>"
"<td>"
"This is true if the url was added from the "
"<a href=/admin/reindex>query reindex</a> "
"interface. The request does not contain "
"a url, but only a docid, that way we can add "
"millions of search results very quickly without "
"having to lookup each of their urls. You should "
"definitely have this if you use the reindexing "
"feature. "
"You can set max spiders to 0 "
"for non "
"isreindex requests while you reindex or delete "
"the results of a query for extra speed."
"</td></tr>"
"<tr class=poo><td>ismanualadd | !ismanualadd</td>"
"<td>"
"This is true if the url was added manually. "
"Which means it matches isaddurl, isinjected, "
" or isreindex. as opposed to only "
"being discovered from the spider. "
"</td></tr>"
"<tr class=poo><td><nobr>inpingserver | !inpingserver"
"</nobr></td>"
"<td>"
"This is true if the url has an inlink from "
"a recognized ping server. Ping server urls are "
"hard-coded in Url.cpp. <b><font color=red> "
"pingserver urls are assigned a hop count of 0"
"</font></b>"
"</td></tr>"
"<tr class=poo><td>isparentrss | !isparentrss</td>"
"<td>"
"If a parent of the URL was an RSS page "
"then this will be matched."
"</td></tr>"
"<tr class=poo><td>isparentsitemap | "
"!isparentsitemap</td>"
"<td>"
"If a parent of the URL was a sitemap.xml page "
"then this will be matched."
"</td></tr>"
/*
"<tr class=poo><td>parentisnew | !parentisnew</td>"
"<td>"
"<b>Parent providing this outlink is not currently "
"in the index but is trying to be added right now. "
"</b>This is a special expression in that "
"it only applies to assigning spider priorities "
"to outlinks we are harvesting on a page.</b>"
"</td></tr>"
*/
"<tr class=poo><td>isindexed | !isindexed</td>"
"<td>"
"This url matches this if in the index already. "
"</td></tr>"
"<tr class=poo><td>errorcount==1</td>"
"<td>"
"The number of times the url has failed to "
"be indexed. 1 means just the last time, two means "
"the last two times. etc. Any kind of error parsing "
"the document (bad utf8, bad charset, etc.) "
"or any HTTP status error, like 404 or "
"505 is included in this count, in addition to "
"\"temporary\" errors like DNS timeouts."
"</td></tr>"
"<tr class=poo><td>errorcode==32880</td>"
"<td>"
"If the last time it was spidered it had this "
"numeric error code. See the error codes in "
"Errno.cpp. In this particular example 32880 is "
"for EBADURL."
"</td></tr>"
"<tr class=poo><td>hastmperror</td>"
"<td>"
"This is true if the last spider attempt resulted "
"in an error like EDNSTIMEDOUT or a similar error, "
"usually indicative of a temporary internet "
"failure, or local resource failure, like out of "
"memory, and should be retried soon. "
"Currently: "
"dns timed out, "
"tcp timed out, "
"dns dead, "
"network unreachable, "
"host unreachable, "
"diffbot internal error, "
"out of memory."
"</td></tr>"
"<tr class=poo><td>percentchangedperday&lt=5</td>"
"<td>"
"Looks at how much a url's page content has changed "
"between the last two times it was spidered, and "
"divides that percentage by the number of days. "
"So if a URL's last two downloads were 10 days "
"apart and its page content changed 30%% then "
"the <i>percentchangedperday</i> will be 3. "
"Can use <, >, <=, >=, ==, != comparison operators. "
"</td></tr>"
"<tr class=poo><td>sitenuminlinks&gt;20</td>"
"<td>"
"How many inlinks does the URL's site have? "
"We only count non-spammy inlinks, and at most only "
"one inlink per IP address C-Class is counted "
"so that a webmaster who owns an entire C-Class "
"of IP addresses will only have his inlinks counted "
"once."
"Can use <, >, <=, >=, ==, != comparison operators. "
"</td></tr>"
"<tr class=poo><td>numinlinks&gt;20</td>"
"<td>"
"How many inlinks does the URL itself have? "
"We only count one link per unique C-Class IP "
"address "
"so that a webmaster who owns an entire C-Class "
"of IP addresses will only have her inlinks counted "
"once."
"Can use <, >, <=, >=, ==, != comparison operators. "
"This is useful for spidering popular URLs quickly."
"</td></tr>"
"<tr class=poo><td>httpstatus==404</td>"
"<td>"
"For matching the URL based on the http status "
"of its last download. Does not apply to URLs "
"that have not yet been successfully downloaded."
"Can use <, >, <=, >=, ==, != comparison operators. "
"</td></tr>"
/*
"<tr class=poo><td>priority==30</td>"
"<td>"
"<b>If the current priority of the url is 30, then "
"it will match this expression. Does not apply "
"to outlinks, of course."
"</td></tr>"
"<tr class=poo><td>parentpriority==30</td>"
"<td>"
"<b>This is a special expression in that "
"it only applies to assigning spider priorities "
"to outlinks we are harvesting on a page.</b> "
"Matches if the url being added to spider queue "
"is from a parent url in priority queue 30. "
"The parent's priority queue is the one it got "
"moved into while being spidered. So if it was "
"in priority 20, but ended up in 25, then 25 will "
"be used when scanning the URL Filters table for "
"each of its outlinks. Only applies "
"to the FIRST time the url is added to spiderdb. "
"Use <i>parentpriority==-3</i> to indicate the "
"parent was FILTERED and <i>-2</i> to indicate "
"the parent was BANNED. A parentpriority of "
"<i>-1</i>"
" means that the urls is not a link being added to "
"spiderdb but rather a url being spidered."
"</td></tr>"
"<tr class=poo><td>inlink==...</td>"
"<td>"
"If the url has an inlinker which contains the "
"given substring, then this rule is matched. "
"We use this like <i>inlink=www.weblogs.com/"
"int16_tChanges.xml</i> to detect if a page is in "
"the ping server or not, and if it is, then we "
"assign it to a slower-spidering queue, because "
"we can reply on the ping server for updates. Saves "
"us from having to spider all the blogspot.com "
"subdomains a couple times a day each."
"</td></tr>"
*/
//"NOTE: Until we get the link info to get the doc "
//"quality before calling msg8 in Msg16.cpp, we "
//"can not involve doc:quality for purposes of "
//"assigning a ruleset, unless banning it.</td>"
"<tr class=poo><td><nobr>tld!=com,org,edu"// && "
//"doc:quality&lt;70"
"</nobr></td>"
"<td>Matches if the "
"url's TLD does NOT end in \"com\", \"org\" or "
"\"edu\". "
"</td></tr>"
"<tr class=poo><td><nobr>lang==zh_cn,de"
"</nobr></td>"
"<td>Matches if "
"the url's content is in the language \"zh_cn\" or "
"\"de\". See table below for supported language "
"abbreviations. Used to only keep certain languages "
"in the index. This is hacky because the language "
"may not be known at spider time, so Gigablast "
"will check after downloading the document to "
"see if the language <i>spider priority</i> is "
"DELETE thereby discarding it.</td></tr>"
//"NOTE: Until we move the language "
//"detection up before any call to XmlDoc::set1() "
//"in Msg16.cpp, we can not use for purposes of "
//"assigning a ruleset, unless banning it.</td>"
//"</tr>"
"<tr class=poo><td><nobr>lang!=xx,en,de"
"</nobr></td>"
"<td>Matches if "
"the url's content is NOT in the language \"xx\" "
"(unknown), \"en\" or \"de\". "
"See table below for supported language "
"abbreviations.</td></tr>"
"<tr class=poo><td><nobr>parentlang==zh_cn,zh_tw,xx"
"</nobr></td>"
"<td>Matches if "
"the url's referring parent url is primarily in "
"this language. Useful for prioritizing spidering "
"pages of a certain language."
"See table below for supported language "
"abbreviations."
"</td></tr>"
/*
"<tr class=poo><td>link:gigablast</td>"
"<td>Matches if the document links to gigablast."
"</td></tr>"
"<tr class=poo><td>searchbox:gigablast</td>"
"<td>Matches if the document has a submit form "
"to gigablast."
"</td></tr>"
"<tr class=poo><td>site:dmoz</td>"
"<td>Matches if the document is directly or "
"indirectly in the DMOZ directory."
"</td></tr>"
"<tr class=poo><td>tag:spam>X</td>"
"<td>Matches if the document's tagdb record "
"has a score greater than X for the sitetype, "
"'spam' in this case. "
"Can use <, >, <=, >=, ==, != comparison operators. "
"Other sitetypes include: "
"..."
"</td></tr>"
*/
"<tr class=poo><td>iswww | !iswww</td>"
"<td>Matches if the url's hostname is www or domain "
"only. For example: <i>www.xyz.com</i> would match, "
"and so would <i>abc.com</i>, but "
"<i>foo.somesite.com</i> would NOT match."
"</td></tr>"
"<tr class=poo><td>isroot | !isroot</td>"
"<td>Matches if the URL is a root URL. Like if "
"its path is just '/'. Example: http://www.abc.com "
"is a root ur but http://www.abc.com/foo is not. "
"</td></tr>"
"<tr class=poo><td>isonsamedomain | !isonsamedomain</td>"
"<td>"
"This is true if the url is from the same "
"DOMAIN as the page from which it was "
"harvested."
//"Only effective for links being added from a page "
//"being spidered, because this information is "
//"not preserved in the titleRec."
"</td></tr>"
"<tr class=poo><td><nobr>"
"isonsamesubdomain | !isonsamesubdomain"
"</nobr></td>"
"<td>"
"This is true if the url is from the same "
"SUBDOMAIN as the page from which it was "
"harvested."
//"Only effective for links being added from a page "
//"being spidered, because this information is "
//"not preserved in the titleRec."
"</td></tr>"
"<tr class=poo><td>ismedia | !ismedia</td>"
"<td>"
"Does the url have a media or css related "
"extension. Like gif, jpg, mpeg, css, etc.? "
"</td></tr>"
"<tr class=poo><td>tag:<i>tagname</i></td>"
"<td>"
"This is true if the url is tagged with this "
"<i>tagname</i> in the site list. Read about tags "
"on the <a href=/admin/settings>"//#examples>"
"site list</a> "
"page."
"</td></tr>"
"</td></tr></table><br><br>\n",
TABLE_STYLE );
// show the languages you can use
sb->safePrintf (
"<table %s>"
"<tr><td colspan=2><center>"
"<b>"
"Supported Language Abbreviations "
"for lang== Filter</b>"
"</td></tr>",
TABLE_STYLE );
for ( int32_t i = 0 ; i < 256 ; i++ ) {
char *lang1 = getLanguageAbbr ( i );
char *lang2 = getLanguageString ( i );
if ( ! lang1 ) continue;
sb->safePrintf("<tr class=poo>"
"<td>%s</td><td>%s</td></tr>\n",
lang1,lang2);
}
// wrap it up
sb->safePrintf("</table><br><br>");
return true;
}
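// A minimal caller sketch (hypothetical handler): render the expression
// reference into a page buffer, e.g. when building the url filters page:
//
//   SafeBuf sb;
//   if ( ! printUrlExpressionExamples ( &sb ) )
//       log("parms: failed to print url expression examples");
//   // sb now holds the "Supported Expressions" and language tables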
// . copy/clone parms from one collrec to another
// . returns false and sets g_errno on error
// . if doing this after creating a new collection on host #0 we have to call
// syncParmsWithHost0() to get all the shards in sync.
bool Parms::cloneCollRec ( char *dstCR , char *srcCR ) {
	// copy each cloneable parm value from srcCR into dstCR
for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
// get it
Parm *m = &m_parms[i];
if ( m->m_obj != OBJ_COLL ) continue;
//log(LOG_DEBUG, "Parms: %s: parm: %s", filename, m->m_xml);
// . there are 2 object types, coll recs and g_conf, aka
// OBJ_COLL and OBJ_CONF.
		// skip parms not flagged for cloning, like comments
		// and commands
if ( !(m->m_flags & PF_CLONE) ) continue;
// get parm data ptr
char *src = srcCR + m->m_off;
char *dst = dstCR + m->m_off;
// if not an array use this
if ( ! m->isArray() ) {
if ( m->m_type == TYPE_SAFEBUF ) {
SafeBuf *a = (SafeBuf *)src;
SafeBuf *b = (SafeBuf *)dst;
b->reset();
b->safeMemcpy ( a );
b->nullTerm();
}
else {
// this should work for most types
gbmemcpy ( dst , src , m->m_size );
}
continue;
}
//
// arrays only below here
//
// for arrays only
int32_t *srcNum = (int32_t *)(src-4);
int32_t *dstNum = (int32_t *)(dst-4);
// array can have multiple values
for ( int32_t j = 0 ; j < *srcNum ; j++ ) {
if ( m->m_type == TYPE_SAFEBUF ) {
SafeBuf *a = (SafeBuf *)src;
SafeBuf *b = (SafeBuf *)dst;
b->reset();
b->safeMemcpy ( a );
b->nullTerm();
}
else {
// this should work for most types
gbmemcpy ( dst , src , m->m_size );
}
src += m->m_size;
dst += m->m_size;
}
// update # elements in array
*dstNum = *srcNum;
}
return true;
}
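// A minimal usage sketch (hypothetical collnums; see the comment above the
// function): clone the cloneable parms into a newly created collection,
// then sync the shards if we are host #0:
//
//   CollectionRec *src = g_collectiondb.getRec ( srcCollnum );
//   CollectionRec *dst = g_collectiondb.getRec ( dstCollnum );
//   if ( src && dst &&
//        g_parms.cloneCollRec ( (char *)dst , (char *)src ) ) {
//       dst->m_needsSave = true;
//       // then call syncParmsWithHost0() so all shards agree
//   }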