open-source-search-engine/gb.conf
2013-10-16 16:24:08 -06:00

700 lines
24 KiB
Plaintext

# All <, >, " and # characters that are values for a field contained herein
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
# Mem available to this process. May be exceeded due to fragmentation.
<maxMem>4000000000</>
# Below the various Gigablast databases are configured.
# <*dbMaxTreeMem> - mem used for holding new recs
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
# <*dbMaxCacheMem> - cache mem for holding single recs
# <*dbSaveCache> - save the rec cache on exit?
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
# See the Stats page for record counts and stats.
# How many bytes should be used for caching DNS replies?
<dnsMaxCacheMem>128000</>
# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
# about 100 bytes or so.
<tagdbMaxTreeMem>1028000</>
<tagdbMaxPageCacheMem>200000</>
# A catdb record assigns a url or site to DMOZ categories. Each catdb record
# is about 100 bytes.
<catdbMaxTreeMem>1000000</>
<catdbMaxPageCacheMem>25000000</>
<catdbMaxCacheMem>0</>
# Clusterdb caches small records for site clustering and deduping.
<clusterdbMaxTreeMem>1000000</>
<clusterdbSaveCache>0</>
# Max memory for dup vector cache.
<maxVectorCacheMem>10000000</>
# Robotdb caches robots.txt files.
<robotdbMaxCacheMem>128000</>
<robotdbSaveCache>0</>
<linkdbMaxPageCacheMem>0</>
<statsdbMaxTreeMem>5000000</>
<statsdbMaxCacheMem>0</>
<statsdbMaxDiskPageCacheMem>1000000</>
# Maximum bytes of a doc that can be sent before having to read more from disk
<httpMaxSendBufSize>128000</>
# Bytes to use for caching search result pages.
<searchResultsMaxCacheMem>100000</>
# Read only mode does not allow spidering.
<readOnlyMode>0</>
# Spell check using the dictionary.
<doSpellChecking>0</>
# Give narrow search suggestions.
<doNarrowSearch>0</>
# Overrides all spidering for all collections on just this host.
<localSpideringEnabled>0</>
# Overrides all add urls for all collections on just this host.
<localAddUrlEnabled>1</>
# Used by proxy to point to a temporary cluster while the original cluster is
# updated with a new binary. The temporary cluster is the same as the original
# cluster but the ports are all incremented by one from what is in the
# hosts.conf. This should ONLY be used for the proxy.
<useTemporaryCluster>0</>
# If enabled gb does the search queries in ./test-search/queries.txt and
# compares to the last run and outputs the diffs for inspection and validation.
<qaSearchTestEnabled>1</>
# Enable spidering on all hosts
<allSpidersOn>0</>
# Disable spidering on all hosts
<allSpidersOff>0</>
# Serves ads unless pure=1 is in cgi parms.
<adFeedEnabled>0</>
# Stripe #n contains twin #n from each group. Doing stripe balancing helps
# prevent too many query requests coming into one host. This parm is only for
# the proxy. Stripe balancing is done by default unless the parm is disabled
# on the proxy in which case it appends a &dsb=0 to the query url it sends to
# the host. The proxy alternates to which host it forwards the incoming query
# based on the stripe. It takes the number of query terms in the query into
# account to make a more even balance.
<doStripeBalancing>1</>
# Is this cluster part of a live production cluster? If this is true we make
# sure that elvtune is being set properly for best performance, otherwise, gb
# will not start up.
<isLiveCluster>0</>
# Is this cluster just used for indexing wikipedia pages?
<isWikipediaCluster>0</>
# At what temperature in Celsius should we send an email alert if a hard drive
# reaches it?
<maxHardDriveTemperature>45</>
# If a heartbeat is delayed this many milliseconds dump a core so we can see
# where the CPU was. Logs 'db: missed heartbeat by %lli ms'. Use 0 or less to
# disable.
<maxHeartbeatDelayInMilliseconds>0</>
# If a call to a message callback or message handler in the udp server takes
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
# the logging.
<maxDelayBeforeLoggingACallbackOrHandler>-1</>
# Sends emails to admin if a host goes down.
<sendEmailAlerts>1</>
# Sends to sysadmin@gigablast.com.
<sendEmailAlertsToSysadmin>0</>
# Sends to email address 1 through email server 1.
<sendEmailAlertsToEmail1>1</>
# Sends to email address 1 through email server 1 if any parm is changed.
<sendParmChangeEmailAlertsToEmail1>1</>
# Connects to this server directly when sending email 1
<emailServer1><![CDATA[10.5.54.47]]></>
# Sends to this address when sending email 1
<emailAddress1><![CDATA[5051234567@vtext.com]]></>
# The from field when sending email 1
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>
# Sends to email address 2 through email server 2.
<sendEmailAlertsToEmail2>0</>
# Sends to email address 2 through email server 2 if any parm is changed.
<sendParmChangeEmailAlertsToEmail2>1</>
# Connects to this server directly when sending email 2
<emailServer2><![CDATA[mail.mydomain.com]]></>
# Sends to this address when sending email 2
<emailAddress2><![CDATA[]]></>
# The from field when sending email 2
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>
# Sends to email address 3 through email server 3.
<sendEmailAlertsToEmail3>0</>
# Sends to email address 3 through email server 3 if any parm is changed.
<sendParmChangeEmailAlertsToEmail3>1</>
# Connects to this server directly when sending email 3
<emailServer3><![CDATA[mail.mydomain.com]]></>
# Sends to this address when sending email 3
<emailAddress3><![CDATA[]]></>
# The from field when sending email 3
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>
# Sends to email address 4 through email server 4.
<sendEmailAlertsToEmail4>0</>
# Sends to email address 4 through email server 4 if any parm is changed.
<sendParmChangeEmailAlertsToEmail4>1</>
# Connects to this server directly when sending email 4
<emailServer4><![CDATA[mail.mydomain.com]]></>
# Sends to this address when sending email 4
<emailAddress4><![CDATA[]]></>
# The from field when sending email 4
<fromEmailAddress4><![CDATA[sysadmin@mydomain.com]]></>
# Do not send email alerts about dead hosts to anyone except
# sysadmin@gigablast.com between the times given below unless all the twins of
# the dead host are also dead. Instead, wait until after that time window and
# send the alert then if the host is still dead.
<delayNonCriticalEmailAlerts>0</>
# Look for this string in the kernel buffer for sending email
<errorString1><![CDATA[]]></>
# Look for this string in the kernel buffer for sending email
<errorString2><![CDATA[]]></>
# Look for this string in the kernel buffer for sending email
<errorString3><![CDATA[]]></>
# If you have scsi drives or a slow network, say yes here to minimize data
# fetches across the network.
<preferLocalReads>0</>
# If enabled then all writes will be flushed to disk. This is generally a good
# thing.
<doSynchronousWrites>1</>
# Read what was written in a verification step. Decreases performance, but may
# help fight disk corruption mostly on Maxtors and Western Digitals.
<verifyDiskWrites>0</>
# When reindexing a document, do not re-add data that should already be in
# index or clusterdb since the last time the document was indexed. Otherwise,
# re-add the data regardless.
<doIncrementalUpdating>0</>
# Use /etc/hosts file to resolve hostnames? The /etc/hosts file is reloaded
# every minute, so if you make a change to it you might have to wait one
# minute for the change to take effect.
<useEtcHosts>0</>
# If enabled, Gigablast assumes the first half of machines in hosts.conf are
# on a different network switch than the second half, and minimizes transmits
# between the switches.
<twinsAreSplit>0</>
# When enabled Gigablast will randomly fail at allocating memory. Used for
# testing stability.
<doOutOfMemoryTesting>0</>
# When enabled Gigablast will make sure it reparses the document exactly the
# same way. It does this every 1000th document anyway, but enabling this makes
# it do it for every document.
<doConsistencyTesting>0</>
# If enabled, all servers must have two gigabit ethernet ports hooked up and
# Gigablast will round robin packets between both ethernet ports when sending
# to another host. Can speed up network transmissions as much as 2x.
<useShotgun>0</>
# If enabled, Gigablast will use quickpoll. Significantly improves
# performance. Only turn this off for testing.
<useQuickpoll>1</>
# If enabled, Gigablast will use threads.
<useThreads>1</>
# If enabled, Gigablast will use shared memory. Should really only be used on
# the live cluster; keep this off on the testing cluster since it can leak
# easily.
<useSharedMem>0</>
# Use disk page cache?
<useDiskPageCacheForPosdb>1</>
# Use disk page cache?
<useDiskPageCacheForDatedb>1</>
# Use disk page cache?
<useDiskPageCacheForTitledb>1</>
# Use disk page cache?
<useDiskPageCacheForSpiderdb>1</>
# Use disk page cache?
<useDiskPageCacheForTagdb>1</>
# Use disk page cache?
<useDiskPageCacheForChecksumdb>1</>
# Use disk page cache?
<useDiskPageCacheForClusterdb>1</>
# Use disk page cache?
<useDiskPageCacheForCatdb>1</>
# Use disk page cache?
<useDiskPageCacheForLinkdb>1</>
# Scan all titledb files if rec not found. You should keep this on to avoid
# corruption. Do not turn it off unless you are Matt Wells.
<scanAllIfNotFound>1</>
# For specifying if this is an interface machine. Messages are rerouted from
# this machine to the main cluster set in the hosts.conf.
<interfaceMachine>0</>
# At query time, should Gigablast generate content vectors for title records
# lacking them? This is an expensive operation, so is really just for testing
# purposes.
<generateVectorAtQueryTime>0</>
# Keep track of ips which do queries, disallow non-customers from hitting us
# too hard.
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>
# Non-customers get this many queries per day before being autobanned
<freeQueriesPerDay>1024</>
# Non-customers get this many queries per minute before being autobanned
<freeQueriesPerMinute>30</>
# If this is non empty, http traffic will be redirected to the specified
# address.
<redirectNonrawTraffic><![CDATA[]]></>
# If this is true, gb will route download requests for web pages to proxies in
# hosts.conf. Proxies will download and compress docs before sending back.
<sendRequestsToCompressionProxy>0</>
# Enable/disable the ability to synchronize time between the cluster and the
# proxy
<synchronizeProxyToClusterTime>0</>
# Allows scaling up of hosts by deleting recs not in the correct group. This
# should only happen when copying a set of servers to the new hosts. Otherwise
# corrupted data will cause a halt.
<allowScalingOfHosts>0</>
# Allows bypass of db validation so gigablast will not halt if a corrupt db is
# discovered during load. Use this when attempting to load with a collection
# that has known corruption.
<allowBypassOfDbValidation>0</>
# IP address of the primary DNS server. Assumes UDP port 53.
<dns0>8.8.8.8</>
# IP address of the secondary DNS server. Assumes UDP port 53. Will be
# accessed in conjunction with the primary dns, so make sure this is always
# up. An ip of 0 means disabled.
<dns1>8.8.4.4</>
<dns2>0.0.0.0</>
<dns3>0.0.0.0</>
<dns4>0.0.0.0</>
<dns5>0.0.0.0</>
<dns6>0.0.0.0</>
<dns7>0.0.0.0</>
<dns8>0.0.0.0</>
<dns9>0.0.0.0</>
<dns10>0.0.0.0</>
<dns11>0.0.0.0</>
<dns12>0.0.0.0</>
<dns13>0.0.0.0</>
<dns14>0.0.0.0</>
<dns15>0.0.0.0</>
<geocoderIP1>10.5.66.11</>
<geocoderIP2>0.0.0.0</>
<geocoderIP3>0.0.0.0</>
<geocoderIP4>0.0.0.0</>
# Access the wiki coll through this proxy ip
<wikiProxyIp>0.0.0.0</>
# Access the wiki coll through this proxy port
<wikiProxyPort>0</>
# Email alerts will include the cluster name
<clusterName><![CDATA[unspecified]]></>
# Identification seen by web servers when the Gigablast spider downloads their
# web pages. It is polite to insert a contact email address here so webmasters
# that experience problems from the Gigablast spider have somewhere to vent.
<spiderUserAgent><![CDATA[GigablastOpenSource/1]]></>
# If this is true, gb will send accept-encoding: gzip when doing http downloads.
<askForGzippedDocsWhenDownloading>0</>
# When no collection is explicitly specified, assume this collection name.
<defaultCollection><![CDATA[main]]></>
# Collection to be used for directory searching and display of directory topic
# pages.
<directoryCollection><![CDATA[]]></>
# Hostname of the server providing the directory. Leave empty to use this host.
<directoryHostname><![CDATA[]]></>
# Total incoming bandwidth used by all spiders should not exceed this many
# kilobits per second.
<maxIncomingBandwidthForSpider>999999.000</>
# Spiders will shed load when their host exceeds this value for the 1-minute
# load average in /proc/loadavg. The value 0.0 disables this feature.
<max1minuteSlidingwindowLoadavg>0.000</>
# Maximum number of threads to use per Gigablast process for intersecting
# docid lists. Generally, set this to the number of CPUs on the machine.
<maxCpuThreads>1</>
# Maximum number of pages to index or delete from index per second for all
# hosts combined.
<maxPagesPerSecond>999999.000</>
# Consider a host in the Gigablast network to be dead if it does not respond
# to successive pings for this number of seconds. Gigablast does not send
# requests to dead hosts. Outstanding requests may be re-routed to a twin.
<deadHostTimeout>4000</>
# Send an email after a host has not responded to successive pings for this
# many milliseconds.
<sendEmailTimeout>62000</>
# Wait this many milliseconds before pinging the next host. Each host pings
# all other hosts in the network.
<pingSpacer>100</>
# Send email alerts when average query latency goes above this threshold.
<averageQueryLatencyThreshold>2.000</>
# Send email alerts when query success rate goes below this threshold.
<querySuccessRateThreshold>0.850</>
# Record this number of query times before calculating average query latency.
<numberOfQueryTimesInAverage>3000</>
# If we reach this many corrupt index lists, send an admin email. Set to -1
# to disable.
<maxCorruptIndexLists>5</>
# Maximum number of threads to use per Gigablast process for writing data to
# the disk. Keep low to reduce file interlace effects and impact on query
# response time.
<maxWriteThreads>1</>
# Maximum number of threads to use per Gigablast process for accessing the
# disk for index-building purposes. Keep low to reduce impact on query
# response time. Increase for RAID systems or when initially building an index.
<maxSpiderReadThreads>7</>
# This particular number applies to all reads above 1MB.
<maxSpiderBigReadThreads>3</>
# This particular number applies to all reads above 100K.
<maxSpiderMediumReadThreads>4</>
# This particular number applies to small reads (presumably 100K or less).
<maxSpiderSmallReadThreads>5</>
# Maximum number of threads to use per Gigablast process for accessing the
# disk for querying purposes. IDE systems tend to be more responsive when this
# is low. Increase for SCSI or RAID systems.
<maxQueryReadThreads>20</>
# This particular number applies to all reads above 1MB.
<maxQueryBigReadThreads>20</>
# This particular number applies to all reads above 100K.
<maxQueryMediumReadThreads>20</>
# This particular number applies to small reads (presumably 100K or less).
<maxQuerySmallReadThreads>20</>
# Word or phrase must be present in this percent of documents in order to
# qualify as a spelling recommendation.
<minPopularityForSpeller>0.010</>
# Percent to weight phrases in queries.
<phraseWeight>100.000</>
# Percent of how much to use words to phrase ratio weights.
<weightscppSliderParmtmp>90</>
# When passing queries around the network, send the raw string instead of the
# serialized query if the required buffer is bigger than this. Smaller values
# decrease network traffic for large queries at the expense of processing time.
<maximumSerializedQuerySize>8192</>
# Read and write this many bytes at a time when merging files. Smaller values
# are kinder to query performance, but the merge takes longer. Use at least
# 1000000 for fast merging.
<mergeBufSize>800000</>
# minRecSizes for Catdb lookups
<catdbMinRecSizes>100000000</>
# Maximum sockets available to serve incoming HTTP requests. Too many
# outstanding requests will increase query latency. Excess requests will
# simply have their sockets closed.
<maxHttpSockets>100</>
# Maximum sockets available to serve incoming HTTPS requests. Like max http
# sockets, but for secure sockets.
<maxHttpsSockets>100</>
# Copy data in memory to disk after this many minutes have passed without the
# data having been dumped or saved to disk. Use 0 to disable.
<autoSaveFrequency>5</>
# Add this number to the total document count in the index. Just used for
# displaying on the homepage.
<docCountAdjustment>0</>
# Generates profiling data for callbacks on page performance
<dynamicPerformanceGraph>0</>
# Enable profiler to do accounting of time taken by functions.
<enableProfiling>1</>
# Profiler will not show functions which take less than this many milliseconds
# in the log or on the performance graph.
<minimumProfilingThreshold>10</>
# Produce a LOG_TIMING log message for each callback called, along with the
# time it took. Profiler must be enabled.
<sequentialProfiling>0</>
# Archive system statistics information in Statsdb.
<useStatsdb>1</>
# How many seconds should we cache a search results page for?
<searchResultsCacheMaxAge>10800</>
# Add IPs here to bar them from accessing this gigablast server.
<banIps><![CDATA[]]></>
# Add IPs here to give them an infinite query quota.
<allowIps><![CDATA[]]></>
# Don't try to autoban queries that have one of these codes. Also, the code
# must be valid for us to use &uip=IPADDRESS as the IP address of the
# submitter for purposes of autoban AND purposes of addurl daily quotas.
<validCodes><![CDATA[]]></>
# Append extra default parms to queries that match certain substrings.
# Format: text to match in url, followed by a space, then the list of extra
# parms as they would appear appended to the url. One match per line.
<extraParms><![CDATA[]]></>
# Ban any query that matches this list of substrings. Must match all
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
<banRegex><![CDATA[]]></>
# Add facebook user IDs here so those people can turk the results. Later we
# may limit each person to turking a geographic region.
<supterturks><![CDATA[]]></>
# Add users here. The format is
# collection:ip:username:password:relogin:pages:tagnames Username and password
# cannot be blank. You can specify * for collection to indicate all
# collections. * can be used in IP as wildcard. * in pages means user has
# access to all pages. Also you can specify individual pages. A '-' sign at
# the start of page means user is not allowed to access that page. Please
# refer to the page reference table at the bottom of this page for available
# pages. If you want to just login once and avoid relogin for gb shutdowns
# then set relogin=1, else set it to 0. If relogin is 1 your login will never
# expire either. Examples:
# 1. master user -> *:*:master:master:1:*:english
# 2. public user -> *:*:public:1234:0:index.html,get,search,login,dir:english
# 3. turk user ->
# 8.58.122:main:turk:1234:0:pageturkhome,pageturk,pageturkget,get,login:english
<users><![CDATA[*:*:mwells:mwells62:1:*:
*:*:public:1234:0:index.html,get,search,login,dir:]]></>
# Allow UDP requests from this list of IPs. Any datagram received not coming
# from one of these IPs, or an IP in hosts.conf, is dropped. If another
# cluster is accessing this cluster for getting link text or whatever, you
# will need to list the IPs of the accessing machines here. These IPs are also
# used to allow access to the HTTP server even if it was disabled in the
# Master Controls. IPs that have 0 as their least significant byte are
# treated as wildcards for IP blocks. That is, 1.2.3.0 means 1.2.3.*.
<connectIp>10.5.0.3</>
# Log GET and POST requests received from the http server?
<logHttpRequests>1</>
# Should we log queries that are autobanned? They can really fill up the log.
<logAutobannedQueries>1</>
# If query took this many milliseconds or longer, then log the query and the
# time it took to process.
<logQueryTimeThreshold>5000</>
# Log query reply in proxy, but only for those queries above the time
# threshold above.
<logQueryReply>0</>
# Log status of spidered or injected urls?
<logSpideredUrls>1</>
# Log messages if Gigablast runs out of udp sockets?
<logNetworkCongestion>0</>
# Log messages not related to an error condition, but meant more to give an
# idea of the state of the gigablast process. These can be useful when
# diagnosing problems.
<logInformationalMessages>1</>
# Log it when document not added due to quota breach. Log it when url is too
# long and it gets truncated.
<logLimitBreeches>0</>
# Log various debug messages.
<logDebugAdminMessages>0</>
<logDebugBuildMessages>0</>
<logDebugBuildTimeMessages>0</>
<logDebugDatabaseMessages>0</>
<logDebugDirtyMessages>0</>
<logDebugDiskMessages>0</>
<logDebugDnsMessages>0</>
<logDebugHttpMessages>0</>
<logDebugLoopMessages>0</>
<logDebugLanguageDetectionMessages>0</>
<logDebugLinkInfo>0</>
<logDebugMemMessages>0</>
<logDebugMemUsageMessages>0</>
<logDebugNetMessages>0</>
<logDebugPostQueryRerankMessages>0</>
<logDebugQueryMessages>0</>
<logDebugQuotaMessages>0</>
<logDebugRobotsMessages>0</>
<logDebugSpiderCacheMessages>0</>
<logDebugSpellerMessages>0</>
<logDebugSectionsMessages>0</>
<logDebugSeoInsertMessages>1</>
<logDebugSeoMessages>0</>
<logDebugStatsMessages>0</>
<logDebugSummaryMessages>0</>
<logDebugSpiderMessages>0</>
<logDebugUrlAttempts>0</>
<logDebugSpiderDownloads>0</>
<logDebugFacebook>0</>
<logDebugTagdbMessages>0</>
<logDebugTcpMessages>0</>
<logDebugThreadMessages>0</>
<logDebugTitleMessages>0</>
<logDebugTimedbMessages>0</>
<logDebugTopicMessages>0</>
<logDebugTopDocMessages>0</>
<logDebugUdpMessages>0</>
<logDebugUnicodeMessages>0</>
<logDebugRepairMessages>0</>
<logDebugPubDateExtractionMessages>0</>
# Log various timing related messages.
<logTimingMessagesForBuild>0</>
# Log various timing related messages.
<logTimingMessagesForAdmin>0</>
<logTimingMessagesForDatabase>0</>
<logTimingMessagesForNetworkLayer>0</>
<logTimingMessagesForQuery>0</>
# Log various timing related messages.
<logTimingMessagesForSpcache>0</>
<logTimingMessagesForRelatedTopics>0</>
# Log reminders to the programmer. You do not need this.
<logReminderMessages>0</>
# If enabled, gigablast will repair the rdbs as specified by the parameters
# below. When a particular collection is in repair mode, it can not spider or
# merge titledb files.
<repairModeEnabled>0</>
# Comma or space separated list of the collections to repair or rebuild.
<collectionsToRepairOrRebuild><![CDATA[main]]></>
# In bytes.
<memoryToUseForRepair>300000000</>
# Maximum number of outstanding inject spiders for repair.
<maxRepairSpiders>32</>
# If enabled, gigablast will reinject the content of all title recs into a
# secondary rdb system. That will become the primary rdb system when complete.
<fullRebuild>0</>
# If enabled, gigablast will keep the new spiderdb records when doing the full
# rebuild or the spiderdb rebuild.
<keepNewSpiderdbRecs>1</>
# If enabled, gigablast will recycle the link info when rebuilding titledb.
<recycleLinkInfo>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildTitledb>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildPosdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildClusterdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildSpiderdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildLinkdb>0</>
# If disabled, gigablast will skip root urls.
<rebuildRootUrls>1</>
# If disabled, gigablast will skip non-root urls.
<rebuildNonrootUrls>1</>
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
# tagdb lookup be performed? Runs much much faster without it. Will also keep
# the original doc quality and spider priority intact.
<skipTagdbLookup>0</>