open-source-search-engine/gb.conf

376 lines
12 KiB
Plaintext
Raw Normal View History

2013-08-03 00:12:24 +04:00
# All <, >, " and # characters that are values for a field contained herein
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
# Mem available to this process. May be exceeded due to fragmentation.
<maxMem>4000000000</>
# Below the various Gigablast databases are configured.
# <*dbMaxTreeMem> - mem used for holding new recs
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
# <*dbMaxCacheMem> - cache mem for holding single recs
# <*dbSaveCache> - save the rec cache on exit?
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
# See the Stats page for record counts and stats.
# How many bytes should be used for caching DNS replies?
<dnsMaxCacheMem>128000</>
# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
# about 100 bytes or so.
<tagdbMaxTreeMem>1028000</>
<tagdbMaxPageCacheMem>200000</>
# A catdb record assigns a url or site to DMOZ categories. Each catdb record
# is about 100 bytes.
<catdbMaxTreeMem>1000000</>
<catdbMaxPageCacheMem>25000000</>
<catdbMaxCacheMem>0</>
# Clusterdb caches small records for site clustering and deduping.
<clusterdbMaxTreeMem>1000000</>
<clusterdbSaveCache>0</>
# Max memory for dup vector cache.
<maxVectorCacheMem>10000000</>
# Robotdb caches robots.txt files.
<robotdbMaxCacheMem>128000</>
<robotdbSaveCache>0</>
<linkdbMaxPageCacheMem>0</>
<statsdbMaxTreeMem>5000000</>
<statsdbMaxCacheMem>0</>
<statsdbMaxDiskPageCacheMem>1000000</>
# Maximum bytes of a doc that can be sent before having to read more from disk
<httpMaxSendBufSize>128000</>
# Bytes to use for caching search result pages.
<searchResultsMaxCacheMem>100000</>
# Read only mode does not allow spidering.
<readOnlyMode>0</>
2014-01-10 07:59:02 +04:00
# Controls all spidering for all collections
<spideringEnabled>0</>
2013-08-03 00:12:24 +04:00
# What is the maximum number of web pages the spider is allowed to download
# simultaneously for ALL collections PER HOST?
<maxTotalSpiders>100</>
2014-01-10 07:59:02 +04:00
# Can people use the add url interface to add urls to the index?
<addUrlEnabled>1</>
2014-01-10 09:13:41 +04:00
# Save data in memory to disk after this many minutes have passed without the
2014-01-10 07:59:02 +04:00
# data having been dumped or saved to disk. Use 0 to disable.
2014-01-10 08:07:19 +04:00
<autoSaveFrequency>5</>
2014-01-10 07:59:02 +04:00
# Maximum sockets available to serve incoming HTTP requests. Too many
# outstanding requests will increase query latency. Excess requests will
# simply have their sockets closed.
<maxHttpSockets>100</>
# Maximum sockets available to serve incoming HTTPS requests. Like max http
# sockets, but for secure sockets.
<maxHttpsSockets>100</>
# Identification seen by web servers when the Gigablast spider downloads their
2014-01-10 09:13:41 +04:00
# web pages. It is polite to insert a contact email address here so webmasters
2014-01-10 07:59:02 +04:00
# that experience problems from the Gigablast spider have somewhere to vent.
<spiderUserAgent><![CDATA[GigablastOpenSource/1.0]]></>
2014-01-10 09:13:41 +04:00
# If this is true, gb will send Accept-Encoding: gzip to web servers when
# doing http downloads.
2014-01-10 07:59:02 +04:00
<askForGzippedDocsWhenDownloading>0</>
2013-08-03 00:12:24 +04:00
2014-01-10 07:59:02 +04:00
# How many seconds should we cache a search results page for?
<searchResultsCacheMaxAge>10800</>
# Keep track of ips which do queries, disallow non-customers from hitting us
# too hard.
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>
2013-08-03 00:12:24 +04:00
# If a call to a message callback or message handler in the udp server takes
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
# the logging.
<maxDelayBeforeLoggingACallbackOrHandler>-1</>
# Sends emails to admin if a host goes down.
<sendEmailAlerts>0</>
2013-08-03 00:12:24 +04:00
2014-01-10 07:59:02 +04:00
# Do not send email alerts about dead hosts to anyone except
# sysadmin@gigablast.com between the times given below unless all the twins of
# the dead host are also dead. Instead, wait until that period has passed and
# send the alert then if the host is still dead.
<delayNonCriticalEmailAlerts>0</>
# Email alerts will include the cluster name
<clusterName><![CDATA[unspecified]]></>
# Send an email after a host has not responded to successive pings for this
# many milliseconds.
<sendEmailTimeout>62000</>
# Send email alerts when query success rate goes below this threshold.
# (a fraction between 0.0 and 1.0, e.g. 0.85 means 85 percent)
<querySuccessRateThreshold>0.850000</>
# Send email alerts when average query latency goes above this threshold. (in
# seconds)
<averageQueryLatencyThreshold>2.000000</>
# Record this number of query times before calculating average query latency.
2014-01-10 08:07:19 +04:00
<numberOfQueryTimesInAverage>300</>
2014-01-10 07:59:02 +04:00
# At what temperature in Celsius should we send an email alert if a hard drive
# reaches it?
<maxHardDriveTemperature>45</>
# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString1><![CDATA[]]></>
# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString2><![CDATA[]]></>
# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString3><![CDATA[]]></>
2013-08-03 00:12:24 +04:00
# Sends to email address 1 through email server 1.
2014-01-10 08:07:19 +04:00
<sendEmailAlertsToEmail1>0</>
2013-08-03 00:12:24 +04:00
# Sends to email address 1 through email server 1 if any parm is changed.
2014-01-10 08:07:19 +04:00
<sendParmChangeEmailAlertsToEmail1>0</>
2013-08-03 00:12:24 +04:00
# Connects to this server directly when sending email 1
<emailServer1><![CDATA[10.5.54.47]]></>
# Sends to this address when sending email 1
2014-01-10 08:07:19 +04:00
<emailAddress1><![CDATA[4081234567@vtext.com]]></>
2013-08-03 00:12:24 +04:00
# The from field when sending email 1
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>
# Sends to email address 2 through email server 2.
<sendEmailAlertsToEmail2>0</>
# Sends to email address 2 through email server 2 if any parm is changed.
2014-01-10 07:59:02 +04:00
<sendParmChangeEmailAlertsToEmail2>0</>
2013-08-03 00:12:24 +04:00
# Connects to this server directly when sending email 2
<emailServer2><![CDATA[mail.mydomain.com]]></>
# Sends to this address when sending email 2
<emailAddress2><![CDATA[]]></>
# The from field when sending email 2
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>
# Sends to email address 3 through email server 3.
<sendEmailAlertsToEmail3>0</>
# Sends to email address 3 through email server 3 if any parm is changed.
2014-01-10 08:07:19 +04:00
<sendParmChangeEmailAlertsToEmail3>0</>
2013-08-03 00:12:24 +04:00
# Connects to this server directly when sending email 3
<emailServer3><![CDATA[mail.mydomain.com]]></>
# Sends to this address when sending email 3
<emailAddress3><![CDATA[]]></>
# The from field when sending email 3
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>
2014-01-10 07:59:02 +04:00
# IP address of the primary DNS server. Assumes UDP port 53. REQUIRED FOR
# SPIDERING! Use Google's public DNS 8.8.8.8 as default.
2013-08-03 00:12:24 +04:00
<dns0>8.8.8.8</>
# IP address of the secondary DNS server. Assumes UDP port 53. Will be
# accessed in conjunction with the primary dns, so make sure this is always
2014-01-10 07:59:02 +04:00
# up. An ip of 0 means disabled. Google's secondary public DNS is 8.8.4.4.
2013-08-20 01:25:37 +04:00
<dns1>8.8.4.4</>
2014-01-10 07:59:02 +04:00
# All hosts send to these DNSes based on hash of the subdomain to try to split
# DNS load evenly.
2013-08-03 00:12:24 +04:00
<dns2>0.0.0.0</>
<dns3>0.0.0.0</>
<dns4>0.0.0.0</>
<dns5>0.0.0.0</>
<dns6>0.0.0.0</>
<dns7>0.0.0.0</>
<dns8>0.0.0.0</>
<dns9>0.0.0.0</>
<dns10>0.0.0.0</>
<dns11>0.0.0.0</>
<dns12>0.0.0.0</>
<dns13>0.0.0.0</>
<dns14>0.0.0.0</>
<dns15>0.0.0.0</>
# Add IPs here to bar them from accessing this gigablast server.
<banIps><![CDATA[]]></>
# Add IPs here to give them an infinite query quota.
<allowIps><![CDATA[]]></>
# Don't try to autoban queries that have one of these codes. Also, the code
# must be valid for us to use &uip=IPADDRESS as the IP address of the
# submitter for purposes of autoban AND purposes of addurl daily quotas.
<validCodes><![CDATA[]]></>
# Append extra default parms to queries that match certain substrings.
# Format: text to match in url, followed by a space, then the list of extra
# parms as they would appear appended to the url. One match per line.
<extraParms><![CDATA[]]></>
# Ban any query that matches this list of substrings. Must match all
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
<banRegex><![CDATA[]]></>
2014-02-12 11:36:09 +04:00
# Any matching password will have administrative access to Gigablast and all
# collections.
2014-02-10 03:41:43 +04:00
# Use <masterPassword> tag.
2014-02-12 11:36:09 +04:00
# Any IPs in this list will have administrative access to Gigablast and all
# collections.
# Use <masterIp> tag.
2014-02-08 11:34:45 +04:00
2013-08-03 00:12:24 +04:00
# Log GET and POST requests received from the http server?
<logHttpRequests>1</>
# Should we log queries that are autobanned? They can really fill up the log.
<logAutobannedQueries>1</>
# If a query took this many milliseconds or longer, then log the query and the
# time it took to process.
<logQueryTimeThreshold>5000</>
# Log query reply in proxy, but only for those queries above the time
# threshold above.
<logQueryReply>0</>
# Log status of spidered or injected urls?
<logSpideredUrls>1</>
# Log messages if Gigablast runs out of udp sockets?
<logNetworkCongestion>0</>
# Log messages not related to an error condition, but meant more to give an
# idea of the state of the gigablast process. These can be useful when
# diagnosing problems.
<logInformationalMessages>1</>
# Log it when a document is not added due to a quota breach. Log it when a url
# is too long and it gets truncated.
<logLimitBreeches>0</>
# Log various debug messages.
<logDebugAdminMessages>0</>
<logDebugBuildMessages>0</>
<logDebugBuildTimeMessages>0</>
<logDebugDatabaseMessages>0</>
2013-09-18 22:02:09 +04:00
<logDebugDirtyMessages>0</>
2013-08-03 00:12:24 +04:00
<logDebugDiskMessages>0</>
<logDebugDnsMessages>0</>
<logDebugHttpMessages>0</>
<logDebugLoopMessages>0</>
<logDebugLanguageDetectionMessages>0</>
<logDebugLinkInfo>0</>
<logDebugMemMessages>0</>
<logDebugMemUsageMessages>0</>
<logDebugNetMessages>0</>
<logDebugQueryMessages>0</>
<logDebugQuotaMessages>0</>
<logDebugRobotsMessages>0</>
<logDebugSpiderCacheMessages>0</>
<logDebugSpellerMessages>0</>
<logDebugSectionsMessages>0</>
2014-01-19 13:09:38 +04:00
<logDebugSeoInsertMessages>0</>
2013-08-03 00:12:24 +04:00
<logDebugSeoMessages>0</>
<logDebugStatsMessages>0</>
<logDebugSummaryMessages>0</>
<logDebugSpiderMessages>0</>
2013-08-03 00:12:24 +04:00
<logDebugUrlAttempts>0</>
<logDebugSpiderDownloads>0</>
<logDebugFacebook>0</>
<logDebugTagdbMessages>0</>
<logDebugTcpMessages>0</>
<logDebugThreadMessages>0</>
<logDebugTitleMessages>0</>
<logDebugTimedbMessages>0</>
<logDebugTopicMessages>0</>
<logDebugTopDocMessages>0</>
<logDebugUdpMessages>0</>
<logDebugUnicodeMessages>0</>
<logDebugRepairMessages>0</>
<logDebugPubDateExtractionMessages>0</>
# Log various timing related messages.
<logTimingMessagesForBuild>0</>
# Log various timing related messages.
<logTimingMessagesForAdmin>0</>
<logTimingMessagesForDatabase>0</>
<logTimingMessagesForNetworkLayer>0</>
<logTimingMessagesForQuery>0</>
# Log various timing related messages.
<logTimingMessagesForSpcache>0</>
<logTimingMessagesForRelatedTopics>0</>
# Log reminders to the programmer. You do not need this.
<logReminderMessages>0</>
# If enabled, gigablast will repair the rdbs as specified by the parameters
# below. When a particular collection is in repair mode, it can not spider or
# merge titledb files.
<repairModeEnabled>0</>
# Comma or space separated list of the collections to repair or rebuild.
<collectionsToRepairOrRebuild><![CDATA[main]]></>
# In bytes.
<memoryToUseForRepair>300000000</>
# Maximum number of outstanding inject spiders for repair.
<maxRepairSpiders>32</>
# If enabled, gigablast will reinject the content of all title recs into a
# secondary rdb system. That will become the primary rdb system when complete.
<fullRebuild>0</>
# If enabled, gigablast will keep the new spiderdb records when doing the full
# rebuild or the spiderdb rebuild.
<keepNewSpiderdbRecs>1</>
# If enabled, gigablast will recycle the link info when rebuilding titledb.
<recycleLinkInfo>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildTitledb>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildPosdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildClusterdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildSpiderdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildLinkdb>0</>
# If disabled, gigablast will skip root urls.
<rebuildRootUrls>1</>
# If disabled, gigablast will skip non-root urls.
<rebuildNonrootUrls>1</>
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
# tagdb lookup be performed? Runs much much faster without it. Will also keep
# the original doc quality and spider priority intact.
<skipTagdbLookup>0</>