mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
382 lines
13 KiB
Plaintext
382 lines
13 KiB
Plaintext
# All <, >, " and # characters that are values for a field contained herein
|
|
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
|
|
|
|
# Mem available to this process. May be exceeded due to fragmentation.
|
|
<maxMem>4000000000</>
|
|
|
|
# Below the various Gigablast databases are configured.
|
|
# <*dbMaxTreeMem> - mem used for holding new recs
|
|
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
|
|
# <*dbMaxCacheMem> - cache mem for holding single recs
|
|
# <*dbSaveCache> - save the rec cache on exit?
|
|
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
|
|
# See the Stats page for record counts and stats.
|
|
|
|
# How many bytes should be used for caching DNS replies?
|
|
<dnsMaxCacheMem>128000</>
|
|
|
|
# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
|
|
# about 100 bytes or so.
|
|
<tagdbMaxTreeMem>1028000</>
|
|
<tagdbMaxPageCacheMem>200000</>
|
|
|
|
# A catdb record assigns a url or site to DMOZ categories. Each catdb record
|
|
# is about 100 bytes.
|
|
<catdbMaxTreeMem>1000000</>
|
|
<catdbMaxPageCacheMem>25000000</>
|
|
<catdbMaxCacheMem>0</>
|
|
|
|
# Clusterdb caches small records for site clustering and deduping.
|
|
<clusterdbMaxTreeMem>1000000</>
|
|
<clusterdbSaveCache>0</>
|
|
|
|
# Max memory for dup vector cache.
|
|
<maxVectorCacheMem>10000000</>
|
|
|
|
# Robotdb caches robots.txt files.
|
|
<robotdbMaxCacheMem>128000</>
|
|
<robotdbSaveCache>0</>
|
|
<linkdbMaxPageCacheMem>0</>
|
|
<statsdbMaxTreeMem>5000000</>
|
|
<statsdbMaxCacheMem>0</>
|
|
<statsdbMaxDiskPageCacheMem>1000000</>
|
|
|
|
# Maximum bytes of a doc that can be sent before having to read more from disk
|
|
<httpMaxSendBufSize>128000</>
|
|
|
|
# Bytes to use for caching search result pages.
|
|
<searchResultsMaxCacheMem>100000</>
|
|
|
|
# Read only mode does not allow spidering.
|
|
<readOnlyMode>0</>
|
|
|
|
# Controls all spidering for all collections
|
|
<spideringEnabled>1</>
|
|
|
|
# What is the maximum number of web pages the spider is allowed to download
|
|
# simultaneously for ALL collections PER HOST?
|
|
<maxTotalSpiders>100</>
|
|
|
|
# Can people use the add url interface to add urls to the index?
|
|
<addUrlEnabled>1</>
|
|
|
|
# Save data in memory to disk after this many minutes have passed without the
|
|
# data having been dumped or saved to disk. Use 0 to disable.
|
|
<autoSaveFrequency>5</>
|
|
|
|
# Maximum sockets available to serve incoming HTTP requests. Too many
|
|
# outstanding requests will increase query latency. Excess requests will
|
|
# simply have their sockets closed.
|
|
<maxHttpSockets>100</>
|
|
|
|
# Maximum sockets available to serve incoming HTTPS requests. Like max http
|
|
# sockets, but for secure sockets.
|
|
<maxHttpsSockets>100</>
|
|
|
|
# Identification seen by web servers when the Gigablast spider downloads their
|
|
# web pages. It is polite to insert a contact email address here so webmasters
|
|
# that experience problems from the Gigablast spider have somewhere to vent.
|
|
<spiderUserAgent><![CDATA[GigablastOpenSource/1.0]]></>
|
|
|
|
# If this is true, gb will send Accept-Encoding: gzip to web servers when
|
|
# doing http downloads.
|
|
<askForGzippedDocsWhenDownloading>0</>
|
|
|
|
# How many seconds should we cache a search results page for?
|
|
<searchResultsCacheMaxAge>10800</>
|
|
|
|
# Keep track of ips which do queries, disallow non-customers from hitting us
|
|
# too hard.
|
|
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>
|
|
|
|
# If a call to a message callback or message handler in the udp server takes
|
|
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
|
|
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
|
|
# the logging.
|
|
<maxDelayBeforeLoggingACallbackOrHandler>-1</>
|
|
|
|
# Sends emails to admin if a host goes down.
|
|
<sendEmailAlerts>0</>
|
|
|
|
# Do not send email alerts about dead hosts to anyone except
|
|
# sysadmin@gigablast.com between the times given below unless all the twins of
|
|
# the dead host are also dead. Instead, wait until after that window if the host is still
|
|
# dead.
|
|
<delayNonCriticalEmailAlerts>0</>
|
|
|
|
# Email alerts will include the cluster name
|
|
<clusterName><![CDATA[unspecified]]></>
|
|
|
|
# Send an email after a host has not responded to successive pings for this
|
|
# many milliseconds.
|
|
<sendEmailTimeout>62000</>
|
|
|
|
# Send email alerts when query success rate goes below this threshold.
|
|
# (success rate expressed as a fraction between 0.0 and 1.0)
|
|
<querySuccessRateThreshold>0.850000</>
|
|
|
|
# Send email alerts when average query latency goes above this threshold. (in
|
|
# seconds)
|
|
<averageQueryLatencyThreshold>2.000000</>
|
|
|
|
# Record this number of query times before calculating average query latency.
|
|
<numberOfQueryTimesInAverage>300</>
|
|
|
|
# At what temperature in Celsius should we send an email alert if a hard drive
|
|
# reaches it?
|
|
<maxHardDriveTemperature>45</>
|
|
|
|
# Look for this string in the kernel buffer for sending email alert. Useful
|
|
# for detecting some strange hard drive failures that really slow performance.
|
|
<errorString1><![CDATA[]]></>
|
|
|
|
# Look for this string in the kernel buffer for sending email alert. Useful
|
|
# for detecting some strange hard drive failures that really slow performance.
|
|
<errorString2><![CDATA[]]></>
|
|
|
|
# Look for this string in the kernel buffer for sending email alert. Useful
|
|
# for detecting some strange hard drive failures that really slow performance.
|
|
<errorString3><![CDATA[]]></>
|
|
|
|
# Sends to email address 1 through email server 1.
|
|
<sendEmailAlertsToEmail1>0</>
|
|
|
|
# Sends to email address 1 through email server 1 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail1>0</>
|
|
|
|
# Connects to this IP or hostname directly when sending email 1. Use
|
|
# <i>apt-get install sendmail</i> to install sendmail on that IP or hostname.
|
|
# Add <i>From:10.5 RELAY</i> to /etc/mail/access to allow sendmail to forward
|
|
# email it receives from gigablast if gigablast hosts are on the 10.5.*.* IPs.
|
|
# Then run <i>/etc/init.d/sendmail restart</i> as root to pick up those
|
|
# changes so sendmail will forward Gigablast's mail to the address you give
|
|
# below.
|
|
<emailServer1><![CDATA[10.5.54.47]]></>
|
|
|
|
# Sends to this address when sending email 1
|
|
<emailAddress1><![CDATA[4081234567@vtext.com]]></>
|
|
|
|
# The from field when sending email 1
|
|
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 2 through email server 2.
|
|
<sendEmailAlertsToEmail2>0</>
|
|
|
|
# Sends to email address 2 through email server 2 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail2>0</>
|
|
|
|
# Connects to this server directly when sending email 2
|
|
<emailServer2><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 2
|
|
<emailAddress2><![CDATA[]]></>
|
|
|
|
# The from field when sending email 2
|
|
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 3 through email server 3.
|
|
<sendEmailAlertsToEmail3>0</>
|
|
|
|
# Sends to email address 3 through email server 3 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail3>0</>
|
|
|
|
# Connects to this server directly when sending email 3
|
|
<emailServer3><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 3
|
|
<emailAddress3><![CDATA[]]></>
|
|
|
|
# The from field when sending email 3
|
|
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# IP address of the primary DNS server. Assumes UDP port 53. REQUIRED FOR
|
|
# SPIDERING! Use Google's public DNS 8.8.8.8 as default.
|
|
<dns0>8.8.8.8</>
|
|
|
|
# IP address of the secondary DNS server. Assumes UDP port 53. Will be
|
|
# accessed in conjunction with the primary dns, so make sure this is always
|
|
# up. An ip of 0 means disabled. Google's secondary public DNS is 8.8.4.4.
|
|
<dns1>8.8.4.4</>
|
|
|
|
# All hosts send to these DNSes based on hash of the subdomain to try to split
|
|
# DNS load evenly.
|
|
<dns2>0.0.0.0</>
|
|
<dns3>0.0.0.0</>
|
|
<dns4>0.0.0.0</>
|
|
<dns5>0.0.0.0</>
|
|
<dns6>0.0.0.0</>
|
|
<dns7>0.0.0.0</>
|
|
<dns8>0.0.0.0</>
|
|
<dns9>0.0.0.0</>
|
|
<dns10>0.0.0.0</>
|
|
<dns11>0.0.0.0</>
|
|
<dns12>0.0.0.0</>
|
|
<dns13>0.0.0.0</>
|
|
<dns14>0.0.0.0</>
|
|
<dns15>0.0.0.0</>
|
|
|
|
# Add IPs here to bar them from accessing this gigablast server.
|
|
<banIps><![CDATA[]]></>
|
|
|
|
# Add IPs here to give them an infinite query quota.
|
|
<allowIps><![CDATA[]]></>
|
|
|
|
# Don't try to autoban queries that have one of these codes. Also, the code
|
|
# must be valid for us to use &uip=IPADDRESS as the IP address of the
|
|
# submitter for purposes of autoban AND purposes of addurl daily quotas.
|
|
<validCodes><![CDATA[]]></>
|
|
|
|
# Append extra default parms to queries that match certain substrings.
|
|
# Format: text to match in url, followed by a space, then the list of extra
|
|
# parms as they would appear appended to the url. One match per line.
|
|
<extraParms><![CDATA[]]></>
|
|
|
|
# Ban any query that matches this list of substrings. Must match all
|
|
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
|
|
<banRegex><![CDATA[]]></>
|
|
|
|
# Any matching password will have administrative access to Gigablast and all
|
|
# collections.
|
|
# Use <masterPassword> tag.
|
|
|
|
# Any IPs in this list will have administrative access to Gigablast and all
|
|
# collections.
|
|
# Use <masterIp> tag.
|
|
|
|
# Log GET and POST requests received from the http server?
|
|
<logHttpRequests>1</>
|
|
|
|
# Should we log queries that are autobanned? They can really fill up the log.
|
|
<logAutobannedQueries>1</>
|
|
|
|
# If query took this many milliseconds or longer, then log the query and the
|
|
# time it took to process.
|
|
<logQueryTimeThreshold>5000</>
|
|
|
|
# Log query reply in proxy, but only for those queries above the time
|
|
# threshold above.
|
|
<logQueryReply>0</>
|
|
|
|
# Log status of spidered or injected urls?
|
|
<logSpideredUrls>1</>
|
|
|
|
# Log messages if Gigablast runs out of udp sockets?
|
|
<logNetworkCongestion>0</>
|
|
|
|
# Log messages not related to an error condition, but meant more to give an
|
|
# idea of the state of the gigablast process. These can be useful when
|
|
# diagnosing problems.
|
|
<logInformationalMessages>1</>
|
|
|
|
# Log it when document not added due to quota breach. Log it when url is too
|
|
# long and it gets truncated.
|
|
<logLimitBreeches>0</>
|
|
|
|
# Log various debug messages.
|
|
<logDebugAdminMessages>0</>
|
|
<logDebugBuildMessages>0</>
|
|
<logDebugBuildTimeMessages>0</>
|
|
<logDebugDatabaseMessages>0</>
|
|
<logDebugDirtyMessages>0</>
|
|
<logDebugDiskMessages>0</>
|
|
<logDebugDnsMessages>0</>
|
|
<logDebugHttpMessages>0</>
|
|
<logDebugLoopMessages>0</>
|
|
<logDebugLanguageDetectionMessages>0</>
|
|
<logDebugLinkInfo>0</>
|
|
<logDebugMemMessages>0</>
|
|
<logDebugMemUsageMessages>0</>
|
|
<logDebugNetMessages>0</>
|
|
<logDebugQueryMessages>0</>
|
|
<logDebugQuotaMessages>0</>
|
|
<logDebugRobotsMessages>0</>
|
|
<logDebugSpiderCacheMessages>0</>
|
|
<logDebugSpellerMessages>0</>
|
|
<logDebugSectionsMessages>0</>
|
|
<logDebugSeoInsertMessages>0</>
|
|
<logDebugSeoMessages>0</>
|
|
<logDebugStatsMessages>0</>
|
|
<logDebugSummaryMessages>0</>
|
|
<logDebugSpiderMessages>0</>
|
|
<logDebugUrlAttempts>0</>
|
|
<logDebugSpiderDownloads>0</>
|
|
<logDebugFacebook>0</>
|
|
<logDebugTagdbMessages>0</>
|
|
<logDebugTcpMessages>0</>
|
|
<logDebugThreadMessages>0</>
|
|
<logDebugTitleMessages>0</>
|
|
<logDebugTimedbMessages>0</>
|
|
<logDebugTopicMessages>0</>
|
|
<logDebugTopDocMessages>0</>
|
|
<logDebugUdpMessages>0</>
|
|
<logDebugUnicodeMessages>0</>
|
|
<logDebugRepairMessages>0</>
|
|
<logDebugPubDateExtractionMessages>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForBuild>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForAdmin>0</>
|
|
<logTimingMessagesForDatabase>0</>
|
|
<logTimingMessagesForNetworkLayer>0</>
|
|
<logTimingMessagesForQuery>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForSpcache>0</>
|
|
<logTimingMessagesForRelatedTopics>0</>
|
|
|
|
# Log reminders to the programmer. You do not need this.
|
|
<logReminderMessages>0</>
|
|
|
|
# If enabled, gigablast will repair the rdbs as specified by the parameters
|
|
# below. When a particular collection is in repair mode, it can not spider or
|
|
# merge titledb files.
|
|
<repairModeEnabled>0</>
|
|
|
|
# Comma or space separated list of the collections to repair or rebuild.
|
|
<collectionsToRepairOrRebuild><![CDATA[main]]></>
|
|
|
|
# In bytes.
|
|
<memoryToUseForRepair>300000000</>
|
|
|
|
# Maximum number of outstanding inject spiders for repair.
|
|
<maxRepairSpiders>32</>
|
|
|
|
# If enabled, gigablast will reinject the content of all title recs into a
|
|
# secondary rdb system. That will replace the primary rdb system when complete.
|
|
<fullRebuild>0</>
|
|
|
|
# If enabled, gigablast will keep the new spiderdb records when doing the full
|
|
# rebuild or the spiderdb rebuild.
|
|
<keepNewSpiderdbRecs>1</>
|
|
|
|
# If enabled, gigablast will recycle the link info when rebuilding titledb.
|
|
<recycleLinkInfo>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildTitledb>1</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildPosdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildClusterdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildSpiderdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildLinkdb>0</>
|
|
|
|
# If disabled, gigablast will skip root urls.
|
|
<rebuildRootUrls>1</>
|
|
|
|
# If disabled, gigablast will skip non-root urls.
|
|
<rebuildNonrootUrls>1</>
|
|
|
|
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
|
|
# tagdb lookup be performed? Runs much much faster without it. Will also keep
|
|
# the original doc quality and spider priority intact.
|
|
<skipTagdbLookup>0</>
|