mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
edbd61b0c5
keep thread queue and just return. will try to relaunch later. do not count delete keys towards shard rebalance count.
376 lines
12 KiB
Plaintext
376 lines
12 KiB
Plaintext
# All <, >, " and # characters that are values for a field contained herein
|
|
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
|
|
|
|
# Mem available to this process. May be exceeded due to fragmentation.
|
|
<maxMem>4000000000</>
|
|
|
|
# Below the various Gigablast databases are configured.
|
|
# <*dbMaxTreeMem> - mem used for holding new recs
|
|
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
|
|
# <*dbMaxCacheMem> - cache mem for holding single recs
|
|
# <*dbSaveCache> - save the rec cache on exit?
|
|
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
|
|
# See the Stats page for record counts and stats.
|
|
|
|
# How many bytes should be used for caching DNS replies?
|
|
<dnsMaxCacheMem>128000</>
|
|
|
|
# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
|
|
# about 100 bytes or so.
|
|
<tagdbMaxTreeMem>1028000</>
|
|
<tagdbMaxPageCacheMem>200000</>
|
|
|
|
# A catdb record assigns a url or site to DMOZ categories. Each catdb record
|
|
# is about 100 bytes.
|
|
<catdbMaxTreeMem>1000000</>
|
|
<catdbMaxPageCacheMem>25000000</>
|
|
<catdbMaxCacheMem>0</>
|
|
|
|
# Clusterdb caches small records for site clustering and deduping.
|
|
<clusterdbMaxTreeMem>1000000</>
|
|
<clusterdbSaveCache>0</>
|
|
|
|
# Max memory for dup vector cache.
|
|
<maxVectorCacheMem>10000000</>
|
|
|
|
# Robotdb caches robots.txt files.
|
|
<robotdbMaxCacheMem>128000</>
|
|
<robotdbSaveCache>0</>
|
|
<linkdbMaxPageCacheMem>0</>
|
|
<statsdbMaxTreeMem>5000000</>
|
|
<statsdbMaxCacheMem>0</>
|
|
<statsdbMaxDiskPageCacheMem>1000000</>
|
|
|
|
# Maximum bytes of a doc that can be sent before having to read more from disk
|
|
<httpMaxSendBufSize>128000</>
|
|
|
|
# Bytes to use for caching search result pages.
|
|
<searchResultsMaxCacheMem>100000</>
|
|
|
|
# Read only mode does not allow spidering.
|
|
<readOnlyMode>0</>
|
|
|
|
# Controls all spidering for all collections
|
|
<spideringEnabled>0</>
|
|
|
|
# What is the maximum number of web pages the spider is allowed to download
|
|
# simultaneously for ALL collections PER HOST?
|
|
<maxTotalSpiders>100</>
|
|
|
|
# Can people use the add url interface to add urls to the index?
|
|
<addUrlEnabled>1</>
|
|
|
|
# Save data in memory to disk after this many minutes have passed without the
|
|
# data having been dumped or saved to disk. Use 0 to disable.
|
|
<autoSaveFrequency>5</>
|
|
|
|
# Maximum sockets available to serve incoming HTTP requests. Too many
|
|
# outstanding requests will increase query latency. Excess requests will
|
|
# simply have their sockets closed.
|
|
<maxHttpSockets>100</>
|
|
|
|
# Maximum sockets available to serve incoming HTTPS requests. Like max http
|
|
# sockets, but for secure sockets.
|
|
<maxHttpsSockets>100</>
|
|
|
|
# Identification seen by web servers when the Gigablast spider downloads their
|
|
# web pages. It is polite to insert a contact email address here so webmasters
|
|
# that experience problems from the Gigablast spider have somewhere to vent.
|
|
<spiderUserAgent><![CDATA[GigablastOpenSource/1.0]]></>
|
|
|
|
# If this is true, gb will send Accept-Encoding: gzip to web servers when
|
|
# doing http downloads.
|
|
<askForGzippedDocsWhenDownloading>0</>
|
|
|
|
# How many seconds should we cache a search results page for?
|
|
<searchResultsCacheMaxAge>10800</>
|
|
|
|
# Keep track of ips which do queries, disallow non-customers from hitting us
|
|
# too hard.
|
|
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>
|
|
|
|
# If a call to a message callback or message handler in the udp server takes
|
|
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
|
|
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
|
|
# the logging.
|
|
<maxDelayBeforeLoggingACallbackOrHandler>-1</>
|
|
|
|
# Sends emails to admin if a host goes down.
|
|
<sendEmailAlerts>0</>
|
|
|
|
# Do not send email alerts about dead hosts to anyone except
|
|
# sysadmin@gigablast.com between the times given below unless all the twins of
|
|
# the dead host are also dead. Instead, wait until afterwards and alert only if the host is still
|
|
# dead.
|
|
<delayNonCriticalEmailAlerts>0</>
|
|
|
|
# Email alerts will include the cluster name
|
|
<clusterName><![CDATA[unspecified]]></>
|
|
|
|
# Send an email after a host has not responded to successive pings for this
|
|
# many milliseconds.
|
|
<sendEmailTimeout>62000</>
|
|
|
|
# Send email alerts when query success rate goes below this threshold.
|
|
# (success rate expressed as a fraction between 0.0 and 1.0)
|
|
<querySuccessRateThreshold>0.850000</>
|
|
|
|
# Send email alerts when average query latency goes above this threshold. (in
|
|
# seconds)
|
|
<averageQueryLatencyThreshold>2.000000</>
|
|
|
|
# Record this number of query times before calculating average query latency.
|
|
<numberOfQueryTimesInAverage>300</>
|
|
|
|
# At what temperature in Celsius should we send an email alert if a hard drive
|
|
# reaches it?
|
|
<maxHardDriveTemperature>45</>
|
|
|
|
# Look for this string in the kernel buffer for sending email alert. Useful
|
|
# for detecting some strange hard drive failures that really slow performance.
|
|
<errorString1><![CDATA[]]></>
|
|
|
|
# Look for this string in the kernel buffer for sending email alert. Useful
|
|
# for detecting some strange hard drive failures that really slow performance.
|
|
<errorString2><![CDATA[]]></>
|
|
|
|
# Look for this string in the kernel buffer for sending email alert. Useful
|
|
# for detecting some strange hard drive failures that really slow performance.
|
|
<errorString3><![CDATA[]]></>
|
|
|
|
# Sends to email address 1 through email server 1.
|
|
<sendEmailAlertsToEmail1>0</>
|
|
|
|
# Sends to email address 1 through email server 1 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail1>0</>
|
|
|
|
# Connects to this server directly when sending email 1
|
|
<emailServer1><![CDATA[10.5.54.47]]></>
|
|
|
|
# Sends to this address when sending email 1
|
|
<emailAddress1><![CDATA[4081234567@vtext.com]]></>
|
|
|
|
# The from field when sending email 1
|
|
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 2 through email server 2.
|
|
<sendEmailAlertsToEmail2>0</>
|
|
|
|
# Sends to email address 2 through email server 2 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail2>0</>
|
|
|
|
# Connects to this server directly when sending email 2
|
|
<emailServer2><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 2
|
|
<emailAddress2><![CDATA[]]></>
|
|
|
|
# The from field when sending email 2
|
|
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 3 through email server 3.
|
|
<sendEmailAlertsToEmail3>0</>
|
|
|
|
# Sends to email address 3 through email server 3 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail3>0</>
|
|
|
|
# Connects to this server directly when sending email 3
|
|
<emailServer3><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 3
|
|
<emailAddress3><![CDATA[]]></>
|
|
|
|
# The from field when sending email 3
|
|
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# IP address of the primary DNS server. Assumes UDP port 53. REQUIRED FOR
|
|
# SPIDERING! Use Google's public DNS 8.8.8.8 as default.
|
|
<dns0>8.8.8.8</>
|
|
|
|
# IP address of the secondary DNS server. Assumes UDP port 53. Will be
|
|
# accessed in conjunction with the primary dns, so make sure this is always
|
|
# up. An ip of 0 means disabled. Google's secondary public DNS is 8.8.4.4.
|
|
<dns1>8.8.4.4</>
|
|
|
|
# All hosts send to these DNSes based on hash of the subdomain to try to split
|
|
# DNS load evenly.
|
|
<dns2>0.0.0.0</>
|
|
<dns3>0.0.0.0</>
|
|
<dns4>0.0.0.0</>
|
|
<dns5>0.0.0.0</>
|
|
<dns6>0.0.0.0</>
|
|
<dns7>0.0.0.0</>
|
|
<dns8>0.0.0.0</>
|
|
<dns9>0.0.0.0</>
|
|
<dns10>0.0.0.0</>
|
|
<dns11>0.0.0.0</>
|
|
<dns12>0.0.0.0</>
|
|
<dns13>0.0.0.0</>
|
|
<dns14>0.0.0.0</>
|
|
<dns15>0.0.0.0</>
|
|
|
|
# Add IPs here to bar them from accessing this Gigablast server.
|
|
<banIps><![CDATA[]]></>
|
|
|
|
# Add IPs here to give them an infinite query quota.
|
|
<allowIps><![CDATA[]]></>
|
|
|
|
# Don't try to autoban queries that have one of these codes. Also, the code
|
|
# must be valid for us to use &uip=IPADDRESS as the IP address of the
|
|
# submitter for purposes of autoban AND purposes of addurl daily quotas.
|
|
<validCodes><![CDATA[]]></>
|
|
|
|
# Append extra default parms to queries that match certain substrings.
|
|
# Format: text to match in url, followed by a space, then the list of extra
|
|
# parms as they would appear appended to the url. One match per line.
|
|
<extraParms><![CDATA[]]></>
|
|
|
|
# Ban any query that matches this list of substrings. Must match all
|
|
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
|
|
<banRegex><![CDATA[]]></>
|
|
|
|
# Any matching password will have administrative access to Gigablast and all
|
|
# collections.
|
|
# Use <masterPassword> tag.
|
|
|
|
# Any IPs in this list will have administrative access to Gigablast and all
|
|
# collections.
|
|
# Use <masterIp> tag.
|
|
|
|
# Log GET and POST requests received from the http server?
|
|
<logHttpRequests>1</>
|
|
|
|
# Should we log queries that are autobanned? They can really fill up the log.
|
|
<logAutobannedQueries>1</>
|
|
|
|
# If a query took this many milliseconds or longer, then log the query and the
|
|
# time it took to process.
|
|
<logQueryTimeThreshold>5000</>
|
|
|
|
# Log query reply in proxy, but only for those queries above the time
|
|
# threshold above.
|
|
<logQueryReply>0</>
|
|
|
|
# Log status of spidered or injected urls?
|
|
<logSpideredUrls>1</>
|
|
|
|
# Log messages if Gigablast runs out of udp sockets?
|
|
<logNetworkCongestion>0</>
|
|
|
|
# Log messages not related to an error condition, but meant more to give an
|
|
# idea of the state of the gigablast process. These can be useful when
|
|
# diagnosing problems.
|
|
<logInformationalMessages>1</>
|
|
|
|
# Log it when a document is not added due to a quota breach. Log it when a url is too
|
|
# long and it gets truncated.
|
|
<logLimitBreeches>0</>
|
|
|
|
# Log various debug messages.
|
|
<logDebugAdminMessages>0</>
|
|
<logDebugBuildMessages>0</>
|
|
<logDebugBuildTimeMessages>0</>
|
|
<logDebugDatabaseMessages>0</>
|
|
<logDebugDirtyMessages>0</>
|
|
<logDebugDiskMessages>0</>
|
|
<logDebugDnsMessages>0</>
|
|
<logDebugHttpMessages>0</>
|
|
<logDebugLoopMessages>0</>
|
|
<logDebugLanguageDetectionMessages>0</>
|
|
<logDebugLinkInfo>0</>
|
|
<logDebugMemMessages>0</>
|
|
<logDebugMemUsageMessages>0</>
|
|
<logDebugNetMessages>0</>
|
|
<logDebugQueryMessages>0</>
|
|
<logDebugQuotaMessages>0</>
|
|
<logDebugRobotsMessages>0</>
|
|
<logDebugSpiderCacheMessages>0</>
|
|
<logDebugSpellerMessages>0</>
|
|
<logDebugSectionsMessages>0</>
|
|
<logDebugSeoInsertMessages>0</>
|
|
<logDebugSeoMessages>0</>
|
|
<logDebugStatsMessages>0</>
|
|
<logDebugSummaryMessages>0</>
|
|
<logDebugSpiderMessages>0</>
|
|
<logDebugUrlAttempts>0</>
|
|
<logDebugSpiderDownloads>0</>
|
|
<logDebugFacebook>0</>
|
|
<logDebugTagdbMessages>0</>
|
|
<logDebugTcpMessages>0</>
|
|
<logDebugThreadMessages>0</>
|
|
<logDebugTitleMessages>0</>
|
|
<logDebugTimedbMessages>0</>
|
|
<logDebugTopicMessages>0</>
|
|
<logDebugTopDocMessages>0</>
|
|
<logDebugUdpMessages>0</>
|
|
<logDebugUnicodeMessages>0</>
|
|
<logDebugRepairMessages>0</>
|
|
<logDebugPubDateExtractionMessages>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForBuild>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForAdmin>0</>
|
|
<logTimingMessagesForDatabase>0</>
|
|
<logTimingMessagesForNetworkLayer>0</>
|
|
<logTimingMessagesForQuery>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForSpcache>0</>
|
|
<logTimingMessagesForRelatedTopics>0</>
|
|
|
|
# Log reminders to the programmer. You do not need this.
|
|
<logReminderMessages>0</>
|
|
|
|
# If enabled, gigablast will repair the rdbs as specified by the parameters
|
|
# below. When a particular collection is in repair mode, it can not spider or
|
|
# merge titledb files.
|
|
<repairModeEnabled>0</>
|
|
|
|
# Comma or space separated list of the collections to repair or rebuild.
|
|
<collectionsToRepairOrRebuild><![CDATA[main]]></>
|
|
|
|
# In bytes.
|
|
<memoryToUseForRepair>300000000</>
|
|
|
|
# Maximum number of outstanding inject spiders for repair.
|
|
<maxRepairSpiders>32</>
|
|
|
|
# If enabled, gigablast will reinject the content of all title recs into a
|
|
# secondary rdb system. That will replace the primary rdb system when complete.
|
|
<fullRebuild>0</>
|
|
|
|
# If enabled, gigablast will keep the new spiderdb records when doing the full
|
|
# rebuild or the spiderdb rebuild.
|
|
<keepNewSpiderdbRecs>1</>
|
|
|
|
# If enabled, gigablast will recycle the link info when rebuilding titledb.
|
|
<recycleLinkInfo>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildTitledb>1</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildPosdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildClusterdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildSpiderdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildLinkdb>0</>
|
|
|
|
# If disabled, gigablast will skip root urls.
|
|
<rebuildRootUrls>1</>
|
|
|
|
# If disabled, gigablast will skip non-root urls.
|
|
<rebuildNonrootUrls>1</>
|
|
|
|
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
|
|
# tagdb lookup be performed? Runs much much faster without it. Will also keep
|
|
# the original doc quality and spider priority intact.
|
|
<skipTagdbLookup>0</>
|