mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
b90ef3de0d
use msg12 to remove rec from doledb/doleiptable and add 0 entry to waiting table so doledb is again immediately repopulated with that firstIp so we can spider multiple urls from the same ip at the same time.
680 lines
23 KiB
Plaintext
680 lines
23 KiB
Plaintext
# All <, >, " and # characters that are values for a field contained herein
|
|
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
|
|
|
|
# Mem available to this process. May be exceeded due to fragmentation.
|
|
<maxMem>4000000000</>
|
|
|
|
# Below the various Gigablast databases are configured.
|
|
# <*dbMaxTreeMem> - mem used for holding new recs
|
|
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
|
|
# <*dbMaxCacheMem> - cache mem for holding single recs
|
|
# <*dbSaveCache> - save the rec cache on exit?
|
|
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
|
|
# See the Stats page for record counts and stats.
|
|
|
|
# How many bytes should be used for caching DNS replies?
|
|
<dnsMaxCacheMem>128000</>
|
|
|
|
# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
|
|
# about 100 bytes or so.
|
|
<tagdbMaxTreeMem>1028000</>
|
|
<tagdbMaxPageCacheMem>200000</>
|
|
|
|
# A catdb record assigns a url or site to DMOZ categories. Each catdb record
|
|
# is about 100 bytes.
|
|
<catdbMaxTreeMem>1000000</>
|
|
<catdbMaxPageCacheMem>25000000</>
|
|
<catdbMaxCacheMem>0</>
|
|
|
|
# Clusterdb caches small records for site clustering and deduping.
|
|
<clusterdbMaxTreeMem>1000000</>
|
|
<clusterdbSaveCache>0</>
|
|
|
|
# Max memory for dup vector cache.
|
|
<maxVectorCacheMem>10000000</>
|
|
|
|
# Robotdb caches robots.txt files.
|
|
<robotdbMaxCacheMem>128000</>
|
|
<robotdbSaveCache>0</>
|
|
<linkdbMaxPageCacheMem>0</>
|
|
<statsdbMaxTreeMem>5000000</>
|
|
<statsdbMaxCacheMem>0</>
|
|
<statsdbMaxDiskPageCacheMem>1000000</>
|
|
|
|
# Maximum bytes of a doc that can be sent before having to read more from disk
|
|
<httpMaxSendBufSize>128000</>
|
|
|
|
# Bytes to use for caching search result pages.
|
|
<searchResultsMaxCacheMem>100000</>
|
|
|
|
# Read only mode does not allow spidering.
|
|
<readOnlyMode>0</>
|
|
|
|
# Spell check using the dictionary.
|
|
<doSpellChecking>0</>
|
|
|
|
# give narrow search suggestions.
|
|
<doNarrowSearch>0</>
|
|
|
|
# Overrides all spidering for all collections on just this host.
|
|
<localSpideringEnabled>1</>
|
|
|
|
# Overrides all add urls for all collections on just this host.
|
|
<localAddUrlEnabled>1</>
|
|
|
|
# Used by proxy to point to a temporary cluster while the original cluster is
|
|
# updated with a new binary. The temporary cluster is the same as the original
|
|
# cluster but the ports are all incremented by one from what is in the
|
|
# hosts.conf. This should ONLY be used for the proxy.
|
|
<useTemporaryCluster>0</>
|
|
|
|
# If enabled gb does the search queries in ./test-search/queries.txt and
|
|
# compares to the last run and outputs the diffs for inspection and validation.
|
|
<qaSearchTestEnabled>1</>
|
|
|
|
# Enable spidering on all hosts
|
|
<allSpidersOn>0</>
|
|
|
|
# Disable spidering on all hosts
|
|
<allSpidersOff>0</>
|
|
|
|
# Serves ads unless pure=1 is in cgi parms.
|
|
<adFeedEnabled>0</>
|
|
|
|
# Stripe #n contains twin #n from each group. Doing stripe balancing helps
|
|
# prevent too many query requests coming into one host. This parm is only for
|
|
# the proxy. Stripe balancing is done by default unless the parm is disabled
|
|
# on the proxy in which case it appends a &dsb=0 to the query url it sends to
|
|
# the host. The proxy alternates to which host it forwards the incoming query
|
|
# based on the stripe. It takes the number of query terms in the query into
|
|
# account to make a more even balance.
|
|
<doStripeBalancing>1</>
|
|
|
|
# Is this cluster part of a live production cluster? If this is true we make
|
|
# sure that elvtune is being set properly for best performance, otherwise, gb
|
|
# will not startup.
|
|
<isLiveCluster>0</>
|
|
|
|
# Is this cluster just used for indexing wikipedia pages?
|
|
<isWikipediaCluster>0</>
|
|
|
|
# At what temperature in Celsius should we send an email alert if a hard drive
|
|
# reaches it?
|
|
<maxHardDriveTemperature>45</>
|
|
|
|
# If a heartbeat is delayed this many milliseconds dump a core so we can see
|
|
# where the CPU was. Logs 'db: missed heartbeat by %lli ms'. Use 0 or less to
|
|
# disable.
|
|
<maxHeartbeatDelayInMilliseconds>0</>
|
|
|
|
# If a call to a message callback or message handler in the udp server takes
|
|
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
|
|
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
|
|
# the logging.
|
|
<maxDelayBeforeLoggingACallbackOrHandler>-1</>
|
|
|
|
# Sends emails to admin if a host goes down.
|
|
<sendEmailAlerts>0</>
|
|
|
|
# Sends to sysadmin@gigablast.com.
|
|
<sendEmailAlertsToSysadmin>0</>
|
|
|
|
# Sends to email address 1 through email server 1.
|
|
<sendEmailAlertsToEmail1>1</>
|
|
|
|
# Sends to email address 1 through email server 1 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail1>1</>
|
|
|
|
# Connects to this server directly when sending email 1
|
|
<emailServer1><![CDATA[10.5.54.47]]></>
|
|
|
|
# Sends to this address when sending email 1
|
|
<emailAddress1><![CDATA[5051234567@vtext.com]]></>
|
|
|
|
# The from field when sending email 1
|
|
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 2 through email server 2.
|
|
<sendEmailAlertsToEmail2>0</>
|
|
|
|
# Sends to email address 2 through email server 2 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail2>1</>
|
|
|
|
# Connects to this server directly when sending email 2
|
|
<emailServer2><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 2
|
|
<emailAddress2><![CDATA[]]></>
|
|
|
|
# The from field when sending email 2
|
|
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 3 through email server 3.
|
|
<sendEmailAlertsToEmail3>0</>
|
|
|
|
# Sends to email address 3 through email server 3 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail3>1</>
|
|
|
|
# Connects to this server directly when sending email 3
|
|
<emailServer3><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 3
|
|
<emailAddress3><![CDATA[]]></>
|
|
|
|
# The from field when sending email 3
|
|
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 4 through email server 4.
|
|
<sendEmailAlertsToEmail4>0</>
|
|
|
|
# Sends to email address 4 through email server 4 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail4>1</>
|
|
|
|
# Connects to this server directly when sending email 4
|
|
<emailServer4><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 4
|
|
<emailAddress4><![CDATA[]]></>
|
|
|
|
# The from field when sending email 4
|
|
<fromEmailAddress4><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Do not send email alerts about dead hosts to anyone except
|
|
# sysadmin@gigablast.com between the times given below unless all the twins of
|
|
# the dead host are also dead. Instead, wait till after if the host is still
|
|
# dead.
|
|
<delayNonCriticalEmailAlerts>0</>
|
|
|
|
# Look for this string in the kernel buffer for sending email
|
|
<errorString1><![CDATA[]]></>
|
|
|
|
# Look for this string in the kernel buffer for sending email
|
|
<errorString2><![CDATA[]]></>
|
|
|
|
# Look for this string in the kernel buffer for sending email
|
|
<errorString3><![CDATA[]]></>
|
|
|
|
# If you have scsi drives or a slow network, say yes here to minimize data
|
|
# fetches across the network.
|
|
<preferLocalReads>0</>
|
|
|
|
# If enabled then all writes will be flushed to disk. This is generally a good
|
|
# thing.
|
|
<doSynchronousWrites>1</>
|
|
|
|
# Read what was written in a verification step. Decreases performance, but may
|
|
# help fight disk corruption mostly on Maxtors and Western Digitals.
|
|
<verifyDiskWrites>0</>
|
|
|
|
# When reindexing a document, do not re-add data that should already be in
|
|
# index or clusterdb since the last time the document was indexed. Otherwise,
|
|
# re-add the data regardless.
|
|
<doIncrementalUpdating>0</>
|
|
|
|
# Use /etc/hosts file to resolve hostnames? The /etc/hosts file is reloaded
|
|
# every minute, so if you make a change to it you might have to wait one
|
|
# minute for the change to take effect.
|
|
<useEtcHosts>0</>
|
|
|
|
# If enabled, Gigablast assumes the first half of machines in hosts.conf are
|
|
# on a different network switch than the second half, and minimizes transmits
|
|
# between the switches.
|
|
<twinsAreSplit>0</>
|
|
|
|
# When enabled Gigablast will randomly fail at allocating memory. Used for
|
|
# testing stability.
|
|
<doOutOfMemoryTesting>0</>
|
|
|
|
# When enabled Gigablast will make sure it reparses the document exactly the
|
|
# same way. It does this every 1000th document anyway, but enabling this makes
|
|
# it do it for every document.
|
|
<doConsistencyTesting>0</>
|
|
|
|
# If enabled, all servers must have two gigabit ethernet ports hooked up and
|
|
# Gigablast will round robin packets between both ethernet ports when sending
|
|
# to another host. Can speed up network transmissions as much as 2x.
|
|
<useShotgun>0</>
|
|
|
|
# If enabled, Gigablast will use quickpoll. Significantly improves
|
|
# performance. Only turn this off for testing.
|
|
<useQuickpoll>1</>
|
|
|
|
# If enabled, Gigablast will use threads.
|
|
<useThreads>1</>
|
|
|
|
# If enabled, Gigablast will use shared memory. Should really only be used on
|
|
# the live cluster, keep this on the testing cluster since it can leak easily.
|
|
<useSharedMem>0</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForPosdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForDatedb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForTitledb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForSpiderdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForTagdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForChecksumdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForClusterdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForCatdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForLinkdb>1</>
|
|
|
|
# Scan all titledb files if rec not found. You should keep this on to avoid
|
|
# corruption. Do not turn it off unless you are Matt Wells.
|
|
<scanAllIfNotFound>1</>
|
|
|
|
# For specifying if this is an interface machine. Messages are rerouted from
|
|
# this machine to the maincluster set in the hosts.conf.
|
|
<interfaceMachine>0</>
|
|
|
|
# At query time, should Gigablast generate content vectors for title records
|
|
# lacking them? This is an expensive operation, so is really just for testing
|
|
# purposes.
|
|
<generateVectorAtQueryTime>0</>
|
|
|
|
# Keep track of ips which do queries, disallow non-customers from hitting us
|
|
# too hard.
|
|
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>
|
|
|
|
# Non-customers get this many queries per day before being autobanned
|
|
<freeQueriesPerDay>1024</>
|
|
|
|
# Non-customers get this many queries per minute before being autobanned
|
|
<freeQueriesPerMinute>30</>
|
|
|
|
# If this is non empty, http traffic will be redirected to the specified
|
|
# address.
|
|
<redirectNonrawTraffic><![CDATA[]]></>
|
|
|
|
# If this is true, gb will route download requests for web pages to proxies in
|
|
# hosts.conf. Proxies will download and compress docs before sending back.
|
|
<sendRequestsToCompressionProxy>0</>
|
|
|
|
# Enable/disable the ability to synchronize time between the cluster and the
|
|
# proxy
|
|
<synchronizeProxyToClusterTime>0</>
|
|
|
|
# Allows scaling up of hosts by deleting recs not in the correct group. This
|
|
# should only happen when copying a set of servers to the new hosts. Otherwise
|
|
# corrupted data will cause a halt.
|
|
<allowScalingOfHosts>0</>
|
|
|
|
# Allows bypass of db validation so gigablast will not halt if a corrupt db is
|
|
# discovered during load. Use this when attempting to load with a collection
|
|
# that has known corruption.
|
|
<allowBypassOfDbValidation>0</>
|
|
|
|
# IP address of the primary DNS server. Assumes UDP port 53.
|
|
<dns0>8.8.8.8</>
|
|
|
|
# IP address of the secondary DNS server. Assumes UDP port 53. Will be
|
|
# accessed in conjunction with the primary dns, so make sure this is always
|
|
# up. An ip of 0 means disabled.
|
|
<dns1>8.8.4.4</>
|
|
<dns2>0.0.0.0</>
|
|
<dns3>0.0.0.0</>
|
|
<dns4>0.0.0.0</>
|
|
<dns5>0.0.0.0</>
|
|
<dns6>0.0.0.0</>
|
|
<dns7>0.0.0.0</>
|
|
<dns8>0.0.0.0</>
|
|
<dns9>0.0.0.0</>
|
|
<dns10>0.0.0.0</>
|
|
<dns11>0.0.0.0</>
|
|
<dns12>0.0.0.0</>
|
|
<dns13>0.0.0.0</>
|
|
<dns14>0.0.0.0</>
|
|
<dns15>0.0.0.0</>
|
|
<geocoderIP1>10.5.66.11</>
|
|
<geocoderIP2>0.0.0.0</>
|
|
<geocoderIP3>0.0.0.0</>
|
|
<geocoderIP4>0.0.0.0</>
|
|
|
|
# Access the wiki coll through this proxy ip
|
|
<wikiProxyIp>0.0.0.0</>
|
|
|
|
# Access the wiki coll through this proxy port
|
|
<wikiProxyPort>0</>
|
|
|
|
# Email alerts will include the cluster name
|
|
<clusterName><![CDATA[unspecified]]></>
|
|
|
|
# Identification seen by web servers when the Gigablast spider downloads their
|
|
# web pages. It is polite to insert a contact email address here so webmasters
|
|
# that experience problems from the Gigablast spider have somewhere to vent.
|
|
<spiderUserAgent><![CDATA[GigablastOpenSource/1]]></>
|
|
|
|
# If this is true, gb will send accept-encoding: gzip when doing http downloads.
|
|
<askForGzippedDocsWhenDownloading>0</>
|
|
|
|
# When no collection is explicitly specified, assume this collection name.
|
|
<defaultCollection><![CDATA[main]]></>
|
|
|
|
# Collection to be used for directory searching and display of directory topic
|
|
# pages.
|
|
<directoryCollection><![CDATA[]]></>
|
|
|
|
# Hostname of the server providing the directory. Leave empty to use this host.
|
|
<directoryHostname><![CDATA[]]></>
|
|
|
|
# Total incoming bandwidth used by all spiders should not exceed this many
|
|
# kilobits per second.
|
|
<maxIncomingBandwidthForSpider>999999.000</>
|
|
|
|
# Spiders will shed load when their host exceeds this value for the 1-minute
|
|
# load average in /proc/loadavg. The value 0.0 disables this feature.
|
|
<max1minuteSlidingwindowLoadavg>0.000</>
|
|
|
|
# Maximum number of threads to use per Gigablast process for intersecting
|
|
# docid lists. Generally, set this to the number of CPUs on the machine.
|
|
<maxCpuThreads>1</>
|
|
|
|
# Maximum number of pages to index or delete from index per second for all
|
|
# hosts combined.
|
|
<maxPagesPerSecond>999999.000</>
|
|
|
|
# Consider a host in the Gigablast network to be dead if it does not respond
|
|
# to successive pings for this number of seconds. Gigablast does not send
|
|
# requests to dead hosts. Outstanding requests may be re-routed to a twin.
|
|
<deadHostTimeout>4000</>
|
|
|
|
# Send an email after a host has not responded to successive pings for this
|
|
# many milliseconds.
|
|
<sendEmailTimeout>62000</>
|
|
|
|
# Wait this many milliseconds before pinging the next host. Each host pings
|
|
# all other hosts in the network.
|
|
<pingSpacer>100</>
|
|
|
|
# Send email alerts when average query latency goes above this threshold.
|
|
<averageQueryLatencyThreshold>2.000</>
|
|
|
|
# Send email alerts when query success rate goes below this threshold.
|
|
<querySuccessRateThreshold>0.850</>
|
|
|
|
# Record this number of query times before calculating average query latency.
|
|
<numberOfQueryTimesInAverage>3000</>
|
|
|
|
# If we reach this many corrupt index lists, send an admin email. Set to -1
|
|
# to disable.
|
|
<maxCorruptIndexLists>5</>
|
|
|
|
# Maximum number of threads to use per Gigablast process for writing data to
|
|
# the disk. Keep low to reduce file interlace effects and impact on query
|
|
# response time.
|
|
<maxWriteThreads>1</>
|
|
|
|
# Maximum number of threads to use per Gigablast process for accessing the
|
|
# disk for index-building purposes. Keep low to reduce impact on query
|
|
# response time. Increase for RAID systems or when initially building an index.
|
|
<maxSpiderReadThreads>7</>
|
|
|
|
# This particular number applies to all reads above 1MB.
|
|
<maxSpiderBigReadThreads>3</>
|
|
|
|
# This particular number applies to all reads above 100K.
|
|
<maxSpiderMediumReadThreads>4</>
|
|
|
|
# This particular number applies to all reads of 100K or less.
|
|
<maxSpiderSmallReadThreads>5</>
|
|
|
|
# Maximum number of threads to use per Gigablast process for accessing the
|
|
# disk for querying purposes. IDE systems tend to be more responsive when this
|
|
# is low. Increase for SCSI or RAID systems.
|
|
<maxQueryReadThreads>20</>
|
|
|
|
# This particular number applies to all reads above 1MB.
|
|
<maxQueryBigReadThreads>20</>
|
|
|
|
# This particular number applies to all reads above 100K.
|
|
<maxQueryMediumReadThreads>20</>
|
|
|
|
# This particular number applies to all reads of 100K or less.
|
|
<maxQuerySmallReadThreads>20</>
|
|
|
|
# Word or phrase must be present in this percent of documents in order to
|
|
# qualify as a spelling recommendation.
|
|
<minPopularityForSpeller>0.010</>
|
|
|
|
# Percent to weight phrases in queries.
|
|
<phraseWeight>100.000</>
|
|
|
|
# Percent of how much to use words to phrase ratio weights.
|
|
<weightscppSliderParmtmp>90</>
|
|
|
|
# When passing queries around the network, send the raw string instead of the
|
|
# serialized query if the required buffer is bigger than this. Smaller values
|
|
# decrease network traffic for large queries at the expense of processing time.
|
|
<maximumSerializedQuerySize>8192</>
|
|
|
|
# Read and write this many bytes at a time when merging files. Smaller values
|
|
# are kinder to query performance, but the merge takes longer. Use at least
|
|
# 1000000 for fast merging.
|
|
<mergeBufSize>800000</>
|
|
|
|
# minRecSizes for Catdb lookups
|
|
<catdbMinRecSizes>100000000</>
|
|
|
|
# Maximum sockets available to serve incoming HTTP requests. Too many
|
|
# outstanding requests will increase query latency. Excess requests will
|
|
# simply have their sockets closed.
|
|
<maxHttpSockets>100</>
|
|
|
|
# Maximum sockets available to serve incoming HTTPS requests. Like max http
|
|
# sockets, but for secure sockets.
|
|
<maxHttpsSockets>100</>
|
|
|
|
# Copy data in memory to disk after this many minutes have passed without the
|
|
# data having been dumped or saved to disk. Use 0 to disable.
|
|
<autoSaveFrequency>5</>
|
|
|
|
# Add this number to the total document count in the index. Just used for
|
|
# displaying on the homepage.
|
|
<docCountAdjustment>0</>
|
|
|
|
# Generates profiling data for callbacks on page performance
|
|
<dynamicPerformanceGraph>0</>
|
|
|
|
# Enable profiler to do accounting of time taken by functions.
|
|
<enableProfiling>1</>
|
|
|
|
# Profiler will not show functions which take less than this many milliseconds
|
|
# in the log or on the performance graph.
|
|
<minimumProfilingThreshold>10</>
|
|
|
|
# Produce a LOG_TIMING log message for each callback called, along with the
|
|
# time it took. Profiler must be enabled.
|
|
<sequentialProfiling>0</>
|
|
|
|
# Archive system statistics information in Statsdb.
|
|
<useStatsdb>1</>
|
|
|
|
# How many seconds should we cache a search results page for?
|
|
<searchResultsCacheMaxAge>10800</>
|
|
|
|
# Add IPs here to bar them from accessing this gigablast server.
|
|
<banIps><![CDATA[]]></>
|
|
|
|
# Add IPs here to give them an infinite query quota.
|
|
<allowIps><![CDATA[]]></>
|
|
|
|
# Don't try to autoban queries that have one of these codes. Also, the code
|
|
# must be valid for us to use &uip=IPADDRESS as the IP address of the
|
|
# submitter for purposes of autoban AND purposes of addurl daily quotas.
|
|
<validCodes><![CDATA[]]></>
|
|
|
|
# Append extra default parms to queries that match certain substrings.
|
|
# Format: text to match in url, followed by a space, then the list of extra
|
|
# parms as they would appear appended to the url. One match per line.
|
|
<extraParms><![CDATA[]]></>
|
|
|
|
# ban any query that matches this list of substrings. Must match all
|
|
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
|
|
<banRegex><![CDATA[]]></>
|
|
|
|
# Add facebook user IDs here so those people can turk the results. Later we
|
|
# may limit each person to turking a geographic region.
|
|
<supterturks><![CDATA[]]></>
|
|
|
|
# Allow UDP requests from this list of IPs. Any datagram received not coming
|
|
# from one of these IPs, or an IP in hosts.conf, is dropped. If another
|
|
# cluster is accessing this cluster for getting link text or whatever, you
|
|
# will need to list the IPs of the accessing machines here. These IPs are also
|
|
# used to allow access to the HTTP server even if it was disabled in the
|
|
# Master Controls. IPs that have 0 as their least significant byte are
|
|
# treated as wildcards for IP blocks. That is, 1.2.3.0 means 1.2.3.*.
|
|
<connectIp>10.5.0.3</>
|
|
|
|
# Log GET and POST requests received from the http server?
|
|
<logHttpRequests>1</>
|
|
|
|
# Should we log queries that are autobanned? They can really fill up the log.
|
|
<logAutobannedQueries>1</>
|
|
|
|
# If query took this many milliseconds or longer, then log the query and the
|
|
# time it took to process.
|
|
<logQueryTimeThreshold>5000</>
|
|
|
|
# Log query reply in proxy, but only for those queries above the time
|
|
# threshold above.
|
|
<logQueryReply>0</>
|
|
|
|
# Log status of spidered or injected urls?
|
|
<logSpideredUrls>1</>
|
|
|
|
# Log messages if Gigablast runs out of udp sockets?
|
|
<logNetworkCongestion>0</>
|
|
|
|
# Log messages not related to an error condition, but meant more to give an
|
|
# idea of the state of the gigablast process. These can be useful when
|
|
# diagnosing problems.
|
|
<logInformationalMessages>1</>
|
|
|
|
# Log it when document not added due to quota breach. Log it when url is too
|
|
# long and it gets truncated.
|
|
<logLimitBreeches>0</>
|
|
|
|
# Log various debug messages.
|
|
<logDebugAdminMessages>0</>
|
|
<logDebugBuildMessages>0</>
|
|
<logDebugBuildTimeMessages>0</>
|
|
<logDebugDatabaseMessages>0</>
|
|
<logDebugDirtyMessages>0</>
|
|
<logDebugDiskMessages>0</>
|
|
<logDebugDnsMessages>0</>
|
|
<logDebugHttpMessages>0</>
|
|
<logDebugLoopMessages>0</>
|
|
<logDebugLanguageDetectionMessages>0</>
|
|
<logDebugLinkInfo>0</>
|
|
<logDebugMemMessages>0</>
|
|
<logDebugMemUsageMessages>0</>
|
|
<logDebugNetMessages>0</>
|
|
<logDebugPostQueryRerankMessages>0</>
|
|
<logDebugQueryMessages>0</>
|
|
<logDebugQuotaMessages>0</>
|
|
<logDebugRobotsMessages>0</>
|
|
<logDebugSpiderCacheMessages>0</>
|
|
<logDebugSpiderFlowMessages>1</>
|
|
<logDebugSpellerMessages>0</>
|
|
<logDebugSectionsMessages>0</>
|
|
<logDebugSeoInsertMessages>1</>
|
|
<logDebugSeoMessages>0</>
|
|
<logDebugStatsMessages>0</>
|
|
<logDebugSummaryMessages>0</>
|
|
<logDebugSpiderMessages>0</>
|
|
<logDebugUrlAttempts>0</>
|
|
<logDebugSpiderDownloads>0</>
|
|
<logDebugFacebook>0</>
|
|
<logDebugTagdbMessages>0</>
|
|
<logDebugTcpMessages>0</>
|
|
<logDebugThreadMessages>0</>
|
|
<logDebugTitleMessages>0</>
|
|
<logDebugTimedbMessages>0</>
|
|
<logDebugTopicMessages>0</>
|
|
<logDebugTopDocMessages>0</>
|
|
<logDebugUdpMessages>0</>
|
|
<logDebugUnicodeMessages>0</>
|
|
<logDebugRepairMessages>0</>
|
|
<logDebugPubDateExtractionMessages>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForBuild>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForAdmin>0</>
|
|
<logTimingMessagesForDatabase>0</>
|
|
<logTimingMessagesForNetworkLayer>0</>
|
|
<logTimingMessagesForQuery>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForSpcache>0</>
|
|
<logTimingMessagesForRelatedTopics>0</>
|
|
|
|
# Log reminders to the programmer. You do not need this.
|
|
<logReminderMessages>0</>
|
|
|
|
# If enabled, gigablast will repair the rdbs as specified by the parameters
|
|
# below. When a particular collection is in repair mode, it can not spider or
|
|
# merge titledb files.
|
|
<repairModeEnabled>0</>
|
|
|
|
# Comma or space separated list of the collections to repair or rebuild.
|
|
<collectionsToRepairOrRebuild><![CDATA[main]]></>
|
|
|
|
# In bytes.
|
|
<memoryToUseForRepair>300000000</>
|
|
|
|
# Maximum number of outstanding inject spiders for repair.
|
|
<maxRepairSpiders>32</>
|
|
|
|
# If enabled, gigablast will reinject the content of all title recs into a
|
|
# secondary rdb system. That will become the primary rdb system when complete.
|
|
<fullRebuild>0</>
|
|
|
|
# If enabled, gigablast will keep the new spiderdb records when doing the full
|
|
# rebuild or the spiderdb rebuild.
|
|
<keepNewSpiderdbRecs>1</>
|
|
|
|
# If enabled, gigablast will recycle the link info when rebuilding titledb.
|
|
<recycleLinkInfo>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildTitledb>1</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildPosdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildClusterdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildSpiderdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildLinkdb>0</>
|
|
|
|
# If disabled, gigablast will skip root urls.
|
|
<rebuildRootUrls>1</>
|
|
|
|
# If disabled, gigablast will skip non-root urls.
|
|
<rebuildNonrootUrls>1</>
|
|
|
|
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
|
|
# tagdb lookup be performed? Runs much much faster without it. Will also keep
|
|
# the original doc quality and spider priority intact.
|
|
<skipTagdbLookup>0</>
|