#include "gb-include.h" #include "TcpSocket.h" #include "HttpRequest.h" #include "Pages.h" #include "Spider.h" // MAX_SPIDERS #include "Users.h" bool sendPageOverview ( TcpSocket *s , HttpRequest *r ) { //char buf [ 256*1024 ]; //char *p = buf; //char *pend = buf + 256*1024; // . print standard header // . do not print big links if only an assassin, just print host ids SafeBuf sb; g_pages.printAdminTop ( &sb , s , r ); //int32_t user = g_pages.getUserType ( s , r ); //sprintf ( p , //" \n" //"
Admin Overview |
n=X | " "returns X search results. Default is 10. Max is 50. |
s=X | \n" "returns results starting at result #X. The first result is result #0. Default is 0. Max is 499. |
ns=X | " "returns X summary excerpts in the summary of each search result. Default is defined on a per collection basis in the Display Controls. |
site=X | \n" "returned results will have URLs from the site, X. |
plus=X | " "returned results will have all words in X. Like a default AND. |
minus=X | \n" "returned results will not have any words in X. |
rat=1 | " "returned results will have ALL query terms. This is also known as a default AND search. rat means Require All Terms. |
sc=X | \n" "X can be 0 or 1 to respectively disable or enable site clustering. Default is 1, but 0 if the raw parameter is used. |
dr=X | " "X can be 0 or 1 to respectively disable or enable duplicate result removal. Default is 1, but 0 if the raw parameter is used. |
raw=X | \n" "X ranges from 0 to 8 to specify the format of the search results. raw=8 requests the XML feed. |
raw=2 | " "Just display a list of docids between <pre> tags. Will display one more docid than requested if possible, so you know if you have more docids available or not. Does not have to generate summaries so it is a bit faster, especially if you do not perform site clustering or dup removal. |
qh=X | " "X can be 0 or 1 to respectively disable or enable highlighting of query terms in the titles and summaries. Default is 1, but 0 if the raw parameter is used. |
usecache=X | \n" "X can be 0 or 1 to respectively disable or enable caching of the search results pages. Default is 1. |
rcache=X | " "X can be 0 or 1 to respectively disable or enable reading from the search results page cache. Default is 1. |
wcache=X | \n" "X can be 0 or 1 to respectively disable or enable writing to the search results page cache. Default is 1. |
bq=X | " "X can be 0 or 1 or 2. 0 means the query is NOT boolean, 1 means the query is boolean and 2 means to auto-detect. Default is 2. |
rt=X | \n" "X can be 0 or 1 to respectively disable or enable real time searches. If enabled, query response time will suffer because Gigablast will have to read from multiple files, usually 3 or 4, of varying ages, to satisfy a query. Default value of rt is 1, but 0 if the raw parameter is used. |
dt=X | " "X is a space-separated string of meta tag names. Do not forget to url-encode the spaces to +'s or %%20's. Gigablast will extract the contents of these specified meta tags out of the pages listed in the search results and display that content after each summary. i.e. &dt=description will display the meta description of each search result. &dt=description:32+keywords:64 will display the meta description and meta keywords of each search result and limit the fields to 32 and 64 characters respectively. When receiving the XML feed from gigablast, the <display name=\"meta_tag_name\">meta_tag_content</display> XML tag will be used to convey each requested meta tag's content. |
spell=X | \n" "X can be 0 or 1 to respectively disable or enable spell checking. If enabled while using the XML feed, when Gigablast finds a spelling recommendation it will be included in the XML |
topics=NUM+MAX+SCAN+ MIN+MAXW+META+ DEL+IDF+DEDUP | \n"
"\n"
"\n"
"\n"
"NUM is how many related topics you want returned. \n"
" \n" "MAX is the maximum number of topics to generate and store in cache, so if TW is increased, but still below MT, it will result in a fast cache hit.\n" " \n" "SCAN is how many documents to scan for related topics. If this is 30, for example, then Gigablast will scan the first 30 search results for related topics.\n" " \n" "MIN is the minimum score of returned topics. Ranges from 0%% to over 100%%. 50%% is considered pretty good. BUG: This must be at least 1 to get any topics back.\n" " \n" "MAXW is the maximum number of words per topic.\n" " \n" "META is the meta tag name to which Gigablast will restrict the content used to generate the topics. Do not specify thie field to restrict the content to the body of each document, that is the default.\n" " \n" "\n" "DEL is a single character delimeter which defines the topic candidates. All candidates must be separated from the other candidates with the delimeter. So <meta name=test content=\" cat dog ; pig rabbit horse\"> when using the ; as a delimeter would only have two topic candidates: \"cat dog\" and \"pig rabbit horse\". If no delimeter is provided, default funcationality is assumed.\n" " \n" "" "IDF is 1, the default, if you want Gigablast to weight topic candidates by their idf, 0 otherwise." " \n" "" "DEDUP is 1, the default, if the topics should be deduped. This involves removing topics that are substrings or superstrings of other higher-scoring topics." " \n" "" "" "Example: topics=49+100+30+1+6+author+%%3B+0+0" " \n" "The default values for those parameters with unspecifed defaults can be defined on the \"Search Controls\" page. " " \n" "" "XML feeds will contain the generated topics like: <topic><name><![CDATA[some topic]]></name><score>13</score><from>metaTagName</from></topic>" " \n" "Even though somewhat nonstandard, you can specify multiple &topic= parameters to get back multiple topic groups." " \n" "Performance will decrease if you increase the MAX, SCAN or MAXW." " | \n"
"
rdc=X | \n" "\n" "\n" "X is 1 if you want Gigablast to return the number of documents that " "contained each topic." " | \n" "
rd=X | \n" "\n" "\n" "X is 1 if you want Gigablast to return the list of docIds that " "contained each topic." " | \n" "
rp=X | \n" "\n" "\n" "X is 1 if you want Gigablast to return the popularity of each topic." " | \n" "
mdc=X | \n" "\n" "\n" "Gigablast will not display topics that are not contained in at least X " "documents. The default is configurable in the Search Controls page on a per " "collection basis." " | \n" "
t0=X | \n" "\n"
"\n"
"Gigablast will use at least X docids from each termlist. Used to get more accurate hit counts."
" \n" "For performance reasons, most large search engines nowadays only return a rough estimate of the number of search results, but you may desire to get a better approximation or even an exact count. Gigablast allows you to do this, but it may be at the expense of query resonse time." " \n" "By using the t0 variable you can tell Gigablast to use a minimum number of docids from each termlist. Typically, t0 defaults to something of around 10,000 docids. Often more docids than that are used, but this is just the minimum. So if Gigablast is forced to use more docids it will take longer to compute the search results on average, but it will give you a more precise hit count. By setting t0 to the truncation limit or higher you will max out the hit count precision." " \n" "Example: http://www.gigablast.com/search?q=test&t0=5000000\n" "" " |
d=X | \n" "X is the docId of the page you want returned. DocIds are 64-bit, so you'll need 8 bytes to hold one. |
ih=X | \n" "X is 1 to include the Gigablast header in the returned page, and 0 to exclude it. |
ibh=X | \n" "X is 1 to include the Gigablast BASE HREF tag in the cached page. The default is 1. |
q=X | \n" "X is the query that, when present, will cause Gigablast to highlight the query terms on the returned page. |
cas=X | \n" "" "X can be 0 or 1 to respectively disable or enable click and scroll. Default is 1. |
strip=X | \n" "" "X can be 0, 1 or 2. If X is 0 then no stripping is performed. If X is 1 then image and other tags are removed. An X of 2 is another form of removing tags. Default is 0. |
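\n" "Example: several of the above parameters combined into one request url (the host and values here are hypothetical, in the same style as the t0 example above): http://127.0.0.1:8000/search?q=test&n=20&s=0&raw=8&sc=1&dr=1\n"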
\n" "# The XML reply uses the Latin-1 Character Set (ISO 8859-1) when using raw=8\n" "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" ?>\n" "# OR when using raw=9\n" "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" "\n" "# It consists of one, and only one, response.\n" "<response>\n" "\n" " # If any error was received in processing the request, it will be here.\n" " <error>Out of memory</error>\n" " # The numeric code of the error, if any, goes here.\n" " # See all the Error Codes, but the " " # following errors are most likely:\n" " # %5li - A cached page was not found when it should have been.\n" " # %5li - There was a int16_tage of memory to properly process the request.\n" " # %5li - Queried collection does not exist.\n", (int32_t)ENOTFOUND, (int32_t)ENOMEM, (int32_t)ENOCOLLREC); sprintf( p , " <errno>32790</errno>\n" " # Total number of documents in the collection being searched.\n" " <docsInCollection>2060245584</docsInCollection>\n" " # An APPROXIMATION of the total number of search results for the query.\n" " <hits>4838158</hits>\n" " # This is \"1\" if more results are available after these, \"0\" if not.\n" " <moreResultsFollow>1</moreResultsFollow>\n" " # If present and value is 1, some words in the query were censored for content.\n" " <queryCensored>1</queryCensored>\n" " # If present, the value is the number of results that were censored for content.\n" " <resultsCensored>3</resultsCensored>\n" " # If this tag is present, it will hold an alternate spelling recommendation \n" " # for the query. The &spell=1 parameter must be present in the query url,\n" " # however, for you to get a spelling recommendation back.\n" " <spell>nose</spell>\n" " # If this tag is present, it contains the list of query words that were \n" " # ignored as individual words, but not necessarily as part of a phrase\n" " <ignoredWords>the in of</ignoredWords>\n" " # This is how many of the search results contain ALL of the query terms.\n" " # It is only used for printing the \"blue bar\" for doing SuperRecall\n" " <minNumExactMatches>300</minNumExactMatches>\n" "\n" " # The list of related topics, each enclosed by <topic> tags. \n" " # You must provide a topics parameter to the query url to get " "topics.\n" " <topic>\n" " # Each topic has a score. A score of 50%% or more is considered pretty good.\n" " <score>63</score>\n" " # Out of the documents scanned, how many contain this topic.\n" " <docCount>4</docCount>\n" " # The topic popularity. A measure of how popular the word or phrase is\n" " # based on how many web pages contain it overall. Ranges from 0 to 1000.\n" " # 1000 being the most popular.\n" " <popularity>16</popularity>\n" " # The docIds of the documents scanned that contain this topic.\n" " <docId>9030668134</docId>\n" " <docId>265962215563</docId>\n" " <docId>43940265200</docId>\n" " <docId>264861015824</docId>\n" " # The topic name.\n" " <name><![CDATA[Race Cars]]></name>\n" " # And OPTIONALLY the name of the meta tag it was derived from.\n" " <from>keywords</from>\n" " </topic>\n" "\n" " # The list of reference pages for the search results. Each reference is\n" " # enclosed in <reference> tags.\n" " <reference>\n" " # Each reference has a score based on its relevance to the query.\n" " <score>93</score>\n" " # Title of the reference page\n" " <title></title>\n" " # Url of the reference page\n" " <url><![CDATA[http://www.greatreference.com/]]></url>\n" " </reference>\n" "\n" " # The list of related pages for the search results. 
Each related page is\n" " # enclosed in <related> tags.\n" " <related>\n" " # Each related page has a score based on its relevance to the query.\n" " <score>91</score>\n" " # Title of the related page.\n" " <title></title>\n" " # Url of the related page.\n" " <url><![CDATA[http://www.similar.com/]]></url>\n" " # Summary of the related page.\n" " <sum><![CDATA[This page is similar to the results]]></sum>\n" " </related>\n" "\n" " # The list of search results, each enclosed in <result> tags.\n" " <result>\n" " # Each result has a title. This may be empty if none was found on the page.\n" " <title><![CDATA[My Homepage]]></title>\n" " # Each result has a summary. This may be empty. The summary is generated \n" " # so as to contain the query terms if possible.\n" " <sum><![CDATA[All about my interests and hobbies]]></sum>\n" " # If this result is categorized under the DMOZ Directory, data about each\n" " # category it is in will be enclosed in a <dmoz> tag.\n" " <dmoz>\n" " # The category ID number of this category.\n" " <dmozCatId>172</dmozCatId>\n" " # The path of this category in the directory.\n" " <dmozCat><![CDATA[Health: Dentistry]]></dmozCat>\n" " # Title of this result as listed in the directory.\n" " <dmozTitle><![CDATA[My Homepage]]></dmozTitle>\n" " # Description of this page as listed in the directory.\n" " <dmozDesc><![CDATA[A Dentist's Home Page]]></dmozDesc>\n" " </dmoz>\n" " # If the directory is being given along with the results, this is the number of\n" " # stars given to this page based on its quality.\n" " <stars>3</stars>\n" " # Each result may have a sequence of <display> tags if the feed input\n" " # contained a dt parameter. This allows you to extract\n" " # information contained in meta tags in the content of each search result.\n" " # To obtain the contents of the author meta tag, you would need to pass in\n" " # dt=author.\n" " <display name=\"author\"><![CDATA[Contents of the meta author tag]]></display>\n" " # Each result has a URL. This should never be empty.\n" " <url><![CDATA[http://www.mydomain.com/mypage.html]]></url>\n" " # The size of the page in kilobytes. Accurate to the tenth of a kilobyte.\n" " <size>5.6</size>\n" " # The time the page was last INDEXED. It may not have been indexed in a \n" " # long time if the page's content has not changed. The time is expressed \n" " # in seconds since the epoch. (Jan 1, 1970)\n" " <spidered>1064367311</spidered> \n" " # The time the page was last modified. This is taken from the HTTP reply \n" " # of the web server when downloading the page. It is 0 if unknown. The time\n" " # is expressed in seconds since the epoch. (Jan 1, 1970)\n" " <lastMod>1058477041</lastMod>\n" " # The assigned docid for this page. This number is unique and used \n" " # internally by Gigablast to identify this page. It is used to retrieve the\n" " # \"cached copy\" of the page.\n" " <docId>65990704587</docId>\n" " # When doing site clustering, this tag will be present if the result is \n" " # from the same hostname as a previous result for the same query. It \n" " # indicates that you might want to indent the result. Any further results \n" " # from this same hostname will be stripped from the feed.\n" " <clustered>1</clustered>\n" " # When Topic Clustering is being used, these will display results which \n" " # are considered similar to this result and have been clustered under it. \n" " # Each similar result is enclosed in a <similar> tag. 
\n" " <similar>\n" " # The url for the similar result.\n" " <url><![CDATA[http://www.similar.com/]]></url>\n" " # The title of the similar result.\n" " <title><![CDATA[A similar topic]]></title>\n" " </similar>\n" " # If this is present and set to 1, there are more similar results beyond \n" " # those given here. \n" " <moreSimilar>1</moreSimilar>\n" " # This is a standard HTTP MIME content classification of the result. It is \n" " # not present if the page is text/html. Otherwise, it will be one of the\n" " # following: text/plain\n" " # text/xml\n" " # application/pdf\n" " # application/msword\n" " # application/vnd.ms-excel\n" " # application/mspowerpoint\n" " # application/postscript\n" " <contentType>text/plain</contentType>\n" " # The documents are all sorted by this score. This score is a generally a\n" " # product of the WEIGHT of the query term and the COUNT of the query term\n" " # in this document. The WEIGHT is usually influenced by them term frequency\n" " # of the query term (rarer terms get more WEIGHT), by the additional weight\n" " # received by phrases which can be adjusted in the Master Controls, and,\n" " # possibly, by any user-defined weight in the query (See Weighting Query Terms).\n" " # This score is normalized by dividing by the maximum\n" " # score for all documents in the search results and then making it into a\n" " # percentage, so the score ranges from 0 to 100, and the first result\n" " # should always have score 100.\n" " <score>100</score>\n" " # This is the absolute score. Useful for merging results from other\n" " # collections or other search engines.\n" " <absScore>5132</absScore>\n" " # This is the language the page was detected as.\n" " <language><![CDATA[English]]></language>\n" " # The character set this page was originally encoded in. \n" " <charset><![CDATA[utf-8]]></charset>\n" " </result>\n" "\n" " <result>\n" " ...\n" " </result>\n" "\n" " ...\n" "\n" " # If the directory has been requested, this node will include the directory\n" " # structure for the requested category. Typically this is above the results.\n" " <directory>\n" " # Category ID for the displayed directory structure.\n" " <dirId>172</dirId>\n" " # Directory path of this category listing.\n" " <dirName>Health: Dentistry</dirName>\n" " # Specifies if the directory listing is displayed in a Right-To-Left format.\n" " <dirIsRTL>1</dirIsRTL>\n" " # Sub-Categories listed as letters meant to be displayed as a letter bar.\n" " # Each sub-category will be enclosed in a <letterbar> tag.\n" " <letterbar><![CDATA[Health/Dentistry/A]]>" " # Every sub category will include a count of how many urls are listed under it.\n" " <urlcount>5<urlcount>\n" " </letterbar>\n" " # Normal sub-categories listed in groups. These are listed in order of group\n" " # and alphabetically within each group. Each sub-category is enclosed in a\n" " # <narrow2>, <narrow1>, or <narrow> tag.\n" " <narrow2><![CDATA[Health/Dentistry/Regional]]>\n" " <urlcount>0<urlcount>\n" " </narrow2>\n" " <narrow1><![CDATA[Health/Dentistry/Association]]>\n" " <urlcount>122<urlcount>\n" " </narrow1>\n" " <narrow><![CDATA[Health/Dentistry/Children]]>\n" " <urlcount>24<urlcount>\n" " </narrow>\n" " # Symbolically linked sub-categories physically under a different category.\n" " # These will be interwoven alphabetically within the respective narrow groups.\n" " # The name listed before the path is the symbolic name. 
Each symbolically linked\n" " # sub-category is enclosed in a <symbolic2>, <symbolic1>, or \n" " # <symbolic> tag.\n" " <symbolic2><![CDATA[Dentophobia:Health/Mental_Health/Disorders/Anxiety/Phobias/Dentophobia]]>\n" " <urlcount>2</urlcount>\n" " </symbolic2>\n" " <symbolic1><![CDATA[Dental_Laboratories:Business/Healthcare/Products_and_Services/Dentistry/Dental_Laboratories]]>\n" " <urlcount>71</urlcount>\n" " </symbolic1>\n" " <symbolic><![CDATA[Products:Shopping/Health/Dental]]>\n" " <urlcount>71</urlcount>\n" " </symbolic>\n" " # Separate categories in the directory which are related to this one.\n" " <related><![CDATA[Society/Issues/Health/Dentistry]]>\n" " <urlcount>4</urlcount>\n" " </related>\n" " # This category in other languages in the directory.\n" " <altlang><![CDATA[Basque:World/Euskara/Osasuna/Odontologia]]>\n" " <urlcount>7</urlcount>\n" " </altlang>\n" " </directory>\n" "\n" "</response>\n" "\n" "\n" "" "\n" "
Key | |
a | " "Error used by an add or delete collection " "operation." " |
i | " "Error used by an inject (or delete) operation." " |
s | " "Error used by a search operation." " |
C error codes | ||
%"INT32" | " "%s | ", c,i,strerror(i)); char *s = p; // is it for injector, search results or addcoll interface? // use 'i','s','a' switch ( i ) { case EPERM : p += sprintf(p,"a - Did not have permission in the " "working dir to create/delete the " "collection subdir."); break; case ENOENT: p += sprintf(p,"a - When creating the subdir for the " "collection in the working dir, a " "directory component in pathname " "does not exist or is a dangling " "symbolic link."); break; case EIO : p += sprintf(p,"a,i,s - There was an error writing or " "reading data to or from the disk, most " "likely due to a hardware failure."); break; case EACCES: p += sprintf(p,"a,i - The working directory, or its " "parent does not allow write " "permission."); break; case EEXIST: p += sprintf(p,"a - The collection subdir already " "exists in the working dir."); break; case ENOSPC: p += sprintf(p,"a,i - There is no room on the drive " "to write data because the drive is " "full, or the user's disk quota is " "exhausted."); break; case EBADF: p += sprintf(p,"a,i,s - Read or write on a bad file " "descriptor. This should not happen."); break; case ENOBUFS : p += sprintf(p,"a - Collection name limit of %"INT32" is " "exceeded.",(int32_t)MAX_COLL_LEN); break; case ENOMEM: p += sprintf(p,"a,i,s - Out of memory."); break; } // don't print if not used! if ( s == p ) { p = b; continue; } if ( c[0] == 'e' ) c = "ffffff"; else c = "eeeeee"; p += sprintf(p," |
Gigablast error codes" " | ||
%"INT32" | " "%s | ",
c,i,mstrerror(i));
char *s = p;
// is it for injector, search results or addcoll interface?
// use 'i','s','a'
switch ( i ) {
case ETRYAGAIN:
p += sprintf(p,"a,i,s - Resources temporarily "
"unavailable.");
break;
case ENOCOLLREC:
p += sprintf(p,"a,i,s - Referenced collection does "
"not exist.");
break;
case EBADENGINEER :
p += sprintf(p,"a - Collection name being added "
"contains an illegal character, or an "
"empty name was provided, or the name "
"is more than %"INT32" characters. ", (int32_t)MAX_COLL_LEN); // SpiderLoop.cpp Msg7.cpp PageInject.cpp p += sprintf(p,"i - No URL was provided, or URL " "has no hostname. Or provided URL is " "currently being injected. Or %"INT32" " "injects are currently in progress.", (int32_t)MAX_SPIDERS); break; //case EURLTOOLONG : //p += sprintf(p,"i - Injected URL was longer than " // "%"INT32" characters.",(int32_t)MAX_URL_LEN); //break; case EBADREPLY: p += sprintf(p,"i - Received bad internal reply. You " "should never see this error."); break; case EEXIST: p += sprintf(p,"a - Adding a collection name that " "already exists."); break; case ENOTFOUND: p += sprintf(p,"i - When looking up old document " "for injected URL it was not found when " "it should have been. This is due to " "data corruption."); break; case ENODOCID: p += sprintf(p,"i - No docids were available to " "inject the URL. The database has " "reached its limit."); break; case EBUFTOOSMALL: p += sprintf(p,"i - Injected URL was longer than " "%"INT32" characters. Or the injected " "document was too big to fit in memory, " "so consider increasing " " |
gb | The Gigablast executable. Contains the web server, the database and the spider. This file is required to run gb. |
hosts.conf | This file describes each host (gb process) in the Gigablast network. Every gb process uses the same hosts.conf file. This file is required to run gb." "" " |
gb.conf | Each gb process is called a host and each gb process has its own gb.conf file. This file is required to run gb." " |
coll.XXX.YYY/ | For every collection there is a subdirectory of this form, where XXX is the name of the collection and YYY is the collection's unique id. Contained in each of these subdirectories is the data associated with that collection. |
coll.XXX.YYY/coll.conf | Each collection contains a configuration file called coll.conf. This file allows you to configure collection specific parameters. Every parameter in this file is also controllable via the administrative web pages. |
trash/ | Deleted collections are moved into this subdirectory. A timestamp in milliseconds since the epoch is appended to the name of the deleted collection's subdirectory after it is moved into the trash subdirectory. Gigablast doesn't physically delete collections in case the deletion was a mistake. |
tagdbN.xml | Several files where N is an integer. The files must be contiguous, starting with an N of 0. Each one of these files is a ruleset file. This file is required for indexing and deleting documents. |
html/ | A subdirectory that holds all the html files and images used by Gigablast. Includes Logos and help files. |
dict/ | A subdirectory that holds files used by the spell checker and the GigaBits generator. Each file in dict/ holds all the words and phrases starting with a particular letter. The words and phrases in each file are sorted by a popularity score. |
antiword | Executable called by gbfilter to convert Microsoft Word files to html for indexing. |
.antiword/ | A subdirectory that contains information needed by antiword. |
pdftohtml | Executable called by gbfilter to convert PDF files to html for indexing. |
pstotext | Executable called by gbfilter to convert PostScript files to text for indexing. |
ppthtml | Executable called by gbfilter to convert PowerPoint files to html for indexing. |
xlhtml | Executable called by gbfilter to convert Microsoft Excel files to html for indexing. |
gbfilter | Simple executable called by Gigablast with document HTTP MIME header and document content as input. Output is an HTTP MIME and html or text that can be indexed by Gigablast. |
gbstart | An optional simple script used to start up the gb process(es) on each computer in the network. Otherwise, if you have passwordless ssh capability then you can just use './gb start' and it will spawn an ssh command to start up a gb process for each host listed in hosts.conf. |
1. Set log spidered urls to YES on the log page. Then " "check the log to see if something is being logged." " |
2. Check the master controls page for the following: " " a. the spider enabled switch is set to YES. " " b. the spider max pages per second control is set " "high enough. " " c. the spider max kbps control is set high enough. |
3. Check the spider controls page for the following: " " a. the collection you wish to spider for is selected (in red). " " b. the old or new spidering is set to YES. " " c. the appropriate old and new spider priority " "checkboxes are checked. " " d. the spider start and end times are set " "appropriately. " " e. the use current time control is set correctly. " " f. the spider max pages per second control is set " "high enough. " " g. the spider max kbps control is set high enough. |
4. If you have urls from only a few domains then the same domain " "wait or same ip wait controls could be limiting the spidering " "of the urls such that you do not see any in the Spider Queue table. If the " "indexed document count on the home page is increasing then this may be the " "case. Even if the count is not increasing, it may still be the case if the " "documents all have errors, like 404 not found. |
"
"4. Make sure you have urls to spider by running 'gb dump s |
In the current spider queue, what are the statuses of each url? If " "they are mostly \"getting cached web page\" and the IP address column is " "mostly empty, then Gigablast may be bogged down looking up the cached web " "pages of each url in the spider queue only to discover it is from a domain " "that was just spidered. This is a wasted lookup, and it can bog things down " "pretty quickly when you are spidering a lot of old urls from the same " "domain. " "Try setting same domain wait and same ip wait both to 0. This " "will pound those domains' servers, though, so be careful. Maybe set it to " "1000ms or so instead. We plan to fix this in the future." " |
Try increasing the <tfndbMaxPageCacheMem> in the gb.conf for all hosts in the cluster to minimize the disk seeks into tfndb as seen on the Stats page. Stop all gb processes then use ./gb installconf to distribute the gb.conf to all hosts in the cluster. You might also try decreasing the size of the url filters table; every regular expression in that table is consulted for every link added and it can really block the cpu.\n" " |
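\n" "For example, the corresponding line in gb.conf would look something like the following (the value shown here is illustrative only, not a recommendation): <tfndbMaxPageCacheMem> 300000000</>\n"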
u=X | \n" "X is the url you are injecting. This is required. |
c=X | \n" "X is the name of the collection into which you are injecting the content. This is required. |
delete=X | \n" "X is 0 to add the URL/content and 1 to delete the URL/content from the index. Default is 0. |
ip=X | \n" "X is the ip of the URL (i.e. 1.2.3.4). If this is omitted or invalid then Gigablast will look up the IP, provided iplookups is true. But if iplookups is false, Gigablast will use the default IP of 1.2.3.4. |
iplookups=X | \n" "If X is 1 and the ip of the URL is not valid or provided then Gigablast will look it up. If X is 0 Gigablast will never look up the IP of the URL. Default is 1. |
dedup=X | \n" "If X is 1 then Gigablast will not add the URL if another already exists in the index from the same domain with the same content. If X is 0 then Gigablast will not do any deduping. Default is 1. |
rs=X | \n" "X is the number of the ruleset to use to index the URL and its content. It will be auto-determined if rs is omitted or rs is -1. |
quick=X | \n" "If X is 1 then the reply returned after the content is injected is the reply described directly below this table. If X is 0 then the reply will be the HTML form interface. |
hasmime=X | \n" "X is 1 if the provided content includes a valid HTTP MIME header, 0 otherwise. Default is 0. |
content=X | \n" "X is the content for the provided URL. If hasmime is true then the first part of the content is really an HTTP mime header, followed by \"\r\n\r\n\", and then the actual content. |
ucontent=X | \n" "X is the UNencoded content for the provided URL. Use this one instead of the content cgi parameter if you do not want to encode the content. This breaks the HTTP protocol standard, but is convenient because the caller does not have to convert special characters in the document to their corresponding HTTP code sequences. IMPORTANT: this cgi parameter must be the last one in the list. |
\n" "POST /inject HTTP/1.0\r\n\n" "Content-Length: 291\r\n\n" "Content-Type: text/html\r\n\n" "Connection: Close\r\n\n" "\r\n\n" "u=myurl&c=&delete=0&ip=4.5.6.7&iplookups=0&dedup=1&rs=7&quick=1&hasmime=1&ucontent=HTTP 200\r\n\n" "Last-Modified: Sun, 06 Nov 1994 08:49:37 GMT\r\n\n" "Connection: Close\r\nContent-Type: text/html\r\n\r\n\n" "Overview \n" "This is the unencoded content of the page we are injecting.\n" "
%5li %s | " "There was a shortage of memory to properly " "process the request. |
%05"INT32" %s | " "A cached page was not found when it should have " "been, likely due to corrupt data on disk. |
%5li %s | " "There was a shortage of resources so the " "request should be repeated. |
%5li %s | " "The injection was to a collection that does " "not exist. |
\n" "<name>X</>" " | \n" "This tag tells Gigablast what part of the document to index. You can have multiple <name> tags in the same index rule. X can be one of the following values: \n" "" "
| \n"
"||||||||||||||||
\n" "<prefix>X</>" " | \n" "\n" "If present, Gigablast will index the words and phrases with the specified prefix, X. Fielded searches can then be performed. Example: <prefix>title</>" " | \n" "||||||||||||||||
\n" "<maxQualityForSpamDetect>X</>" " | \n" "\n" "Spam detection will be performed on the words and phrases if the document's quality is X or lower. Spam detection generally lowers the scores of repeated words and phrases based on the degree of repetition." " | \n" "||||||||||||||||
\n" "<minQualityToIndex>X</>" " | \n" "\n" "If the document's quality is below X, then do not index the words and phrases for this index rule." " | \n" "||||||||||||||||
\n" "<filterHtmlEntities>X</>" " | \n" "\n" "If X is yes then convert HTML entities, like &gt;, into their represented characters before indexing." " | \n" "||||||||||||||||
\n" "<indexIfUniqueOnly>X</>" " | \n" "\n" "If X is yes then each word or phrase will only be indexed if not already indexed by a previous index rule in the ruleset, and only the first occurence of the word or phrase will be indexed, subsequent occurences will not count towards the score." " | \n" "||||||||||||||||
\n" "<indexSingletons>X</>" " | \n" "\n" "If X is yes then index the words, otherwise do not." " | \n" "||||||||||||||||
\n" "<indexPhrases>X</>" " | \n" "\n" "If X is yes then index the phrases, otherwise do not." " | \n" "||||||||||||||||
\n" "<indexAsWhole>X</>" " | \n" "\n" "If X is yes then index the whole sequence of indexable words as a checksum." " | \n" "||||||||||||||||
\n" "<useStopWords>X</>" " | \n" "\n" "If X is yes then use stop words when forming phrases." " | \n" "||||||||||||||||
\n" "<useStems>X</>" " | \n" "\n" "If X is yes then index stems. Currently unsupported." " | \n" "||||||||||||||||
\n"
""
"<quality11> X1 </> \n" "<quality12> X2 </>... \n" "<quality1N> XN </> \n" "<maxLen11> Y1 </> \n" "<maxLen12> Y2 </>... \n" "<maxLen1N> YN </> \n" "" " | \n"
"\n" "This maps the quality of the document to a maximum number of CHARACTERS to index. The (Xn,Yn) points form a piecewise function which is linearly interpolated between points. The edges are horizontal, meaning, if X is 0 Y will be Y1, or if X is infinite, Y will be YN.\n" " | \n" "||||||||||||||||
\n"
"\n"
"<quality21> X1 </> \n" "<quality22> X2 </>... \n" "<quality2N> XN </> \n" "<maxScore21> Y1 </> \n" "<maxScore22> Y2 </>... \n" "<maxScore2N> YN </> \n" "" " | \n"
"\n" "This maps the quality of the document to a percentage of the absolute max score a word or phrase can have. This is the QUALITY_WEIGHT_MAX value in the formula." " | \n" "||||||||||||||||
\n"
"\n"
"<quality31> X1 </> \n" "<quality32> X2 </>... \n" "<quality3N> XN </> \n" "<scoreWeight31> Y1 </> \n" "<scoreWeight32> Y2 </>... \n" "<scoreWeight3N> YN </> \n" "" " | \n"
"\n" "This maps the quality of the document to a percentage weight on the base score of the words and phrases being indexed. This is the QUALITY_WEIGHT value in the formula." " | \n" "||||||||||||||||
\n"
"\n"
"<len41> X1 </> \n" "<len42> X2 </>... \n" "<len4N> XN </> \n" "<scoreWeight41> Y1 </> \n" "<scoreWeight42> Y2 </>... \n" "<scoreWeight4N> YN </> \n" "" " | \n"
"\n" "This maps the length (in characters) of the what is being indexed to a percentage weight on the base score of the words and phrases being indexed. This is the LENGTH_WEIGHT value in the formula." " | \n" "||||||||||||||||
\n"
"\n"
"<len51> X1 </> \n" "<len52> X2 </>... \n" "<len5N> XN </> \n" "<maxScore51> Y1 </> \n" "<maxScore52> Y2 </>... \n" "<maxScore5N> YN </> \n" " | \n"
"\n" "This maps the length (in characters) of the what is being indexed to a percentage of the absolute maximum score a word or phrase can have. This is the LENGTH_WEIGHT_MAX value in the formula." " | \n" "
\n" "BASE_SCORE = min { (256 * QUALITY_WEIGHT * LENGTH_WEIGHT ) / 10000 + BOOST ,\n" " (0xffffffffLL * QUALITY_WEIGHT_MAX * LENGTH_WEIGHT_MAX ) / 10000 }\n" "\n" "
Item | \n" "Ruleset Tag | \n" "Desription | \n" "\n" "<meta name=foo content=bar>" " | \n" "\n" "--" " | \n" "\n" "User-defined meta tags use the quality of the document multiplied by 256 as their score. If this product is 0 it is upped to 1. This score is then mapped to an 8-bit final score an indexed. Furthermore, when indexing user-defined meta tags, only one occurence of each word or phrase is counted. In the future, these meta tags may have their own index rule." " | \n" "\n" "" "
\n" "http://www.xxx.com/abc" " | \n" "\n" "<indexUrl>X</>" " | \n" "If X is yes then the entire url is indexed as one word with a BASE_SCORE of 1 and with a url: prefix so a search for url:http://www.xxx.com/ will bring up the document." " | \n" "
\n" "http://www.xxx.com/abc" " | \n" "\n" "<indexSubUrl>X</>" " | \n" "If X is yes then the url is indexed as if it occured in the document, but with a random BASE_SCORE (based on url hash) and a suburl: prefix so a search for suburl:\"com/abc\" will bring up the document." " | \n" "
\n" "http://www.xxx.com/abc" " | \n" "\n" "<indexIp>X</>" " | \n" "If X is yes then the IP of the url will be indexed as if it were one word but with a random BASE_SCORE (based on url hash). Furthermore, the last number of the IP address is replaced with a zero and that IP address is indexed in order to provide an IP domain search ability. So if a url has the IP address 1.2.3.4 then a search for ip:1.2.3.4 or for ip:1.2.3 should bring it up." " | \n" "
\n" "http://www.xxx.com/abc?q=hi" " | \n" "\n" "<indexSite>X</>" " | \n" "If X is yes then the following terms would be indexed with a base score of BASE_SCORE (but multiplied by 3 if the url is a root url): "
"
| \n"
"
\n" "http://www.xxx.com/form.php" " | \n" "\n" "<indexExt>X</>" " | \n" "If X is yes then the file extension, if any, of the url would be indexed with the ext: prefix and a score of BASE_SCORE. So a query of ext:php would bring up the document in this example case." " | \n" "
\n" "links" " | \n" "\n" "<indexLinks>X</>" " | \n" "If X is yes then the various links in the document will be indexed with a link: prefix. Scores are special in this case." " | \n" "
\n" "collection name" " | \n" "\n" "--" " | \n" "The collection name of the document is indexed with the coll: prefix and a BASE_SCORE of 1. " " | \n" "
\n" "content type" " | \n" "\n" "--" " | \n" "The content type of the document is indexed with the type: (or filetype:) prefix and a BASE_SCORE of 1. If the content type is not one of these supported content types, then nothing will be indexed: "
"
| \n"
"
" "Char Decimal Hex Entity Char Decimal Hex Entity\n" " Reference Reference\n" "NUL 0 0 SOH 1 1\n" "STX 2 2 ETX 3 3\n" "EOT 4 4 ENQ 5 5\n" "ACK 6 6 BEL 7 7\n" "BS 8 8 HT 9 9\n" "NL 10 a VT 11 b\n" "NP 12 c CR 13 d\n" "SO 14 e SI 15 f\n" "DLE 16 10 DC1 17 11\n" "DC2 18 12 DC3 19 13\ DC4 20 14 NAK 21 15\ SYN 22 16 ETB 23 17\ CAN 24 18 EM 25 19\ SUB 26 1a ESC 27 1b\ FS 28 1c GS 29 1d\ RS 30 1e US 31 1f\ SP 32 20 ! 33 21\ \" 34 22 " # 35 23\ $ 36 24 %% 37 25\ & 38 26 & ' 39 27\ ( 40 28 ) 41 29\ * 42 2a + 43 2b\ , 44 2c - 45 2d\ . 46 2e / 47 2f\ 0 48 30 1 49 31\ 2 50 32 3 51 33\ 4 52 34 5 53 35\ 6 54 36 7 55 37\ 8 56 38 9 57 39\ : 58 3a ; 59 3b\ < 60 3c < = 61 3d\ > 62 3e > ? 63 3f\ @ 64 40 A 65 41\ B 66 42 C 67 43\ D 68 44 E 69 45\ F 70 46 G 71 47\ H 72 48 I 73 49\ J 74 4a K 75 4b\ L 76 4c M 77 4d\ N 78 4e O 79 4f\ P 80 50 Q 81 51\ R 82 52 S 83 53\ T 84 54 U 85 55\ V 86 56 W 87 57\ X 88 58 Y 89 59\ Z 90 5a [ 91 5b\ \\ 92 5c ] 93 5d\ ^ 94 5e _ 95 5f\ ` 96 60 a 97 61\ b 98 62 c 99 63\ d 100 64 e 101 65\ f 102 66 g 103 67\ h 104 68 i 105 69\ j 106 6a k 107 6b\ l 108 6c m 109 6d\ n 110 6e o 111 6f\ p 112 70 q 113 71\ r 114 72 s 115 73\ t 116 74 u 117 75\ v 118 76 w 119 77\ x 120 78 y 121 79\ z 122 7a { 123 7b\ | 124 7c } 125 7d\ ~ 126 7e DEL 127 7f\ -- 128 80 -- 129 81\ -- 130 82 -- 131 83\ -- 132 84 -- 133 85\ -- 134 86 -- 135 87\ -- 136 88 -- 137 89\ -- 138 8a -- 139 8b\ -- 140 8c -- 141 8d\ -- 142 8e -- 143 8f\ -- 144 90 -- 145 91\ -- 146 92 -- 147 93\ -- 148 94 -- 149 95\ -- 150 96 -- 151 97\ -- 152 98 -- 153 99\ -- 154 9a -- 155 9b\ -- 156 9c -- 157 9d\ -- 158 9e -- 159 9f\ 160 a0 ¡ 161 a1 ¡\ ¢ 162 a2 ¢ £ 163 a3 £\ ¤ 164 a4 ¤ ¥ 165 a5 ¥\ ¦ 166 a6 ¦ § 167 a7 §\ ¨ 168 a8 ¨ © 169 a9 ©\ ª 170 aa ª « 171 ab «\ ¬ 172 ac ¬ 173 ad ­\ ® 174 ae ® ¯ 175 af ¯\ ° 176 b0 ° ± 177 b1 ±\ ² 178 b2 ² ³ 179 b3 ³\ ´ 180 b4 ´ µ 181 b5 µ\ ¶ 182 b6 ¶ · 183 b7 ·\ ¸ 184 b8 ¸ ¹ 185 b9 ¹\ º 186 ba º » 187 bb »\ ¼ 188 bc ¼ ½ 189 bd ½\ ¾ 190 be ¾ ¿ 191 bf ¿\ À 192 c0 À Á 193 c1 Á\ Â 194 c2 Â Ã 195 c3 Ã\ Ä 196 c4 Ä Å 197 c5 Å\ Æ 198 c6 Æ Ç 199 c7 Ç\ È 200 c8 È É 201 c9 É\ Ê 202 ca Ê Ë 203 cb Ë\ Ì 204 cc Ì Í 205 cd Í\ Î 206 ce Î Ï 207 cf Ï\ Ð 208 d0 Ð Ñ 209 d1 Ñ\ Ò 210 d2 Ò Ó 211 d3 Ó\ Ô 212 d4 Ô Õ 213 d5 Õ\ Ö 214 d6 Ö × 215 d7 ×\ Ø 216 d8 Ø Ù 217 d9 Ù\ Ú 218 da Ú Û 219 db Û\ Ü 220 dc Ü Ý 221 dd Ý\ Þ 222 de Þ ß 223 df ß\ à 224 e0 à á 225 e1 á\ â 226 e2 â ã 227 e3 ã\ ä 228 e4 ä å 229 e5 å\ æ 230 e6 æ ç 231 e7 ç\ è 232 e8 è é 233 e9 é\ ê 234 ea ê ë 235 eb ë\ ì 236 ec ì í 237 ed í\ î 238 ee î ï 239 ef ï\ ð 240 f0 ð ñ 241 f1 ñ\ ò 242 f2 ò ó 243 f3 ó\ ô 244 f4 ô õ 245 f5 õ\ ö 246 f6 ö ÷ 247 f7 ÷\ ø 248 f8 ø ù 249 f9 ù\ ú 250 fa ú û 251 fb û\ ü 252 fc ü ý 253 fd ý\ þ 254 fe þ ÿ 255 ff ÿ\" "
Gigablast uses its own format for logging messages, for example, \n" " \n" "1091228736104 0 INIT Gigablast Version 1.234\n" "1091228736104 0 INIT thread Allocated 435333 bytes for thread stacks.\n" "1091228736104 0 WARN mem Failed to alloc 360000 bytes.\n" "1091228736104 0 WARN query Failed to intersect lists. Out of memory.\n" "1091228736104 0 WARN query Too many words. Query truncated.\n" "1091228736104 0 INFO build GET http://hohum.com/foobar.html\n" "1091228736104 0 INFO build http://hohum.com/foobar.html ip=4.5.6.7 : Success\n" "1091228736104 0 DEBUG build Skipping xxx.com, would hammer IP.\n" "\n" " \n" "The first field, a large number, is the time in milliseconds since the epoch. This timestamp is useful for evaluating performance. \n" " \n" "The second field, a 0 in the above example, is the hostId (from hosts.conf) of the host that logged the message. \n" " \n" "The third field, INIT in the first line of the above example, is the type of log message. It can be any of the following: \n" " \n" "
\n" "The fourth field is the resource that is logging the message. The resource can be one of the following:" "" "
\n" "Finally, the last field, is the message itself." " \n" "You can turn many messages on and off by using the Log Controls." " \n" "The same parameters on the Log Controls page can be adjusted in the gb.conf file." " \n" "\n" "\n" "\n" "" "" "" "" "\n" " \n" "Gigablast is a fairly sophisticated database that has a few things you can tweak to increase query performance or indexing performance.\n" " \n" "\n" "Query Optimizations:\n" "\n" "
\n" "\n" "Build Optimizations:\n" "
\n" "\n" "General Optimizations:\n" "
\n" "\n" "\n" "\n" "\n" ">\n" " \n" " \n" //"## This is the IP and port that a user connects to in order to search this\n" //"## Gigablast network. This should be the same for all gb processes\n" //"<mainExternalIp> 68.35.105.199</>\n" //"<mainExternalPort> 8000</>\n" //"\n" "## Mem available to this process. May be exceeded due to fragmentation.\n" "<maxMem> 445000000</>\n" "\n" "## Max incoming bandwith to use for spidering, for all hosts combined." "<maxIncomingKbps> 3000.0</>\n" "\n" "## The maximum number of pages to spider per second, for all hosts combined." "<maxPagesPerSecond> 20.00</>\n" "\n" "## Max threads for reading spider-related information on disk.\n" "<spiderMaxDiskThreads> 1</>\n" "\n" "## Max threads for reading big/med/small chunks of spider-related info on disk\n" "<spiderMaxBigDiskThreads> 1</>\n" "<spiderMaxMedDiskThreads> 1</>\n" "<spiderMaxSmaDiskThreads> 5</>\n" "\n" "## Max threads for reading query-related information on disk.\n" "<queryMaxDiskThreads> 20</>\n" "\n" "## Max threads for reading big/med/small chunks of query-related info on disk\n" "<queryMaxBigDiskThreads> 1</>\n" "<queryMaxMedDiskThreads> 3</>\n" "<queryMaxSmaDiskThreads> 10</>\n" "\n" "## What are the IP addresses and ports of the DNS servers? Accessed randomly.\n" "<dns><ip>68.35.172.5</><port>53</></>\n" "<dns><ip>68.35.172.6</><port>53</></>\n" "\n" "## How many bytes should we use for caching DNS replies?\n" "<dnsMaxCacheMem> 13000</>\n" "\n" "## Should we save/load the DNS reply cache when we exit/start? 1=YES 0=NO\n" "<dnsSaveCache> 0</>\n" "\n" "## Below the various Gigablast databases are configured.\n" "## <*dbMaxTreeMem> - mem used for holding new recs\n" "## <*dbMaxPageCacheMem> - disk page cache mem for this db\n" "## <*dbMaxCacheMem> - cache mem for holding single recs\n" "## <*dbMinFilesToMerge> - required # files to trigger merge\n" "## <*dbSaveCache> - save the rec cache on exit?\n" "## <*dbMaxCacheAge> - max age for recs in rec cache\n" "## See that Stats page for a record counts and stats.\n" "\n" "## Sitedb holds site-based parsing info. A tagdb record assigns a url or site\n" "## to a ruleset. 
Each tagdb record is about 100 bytes or so.\n" "<tagdbMaxTreeMem> 1200000</>\n" "<tagdbMaxPageCacheMem> 200000</>\n" "<tagdbMaxCacheMem> 131072</>\n" "<tagdbMinFilesToMerge> 2</>\n" "\n" "## Titledb holds the compressed documents that we've indexed.\n" "<titledbMaxTreeMem> 1000000</>\n" "<titledbMaxCacheMem> 10485760</>\n" "<titledbMinFilesToMerge> 3</>\n" "<titledbMaxCacheAge> 86400</>\n" "<titledbSaveCache> 0</>\n" "\n" "## Clusterdb caches small records for site clustering and deduping.\n" "<clusterdbMaxCacheMem> 131072</>\n" "<clusterdbSaveCache> 0</>\n" "\n" "## Checksumdb is used for deduping same-site urls at index time.\n" "<checksumdbMaxTreeMem> 1048576</>\n" "<checksumdbMaxCacheMem> 2097152</>\n" "<checksumdbMaxPageCacheMem> 2097152</>\n" "<checksumdbMinFilesToMerge> 2</>\n" "\n" "## Tfndb holds small records for each url in Titledb.\n" "<tfndbMaxTreeMem> 5000000</>\n" "<tfndbMaxPageCacheMem> 155000000</>\n" "<tfndbMinFilesToMerge> 2</>\n" "\n" "## Spiderdb holds urls to be spidered\n" "<spiderdbMaxTreeMem> 1200000</>\n" "<spiderdbMaxCacheMem> 131072</>\n" "<spiderdbMaxPageCacheMem> 256000</>\n" "<spiderdbMinFilesToMerge> 2</>\n" "\n" "## Robotdb caches robots.txt files.\n" "<robotdbMaxCacheMem> 131072</>\n" "<robotdbSaveCache> 0</>\n" "\n" "## Indexdb holds the terms extracted from spidered documents.\n" "<indexdbMaxTreeMem> 8000000</>\n" "<indexdbMaxCacheMem> 500000</>\n" "<indexdbMinFilesToMerge> 4</>\n" "<indexdbMaxIndexListAge> 86400</>\n" "<indexdbTruncationLimit> 100000</>\n" "<indexdbSaveCache> 0</>\n" "<onlyAddUnchangedTermIds> 1</>" "\n" "## The HTTP server info\n" "## Maximum simultaneous connections. Excess will be closed.\n" "<httpMaxSockets> 500</>\n" "<httpMaxSendBufSize> 32768</>\n" "\n" "## Bytes to use for caching search result pages.\n" "<maxPageCacheMem> 1000000</>\n" "## Maximum age in seconds.\n" "<maxPageCacheAge> 14400</>\n" "<resultsSaveCache> 0</>\n" "\n" "## Max linkers to a doc we sample to determine quality.\n" "<maxIncomingLinksToSample> 100</>\n" "\n" "## Percent more to weight phrases than single words.\n" "<queryPhraseWeight> 100</>\n" "\n" "## Maximum weight one query term can have relative to another in the query.\n" "<queryMaxMultiplier> 10.0</>\n" "\n" "## Sync info\n" "<syncIndexdb> 1</>\n" "<syncTitledb> 1</>\n" "<syncSpiderdb> 1</>\n" "<syncChecksumdb> 1</>\n" "<syncSitedb> 1</>\n" "<syncDoUnion> 1</>\n" "<syncDryRun> 0</>\n" "<syncBytesPerSecond> 100000000</>\n" "\n" "## Is spidering enabled for this host? 1=YES 0=NO\n" "<spideringEnabled> 0</>\n" "\n" "## Is injection enabled for this host? 1=YES 0=NO\n" "<injectionEnabled> 1</>\n" "\n" "## Can others add urls to a collection? 1=YES 0=NO\n" "<addUrlEnabled> 0</>\n" "\n" "## Serve ads from ah-ha? 1=YES 0=NO\n" "<adFeedEnabled> 0</>\n" "\n" "## Can non-admins connect to this webserver? 1=YES 0=NO\n" "<httpServerEnabled> 1</>\n" "\n" "## Send an email when a host is detected as dead? 1=YES 0=NO\n" "<sendEmailAlerts> 0</>\n" "\n" "## Allow software interrupts? 1=YES 0=NO\n" "<allowAsyncSignals> 0</>\n" "\n" "## Read only mode does not allow spidering. 1=YES 0=NO\n" "<readOnlyMode> 0</>\n" "\n" "## Use /etc/hosts file to resolve hostnames? 1=YES 0=NO\n" "<useEtcHosts> 0</>\n" "\n" "## Restrict merging to one host per token group? Hosts that use the same\n" "## disk and mirror hosts are generally in the same token group so that only one\n" "## host in the group can be doing a merge at a time. This prevents query\n" "## response time from suffering too much. 
1=YES 0=NO\n" "<useMergeToken> 0</>\n" "\n" "## If this is true we do not retrieve data from the network if we have it\n" "## local. Useful if network is slow or drives are fast. 1=YES 0=NO\n" "<preferLocalReads> 0</>\n" "\n" "## If this is true all writes are synchronous. 1=YES 0=NO\n" "<flushWrites> 1</>\n" "\n" "## Spell checking requires considerably more memory, so only a few hosts should\n" "## have this enabled if possible. 1=YES 0=NO\n" "<doSpellChecking> 1</>\n" "" "## The User-Agent field used by the Gigablast spider.\n" "<spiderUserAgent> Gigabot/1.0</>\n" "" "## Try to save unsaved in-memory data to disk every X minutes.\n" "<autoSaveFrequency> 15</>\n" "> \n" "## Log Controls\n" "<logHttpRequests> 1</>\n" "<logSpideredUrls> 1</>\n" "<logInfo> 1</>\n" "<logNetCongestion> 0</>\n" "<logLimits> 0</>\n" "<logDebugAddurl> 0</>\n" "<logDebugAdmin> 0</>\n" "<logDebugBuild> 0</>\n" "<logDebugDb> 0</>\n" "<logDebugDisk> 0</>\n" "<logDebugHttp> 0</>\n" "<logDebugLoop> 0</>\n" "<logDebugNet> 0</>\n" "<logDebugQuery> 0</>\n" "<logDebugSpeller> 0</>\n" "<logDebugTcp> 0</>\n" "<logDebugThread> 0</>\n" "<logDebugTopics> 0</>\n" "<logDebugUdp> 0</>\n" "<logTimingBuild> 0</>\n" "<logTimingDb> 0</>\n" "<logTimingNet> 0</>\n" "<logTimingQuery> 0</>\n" "<logTimingTopics> 0</>\n" "<logReminders> 0</>\n" "\n" "\n" "\n" "\n" ">\n" " \n" "Every gb process uses the same hosts.conf file. The hosts.conf file describes the hosts (gb processes) participating in the network.\n" "Each line in this file is a host entry. The number of participating hosts must be a power of 2. Each host entry uses the following fields: \n" "
\n" "IMPORTANT: The group IDS in the hosts.conf must be strictly " "increasing, at least up until it hits a host in group #0 again." " \n" "Here is a sample hosts.conf file for a network of 8 hosts running on 8 computers: \n" "\n" " \n" "#ID IP LINKIP UDP1 UDP2 DNS HTTP IDE GRP DIR\n" "\n" "0 64.62.142.231 64.62.142.231 9000 10000 6000 8000 0 0 /a\n" "1 64.62.142.233 64.62.142.233 9000 10000 6000 8000 0 1 /a\n" "2 64.62.142.235 64.62.142.235 9000 10000 6000 8000 0 2 /a\n" "3 64.62.142.237 64.62.142.237 9000 10000 6000 8000 0 3 /a\n" "4 64.62.142.239 64.62.142.239 9000 10000 6000 8000 0 0 /a\n" "5 64.62.142.241 64.62.142.241 9000 10000 6000 8000 0 1 /a\n" "6 64.62.142.244 64.62.142.244 9000 10000 6000 8000 0 2 /a\n" "7 64.62.142.246 64.62.142.246 9000 10000 6000 8000 0 3 /a\n" "\n" " \n" "\n" "\n" "" "" /* "\n" " \n" "A ruleset is a set of rules used for spidering and indexing the content of a URL. This section talks about how to assign a ruleset to a URL. Each ruleset is a file in Gigablast's working directory with a file name like tagdb*.xml, where '*' is a number.\n" " \n" "IMPORTANT: Do not change the indexing section or the <linksUnbanned>, <linksClean> or <linksDirty> tags of a ruleset file if some documents in the index were indexed with that ruleset file. To do so might create some unrepairable data corruption.\n" " \n" "The following is an example ruleset for a particular URL (\"the URL\"):\n" "\n" " \n" "\n" "# This is the unique name of the ruleset which is used for \n" "# display in drop-down menus in administrative, web-based GUIs.\n" "<name>default</>\n" "\n" "\n" "# This is the accompanying description displayed on the Sitedb tool and\n" "# URL Filters pages.\n" "<description>This is the default ruleset used for most urls.</>\n" "\n" "# If a ruleset is no longer actively used, it is not deleted, but retired.\n" "# Retired rulesets are not displayed to spam assassins on the Sitedb tool \n" "# and URL Filters pages.\n" "<retired>no</>\n" "\n" "##############################################################################\n" "# \n" "# The Quality Section. This section of the ruleset is used to determine the \n" "# QUALITY of the URL. The quality ranges from 0%% to over 100%% and is used to \n" "# influence many other things in this file. A quality of 30%% is considered to \n" "# be the quality of the average web page.\n" "#\n" "##############################################################################\n" "\n" "# The quality of the URL will not be allowed to exceed this value.\n" "<maxQuality>100</> (default 100%%)\n" "\n" "# This is the unadjusted quality of the URL. The maps below may modify it to\n" "# get the final quality of the URL.\n" "<baseQuality>30</> (default 30%%)\n" "\n" "# Now for some maps. Each map is a graph that maps one thing to another.\n" "# The first thing listed is the X component. All X components are listed first\n" "# followed by their corresponding Y components. Taken together they create a\n" "# set of points on the Cartesian graph. In this way Gigablast can map an\n" "# arbitrary value in the domain (X axis) to its corresponding value in the\n" "# image (Y axis). The X components must be in ascending order.\n" "#\n" "# The tag name of each map component, usually something like 'numLinks13',\n" "# always contains a number, in the case of this example it is 13. 
These numbers\n" "# are just used to ensure that the tag name is unique, nothing more.\n" "#\n" "# Gigablast linearly interpolates between the supplied points in the graph in \n" "# order to map X values that are not explicitly given in the graph. The \n" "# interpolation function extends horizontally from the first/last points with \n" "# the same image value of the first/last point.\n" "#\n" "# A map can have up to 32 defined points, but typically just 5 are used.\n" "\n" "# In this map the number of incoming links is mapped to a quality BOOST for the\n" "# URL. Only one incoming link is counted per top 2 bytes of the ip address\n" "# (most significant 2 bytes of the IP) if \"restrict link voting\" is\n" "# turned on in the Spider Controls. This helps prevent spam. This boost\n" "# is added to the baseQuality, not multiplied.\n" "<numLinks11> 0 </>\n" "<numLinks12> 5 </>\n" "<numLinks13> 10 </>\n" "<numLinks14> 20 </>\n" "<numLinks15> 50 </>\n" "<qualityBoost11> 0 </>\n" "<qualityBoost12> 5 </>\n" "<qualityBoost13> 10 </>\n" "<qualityBoost14> 15 </>\n" "<qualityBoost15> 20 </>\n" "\n" "# This map is like the above map, but the SUM of the baseQuality of all \n" "# linkers is mapped to a baseQuality boost for the URL. The boost is added to \n" "# the baseQuality, not multiplied.\n" "<linkQualitySum21> 0 </>\n" "<linkQualitySum22> 50 </>\n" "<linkQualitySum23> 100 </>\n" "<linkQualitySum24> 150 </>\n" "<linkQualitySum25> 200 </>\n" "<qualityBoost21> 0 </>\n" "<qualityBoost22> 5 </>\n" "<qualityBoost23> 10 </>\n" "<qualityBoost24> 15 </>\n" "<qualityBoost25> 20 </>\n" "\n" "# This map is like the above map, but the quality of the root page of the URL\n" "# is mapped to a baseQuality boost for the URL. The boost is added to the \n" "# baseQuality, not multiplied. If the URL is a root URL then the rootQuality\n" "# for purposes of just this map is assumed to be 30%% to prevent explosive\n" "# feedback.\n" "<rootQuality31> 0 </>\n" "<rootQuality32> 50 </>\n" "<rootQuality33> 100 </>\n" "<rootQuality34> 200 </>\n" "<rootQuality35> 500 </>\n" "<qualityBoost31> 0 </>\n" "<qualityBoost32> 5 </>\n" "<qualityBoost33> 10 </>\n" "<qualityBoost34> 15 </>\n" "<qualityBoost35> 20 </>\n" "\n" "\n" "##############################################################################\n" "#\n" "# The Quota Section. How many documents should we index from the site of the \n" "# URL? Quotas can be turned on/off for old/new URLs via the \"Spider Controls\" \n" "# page.\n" "#\n" "##############################################################################\n" "\n" "# How many docs from the site of the URL should we allow into the index?\n" "# A site is typically just the hostname of the URL, but, if a record for\n" "# the URL exists in tagdb, then the site of that record will be the site.\n" "# Use -1 for no max. \n" "<maxDocs>20000</> (default -1)\n" "\n" "# This map maps the quality of the root page of the URL to a quota boost.\n" "# The boost can be negative. 
A boost of -100%% makes the quota 0.\n" "# The base quota is given by the <maxDocs> field above.\n" "<rootQuality71> 0 </>\n" "<rootQuality72> 30 </>\n" "<rootQuality73> 50 </>\n" "<rootQuality74> 60 </>\n" "<rootQuality75> 70 </>\n" "<quotaBoost71> -100 </>\n" "<quotaBoost72> 0 </>\n" "<quotaBoost73> 100 </>\n" "<quotaBoost74> 200 </>\n" "<quotaBoost75> 300 </>\n" "\n" "# Like the above map, but the quality of the URL is mapped to a quota boost.\n" "# The quota boost is multiplied by the <maxDocs> number and then added\n" "# to it.\n" "<quality81> 0 </>\n" "<quality82> 30 </>\n" "<quality83> 50 </>\n" "<quality84> 60 </>\n" "<quality85> 70 </>\n" "<quotaBoost81> -100 </>\n" "<quotaBoost82> 0 </>\n" "<quotaBoost83> 100 </>\n" "<quotaBoost84> 200 </>\n" "<quotaBoost85> 300 </>\n" "\n" "##############################################################################\n" "#\n" "# The Spider Section. The following parameters control how the URL is spidered.\n" "# Spidering can be turned on/off as a whole or for various spider priority \n" "# queues via the Spider Controls page. Many other parameters exist there as \n" "# well.\n" "#\n" "##############################################################################\n" "\n" "# How long to wait to respider for the first time.\n" "# This is in DAYS. This tag overrides Spider Controls if present.\n" "<firstRespiderWait>3600</> (default is to omit this tag)\n" "\n" //"# How long to wait to respider if there was an error.\n" //"# This is in DAYS. This tag overrides Spider Controls if present.\n" //"<errorRespiderWait>3600</> (default is to omit this tag)\n" //"\n" "# What is the minimum amount of time we should wait before re-spidering a URL?\n" "# Re-spider frequency is usually intelligently determined using a bisection\n" "# method based on the update frequency of the URL.\n" "# This is in seconds. default = 1 day = 24*60*60 = 86400.\n" "<minRespiderWait>86400</> (default 86400)\n" "\n" "# What is the maximum amount of time we should wait before re-spidering a URL?\n" "# Re-spider frequency is usually intelligently determined using a bisection\n" "# method based on the update frequency of the URL.\n" "# This is in seconds. default = 90 days = 90*24*60*60 = 7776000.\n" "<maxRespiderWait>7776000</> (default 7776000)\n" "\n" "# What spider frequency in days should this URL be assigned?\n" "# If this is -1 then the re-spider frequency is intelligently determined using \n" "# a bisection method based on the update frequency of the URL.\n" "# This is not yet supported.\n" "# <spiderFrequency>-1</>\n" "\n" "# What spider priority should this URL be assigned?\n" "# Use -1, the default, to leave unspecified. If not assigned by a matching\n" "# regular expression, it may be determined by the spider priority of the\n" "# page from which it was harvested as a link, minus one.\n" "# This is not yet supported.\n" "# <spiderPriority>-1</> (default -1)\n" "\n" "# What is the min/max spider priority the URL should be assigned.\n" "# Priorities range from 0 up to 7. 
"##############################################################################\n"
"#\n"
"# The Spider Section. The following parameters control how the URL is spidered.\n"
"# Spidering can be turned on/off as a whole or for various spider priority\n"
"# queues via the Spider Controls page. Many other parameters exist there as\n"
"# well.\n"
"#\n"
"##############################################################################\n"
"\n"
"# How long to wait to respider for the first time.\n"
"# This is in DAYS. This tag overrides Spider Controls if present.\n"
"<firstRespiderWait>3600</> (default is to omit this tag)\n"
"\n"
//"# How long to wait to respider if there was an error.\n"
//"# This is in DAYS. This tag overrides Spider Controls if present.\n"
//"<errorRespiderWait>3600</> (default is to omit this tag)\n"
//"\n"
"# What is the minimum amount of time we should wait before re-spidering a URL?\n"
"# Re-spider frequency is usually intelligently determined using a bisection\n"
"# method based on the update frequency of the URL.\n"
"# This is in seconds. default = 1 day = 24*60*60 = 86400.\n"
"<minRespiderWait>86400</> (default 86400)\n"
"\n"
"# What is the maximum amount of time we should wait before re-spidering a URL?\n"
"# Re-spider frequency is usually intelligently determined using a bisection\n"
"# method based on the update frequency of the URL.\n"
"# This is in seconds. default = 90 days = 90*24*60*60 = 7776000.\n"
"<maxRespiderWait>7776000</> (default 7776000)\n"
"\n"
"# What spider frequency in days should this URL be assigned?\n"
"# If this is -1 then the re-spider frequency is intelligently determined using\n"
"# a bisection method based on the update frequency of the URL.\n"
"# This is not yet supported.\n"
"# <spiderFrequency>-1</>\n"
"\n"
"# What spider priority should this URL be assigned?\n"
"# Use -1, the default, to leave unspecified. If not assigned by a matching\n"
"# regular expression, it may be determined by the spider priority of the\n"
"# page from which it was harvested as a link, minus one.\n"
"# This is not yet supported.\n"
"# <spiderPriority>-1</> (default -1)\n"
"\n"
"# What is the min/max spider priority the URL should be assigned?\n"
"# Priorities range from 0 up to 7. (See the <spiderPriority> tag above.)\n"
"# This is not yet supported.\n"
"#<spiderMinPriority>0</> (default 0)\n"
"#<spiderMaxPriority>5</> (default 7)\n"
"\n"
"# What spider priority should links harvested on the URL's page be assigned?\n"
"# Priorities range from 0 up to 7.\n"
"# -1, the default, means to use the spider priority of the URL minus one.\n"
"# This results in a breadth first spidering algorithm until the URL is\n"
"# from the priority 0 spider queue, in which case, the harvested links will\n"
"# also be assigned to the priority 0 queue.\n"
"<spiderLinkPriority>-1</> (default -1)\n"
"\n"
"# Should we spider links for the URL? If \"spider links\" is toggled off on the\n"
"# Spider Controls page then this will *not* override it.\n"
"<spiderLinks>yes</> (default yes)\n"
"\n"
"# Should we only harvest links from the same host as the URL?\n"
"# If the url is just a domain, then the www hostname is allowed as well.\n"
"# This overrides the same control on the Spider Controls page, so leave it\n"
"# out if you do not want to override that control. This is primarily used for\n"
"# good directory sites that have the power to unban soft banned sites, and\n"
"# such unbanned sites are then only permitted to harvest internal links.\n"
"#<spiderLinksFromSameHostOnly>no</> (default is to omit this tag)\n"
"\n"
"##############################################################################\n"
"#\n"
"# The Classification Section. How is the URL classified?\n"
"#\n"
"##############################################################################\n"
"\n"
"\n"
"# If the URL's quality is at or below this, then it will be checked for adult\n"
"# content.\n"
"<maxQualityForAdultDetect>0</> (default 0%%)\n"
"\n"
"# Do links from the URL point to clean pages?\n"
"<linksClean>no</> (default no)\n"
"\n"
"# Do links from the URL point to dirty (adult) pages?\n"
"<linksDirty>no</> (default no)\n"
"\n"
"# Is the URL adult-oriented?\n"
"<isAdult>no</> (default no)\n"
"\n"
"# Is the URL banned from the index? The default is no.\n"
"# If it is banned it will not be indexed. If it is already indexed then it\n"
"# will be removed from the index the next time it is respidered/reinjected.\n"
"<isBanned>no</> (default no)\n"
"\n"
"# Can the URL be unbanned? If the URL's <isBanned> tag is set to yes,\n"
"# and this tag is set to yes, then the URL is said to be \"soft banned\".\n"
"# If another URL links to the soft banned URL and that\n"
"# other URL is indexed with <linksUnbanned>yes</> in its ruleset then\n"
"# it will UNban the URL. This is useful for doing liberal banning but relying\n"
"# on a directory site like dmoz.org to unban URLs that should not have been\n"
"# banned.\n"
"<canBeUnbanned>no</> (default yes)\n"
"\n"
"# See above description for <canBeUnbanned> tag for how this works.\n"
"<linksUnbanned>no</> (default no)\n"
"\n"
"# Should we ban the DOMAINS of the links in the URL's content? The ban\n"
"# from the URL expires if the URL is removed from the index.\n"
"<linksBanned>no</> (default no)\n"
"\n"
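"# A hypothetical sketch of how soft banning fits together (the tag values\n"
"# here are examples only): a liberal \"banned\" ruleset might set\n"
"# <isBanned>yes</> and <canBeUnbanned>yes</>, while the ruleset used for a\n"
"# trusted directory site sets <linksUnbanned>yes</>. Any soft banned URL\n"
"# that the directory links to is then unbanned the next time it is indexed.\n"
"\n"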
"# What ruleset should those URLs that the URL links to use?\n"
"# Specify it by name. This is a useful way of assigning a URL to a ruleset.\n"
"# This is not yet supported.\n"
"# <rulesetOfLinks>special</>\n"
"\n"
"##############################################################################\n"
"#\n"
"# The Filter Section tells Gigablast what to allow into the index.\n"
"#\n"
"##############################################################################\n"
"\n"
"# If the URL's quality is LESS THAN this it will not be indexed. If the URL is\n"
"# being reindexed then it will be removed from the index.\n"
"<minQualityToIndex>0</> (default 0%%)\n"
"\n"
"# Allow URLs ending in .cgi or URLs containing ?'s into the index?\n"
"<allowCgiUrls>yes</> (default yes)\n"
"\n"
"# Allow URLs with no canonical domain name into the index?\n"
"<allowIpUrls>yes</> (default yes)\n"
"\n"
"# Delete 404'ed documents from the index?\n"
"# If you are making a historical index, you may want to set this to no.\n"
"<delete404s>yes</> (default yes)\n"
"\n"
"# Should the URL be indexed if it is adult-oriented?\n"
"<allowAdultContent>yes</> (default yes)\n"
"\n"
"# Index the URL even if it is a duplicate of another page from the same site?\n"
"# This overrides the \"deduping enabled\" switch in the Spider Controls,\n"
"# so omit this tag to rely solely on that Spider Controls switch.\n"
"<indexDupContent>no</> (default is to omit this tag)\n"
"\n"
"# Should the checksum hash be computed just from the indexed words? If this\n"
"# is true then pages from the same site will be detected as dups more\n"
"# often. Useful for newspaper articles where we only index the content of\n"
"# the article. Also, it is independent of the order of the words. This\n"
"# checksum is also used to see if the content of the page has changed in\n"
"# order to set the next respider date for intelligent respidering.\n"
"<useLooseChecksums>no</> (default is no)\n"
"\n"
"# Index document for sort or constrain by date. Almost doubles disk space.\n"
"<indexDate>yes</> (default yes)\n"
"\n"
"# If the url does not get indexed should we still keep it scheduled to be\n"
"# spidered again later in spiderdb? Handy for seed pages, like good\n"
"# directory pages that link to the stuff you want to index.\n"
"<keepUnindexedUrls>no</> (default no)\n"
"\n"
"# Only index documents that contain a dollar sign? Special case for a\n"
"# shopping index.\n"
"<needDollarSign>no</> (default no)\n"
//"\n"
//"# Does the url need to contain back-to-back digits in its path in order to\n"
//"# be indexed?\n"
//"<needNumbersInUrl>no</> (default no)\n"
"\n"
"# If the date on the page is older than this many days, do not index it.\n"
"# Omit this tag to default to the value in the Spider Controls page.\n"
"# 0.0 says to index all documents regardless of their extracted date.\n"
"# Good directory sites usually have this set to 0.0 for the news collection.\n"
"<daysBeforeNowToIndex>0.0</> (default is to omit this tag)\n"
"\n"
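"# For illustration only (a hypothetical combination of the tags above, not a\n"
"# recommendation): a historical-archive collection might keep 404'ed pages\n"
"# and unindexed seed URLs around by setting <delete404s>no</> and\n"
"# <keepUnindexedUrls>yes</>, while a shopping collection might set\n"
"# <needDollarSign>yes</> to index only pages that mention prices.\n"
"\n"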
"##############################################################################\n"
"#\n"
"# The Link Text Section. When a URL is indexed, Gigablast will determine what\n"
"# other URLs link to it and harvest the relevant link text from each of those\n"
"# URLs. That link text is then indexed as if it occurred on the URL's page\n"
"# itself, but it is not subject to spam detection. See the section on link\n"
"# text for more about how link text is indexed and what controls are\n"
"# available in the administrative interface.\n"
"#\n"
"##############################################################################\n"
"\n"
"# Should we index the URL's incoming link text as if it were on the page?\n"
"<indexIncomingLinkText>yes</> (default yes)\n"
"\n"
"# This maps the URL's quality to a weight on the score of its OUTGOING link\n"
"# text. The score of the terms in the link text is multiplied by this weight.\n"
"# If the URL links to nothing then this is useless. Currently we limit\n"
"# link text to up to 256 chars in LinkInfo.cpp.\n"
"<quality41> 0 </>\n"
"<quality42> 30 </>\n"
"<quality43> 50 </>\n"
"<quality44> 70 </>\n"
"<quality45> 85 </>\n"
"<linkTextScoreWeight41> 25 </>\n"
"<linkTextScoreWeight42> 200 </>\n"
"<linkTextScoreWeight43> 250 </>\n"
"<linkTextScoreWeight44> 275 </>\n"
"<linkTextScoreWeight45> 300 </>\n"
"\n"
"# This maps the number of words in the link text of a link to a boost on the\n"
"# score weight of that link text. The score of the terms in the link text is\n"
"# multiplied by this weight. Currently we limit link text to 256 chars in\n"
"# LinkInfo.cpp.\n"
"<linkTextNumWords61> 3 </>\n"
"<linkTextNumWords62> 6 </>\n"
"<linkTextNumWords63> 9 </>\n"
"<linkTextNumWords64> 12 </>\n"
"<linkTextScoreWeight61> 150 </>\n"
"<linkTextScoreWeight62> 80 </>\n"
"<linkTextScoreWeight63> 50 </>\n"
"<linkTextScoreWeight64> 25 </>\n"
"\n"
"# This maps the URL's quality to a maximum score for the terms in the link\n"
"# text. 100%% is the maximum 'maximum score'.\n"
"<quality51> 0 </>\n"
"<quality52> 15 </>\n"
"<quality53> 25 </>\n"
"<quality54> 45 </>\n"
"<quality55> 75 </>\n"
"<linkTextMaxScore51> 100 </>\n"
"<linkTextMaxScore52> 100 </>\n"
"<linkTextMaxScore53> 100 </>\n"
"<linkTextMaxScore54> 100 </>\n"
"<linkTextMaxScore55> 100 </>\n"
"\n"
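"# A worked illustration (assuming the two weights above compound\n"
"# multiplicatively, since each is described as being multiplied into the\n"
"# term score): 6-word link text from a quality-50 page gets a 250%% quality\n"
"# weight and an 80%% length weight, so a term's score is multiplied by\n"
"# 2.50 * 0.80 = 2.0. Link text of 4 words would interpolate the length\n"
"# weight to 150 + (4-3)/(6-3)*(80-150) = ~127%%.\n"
"\n"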
"##############################################################################\n"
"#\n"
"# The Indexing Section. What parts of the document should be indexed and how?\n"
"# IMPORTANT: Do not change this section if some documents in the index\n"
"# were indexed with this ruleset file. To do so might create some unrepairable\n"
"# data corruption.\n"
"#\n"
"##############################################################################\n"
"\n"
"# Should Gigablast index site:, subsite:, url:, suburl:, ip: or link: terms\n"
"# of the URL respectively?\n"
"<indexSite> yes</> (default yes) site: terms\n"
"<indexUrl> yes</> (default yes) url: terms\n"
"<indexSubUrl> yes</> (default yes) suburl: terms\n"
"<indexIp> yes</> (default yes) ip: terms\n"
"<indexLinks> yes</> (default yes) link:/href: terms\n"
"\n"
"# This is used only for news collections for doing automatic categorization.\n"
"<indexNewsTopic> yes</> (default no) newstopic: terms\n"
"\n"
"# This maps the URL's quality to a spam threshold, X. If more than X%% of\n"
"# the words in the document are spammed (repeated in a pattern) to some\n"
"# degree then all of the words will be indexed with a minimum score.\n"
"<quality61> 30 </>\n"
"<quality62> 40 </>\n"
"<quality63> 50 </>\n"
"<quality64> 70 </>\n"
"<quality65> 90 </>\n"
"<maxPercentSpammed1> 6 </>\n"
"<maxPercentSpammed2> 8 </>\n"
"<maxPercentSpammed3> 10 </>\n"
"<maxPercentSpammed4> 20 </>\n"
"<maxPercentSpammed5> 30 </>\n"
"\n"
"# Gigablast can index the various parts of a document differently. Each\n"
"# part of the document can have its own set of indexing and scoring rules.\n"
"# Each such part can be represented with an <index> tag. The index tags\n"
"# are processed in the order you give them in this ruleset file. Tags that\n"
"# are specialized for the <index> tag which contains them are highlighted\n"
"# in red.\n"
"\n"
"# The following <index> tag block tells Gigablast how to index the words\n"
"# in the HTML <title> tag. The words in the title tag are indexed before\n"
"# the words in the body because we don't want words in the body to count\n"
"# towards the <maxScore> limit placed on the words in the title.\n"
"<index>\n"
"\n"
" # The part of the document to which this <index> tag applies.\n"
" # This particular one says to index the terms in the <title>\n"
" # tag. This could just as easily be an <h1> tag or even a non-HTML\n"
" # tag like <foobar>. Omit this tag or leave the value of the tag blank\n"
" # to index the whole body of the document.\n"
" <name> title </>\n"
"\n"
" # Spam detection will be performed on these terms if the URL's quality is\n"
" # this or lower. It is mostly disabled for these title terms because they\n"
" # are restricted in score by other means below. Spam detection may lower the\n"
" # scores of repeated terms.\n"
" <maxQualityForSpamDetect> 0 </>\n"
"\n"
" # If the URL's quality is below this, then do not index the terms in the\n"
" # title tag.\n"
" <minQualityToIndex> 0 </>\n"
"\n"
" # If this is 'yes' then convert HTML entities in the title, like &gt;,\n"
" # into their represented characters before indexing.\n"
" <filterHtmlEntities> yes </>\n"
"\n"
" # Should each term in the title only be indexed if it has not already been\n"
" # indexed? You can affect this by changing the order of the <index> tags.\n"
" <indexIfUniqueOnly> no </>\n"
"\n"
" # Should single words in the title be indexed?\n"
" <indexSingletons> yes </>\n"
"\n"
" # Should phrases in the title be indexed?\n"
" <indexPhrases> yes </>\n"
"\n"
" # Should the whole title be indexed as one \"word\"?\n"
" <indexAsWhole> no </>\n"
"\n"
" # Should stop words be used when indexing phrases in the title?\n"
" <useStopWords> yes </>\n"
"\n"
" # Should we also index the stem of each word indexed? If\n"
" # <indexSingletons> is false this is ignored.\n"
" <useStems> no </>\n"
"\n"
" # Map the URL's quality to a maximum length (in characters) of the title.\n"
" # Words whose characters occur past the maximum length will not be\n"
" # indexed. Read more about quality or maps.\n"
" # This keeps the indexed portion of the title down to 200 characters for\n"
" # all qualities.\n"
" <quality11> 15 </>\n"
" <maxLen11> 200 </>\n"
"\n"
" # Map the URL's quality to a maximum score for indexing the terms in the\n"
" # title. 100%% is the maximum 'maximum score'. You cannot exceed 100%% ever.\n"
" <quality21> 15 </>\n"
" <quality22> 30 </>\n"
" <quality23> 45 </>\n"
" <quality24> 60 </>\n"
" <quality25> 80 </>\n"
" <maxScore21> 30 </>\n"
" <maxScore22> 45 </>\n"
" <maxScore23> 60 </>\n"
" <maxScore24> 80 </>\n"
" <maxScore25> 100 </>\n"
"\n"
" # Map the URL's quality to a percentage score boost for the terms in the\n"
" # title. This boost is multiplied by the score of each term indexed.\n"
" <quality31> 15 </>\n"
" <quality32> 30 </>\n"
" <quality33> 45 </>\n"
" <quality34> 60 </>\n"
" <quality35> 80 </>\n"
" <scoreWeight31> 60 </>\n"
" <scoreWeight32> 100 </>\n"
" <scoreWeight33> 150 </>\n"
" <scoreWeight34> 200 </>\n"
" <scoreWeight35> 250 </>\n"
"\n"
" # Map the URL's title length (in characters) to a percentage score boost for\n"
" # the terms in the title. This boost is multiplied by the score of each\n"
" # term indexed.\n"
" <len41> 10 </>\n"
" <len42> 50 </>\n"
" <len43> 100 </>\n"
" <len44> 200 </>\n"
" <len45> 500 </>\n"
" <scoreWeight41> 200 </>\n"
" <scoreWeight42> 150 </>\n"
" <scoreWeight43> 100 </>\n"
" <scoreWeight44> 75 </>\n"
" <scoreWeight45> 50 </>\n"
"\n"
" # Map the URL's title length (in characters) to a maximum score for the\n"
" # terms in the title. This maximum is expressed as a percentage of the\n"
" # maximum score physically possible.\n"
" <len51> 100 </>\n"
" <maxScore51> 30 </>\n"
"\n"
"</index>\n"
"\n"
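"# A minimal hypothetical <index> block (the values are invented for\n"
"# illustration, using only tags documented above): this would index single\n"
"# words and phrases found in <h1> tags, capping the indexed portion at 100\n"
"# characters for every quality.\n"
"# <index>\n"
"#  <name> h1 </>\n"
"#  <indexSingletons> yes </>\n"
"#  <indexPhrases> yes </>\n"
"#  <quality11> 15 </>\n"
"#  <maxLen11> 100 </>\n"
"# </index>\n"
"\n"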
"# The following <index> block tells Gigablast how to index the body.\n"
"# This will index words in the title tag, too, because that is considered\n"
"# part of the body. The body is essentially everything not in a meta tag,\n"
"# comment or javascript tag.\n"
"<index>\n"
"\n"
" # Should gigablast break the document into sections and score the\n"
" # words in sections with mostly link text lower than words in sections\n"
" # without much link text? This helps to reduce the effects of menu spam.\n"
" # Used for news articles.\n"
" # This only applies to the body of the document.\n"
" <scoreBySection> no </> (default is yes)\n"
"\n"
" # Should gigablast attempt to isolate just the single most-relevant\n"
" # content section from the document and not index anything else?\n"
" # Used for news articles.\n"
" # This only applies to the body of the document.\n"
" <indexContentSectionOnly> no </> (default is no)\n"
"\n"
" # The minimum score an entire section of the document needs to have its\n"
" # words indexed. Each word in a section counts as 128 points, but a\n"
" # word in a hyperlink counts as -256 points.\n"
" # Used for news articles.\n"
" # This only applies to the body of the document.\n"
" <minSectionScore> -1000000000 </> (default is -1000000000)\n"
"\n"
" # For this metric, count words in links as 21 points and words not in links\n"
" # as 128. The average score of each word is its score plus the scores of\n"
" # its 8 left and its 7 right neighbors, divided by 16. If that\n"
" # average score is below this value, the word is not indexed and its\n"
" # average score is set to 0. Only valid if scoreBySection is true.\n"
" <minAvgWordScore> 0 </> (default is 0)\n"
"\n"
" # If the number of indexable words that have a positive average score\n"
" # is below this value, then no words will be indexed. Used\n"
" # to just index beefy news articles. -1 means to ignore this constraint.\n"
" <minIndexableWords> -1 </> (default is -1)\n"
"\n"
" # Weight the first X words higher.\n"
" # Used for news articles.\n"
" # This only applies to the body of the document.\n"
" <numTopWords> 0 </> (default is 0)\n"
"\n"
" # Weight the first X words by this much, a rational number.\n"
" # Used for news articles.\n"
" # This only applies to the body of the document.\n"
" <topWordsWeight> 1.0 </> (default is 1.0)\n"
"\n"
" # Weight the first sentence by this much, a rational number.\n"
" # Only applies to documents that support western punctuation.\n"
" # Used for news articles.\n"
" # This only applies to the body of the document.\n"
" <topSentenceWeight> 1.0 </> (default is 1.0)\n"
"\n"
" # Do not weight more than this many words in the first sentence.\n"
" # Used for news articles.\n"
" # This only applies to the body of the document.\n"
" <maxWordsInSentence> 0 </> (default is 0)\n"
"\n"
" # For the body, we turn spam detection on for all URLs, regardless of\n"
" # their quality. This will demote the scores of terms that are repetitious.\n"
" <maxQualityForSpamDetect> 100 </>\n"
"\n"
" # These are all the same as the <index> tag above this one.\n"
" <minQualityToIndex> 0 </>\n"
" <filterHtmlEntities> yes </>\n"
" <indexIfUniqueOnly> no </>\n"
" <indexSingletons> yes </>\n"
" <indexPhrases> yes </>\n"
" <indexAsWhole> no </>\n"
" <useStopWords> yes </>\n"
" <useStems> no </>\n"
"\n"
" # Map the URL's quality to a maximum length (in characters) of the body.\n"
" # This length does not include tags. Some tags, like <br>, are\n"
" # converted into \\n\\n, but most are not. Words whose characters occur\n"
" # past the maximum length will not be indexed. Read more about quality or\n"
" # maps.\n"
//" # You will still be limited by the \"#define MAX_WORDS 10000\"\n"
//" # statement, but this is slated to disappear soon.\n"
" <quality11> 15 </>\n"
" <quality12> 30 </>\n"
" <quality13> 45 </>\n"
" <quality14> 60 </>\n"
" <quality15> 80 </>\n"
" <maxLen11> 80000 </>\n"
" <maxLen12> 100000 </>\n"
" <maxLen13> 100000 </>\n"
" <maxLen14> 100000 </>\n"
" <maxLen15> 100000 </>\n"
"\n"
" # Map the URL's quality to a maximum score for indexing the terms in the\n"
" # body. 100%% is the maximum 'maximum score'. You cannot exceed 100%% ever.\n"
" <quality21> 15 </>\n"
" <quality22> 30 </>\n"
" <quality23> 45 </>\n"
" <quality24> 60 </>\n"
" <quality25> 80 </>\n"
" <maxScore21> 30 </>\n"
" <maxScore22> 45 </>\n"
" <maxScore23> 60 </>\n"
" <maxScore24> 80 </>\n"
" <maxScore25> 100 </>\n"
"\n"
" # Map the URL's quality to a percentage score boost for the terms in the\n"
" # body. This boost is multiplied by the score of each term indexed.\n"
" <quality31> 15 </>\n"
" <quality32> 30 </>\n"
" <quality33> 45 </>\n"
" <quality34> 60 </>\n"
" <quality35> 80 </>\n"
" <scoreWeight31> 60 </>\n"
" <scoreWeight32> 100 </>\n"
" <scoreWeight33> 150 </>\n"
" <scoreWeight34> 200 </>\n"
" <scoreWeight35> 250 </>\n"
"\n"
" # Map the length of the body (in characters) to a percentage score boost for\n"
" # the terms in the body. This boost is multiplied by the score of each term\n"
" # indexed. This length does not include tags. Some tags, like <br>, are\n"
" # converted into \\n\\n, but most are not.\n"
" # This is now obsolete for newer documents. Please use the numWords map\n"
" # immediately following. It supports unicode better, too.\n"
" #<len41> 100 </>\n"
" #<len42> 500 </>\n"
" #<len43> 1000 </>\n"
" #<len44> 2000 </>\n"
" #<len45> 5000 </>\n"
" #<len46> 10000 </>\n"
" #<len47> 20000 </>\n"
" #<len48> 50000 </>\n"
" #<scoreWeight41> 300 </>\n"
" #<scoreWeight42> 250 </>\n"
" #<scoreWeight43> 200 </>\n"
" #<scoreWeight44> 150 </>\n"
" #<scoreWeight45> 100 </>\n"
" #<scoreWeight46> 80 </>\n"
" #<scoreWeight47> 60 </>\n"
" #<scoreWeight48> 40 </>\n"
"\n"
" # Map the number of words to a percentage score boost for the terms in\n"
" # the body. This boost is multiplied by the score of each term\n"
" # indexed.\n"
" <numWords41> 20 </>\n"
" <numWords42> 100 </>\n"
" <numWords43> 200 </>\n"
" <numWords44> 400 </>\n"
" <numWords45> 1000 </>\n"
" <numWords46> 2000 </>\n"
" <numWords47> 4000 </>\n"
" <numWords48> 10000 </>\n"
" <scoreWeight41> 300 </>\n"
" <scoreWeight42> 250 </>\n"
" <scoreWeight43> 200 </>\n"
" <scoreWeight44> 150 </>\n"
" <scoreWeight45> 100 </>\n"
" <scoreWeight46> 80 </>\n"
" <scoreWeight47> 60 </>\n"
" <scoreWeight48> 40 </>\n"
"\n"
" # Map the length of the body (in characters) to a maximum score for the\n"
" # terms in the body. 100%% is the maximum 'maximum score'.\n"
" <len51> 100 </>\n"
" <len52> 500 </>\n"
" <len53> 1000 </>\n"
" <len54> 2000 </>\n"
" <len55> 5000 </>\n"
" <maxScore51> 30 </>\n"
" <maxScore52> 45 </>\n"
" <maxScore53> 60 </>\n"
" <maxScore54> 80 </>\n"
" <maxScore55> 100 </>\n"
"\n"
"</index>\n"
"\n"
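"# A worked illustration of the word-score averaging described in the body\n"
"# block above (numbers hypothetical): a non-link word scores 128 points, and\n"
"# if its 15 neighbors are all link words at 21 points each, its average is\n"
"# (128 + 15*21)/16 = ~27.7. With <scoreBySection> enabled and\n"
"# <minAvgWordScore> set above 27, that word would not be indexed, which is\n"
"# how menu-heavy regions get suppressed.\n"
"\n"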
"# This one is similar to the above, but we're indexing \"title:\" terms.\n"
"# The major differences are in red.\n"
"<index>\n"
" <name> title </>\n"
" <prefix> title </> # prepend a \"title:\" to the term before indexing\n"
" <maxQualityForSpamDetect> 0 </>\n"
" <minQualityToIndex> 0 </>\n"
" <filterHtmlEntities> yes </>\n"
"\n"
" # This tells Gigablast not to index a word or phrase if it has already been\n"
" # indexed. This means that repeating terms in the title will have no effect.\n"
" <indexIfUniqueOnly> yes </>\n"
"\n"
" <indexSingletons> yes </>\n"
" <indexPhrases> yes </>\n"
" <indexAsWhole> no </>\n"
" <useStopWords> yes </>\n"
" <useStems> no </>\n"
"\n"
" # Map URL's quality to a maximum length for this field.\n"
" <quality11> 15 </>\n"
" <quality12> 30 </>\n"
" <quality13> 45 </>\n"
" <quality14> 60 </>\n"
" <quality15> 80 </>\n"
" <maxLen11> 80000 </>\n"
" <maxLen12> 100000 </>\n"
" <maxLen13> 150000 </>\n"
" <maxLen14> 200000 </>\n"
" <maxLen15> 250000 </>\n"
"\n"
" # Map URL's quality to a maximum score for terms in this field.\n"
" <quality21> 15 </>\n"
" <quality22> 30 </>\n"
" <quality23> 45 </>\n"
" <quality24> 60 </>\n"
" <quality25> 80 </>\n"
" <maxScore21> 30 </>\n"
" <maxScore22> 45 </>\n"
" <maxScore23> 60 </>\n"
" <maxScore24> 80 </>\n"
" <maxScore25> 100 </>\n"
"\n"
" # Map URL's quality to a percentage score boost for terms in this field.\n"
" <quality31> 15 </>\n"
" <quality32> 30 </>\n"
" <quality33> 45 </>\n"
" <quality34> 60 </>\n"
" <quality35> 80 </>\n"
" <scoreWeight31> 60 </>\n"
" <scoreWeight32> 100 </>\n"
" <scoreWeight33> 150 </>\n"
" <scoreWeight34> 200 </>\n"
" <scoreWeight35> 250 </>\n"
"\n"
" # Map the field's length to a percentage score boost for terms in this\n"
" # field.\n"
" <len41> 100 </>\n"
" <len42> 500 </>\n"
" <len43> 1000 </>\n"
" <len44> 2000 </>\n"
" <len45> 5000 </>\n"
" <scoreWeight41> 300 </>\n"
" <scoreWeight42> 200 </>\n"
" <scoreWeight43> 150 </>\n"
" <scoreWeight44> 100 </>\n"
" <scoreWeight45> 50 </>\n"
"\n"
" # Map the field's length to a maximum score for terms in this field.\n"
" <len51> 100 </>\n"
" <len52> 500 </>\n"
" <len53> 1000 </>\n"
" <len54> 2000 </>\n"
" <len55> 5000 </>\n"
" <maxScore51> 30 </>\n"
" <maxScore52> 45 </>\n"
" <maxScore53> 60 </>\n"
" <maxScore54> 80 </>\n"
" <maxScore55> 100 </>\n"
"\n"
"</index>\n"
"\n"
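"# A hypothetical variation on the <prefix> pattern above (the tag values are\n"
"# invented for illustration): the same mechanism could field-index an author\n"
"# meta tag so queries like author:smith match it.\n"
"# <index>\n"
"#  <name> meta.author </>\n"
"#  <prefix> author </>\n"
"#  <indexIfUniqueOnly> yes </>\n"
"# </index>\n"
"\n"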
"# Now this one is for all the standard, supported meta tags.\n"
"# Terms in these tags have not been indexed yet, but we do that here.\n"
"<index>\n"
"\n"
" # Gigablast allows multiple fields/parts to be specified for indexing\n"
" # under the same parameters. In this case, we treat the meta summary,\n"
" # meta description and meta keywords tags all equally.\n"
" <name> meta.summary </>\n"
" <name> meta.description </>\n"
" <name> meta.keywords </>\n"
"\n"
" <maxQualityForSpamDetect> 0 </>\n"
" <minQualityToIndex> 0 </>\n"
" <filterHtmlEntities> yes </>\n"
"\n"
" # This tells Gigablast not to index a word or phrase if it has already been\n"
" # indexed. This means that repeating terms in these meta tags will have no\n"
" # effect.\n"
" <indexIfUniqueOnly> yes </>\n"
"\n"
" <indexSingletons> yes </>\n"
" <indexPhrases> yes </>\n"
" <indexAsWhole> no </>\n"
" <useStopWords> yes </>\n"
" <useStems> no </>\n"
"\n"
" # Map URL's quality to a maximum length for this field.\n"
" <quality11> 15 </>\n"
" <maxLen11> 200 </>\n"
"\n"
" # Map URL's quality to a maximum score for terms in this field.\n"
" <quality21> 15 </>\n"
" <maxScore21> 100 </>\n"
"\n"
" # Map URL's quality to a percentage score boost for terms in this field.\n"
" <quality31> 15 </>\n"
" <scoreWeight31> 100 </>\n"
"\n"
" # Map the field's length to a percentage score boost for terms in this\n"
" # field.\n"
" <len41> 100 </>\n"
" <scoreWeight41> 100 </>\n"
"\n"
" # Map the field's length to a maximum score for terms in this field.\n"
" <len51> 100 </>\n"
" <maxScore51> 100 </>\n"
"\n"
"</index>\n"
"\n"
*/
/*
" \n"
"\n"
"This simple script is used to start up all the gb hosts (processes) native to a particular computer. It also redirects the gb program's standard error to a log file. Notice that the gb executable takes the config filename as the argument to its -c option."
" \n"
" \n"
"#!/bin/bash\n"
"# move the old log file\n"
"mv /workdir/loga /workdir/loga-`date '+%%Y_%%m_%%d-%%H:%%M:%%S'`.log\n"
"# start up gb\n"
"/workdir/gb -c /workdir/hosts.conf >& /workdir/loga &\n"
"\n"
"\n"
" \n"
*/
" \n"
"\n"
" \n"
" \n"
"at be by of on\n"
"or do he if is\n"
"it in me my re\n"
"so to us vs we\n"
"the and are can did\n"
"per for had has her\n"
"him its not our she\n"
"you also been from have\n"
"here hers ours that them\n"
"then they this were will\n"
"with your about above ain\n"
"could isn their there these\n"
"those would yours theirs aren\n"
"hadn didn hasn ll ve\n"
"should shouldn\n"
"\n"
" \n"
" \n"
"\n"
"\n"
" \n"
"Certain punctuation breaks up a phrase. All single-character punctuation marks can be phrased across, with the exception of the following:\n"
"
\n" "The following 2 character punctuation sequences break phrases:\n" "
\n" "\n" "All 3 character sequences of punctuation break phrases with the following exceptions:\n" "
\n" "\n" "All sequences of punctuation greater than 3 characters break phrases with the sole exception being a sequence of strictly whitespaces.\n" "\n" " \n" " \n" " |