open-source-search-engine/XmlDoc.h

// Matt Wells, copyright Apr 2009

// . 2. you can also call setTitleRec() and then call getMetaList()
// . this class is used by Repair.cpp and by Msg7 (inject) and SpiderLoop.cpp
// . Msg7 and Repair.cpp and injections can also set more than just
//   m_firstUrl, like m_content, etc. or whatever elements are known, but
//   they must also set the corresponding "valid" flags of those elements
// . both methods must yield exactly the same result, the same "meta list"
// . after setting the contained classes XmlDoc::setMetaList() makes the list
//   of rdb records to be added to all the rdbs, this is the "meta list"
// . the meta list is made by hashing all the termIds/scores into some hash
//   tables in order to accumulate scores, then the hash tables are serialized
//   into the "meta list"
// . the meta list is added to all rdbs with a simple call to
//   Msg4::addMetaList(), which is only called by Msg14 or Repair.cpp for now


#ifndef _XMLDOC_H_
#define _XMLDOC_H_

//#include "HashTableX.h"
#include "Lang.h"
#include "Words.h"
#include "Bits.h"
#include "Pos.h"
#include "Phrases.h"
//#include "Synonyms.h"
//#include "Weights.h"
#include "Xml.h"
#include "LangList.h"
#include "SafeBuf.h"
#include "Images.h"
#include "Sections.h"
#include "Msge0.h"
#include "Msge1.h"
//#include "Msge2.h"
#include "Msg4.h"
#include "Msg8b.h"

#include "SearchInput.h"
#include "Msg40.h"
#include "Dates.h"
//#include "IndexList.h"
#include "Msg0.h"
#include "Msg22.h"
#include "Tagdb.h"
#include "Url.h"
#include "Linkdb.h"
//#include "LinkInfo.h"
//#include "Msg25.h"
#include "MsgC.h"
#include "Msg13.h"
#include "RdbList.h"
#include "SiteGetter.h"
//#include "CollectionRec.h"
#include "Msg20.h"
#include "Matches.h"
#include "Query.h"
#include "Title.h"
#include "Summary.h"
#include "Msg8b.h"
#include "Address.h"
#include "zlib.h" // Z_OK
#include "Spider.h" // SpiderRequest/SpiderReply definitions
#include "HttpMime.h" // ET_DEFLATE
#include "Msg1.h"
#include "PingServer.h"
//#include "PageCrawlBot.h" // DBA_NONE

//#define XMLDOC_MAX_AD_IDS 4
//#define XMLDOC_ADLEN      64

#define MAXFRAGWORDS 80000

#define MAX_WIKI_DOCIDS 20

#define MAX_TAG_PAIR_HASHES 100

#include "Msg40.h"
//#define SAMPLE_VECTOR_SIZE (32*4)

#define POST_VECTOR_SIZE   (32*4)

#define XD_GQ_MAX_SIZE        1000
#define XD_MAX_GIGABIT_HASHES 48

#define XD_MAX_AD_IDS         5

double getTrafficPercent ( long rank ) ;

bool setLangVec ( class Words *words ,
		  class SafeBuf *langBuf ,
		  class Sections *sections ,
		  long niceness ) ;

char *getJSONFieldValue ( char *json, char *field , long *valueLen ) ;

bool logQueryLogs ( );

bool checkRegex ( SafeBuf *regex ,
		  char    *target ,
		  bool    *boolVal ,
		  bool    *boolValValid ,
		  long    *compileError ,
		  CollectionRec *cr ) ;

// Address.cpp calls this to make a vector from the "place name" for comparing
// to other places in placedb using the computeSimilarity() function. if
// we got a >75% similarity we set the AF_VERIFIED_PLACE_NAME bit in the
// Address::m_flags for that address on the web page.
long makeSimpleWordVector ( char *s, long *vbuf, long vbufSize, long niceness);

// this is used for making the event summary/title vectors as well as in
// Msg40.cpp where it merges events and does not want to repetitively display
// the same summary lines for an event
bool getWordVector ( char *s ,
		     HashTableX *ht ,
		     uint32_t *d ,
		     long *nd ,
		     long ndmax ) ;

bool getDensityRanks ( long long *wids ,
		       long nw,
		       //long wordStart ,
		       //long wordEnd ,
		       long hashGroup ,
		       SafeBuf *densBuf ,
		       Sections *sections ,
		       long niceness );

// diversity vector
bool getDiversityVec ( class Words *words ,
		       class Phrases *phrases ,
		       class HashTableX *countTable ,
		       class SafeBuf *sbWordVec ,
		       //class SafeBuf *sbPhraseVec ,
		       long niceness );

float computeSimilarity ( long   *vec0 ,
			  long   *vec1 ,
			  // corresponding scores vectors
			  long   *s0   ,
			  long   *s1   ,
			  class Query  *q    ,
			  long  niceness ,
			  // only Sections::addDateBasedImpliedSections()
			  // sets this to true right now. if set to true
			  // we essentially dedup each vector, although
			  // the score is compounded into the remaining
			  // occurrence. i'm not sure if that is the right
			  // behavior though.
			  bool dedupVecs = false );

bool isSimilar_sorted ( long   *vec0 ,
			long   *vec1 ,
			long nv0 , // how many longs in vec?
			long nv1 , // how many longs in vec?
			// they must be this similar or more to return true
			long percentSimilar,
			long    niceness ) ;

// this is called by Msg40.cpp to set "top"
long intersectGigabits ( Msg20       **mp          ,   // search results
			 long          nmp         ,
			 uint8_t       langId      ,   // searcher's langId
			 long          maxTop      ,
			 long          docsToScan  ,
			 long          minDocCount , // must be in this # docs
			 class GigabitInfo  *top   ,
			 long          niceness    ) ;

long getDirtyPoints ( char *s , long len , long niceness , char *logUrl ) ;

bool storeTerm ( char             *s        ,
                 long              slen     ,
                 long long         termId   ,
                 class HashInfo   *hi       ,
                 long              wordNum  ,
		 long              wordPos  ,
		 char densityRank   ,
		 char diversityRank ,
		 char wordSpamRank  ,
		 char hashGroup ,
		 //bool              isPhrase ,
                 class SafeBuf    *wbuf     ,
                 class HashTableX *wts      ,
		 char              synSrc   ,
		 char              langId   ) ;

// tell zlib to use our malloc/free functions
int gbuncompress ( unsigned char *dest      ,
		   unsigned long *destLen   ,
		   unsigned char *source    ,
		   unsigned long  sourceLen );

int gbcompress   ( unsigned char *dest      ,
		   unsigned long *destLen   ,
		   unsigned char *source    ,
		   unsigned long  sourceLen ,
		   long encoding = ET_DEFLATE);

int gbcompress7  ( unsigned char *dest      ,
		   unsigned long *destLen   ,
		   unsigned char *source    ,
		   unsigned long  sourceLen ,
		   bool compress = true );

int gbuncompress7  ( unsigned char *dest      ,
		     unsigned long *destLen   ,
		     unsigned char *source    ,
		     unsigned long  sourceLen ) ;


uint32_t score8to32 ( uint8_t score8 );

// for Msg13.cpp
char getContentTypeFromContent ( char *p , long niceness ) ;

// . for Msg13.cpp
// . *pend must equal \0
long getContentHash32Fast ( unsigned char *p ,
			    long plen ,
			    long niceness ) ;

uint16_t getCharsetFast ( class HttpMime *mime,
			  char *url ,
			  char *s ,
			  long slen ,
			  long niceness );

//#define MAX_CONTACT_OUTLINKS 5

#define MAX_CONTACT_ADDRESSES 20
#define EMAILBUFSIZE 512

#define ROOT_TITLE_BUF_MAX 512

// store the subsentences in an array now
// . plain record describing one sub-sentence
// . it holds data members only and declares no constructor here, so nothing
//   in this class initializes the members; whoever fills the array must
//   set every field it later reads
class SubSent {
public:
	// NOTE(review): presumably bit flags describing this sub-sentence;
	// sentflags_t is declared elsewhere -- confirm
	sentflags_t m_subSentFlags;
	//esflags_t   m_esflags;
	// NOTE(review): presumably the start/end word numbers of the
	// sub-sentence -- confirm units and inclusiveness with the code that
	// fills these in
	long        m_senta;
	long        m_sentb;
	// NOTE(review): meaning is not visible from this header -- confirm
	long        m_subEnding;
	// NOTE(review): presumably a score of how title-like this
	// sub-sentence is -- confirm range with the code that sets it
	float       m_titleScore;
};

#define MAX_XML_DOCS 4

class XmlDoc {

 public:

	// . variable size rdb records all start with key then dataSize
	// . do not do that here since we compress our record's data!!
	//key_t m_titleRecKey;
	//long  m_dataSize;

	//
	// BEGIN WHAT IS STORED IN THE TITLE REC (Titledb.h)
	//


	// headerSize = this->ptr_firstUrl - this->m_headerSize
	uint16_t  m_headerSize;
	uint16_t  m_version;
	// these flags are used to indicate which ptr_ members are present:
	uint32_t  m_internalFlags1;
	long      m_ip;
	long      m_crawlDelay;
	// . use this to quickly detect if doc is unchanged
	// . we can avoid setting Xml and Words classes etc...
	long      m_contentHash32;
	// like the above but hash of all tags in TagRec for this url
	long      m_tagHash32;
	long      m_siteNumInlinks;
	long      m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
	long      m_siteNumInlinksUniqueCBlock; // m_sitePop;
	time_t    m_spideredTime;
	time_t    m_minPubDate;
	time_t    m_maxPubDate;
	time_t    m_pubDate;    // aka m_datedbDate
	//time_t  m_nextSpiderTime;
	time_t    m_firstIndexedDate;
	time_t    m_outlinksAddedDate;
	uint16_t  m_charset; // the ORIGINAL charset, we are always utf8!
	uint16_t  m_countryId;
	//uint16_t  m_reserved1;//titleWeight;
	//uint16_t  m_reserved2;//headerWeight;
	long      m_siteNumInlinksTotal;
	//uint16_t  m_reserved3;//urlPathWeight;
	uint8_t   m_metaListCheckSum8; // bring it back!!
	char      m_reserved3b;
	uint16_t  m_reserved4;//externalLinkTextWeight;
	uint16_t  m_reserved5;//internalLinkTextWeight;

	// a new parm from reserved6. need to know the count so we can
	// delete the json objects derived from this page if we want to
	// delete this page. or if this page is respidered then we get the
	// json objects for it, REject the old json object urls, and inject
	// the new ones i guess.
	uint16_t  m_diffbotJSONCount;

	// these do not include header/footer (dup) addresses
	//int16_t   m_numAddresses;
	int16_t   m_httpStatus; // -1 if not found (empty http reply)

	//int8_t  m_nextSpiderPriority;
	int8_t    m_hopCount;
	//int8_t  m_metalistChecksum; // parser checksum
	//uint8_t m_numBannedOutlinks8;
	uint8_t   m_langId;
	uint8_t   m_rootLangId;
	uint8_t   m_contentType;


	// bit flags
	uint16_t  m_isRSS:1;
	uint16_t  m_isPermalink:1;
	uint16_t  m_isAdult:1;
	uint16_t  m_wasInjected:1;//eliminateMenus:1;
	uint16_t  m_spiderLinks:1;
	uint16_t  m_isContentTruncated:1;
	uint16_t  m_isLinkSpam:1;
	uint16_t  m_hasAddress:1;
	uint16_t  m_hasTOD:1;
	uint16_t  m_hasSiteVenue:1;
	uint16_t  m_hasContactInfo:1;
	uint16_t  m_isSiteRoot:1;

	uint16_t  m_isDiffbotJSONObject:1;
	uint16_t  m_sentToDiffbot:1;
	uint16_t  m_gotDiffbotSuccessfulReply:1;
	uint16_t  m_reserved804:1;
	uint16_t  m_reserved805:1;
	uint16_t  m_reserved806:1;
	uint16_t  m_reserved807:1;
	uint16_t  m_reserved808:1;
	uint16_t  m_reserved809:1;
	uint16_t  m_reserved810:1;
	uint16_t  m_reserved811:1;
	uint16_t  m_reserved812:1;
	uint16_t  m_reserved813:1;
	uint16_t  m_reserved814:1;
	uint16_t  m_reserved815:1;
	uint16_t  m_reserved816:1;


	char      *ptr_firstUrl;
	char      *ptr_redirUrl;
	//char    *ptr_tagRecData;
	char      *ptr_rootTitleBuf;
	long      *ptr_gigabitHashes;
	long      *ptr_gigabitScores;
	long long *ptr_adVector;
	long long *ptr_wikiDocIds;
	rscore_t  *ptr_wikiScores;
	char      *ptr_imageData;
	long      *ptr_catIds;
	long      *ptr_indCatIds;
	char      *ptr_dmozTitles;
	char      *ptr_dmozSumms;
	char      *ptr_dmozAnchors;
	char      *ptr_utf8Content;
	//char    *ptr_sectionsReply; // votes read from sectiondb - m_osvt
	//char    *ptr_sectionsVotes; // our local votes - m_nsvt
	//char    *ptr_addressReply;
	char      *ptr_clockCandidatesData;
	// . serialization of the sectiondb and placedb lists
	// . that way we can store just these and not have to store the content
	//   of the entire page if we do not need to
	//char    *ptr_sectiondbData;
	//char    *ptr_placedbData;
	// do not let SiteGetter change this when we re-parse!
	char      *ptr_site;
	LinkInfo  *ptr_linkInfo1;
	char      *ptr_linkdbData;
	char      *ptr_sectiondbData;
	char      *ptr_tagRecData;
	LinkInfo  *ptr_linkInfo2;

	long       size_firstUrl;
	long       size_redirUrl;
	//long     size_tagRecData;
	long       size_rootTitleBuf;
	long       size_gigabitHashes;
	long       size_gigabitScores;
	long       size_adVector;
	long       size_wikiDocIds;
	long       size_wikiScores;
	long       size_imageData;
	long       size_catIds;
	long       size_indCatIds;
	long       size_dmozTitles;
	long       size_dmozSumms;
	long       size_dmozAnchors;
	long       size_utf8Content;
	//long     size_sectionsReply;
	//long     size_sectionsVotes;
	//long     size_addressReply;
	long       size_clockCandidatesData;
	//long     size_sectiondbData;
	//long     size_placedbData;
	long       size_site;
	long       size_linkInfo1;
	long       size_linkdbData;
	long       size_sectiondbData;
	long       size_tagRecData;
	long       size_linkInfo2;

	char      m_dummyEnd;

	//
	// END WHAT IS STORED IN THE TITLE REC (Titledb.h)
	//

 public:

	// . returns false and sets errno on error
	// . once you call this you can call setMetaList() below
	// . sets all the contained parser classes, Words, Xml, etc. if they
	//   have not already been set! that way Msg16/Msg14 can set bits
	//   and pieces here and there and we do not reset what it's done
	// . our m_xml will contain ptrs into titleRec's content, be careful
	// . if titleRec gets freed we should be freed too
	//bool set ( char           *titleRec                    ,
	//	   class SafeBuf  *pbuf         = NULL         ,
	//	   long            niceness     = MAX_NICENESS ,
	//	   bool            justSetLinks = false        );

	// . used by Msg16 to set the Xml to get meta redirect tag's content
	// . used by Msg16 to get <META NAME="ROBOTS" CONTENT="index,follow">
	// . this should be set by Msg16 so it can get meta redirect url


	void print   ( );

	bool set1 ( char *url ,
		    char *coll,
		    SafeBuf *pbuf ,
		    long niceness );


	bool set2 ( char *titleRec,
		    long maxSize,
		    char *coll,
		    class SafeBuf *p,
		    long niceness ,
		    class SpiderRequest *sreq = NULL );

	// . since being set from a docId, we will load the old title rec
	//   and use that!
	// . used by PageGet.cpp
	bool set3 ( long long  docId       ,
		    char      *coll        ,
		    long       niceness    );

	bool set4 ( class SpiderRequest *sreq  ,
		    key_t           *doledbKey ,
		    char            *coll      ,
		    class SafeBuf   *pbuf      ,
		    long             niceness  ,
		    char            *utf8Content = NULL ,
		    bool             deleteFromIndex = false ,
		    long             forcedIp = 0 ,
		    uint8_t          contentType = CT_HTML ,
		    time_t           spideredTime = 0 ,
		    bool             contentHasMime = false ) ;

	// we now call this right away rather than at download time!
	long getSpideredTime();

	// another entry point, like set3() kinda
	bool loadFromOldTitleRec ();

	XmlDoc() ;
	~XmlDoc() ;
	void nukeDoc ( class XmlDoc *);
	void reset ( ) ;
	bool setFirstUrl ( char *u , bool addWWW , Url *base = NULL ) ;
	bool setRedirUrl ( char *u , bool addWWW ) ;
	void setStatus ( char *s ) ;
	void setCallback ( void *state, void (*callback) (void *state) ) ;
	void setCallback ( void *state, bool (*callback) (void *state) ) ;
	bool addToSpiderdb ( ) ;
	bool indexDoc ( );
	bool indexDoc2 ( );
	key_t *getTitleRecKey() ;
	//char *getSkipIndexing ( );
	char *prepareToMakeTitleRec ( ) ;
	char **getTitleRec ( ) ;
	char *getIsAdult ( ) ;
	long **getIndCatIds ( ) ;
	long **getCatIds ( ) ;
	class CatRec *getCatRec ( ) ;

	long *getNumDmozEntries() ;
	char **getDmozTitles ( ) ;
	char **getDmozSummaries ( ) ;
	char **getDmozAnchors ( ) ;
	bool setDmozInfo () ;

	long long **getWikiDocIds ( ) ;
	void gotWikiResults ( class UdpSlot *slot );
	long *getPubDate ( ) ;
	//class DateParse2 *getDateParse2 ( ) ;
	class Dates *getSimpleDates();
	class Dates *getDates();
	class HashTableX *getClockCandidatesTable();
	long getUrlPubDate ( ) ;
	long getOutlinkAge ( long outlinkNum ) ;
	char *getIsPermalink ( ) ;
	char *getIsUrlPermalinkFormat ( ) ;
	char *getIsRSS ( ) ;
	class Xml *getXml ( ) ;
	uint8_t *getLangVector ( ) ;
	uint8_t *getLangId ( ) ;
	char computeLangId ( Sections *sections ,Words *words , char *lv ) ;
	class Words *getWords ( ) ;
	class Bits *getBits ( ) ;
	class Bits *getBitsForSummary ( ) ;
	class Pos *getPos ( );
	class Phrases *getPhrases ( ) ;
	//class Synonyms *getSynonyms ( );
	class Sections *getExplicitSections ( ) ;
	class Sections *getImpliedSections ( ) ;
	class Sections *getSections ( ) ;
	class Sections *getSectionsWithDupStats ( );
	bool gotSectionStats( class Msg3a *msg3a );
	class SectionStats *getSectionStats ( long long secHash64 );
	class SectionVotingTable *getOldSectionVotingTable();
	class SectionVotingTable *getNewSectionVotingTable();
	char **getSectionsReply ( ) ;
	char **getSectionsVotes ( ) ;
	HashTableX *getSectionVotingTable();
	long *getLinkSiteHashes ( );
	class Links *getLinks ( bool doQuickSet = false ) ;
	class HashTableX *getCountTable ( ) ;
	bool hashString_ct ( class HashTableX *ht, char *s , long slen ) ;
	uint8_t *getSummaryLangId ( ) ;
	long     *getTagPairHashVector ( ) ;
	uint32_t *getTagPairHash32 ( ) ;
	long *getSummaryVector ( ) ;
	long *getPageSampleVector ( ) ;
	long *getPostLinkTextVector ( long linkNode ) ;
	long computeVector ( class Sections *sections, class Words *words,
			     uint32_t *vec , long start = 0 , long end = -1 );
	float *getTagSimilarity ( class XmlDoc *xd2 ) ;
	float *getGigabitSimilarity ( class XmlDoc *xd2 ) ;
	float *getPageSimilarity ( class XmlDoc *xd2 ) ;
	float *getPercentChanged ( );
	uint64_t *getFuzzyDupHash ( );
	long long *getExactContentHash64();
	class RdbList *getDupList ( ) ;
	class RdbList *getLikedbListForReq ( );
	class RdbList *getLikedbListForIndexing ( );
	long addLikedbRecords ( bool justGetSize ) ;
	char *getIsDup ( ) ;
	char *isDupOfUs ( long long d ) ;
	uint32_t *getGigabitVectorScorelessHash ( ) ;
	long *getGigabitHashes ( );
	char *getGigabitQuery ( ) ;
	char *getMetaDescription( long *mdlen ) ;
	char *getMetaSummary ( long *mslen ) ;
	char *getMetaKeywords( long *mklen ) ;
	bool addGigabits ( char *s , long long docId , uint8_t langId ) ;
	bool addGigabits2 ( char *s,long slen,long long docId,uint8_t langId);
	bool addGigabits ( class Words *ww ,
			   long long docId,
			   class Sections *sections,
			   //class Weights  *we ,
			   uint8_t langId );

	long *getSiteSpiderQuota ( ) ;
	class Url *getCurrentUrl ( ) ;
	class Url *getFirstUrl() ;
	long long getFirstUrlHash48();
	long long getFirstUrlHash64();
	class Url **getRedirUrl() ;
	class Url **getMetaRedirUrl() ;
	long *getFirstIndexedDate ( ) ;
	long *getOutlinksAddedDate ( ) ;
	//long *getNumBannedOutlinks ( ) ;
	uint16_t *getCountryId ( ) ;
	class XmlDoc **getOldXmlDoc ( ) ;
	bool isRobotsTxtFile ( char *url , long urlLen ) ;
	class XmlDoc **getExtraDoc ( char *url , long maxCacheAge = 0 ) ;
	bool getIsPageParser ( ) ;
	class XmlDoc **getRootXmlDoc ( long maxCacheAge = 0 ) ;
	//class XmlDoc **getGatewayXmlDoc ( ) ;
	// . returns false if blocked, true otherwise.
	// . returns true and sets g_errno on error
	//bool setFromOldTitleRec ( ) ;
	//RdbList *getOldMetaList ( ) ;
	char **getOldTitleRec ( );
	uint8_t *getRootLangId ();
	//bool *updateRootLangId ( );
	char **getRootTitleRec ( ) ;
	//char **getContactTitleRec ( char *url ) ;
	long long *getDocId ( ) ;
	char *getIsIndexed ( ) ;
	class TagRec *getTagRec ( ) ;
	char *getHasContactInfo ( ) ;
	char *getIsThisDocContacty ( );
	bool *getHasTOD();
	bool *getHasSiteVenue();

	// non-dup/nondup addresses only
	bool *getHasAddress();
	class Addresses *getAddresses ( ) ;
	Address **getContactAddresses ( );
	long *getNumOfficialEmails ( ) ;
	char *getEmailBuf ( ) ;
	long *getNumContactAddresses ( );
	long addEmailTags ( class Xml *xml , class Words *ww ,
			    class TagRec *gr , long ip ) ;
	//class Url *getContactUsLink ( ) ;
	//class Url *getAboutUsLink ( ) ;
	long *getFirstIp ( ) ;
	bool *updateFirstIp ( ) ;
	long *getSiteNumInlinksUniqueIp ( ) ;
	long *getSiteNumInlinksUniqueCBlock ( ) ;
	long *getSiteNumInlinksTotal ( );
	//long *getSiteNumInlinksFresh ( ) ;
	//long *getSitePop ( ) ;
	uint8_t *getSiteNumInlinks8 () ;
	long *getSiteNumInlinks ( ) ;
	class LinkInfo *getSiteLinkInfo() ;
	long *getIp ( ) ;
	long *gotIp ( bool save ) ;
	bool *getIsAllowed ( ) ;
	long *getFinalCrawlDelay();
	long      m_finalCrawlDelay;
	//long getTryAgainTimeDelta() {
	//	if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
	//	return m_tryAgainTimeDelta;
	//};
	char *getIsWWWDup ( ) ;
	class LinkInfo *getLinkInfo1 ( ) ;
	class LinkInfo **getLinkInfo2 ( ) ;
	char *getSite ( ) ;
	void  gotSite ( ) ;
	long long *getSiteHash64 ( ) ;
	long *getSiteHash32 ( ) ;
	char **getHttpReply ( ) ;
	char **getHttpReply2 ( ) ;
	char **gotHttpReply ( ) ;
	char *getIsContentTruncated ( );
	long *getDownloadStatus ( ) ;
	long long *getDownloadEndTime ( ) ;
	int16_t *getHttpStatus ( );
	char waitForTimeSync ( ) ;
	bool m_alreadyRegistered;
	class HttpMime *getMime () ;
	char **getContent ( ) ;
	uint8_t *getContentType ( ) ;
	uint16_t *getCharset ( ) ;
	char *getIsBinary ( ) ;
	char **getFilteredContent ( ) ;
	void filterStart_r ( bool amThread ) ;
	char **getRawUtf8Content ( ) ;
	char **getExpandedUtf8Content ( ) ;
	char **getUtf8Content ( ) ;
	long *getContentHash32 ( ) ;
	long *getTagHash32 ( ) ;
	long getHostHash32a ( ) ;
	long getHostHash32b ( ) ;
	long getDomHash32 ( );
	char **getImageData();
	class Images *getImages ( ) ;
	int8_t *getNextSpiderPriority ( ) ;
	long *getPriorityQueueNum ( ) ;
	class TagRec ***getOutlinkTagRecVector () ;
	char *hasNoIndexMetaTag();
	char *hasFakeIpsMetaTag ( );
	long **getOutlinkFirstIpVector () ;
	//char **getOutlinkIsIndexedVector () ;
	long *getRegExpNum ( long outlinkNum ) ;
	long *getRegExpNum2 ( long outlinkNum ) ;
	char *getIsSiteRoot ( ) ;
	bool getIsOutlinkSiteRoot ( char *u , class TagRec *gr ) ;
	int8_t *getHopCount ( ) ;
	//int8_t *getOutlinkHopCountVector ( ) ;
	char *getSpiderLinks ( ) ;
	long *getNextSpiderTime ( ) ;
	//char *getIsSpam() ;
	char *getIsFiltered ();
	bool getIsInjecting();
	long *getSpiderPriority ( ) ;
	long *getIndexCode ( ) ;
	long *getIndexCode2 ( ) ;
	SafeBuf *getNewTagBuf ( ) ;

	char *updateTagdb ( ) ;
	bool logIt ( ) ;
	bool m_doConsistencyTesting;
	bool doConsistencyTest ( bool forceTest ) ;
	long printMetaList ( ) ;
	void printMetaList ( char *metaList , char *metaListEnd ,
			     class SafeBuf *pbuf );
	bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;
	bool hashMetaList ( class HashTableX *ht        ,
			    char       *p         ,
			    char       *pend      ,
			    bool        checkList ) ;

	char *getMetaList ( bool forDelete = false );

	void copyFromOldDoc ( class XmlDoc *od ) ;

	// we add a SpiderReply to spiderdb when done spidering, even if
	// m_indexCode or g_errno was set!
	class SpiderReply *getNewSpiderReply ( );


	SpiderRequest **getRedirSpiderRequest ( );
	SpiderRequest m_redirSpiderRequest;
	SpiderRequest *m_redirSpiderRequestPtr;


	void  setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
				     class SpiderReply   *srep );


	char *addOutlinkSpiderRecsToMetaList ( );

	//bool addTable96 ( class HashTableX *tt1     ,
	//		  long       date1   ,
	//		  bool       nosplit ) ;

	long getSiteRank ();
	bool addTable144 ( class HashTableX *tt1 );

	bool addTable224 ( HashTableX *tt1 ) ;

	//bool addTableDate ( class HashTableX *tt1     , //T<key128_t,char> *tt1
	//                           uint64_t    docId   ,
	//                           uint8_t     rdbId   ,
	//                           bool        nosplit ) ;

	bool addTable128 ( class HashTableX *tt1     , // T <key128_t,char>*tt1
                           uint8_t     rdbId   ,
			   bool        forDelete ) ;

	bool hashNoSplit ( class HashTableX *tt ) ;
	char *hashAll ( class HashTableX *table ) ;
	long getBoostFromSiteNumInlinks ( long inlinks ) ;
	bool hashMetaTags ( class HashTableX *table ) ;
	bool hashIsClean ( class HashTableX *table ) ;
	bool hashZipCodes ( class HashTableX *table ) ;
	bool hashMetaZip ( class HashTableX *table ) ;
	bool hashContentType ( class HashTableX *table ) ;
	bool hashDMOZCategories ( class HashTableX *table ) ;
	bool hashLinks ( class HashTableX *table ) ;
	bool hashUrl ( class HashTableX *table ) ;
	bool hashDateNumbers ( class HashTableX *tt ) ;
	bool hashSections ( class HashTableX *table ) ;
	bool hashIncomingLinkText ( class HashTableX *table            ,
				    bool       hashAnomalies    ,
                                    bool       hashNonAnomalies ) ;

	bool hashLinksForLinkdb ( class HashTableX *table ) ;
	bool hashNeighborhoods ( class HashTableX *table ) ;
	bool hashRSSInfo ( class HashTableX *table ) ;
	bool hashRSSTerm ( class HashTableX *table , bool inRSS ) ;
	bool hashTitle ( class HashTableX *table );
	bool hashBody2 ( class HashTableX *table );
	bool hashMetaKeywords ( class HashTableX *table );
	bool hashMetaSummary ( class HashTableX *table );
	bool linksToGigablast ( ) ;
	bool searchboxToGigablast ( ) ;
	bool hashLanguage ( class HashTableX *table ) ;
	bool hashCountry ( class HashTableX *table ) ;
	bool hashSiteNumInlinks ( class HashTableX *table ) ;
	bool hashCharset ( class HashTableX *table ) ;
	bool hashTagRec ( class HashTableX *table ) ;
	bool hashPermalink ( class HashTableX *table ) ;
	bool hashVectors(class HashTableX *table ) ;
	bool hashAds(class HashTableX *table ) ;
	class Url *getBaseUrl ( ) ;
	bool hashSubmitUrls ( class HashTableX *table ) ;
	bool hashIsAdult    ( class HashTableX *table ) ;

	void set20 ( Msg20Request *req ) ;
	class Msg20Reply *getMsg20Reply ( ) ;
	char **getImageUrl() ;
	class MatchOffsets *getMatchOffsets () ;
	Query *getQuery() ;
	Matches *getMatches () ;
	char *getDescriptionBuf ( char *displayMetas , long *dlen ) ;
	class Title *getTitle ();
	class Summary *getSummary () ;
	char *getHighlightedSummary ();
	SafeBuf *getSampleForGigabits ( ) ;
	char *getIsCompromised ( ) ;
	char *getIsNoArchive ( ) ;
	long *getUrlFilterNum();
	//long *getDiffbotApiNum();
	SafeBuf *getDiffbotApiUrl();
	long long **getAdVector ( ) ;
	char *getIsLinkSpam ( ) ;
	char *getIsHijacked();
	char *getIsErrorPage ( ) ;
	char* matchErrorMsg(char* p, char* pend );

	bool hashWords  ( //long            wordStart ,
			  //long            wordEnd   ,
			  class HashInfo *hi        ) ;
	bool hashSingleTerm ( long long       termId ,
			      class HashInfo *hi     ) ;
	bool hashSingleTerm ( char            *s    ,
			      long             slen ,
			      class HashInfo  *hi   );
	bool hashString ( class HashTableX *ht   ,
			  //class Weights    *we   ,
			  class Bits       *bits ,
			  char             *s    ,
			  long              slen ) ;
	bool hashString ( char             *s    ,
			  long              slen ,
			  class HashInfo   *hi   ) ;


	bool hashWords3 ( //long              wordStart     ,
			  //long              wordEnd       ,
			  class HashInfo   *hi            ,
			  class Words      *words         ,
			  class Phrases    *phrases       ,
			  class Synonyms   *synonyms      ,
			  class Sections   *sections      ,
			  class HashTableX *countTable    ,
			  char *fragVec ,
			  char *wordSpamVec ,
			  char *langVec ,
			  char  docLangId , // default lang id
			  class SafeBuf    *pbuf          ,
			  class HashTableX *wts           ,
			  class SafeBuf    *wbuf          ,
			  long              niceness      );

	bool hashString3 ( char             *s              ,
			  long              slen           ,
			  class HashInfo   *hi             ,
			  class HashTableX *countTable     ,
			  class SafeBuf    *pbuf           ,
			  class HashTableX *wts            ,
			  class SafeBuf    *wbuf           ,
			  long              version        ,
			  long              siteNumInlinks ,
			  long              niceness       );


	bool hashNumber ( char *beginBuf ,
			  char *buf ,
			  long bufLen ,
			  class HashInfo *hi ) ;

	bool hashNumber2 ( float f ,
			   class HashInfo *hi ,
			   char *gbsortByStr ) ;

	// print out for PageTitledb.cpp and PageParser.cpp
	bool printDoc ( class SafeBuf *pbuf );
	bool printMenu ( class SafeBuf *pbuf );
	bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ;
	bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ;
	bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr );
	bool printSiteInlinks ( class SafeBuf *sb , HttpRequest *hr );
	bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr );
	bool printTermList ( class SafeBuf *sb , HttpRequest *hr );
	bool printSpiderStats ( class SafeBuf *sb , HttpRequest *hr );
	bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr );

	bool printSerpFiltered ( class Section *sx , char *tagName ) ;

	char **getTitleBuf             ( );
	char **getRootTitleBuf         ( );
	char **getFilteredRootTitleBuf ( );

	// funcs that update our tagdb tagrec, m_tagRec, and also update tagdb
	bool *updateVenueAddresses ( );

	// called by msg0 handler to add posdb termlists into g_termListCache
	// for faster seo pipeline
	bool cacheTermLists();

 public:

	// stuff set from the key of the titleRec, above the compression area
	//key_t     m_key;
	long long m_docId;

	char     *m_ubuf;
	long      m_ubufSize;
	long      m_ubufAlloc;

	// does this page link to gigablast, or has a search form to it?
	//bool linksToGigablast();
	//bool searchboxToGigablast();

	// private:

	// when we started spidering it, in milliseconds since the epoch
	long long    m_startTime;

	// when set() was called by Msg20.cpp so we can time how long it took
	// to generate the summary
	long long    m_setTime;
	long long    m_cpuSummaryStartTime;

	// timers
	long long m_beginSEOTime;
	long long m_beginTimeAllMatch;
	long long m_beginTimeMatchUrl;
	long long m_beginTimeFullQueries;
	long long m_beginTimeLinks;
	//long long m_beginMsg98s;
	long long m_beginRelatedQueries;
	long long m_beginMsg95s;

	// . these should all be set using set*() function calls so their
	//   individual validity flags can be set to true, and successive
	//   calls to their corresponding get*() functions will not core
	// . these particular guys are set immediately on set(char *titleRec)

	Url        m_redirUrl;
	Url       *m_redirUrlPtr;
	Url        m_metaRedirUrl;
	Url       *m_metaRedirUrlPtr;
	long       m_redirError;
	char       m_allowSimplifiedRedirs;
	Url        m_firstUrl;
	long long  m_firstUrlHash48;
	long long  m_firstUrlHash64;
	Url        m_currentUrl;

	//char      *m_coll;
	//char       m_collBuf[MAX_COLL_LEN+1]; // include \0
	CollectionRec *m_lastcr;
	collnum_t      m_collnum;
	long           m_lastCollRecResetCount;
	class CollectionRec *getCollRec ( ) ;
	bool setCollNum ( char *coll ) ;


	char      *m_content;
	long       m_contentLen;

	char *m_metaList;
	long  m_metaListSize;

	SafeBuf  m_metaList2;

	// . same thing, a little more complicated
	// . these classes are only set on demand
	Xml        m_xml;
	Links      m_links;
	Words      m_words;
	Bits       m_bits;
	Bits       m_bits2;
	Pos        m_pos;
	Phrases    m_phrases;
	//Synonyms   m_synonyms;
	SafeBuf    m_synBuf;
	//Weights    m_weights;
	Sections   m_sections;

	Section *m_si;
	//Section *m_nextSection;
	//Section *m_lastSection;
	long m_msg3aRequestsOut;
	long m_msg3aRequestsIn;
	char *m_queryBuf;
	Msg39Request *m_msg39RequestArray;
	SafeBuf m_msg3aBuf;
	Msg3a *m_msg3aArray;
	char  *m_inUse;
	Query *m_queryArray;
	long long *m_secHash64Array;
	bool     m_gotDupStats;
	//long     m_secHash64;
	//Query    m_q4;
	//Msg3a    m_msg3a;
	//Msg39Request m_r39;
	Msg39Request m_mr2;
	HashTableX m_sectionStatsTable;
	//char m_sectionHashQueryBuf[128];

	// also set in getSections()
	long       m_maxVotesForDup;

	// . for rebuild logging of what's changed
	// . Repair.cpp sets these based on titlerec
	char m_logLangId;
	long m_logSiteNumInlinks;

	SectionVotingTable m_nsvt;

	SectionVotingTable m_osvt;
	long m_numSectiondbReads;
	long m_numSectiondbNeeds;
	key128_t m_sectiondbStartKey;
	RdbList m_secdbList;
	long m_sectiondbRecall;
	SafeBuf m_tmpBuf3;

	//HashTableX m_rvt;
	//Msg17 m_msg17;
	//char *m_cachedRootVoteRec;
	//long  m_cachedRootVoteRecSize;
	//bool  m_triedVoteCache;
	//bool  m_storedVoteCache;
	//SafeBuf m_cacheRecBuf;

	HashTableX m_turkVotingTable;
	HashTableX m_turkBitsTable;
	uint32_t m_confirmedTitleContentHash ;
	uint32_t m_confirmedVenueContentHash ;
	uint32_t m_confirmedTitleTagHash     ;
	uint32_t m_confirmedVenueTagHash     ;

	// turk voting tag rec
	TagRec m_vtr;
	// tagrec of banned turks
	TagRec m_bannedTurkRec;
	// and the table of the hashed banned turk users
	HashTableX m_turkBanTable;

	// used for displaying turk votes...
	HashTableX m_vctab;
	HashTableX m_vcduptab;

	Images     m_images;
	HashTableX m_countTable;
	HttpMime   m_mime;
	TagRec     m_tagRec;
	SafeBuf    m_tagRecBuf;
	// copy of m_oldTagRec but with our modifications, if any
	//TagRec     m_newTagRec;
	SafeBuf    m_newTagBuf;
	SafeBuf    m_fragBuf;
	SafeBuf    m_wordSpamBuf;
	SafeBuf    m_finalSummaryBuf;
	// this one is initially the same as m_tagRec, but we do not modify it
	// so that Address.cpp can reference into its buffer, m_buf, without
	// fear of getting the buffer overwritten by crap
	//TagRec     m_savedTagRec1;
	//char    *m_sampleVector  ;
	uint32_t   m_tagPairHash;
	long       m_firstIp;

	class SafeBuf     *m_savedSb;
	class HttpRequest *m_savedHr;


	// validity flags. on reset() all these are set to false.
	char     m_VALIDSTART;
	// DO NOT add validity flags above this line!
	char     m_metaListValid;
	//char   m_docQualityValid;
	char     m_siteValid;
	char     m_startTimeValid;
	char     m_currentUrlValid;
	char     m_firstUrlValid;
	char     m_firstUrlHash48Valid;
	char     m_firstUrlHash64Valid;
	char     m_lastUrlValid;
	char     m_docIdValid;
	//char     m_collValid;
	char     m_tagRecValid;
	char     m_robotsTxtLenValid;
	char     m_tagRecDataValid;
	char     m_newTagBufValid;
	char     m_rootTitleBufValid;
	char     m_filteredRootTitleBufValid;
	char     m_titleBufValid;
	char     m_fragBufValid;
	char     m_wordSpamBufValid;
	char     m_finalSummaryBufValid;
	char     m_matchingQueryBufValid;
	char     m_relatedQueryBufValid;
	char     m_queryLinkBufValid;
	char     m_redirSpiderRequestValid;
	//char     m_queryPtrsValid;
	char     m_queryOffsetsValid;
	//char     m_queryPtrsSortedValid;
	char     m_queryPtrsWholeValid;
	char     m_relatedDocIdBufValid;
	char     m_topMatchingQueryBufValid;
	char     m_relatedDocIdsScoredBufValid;
	char     m_relatedDocIdsWithTitlesValid;
	char     m_relatedTitleBufValid;
	//char     m_queryLinkBufValid;
	char     m_missingTermBufValid;
	char     m_matchingTermBufValid;
	//char     m_relPtrsValid;
	char     m_sortedPosdbListBufValid;
	char     m_wpSortedPosdbListBufValid;
	char     m_termListBufValid;
	char     m_insertableTermsBufValid;
	char     m_scoredInsertableTermsBufValid;
	//char     m_iwfiBufValid; // for holding WordFreqInfo instances
	char     m_wordPosInfoBufValid;
	char     m_recommendedLinksBufValid;
	char     m_tempMsg25PageValid;
	char     m_tempMsg25SiteValid;

	//char     m_queryHashTableValid;
	char     m_queryOffsetTableValid;
	//char     m_socketWriteBufValid;
	//char     m_numBannedOutlinksValid;
	char     m_hopCountValid;
	char     m_isInjectingValid;
	char     m_metaListCheckSum8Valid;
	char     m_contentValid;
	char     m_filteredContentValid;
	char     m_charsetValid;
	char     m_langVectorValid;
	char     m_langIdValid;
	char     m_rootLangIdValid;
	char     m_datedbDateValid;
	char     m_isRSSValid;
	char     m_spiderLinksArgValid;
	char     m_isContentTruncatedValid;
	char     m_xmlValid;
	char     m_linksValid;
	char     m_wordsValid;
	char     m_bitsValid;
	char     m_bits2Valid;
	char     m_posValid;
	char     m_isUrlBadYearValid;
	char     m_phrasesValid;
	//char     m_synonymsValid;
	//char     m_weightsValid;
	char     m_sectionsValid;
	char     m_subSentsValid;
	char     m_osvtValid;
	char     m_nsvtValid;
	//char   m_rvtValid;
	char     m_turkVotingTableValid;
	char     m_turkBitsTableValid;
	char     m_turkBanTableValid;
	char     m_vctabValid;
	char     m_explicitSectionsValid;
	char     m_impliedSectionsValid;
	char     m_sectionVotingTableValid;
	char     m_imageDataValid;
	char     m_imagesValid;
	char     m_msge0Valid;
	char     m_msge1Valid;
	//char     m_msge2Valid;
	//char   m_sampleVectorValid;
	char     m_gigabitHashesValid;
	char     m_tagPairHashValid;
	char     m_oldsrValid;
	char     m_newsrValid;
	char     m_titleRecValid;

	bool m_ipValid;
	bool m_firstIpValid;
	bool m_spideredTimeValid;
	//bool m_nextSpiderTimeValid;
	bool m_firstIndexedValid;
	bool m_outlinksAddedDateValid;
	bool m_countryIdValid;
	/*
	bool m_titleWeightValid;
	bool m_headerWeightValid;
	bool m_urlPathWeightValid;
	bool m_externalLinkTextWeightValid;
	bool m_internalLinkTextWeightValid;
	bool m_conceptWeightValid;
	*/
	bool m_httpStatusValid;
	bool m_crawlDelayValid;
	bool m_finalCrawlDelayValid;
	bool m_titleRecKeyValid;
	bool m_adVectorValid;
	bool m_wikiDocIdsValid;
	bool m_catIdsValid;
	bool m_versionValid;
	bool m_indCatIdsValid;
	bool m_dmozTitlesValid;
	bool m_dmozSummsValid;
	bool m_dmozAnchorsValid;
	bool m_dmozInfoValid;
	bool m_rawUtf8ContentValid;
	bool m_expandedUtf8ContentValid;
	bool m_utf8ContentValid;
	bool m_isAllowedValid;
	//bool m_tryAgainTimeDeltaValid;
	//bool m_eliminateMenusValid;
	bool m_redirUrlValid;
	bool m_metaRedirUrlValid;
	bool m_statusMsgValid;
	bool m_mimeValid;
	bool m_pubDateValid;
	bool m_hostHash32aValid;
	bool m_hostHash32bValid;
	bool m_indexCodeValid;
	bool m_priorityValid;
	bool m_downloadStatusValid;
	bool m_downloadEndTimeValid;
	bool m_redirErrorValid;
	bool m_domHash32Valid;
	bool m_contentHash32Valid;
	bool m_tagHash32Valid;
	bool m_linkInfo2Valid;
	bool m_spiderLinksValid;
	//bool m_nextSpiderPriorityValid;
	bool m_firstIndexedDateValid;
	bool m_isPermalinkValid;

	bool m_isAdultValid;
	bool m_hasAddressValid;
	bool m_hasTODValid;
	bool m_hasSiteVenueValid;
	bool m_catRecValid;
	bool m_urlPubDateValid;
	bool m_isUrlPermalinkFormatValid;
	bool m_percentChangedValid;
	bool m_unchangedValid;
	bool m_countTableValid;
	bool m_summaryLangIdValid;
	bool m_tagPairHashVecValid;
	bool m_summaryVecValid;
	bool m_titleVecValid;
	bool m_pageSampleVecValid;
	bool m_postVecValid;
	bool m_dupListValid;
	bool m_likedbListValid;
	bool m_isDupValid;
	bool m_gigabitVectorHashValid;
	bool m_gigabitQueryValid;
	bool m_metaDescValid;
	bool m_metaSummaryValid;
	bool m_metaKeywordsValid;
	bool m_siteSpiderQuotaValid;
	bool m_oldDocValid;
	bool m_extraDocValid;
	bool m_ahrefsDocValid;
	//bool m_contactDocValid;
	bool m_rootDocValid;
	//bool m_gatewayDocValid;
	bool m_oldMetaListValid;
	bool m_oldTitleRecValid;
	bool m_rootTitleRecValid;
	//bool m_contactTitleRecValid;
	bool m_isIndexedValid;
	bool m_hasContactInfoValid;
	bool m_isContactyValid;
	bool m_contactInfoTagRecValid;
	bool m_addressesValid;
	bool m_contactAddressesValid;
	bool m_emailBufValid;
	//bool m_contactUsLinkValid;
	//bool m_aboutUsLinkValid;
	//bool m_contactLinksValid;
	bool m_siteNumInlinksValid;
	bool m_siteNumInlinksUniqueIpValid;//FreshValid;
	bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
	bool m_siteNumInlinksTotalValid;
	bool m_siteNumInlinks8Valid;
	bool m_siteLinkInfoValid;
	bool m_isWWWDupValid;
	bool m_linkInfo1Valid;
	bool m_linkSiteHashesValid;
	//bool m_dateParse2Valid;
	bool m_simpleDatesValid;
	bool m_datesValid;
	bool m_sectionsReplyValid;
	bool m_sectionsVotesValid;
	bool m_sectiondbDataValid;
	bool m_placedbDataValid;
	bool m_siteHash64Valid;
	bool m_siteHash32Valid;
	bool m_httpReplyValid;
	bool m_contentTypeValid;
	bool m_isBinaryValid;
	bool m_priorityQueueNumValid;
	bool m_outlinkTagRecVectorValid;
	bool m_outlinkIpVectorValid;
	bool m_hasNoIndexMetaTagValid;
	bool m_hasUseFakeIpsMetaTagValid;
	bool m_outlinkIsIndexedVectorValid;
	bool m_isSiteRootValid;
	bool m_wasInjectedValid;
	bool m_outlinkHopCountVectorValid;
	//bool m_isSpamValid;
	bool m_isFilteredValid;
	bool m_urlFilterNumValid;
	bool m_numOutlinksAddedValid;
	bool m_baseUrlValid;
	bool m_replyValid;
	bool m_recycleDiffbotReplyValid;
	bool m_diffbotReplyValid;
	bool m_tokenizedDiffbotReplyValid;
	//bool m_diffbotUrlCrawlPatternMatchValid;
	//bool m_diffbotUrlProcessPatternMatchValid;
	//bool m_diffbotPageProcessPatternMatchValid;
	//bool m_useDiffbotValid;
	//bool m_diffbotApiNumValid;
	bool m_diffbotApiUrlValid;
	bool m_crawlInfoValid;
	bool m_isPageParserValid;
	bool m_imageUrlValid;
	bool m_matchOffsetsValid;
	bool m_queryValid;
	bool m_matchesValid;
	bool m_dbufValid;
	bool m_titleValid;
	bool m_collnumValid;
	//bool m_twidsValid;
	bool m_termId32BufValid;
	bool m_termInfoBufValid;
	bool m_newTermInfoBufValid;
	bool m_summaryValid;
	bool m_gsbufValid;
	bool m_isCompromisedValid;
	bool m_isNoArchiveValid;
	//bool m_isVisibleValid;
	bool m_clockCandidatesTableValid;
	bool m_clockCandidatesDataValid;
	bool m_isLinkSpamValid;
	bool m_isErrorPageValid;
	bool m_isHijackedValid;
	bool m_dupHashValid;
	bool m_exactContentHash64Valid;

	// shadows
	char m_isRSS2;
	char m_isPermalink2;
	char m_isAdult2;
        char m_spiderLinks2;
	char m_isContentTruncated2;
	char m_isLinkSpam2;
	bool m_hasAddress2;
	bool m_hasTOD2;
	bool m_hasSiteVenue2;
	char m_hasContactInfo2;
	char m_isSiteRoot2;

	// DO NOT add validity flags below this line!
	char     m_VALIDEND;

	// more stuff
	//char *m_utf8Content;
	//long m_utf8ContentLen;
	CatRec m_catRec;
	// use this stuff for getting wiki docids that match our doc's gigabits
	//Query m_wq;
	//SearchInput m_si;
	//Msg40 m_msg40;
	//DateParse2 m_dateParse2;
	Dates m_dates;
	HashTableX m_clockCandidatesTable;
	SafeBuf m_cctbuf;
	float m_ageInDays;
	long m_urlPubDate;
	//long m_urlAge;
	char m_isUrlPermalinkFormat;
	uint8_t m_summaryLangId;
	long m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
	long m_tagPairHashVecSize;
	long m_summaryVec [SAMPLE_VECTOR_SIZE/4];
	long m_summaryVecSize;
	long m_titleVec [SAMPLE_VECTOR_SIZE/4];
	long m_titleVecSize;
	long m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
	long m_pageSampleVecSize;
	long m_postVec[POST_VECTOR_SIZE/4];
	long m_postVecSize;
	float m_tagSimilarity;
	float m_gigabitSimilarity;
	float m_pageSimilarity;
	float m_percentChanged;
	bool  m_unchanged;
	// what docids are similar to us? docids are in this list
	RdbList m_dupList;
	RdbList m_likedbList;
	uint64_t m_dupHash;
	long long m_exactContentHash64;
	Msg0 m_msg0;
	Msg5 m_msg5;
	char m_isDup;
	long m_ei;
	long m_lastLaunch;
	Msg22Request m_msg22Request;
	Msg22 m_msg22a;
	Msg22 m_msg22b;
	Msg22 m_msg22c;
	Msg22 m_msg22d;
	Msg22 m_msg22e;
	Msg22 m_msg22f;
	//long m_collLen;
	uint32_t m_gigabitVectorHash;
	char m_gigabitQuery [XD_GQ_MAX_SIZE];
	long m_gigabitHashes [XD_MAX_GIGABIT_HASHES];
	long m_gigabitScores [XD_MAX_GIGABIT_HASHES];
	char *m_gigabitPtrs  [XD_MAX_GIGABIT_HASHES];
	// for debug printing really
	class GigabitInfo *m_top[100];
	long               m_numTop;
	//char  m_metaDesc[1025];
	//char  m_metaKeywords[1025];
	// these now reference directly into the html src so our
	// WordPosInfo::m_wordPtr algo works in seo.cpp
	char *m_metaDesc;
	long  m_metaDescLen;
	char *m_metaSummary;
	long  m_metaSummaryLen;
	char *m_metaKeywords;
	long  m_metaKeywordsLen;
	long  m_siteSpiderQuota;
	//long m_numBannedOutlinks;
	class XmlDoc *m_oldDoc;
	class XmlDoc *m_extraDoc;
	class XmlDoc *m_ahrefsDoc;
	//class XmlDoc *m_contactDoc;
	class XmlDoc *m_rootDoc;
	//class XmlDoc *m_gatewayDoc;
	RdbList m_oldMetaList;
	char   *m_oldTitleRec;
	long    m_oldTitleRecSize;
	char   *m_rootTitleRec;
	long    m_rootTitleRecSize;
	//char   *m_contactTitleRec;
	//long    m_contactTitleRecSize;
	char    m_isIndexed;
	Msg8a   m_msg8a;
	char   *m_tagdbColl;
	long    m_tagdbCollLen;
	Addresses m_addresses;

	Address *m_contactAddresses[MAX_CONTACT_ADDRESSES];
	long     m_numContactAddresses;

	char     m_isContacty;

	//Url     m_contactUsLink;
	//Url     m_aboutUsLink;
	/*
	char *m_contactLinks     [MAX_CONTACT_OUTLINKS];
	long  m_contactLens      [MAX_CONTACT_OUTLINKS];
	long  m_contactScores    [MAX_CONTACT_OUTLINKS];
	long  m_contactFlags     [MAX_CONTACT_OUTLINKS];
	char  m_contactProcessed [MAX_CONTACT_OUTLINKS];
	char *m_contactText      [MAX_CONTACT_OUTLINKS];
	char *m_contactTextEnd   [MAX_CONTACT_OUTLINKS];
	long  m_minContactScore;
	long  m_minContactIndex;
	long  m_numContactLinks;
	*/
	Url   m_extraUrl;
	//long m_siteNumInlinksFresh;
	//long m_sitePop;
	uint8_t m_siteNumInlinks8;
	//long m_siteNumInlinks;
	LinkInfo m_siteLinkInfo;
	SafeBuf m_mySiteLinkInfoBuf;
	SafeBuf m_myPageLinkInfoBuf;
	SafeBuf m_myTempLinkInfoBuf;
	char m_isInjecting;
	char m_useFakeMime;
	char m_useSiteLinkBuf;
	char m_usePageLinkBuf;
	char m_printInXml;
	Msg25 m_msg25;
	Msg25 *m_tempMsg25Page;
	Msg25 *m_tempMsg25Site;
	// for page or for site?
	Msg25 *getAllInlinks ( bool forSite );
	// lists from cachedb for msg25's msg20 replies serialized
	RdbList m_siteReplyList;
	RdbList m_pageReplyList;
	bool m_checkedCachedbForSite;
	bool m_checkedCachedbForPage;
	bool m_triedToAddWordPosInfoToCachedb;
	bool m_calledMsg25ForSite;
	bool m_calledMsg25ForPage;
	//void (* m_masterLoopWrapper) (void *state);
	MsgC m_msgc;
	bool m_isAllowed;
	bool m_forwardDownloadRequest;
	bool m_isChildDoc;
	Msg13 m_msg13;
	Msg13Request m_msg13Request;
	bool m_isSpiderProxy;
	// for limiting # of iframe tag expansions
	long m_numExpansions;
	char m_newOnly;
	//long m_tryAgainTimeDelta;
	//long m_sameIpWait;
	//long m_sameDomainWait;
	//long m_maxSpidersPerDomain;
	char m_isWWWDup;
	char m_calledMsg0b;
	Url  m_tmpUrl;

	SafeBuf m_tmpsb1;
	SafeBuf m_tmpsb2;
	SafeBuf m_turkBuf;
	SafeBuf m_linkSiteHashBuf;
	SafeBuf m_linkdbDataBuf;
	SafeBuf m_langVec;
	Msg0 m_msg0b;
	class RdbList *m_ulist;
	void *m_hack;
	class XmlDoc *m_hackxd;
	//class LinkInfo *m_linkInfo1Ptr;
	char     *m_linkInfoColl;
	//char m_injectedReply;
	long m_minInlinkerHopCount;
	//class LinkInfo *m_linkInfo2Ptr;
	SiteGetter m_siteGetter;
	long long  m_siteHash64;
	//char *m_site;
	//long m_siteLen;
	//Url m_siteUrl;
	long m_siteHash32;
	char *m_httpReply;
	//char m_downloadAttempted;
	char m_incrementedAttemptsCount;
	char m_incrementedDownloadCount;
	char m_redirectFlag;
	//char m_isScraping;
	//char m_throttleDownload;
	char m_spamCheckDisabled;
	char m_useRobotsTxt;
	long m_robotsTxtLen;
	long m_httpReplySize;
	long m_httpReplyAllocSize;
	char m_isBinary;
	char *m_filteredContent;
	long m_filteredContentLen;
	char *m_filter;
	long m_filteredContentAllocSize;
	long m_filteredContentMaxSize;
	char m_calledThread;
	long m_errno;
	//class CollectionRec *m_cr;
	//long m_utf8ContentAllocSize;
	long m_hostHash32a;
	long m_hostHash32b;
	long m_domHash32;
	long m_priorityQueueNum;

	// this points into m_msge0 i guess
	//class TagRec **m_outlinkTagRecVector;
	Msge0 m_msge0;

	// this points into m_msge1 i guess
	long *m_outlinkIpVector;
	SafeBuf m_outlinkTagRecPtrBuf;
	SafeBuf m_fakeIpBuf;
	char m_hasNoIndexMetaTag;
	char m_hasUseFakeIpsMetaTag;
	Msge1 m_msge1;
	TagRec **m_outlinkTagRecVector;
	SafeBuf m_fakeTagRecPtrBuf;
	TagRec m_fakeTagRec;

	//
	// diffbot parms for indexing diffbot's json output
	//
	XmlDoc *m_dx;
	char *m_diffbotObj;
	SafeBuf m_diffbotReply;
	SafeBuf *m_tokenizedDiffbotReplyPtr;
	SafeBuf  m_tokenizedDiffbotReply;
	long m_diffbotReplyError;
	bool m_recycleDiffbotReply;
	//bool m_diffbotUrlCrawlPatternMatch;
	//bool m_diffbotUrlProcessPatternMatch;
	//bool m_diffbotPageProcessPatternMatch;
	//long m_diffbotApiNum;
	//bool m_useDiffbot;
	// url to access diffbot with
	SafeBuf m_diffbotApiUrl;

	bool *getRecycleDiffbotReply ( ) ;
	SafeBuf *getTokenizedDiffbotReply ( ) ;
	SafeBuf *getDiffbotReply ( ) ;
	//bool doesUrlMatchDiffbotCrawlPattern() ;
	//bool doesUrlMatchDiffbotProcessPattern() ;
	bool doesPageContentMatchDiffbotProcessPattern() ;
	char *hashJSON ( HashTableX *table );
	long *nukeJSONObjects ( ) ;
	long m_joc;

	//EmailInfo m_emailInfo;

	//
	// functions and vars for the seo query matching tool
	//
	bool loadTitleRecFromDiskOrSpider();
	//SafeBuf *getSEOQueryInfo ( );
	HashTableX *getTermIdBufDedupTable32();
	//long  *getTopWordsVector( bool includeSynonyms );
	SafeBuf *getTermId32Buf();
	SafeBuf *getTermInfoBuf();
	SafeBuf *getNewTermInfoBuf();
	SafeBuf *getMatchingQueryBuf();
	SafeBuf *getQueryLinkBuf(SafeBuf *docIdListBuf,bool doMatchingQueries);
	//SafeBuf *getMatchingQueriesScored();
	SafeBuf *getMatchingQueriesScoredForFullQuery();
	SafeBuf *getRelatedDocIds();
	SafeBuf *getRelatedDocIdsScored();
	SafeBuf *getTopMatchingQueryBuf();
	bool     addRelatedDocIdInfo ( long long docId ,
				       long queryNum ,
				       float score ,
				       long  rank ,
				       long  siteHash26 ) ;
	bool     setRelatedDocIdWeightAndRank ( class RelatedDocId *rd );
	SafeBuf *getRelatedDocIdsWithTitles();
	bool     setRelatedDocIdInfoFromMsg20Reply ( class RelatedDocId *rd ,
						     class Msg20Reply *reply );

	SafeBuf *getRelatedQueryBuf();
	//SafeBuf *getRelatedQueryLinksModPart ( long modPart );

	bool addTermsFromQuery ( char *queryStr,
				 uint8_t queryLangId,
				 long gigablastTraffic,
				 long googleTraffic,
				 long hackqoff,
				 class SafeBuf *tmpBuf ,
				 class HashTableX *scoreTable ,
				 class HashTableX *topWordsTable ,
				 float imp,
				 bool isRelatedQuery ) ;

	bool sortTermsIntoBuf ( class HashTableX *scoreTable ,
				class SafeBuf *tmpBuf ,
				class SafeBuf *missingTermBuf ) ;


	SafeBuf *getMissingTermBuf ();
	SafeBuf *getMatchingTermBuf ();
	SafeBuf *getTermIdSortedPosdbListBuf();
	SafeBuf *getWordPosSortedPosdbListBuf();
	SafeBuf *getTermListBuf(); // list of posdb termlists for caching
	SafeBuf *getWordPosInfoBuf ( ) ;


	//bool     sendBin ( long i );
	//bool     scoreDocIdRestrictedQueries(class Msg99Reply **replyPtrs,
	//				     class QueryLink  *linkPtrs,
	//				     long  numPtrs );

	// private like functions
	bool   addUniqueWordsToBuf ( SafeBuf *termInfoBuf,
				     HashTableX *dedupTable ,
				     HashTableX *filterTable ,
				     HashTableX *minCountTable ,
				     bool storeCounts,
				     Words *words ,
				     bool includeSynonyms );
	//void gotMsg99Reply ( UdpSlot *slot );
	//void gotMsg98Reply ( UdpSlot *slot );
	void gotMsg95Reply ( UdpSlot *slot );
	//void gotMsg3aReplyForMainUrl  ( );
	void gotMsg3aReplyForFullQuery( );
	//void gotMsg3aReplyForFullQueryCached ( char *cachedRec ,
	//				       class Msg99Reply *qp );
	//void gotMsg3aReplyForRelQuery ( class Msg3a *msg3a );
	void gotMsg3fReply ( class Bin *bin );
	//void pumpSocketWriteBuf ( );
	//HashTableX *getMatchingQueryHashTable();
	HashTableX *getMatchingQueryOffsetTable();

	long getNumInsertableTerms ( );
	class SafeBuf *getInsertableTerms ( );
	class SafeBuf *getScoredInsertableTerms ( );
	//class SafeBuf *getInsertableWordFreqInfoBuf ();
	bool processMsg95Replies();
	void setWordPosInfosTrafficGain ( class InsertableTerm *it );
	long getTrafficGain( class QueryChange *qc ) ;
	// print in xml
	bool printScoredInsertableTerms ( SafeBuf *sbuf ) ;


	HashTableX m_tidTable32;
	//long *m_twids;
	//long  m_numTwids;
	SafeBuf m_termId32Buf;
	SafeBuf m_termInfoBuf;
	SafeBuf m_newTermInfoBuf;
	//long  m_maxQueries;
	//long  m_maxRelatedQueries;
	//long  m_maxRelatedUrls;
	//long  m_numMsg99Requests;
	//long  m_numMsg98Requests;
	//long  m_numMsg99Replies;
	//long  m_numMsg98Replies;
	//char *m_msg99ReplyPtrs [MAX_HOSTS];
	//long  m_msg99ReplySizes[MAX_HOSTS];
	//long  m_msg99ReplyAlloc[MAX_HOSTS];
	//long  m_msg99HostIds   [MAX_HOSTS];
	char *m_msg95ReplyPtrs [MAX_HOSTS];
	long  m_msg95ReplySizes[MAX_HOSTS];
	//HashTableX m_queryHashTable;
	HashTableX m_queryOffsetTable;
	HashTableX m_tmpTable;
	HashTableX m_fullQueryDedup;
	//SafeBuf m_twbuf;
	//SafeBuf m_queryPtrs;
	SafeBuf m_matchingQueryBuf;
	SafeBuf m_matchingQueryStringBuf;
	SafeBuf m_relatedQueryBuf;
	SafeBuf m_relatedQueryStringBuf;
	SafeBuf m_docIdListBuf;
	SafeBuf m_queryOffsets;
	SafeBuf m_extraQueryBuf;
	//SafeBuf m_socketWriteBuf;
	SafeBuf m_relatedDocIdBuf;
	SafeBuf m_relatedTitleBuf;
	SafeBuf m_commonQueryNumBuf;
	SafeBuf m_topMatchingQueryBuf;
	HashTableX m_rdtab;

	// related query algo stuff
	SafeBuf m_queryLinkBuf;
	SafeBuf m_queryLinkStringBuf;
	char   *m_msg8eReply    [MAX_HOSTS];
	long    m_msg8eReplySize[MAX_HOSTS];
	long    m_numMsg8eRequests;
	long    m_numMsg8eReplies;
	//bool    m_launchedAll;
	long long m_tlbufTimer;

	SafeBuf m_missingTermBuf;
	SafeBuf m_matchingTermBuf;
	//SafeBuf m_queryRelBuf;
	//SafeBuf m_relPtrs;
	SafeBuf m_sortedPosdbListBuf;
	SafeBuf m_wpSortedPosdbListBuf;
	SafeBuf m_termListBuf;
	SafeBuf m_insertableTermsBuf;
	//SafeBuf m_iwfiBuf;
	SafeBuf m_wordPosInfoBuf;
	//SafeBuf m_msg20ReplyPtrBuf;
	SafeBuf m_recommendedLinksBuf;
	SafeBuf m_tmpMsg0Buf;
	SafeBuf m_msg20Array;
	SafeBuf m_newLinkerBuf;

	//Msg17  m_msg17;
	//key_t  m_cacheKey;
	//char  *m_cacheRec;
	//long   m_cacheRecSize;
	//bool   m_triedCache;

	//class TopDocIds *m_topDocIdsBuf;
	//long             m_topDocIdsBufSize;
	SafeBuf m_topDocIdsBuf;
	//class TopDocIds *m_nextAvailTopDocIds;
	//long m_nextAvailTopDocIdsOffset;

	//long    m_maxFullQueries;
	//XmlDoc *m_newxd;
	//XmlDoc *m_newxd2;
	//bool    m_newxd2Blocked;
	//HashTableX m_tmpDupTable;
	//class Msg20  *m_newMsg20;
	Msg3a  *m_msg3a;
	Query  *m_query3a;
	long m_numMsg3aRequests;
	long m_numMsg3aReplies;

	long m_numMsg3fRequests;
	long m_numMsg3fReplies;
	long m_numMsg4fRequests;
	long m_numMsg4fReplies;
	bool m_sentMsg4fRequests;
	class UdpSlot *m_savedSlot;
	long m_numMsg95Requests;
	long m_numMsg95Replies;
	long m_qcursor;
	char m_seoDebug;
	char m_progressBar;
	bool m_readFromCachedb;
	bool m_writeToCachedb;
	//bool m_setForReplyPtrs;
	//bool m_setForLinkPtrs;

	SafeBuf *getRecommendedLinksBuf ( );
	bool processLinkInfoMsg20Reply ( class Msg25 *msg25 );
	bool printRecommendedLinksBuf ( class SafeBuf *sb ) ;

	// recommendedlinksbuf vars and functions
	long m_numLinkRequestsOut;
	long m_numLinkRequestsIn;
	long m_hadLinkInfoError;
	long m_numMsg20sIn;
	long m_numMsg20sOut;
	long m_numValidMsg20s;
	long m_titleCursor;
	long m_msg20Phase;
	long m_recommendedLinkError;
	SafeBuf *lookupTitles();
	bool gotLinkerTitle ( class Msg20 *msg20 );

	// 1 *current* bin per host!
	//class Bin *m_currentBinPtrs[MAX_HOSTS];
	//long       m_binError;
	//long       m_msg98ReplyError;
	//long       m_binErrorForReplyPtrs;
	//long       m_binErrorForLinkPtrs;
	HashTableX m_qstringTable;

	// flow flags
	bool m_printedQueries;
	bool m_printedRelatedDocIds;
	bool m_printedRelatedQueries;
	bool m_printedScoredInsertableTerms;
	bool m_printedRecommendedLinks;
	bool m_loggedMsg3;
	long long m_lastPrintedDocId;
	//bool m_docIndexed;
	//bool m_sentMsg99Requests;
	bool m_didSet3;
	//bool m_didSet3b;
	bool m_registeredSocketCallback;
	// the caller's socket that the xml reply is expected on
	TcpSocket *m_seoSocket;
	TcpSocket *m_hackSocket;
	bool m_doingSEO;


	// NOTE(review): presumably reports whether the requesting client has
	// gone away; the members below look like its supporting state --
	// confirm against the definition in XmlDoc.cpp
	bool clientClosedConnection ( );
	bool m_hadMatchError;
	bool m_clientClosed;
	// NOTE(review): named like a timestamp but declared bool, so any time
	// value assigned here would collapse to 0/1 -- confirm the intended
	// type against how XmlDoc.cpp reads and writes it
	bool m_lastCheckTime;
	long m_msg3aErrno ;
	bool m_computedMetaListCheckSum;

	// cachedb related args
	//bool m_seoInfoSetFromCache;
	bool m_checkedCachedb;
	bool m_processedCachedbReply;
	//bool m_storedIntoCachedb;
	RdbList m_cacheList;
	//SafeBuf m_msg99ReplyBuf;
	SafeBuf m_queryChangeBuf;
	SafeBuf m_queryLogBuf;
	//SafeBuf m_itStrBuf;
	SafeBuf m_debugScoreInfoBuf;
	SafeBuf m_origScoreInfoBuf;
	RdbList m_storeList;
	Msg1    m_msg1;
	bool    m_allHashed;
	bool checkCachedb ( );
	bool storeScoredInsertableTermsIntoCachedb ( ) ;
	bool storeRelatedQueriesIntoCachedb ( ) ;
	bool storeRelatedDocIdsIntoCachedb ( ) ;
	bool storeMatchingQueriesIntoCachedb ( ) ; // only the top 1000 or so
	bool storeMissingTermBufIntoCachedb ( );
	bool storeWordPosInfoBufIntoCachedb ( );
	bool storeRecommendedLinksBuf ( );

	// cursors
	long    m_socketWriteBufSent;
	long    m_queryNum;
	long    m_rdCursor;
	long    m_relatedNum;
	long    m_numRelatedAdded;

	// for getRelatedDocIdsWithTitles() launching msg20s
	long m_relatedDocIdError;
	long m_numMsg20Replies;
	long m_numMsg20Requests;
	SafeBuf m_msg20Buf;

	// this points into m_msge2
	//char *m_outlinkIsIndexedVector;
	//Msge2 m_msge2;

	bool m_doneWithAhrefs;
	bool m_useAhrefs;
	bool m_reallyInjectLinks;
	long m_downloadLevel;
	long m_numRegExs;
	//char m_isSiteRoot;
	int8_t *m_outlinkHopCountVector;
	long  m_outlinkHopCountVectorSize;
	//char m_isSpam;
	char m_isUrlBadYear;
	char m_isFiltered;
	long m_urlFilterNum;
	long m_numOutlinksAdded;
	long m_numOutlinksAddedFromSameDomain;
	long m_numOutlinksFiltered;
	long m_numOutlinksBanned;
	long m_numRedirects;
	bool m_isPageParser;
	Url m_baseUrl;
	Msg20Reply m_reply;
	Msg20Request *m_req;
	//char *m_gsbuf;
	SafeBuf m_gsbuf;
	//long  m_gsbufSize;
	//long  m_gsbufAllocSize;
	char *m_note;
	char *m_imageUrl;
	char  m_imageUrlBuf[100];
	long  m_imageUrlSize;
	MatchOffsets m_matchOffsets;
	Query m_query;
	Matches m_matches;
	// meta description buf
	long m_dbufLen;
	char m_dbuf[1024];
	Title m_title;
	Summary m_summary;
	char m_isCompromised;
	char m_isNoArchive;
	char m_isErrorPage;
	char m_isHijacked;
	//char m_isVisible;
	//char m_dmozBuf[12000];
	SafeBuf m_dmozBuf;
	long m_numDmozEntries;

	// stuff
	char *m_statusMsg;
	Msg4  m_msg4;
	Msg8b m_msg8b;
	bool  m_incCount;
	bool  m_decCount;

	bool  m_deleteFromIndex;

	// ptrs to stuff
	char *m_titleRec;
	long  m_titleRecSize;
	bool  m_freeTitleRec;
	long  m_titleRecAllocSize;
	key_t m_titleRecKey;

	// for isDupOfUs()
	char *m_dupTrPtr;
	long  m_dupTrSize;

	// parse these out of spider rec
	/*
	long  m_retryNum                ;
	long  m_spiderRecPriority       ;
	bool  m_spiderRecIsNew          ;
	long  m_spiderRecSiteNumInlinks ;
	long  m_spiderRecRetryCount     ;
	long  m_spiderRecHopCount       ;
	key_t m_spiderRecKey            ;
	bool  m_spiderRecForced         ;
	long  m_spiderRecTime           ;
	long  m_srDataSize ;
	char  m_srData [ MAX_SPIDERREC_SIZE ];
	*/

	key_t     m_doledbKey;
	SpiderRequest m_oldsr;
	SpiderReply   m_newsr;

	// bool flags for what procedures we have done
	bool m_checkedUrlFilters;

	bool m_listAdded                ;
	bool m_listFlushed              ;
	bool m_check1                   ;
	bool m_check2                   ;
	bool m_prepared                 ;
	bool m_updatedCounts            ;
	bool m_updatedCounts2           ;
	//bool m_updatedTagdb1            ;
	//bool m_updatedTagdb2            ;
	//bool m_updatedTagdb3            ;
	//bool m_updatedTagdb4            ;
	//bool m_updatedTagdb5            ;
	bool m_copied1                  ;
	bool m_updatingSiteLinkInfoTags ;
	bool m_addressSetCalled         ;

	//bool m_calledMsg22a             ;
	//bool m_calledMsg22b             ;
	//bool m_calledMsg22c             ;
	// NOTE(review): unlike the bool m_called* flags around it this one is
	// a long long, so it presumably holds a value rather than a plain
	// yes/no -- confirm what is stored here
	long long m_calledMsg22d             ;
	bool m_didDelay                 ;
	bool m_didDelayUnregister       ;
	bool m_calledMsg22e             ;
	bool m_calledMsg22f             ;
	bool m_calledMsg25              ;
	bool m_calledMsg25b             ;
	bool m_calledMsg8b              ;
	bool m_calledMsg40              ;
	bool m_calledSections           ;
	bool m_firstEntry               ;
	bool m_firstEntry2              ;
	bool m_launchedSpecialMsg8a     ;
	bool m_launchedMsg8a2           ;
	bool m_loaded                   ;

	// used for getHasContactInfo()
	bool m_processed0               ;

	// a lock to prevent infinite loops
	//bool m_checkForRedir            ;

	bool m_processedLang            ;

	bool m_doingConsistencyCheck ;

	long    m_langIdScore;
	//long    m_rootLangIdScore;
	//uint8_t m_rootLangId;

	// used for getting contact info
	//bool m_triedRoot                ;
	//long m_winner                   ;

	long m_dist;

	// the tags in this tagRec are just contact info based tags and
	// created in the addContactInfo() function. also, in that same
	// function we add/sub the tags in m_citr to the m_newTagRec tag rec.
	//TagRec m_citr ;

	char m_emailBuf[EMAILBUFSIZE];
	long m_numOfficialEmails;

	// used to store a \0 list of "titles" of the root page so we can
	// see which if any are the venue name, and thus match that to
	// addresses of the venue on the site, and we can use those addresses
	// as default venue addresses when no venues are listed on a page
	// on that site.
	char   m_rootTitleBuf[ROOT_TITLE_BUF_MAX];
	long   m_rootTitleBufSize;

	// . this is filtered
	// . certain punct is replaced with \0
	char   m_filteredRootTitleBuf[ROOT_TITLE_BUF_MAX];
	long   m_filteredRootTitleBufSize;

	// like m_rootTitleBuf but for the current page
	char   m_titleBuf[ROOT_TITLE_BUF_MAX];
	long   m_titleBufSize;


	bool m_setTr                    ;
	//bool m_checkedRobots            ;
	bool m_triedTagRec              ;
	bool m_didGatewayPage           ;
	bool m_didQuickDupCheck         ;

	void (* m_masterLoop) ( void *state );
	void  * m_masterState;

	void (* m_callback1) ( void *state );
	bool (* m_callback2) ( void *state );
	void  *m_state;


	//void (* m_injectionCallback) ( void *state );
	//void   *m_injectionState;

	// flags for spider
	//bool m_isAddUrl;
	//bool m_forceDelete;
	bool m_didDelete;

	// this is non-zero if we decided not to index the doc
	long m_indexCode;

	// the spider priority
	long m_priority;

	// the download error, like ETIMEDOUT, ENOROUTE, etc.
	long m_downloadStatus;

	// . when the download was completed. will be zero if no download done
	// . used to set SpiderReply::m_downloadEndTime because we need
	//   high resolution for that so we can dole out the next spiderrequest
	//   from that IP quickly if the sameipwait is like 500ms.
	long long m_downloadEndTime;

	char *m_metaListEnd;
	long  m_metaListAllocSize;
	char *m_p;
	char *m_pend;

	long  m_maxCacheAge;

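	// a list of ad ids followed by a zero entry to terminate
	// NOTE(review): this comment used to say "32-bit ints", but the array
	// is declared as long long -- confirm the intended element width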
	long long m_adIds [ XD_MAX_AD_IDS ];
	//char *m_adVector;// [XMLDOC_MAX_AD_IDS];
	//long  m_adVectorSize;

	char     *m_wikiqbuf;
	long      m_wikiqbufSize;
	long long m_wikiDocIds [ MAX_WIKI_DOCIDS ];
	rscore_t  m_wikiScores [ MAX_WIKI_DOCIDS ];

	bool      m_registeredSleepCallback;
	bool      m_addedNegativeDoledbRec;

	bool          m_hashedTitle;
	bool          m_hashedMetas;

	long          m_niceness;

	bool m_usePosdb     ;
	//bool m_useDatedb    ;
	bool m_useClusterdb ;
	bool m_useLinkdb    ;
	bool m_useSpiderdb  ;
	bool m_useTitledb   ;
	bool m_useTagdb     ;
	bool m_usePlacedb   ;
	//bool m_useTimedb    ;
	bool m_useSectiondb ;
	//bool m_useRevdb     ;
	bool m_useSecondaryRdbs ;

	long          m_linkeeQualityBoost;

	SafeBuf *m_pbuf;
	// used by SpiderLoop to set m_pbuf to
	SafeBuf  m_sbuf;
	// store termlist into here if non-null
	bool     m_storeTermListInfo;
	char     m_sortTermListBy;

	SafeBuf m_sectiondbData;
	//char *m_sectiondbData;
	char *m_placedbData;
	//long  m_sectiondbDataSize;
	long  m_placedbDataSize;

	// we now have HashInfo to replace this
	//bool m_inHashNoSplit;

	// store the terms that we hash into this table so that PageParser.cpp
	// can print what was hashed and with what score and what description
	class HashTableX *m_wts;
	HashTableX m_wtsTable;
	SafeBuf m_wbuf;

	// used by addContactInfo() to keep track of what urls we have
	// processed for contact info to avoid re-processing them in the
	// recursive loop thing that we do
	//HashTableX m_pt;

	// Msg25.cpp stores its pageparser.cpp output into this one
	SafeBuf m_pageLinkBuf;
	SafeBuf m_siteLinkBuf;

	SafeBuf m_serpBuf;

	// which set() function was called above to set us?
	bool          m_setFromTitleRec;
	bool          m_setFromSpiderRec;
	bool          m_setFromUrl;
	bool          m_setFromDocId;
	bool          m_freeLinkInfo1;
	bool          m_freeLinkInfo2;

	bool          m_contentInjected;

	bool          m_recycleContent;
	//bool        m_loadFromOldTitleRec;

	char *m_rawUtf8Content;
	long  m_rawUtf8ContentSize;
	long  m_rawUtf8ContentAllocSize; // we overallocate sometimes
	char *m_expandedUtf8Content;
	long  m_expandedUtf8ContentSize;
	char *m_savedp;
	char *m_oldp;
	bool  m_didExpansion;
	SafeBuf m_esbuf;
	SafeBuf m_xbuf;

	//bool m_useIpsTxtFile ;
	//bool m_readFromTestCache ;

	// used by msg13
	class Msg13Request *m_r;

	// Msg20 uses this to stash its TcpSlot
	void *m_slot;

	char *getTestDir();

	bool m_freed;

	bool m_msg4Waiting;
	bool m_msg4Launched;

	// word spam detection
	char *getWordSpamVec ( );
	bool setSpam ( long *profile, long plen , long numWords ,
		       unsigned char *spam );
	long  getProbSpam  ( long *profile, long plen , long step );
	bool m_isRepeatSpammer;
	long m_numRepeatSpam;
	bool m_totallySpammed;

	// frag vector (repeated fragments). 0 means repeated, 1 means not.
	// vector is 1-1 with words in the document body.
	char *getFragVec ( );

	bool injectLinks  ( HashTableX *linkDedupTable ,
			    HashTableX *domDedupTable ,
			    void *finalState ,
			    void (* finalCallback)(void *));
	bool injectAhrefsLinks();
	bool doInjectLoop ( );
	void doneInjecting ( class XmlDoc *xd );
	long  m_i;
	long  m_blocked;
	HashTableX  m_domDedupTable;
	HashTableX *m_linkDedupTablePtr;
	HashTableX *m_domDedupTablePtr;
	bool m_dedupLinkDomains;
	void *m_finalState;
	void (* m_finalCallback) ( void *state );
	char  m_used[MAX_XML_DOCS];
	class XmlDoc *m_xmlDocs[MAX_XML_DOCS];
	long long m_cacheStartTime;
};

// . PageParser.cpp uses this class for printing hashed terms out by calling
//   XmlDoc::print()
// . we store TermDebugInfos into XmlDoc::m_wtsTable, a HashTableX
// . one for each term hashed
// . the key is the termId. dups are allowed
// . the term itself is stored into a separate buffer, m_wbuf, a SafeBuf, so
//   it won't disappear on us; this class only keeps m_termOff/m_termLen
//   (presumably an offset and length into that buffer -- confirm in
//   XmlDoc.cpp)
// . plain data holder with no constructor: a default-initialized instance
//   has indeterminate member values until the caller fills them in
class TermDebugInfo {
 public:
	// where the term text lives and how long it is
	long      m_termOff;
	long      m_termLen;
	//uint32_t  m_score32;
	long      m_descOff;   // the description offset
	long      m_prefixOff; // the prefix offset, like "site" or "gbadid"
	long long m_termId;
	long      m_date;
	// NOTE(review): same name as HashInfo::m_shardByTermId; presumably
	// copied from the HashInfo used when the term was hashed -- confirm
	bool      m_shardByTermId;

	//float     m_weight;
	char      m_langId;
	char      m_diversityRank;
	char      m_densityRank;
	char      m_wordSpamRank;
	char      m_hashGroup;
	long      m_wordNum;
	long      m_wordPos;
	//bool      m_isSynonym;
	// 0 = not a syn, 1 = syn from presets,2=wikt,3=generated
	char      m_synSrc;
	long long  m_langBitVec64;
	// this is copied from Weights::m_rvw or m_rvp
	//float     m_rv[MAX_RULES];
};

// . a ptr to HashInfo is passed to hashString() and hashWords()
// . it is a plain bundle of hashing parameters; the constructor gives EVERY
//   member a defined default so callers only have to set what they need
class HashInfo {
public:
	HashInfo() {
		m_tt                      = NULL;
		m_prefix                  = NULL;
		m_desc                    = NULL;
		m_date                    = 0;
		// should we do sharding based on termid and not the usual docid???
		// in general this is false, but for checksum we want to shard
		// by the checksum and not docid to avoid having to do a
		// gbchecksum:xxxxx search on ALL shards. much more efficient.
		m_shardByTermId = false;
		// this one used to be left uninitialized, so any caller that
		// did not set it explicitly read an indeterminate value
		m_linkerSiteRank          = 0;
		//m_useWeights              = false;
		m_useSynonyms             = false;
		// -1 means "no hash group chosen yet"; callers must set it
		m_hashGroup = -1;
		m_startDist = 0;
		m_siteHash32 = 0;
	}
	// table the hashed terms are added to
	class HashTableX *m_tt;
	// optional term prefix; NULL for none
	char             *m_prefix;
	// "m_desc" should detail the algorithm
	char             *m_desc;
	long              m_date;
	// see the note in the constructor: shard by termid instead of docid
	char              m_shardByTermId;
	char              m_linkerSiteRank;
	//char              m_useWeights;
	char              m_useSynonyms;
	char              m_hashGroup;
	long              m_startDist;
	long              m_siteHash32;
};


// g_tt is used for debugging
//extern class TermTable *g_tt;

// converts a 32-bit score into an 8-bit one; defined in another file
extern uint8_t score32to8 ( uint32_t score ) ;

// globals defined in another translation unit
// NOTE(review): presumably these support my_system_r() and the external
// content filters -- confirm against the definitions
extern pid_t g_pid    ;
extern long  g_ticker ;
extern long  g_filterTimeout ;

// . as recommended in the "man system" page we use our own
// . NOTE(review): the units of "timeout" and the meaning of the return value
//   are not visible from this header -- confirm against the definition
int my_system_r ( char *cmd , long timeout ) ;

// . returns 0 to 100 , the probability of spam for this subprofile
// . a "profile" is an array of all the positions of a word in the document
// . a "position" is just the word #, like first word, word #8, etc...
// . we are passed a subprofile, "profile", of the actual profile
//   because some of the document may be more "spammy" than other parts
// . "plen" is the number of positions in "profile"; we read profile[0]
//   through profile[plen-1]
// . inlined to speed things up because this may be called multiple times
//   for each word in the document
// . if "step" is 1 we look at every       word position in the profile
// . if "step" is 2 we look at every other word position
// . if "step" is 3 we look at every 3rd   word position, etc...
// . returns 0 if there is nothing to judge: plen <= 2, step < 1 or
//   step >= plen
// . all of the spacing math is fixed point, scaled by 256, to avoid floats
inline long XmlDoc::getProbSpam(long *profile, long plen, long step) {

	// a word that occurs 2 or fewer times in this subprofile is never
	// called spam
	if ( plen <= 2 ) return 0;

	// . a step below 1 is meaningless (and a step of 0 would divide by
	//   zero below)
	// . if our step is as big as the profile, or bigger, there is
	//   nothing to compare so return 0
	if ( step <= 0 || step >= plen ) return 0;

	// there's 1 less spacing than positions in the profile
	const long istop = plen - 1;

	// . add up the gap that follows every "step"th position
	// . TODO: isn't profile[i] < profile[i+1]?? if so these gaps are
	//   negative, which is harmless since only the absolute deviation
	//   from the average is used below
	long gapSum = 0;
	for ( long i = 0 ; i < istop ; i += step )
		gapSum += profile[i] - profile[i+1];

	// so we divide by plen-1, scaling by 256 first to keep precision
	const long avgSpacing = ( gapSum * 256 ) / istop;

	// . add up how far each of those gaps is from the average
	// . TODO: should we divide by istop-1 for stdDev??
	long dev = 0;
	for ( long i = 0 ; i < istop ; i += step ) {
		const long d = ( ( profile[i] - profile[i+1] ) * 256 ) - avgSpacing;
		dev += ( d < 0 ) ? -d : d;
	}
	dev /= istop;

	// . NOTE: the two loops above used to sit inside a "for j < step"
	//   loop that never used j, so the very same deviation was computed
	//   "step" times and then averaged. doing it once gives the same
	//   result.
	// . NOTE(review): that loop was presumably meant to start pass j at
	//   position j. left alone here so existing spam scores don't change.

	// if the plen is big we should expect dev to be big
	// here's some interpolation points:
	// plen >=  2  and  dev<= 0.2  --> 100%
	// plen  =  7  and  dev = 1.0  --> 100%
	// plen  = 14  and  dev = 2.0  --> 100%
	// plen  = 21  and  dev = 3.0  --> 100%
	// plen  = 7   and  dev = 2.0  -->  50%

	// NOTE: dev has been multiplied by 256 to avoid using floats.
	// .2 * 256 is 51.2 and dev is an integer, so compare against 51
	if ( dev <= 51 ) return 100;

	// dev is at least 52 here so this can not divide by zero
	long prob = ( (256*100/7) * plen ) / dev;

	if ( prob > 100 ) prob = 100;

	return prob;
}

#endif