open-source-search-engine/XmlDoc.h
Matt Wells 16f8af0d57 added awesome streaming mode support
to tcpserver.cpp for sending back
json objects as we get them from shards.
and as we get them in small pieces so we
don't go oom. made that code much simpler
and more reliable in the long run.
2014-01-17 16:26:17 -08:00

2392 lines
68 KiB
C++

// Matt Wells, copyright Apr 2009
// . 2. you can also call setTitleRec() and then call getMetaList()
// . this class is used by Repair.cpp and by Msg7 (inject) and SpiderLoop.cpp
// . Msg7 and Repair.cpp and injections can also set more than just
// m_firstUrl, like m_content, etc. or whatever elements are known, but
// they must also set the corresponding "valid" flags of those elements
// . both methods must yield exactly the same result, the same "meta list"
// . after setting the contained classes XmlDoc::setMetaList() makes the list
// of rdb records to be added to all the rdbs, this is the "meta list"
// . the meta list is made by hashing all the termIds/scores into some hash
// tables in order to accumulate scores, then the hash table are serialized
// into the "meta list"
// . the meta list is added to all rdbs with a simple call to
// Msg4::addMetaList(), which is only called by Msg14 or Repair.cpp for now
#ifndef _XMLDOC_H_
#define _XMLDOC_H_
//#include "HashTableX.h"
#include "Lang.h"
#include "Words.h"
#include "Bits.h"
#include "Pos.h"
#include "Phrases.h"
//#include "Synonyms.h"
//#include "Weights.h"
#include "Xml.h"
#include "LangList.h"
#include "SafeBuf.h"
#include "Images.h"
#include "Sections.h"
#include "Msge0.h"
#include "Msge1.h"
//#include "Msge2.h"
#include "Msg4.h"
#include "Msg8b.h"
#include "SearchInput.h"
#include "Msg40.h"
#include "Dates.h"
//#include "IndexList.h"
#include "Msg0.h"
#include "Msg22.h"
#include "Tagdb.h"
#include "Url.h"
#include "Linkdb.h"
//#include "LinkInfo.h"
//#include "Msg25.h"
#include "MsgC.h"
#include "Msg13.h"
#include "RdbList.h"
#include "SiteGetter.h"
//#include "CollectionRec.h"
#include "Msg20.h"
#include "Matches.h"
#include "Query.h"
#include "Title.h"
#include "Summary.h"
#include "Msg8b.h"
#include "Address.h"
#include "zlib.h" // Z_OK
#include "Spider.h" // SpiderRequest/SpiderReply definitions
#include "HttpMime.h" // ET_DEFLAT
#include "Msg1.h"
#include "PingServer.h"
//#include "PageCrawlBot.h" // DBA_NONE
//#define XMLDOC_MAX_AD_IDS 4
//#define XMLDOC_ADLEN 64
#define MAXFRAGWORDS 80000
#define MAX_WIKI_DOCIDS 20
#define MAX_TAG_PAIR_HASHES 100
#include "Msg40.h"
//#define SAMPLE_VECTOR_SIZE (32*4)
#define POST_VECTOR_SIZE (32*4)
#define XD_GQ_MAX_SIZE 1000
#define XD_MAX_GIGABIT_HASHES 48
#define XD_MAX_AD_IDS 5
double getTrafficPercent ( long rank ) ;
bool setLangVec ( class Words *words ,
class SafeBuf *langBuf ,
class Sections *sections ,
long niceness ) ;
char *getJSONFieldValue ( char *json, char *field , long *valueLen ) ;
bool logQueryLogs ( );
bool checkRegex ( SafeBuf *regex ,
char *target ,
bool *boolVal ,
bool *boolValValid ,
long *compileError ,
CollectionRec *cr ) ;
// Address.cpp calls this to make a vector from the "place name" for comparing
// to other places in placedb using the computeSimilarity() function. if
// we got a >75% similarity we set the AF_VERIFIED_PLACE_NAME bit in the
// Address::m_flags for that address on the web page.
long makeSimpleWordVector ( char *s, long *vbuf, long vbufSize, long niceness);
// this is used for making the event summary/title vectors as well as in
// Msg40.cpp where it merges events and does not want to repetitively display
// the same summary lines for an event
bool getWordVector ( char *s ,
HashTableX *ht ,
uint32_t *d ,
long *nd ,
long ndmax ) ;
bool getDensityRanks ( long long *wids ,
long nw,
//long wordStart ,
//long wordEnd ,
long hashGroup ,
SafeBuf *densBuf ,
Sections *sections ,
long niceness );
// diversity vector
bool getDiversityVec ( class Words *words ,
class Phrases *phrases ,
class HashTableX *countTable ,
class SafeBuf *sbWordVec ,
//class SafeBuf *sbPhraseVec ,
long niceness );
float computeSimilarity ( long *vec0 ,
long *vec1 ,
// corresponding scores vectors
long *s0 ,
long *s1 ,
class Query *q ,
long niceness ,
// only Sections::addDateBasedImpliedSections()
// sets this to true right now. if set to true
// we essentially dedup each vector, although
// the score is compounded into the remaining
// occurence. i'm not sure if that is the right
// behavior though.
bool dedupVecs = false );
bool isSimilar_sorted ( long *vec0 ,
long *vec1 ,
long nv0 , // how many longs in vec?
long nv1 , // how many longs in vec?
// they must be this similar or more to return true
long percentSimilar,
long niceness ) ;
// this is called by Msg40.cpp to set "top"
long intersectGigabits ( Msg20 **mp , // search results
long nmp ,
uint8_t langId , // searcher's langId
long maxTop ,
long docsToScan ,
long minDocCount , // must be in this # docs
class GigabitInfo *top ,
long niceness ) ;
long getDirtyPoints ( char *s , long len , long niceness , char *logUrl ) ;
bool storeTerm ( char *s ,
long slen ,
long long termId ,
class HashInfo *hi ,
long wordNum ,
long wordPos ,
char densityRank ,
char diversityRank ,
char wordSpamRank ,
char hashGroup ,
//bool isPhrase ,
class SafeBuf *wbuf ,
class HashTableX *wts ,
char synSrc ,
char langId ) ;
// tell zlib to use our malloc/free functions
int gbuncompress ( unsigned char *dest ,
unsigned long *destLen ,
unsigned char *source ,
unsigned long sourceLen );
int gbcompress ( unsigned char *dest ,
unsigned long *destLen ,
unsigned char *source ,
unsigned long sourceLen ,
long encoding = ET_DEFLATE);
int gbcompress7 ( unsigned char *dest ,
unsigned long *destLen ,
unsigned char *source ,
unsigned long sourceLen ,
bool compress = true );
int gbuncompress7 ( unsigned char *dest ,
unsigned long *destLen ,
unsigned char *source ,
unsigned long sourceLen ) ;
uint32_t score8to32 ( uint8_t score8 );
// for Msg13.cpp
char getContentTypeFromContent ( char *p , long niceness ) ;
// . for Msg13.cpp
// . *pend must equal \0
long getContentHash32Fast ( unsigned char *p ,
long plen ,
long niceness ) ;
uint16_t getCharsetFast ( class HttpMime *mime,
char *url ,
char *s ,
long slen ,
long niceness );
//#define MAX_CONTACT_OUTLINKS 5
#define MAX_CONTACT_ADDRESSES 20
#define EMAILBUFSIZE 512
#define ROOT_TITLE_BUF_MAX 512
// store the subsentences in an array now
class SubSent {
public:
sentflags_t m_subSentFlags;
//esflags_t m_esflags;
long m_senta;
long m_sentb;
long m_subEnding;
float m_titleScore;
};
#define MAX_XML_DOCS 4
class XmlDoc {
public:
// . variable size rdb records all start with key then dataSize
// . do not do that here since we compress our record's data!!
//key_t m_titleRecKey;
//long m_dataSize;
//
// BEGIN WHAT IS STORED IN THE TITLE REC (Titledb.h)
//
// headerSize = this->ptr_firstUrl - this->m_headerSize
uint16_t m_headerSize;
uint16_t m_version;
// these flags are used to indicate which ptr_ members are present:
uint32_t m_internalFlags1;
long m_ip;
long m_crawlDelay;
// . use this to quickly detect if doc is unchanged
// . we can avoid setting Xml and Words classes etc...
long m_contentHash32;
// like the above but hash of all tags in TagRec for this url
long m_tagHash32;
long m_siteNumInlinks;
long m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
long m_siteNumInlinksUniqueCBlock; // m_sitePop;
time_t m_spideredTime;
time_t m_minPubDate;
time_t m_maxPubDate;
time_t m_pubDate; // aka m_datedbDate
//time_t m_nextSpiderTime;
time_t m_firstIndexedDate;
time_t m_outlinksAddedDate;
uint16_t m_charset; // the ORIGINAL charset, we are always utf8!
uint16_t m_countryId;
//uint16_t m_reserved1;//titleWeight;
//uint16_t m_reserved2;//headerWeight;
long m_siteNumInlinksTotal;
//uint16_t m_reserved3;//urlPathWeight;
uint8_t m_metaListCheckSum8; // bring it back!!
char m_reserved3b;
uint16_t m_reserved4;//externalLinkTextWeight;
uint16_t m_reserved5;//internalLinkTextWeight;
// a new parm from reserved6. need to know the count so we can
// delete the json objects derived from this page if we want to
// delete this page. or if this page is respidered then we get the
// json objects for it, REject the old json object urls, and inject
// the new ones i guess.
uint16_t m_diffbotJSONCount;
// these do not include header/footer (dup) addresses
//int16_t m_numAddresses;
int16_t m_httpStatus; // -1 if not found (empty http reply)
//int8_t m_nextSpiderPriority;
int8_t m_hopCount;
//int8_t m_metalistChecksum; // parser checksum
//uint8_t m_numBannedOutlinks8;
uint8_t m_langId;
uint8_t m_rootLangId;
uint8_t m_contentType;
// bit flags
uint16_t m_isRSS:1;
uint16_t m_isPermalink:1;
uint16_t m_isAdult:1;
uint16_t m_wasInjected:1;//eliminateMenus:1;
uint16_t m_spiderLinks:1;
uint16_t m_isContentTruncated:1;
uint16_t m_isLinkSpam:1;
uint16_t m_hasAddress:1;
uint16_t m_hasTOD:1;
uint16_t m_hasSiteVenue:1;
uint16_t m_hasContactInfo:1;
uint16_t m_isSiteRoot:1;
uint16_t m_isDiffbotJSONObject:1;
uint16_t m_sentToDiffbot:1;
uint16_t m_gotDiffbotSuccessfulReply:1;
uint16_t m_reserved804:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
uint16_t m_reserved807:1;
uint16_t m_reserved808:1;
uint16_t m_reserved809:1;
uint16_t m_reserved810:1;
uint16_t m_reserved811:1;
uint16_t m_reserved812:1;
uint16_t m_reserved813:1;
uint16_t m_reserved814:1;
uint16_t m_reserved815:1;
uint16_t m_reserved816:1;
char *ptr_firstUrl;
char *ptr_redirUrl;
//char *ptr_tagRecData;
char *ptr_rootTitleBuf;
long *ptr_gigabitHashes;
long *ptr_gigabitScores;
long long *ptr_adVector;
long long *ptr_wikiDocIds;
rscore_t *ptr_wikiScores;
char *ptr_imageData;
long *ptr_catIds;
long *ptr_indCatIds;
char *ptr_dmozTitles;
char *ptr_dmozSumms;
char *ptr_dmozAnchors;
char *ptr_utf8Content;
//char *ptr_sectionsReply; // votes read from sectiondb - m_osvt
//char *ptr_sectionsVotes; // our local votes - m_nsvt
//char *ptr_addressReply;
char *ptr_clockCandidatesData;
// . serialization of the sectiondb and placedb lists
// . that way we can store just these and not have to store the content
// of the entire page if we do not need to
//char *ptr_sectiondbData;
//char *ptr_placedbData;
// do not let SiteGetter change this when we re-parse!
char *ptr_site;
LinkInfo *ptr_linkInfo1;
char *ptr_linkdbData;
char *ptr_sectiondbData;
char *ptr_tagRecData;
LinkInfo *ptr_linkInfo2;
long size_firstUrl;
long size_redirUrl;
//long size_tagRecData;
long size_rootTitleBuf;
long size_gigabitHashes;
long size_gigabitScores;
long size_adVector;
long size_wikiDocIds;
long size_wikiScores;
long size_imageData;
long size_catIds;
long size_indCatIds;
long size_dmozTitles;
long size_dmozSumms;
long size_dmozAnchors;
long size_utf8Content;
//long size_sectionsReply;
//long size_sectionsVotes;
//long size_addressReply;
long size_clockCandidatesData;
//long size_sectiondbData;
//long size_placedbData;
long size_site;
long size_linkInfo1;
long size_linkdbData;
long size_sectiondbData;
long size_tagRecData;
long size_linkInfo2;
char m_dummyEnd;
//
// END WHAT IS STORED IN THE TITLE REC (Titledb.h)
//
public:
// . returns false and sets errno on error
// . once you call this you can call setMetaList() below
// . sets all the contained parser classes, Words, Xml, etc. if they
// have not already been set! that way Msg16/Msg14 can set bits
// and pieces here and there and we do not reset what it's done
// . our m_xml will contain ptrs into titleRec's content, be careful
// . if titleRec gets freed we should be freed too
//bool set ( char *titleRec ,
// class SafeBuf *pbuf = NULL ,
// long niceness = MAX_NICENESS ,
// bool justSetLinks = false );
// . used by Msg16 to set the Xml to get meta redirect tag's content
// . used by Msg16 to get <META NAME="ROBOTS" CONTENT="index,follow">
// . this should be set by Msg16 so it can get meta redirect url
void print ( );
bool set1 ( char *url ,
char *coll,
SafeBuf *pbuf ,
long niceness );
bool set2 ( char *titleRec,
long maxSize,
char *coll,
class SafeBuf *p,
long niceness ,
class SpiderRequest *sreq = NULL );
// . since being set from a docId, we will load the old title rec
// and use that!
// . used by PageGet.cpp
bool set3 ( long long docId ,
char *coll ,
long niceness );
bool set4 ( class SpiderRequest *sreq ,
key_t *doledbKey ,
char *coll ,
class SafeBuf *pbuf ,
long niceness ,
char *utf8Content = NULL ,
bool deleteFromIndex = false ,
long forcedIp = 0 ,
uint8_t contentType = CT_HTML ,
time_t spideredTime = 0 ,
bool contentHasMime = false ) ;
// we now call this right away rather than at download time!
long getSpideredTime();
// another entry point, like set3() kinda
bool loadFromOldTitleRec ();
XmlDoc() ;
~XmlDoc() ;
void nukeDoc ( class XmlDoc *);
void reset ( ) ;
bool setFirstUrl ( char *u , bool addWWW , Url *base = NULL ) ;
bool setRedirUrl ( char *u , bool addWWW ) ;
void setStatus ( char *s ) ;
void setCallback ( void *state, void (*callback) (void *state) ) ;
void setCallback ( void *state, bool (*callback) (void *state) ) ;
bool addToSpiderdb ( ) ;
bool indexDoc ( );
bool indexDoc2 ( );
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;
char **getTitleRec ( ) ;
char *getIsAdult ( ) ;
long **getIndCatIds ( ) ;
long **getCatIds ( ) ;
class CatRec *getCatRec ( ) ;
long *getNumDmozEntries() ;
char **getDmozTitles ( ) ;
char **getDmozSummaries ( ) ;
char **getDmozAnchors ( ) ;
bool setDmozInfo () ;
long long **getWikiDocIds ( ) ;
void gotWikiResults ( class UdpSlot *slot );
long *getPubDate ( ) ;
//class DateParse2 *getDateParse2 ( ) ;
class Dates *getSimpleDates();
class Dates *getDates();
class HashTableX *getClockCandidatesTable();
long getUrlPubDate ( ) ;
long getOutlinkAge ( long outlinkNum ) ;
char *getIsPermalink ( ) ;
char *getIsUrlPermalinkFormat ( ) ;
char *getIsRSS ( ) ;
class Xml *getXml ( ) ;
uint8_t *getLangVector ( ) ;
uint8_t *getLangId ( ) ;
char computeLangId ( Sections *sections ,Words *words , char *lv ) ;
class Words *getWords ( ) ;
class Bits *getBits ( ) ;
class Bits *getBitsForSummary ( ) ;
class Pos *getPos ( );
class Phrases *getPhrases ( ) ;
//class Synonyms *getSynonyms ( );
class Sections *getExplicitSections ( ) ;
class Sections *getImpliedSections ( ) ;
class Sections *getSections ( ) ;
class Sections *getSectionsWithDupStats ( );
bool gotSectionStats( class Msg3a *msg3a );
class SectionStats *getSectionStats ( long long secHash64 );
class SectionVotingTable *getOldSectionVotingTable();
class SectionVotingTable *getNewSectionVotingTable();
char **getSectionsReply ( ) ;
char **getSectionsVotes ( ) ;
HashTableX *getSectionVotingTable();
long *getLinkSiteHashes ( );
class Links *getLinks ( bool doQuickSet = false ) ;
class HashTableX *getCountTable ( ) ;
bool hashString_ct ( class HashTableX *ht, char *s , long slen ) ;
uint8_t *getSummaryLangId ( ) ;
long *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
long *getSummaryVector ( ) ;
long *getPageSampleVector ( ) ;
long *getPostLinkTextVector ( long linkNode ) ;
long computeVector ( class Sections *sections, class Words *words,
uint32_t *vec , long start = 0 , long end = -1 );
float *getTagSimilarity ( class XmlDoc *xd2 ) ;
float *getGigabitSimilarity ( class XmlDoc *xd2 ) ;
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
float *getPercentChanged ( );
uint64_t *getFuzzyDupHash ( );
long long *getExactContentHash64();
class RdbList *getDupList ( ) ;
class RdbList *getLikedbListForReq ( );
class RdbList *getLikedbListForIndexing ( );
long addLikedbRecords ( bool justGetSize ) ;
char *getIsDup ( ) ;
char *isDupOfUs ( long long d ) ;
uint32_t *getGigabitVectorScorelessHash ( ) ;
long *getGigabitHashes ( );
char *getGigabitQuery ( ) ;
char *getMetaDescription( long *mdlen ) ;
char *getMetaSummary ( long *mslen ) ;
char *getMetaKeywords( long *mklen ) ;
bool addGigabits ( char *s , long long docId , uint8_t langId ) ;
bool addGigabits2 ( char *s,long slen,long long docId,uint8_t langId);
bool addGigabits ( class Words *ww ,
long long docId,
class Sections *sections,
//class Weights *we ,
uint8_t langId );
long *getSiteSpiderQuota ( ) ;
class Url *getCurrentUrl ( ) ;
class Url *getFirstUrl() ;
long long getFirstUrlHash48();
long long getFirstUrlHash64();
class Url **getRedirUrl() ;
class Url **getMetaRedirUrl() ;
long *getFirstIndexedDate ( ) ;
long *getOutlinksAddedDate ( ) ;
//long *getNumBannedOutlinks ( ) ;
uint16_t *getCountryId ( ) ;
class XmlDoc **getOldXmlDoc ( ) ;
bool isRobotsTxtFile ( char *url , long urlLen ) ;
class XmlDoc **getExtraDoc ( char *url , long maxCacheAge = 0 ) ;
bool getIsPageParser ( ) ;
class XmlDoc **getRootXmlDoc ( long maxCacheAge = 0 ) ;
//class XmlDoc **getGatewayXmlDoc ( ) ;
// . returns false if blocked, true otherwise.
// . returns true and sets g_errno on error
//bool setFromOldTitleRec ( ) ;
//RdbList *getOldMetaList ( ) ;
char **getOldTitleRec ( );
uint8_t *getRootLangId ();
//bool *updateRootLangId ( );
char **getRootTitleRec ( ) ;
//char **getContactTitleRec ( char *url ) ;
long long *getDocId ( ) ;
char *getIsIndexed ( ) ;
class TagRec *getTagRec ( ) ;
char *getHasContactInfo ( ) ;
char *getIsThisDocContacty ( );
bool *getHasTOD();
bool *getHasSiteVenue();
// non-dup/nondup addresses only
bool *getHasAddress();
class Addresses *getAddresses ( ) ;
Address **getContactAddresses ( );
long *getNumOfficialEmails ( ) ;
char *getEmailBuf ( ) ;
long *getNumContactAddresses ( );
long addEmailTags ( class Xml *xml , class Words *ww ,
class TagRec *gr , long ip ) ;
//class Url *getContactUsLink ( ) ;
//class Url *getAboutUsLink ( ) ;
long *getFirstIp ( ) ;
bool *updateFirstIp ( ) ;
long *getSiteNumInlinksUniqueIp ( ) ;
long *getSiteNumInlinksUniqueCBlock ( ) ;
long *getSiteNumInlinksTotal ( );
//long *getSiteNumInlinksFresh ( ) ;
//long *getSitePop ( ) ;
uint8_t *getSiteNumInlinks8 () ;
long *getSiteNumInlinks ( ) ;
class LinkInfo *getSiteLinkInfo() ;
long *getIp ( ) ;
long *gotIp ( bool save ) ;
bool *getIsAllowed ( ) ;
long *getFinalCrawlDelay();
long m_finalCrawlDelay;
//long getTryAgainTimeDelta() {
// if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
// return m_tryAgainTimeDelta;
//};
char *getIsWWWDup ( ) ;
class LinkInfo *getLinkInfo1 ( ) ;
class LinkInfo **getLinkInfo2 ( ) ;
char *getSite ( ) ;
void gotSite ( ) ;
long long *getSiteHash64 ( ) ;
long *getSiteHash32 ( ) ;
char **getHttpReply ( ) ;
char **getHttpReply2 ( ) ;
char **gotHttpReply ( ) ;
char *getIsContentTruncated ( );
long *getDownloadStatus ( ) ;
long long *getDownloadEndTime ( ) ;
int16_t *getHttpStatus ( );
char waitForTimeSync ( ) ;
bool m_alreadyRegistered;
class HttpMime *getMime () ;
char **getContent ( ) ;
uint8_t *getContentType ( ) ;
uint16_t *getCharset ( ) ;
char *getIsBinary ( ) ;
char **getFilteredContent ( ) ;
void filterStart_r ( bool amThread ) ;
char **getRawUtf8Content ( ) ;
char **getExpandedUtf8Content ( ) ;
char **getUtf8Content ( ) ;
long *getContentHash32 ( ) ;
long *getTagHash32 ( ) ;
long getHostHash32a ( ) ;
long getHostHash32b ( ) ;
long getDomHash32 ( );
char **getImageData();
class Images *getImages ( ) ;
int8_t *getNextSpiderPriority ( ) ;
long *getPriorityQueueNum ( ) ;
class TagRec ***getOutlinkTagRecVector () ;
char *hasNoIndexMetaTag();
char *hasFakeIpsMetaTag ( );
long **getOutlinkFirstIpVector () ;
//char **getOutlinkIsIndexedVector () ;
long *getRegExpNum ( long outlinkNum ) ;
long *getRegExpNum2 ( long outlinkNum ) ;
char *getIsSiteRoot ( ) ;
bool getIsOutlinkSiteRoot ( char *u , class TagRec *gr ) ;
int8_t *getHopCount ( ) ;
//int8_t *getOutlinkHopCountVector ( ) ;
char *getSpiderLinks ( ) ;
long *getNextSpiderTime ( ) ;
//char *getIsSpam() ;
char *getIsFiltered ();
bool getIsInjecting();
long *getSpiderPriority ( ) ;
long *getIndexCode ( ) ;
long *getIndexCode2 ( ) ;
SafeBuf *getNewTagBuf ( ) ;
char *updateTagdb ( ) ;
bool logIt ( ) ;
bool m_doConsistencyTesting;
bool doConsistencyTest ( bool forceTest ) ;
long printMetaList ( ) ;
void printMetaList ( char *metaList , char *metaListEnd ,
class SafeBuf *pbuf );
bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;
bool hashMetaList ( class HashTableX *ht ,
char *p ,
char *pend ,
bool checkList ) ;
char *getMetaList ( bool forDelete = false );
void copyFromOldDoc ( class XmlDoc *od ) ;
// we add a SpiderReply to spiderdb when done spidering, even if
// m_indexCode or g_errno was set!
class SpiderReply *getNewSpiderReply ( );
SpiderRequest **getRedirSpiderRequest ( );
SpiderRequest m_redirSpiderRequest;
SpiderRequest *m_redirSpiderRequestPtr;
void setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
class SpiderReply *srep );
char *addOutlinkSpiderRecsToMetaList ( );
//bool addTable96 ( class HashTableX *tt1 ,
// long date1 ,
// bool nosplit ) ;
long getSiteRank ();
bool addTable144 ( class HashTableX *tt1 );
bool addTable224 ( HashTableX *tt1 ) ;
//bool addTableDate ( class HashTableX *tt1 , //T<key128_t,char> *tt1
// uint64_t docId ,
// uint8_t rdbId ,
// bool nosplit ) ;
bool addTable128 ( class HashTableX *tt1 , // T <key128_t,char>*tt1
uint8_t rdbId ,
bool forDelete ) ;
bool hashNoSplit ( class HashTableX *tt ) ;
char *hashAll ( class HashTableX *table ) ;
long getBoostFromSiteNumInlinks ( long inlinks ) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashIsClean ( class HashTableX *table ) ;
bool hashZipCodes ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashDateNumbers ( class HashTableX *tt ) ;
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
bool hashNonAnomalies ) ;
bool hashLinksForLinkdb ( class HashTableX *table ) ;
bool hashNeighborhoods ( class HashTableX *table ) ;
bool hashRSSInfo ( class HashTableX *table ) ;
bool hashRSSTerm ( class HashTableX *table , bool inRSS ) ;
bool hashTitle ( class HashTableX *table );
bool hashBody2 ( class HashTableX *table );
bool hashMetaKeywords ( class HashTableX *table );
bool hashMetaSummary ( class HashTableX *table );
bool linksToGigablast ( ) ;
bool searchboxToGigablast ( ) ;
bool hashLanguage ( class HashTableX *table ) ;
bool hashCountry ( class HashTableX *table ) ;
bool hashSiteNumInlinks ( class HashTableX *table ) ;
bool hashCharset ( class HashTableX *table ) ;
bool hashTagRec ( class HashTableX *table ) ;
bool hashPermalink ( class HashTableX *table ) ;
bool hashVectors(class HashTableX *table ) ;
bool hashAds(class HashTableX *table ) ;
class Url *getBaseUrl ( ) ;
bool hashSubmitUrls ( class HashTableX *table ) ;
bool hashIsAdult ( class HashTableX *table ) ;
void set20 ( Msg20Request *req ) ;
class Msg20Reply *getMsg20Reply ( ) ;
char **getImageUrl() ;
class MatchOffsets *getMatchOffsets () ;
Query *getQuery() ;
Matches *getMatches () ;
char *getDescriptionBuf ( char *displayMetas , long *dlen ) ;
class Title *getTitle ();
class Summary *getSummary () ;
char *getHighlightedSummary ();
SafeBuf *getSampleForGigabits ( ) ;
char *getIsCompromised ( ) ;
char *getIsNoArchive ( ) ;
long *getUrlFilterNum();
//long *getDiffbotApiNum();
SafeBuf *getDiffbotApiUrl();
long long **getAdVector ( ) ;
char *getIsLinkSpam ( ) ;
char *getIsHijacked();
char *getIsErrorPage ( ) ;
char* matchErrorMsg(char* p, char* pend );
bool hashWords ( //long wordStart ,
//long wordEnd ,
class HashInfo *hi ) ;
bool hashSingleTerm ( long long termId ,
class HashInfo *hi ) ;
bool hashSingleTerm ( char *s ,
long slen ,
class HashInfo *hi );
bool hashString ( class HashTableX *ht ,
//class Weights *we ,
class Bits *bits ,
char *s ,
long slen ) ;
bool hashString ( char *s ,
long slen ,
class HashInfo *hi ) ;
bool hashWords3 ( //long wordStart ,
//long wordEnd ,
class HashInfo *hi ,
class Words *words ,
class Phrases *phrases ,
class Synonyms *synonyms ,
class Sections *sections ,
class HashTableX *countTable ,
char *fragVec ,
char *wordSpamVec ,
char *langVec ,
char docLangId , // default lang id
class SafeBuf *pbuf ,
class HashTableX *wts ,
class SafeBuf *wbuf ,
long niceness );
bool hashString3 ( char *s ,
long slen ,
class HashInfo *hi ,
class HashTableX *countTable ,
class SafeBuf *pbuf ,
class HashTableX *wts ,
class SafeBuf *wbuf ,
long version ,
long siteNumInlinks ,
long niceness );
bool hashNumber ( char *beginBuf ,
char *buf ,
long bufLen ,
class HashInfo *hi ) ;
bool hashNumber2 ( float f ,
class HashInfo *hi ,
char *gbsortByStr ) ;
// print out for PageTitledb.cpp and PageParser.cpp
bool printDoc ( class SafeBuf *pbuf );
bool printMenu ( class SafeBuf *pbuf );
bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ;
bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ;
bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr );
bool printSiteInlinks ( class SafeBuf *sb , HttpRequest *hr );
bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr );
bool printTermList ( class SafeBuf *sb , HttpRequest *hr );
bool printSpiderStats ( class SafeBuf *sb , HttpRequest *hr );
bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr );
bool printSerpFiltered ( class Section *sx , char *tagName ) ;
char **getTitleBuf ( );
char **getRootTitleBuf ( );
char **getFilteredRootTitleBuf ( );
// funcs that update our tagdb tagrec, m_tagRec, and also update tagdb
bool *updateVenueAddresses ( );
// called by msg0 handler to add posdb termlists into g_termListCache
// for faster seo pipeline
bool cacheTermLists();
public:
// stuff set from the key of the titleRec, above the compression area
//key_t m_key;
long long m_docId;
char *m_ubuf;
long m_ubufSize;
long m_ubufAlloc;
// does this page link to gigablast, or has a search form to it?
//bool linksToGigablast();
//bool searchboxToGigablast();
// private:
// we we started spidering it, in milliseconds since the epoch
long long m_startTime;
// when set() was called by Msg20.cpp so we can time how long it took
// to generate the summary
long long m_setTime;
long long m_cpuSummaryStartTime;
// timers
long long m_beginSEOTime;
long long m_beginTimeAllMatch;
long long m_beginTimeMatchUrl;
long long m_beginTimeFullQueries;
long long m_beginTimeLinks;
//long long m_beginMsg98s;
long long m_beginRelatedQueries;
long long m_beginMsg95s;
// . these should all be set using set*() function calls so their
// individual validity flags can bet set to true, and successive
// calls to their corresponding get*() functions will not core
// . these particular guys are set immediately on set(char *titleRec)
Url m_redirUrl;
Url *m_redirUrlPtr;
Url m_metaRedirUrl;
Url *m_metaRedirUrlPtr;
long m_redirError;
char m_allowSimplifiedRedirs;
Url m_firstUrl;
long long m_firstUrlHash48;
long long m_firstUrlHash64;
Url m_currentUrl;
//char *m_coll;
//char m_collBuf[MAX_COLL_LEN+1]; // include \0
CollectionRec *m_lastcr;
collnum_t m_collnum;
long m_lastCollRecResetCount;
class CollectionRec *getCollRec ( ) ;
bool setCollNum ( char *coll ) ;
char *m_content;
long m_contentLen;
char *m_metaList;
long m_metaListSize;
SafeBuf m_metaList2;
// . same thing, a little more complicated
// . these classes are only set on demand
Xml m_xml;
Links m_links;
Words m_words;
Bits m_bits;
Bits m_bits2;
Pos m_pos;
Phrases m_phrases;
//Synonyms m_synonyms;
SafeBuf m_synBuf;
//Weights m_weights;
Sections m_sections;
Section *m_si;
//Section *m_nextSection;
//Section *m_lastSection;
long m_msg3aRequestsOut;
long m_msg3aRequestsIn;
char *m_queryBuf;
Msg39Request *m_msg39RequestArray;
SafeBuf m_msg3aBuf;
Msg3a *m_msg3aArray;
char *m_inUse;
Query *m_queryArray;
long long *m_secHash64Array;
bool m_gotDupStats;
//long m_secHash64;
//Query m_q4;
//Msg3a m_msg3a;
//Msg39Request m_r39;
Msg39Request m_mr2;
HashTableX m_sectionStatsTable;
//char m_sectionHashQueryBuf[128];
// also set in getSections()
long m_maxVotesForDup;
// . for rebuild logging of what's changed
// . Repair.cpp sets these based on titlerec
char m_logLangId;
long m_logSiteNumInlinks;
SectionVotingTable m_nsvt;
SectionVotingTable m_osvt;
long m_numSectiondbReads;
long m_numSectiondbNeeds;
key128_t m_sectiondbStartKey;
RdbList m_secdbList;
long m_sectiondbRecall;
SafeBuf m_tmpBuf3;
//HashTableX m_rvt;
//Msg17 m_msg17;
//char *m_cachedRootVoteRec;
//long m_cachedRootVoteRecSize;
//bool m_triedVoteCache;
//bool m_storedVoteCache;
//SafeBuf m_cacheRecBuf;
HashTableX m_turkVotingTable;
HashTableX m_turkBitsTable;
uint32_t m_confirmedTitleContentHash ;
uint32_t m_confirmedVenueContentHash ;
uint32_t m_confirmedTitleTagHash ;
uint32_t m_confirmedVenueTagHash ;
// turk voting tag rec
TagRec m_vtr;
// tagrec of banned turks
TagRec m_bannedTurkRec;
// and the table of the hashed banned turk users
HashTableX m_turkBanTable;
// used for displaying turk votes...
HashTableX m_vctab;
HashTableX m_vcduptab;
Images m_images;
HashTableX m_countTable;
HttpMime m_mime;
TagRec m_tagRec;
SafeBuf m_tagRecBuf;
// copy of m_oldTagRec but with our modifications, if any
//TagRec m_newTagRec;
SafeBuf m_newTagBuf;
SafeBuf m_fragBuf;
SafeBuf m_wordSpamBuf;
SafeBuf m_finalSummaryBuf;
// this one is initially the same as m_tagRec, but we do not modify it
// so that Address.cpp can reference into its buffer, m_buf, without
// fear of getting the buffer overwritten by crap
//TagRec m_savedTagRec1;
//char *m_sampleVector ;
uint32_t m_tagPairHash;
long m_firstIp;
class SafeBuf *m_savedSb;
class HttpRequest *m_savedHr;
// validity flags. on reset() all these are set to false.
char m_VALIDSTART;
// DO NOT add validity flags above this line!
char m_metaListValid;
//char m_docQualityValid;
char m_siteValid;
char m_startTimeValid;
char m_currentUrlValid;
char m_firstUrlValid;
char m_firstUrlHash48Valid;
char m_firstUrlHash64Valid;
char m_lastUrlValid;
char m_docIdValid;
//char m_collValid;
char m_tagRecValid;
char m_robotsTxtLenValid;
char m_tagRecDataValid;
char m_newTagBufValid;
char m_rootTitleBufValid;
char m_filteredRootTitleBufValid;
char m_titleBufValid;
char m_fragBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
char m_matchingQueryBufValid;
char m_relatedQueryBufValid;
char m_queryLinkBufValid;
char m_redirSpiderRequestValid;
//char m_queryPtrsValid;
char m_queryOffsetsValid;
//char m_queryPtrsSortedValid;
char m_queryPtrsWholeValid;
char m_relatedDocIdBufValid;
char m_topMatchingQueryBufValid;
char m_relatedDocIdsScoredBufValid;
char m_relatedDocIdsWithTitlesValid;
char m_relatedTitleBufValid;
//char m_queryLinkBufValid;
char m_missingTermBufValid;
char m_matchingTermBufValid;
//char m_relPtrsValid;
char m_sortedPosdbListBufValid;
char m_wpSortedPosdbListBufValid;
char m_termListBufValid;
char m_insertableTermsBufValid;
char m_scoredInsertableTermsBufValid;
//char m_iwfiBufValid; // for holding WordFreqInfo instances
char m_wordPosInfoBufValid;
char m_recommendedLinksBufValid;
char m_tempMsg25PageValid;
char m_tempMsg25SiteValid;
//char m_queryHashTableValid;
char m_queryOffsetTableValid;
//char m_socketWriteBufValid;
//char m_numBannedOutlinksValid;
char m_hopCountValid;
char m_isInjectingValid;
char m_metaListCheckSum8Valid;
char m_contentValid;
char m_filteredContentValid;
char m_charsetValid;
char m_langVectorValid;
char m_langIdValid;
char m_rootLangIdValid;
char m_datedbDateValid;
char m_isRSSValid;
char m_spiderLinksArgValid;
char m_isContentTruncatedValid;
char m_xmlValid;
char m_linksValid;
char m_wordsValid;
char m_bitsValid;
char m_bits2Valid;
char m_posValid;
char m_isUrlBadYearValid;
char m_phrasesValid;
//char m_synonymsValid;
//char m_weightsValid;
char m_sectionsValid;
char m_subSentsValid;
char m_osvtValid;
char m_nsvtValid;
//char m_rvtValid;
char m_turkVotingTableValid;
char m_turkBitsTableValid;
char m_turkBanTableValid;
char m_vctabValid;
char m_explicitSectionsValid;
char m_impliedSectionsValid;
char m_sectionVotingTableValid;
char m_imageDataValid;
char m_imagesValid;
char m_msge0Valid;
char m_msge1Valid;
//char m_msge2Valid;
//char m_sampleVectorValid;
char m_gigabitHashesValid;
char m_tagPairHashValid;
char m_oldsrValid;
char m_newsrValid;
char m_titleRecValid;
bool m_ipValid;
bool m_firstIpValid;
bool m_spideredTimeValid;
//bool m_nextSpiderTimeValid;
bool m_firstIndexedValid;
bool m_outlinksAddedDateValid;
bool m_countryIdValid;
/*
bool m_titleWeightValid;
bool m_headerWeightValid;
bool m_urlPathWeightValid;
bool m_externalLinkTextWeightValid;
bool m_internalLinkTextWeightValid;
bool m_conceptWeightValid;
*/
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_adVectorValid;
bool m_wikiDocIdsValid;
bool m_catIdsValid;
bool m_versionValid;
bool m_indCatIdsValid;
bool m_dmozTitlesValid;
bool m_dmozSummsValid;
bool m_dmozAnchorsValid;
bool m_dmozInfoValid;
bool m_rawUtf8ContentValid;
bool m_expandedUtf8ContentValid;
bool m_utf8ContentValid;
bool m_isAllowedValid;
//bool m_tryAgainTimeDeltaValid;
//bool m_eliminateMenusValid;
bool m_redirUrlValid;
bool m_metaRedirUrlValid;
bool m_statusMsgValid;
bool m_mimeValid;
bool m_pubDateValid;
bool m_hostHash32aValid;
bool m_hostHash32bValid;
bool m_indexCodeValid;
bool m_priorityValid;
bool m_downloadStatusValid;
bool m_downloadEndTimeValid;
bool m_redirErrorValid;
bool m_domHash32Valid;
bool m_contentHash32Valid;
bool m_tagHash32Valid;
bool m_linkInfo2Valid;
bool m_spiderLinksValid;
//bool m_nextSpiderPriorityValid;
bool m_firstIndexedDateValid;
bool m_isPermalinkValid;
bool m_isAdultValid;
bool m_hasAddressValid;
bool m_hasTODValid;
bool m_hasSiteVenueValid;
bool m_catRecValid;
bool m_urlPubDateValid;
bool m_isUrlPermalinkFormatValid;
bool m_percentChangedValid;
bool m_unchangedValid;
bool m_countTableValid;
bool m_summaryLangIdValid;
bool m_tagPairHashVecValid;
bool m_summaryVecValid;
bool m_titleVecValid;
bool m_pageSampleVecValid;
bool m_postVecValid;
bool m_dupListValid;
bool m_likedbListValid;
bool m_isDupValid;
bool m_gigabitVectorHashValid;
bool m_gigabitQueryValid;
bool m_metaDescValid;
bool m_metaSummaryValid;
bool m_metaKeywordsValid;
bool m_siteSpiderQuotaValid;
bool m_oldDocValid;
bool m_extraDocValid;
bool m_ahrefsDocValid;
//bool m_contactDocValid;
bool m_rootDocValid;
//bool m_gatewayDocValid;
bool m_oldMetaListValid;
bool m_oldTitleRecValid;
bool m_rootTitleRecValid;
//bool m_contactTitleRecValid;
bool m_isIndexedValid;
bool m_hasContactInfoValid;
bool m_isContactyValid;
bool m_contactInfoTagRecValid;
bool m_addressesValid;
bool m_contactAddressesValid;
bool m_emailBufValid;
//bool m_contactUsLinkValid;
//bool m_aboutUsLinkValid;
//bool m_contactLinksValid;
bool m_siteNumInlinksValid;
bool m_siteNumInlinksUniqueIpValid;//FreshValid;
bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
bool m_siteNumInlinksTotalValid;
bool m_siteNumInlinks8Valid;
bool m_siteLinkInfoValid;
bool m_isWWWDupValid;
bool m_linkInfo1Valid;
bool m_linkSiteHashesValid;
//bool m_dateParse2Valid;
bool m_simpleDatesValid;
bool m_datesValid;
bool m_sectionsReplyValid;
bool m_sectionsVotesValid;
bool m_sectiondbDataValid;
bool m_placedbDataValid;
bool m_siteHash64Valid;
bool m_siteHash32Valid;
bool m_httpReplyValid;
bool m_contentTypeValid;
bool m_isBinaryValid;
bool m_priorityQueueNumValid;
bool m_outlinkTagRecVectorValid;
bool m_outlinkIpVectorValid;
bool m_hasNoIndexMetaTagValid;
bool m_hasUseFakeIpsMetaTagValid;
bool m_outlinkIsIndexedVectorValid;
bool m_isSiteRootValid;
bool m_wasInjectedValid;
bool m_outlinkHopCountVectorValid;
//bool m_isSpamValid;
bool m_isFilteredValid;
bool m_urlFilterNumValid;
bool m_numOutlinksAddedValid;
bool m_baseUrlValid;
bool m_replyValid;
bool m_recycleDiffbotReplyValid;
bool m_diffbotReplyValid;
bool m_tokenizedDiffbotReplyValid;
//bool m_diffbotUrlCrawlPatternMatchValid;
//bool m_diffbotUrlProcessPatternMatchValid;
//bool m_diffbotPageProcessPatternMatchValid;
//bool m_useDiffbotValid;
//bool m_diffbotApiNumValid;
bool m_diffbotApiUrlValid;
bool m_crawlInfoValid;
bool m_isPageParserValid;
bool m_imageUrlValid;
bool m_matchOffsetsValid;
bool m_queryValid;
bool m_matchesValid;
bool m_dbufValid;
bool m_titleValid;
bool m_collnumValid;
//bool m_twidsValid;
bool m_termId32BufValid;
bool m_termInfoBufValid;
bool m_newTermInfoBufValid;
bool m_summaryValid;
bool m_gsbufValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;
//bool m_isVisibleValid;
bool m_clockCandidatesTableValid;
bool m_clockCandidatesDataValid;
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
bool m_isHijackedValid;
bool m_dupHashValid;
bool m_exactContentHash64Valid;
// shadows
char m_isRSS2;
char m_isPermalink2;
char m_isAdult2;
char m_spiderLinks2;
char m_isContentTruncated2;
char m_isLinkSpam2;
bool m_hasAddress2;
bool m_hasTOD2;
bool m_hasSiteVenue2;
char m_hasContactInfo2;
char m_isSiteRoot2;
// DO NOT add validity flags below this line!
char m_VALIDEND;
// more stuff
//char *m_utf8Content;
//long m_utf8ContentLen;
CatRec m_catRec;
// use this stuff for getting wiki docids that match our doc's gigabits
//Query m_wq;
//SearchInput m_si;
//Msg40 m_msg40;
//DateParse2 m_dateParse2;
Dates m_dates;
HashTableX m_clockCandidatesTable;
SafeBuf m_cctbuf;
float m_ageInDays;
long m_urlPubDate;
//long m_urlAge;
char m_isUrlPermalinkFormat;
uint8_t m_summaryLangId;
long m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
long m_tagPairHashVecSize;
long m_summaryVec [SAMPLE_VECTOR_SIZE/4];
long m_summaryVecSize;
long m_titleVec [SAMPLE_VECTOR_SIZE/4];
long m_titleVecSize;
long m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
long m_pageSampleVecSize;
long m_postVec[POST_VECTOR_SIZE/4];
long m_postVecSize;
float m_tagSimilarity;
float m_gigabitSimilarity;
float m_pageSimilarity;
float m_percentChanged;
bool m_unchanged;
// what docids are similar to us? docids are in this list
RdbList m_dupList;
RdbList m_likedbList;
uint64_t m_dupHash;
long long m_exactContentHash64;
Msg0 m_msg0;
Msg5 m_msg5;
char m_isDup;
long m_ei;
long m_lastLaunch;
Msg22Request m_msg22Request;
Msg22 m_msg22a;
Msg22 m_msg22b;
Msg22 m_msg22c;
Msg22 m_msg22d;
Msg22 m_msg22e;
Msg22 m_msg22f;
//long m_collLen;
uint32_t m_gigabitVectorHash;
char m_gigabitQuery [XD_GQ_MAX_SIZE];
long m_gigabitHashes [XD_MAX_GIGABIT_HASHES];
long m_gigabitScores [XD_MAX_GIGABIT_HASHES];
char *m_gigabitPtrs [XD_MAX_GIGABIT_HASHES];
// for debug printing really
class GigabitInfo *m_top[100];
long m_numTop;
//char m_metaDesc[1025];
//char m_metaKeywords[1025];
// these now reference directly into the html src so our
// WordPosInfo::m_wordPtr algo works in seo.cpp
char *m_metaDesc;
long m_metaDescLen;
char *m_metaSummary;
long m_metaSummaryLen;
char *m_metaKeywords;
long m_metaKeywordsLen;
long m_siteSpiderQuota;
//long m_numBannedOutlinks;
class XmlDoc *m_oldDoc;
class XmlDoc *m_extraDoc;
class XmlDoc *m_ahrefsDoc;
//class XmlDoc *m_contactDoc;
class XmlDoc *m_rootDoc;
//class XmlDoc *m_gatewayDoc;
RdbList m_oldMetaList;
char *m_oldTitleRec;
long m_oldTitleRecSize;
char *m_rootTitleRec;
long m_rootTitleRecSize;
//char *m_contactTitleRec;
//long m_contactTitleRecSize;
char m_isIndexed;
Msg8a m_msg8a;
char *m_tagdbColl;
long m_tagdbCollLen;
Addresses m_addresses;
Address *m_contactAddresses[MAX_CONTACT_ADDRESSES];
long m_numContactAddresses;
char m_isContacty;
//Url m_contactUsLink;
//Url m_aboutUsLink;
/*
char *m_contactLinks [MAX_CONTACT_OUTLINKS];
long m_contactLens [MAX_CONTACT_OUTLINKS];
long m_contactScores [MAX_CONTACT_OUTLINKS];
long m_contactFlags [MAX_CONTACT_OUTLINKS];
char m_contactProcessed [MAX_CONTACT_OUTLINKS];
char *m_contactText [MAX_CONTACT_OUTLINKS];
char *m_contactTextEnd [MAX_CONTACT_OUTLINKS];
long m_minContactScore;
long m_minContactIndex;
long m_numContactLinks;
*/
Url m_extraUrl;
//long m_siteNumInlinksFresh;
//long m_sitePop;
uint8_t m_siteNumInlinks8;
//long m_siteNumInlinks;
LinkInfo m_siteLinkInfo;
SafeBuf m_mySiteLinkInfoBuf;
SafeBuf m_myPageLinkInfoBuf;
SafeBuf m_myTempLinkInfoBuf;
char m_isInjecting;
char m_useFakeMime;
char m_useSiteLinkBuf;
char m_usePageLinkBuf;
char m_printInXml;
Msg25 m_msg25;
Msg25 *m_tempMsg25Page;
Msg25 *m_tempMsg25Site;
// for page or for site?
Msg25 *getAllInlinks ( bool forSite );
// lists from cachedb for msg25's msg20 replies serialized
RdbList m_siteReplyList;
RdbList m_pageReplyList;
bool m_checkedCachedbForSite;
bool m_checkedCachedbForPage;
bool m_triedToAddWordPosInfoToCachedb;
bool m_calledMsg25ForSite;
bool m_calledMsg25ForPage;
//void (* m_masterLoopWrapper) (void *state);
MsgC m_msgc;
bool m_isAllowed;
bool m_forwardDownloadRequest;
bool m_isChildDoc;
Msg13 m_msg13;
Msg13Request m_msg13Request;
bool m_isSpiderProxy;
// for limiting # of iframe tag expansions
long m_numExpansions;
char m_newOnly;
//long m_tryAgainTimeDelta;
//long m_sameIpWait;
//long m_sameDomainWait;
//long m_maxSpidersPerDomain;
char m_isWWWDup;
char m_calledMsg0b;
Url m_tmpUrl;
SafeBuf m_tmpsb1;
SafeBuf m_tmpsb2;
SafeBuf m_turkBuf;
SafeBuf m_linkSiteHashBuf;
SafeBuf m_linkdbDataBuf;
SafeBuf m_langVec;
Msg0 m_msg0b;
class RdbList *m_ulist;
void *m_hack;
class XmlDoc *m_hackxd;
//class LinkInfo *m_linkInfo1Ptr;
char *m_linkInfoColl;
//char m_injectedReply;
long m_minInlinkerHopCount;
//class LinkInfo *m_linkInfo2Ptr;
SiteGetter m_siteGetter;
long long m_siteHash64;
//char *m_site;
//long m_siteLen;
//Url m_siteUrl;
long m_siteHash32;
char *m_httpReply;
//char m_downloadAttempted;
char m_incrementedAttemptsCount;
char m_incrementedDownloadCount;
char m_redirectFlag;
//char m_isScraping;
//char m_throttleDownload;
char m_spamCheckDisabled;
char m_useRobotsTxt;
long m_robotsTxtLen;
long m_httpReplySize;
long m_httpReplyAllocSize;
char m_isBinary;
char *m_filteredContent;
long m_filteredContentLen;
char *m_filter;
long m_filteredContentAllocSize;
long m_filteredContentMaxSize;
char m_calledThread;
long m_errno;
//class CollectionRec *m_cr;
//long m_utf8ContentAllocSize;
long m_hostHash32a;
long m_hostHash32b;
long m_domHash32;
long m_priorityQueueNum;
// this points into m_msge0 i guess
//class TagRec **m_outlinkTagRecVector;
Msge0 m_msge0;
// this points into m_msge1 i guess
long *m_outlinkIpVector;
SafeBuf m_outlinkTagRecPtrBuf;
SafeBuf m_fakeIpBuf;
char m_hasNoIndexMetaTag;
char m_hasUseFakeIpsMetaTag;
Msge1 m_msge1;
TagRec **m_outlinkTagRecVector;
SafeBuf m_fakeTagRecPtrBuf;
TagRec m_fakeTagRec;
//
// diffbot parms for indexing diffbot's json output
//
XmlDoc *m_dx;
char *m_diffbotObj;
SafeBuf m_diffbotReply;
SafeBuf *m_tokenizedDiffbotReplyPtr;
SafeBuf m_tokenizedDiffbotReply;
long m_diffbotReplyError;
bool m_recycleDiffbotReply;
//bool m_diffbotUrlCrawlPatternMatch;
//bool m_diffbotUrlProcessPatternMatch;
//bool m_diffbotPageProcessPatternMatch;
//long m_diffbotApiNum;
//bool m_useDiffbot;
// url to access diffbot with
SafeBuf m_diffbotApiUrl;
bool *getRecycleDiffbotReply ( ) ;
SafeBuf *getTokenizedDiffbotReply ( ) ;
SafeBuf *getDiffbotReply ( ) ;
//bool doesUrlMatchDiffbotCrawlPattern() ;
//bool doesUrlMatchDiffbotProcessPattern() ;
bool doesPageContentMatchDiffbotProcessPattern() ;
char *hashJSON ( HashTableX *table );
long *nukeJSONObjects ( ) ;
long m_joc;
//EmailInfo m_emailInfo;
//
// functions and vars for the seo query matching tool
//
bool loadTitleRecFromDiskOrSpider();
//SafeBuf *getSEOQueryInfo ( );
HashTableX *getTermIdBufDedupTable32();
//long *getTopWordsVector( bool includeSynonyms );
SafeBuf *getTermId32Buf();
SafeBuf *getTermInfoBuf();
SafeBuf *getNewTermInfoBuf();
SafeBuf *getMatchingQueryBuf();
SafeBuf *getQueryLinkBuf(SafeBuf *docIdListBuf,bool doMatchingQueries);
//SafeBuf *getMatchingQueriesScored();
SafeBuf *getMatchingQueriesScoredForFullQuery();
SafeBuf *getRelatedDocIds();
SafeBuf *getRelatedDocIdsScored();
SafeBuf *getTopMatchingQueryBuf();
bool addRelatedDocIdInfo ( long long docId ,
long queryNum ,
float score ,
long rank ,
long siteHash26 ) ;
bool setRelatedDocIdWeightAndRank ( class RelatedDocId *rd );
SafeBuf *getRelatedDocIdsWithTitles();
bool setRelatedDocIdInfoFromMsg20Reply ( class RelatedDocId *rd ,
class Msg20Reply *reply );
SafeBuf *getRelatedQueryBuf();
//SafeBuf *getRelatedQueryLinksModPart ( long modPart );
bool addTermsFromQuery ( char *queryStr,
uint8_t queryLangId,
long gigablastTraffic,
long googleTraffic,
long hackqoff,
class SafeBuf *tmpBuf ,
class HashTableX *scoreTable ,
class HashTableX *topWordsTable ,
float imp,
bool isRelatedQuery ) ;
bool sortTermsIntoBuf ( class HashTableX *scoreTable ,
class SafeBuf *tmpBuf ,
class SafeBuf *missingTermBuf ) ;
SafeBuf *getMissingTermBuf ();
SafeBuf *getMatchingTermBuf ();
SafeBuf *getTermIdSortedPosdbListBuf();
SafeBuf *getWordPosSortedPosdbListBuf();
SafeBuf *getTermListBuf(); // list of posdb termlists for caching
SafeBuf *getWordPosInfoBuf ( ) ;
//bool sendBin ( long i );
//bool scoreDocIdRestrictedQueries(class Msg99Reply **replyPtrs,
// class QueryLink *linkPtrs,
// long numPtrs );
// private like functions
bool addUniqueWordsToBuf ( SafeBuf *termInfoBuf,
HashTableX *dedupTable ,
HashTableX *filterTable ,
HashTableX *minCountTable ,
bool storeCounts,
Words *words ,
bool includeSynonyms );
//void gotMsg99Reply ( UdpSlot *slot );
//void gotMsg98Reply ( UdpSlot *slot );
void gotMsg95Reply ( UdpSlot *slot );
//void gotMsg3aReplyForMainUrl ( );
void gotMsg3aReplyForFullQuery( );
//void gotMsg3aReplyForFullQueryCached ( char *cachedRec ,
// class Msg99Reply *qp );
//void gotMsg3aReplyForRelQuery ( class Msg3a *msg3a );
void gotMsg3fReply ( class Bin *bin );
//void pumpSocketWriteBuf ( );
//HashTableX *getMatchingQueryHashTable();
HashTableX *getMatchingQueryOffsetTable();
long getNumInsertableTerms ( );
class SafeBuf *getInsertableTerms ( );
class SafeBuf *getScoredInsertableTerms ( );
//class SafeBuf *getInsertableWordFreqInfoBuf ();
bool processMsg95Replies();
void setWordPosInfosTrafficGain ( class InsertableTerm *it );
long getTrafficGain( class QueryChange *qc ) ;
// print in xml
bool printScoredInsertableTerms ( SafeBuf *sbuf ) ;
HashTableX m_tidTable32;
//long *m_twids;
//long m_numTwids;
SafeBuf m_termId32Buf;
SafeBuf m_termInfoBuf;
SafeBuf m_newTermInfoBuf;
//long m_maxQueries;
//long m_maxRelatedQueries;
//long m_maxRelatedUrls;
//long m_numMsg99Requests;
//long m_numMsg98Requests;
//long m_numMsg99Replies;
//long m_numMsg98Replies;
//char *m_msg99ReplyPtrs [MAX_HOSTS];
//long m_msg99ReplySizes[MAX_HOSTS];
//long m_msg99ReplyAlloc[MAX_HOSTS];
//long m_msg99HostIds [MAX_HOSTS];
char *m_msg95ReplyPtrs [MAX_HOSTS];
long m_msg95ReplySizes[MAX_HOSTS];
//HashTableX m_queryHashTable;
HashTableX m_queryOffsetTable;
HashTableX m_tmpTable;
HashTableX m_fullQueryDedup;
//SafeBuf m_twbuf;
//SafeBuf m_queryPtrs;
SafeBuf m_matchingQueryBuf;
SafeBuf m_matchingQueryStringBuf;
SafeBuf m_relatedQueryBuf;
SafeBuf m_relatedQueryStringBuf;
SafeBuf m_docIdListBuf;
SafeBuf m_queryOffsets;
SafeBuf m_extraQueryBuf;
//SafeBuf m_socketWriteBuf;
SafeBuf m_relatedDocIdBuf;
SafeBuf m_relatedTitleBuf;
SafeBuf m_commonQueryNumBuf;
SafeBuf m_topMatchingQueryBuf;
HashTableX m_rdtab;
// related query algo stuff
SafeBuf m_queryLinkBuf;
SafeBuf m_queryLinkStringBuf;
char *m_msg8eReply [MAX_HOSTS];
long m_msg8eReplySize[MAX_HOSTS];
long m_numMsg8eRequests;
long m_numMsg8eReplies;
//bool m_launchedAll;
long long m_tlbufTimer;
SafeBuf m_missingTermBuf;
SafeBuf m_matchingTermBuf;
//SafeBuf m_queryRelBuf;
//SafeBuf m_relPtrs;
SafeBuf m_sortedPosdbListBuf;
SafeBuf m_wpSortedPosdbListBuf;
SafeBuf m_termListBuf;
SafeBuf m_insertableTermsBuf;
//SafeBuf m_iwfiBuf;
SafeBuf m_wordPosInfoBuf;
//SafeBuf m_msg20ReplyPtrBuf;
SafeBuf m_recommendedLinksBuf;
SafeBuf m_tmpMsg0Buf;
SafeBuf m_msg20Array;
SafeBuf m_newLinkerBuf;
//Msg17 m_msg17;
//key_t m_cacheKey;
//char *m_cacheRec;
//long m_cacheRecSize;
//bool m_triedCache;
//class TopDocIds *m_topDocIdsBuf;
//long m_topDocIdsBufSize;
SafeBuf m_topDocIdsBuf;
//class TopDocIds *m_nextAvailTopDocIds;
//long m_nextAvailTopDocIdsOffset;
//long m_maxFullQueries;
//XmlDoc *m_newxd;
//XmlDoc *m_newxd2;
//bool m_newxd2Blocked;
//HashTableX m_tmpDupTable;
//class Msg20 *m_newMsg20;
Msg3a *m_msg3a;
Query *m_query3a;
long m_numMsg3aRequests;
long m_numMsg3aReplies;
long m_numMsg3fRequests;
long m_numMsg3fReplies;
long m_numMsg4fRequests;
long m_numMsg4fReplies;
bool m_sentMsg4fRequests;
class UdpSlot *m_savedSlot;
long m_numMsg95Requests;
long m_numMsg95Replies;
long m_qcursor;
char m_seoDebug;
char m_progressBar;
bool m_readFromCachedb;
bool m_writeToCachedb;
//bool m_setForReplyPtrs;
//bool m_setForLinkPtrs;
SafeBuf *getRecommendedLinksBuf ( );
bool processLinkInfoMsg20Reply ( class Msg25 *msg25 );
bool printRecommendedLinksBuf ( class SafeBuf *sb ) ;
// recommendedlinksbuf vars and functions
long m_numLinkRequestsOut;
long m_numLinkRequestsIn;
long m_hadLinkInfoError;
long m_numMsg20sIn;
long m_numMsg20sOut;
long m_numValidMsg20s;
long m_titleCursor;
long m_msg20Phase;
long m_recommendedLinkError;
SafeBuf *lookupTitles();
bool gotLinkerTitle ( class Msg20 *msg20 );
// 1 *current* bin per host!
//class Bin *m_currentBinPtrs[MAX_HOSTS];
//long m_binError;
//long m_msg98ReplyError;
//long m_binErrorForReplyPtrs;
//long m_binErrorForLinkPtrs;
HashTableX m_qstringTable;
// flow flags
bool m_printedQueries;
bool m_printedRelatedDocIds;
bool m_printedRelatedQueries;
bool m_printedScoredInsertableTerms;
bool m_printedRecommendedLinks;
bool m_loggedMsg3;
long long m_lastPrintedDocId;
//bool m_docIndexed;
//bool m_sentMsg99Requests;
bool m_didSet3;
//bool m_didSet3b;
bool m_registeredSocketCallback;
// the caller's socket the expect the xml reply on
TcpSocket *m_seoSocket;
TcpSocket *m_hackSocket;
bool m_doingSEO;
bool clientClosedConnection ( );
bool m_hadMatchError;
bool m_clientClosed;
bool m_lastCheckTime;
long m_msg3aErrno ;
bool m_computedMetaListCheckSum;
// cachedb related args
//bool m_seoInfoSetFromCache;
bool m_checkedCachedb;
bool m_processedCachedbReply;
//bool m_storedIntoCachedb;
RdbList m_cacheList;
//SafeBuf m_msg99ReplyBuf;
SafeBuf m_queryChangeBuf;
SafeBuf m_queryLogBuf;
//SafeBuf m_itStrBuf;
SafeBuf m_debugScoreInfoBuf;
SafeBuf m_origScoreInfoBuf;
RdbList m_storeList;
Msg1 m_msg1;
bool m_allHashed;
bool checkCachedb ( );
bool storeScoredInsertableTermsIntoCachedb ( ) ;
bool storeRelatedQueriesIntoCachedb ( ) ;
bool storeRelatedDocIdsIntoCachedb ( ) ;
bool storeMatchingQueriesIntoCachedb ( ) ; // only the top 1000 or so
bool storeMissingTermBufIntoCachedb ( );
bool storeWordPosInfoBufIntoCachedb ( );
bool storeRecommendedLinksBuf ( );
// cursors
long m_socketWriteBufSent;
long m_queryNum;
long m_rdCursor;
long m_relatedNum;
long m_numRelatedAdded;
// for getRelatedDocIdsWithTitles() launching msg20s
long m_relatedDocIdError;
long m_numMsg20Replies;
long m_numMsg20Requests;
SafeBuf m_msg20Buf;
// this points into m_msge2
//char *m_outlinkIsIndexedVector;
//Msge2 m_msge2;
bool m_doneWithAhrefs;
bool m_useAhrefs;
bool m_reallyInjectLinks;
long m_downloadLevel;
long m_numRegExs;
//char m_isSiteRoot;
int8_t *m_outlinkHopCountVector;
long m_outlinkHopCountVectorSize;
//char m_isSpam;
char m_isUrlBadYear;
char m_isFiltered;
long m_urlFilterNum;
long m_numOutlinksAdded;
long m_numOutlinksAddedFromSameDomain;
long m_numOutlinksFiltered;
long m_numOutlinksBanned;
long m_numRedirects;
bool m_isPageParser;
Url m_baseUrl;
Msg20Reply m_reply;
Msg20Request *m_req;
//char *m_gsbuf;
SafeBuf m_gsbuf;
//long m_gsbufSize;
//long m_gsbufAllocSize;
char *m_note;
char *m_imageUrl;
char m_imageUrlBuf[100];
long m_imageUrlSize;
MatchOffsets m_matchOffsets;
Query m_query;
Matches m_matches;
// meta description buf
long m_dbufLen;
char m_dbuf[1024];
Title m_title;
Summary m_summary;
char m_isCompromised;
char m_isNoArchive;
char m_isErrorPage;
char m_isHijacked;
//char m_isVisible;
//char m_dmozBuf[12000];
SafeBuf m_dmozBuf;
long m_numDmozEntries;
// stuff
char *m_statusMsg;
Msg4 m_msg4;
Msg8b m_msg8b;
bool m_incCount;
bool m_decCount;
bool m_deleteFromIndex;
// ptrs to stuff
char *m_titleRec;
long m_titleRecSize;
bool m_freeTitleRec;
long m_titleRecAllocSize;
key_t m_titleRecKey;
// for isDupOfUs()
char *m_dupTrPtr;
long m_dupTrSize;
// parse these out of spider rec
/*
long m_retryNum ;
long m_spiderRecPriority ;
bool m_spiderRecIsNew ;
long m_spiderRecSiteNumInlinks ;
long m_spiderRecRetryCount ;
long m_spiderRecHopCount ;
key_t m_spiderRecKey ;
bool m_spiderRecForced ;
long m_spiderRecTime ;
long m_srDataSize ;
char m_srData [ MAX_SPIDERREC_SIZE ];
*/
key_t m_doledbKey;
SpiderRequest m_oldsr;
SpiderReply m_newsr;
// bool flags for what procedures we have done
bool m_checkedUrlFilters;
bool m_listAdded ;
bool m_listFlushed ;
bool m_check1 ;
bool m_check2 ;
bool m_prepared ;
bool m_updatedCounts ;
bool m_updatedCounts2 ;
//bool m_updatedTagdb1 ;
//bool m_updatedTagdb2 ;
//bool m_updatedTagdb3 ;
//bool m_updatedTagdb4 ;
//bool m_updatedTagdb5 ;
bool m_copied1 ;
bool m_updatingSiteLinkInfoTags ;
bool m_addressSetCalled ;
//bool m_calledMsg22a ;
//bool m_calledMsg22b ;
//bool m_calledMsg22c ;
long long m_calledMsg22d ;
bool m_didDelay ;
bool m_didDelayUnregister ;
bool m_calledMsg22e ;
bool m_calledMsg22f ;
bool m_calledMsg25 ;
bool m_calledMsg25b ;
bool m_calledMsg8b ;
bool m_calledMsg40 ;
bool m_calledSections ;
bool m_firstEntry ;
bool m_firstEntry2 ;
bool m_launchedSpecialMsg8a ;
bool m_launchedMsg8a2 ;
bool m_loaded ;
// used for getHasContactInfo()
bool m_processed0 ;
// a lock to prevent infinite loops
//bool m_checkForRedir ;
bool m_processedLang ;
bool m_doingConsistencyCheck ;
long m_langIdScore;
//long m_rootLangIdScore;
//uint8_t m_rootLangId;
// used for getting contact info
//bool m_triedRoot ;
//long m_winner ;
long m_dist;
// the tags in this tagRec are just contact info based tags and
// created in the addContactInfo() function. also, in that same
// function we add/sub the tags in m_citr to the m_newTagRec tag rec.
//TagRec m_citr ;
char m_emailBuf[EMAILBUFSIZE];
long m_numOfficialEmails;
// use to store a \0 list of "titles" of the root page so we can
// see which if any are the venue name, and thus match that to
// addresses of the venue on the site, and we can use those addresses
// as default venue addresses when no venues are listed on a page
// on that site.
char m_rootTitleBuf[ROOT_TITLE_BUF_MAX];
long m_rootTitleBufSize;
// . this is filtered
// . certain punct is replaced with \0
char m_filteredRootTitleBuf[ROOT_TITLE_BUF_MAX];
long m_filteredRootTitleBufSize;
// like m_rootTitleBuf but for the current page
char m_titleBuf[ROOT_TITLE_BUF_MAX];
long m_titleBufSize;
bool m_setTr ;
//bool m_checkedRobots ;
bool m_triedTagRec ;
bool m_didGatewayPage ;
bool m_didQuickDupCheck ;
void (* m_masterLoop) ( void *state );
void * m_masterState;
void (* m_callback1) ( void *state );
bool (* m_callback2) ( void *state );
void *m_state;
//void (* m_injectionCallback) ( void *state );
//void *m_injectionState;
// flags for spider
//bool m_isAddUrl;
//bool m_forceDelete;
bool m_didDelete;
// this is non-zero if we decided not to index the doc
long m_indexCode;
// the spider priority
long m_priority;
// the download error, like ETIMEDOUT, ENOROUTE, etc.
long m_downloadStatus;
// . when the download was completed. will be zero if no download done
// . used to set SpiderReply::m_downloadEndTime because we need
// high resolution for that so we can dole out the next spiderrequest
// from that IP quickly if the sameipwait is like 500ms.
long long m_downloadEndTime;
char *m_metaListEnd;
long m_metaListAllocSize;
char *m_p;
char *m_pend;
long m_maxCacheAge;
// a list of 32-bit ints followed by a zero 32-bit int to terminate
long long m_adIds [ XD_MAX_AD_IDS ];
//char *m_adVector;// [XMLDOC_MAX_AD_IDS];
//long m_adVectorSize;
char *m_wikiqbuf;
long m_wikiqbufSize;
long long m_wikiDocIds [ MAX_WIKI_DOCIDS ];
rscore_t m_wikiScores [ MAX_WIKI_DOCIDS ];
bool m_registeredSleepCallback;
bool m_addedNegativeDoledbRec;
bool m_hashedTitle;
bool m_hashedMetas;
long m_niceness;
bool m_usePosdb ;
//bool m_useDatedb ;
bool m_useClusterdb ;
bool m_useLinkdb ;
bool m_useSpiderdb ;
bool m_useTitledb ;
bool m_useTagdb ;
bool m_usePlacedb ;
//bool m_useTimedb ;
bool m_useSectiondb ;
//bool m_useRevdb ;
bool m_useSecondaryRdbs ;
long m_linkeeQualityBoost;
SafeBuf *m_pbuf;
// used by SpiderLoop to set m_pbuf to
SafeBuf m_sbuf;
// store termlist into here if non-null
bool m_storeTermListInfo;
char m_sortTermListBy;
SafeBuf m_sectiondbData;
//char *m_sectiondbData;
char *m_placedbData;
//long m_sectiondbDataSize;
long m_placedbDataSize;
// we now have HashInfo to replace this
//bool m_inHashNoSplit;
// store the terms that we hash into this table so that PageParser.cpp
// can print what was hashed and with what score and what description
class HashTableX *m_wts;
HashTableX m_wtsTable;
SafeBuf m_wbuf;
// used by addContactInfo() to keep track of what urls we have
// processed for contact info to avoid re-processing them in the
// recursive loop thing that we do
//HashTableX m_pt;
// Msg25.cpp stores its pageparser.cpp output into this one
SafeBuf m_pageLinkBuf;
SafeBuf m_siteLinkBuf;
SafeBuf m_serpBuf;
// which set() function was called above to set us?
bool m_setFromTitleRec;
bool m_setFromSpiderRec;
bool m_setFromUrl;
bool m_setFromDocId;
bool m_freeLinkInfo1;
bool m_freeLinkInfo2;
bool m_contentInjected;
bool m_recycleContent;
//bool m_loadFromOldTitleRec;
char *m_rawUtf8Content;
long m_rawUtf8ContentSize;
long m_rawUtf8ContentAllocSize; // we overallocate sometimes
char *m_expandedUtf8Content;
long m_expandedUtf8ContentSize;
char *m_savedp;
char *m_oldp;
bool m_didExpansion;
SafeBuf m_esbuf;
SafeBuf m_xbuf;
//bool m_useIpsTxtFile ;
//bool m_readFromTestCache ;
// used by msg13
class Msg13Request *m_r;
// Msg20 uses this to stash its TcpSlot
void *m_slot;
char *getTestDir();
bool m_freed;
bool m_msg4Waiting;
bool m_msg4Launched;
// word spam detection
char *getWordSpamVec ( );
bool setSpam ( long *profile, long plen , long numWords ,
unsigned char *spam );
long getProbSpam ( long *profile, long plen , long step );
bool m_isRepeatSpammer;
long m_numRepeatSpam;
bool m_totallySpammed;
// frag vector (repeated fragments). 0 means repeated, 1 means not.
// vector is 1-1 with words in the document body.
char *getFragVec ( );
bool injectLinks ( HashTableX *linkDedupTable ,
HashTableX *domDedupTable ,
void *finalState ,
void (* finalCallback)(void *));
bool injectAhrefsLinks();
bool doInjectLoop ( );
void doneInjecting ( class XmlDoc *xd );
long m_i;
long m_blocked;
HashTableX m_domDedupTable;
HashTableX *m_linkDedupTablePtr;
HashTableX *m_domDedupTablePtr;
bool m_dedupLinkDomains;
void *m_finalState;
void (* m_finalCallback) ( void *state );
char m_used[MAX_XML_DOCS];
class XmlDoc *m_xmlDocs[MAX_XML_DOCS];
long long m_cacheStartTime;
};
// . PageParser.cpp uses this class for printing hashed terms out by calling
// XmlDoc::print()
// . we store TermInfos into XmlDoc::m_wtsTable, a HashTableX
// . one for each term hashed
// . the key is the termId. dups are allowed
// . the term itself is stored into a separate buffer, m_wbuf, a SafeBuf, so
// that TermInfo::m_term will reference that and it won't disappear on us
class TermDebugInfo {
public:
long m_termOff;
long m_termLen;
//uint32_t m_score32;
long m_descOff; // the description offset
long m_prefixOff; // the prefix offset, like "site" or "gbadid"
long long m_termId;
long m_date;
bool m_shardByTermId;
//float m_weight;
char m_langId;
char m_diversityRank;
char m_densityRank;
char m_wordSpamRank;
char m_hashGroup;
long m_wordNum;
long m_wordPos;
//bool m_isSynonym;
// 0 = not a syn, 1 = syn from presets,2=wikt,3=generated
char m_synSrc;
long long m_langBitVec64;
// this is copied from Weights::m_rvw or m_rvp
//float m_rv[MAX_RULES];
};
// a ptr to HashInfo is passed to hashString() and hashWords()
class HashInfo {
public:
HashInfo() {
m_tt = NULL;
m_prefix = NULL;
m_desc = NULL;
m_date = 0;
// should we do sharding based on termid and not the usual docid???
// in general this is false, but for checksum we want to shard
// by the checksum and not docid to avoid having to do a
// gbchecksum:xxxxx search on ALL shards. much more efficient.
m_shardByTermId = false;
//m_useWeights = false;
m_useSynonyms = false;
m_hashGroup = -1;
m_startDist = 0;
m_siteHash32 = 0;
};
class HashTableX *m_tt;
char *m_prefix;
// "m_desc" should detail the algorithm
char *m_desc;
long m_date;
char m_shardByTermId;
char m_linkerSiteRank;
//char m_useWeights;
char m_useSynonyms;
char m_hashGroup;
long m_startDist;
long m_siteHash32;
};
// g_tt is used for debugging
//extern class TermTable *g_tt;
extern uint8_t score32to8 ( uint32_t score ) ;
extern pid_t g_pid ;
extern long g_ticker ;
extern long g_filterTimeout ;
// as recommended in the "man system" page we use our own
int my_system_r ( char *cmd , long timeout ) ;
// . returns 0 to 100 , the probability of spam for this subprofile
// . a "profile" is an array of all the positions of a word in the document
// . a "position" is just the word #, like first word, word #8, etc...
// . we are passed a subprofile, "profile", of the actual profile
// because some of the document may be more "spammy" than other parts
// . inlined to speed things up because this may be called multiple times
// for each word in the document
// . if "step" is 1 we look at every word position in the profile
// . if "step" is 2 we look at every other word position
// . if "step" is 3 we look at every 3rd word position, etc...
inline long XmlDoc::getProbSpam(long *profile, long plen, long step) {
// you can spam 2 or 1 letter words all you want to
if ( plen <= 2 ) return 0;
// if our step is bigger than the profile return 0
if ( step == plen ) return 0;
register long avgSpacing, stdDevSpacing;
long d,dev=0;
register long i;
for (long j = 0; j < step; j++) {
// find avg. of gaps between consecutive tokens in subprofile
// TODO: isn't profile[i] < profile[i+1]??
long istop = plen-1;
avgSpacing = 0;
for (i=0; i < istop; i += step )
avgSpacing += ( profile[i] - profile[i+1] );
// there's 1 less spacing than positions in the profile
// so we divide by plen-1
avgSpacing = (avgSpacing * 256) / istop;
// compute standard deviation of the gaps in this sequence
stdDevSpacing = 0;
for (i = 0 ; i < istop; i += step ) {
d = (( profile[i] - profile[i+1]) * 256 ) - avgSpacing;
if ( d < 0 ) stdDevSpacing -= d;
else stdDevSpacing += d;
}
// TODO: should we divide by istop-1 for stdDev??
stdDevSpacing /= istop;
// average of the stddevs for all sequences
dev += stdDevSpacing;
}
dev /= step;
// if the plen is big we should expect dev to be big
// here's some interpolation points:
// plen >= 2 and dev<= 0.2 --> 100%
// plen = 7 and dev = 1.0 --> 100%
// plen = 14 and dev = 2.0 --> 100%
// plen = 21 and dev = 3.0 --> 100%
// plen = 7 and dev = 2.0 --> 50%
// NOTE: dev has been multiplied by 256 to avoid using floats
if ( dev <= 51.2 ) return 100; // (.2 * 256)
long prob = ( (256*100/7) * plen ) / dev;
if (prob>100) prob=100;
return prob;
//if (prob>=0) {
// long i;
//printf("dev=%i,plen=%i,nseq=%i,prob=%i----\n",dev,plen,step,prob);
// for (i=0;i<plen;i++)
// printf("%i#",profile[i]);
// printf("\n");
//}
}
#endif