// open-source-search-engine/Msg20.h
// Ivan Skytte Jørgensen, commit 1a2693dc12:
//   Removed explicit m_buf[0] member from Msg20Request/Msg20Reply,
//   and some comment cleanup.
//
// Origin: b82536583f
// 2021-06-18 13:03:48 +10:00
//
// 822 lines, 33 KiB, C++

// Matt Wells, copyright Nov 2007
// get various information from a query and a docId, like summary, title, etc.
#ifndef _MSG20_H_
#define _MSG20_H_
#include "UdpServer.h"
#include "Hostdb.h"
#include "Multicast.h"
#include "Xml.h"
#include "Summary.h"
#include "Titledb.h"
#include "Query.h"
//#include "LinkInfo.h"
#include "Tagdb.h" // TagRec
#include "Events.h" // EventIdBits
// values for SummaryLine::m_flags
//#define SL_TRUNCATED 0x0100
//#define SL_IS_TITLE 0x0080
//#define SL_HAS_DATE 0x0040
//#define SL_HAS_QTERM 0x0020
//#define SL_BULLET 0x0001
// Version stamp that Msg20Request::reset() stores in m_version.
#define MSG20_CURRENT_VERSION     0

// Inline buffer sizes. MAX_QUERY_LEN is pretty big and a Msg40 contains 50
// or so Msg20s, so memory use matters here. The sizes used to be (500+500)
// for the request and (1*1024) for the reply; they are now a single byte
// each to see what happens when these buffers are effectively eliminated.
#define MAX_MSG20_REQUEST_SIZE    (1)
#define MSG20_MAX_REPLY_SIZE      (1)

// REQ20FLAG1_* bit masks, one distinct bit each.
#define REQ20FLAG1_USEDATELISTS   0x01
#define REQ20FLAG1_EXCLDATELIST   0x02
#define REQ20FLAG1_EXCLQTINANCH   0x04
#define REQ20FLAG1_PQRENABLED     0x08
#define REQ20FLAG1_PQRLOCENABLED  0x10

// INLINK_FLAG_* bit masks, one distinct bit each.
#define INLINK_FLAG_SPAM          0x01
#define INLINK_FLAG_HASTEXT       0x02
// Describes one Msg20 lookup: which docId to examine and what to pull out
// of it (summary, title, link text, ...).
// NOTE(review): serialize()/deserialize() are implemented outside this
// header. The member order below, including the parallel ptr_*/size_* lists,
// is presumably part of the serialized format, so it must be left as is.
class Msg20Request {
public:
	Msg20Request() { reset(); }

	// Zero the whole object, then apply the defaults assigned below.
	void reset() {
		memset(reinterpret_cast<char *>(this), 0, sizeof(*this));
		m_docId              = -1LL; // docid starts out "invalid"
		m_boolFlag           = 2;    // autodetect if query boolean
		m_version            = MSG20_CURRENT_VERSION;
		m_expected           = false;
		m_allowPunctInPhrase = true;
		m_numSummaryLines    = 2;
		m_maxNumCharsPerLine = 50;
		m_titleMaxLen        = 64;
		m_summaryMaxLen      = 512;
	}

	int32_t  getStoredSize();
	char    *serialize(int32_t *sizePtr,
			   char    *userBuf,
			   int32_t  userBufSize);
	int32_t  deserialize();

	// ---- fixed-size section ----

	char     m_version;            // assigned explicitly by reset()
	char     m_numSummaryLines;    // assigned explicitly by reset()
	char     m_expected;           // assigned explicitly by reset()
	char     m_allowPunctInPhrase; // assigned explicitly by reset()
	bool     m_getHeaderTag;
	void    *m_state;
	void    *m_state2;             // used by Msg25.cpp
	int32_t  m_j;                  // used by Msg25.cpp
	bool   (*m_callback)(void *state);
	void   (*m_callback2)(void *state);
	int64_t  m_docId;
	Hostdb  *m_hostdb;
	int32_t  m_niceness;
	char     m_boolFlag;
	int32_t  m_titleMaxLen;
	int32_t  m_summaryMaxLen;
	int32_t  m_summaryMaxNumCharsPerLine;
	int32_t  m_maxNumCharsPerLine;
	int32_t  m_bigSampleRadius;
	int32_t  m_bigSampleMaxLen;
	int32_t  m_maxCacheAge;
	int32_t  m_maxLinks;
	int32_t  m_discoveryDate;

	// Lets us remove an inlinker to a related docid if it also links to
	// the main url we are processing seo for. Set both to 0 to disregard.
	int32_t  m_ourHostHash32;
	int32_t  m_ourDomHash32;

	FacetValHash_t m_facetValHash;

	// Every ":1" member in this class is a plain-char bit-field. Whether
	// a stored 1 reads back as 1 or -1 depends on the signedness of char,
	// so test these for truth instead of comparing against 1.
	char     m_justGetFacets : 1;

	// add this many seconds to the clock to simulate event search going
	// forward or backward in time
	int32_t  m_clockOff;
	// the clock time is forced to this if "clockset" is a non-zero cgi
	// parm
	time_t   m_clockSet;
	// the same time in UTC that was used for the intersection algo
	time_t   m_nowUTC;
	int32_t  m_turkIp;
	// language the query is in (ptr_qbuf)
	uint8_t  m_langId;

	// . if not 0 then return the event from the docid with that eventId
	// . include the title and text of the event, and the address
	//   serialized using Address::serialize(), and all the start dates
	//   from now onward
	int32_t  m_eventId;

	// the numeric collection # is used now, not a collection name ptr
	collnum_t m_collnum;

	// original note: set this to true when you pass in m_eventIdBits
	// (no member of that name is declared in this header)
	char     m_getEventSummary;
	char     m_summaryMode;
	// typically 1 vote per ip or host is allowed, but buzz should allow
	// for up to 4, for better influence determination
	char     m_linksPerIpHost;
	char     m_flags;

	char     m_highlightQueryTerms    :1;
	char     m_highlightDates         :1; // for event dates
	char     m_wcache                 :1;
	char     m_getImageUrl            :1;
	char     m_ratInSummary           :1;
	char     m_countOutlinks          :1;
	char     m_considerTitlesFromBody :1;
	char     m_getSummaryVector       :1;
	char     m_showBanned             :1;
	char     m_includeCachedCopy      :1;
	char     m_getSectionVotingInfo   :1; // in JSON for now
	char     m_getMatches             :1;
	char     m_useLinkdbForInlinks    :1;
	char     m_getTermListBuf         :1;
	char     m_getOutlinks            :1;
	char     m_getTitleRec            :1; // sets ptr_tr in reply
	// NOTE(review): a one-bit field despite the count-like name
	char     m_maxInlinks             :1;
	char     m_getGigabitVector       :1;
	char     m_doLinkSpamCheck        :1;
	char     m_isLinkSpam             :1; // Msg25 uses for storage
	char     m_isSiteLinkInfo         :1; // site link info?
	char     m_isDebug                :1;
	// if true, calls Msg25 and fills in ptr_linkInfo/size_linkInfo
	char     m_computeLinkInfo        :1;
	// if true, just calls TitleRec::getLinkInfo() to set ptr_linkInfo
	char     m_getLinkInfo            :1;
	// if true, the title etc. of BAD inlinks deemed link spam is not
	// computed
	char     m_onlyNeedGoodInlinks    :1;
	// if true, sets ptr_linkText, etc.
	char     m_getLinkText            :1;
	// if true then ptr_turkForm is set to an input form for turking this
	// event summary and title
	char     m_getTurkForm            :1;
	char     m_showTurkInstructions   :1;
	char     m_isTurkSpecialQuery     :1;
	char     m_isMasterAdmin          :1;
	// . this is for buzz
	// . says to compute the <absScore2> tag in their xml feed
	// . the document receives a score of 0 if it does not match the
	//   query; kept as a binary score for now
	char     m_checkForQueryMatch     :1;

	// ---- pointer + size variable section ----

	char    *ptr_qbuf;
	char    *ptr_hqbuf;
	char    *ptr_turkUser;
	char    *ptr_ubuf;         // url buffer
	char    *ptr_rubuf;        // redirect url buffer
	char    *ptr_termFreqs;
	char    *ptr_affWeights;
	char    *ptr_linkee;       // used by Msg25 for getting link text
	char    *ptr_displayMetas;

	int32_t  size_qbuf;
	int32_t  size_hqbuf;
	int32_t  size_turkUser;
	int32_t  size_ubuf;         // url buffer
	int32_t  size_rubuf;        // redirect url buffer
	int32_t  size_termFreqs;
	int32_t  size_affWeights;
	int32_t  size_linkee;       // size includes terminating \0
	int32_t  size_displayMetas; // size includes terminating \0

	// variable data comes here
};
// One line of an event summary. The original note says
// Msg20Reply::ptr_eventSummaryLines is a list of these; that member is
// commented out in Msg20Reply in this revision of the header.
class SummaryLine {
public:
	int32_t m_totalSize;
	int32_t m_pageOff1;
	int32_t m_pageOff2;
	int32_t m_firstDatePageOff;

	// Used to tell whether two summary lines are adjacent; when they
	// are, no "..." is inserted between them when displaying.
	int32_t m_alnumPosA;
	int32_t m_alnumPosB;

	// Copied from EventDesc::m_dflags. Might also include some tags
	// added in XmlDoc::getEventSummary(), like EDF_TRUNCATED.
	int32_t m_flags;

	// Zero-length trailing array (a compiler extension); it occupies no
	// storage and marks the first byte past the fixed fields.
	// NOTE(review): presumably the line's text is stored here - confirm.
	char    m_buf[0];
};
// values for m_flags3
//#define F3_STORE_HOURS 0x01
// Result of one Msg20 lookup: fixed-size facts about the document followed
// by a ptr_*/size_* section describing variable-length buffers (title, url,
// summaries, link info, ...).
// NOTE(review): serialize()/deserialize() are implemented outside this
// header. The data member order, including the parallel ptr_*/size_* lists,
// is presumably part of the serialized format, so it is left untouched.
class Msg20Reply {
public:
	Msg20Reply();
	// free the merge buf from Msg40.cpp merging event summaries
	~Msg20Reply();
	void destructor();

	// zero ourselves out
	void reset() {
		memset(reinterpret_cast<char *>(this), 0, sizeof(*this));
	}

	// how many bytes if we had to serialize it?
	int32_t getStoredSize();
	int32_t deserialize();
	int32_t serialize(char *buf, int32_t bufSize);

	// Stubs that always yield an empty string. They hand back a writable
	// static buffer instead of the string literal "": converting a
	// literal to char* is ill-formed since C++11, and a caller writing
	// through the returned pointer would touch read-only storage.
	char *getAttendeeUrl(int32_t i) {
		(void)i;
		static char s_empty[1] = { '\0' };
		return s_empty;
	}
	char *getLikerUrl(int32_t i) {
		(void)i;
		static char s_empty[1] = { '\0' };
		return s_empty;
	}

	bool sendReply(class XmlDoc *xd);

	// After calling the clear*() routines, the next serialize() excludes
	// the strings that were "cleared". Used by Msg40 to reduce the memory
	// required for caching the Msg40, which includes an array of Msg20s.
	void clearOutlinks() {
		size_obuf            = 0;
		size_linkText        = 0;
		size_surroundingText = 0;
		size_outlinks        = 0;
	}
	void clearVectors() { size_vbuf = 0; }

	// Walks the \0-separated display metas in ptr_dbuf one entry at a
	// time (used by Msg24.cpp). "*next" is the cursor and must be set to
	// ptr_dbuf before the first call; this routine is its exclusive
	// user. Returns the current entry and stores its length in "*len",
	// or returns NULL once the cursor is NULL or at/after the buffer end.
	// NOTE(review): the length comes from gbstrlen(), so the last entry
	// is assumed to be \0-terminated inside the buffer - confirm.
	char *getNextDisplayBuf(int32_t *len, char **next) {
		char *cur = *next;
		if (!cur) return NULL;
		char *end = ptr_dbuf + size_dbuf;
		if (cur >= end) return NULL;
		*len  = gbstrlen(cur);
		*next = cur + *len + 1;
		return cur;
	}

	// ---- fixed-size section ----

	char      m_version;
	int32_t   m_ip;
	int32_t   m_firstIp;
	int32_t   m_wordPosStart;
	int64_t   m_domHash;
	int64_t   m_docId;
	int64_t   m_urlHash48;
	uint64_t  m_eventHash64;
	int32_t   m_eventId;
	uint64_t  m_eventDateHash64;
	uint32_t  m_adch32;            // event address/data content hash
	uint32_t  m_adth32;            // event address/data tag hash
	int32_t   m_firstSpidered;
	int32_t   m_lastSpidered;
	int32_t   m_lastModified;
	int32_t   m_datedbDate;
	int32_t   m_firstIndexedDate;  // for the url/document as a whole
	int32_t   m_discoveryDate;     // for the inlink in question...
	int32_t   m_numAlnumWords;
	bool      m_datedbDateIsEstimated;
	int32_t   m_errno;             // LinkInfo uses it for LinkTextRepl
	collnum_t m_collnum;           // collection # we came from
	char      m_sumFromDmoz;       // unused
	int32_t   m_hostHash;
	char      m_noArchive;
	char      m_contentType;
	char      m_siteRank;
	char      m_isBanned;
	char      m_isFiltered;
	char      m_eventExpired;
	char      m_hasLinkToOurDomOrHost;
	char      m_urlFilterNum;
	char      m_hopcount;
	char      m_recycled;
	uint8_t   m_language;
	uint8_t   m_summaryLanguage;
	uint16_t  m_country;
	uint16_t  m_computedCountry;
	int16_t   m_charset;

	// for use by caller
	class Msg20Reply *m_nextMerged;

	int32_t   m_contentLen;        // was m_docLen
	int32_t   m_contentHash32;     // for deduping diffbot json objects
				       // streaming
	int32_t   m_pageNumInlinks;
	int32_t   m_pageNumGoodInlinks;
	int32_t   m_pageNumUniqueIps;     // includes our own inlinks
	int32_t   m_pageNumUniqueCBlocks; // includes our own inlinks
	int32_t   m_pageInlinksLastUpdated;
	int32_t   m_siteNumInlinks;    // GOOD inlinks!
	int32_t   m_numOutlinks;       // replaced m_linkCount
	int32_t   m_tmp;               // used by Msg40.cpp for event merge
	uint32_t  m_tagVectorHash;     // zak's hash of html template
	uint32_t  m_gigabitVectorHash; // zak's hash of the gigabits
	uint32_t  m_eventSummaryHash;
	double    m_eventGeocoderLat;  // lat/lon of the event
	double    m_eventGeocoderLon;
	uint64_t  m_eventAddressHash64; // event address hash
	uint64_t  m_eventTitleHash64;   // event title hash
	int32_t   m_eventTitleOff;     // offset of first word in title
	evflags_t m_eventFlags;
	char      m_timeZoneOffset;    // in hours
	char      m_useDST;            // does event place use dst?
	int32_t   m_nextStart;         // next occ starts at this time_t
	int32_t   m_nextEnd;           // end - start is how long it is
	int32_t   m_prevStart;
	int32_t   m_prevEnd;
	int32_t   m_displayStartTime;  // the event times to display
	int32_t   m_displayEndTime;
	double    m_balloonLat;
	double    m_balloonLon;
	char      m_balloonLetter;

	// . hash of all the event start times from now until 60 days from now
	// . used by Msg40.cpp for merging event summaries deemed to be the
	//   same event
	uint32_t  m_timeIntervalHash;

	// these are just storage for LinkInfo::set() to use
	int32_t   m_linkTextNumWords;
	char     *m_linkTextNote;
	int32_t   m_midDomHash;        // set for m_getLinkText
	int32_t   m_adIdHash;          // set for m_getLinkText
	int32_t   m_timeLinkSpam;      // set for m_getLinkText
	void     *m_parentOwner;
	char      m_constructorId;
	char      m_inlinkWeight;      // set for m_getLinkText
	char      m_isLinkSpam;        // set for m_getLinkText
	char      m_isAnomaly;         // set for m_getLinkText
	char      m_outlinkInContent;  // set for m_getLinkText
	char      m_outlinkInComment;  // set for m_getLinkText
	char      m_hasAllQueryTerms;  // set for m_getLinkText (buzz)
	char      m_isPermalink;       // set for m_getLinkText (buzz)

	// ---- pointer + size variable section ----

	char     *ptr_tbuf;            // title buffer
	char     *ptr_htag;            // h1 tag buf
	char     *ptr_ubuf;            // url buffer
	char     *ptr_rubuf;           // redirect url buffer
	char     *ptr_displaySum;      // summary for displaying
	char     *ptr_dedupSum;        // summary for deduping
	char     *ptr_dbuf;            // display metas \0 separated
	char     *ptr_gigabitSample;
	char     *ptr_obuf;            // outlinks buf, \0 separated
	char     *ptr_mbuf;            // match offsets
	char     *ptr_vbuf;            // summary vector
	char     *ptr_tvbuf;           // title vector
	char     *ptr_gbvecbuf;        // gigabit vector
	char     *ptr_imgUrl;          // youtube/metacafe vid thumb
	char     *ptr_imgData;         // for encoded images
	char     *ptr_facetBuf;
	char     *ptr_likedbList;
	char     *ptr_matchedQueryWords;
	char     *ptr_numMatchedQueryWords;
	char     *ptr_matchedTypes;
	int32_t  *ptr_catIds;
	int32_t  *ptr_indCatIds;
	char     *ptr_dmozTitles;
	char     *ptr_dmozSumms;
	char     *ptr_dmozAnchors;
	char     *ptr_site;
	char     *ptr_gbAdIds;
	char     *ptr_summLocs;
	char     *ptr_summLocsPops;

	// . if Msg20Request::m_computeLinkInfo is true this is computed
	//   using Msg25 (fresh)
	// . if Msg20Request::m_getLinkInfo is true this is just set from the
	//   titleRec
	// . this is a serialized LinkInfo class
	char     *ptr_linkInfo;        // inlinks

	// . made using LinkInfo::set ( Msg20Reply **ptrs )
	// . this is a serialized LinkInfo class
	char     *ptr_outlinks;

	// . these are used only by Msg25 to compute LinkInfo
	// . Msg25 will call Msg20 on the docid of a potentially good inlinker
	//   instead of calling the now obsolete Msg23::getLinkText()
	int32_t  *ptr_vector1;         // set for m_getLinkText
	int32_t  *ptr_vector2;         // set for m_getLinkText
	int32_t  *ptr_vector3;         // set for m_getLinkText
	char     *ptr_linkText;        // set for m_getLinkText
	char     *ptr_surroundingText; // set for m_getLinkText
	char     *ptr_linkUrl;         // what we link to
	char     *ptr_rssItem;         // set for m_getLinkText
	char     *ptr_categories;
	char     *ptr_gigabitQuery;    // , separated list of gigabits
	int32_t  *ptr_gigabitScores;   // 1-1 with the terms in query
	char     *ptr_content;         // page content in utf8
	char     *ptr_sectionVotingInfo; // in JSON
	char     *ptr_tr;              // like just using msg22
	char     *ptr_tlistBuf;
	char     *ptr_tiBuf;           // terminfobuf
	char     *ptr_templateVector;
	char     *ptr_metadataBuf;
	char     *ptr_note;            // reason why it cannot vote

	int32_t   size_tbuf;
	int32_t   size_htag;
	int32_t   size_ubuf;
	int32_t   size_rubuf;
	int32_t   size_displaySum;
	int32_t   size_dedupSum;
	int32_t   size_dbuf;
	int32_t   size_gigabitSample;  // includes \0
	int32_t   size_obuf;
	int32_t   size_mbuf;
	int32_t   size_vbuf;
	int32_t   size_tvbuf;
	int32_t   size_gbvecbuf;
	int32_t   size_imgUrl;         // youtube/metacafe vid thumb
	int32_t   size_imgData;
	int32_t   size_facetBuf;
	int32_t   size_likedbList;
	int32_t   size_matchedQueryWords;
	int32_t   size_numMatchedQueryWords;
	int32_t   size_matchedTypes;
	int32_t   size_catIds;
	int32_t   size_indCatIds;
	int32_t   size_dmozTitles;
	int32_t   size_dmozSumms;
	int32_t   size_dmozAnchors;
	int32_t   size_site;
	int32_t   size_gbAdIds;
	int32_t   size_summLocs;
	int32_t   size_summLocsPops;
	int32_t   size_linkInfo;       // inlinks
	int32_t   size_outlinks;
	int32_t   size_vector1;
	int32_t   size_vector2;
	int32_t   size_vector3;
	int32_t   size_linkText;
	int32_t   size_surroundingText;
	int32_t   size_linkUrl;
	int32_t   size_rssItem;
	int32_t   size_categories;
	int32_t   size_gigabitQuery;
	int32_t   size_gigabitScores;
	int32_t   size_content;        // page content in utf8
	int32_t   size_sectionVotingInfo; // in json, includes \0
	int32_t   size_tr;
	int32_t   size_tlistBuf;
	int32_t   size_tiBuf;
	int32_t   size_templateVector;
	int32_t   size_metadataBuf;
	int32_t   size_note;

	// variable data comes here

	// Category id accessors. The counts are the byte sizes divided by 4;
	// the indexed getters do no bounds checking on "i".
	int32_t getNumCatIds()          { return size_catIds / 4; }
	int32_t getNumIndCatIds()       { return size_indCatIds / 4; }
	int32_t getCatId(int32_t i)     { return ptr_catIds[i]; }
	int32_t getIndCatId(int32_t i)  { return ptr_indCatIds[i]; }
};
// Sends a Msg20Request and holds the Msg20Reply that comes back. The inline
// helpers below treat a NULL m_r as "no reply".
class Msg20 {
public:
	// general purpose routines
	Msg20();
	~Msg20();
	// so we can alloc arrays of these using mmalloc()
	void constructor();
	void destructor();
	void freeReply();
	void reset();

	// . this should only be called once
	// . should also register our get record handlers with the udpServer
	bool registerHandler();

	// Msg20Request is defined earlier in this header
	bool getSummary(class Msg20Request *r);

	void gotReply(class UdpSlot *slot);

	int64_t getRequestDocId() { return m_requestDocId; }

	// bytes the reply would need if serialized; 0 when there is no reply
	int32_t getStoredSize() {
		return m_r ? m_r->getStoredSize() : 0;
	}

	// . returns how many bytes we serialize into "buf"
	// . returns 0 when there is no reply
	// . per the original note: sets g_errno and returns -1 on error
	int32_t serialize(char *buf, int32_t bufSize) {
		return m_r ? m_r->serialize(buf, bufSize) : 0;
	}

	// . this is destructive on the "buf". it converts offs to ptrs
	// . sets m_r to the modified "buf" when done
	// . sets g_errno and returns -1 on error, otherwise the number of
	//   bytes deserialized
	int32_t deserialize(char *buf, int32_t bufSize);

	// Msg40 caches each Msg20Reply when it caches the page of results,
	// so, to keep the size of the cached Msg40 down, certain things are
	// not cached and must be "cleared" out before caching.
	// clearLinks() forwards to the same routine as clearOutlinks().
	void clearOutlinks() { if (!m_r) return; m_r->clearOutlinks(); }
	void clearLinks()    { if (!m_r) return; m_r->clearOutlinks(); }
	void clearVectors()  { if (!m_r) return; m_r->clearVectors();  }

	// copy "src" to ourselves
	void copyFrom(class Msg20 *src);

	// Just let the caller parse it up. The old per-field accessor
	// wrappers were dead, commented-out code and are no longer listed
	// here; read the Msg20Reply fields through this pointer instead.
	Msg20Reply *getReply() { return m_r; }

	static int32_t getApproxLinkCount(char *content, int32_t contentLen);

	// ---- data members (relative order unchanged) ----

	// "m_request = r->serialize(&m_requestSize,m_requestBuf)"
	char       *m_request;
	int32_t     m_requestSize;
	char        m_requestBuf[MAX_MSG20_REQUEST_SIZE];

	// the reply, or NULL if there is none
	Msg20Reply *m_r;
	int32_t     m_replySize;
	int32_t     m_replyMaxSize;

	// i guess Msg40.cpp looks at this flag
	char        m_gotReply;

	// set if we had an error
	int32_t     m_errno;

	int64_t     m_requestDocId;

	// copied from the msg20request
	int32_t     m_eventId;

	// for sending the request
	Multicast   m_mcast;

	void       *m_hack;
	int32_t     m_hack2;
	int32_t     m_ii;

	// Is the reply in progress? False if msg20 has not launched a
	// request, false once msg20 received its reply, otherwise true.
	bool        m_inProgress;
	bool        m_launched;
	char        m_ownReply;
	char        m_expected;

	bool      (*m_callback)(void *state);
	void      (*m_callback2)(void *state);
	void       *m_state;
	// used by MsgE to store its data
	void       *m_state2;
	void       *m_state3;
	void       *m_owningParent;
	char        m_constructedId;

	// PostQueryRerank storage area for printing out in PageResults.cpp
	float       m_pqr_old_score;
	float       m_pqr_factor_diversity;
	float       m_pqr_factor_quality;
	float       m_pqr_factor_inlinkers;
	float       m_pqr_factor_proximity;
	float       m_pqr_factor_ctype;
	float       m_pqr_factor_lang;      // includes country
};
#endif