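// open-source-search-engine/seo.h
//
// Declarations shared by the SEO tools: query log entries, matching and
// related queries, related docids, insertable/missing terms and the Msg95
// request/reply structures.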
#ifndef _SEO_H_
#define _SEO_H_
#include "gb-include.h"
#include "Mem.h" // gbstrlen
// . scale factor that converts a gigablast DAILY impression estimate into
//   a google MONTHLY estimate
// . x31 takes daily to monthly, x60 takes gigablast volume to google volume
#define GB_TRAFFIC_MODIFIER (60*31)
// . fills in "qi" from the query string "qstr"
// . NOTE(review): the only QueryInfo definition in this header is commented
//   out, so "class QueryInfo" here acts as a forward declaration only
bool setQueryInfo ( char *qstr , class QueryInfo *qi ) ;
// so main.cpp can register it
void runSEOQueryLoop ( int fd , void *state ) ;
// . request handlers, one per message type (0x8e, 0x4f, 0x95 judging by
//   the names -- confirm in seo.cpp)
// . handleRequest95 is the one that receives a Msg95Request
void handleRequest8e ( class UdpSlot *slot , long netnice ) ;
void handleRequest4f ( class UdpSlot *slot , long netnice ) ;
void handleRequest95 ( class UdpSlot *slot , long netnice ) ;
// presumably loads the query log into memory; returns false on error --
// TODO confirm against the definition
bool loadQueryLog();
// . 64-bit hash of query "qstr" interpreted in language "langId"
// . presumably built from the base synonym of each query word so that
//   near-identical queries hash alike -- confirm against the definition
long long getSynBaseHash64 ( char *qstr , uint8_t langId ) ;
// secret keys, defined in another translation unit
extern char *g_secret_tran_key;
extern char *g_secret_api_key;
/*
class Msg3fRequest {
public:
// document language:
uint8_t m_langId3f;
long m_niceness;
// first is coll
char *ptr_coll;
// termlistbuf for doing msg39 queries
char *ptr_termListBuf;
long size_coll;
long size_termListBuf;
};
*/
// . describes how inserting ONE insertable term at ONE position in a
//   document changes that document's score and rank for ONE query
// . QueryChanges are chained together through m_next
// . NOTE(review): these records are referenced through Msg95Reply buffers,
//   so the member order and types below are presumably part of the wire
//   format -- do not reorder
class QueryChange {
public:
// . what term are we inserting here
// . offset is relative to the Msg95Request::ptr_insertableTerms
// which are strings separated by \0's
// . this is just an offset into the XmlDoc::m_itStrBuf safebuf
long m_termStrOffset;
// we scan all matching queries for this term/position combo, so
// this is the offset of the QueryLogEntry in our local g_qbuf.
long m_queryOffset3;
// . this offset is relative to Msg95Reply::ptr_queryLogBuf
// . m_queryOffset3 above is instead relative to the g_qbuf QueryLogEntry
//   buffer of the host that built this QueryChange
long m_replyQueryOffset;
// for debugging, offset into m_debugScoreInfoBuf
long m_debugScoreInfoOffset;
// for debugging, offset into m_origScoreInfoBuf
long m_origScoreInfoOffset;
// total traffic the query gets per day
//long m_dailyTraffic;
// hash of the inserted term, used to match this QueryChange back to its
// InsertableTerm (see InsertableTerm::m_termHash64) when done
long long m_termHash64;
// 32-bit hash identifying the query this change applies to
long m_queryHash32;
// and the term's position range this applies to
long m_insertPos;//termPosition;
// the document's score for the query WITHOUT the term inserted
float m_oldScore;
// the document's score for the query WITH the term inserted
float m_newScore;
// the rank WITHOUT the term inserted
char m_oldRank;
// the rank WITH the term inserted
char m_newRank;
// this hostid, handling the msg95
//short m_hostId;
// next QueryChange in the linked list we are in
class QueryChange *m_next;
// . returns the debug DocIdScore for this change, looked up in
//   "debugScoreInfoBuf" (presumably at m_debugScoreInfoOffset)
// . this makes sure DocIdScore::m_pairScores ptr is correct
class DocIdScore *getDebugDocIdScore ( SafeBuf *debugScoreInfoBuf ) ;
// . returns the original DocIdScore for this change, looked up in
//   "origScoreInfoBuf" (presumably at m_origScoreInfoOffset)
// . this makes sure DocIdScore::m_singleScores ptr is correct
class DocIdScore *getOrigDocIdScore ( SafeBuf *origScoreInfoBuf ) ;
};
// . we have one of these for every alnum word in the doc that we
//   index into posdb
// . basically we make this from the list of posdb keys
// . we pass it in through Msg95Request to handleRequest95
class WordPosInfo {
public:
// position of the word in the document
long m_wordPos;
// number of the sentence that contains the word
long m_sentNum;
// the hashgroup, density rank, word spam rank and diversity rank of the
// word; presumably the same values stored in its posdb key -- confirm
char m_hashGroup;
char m_densityRank;
char m_wordSpamRank; // doubles as siterank for link text hashgroup
char m_diversityRank;
// the word's text and its length
// NOTE(review): unit of m_wordLen (bytes vs chars) is not visible here
char *m_wordPtr;
long m_wordLen;
// traffic gain for inserting the selected insertable term into
// this word position. this is set in seo.cpp
long m_trafficGain;
// holds the symbol info, the glyph to display
char m_color;
};
// . one record per term; these are stored back to back in a safebuf
// . presumably this is the element type of Msg95Request::ptr_termInfoBuf
//   -- confirm against the code that fills that buffer
class TermInfo {
public:
// 64-bit id of the term
unsigned long long m_termId64;
//long long m_termFreq64;
};
/*
class WordFreqInfo {
public:
// 32 bit termid
long long m_wordId64;
// it's term freq
long long m_wordFreq64;
};
*/
// . used to get matching queries for the main url or a related docid
// . each ptr_X member is paired with a size_X member giving its byte size
// . NOTE(review): the ptr_*/size_* pairs followed by the zero-length m_buf
//   look like a serialized layout where the variable-length data lives in
//   m_buf -- confirm with the (de)serialization code before reordering
class Msg95Request {
public:
// the document the request is about
long long m_docId;
// language of that document
uint8_t m_docLangId;
// debug mode?
char m_seoDebug;
// basically all the posdb keys, all are full 18-byte keys. no
// compression for simplicity. used to pass posdb termlists into
// msg39 just for this m_docid
char *ptr_posdbTermList;
// Top Word Ids buffer. basically all the word ids contained
// in the doc (m_docId) or in the inlink text for the doc
char *ptr_termInfoBuf;//twid32Buf;
// array of WordFreqInfos from host #0 only for consistency
// so msg95 handler can use them for calling msg39
//char *ptr_wordFreqInfoBuf;
// instances of the WordPosInfo class that give us sentence #, hashgroup,
// word position of every word in the doc. sorted by m_wordPos.
char *ptr_wordPosInfoBuf;
// name of the collection
char *ptr_coll;
// \0 separated list of the insertable phrases to try to insert
// into every possible position in the document and see how much
// our traffic will increase/decrease by doing so. each
// insertable term can be a single word or sequence of words.
char *ptr_insertableTerms;
// byte sizes of the buffers above, in the same order
long size_posdbTermList;
long size_termInfoBuf; // twid32Buf;
//long size_wordFreqInfoBuf;
long size_wordPosInfoBuf;
long size_coll;
long size_insertableTerms;
// zero-length trailing array marking where variable-length data starts
char m_buf[0];
};
// . reply to a Msg95Request
// . each ptr_X member is paired with a size_X member giving its byte size
// . NOTE(review): as with Msg95Request, the variable-length data presumably
//   lives in m_buf -- confirm with the (de)serialization code
class Msg95Reply {
public:
// presumably an array of QueryChange records -- confirm in seo.cpp
char *ptr_queryChangeBuf ;
// score info buffers; QueryChange::m_debugScoreInfoOffset and
// QueryChange::m_origScoreInfoOffset are presumably offsets into these
char *ptr_debugScoreInfoBuf;
char *ptr_origScoreInfoBuf;
// query log entries; QueryChange::m_replyQueryOffset is relative to this
char *ptr_queryLogBuf ;
// byte sizes of the buffers above, in the same order
long size_queryChangeBuf;
long size_debugScoreInfoBuf;
long size_origScoreInfoBuf;
long size_queryLogBuf ;
// zero-length trailing array marking where variable-length data starts
char m_buf[0];
};
// get the top 300 results of each query we do and store their docids/scores
#define NUM_RESULTS_FOR_RELATED_DOCIDS 300
// . the top results for one query, held in three parallel arrays
// . these are 1-1 with the top list of ptrs to Msg99Replies
class TopDocIds {
public:
long m_queryNum; // in the matchingquerybuf
// how many entries of the arrays below are in use
long m_numDocIds;
// parallel arrays: entry i of each describes the same search result
long long m_topDocIds[NUM_RESULTS_FOR_RELATED_DOCIDS];
float m_topScores[NUM_RESULTS_FOR_RELATED_DOCIDS];
long m_topSiteHashes26[NUM_RESULTS_FOR_RELATED_DOCIDS];
};
/*
class Msg99Request {
public:
char m_justGetQueryOffsets;
char *ptr_twids;
char *ptr_coll;
char *ptr_posdbTermList;
long size_twids;
long size_coll;
long size_posdbTermList;
char m_buf[0];
};
class QueryInfo {
public:
// stuff for setting importance
short m_numUniqueWordForms;
short m_numRepeatWordForms;
short m_numControlWordForms;
float m_smallestNormTermFreq;
// hash of wids in query. see Query::getHash() function.
long long m_queryExactHash64;
// hash of smallest syn of each wid in qry
long long m_querySynBaseHash64;
// this is set by setQueryImportance()
float m_queryImportance;
// . how many docids are in our linked list, QueryRel::m_next
// . only used by getRelatedQuery*() functions
long m_docIdVotes;
// . score from combining all in linked list
// . only used by getRelatedQuery*() functions
float m_myScoreRelated;
};
*/
// upper bound on the number of matching queries (presumably the number
// kept per url -- confirm at the use sites)
#define MAX_MATCHING_QUERIES 300
// only store this many of the top-scoring related docids
#define MAX_RELATED_DOCIDS 300
// upper bound on the number of related queries (presumably the number
// kept -- confirm at the use sites)
#define MAX_RELATED_QUERIES 300
// . a related query must be shared by this many related docids in order
//   to be scored
// . with this set to 10 there were 640,000 related query candidates to
//   score in getRelatedQueryBuf() for gigablast.com
// . there were a total of 3.8M related queries among all the related
//   docids, so the MIN_DOCID_VOTES constraint keeps roughly 20% of them
#define MIN_DOCID_VOTES 10
// m_flags values (presumably for QueryLogEntry::m_flags, the only m_flags
// member in this header)
#define QEF_ESTIMATE_SCORE 0x01
// . one record of the query log; use this now to make things simpler
// . variable-length: the fixed members below are followed by m_qn 32-bit
//   termids and then by the \0 terminated query string
class QueryLogEntry {
public:
	// how many termids follow the fixed members (see getQueryStr())
	char m_qn;
	// QEF_* bit flags
	char m_flags;
	// language of the query; may be langUnknown, see getQueryLangId()
	uint8_t m_langId;
	long m_gigablastTraffic;
	long m_googleTraffic;
	long m_googleTrafficDate;
	float m_topSERPScore; // score of the #1 result, in SLICE!
	float m_minTop50Score;
	long m_minTop50ScoreDate;
	// number of results we got when running this query against a slice
	// of the index. all slices are supposed to have the same size.
	long m_numTotalResultsInSlice;
	// NOTE(review): elements are "unsigned long" but every offset
	// computation below assumes 4 bytes per termid
	unsigned long m_queryTermIds32[0];

	//
	// NOTE: the query string is stored right after the termid list
	//

	// . returns the \0 terminated query string
	// . it starts after the fixed members and the m_qn termids
	char *getQueryStr () {
		char *self = (char *)this;
		return self + sizeof(QueryLogEntry) + m_qn*4;
	}

	// same as getQueryStr()
	char *getQueryString () {
		return getQueryStr();
	}

	// . returns the language of the query
	// . an unknown language (langUnknown = 0) is reported as english
	uint8_t getQueryLangId () {
		if ( m_langId != langUnknown ) return m_langId;
		return langEnglish;
	}

	// . returns the full size of this record in bytes
	// . that is the fixed members, the termids, the query string and
	//   the \0 that terminates the string
	long getSize() {
		char *self = (char *)this;
		char *qstr = self + ( sizeof(QueryLogEntry)+m_qn*4 );
		char *end  = qstr + gbstrlen(qstr) + 1;
		return end - self;
	}
};
/*
// . this is basically a MATCHING QUERY
// . it could match our main url, or it could match a related docid; it is
// used for both
// . we get one of these back in response to a Msg99Request
// . we send the Msg99Requests out in batch in XmlDoc.cpp::sendBin()
// . it gives us a query that matches the termlists in Msg99Request which
// are from our docid
class Msg99Reply {
public:
// . just pass this whole thing back
// . crap, but the query part is bogus and so are the
// query termids... because they are beyond the class's fixed size
QueryLogEntry m_queryLogEntry;
// estimated # searches the query gets per day
//long m_gigablastTraffic;
//long m_googleTraffic;
// offset into g_qbuf on that host that query string came from
// NO! now it is to the full query entry which has the # of terms,
// the pop and the termids followed by the string!
long m_qbufOffset;
// replying host's hostid
short m_replyingHostId;
// was the query added from the m_extraQueryBuf which is set
// from the user-supplied textarea
char m_isManuallyAdded:1;
//char m_hasFullScore:1;
// is it first in a linked list of msg99replies for the same query
// but different m_myDocId? (for related queries algo)
char m_isFirst:1;
// score of the query
float m_myScore;
// docid of related query, set after getting reply
long long m_myDocId;
// hmmm. what's this? the top 300 or so scoring docids, set by
// getMatchingQueriesScoredForThisUrl() .
//class TopDocIds *m_topDocIds;
long m_topDocIdsBufOffset;
TopDocIds *getTopDocIds ( SafeBuf *topDocIdsBuf ) {
if ( m_topDocIdsBufOffset < 0 ) return NULL;
char *p = topDocIdsBuf->getBufStart();
p += m_topDocIdsBufOffset;
return (TopDocIds *)p;
};
// this is set in handleRequest99() in seo.cpp
//float m_minTop50Score;
// this is also set in handleRequest99() in seo.cpp and used
// in XmlDoc::getMatchingQueriesScoredForFullQuery()
// for doing related docids, because we do not want to dedup
// related queries if they are basically the same like
// "search+engine" is too similar to "search+engines" so this
// will allow us to skip them because they yield like the exact
// same related docids!!!!
long long m_querySynBaseHash64;
// . how important is this query to the main url?
// . now a function of m_numTotalResultsInSlice and query's traffic
// . this is set when processing the msg99replies in
// XmlDoc.cpp::getMatchingQueriesScoredForThisUrl()
float m_queryImportance;
// and the query string itself
char m_queryStr[0];
long getSize() { return (long)sizeof(Msg99Reply)+
gbstrlen(m_queryStr)+
1; };
};
*/
// . a term that we try inserting into a document
// . depending on the position it is inserted at it can change the
//   document's ranking for several queries; that ranking info is held
//   by the QueryChange linked list hanging off of m_firstQueryChange
// . variable-length: the \0 terminated term string is stored in m_buf
class InsertableTerm {
public:

	//
	// members set by getInsertableTerms():
	//

	//char *m_termStr;
	//long m_termLen;

	// used to find the QueryChanges with the same m_termHash64
	long long m_termHash64;

	// . traffic of every query containing this term, added up
	// . maybe sort by this after m_bestTrafficGain?
	long m_trafficSum;

	// . set if the term comes from a related query rather than from a
	//   matching query, i.e. our doc does not contain it
	// . NOTE(review): a 1-bit field of plain char may read back as -1
	//   rather than 1 where char is signed; test it for non-zero
	char m_isRelatedTerm:1;

	//
	// members set by get*SCORED*InsertableTerms():
	//

	// head of the QueryChange list, sorted by QueryChange::m_insertPos
	class QueryChange *m_firstQueryChange;

	// . only set in getInsertableTermsScore()
	// . the traffic we can gain by inserting this term into the
	//   document or a link
	long m_bestTrafficGain;
	long m_bestInsertPos;

	// first QueryChange in the linked list for this m_bestWordPosition
	class QueryChange *m_bestQueryChange;

	// byte size of the term string, its terminating \0 included
	long m_termSize;

	// the term string is stored here
	char m_buf[0];

	// the \0 terminated term string
	char *getTerm ( ) {
		return &m_buf[0];
	}

	// length of the term string, terminating \0 excluded
	long getTermLen ( ) {
		long len = m_termSize;
		len -= 1;
		return len;
	}

	// total byte size of this record: fixed members plus term string
	long getSize ( ) {
		return sizeof(InsertableTerm) + m_termSize;
	}
};
// . a docid that has matching queries in common with the main url
// . its strings are not stored inline; they are referenced by offsets
//   into a separate safebuf
class RelatedDocId {
public:
	long long m_docId;

	// site hash we got from clusterdb while doing a search
	long m_siteHash26;

	// number of queries shared with the main url
	long m_numCommonQueries;

	char m_rd_siteRank;
	uint8_t m_rd_langId;

	// the full site hash from the titlerec, not the clusterdb one
	long m_rd_siteHash32;

	// sum of the Msg99Reply::m_queryImportance for each query we have
	// in common with the main url.
	// m_queryImportance is the score this related docid had for
	// a matching query divided by a top score for that query.
	//float m_similarityScore;//queryImportanceIntersectionSum;

	// . head of the linked list of QueryNums shared with the main url
	// . it is an offset into the m_commonQueryNumBuf safebuf
	long m_firstCommonQueryNumOff ;
	//long m_lastCommonQueryNumOff ;

	// offsets of the title, url and site strings; -1 if not available
	long rd_title_off;
	long rd_url_off;
	long rd_site_off;

	// url string inside "relatedTitleBuf", NULL if rd_url_off is -1
	char *getUrl ( class SafeBuf *relatedTitleBuf ) {
		if ( rd_url_off != -1 )
			return relatedTitleBuf->getBufStart() + rd_url_off;
		return NULL;
	}

	// site string inside "relatedTitleBuf", NULL if rd_site_off is -1
	char *getSite ( class SafeBuf *relatedTitleBuf ) {
		if ( rd_site_off != -1 )
			return relatedTitleBuf->getBufStart() + rd_site_off;
		return NULL;
	}

	// title string inside "relatedTitleBuf", NULL if rd_title_off is -1
	char *getTitle ( class SafeBuf *relatedTitleBuf ) {
		if ( rd_title_off != -1 )
			return relatedTitleBuf->getBufStart() + rd_title_off;
		return NULL;
	}

	// an offset, not a ptr! relative to the m_relatedTitleBuf safebuf
	long m_linkInfo1Offset;

	// . ip address of this m_docId
	// . really the ip we saw the FIRST time we encountered this
	//   subdomain, so it can be stale, but it is consistent
	long m_relatedFirstIp;
	long m_relatedCurrentIp;

	// the retired m_dotProduct similarity measure:
	// . A = vector of our scores for the queries we have
	//       in common with the main url
	// . B = vector of main url's scores for queries it has in common
	//       with this m_relatedDocId
	// . C = vector of the score of top result for such queries
	// . dotProduct = (A/C * B/C)
	// . the relatedDocId with the highest m_dotProduct was the most
	//   similar, relative to all the other related docids
	//float m_dotProduct;

	// replaces the dot product as the score of this related docid
	float m_relatedWeight;
};
// . an inlink shared by related docids
// . variable-length: the url string and then the title string are stored
//   directly behind the fixed members
class RecommendedLink {
public:
	// . summed score of the related docids that have this inlink
	// . NOTE(review): originally described as the sum of their
	//   m_dotProduct values; RelatedDocId now carries m_relatedWeight
	//   instead -- confirm which value is summed
	float m_totalRecommendedScore;

	// number of related docids that have this inlink
	long m_votes;

	// siterank of this inlink
	char m_rl_siteRank;

	// docid of the link
	long long m_rl_docId;
	long m_rl_firstIp;

	// the related docids this link links to, as offsets into
	// XmlDoc::m_relatedDocIdBuf
	long m_relatedDocIdOff[10];

	// byte sizes of the two trailing strings, \0 included
	long m_urlSize;
	long m_titleSize;

	// offsets relative to "this" now! strange!!
	//long m_urlOffset;
	//long m_titleOffset;

	// the url string; it starts right behind the fixed members
	char *getUrl ( ) {
		return (char *)this + sizeof(RecommendedLink);
	}

	// the title string; it starts right behind the url string
	char *getTitle ( ) {
		char *title = (char *)this;
		title += sizeof(RecommendedLink);
		title += m_urlSize;
		return title;
	}

	// total byte size of this record, trailing strings included
	long getSize() {
		return sizeof(RecommendedLink) + m_urlSize + m_titleSize;
	}
};
// . use for related urls/docids
// . one node of the linked list of queries that a related docid has in
//   common with the main url; RelatedDocId::m_firstCommonQueryNumOff is
//   the offset of the first node
class QueryNumLinkedNode {
public:
// number identifying the common query
long m_queryNum;
// offset of next link in linked list into m_commonQueryNumBuf
long m_nextOff;
// new stuff
// rank and serp score of the related docid for this query
long m_relatedDocIdRank;
float m_relatedDocIdSerpScore;
// rank of the main url for this query
long m_mainUrlRank;
// the sum of these is the RelatedDocId::m_relatedWeight, the
// final score of the related docid:
float m_queryScoreWeight;
//float m_mainUrlSerpScore;
};
// . element type of the buffers returned by getMatchingQueryBuf() and
//   getRelatedQueryBuf()
// . ties a query (a QueryLogEntry referenced by offset) to the score a
//   docid got for it
class QueryLink {
public:
	// offset of the corresponding QueryLogEntry in the string buf
	long m_queryStringOffset;

	// score of this docid from gbdocid:xxx|querystr
	float m_serpScore;

	// . individual score for this QueryLink
	// . computed by a different algo for m_matchingQueryBuf than
	//   for m_relatedQueryBuf
	float m_queryImportance;

	// . sum of m_queryImportance over all the QueryLinks that have
	//   this same query/m_queryStringOffset
	// . in the case of m_relatedQueryBuf this is only valid for the
	//   head of the list
	float m_totalQueryImportance;

	// a linked list of QueryLinks
	//long m_tailOff;
	//long m_nextOff;

	// . QueryLinks with the same m_queryStringOffset (query) are now
	//   stored consecutively, sorted by their m_queryImportance, and
	//   m_isFirst is set for the first one of such a run
	// . the count INCLUDES the head querylink!!!!
	long m_numInList;

	// number of related docIds that contributed to
	// m_totalQueryImportance
	short m_docIdVotes;

	// . the docid that had this query as a matching query
	// . HACK: if this QueryLink is for a matching query of the main url
	//   then this is the topdocids # in m_topDocIdBuf instead
	short m_relatedDocIdNum;

	// . a flag. are we the head of the list?
	// . NOTE(review): a 1-bit field of plain char may read back as -1
	//   rather than 1 where char is signed; test it for non-zero
	char m_isFirst:1;

	// . primary sort key, m_totalQueryImportance is the secondary one
	// . the lower this value, the higher this query will be displayed
	uint8_t m_uniqueRound;

	//long long getQueryLinkHash64 ( ) {
	// long long h64 = m_queryHostId;
	// h64 <<= 32;
	// h64 |= m_queryDataOffset;
	// return h64;
	//};

	// . returns the RelatedDocId number m_relatedDocIdNum of the array
	//   stored in "relatedDocIdBuf"
	// . returns NULL if m_relatedDocIdNum is -1
	class RelatedDocId *getRelatedDocId ( SafeBuf *relatedDocIdBuf ) {
		if ( m_relatedDocIdNum == -1 ) return NULL;
		RelatedDocId *first =
			(RelatedDocId *)relatedDocIdBuf->getBufStart();
		return first + m_relatedDocIdNum;
	}

	/*
	class QueryLink *getNext ( SafeBuf *queryLinkBuf ) {
	if ( m_nextOff == -1 ) return NULL;
	char *base = queryLinkBuf->getBufStart();
	return (QueryLink *)(base + m_nextOff);
	};
	class QueryLink *getTail ( SafeBuf *queryLinkBuf ) {
	if ( m_tailOff == -1 ) return NULL;
	char *base = queryLinkBuf->getBufStart();
	return (QueryLink *)(base + m_tailOff);
	};
	*/

	// returns the QueryLogEntry that sits m_queryStringOffset bytes
	// into "stringBuf"
	class QueryLogEntry *getQueryLogEntry ( SafeBuf *stringBuf) {
		char *entry = stringBuf->getBufStart();
		entry += m_queryStringOffset;
		return (QueryLogEntry *)entry;
	}

	//
	// the accessors below read from our QueryLogEntry, which is in
	// the string buf, m_stringBuf, too!
	//

	float getTopOfSliceSERPScore ( SafeBuf *stringBuf ) {
		return getQueryLogEntry(stringBuf)->m_topSERPScore;
	}

	long getGigablastTraffic ( SafeBuf *stringBuf ) {
		return getQueryLogEntry(stringBuf)->m_gigablastTraffic;
	}

	// this is -1 if unknown
	long getGoogleTraffic ( SafeBuf *stringBuf ) {
		return getQueryLogEntry(stringBuf)->m_googleTraffic;
	}

	char *getQueryString ( SafeBuf *stringBuf ) {
		return getQueryLogEntry(stringBuf)->getQueryString();
	}
};
// . a term together with its votes, importance and traffic
// . variable-length: the \0 terminated term string is stored in m_buf,
//   behind the fixed members
class MissingTerm {
public:
	// number of related docids that had this term
	long m_votes;

	// the score of this term
	float m_importance;//score;

	// traffic of every related query containing this term, added up
	long long m_traffic;

	// linked list of synonyms
	//class MissingTerm *m_synNext;
	//class MissingTerm *m_synTail;

	// the missing term we are a synonym of
	class MissingTerm *m_synOf;

	// we get the largest phrase in wikipedia title's to make a full
	// phrase like "modern warfare 3" etc...
	//long m_numAlnumWords;

	char m_isMissingTerm;
	char m_reserved2;
	char m_reserved3;
	char m_reserved4;

	// . the first ten related queries that contain this term
	// . -1 marks the end of the list
	// . the original note says: if m_isFromRelatedQuery is true these
	//   are offsets of QueryLogEntries in the m_relatedQueryDataBuf,
	//   OTHERWISE they are offsets of msg99Replies in the
	//   m_msg99ReplyBuf
	// . NOTE(review): no m_isFromRelatedQuery member exists in this
	//   class; m_isMissingTerm may be what is meant -- confirm
	long m_hackQueryOffsets[10];

	// . returns the ith query that contains this term
	// . bbb is relatedQueryDataBuf for related terms
	// . bbb is m_msg99ReplyBuf for matching terms
	char *getContainingQuery ( long i , class XmlDoc *xd ) ;

	// the \0 terminated term string
	char *getTerm() {
		return &m_buf[0];
	}

	// byte size of the term string, terminating \0 included
	long getTermSize() {
		return m_termSize;
	}

	// length of the term string, terminating \0 excluded
	long getTermLen() {
		long len = m_termSize;
		len -= 1;
		return len;
	}

	// total byte size of this record: fixed members plus term string
	long getSize () {
		return sizeof(MissingTerm) + m_termSize;
	}

	long m_termSize;
	char m_buf[0];
};
#endif