// open-source-search-engine/Msg39.h (307 lines, 7.6 KiB, C)
// Matt Wells, copyright Jul 2001
// . gets the resulting docIds from a query
// . TODO: use our own facility to replace Msg2? hash a list as it comes.
#ifndef _MSG39_H_
#define _MSG39_H_
#include "UdpServer.h" // UdpSlot type
#include "Multicast.h" // multicast send
#include "Query.h" // Query::set()
#include "Msg37.h" // getTermFreqs()
#include "Msg2.h" // getLists()
#include "Posdb.h"
#include "IndexList.h"
#include "TopTree.h"
#include "Msg51.h"
#include "HashTableX.h"
// . evaluates to 500 + MAX_QUERY_LEN
// . NOTE(review): presumably the max size in bytes of a serialized
//   Msg39Request, with the 500 covering everything but the query string.
//   confirm against the code that serializes the request.
#define MAX_MSG39_REQUEST_SIZE (500+MAX_QUERY_LEN)
// . handler for an incoming Msg39 request on udp slot "slot"
// . NOTE(review): presumably the function Msg39::registerHandler() installs,
//   and "netnice" is the niceness the udp layer assigned -- confirm in the .cpp
void handleRequest39 ( UdpSlot *slot , long netnice ) ;
class Msg39Request {
public:
Msg39Request () { reset(); };
void reset() {
m_docsToGet = 10;
m_niceness = MAX_NICENESS;
m_maxAge = 0;
m_maxQueryTerms = 9999;
//m_compoundListMaxSize = 20000000;
2013-08-03 00:12:24 +04:00
m_boolFlag = 2;
m_language = 0;
m_queryExpansion = false;
m_debug = 0;
m_getDocIdScoringInfo = true;
m_doSiteClustering = true;
//m_doIpClustering = true;
2013-08-03 00:12:24 +04:00
m_doDupContentRemoval = true;
m_restrictPosdbForQuery = false;
m_addToCache = false;
m_familyFilter = false;
m_timeout = -1; // -1 means auto-compute
m_getSectionStats = false;
//m_useMinAlgo = false;
//m_fastIntersection = -1;
2013-08-03 00:12:24 +04:00
m_stripe = 0;
m_collnum = -1;
2013-08-03 00:12:24 +04:00
m_useQueryStopWords = true;
//m_useNewAlgo = true;
2013-08-03 00:12:24 +04:00
m_doMaxScoreAlgo = true;
m_seoDebug = false;
m_useSeoResultsCache = false;
2014-05-06 21:47:57 +04:00
2013-08-03 00:12:24 +04:00
ptr_readSizes = NULL;
ptr_query = NULL; // in utf8?
ptr_whiteList = NULL;
//ptr_coll = NULL;
2013-08-03 00:12:24 +04:00
size_readSizes = 0;
size_query = 0;
size_whiteList = 0;
//size_coll = 0;
2013-08-03 00:12:24 +04:00
m_getDocIdScoringInfo = 1;
// -1 means to not to docid range restriction
m_minDocId = -1;
m_maxDocId = -1;
2014-05-06 21:47:57 +04:00
// for widget, to only get results to append to last docid
m_maxSerpScore = 0.0;
m_minSerpDocId = 0LL;
2013-08-03 00:12:24 +04:00
m_makeReply = true;
// . search results knobs
// . accumulate the top 10 term pairs from inlink text. lower
// it down from 10 here.
m_realMaxTop = MAX_TOP;
};
// we are requesting that this many docids be returned. Msg40 requests
// of Msg3a a little more docids than it needs because it assumes
// some will be de-duped at summary gen time.
long m_docsToGet;
long m_nqt; // # of query terms
char m_niceness;
long m_maxAge;
long m_maxQueryTerms;
//long m_compoundListMaxSize;
2013-08-03 00:12:24 +04:00
char m_boolFlag;
uint8_t m_language;
// flags
char m_queryExpansion;
char m_debug;
char m_seoDebug;
char m_useSeoResultsCache;
char m_doSiteClustering;
//char m_doIpClustering;
2013-08-03 00:12:24 +04:00
char m_doDupContentRemoval;
char m_restrictPosdbForQuery;
char m_addToCache;
char m_familyFilter;
char m_getDocIdScoringInfo;
char m_realMaxTop;
char m_stripe;
char m_useQueryStopWords;
//char m_useNewAlgo;
2013-08-03 00:12:24 +04:00
char m_doMaxScoreAlgo;
char m_getSectionStats;
long m_siteHash32;// for m_getSectionStats
//char m_useMinAlgo;
//char m_fastIntersection;
2013-08-03 00:12:24 +04:00
collnum_t m_collnum;
2013-08-03 00:12:24 +04:00
long long m_minDocId;
long long m_maxDocId;
bool m_makeReply;
2014-05-06 21:47:57 +04:00
// for widget, to only get results to append to last docid
double m_maxSerpScore;
long long m_minSerpDocId;
2013-08-03 00:12:24 +04:00
// msg3a stuff
long m_timeout; // in seconds
time_t m_nowUTC;
char *ptr_readSizes;
char *ptr_termFreqWeights;
char *ptr_query; // in utf8?
char *ptr_whiteList;
//char *ptr_coll;
2013-08-03 00:12:24 +04:00
long size_readSizes;
long size_termFreqWeights;
long size_query;
long size_whiteList;
//long size_coll;
2013-08-03 00:12:24 +04:00
char m_buf[0];
};
// . the reply to a Msg39Request
// . this whole class is cast to a udp reply, so the member order and types
//   below must not change
class Msg39Reply {

 public:

	// . zero ourselves out
	// . NOTE(review): this byte-zeroes m_sectionStats as well, which is
	//   only valid as long as SectionStats is a plain data struct -- confirm
	void reset() { memset ( (char *)this,0,sizeof(Msg39Reply) ); };

	long   m_numDocIds;
	// # of "unignored" query terms
	long   m_nqt;
	// # of estimated hits we had
	long   m_estimatedHits;
	// for when m_getSectionStats is true
	SectionStats m_sectionStats;
	// error code
	long   m_errno;

	// . the variable-size buffers, each ptr_X has a matching size_X below
	// . keep the two lists in the same order
	char  *ptr_docIds         ; // the results, long long
	char  *ptr_scores         ; // now doubles! so we can have intScores
	char  *ptr_scoreInfo      ; // transparency info
	char  *ptr_pairScoreBuf   ; // transparency info
	char  *ptr_singleScoreBuf ; // transparency info
	char  *ptr_siteHashList   ; // for m_getSectionStats
	char  *ptr_clusterRecs    ; // key_t (might be empty)

	long   size_docIds;
	long   size_scores;
	long   size_scoreInfo;
	long   size_pairScoreBuf;
	long   size_singleScoreBuf;
	long   size_siteHashList;
	long   size_clusterRecs;

	// . this is the "string buffer" and it is a variable size
	// . this whole class is cast to a udp reply, so the size of "buf"
	//   depends on the size of that udp reply
	char   m_buf[0];
};
class Msg39 {
public:
Msg39();
void reset();
void reset2();
// register our request handler for Msg39's
bool registerHandler ( );
// called by handler when a request for docids arrives
void getDocIds ( UdpSlot *slot ) ;
// XmlDoc.cpp seo pipeline uses this call
void getDocIds2 ( class Msg39Request *req ) ;
// retrieves the lists needed as specified by termIds and PosdbTable
bool getLists () ;
// called when lists have been retrieved, uses PosdbTable to hash lists
bool intersectLists ( );//bool updateReadInfo ) ;
2013-08-03 00:12:24 +04:00
// this is called after thread exits, or if thread creation failed
bool addedLists();
// incoming parameters passed to Msg39::getDocIds() function
//void *m_state;
//void (* m_callback ) ( void *state );
// . this is used by handler to reconstruct the incoming Query class
// . TODO: have a serialize/deserialize for Query class
Query m_tmpq;
long long m_docIdStart ;
long long m_docIdEnd ;
2013-08-03 00:12:24 +04:00
// used to get IndexLists all at once
Msg2 m_msg2;
// holds slot after we create this Msg39 to handle a request for docIds
UdpSlot *m_slot;
// . used for getting IndexList startKey/endKey/minNumRecs for each
// termId we got from the query
// . used for hashing our retrieved IndexLists
PosdbTable m_posdbTable;
// keep a ptr to the request
Msg39Request *m_r;
char m_debug;
long m_numDocIdSplits;
bool m_allocedTree;
long long m_ddd;
long long m_dddEnd;
bool doDocIdSplitLoop();
// . we hold our IndexLists here for passing to PosdbTable
// . one array for each of the tiers
IndexList m_lists [ MAX_QUERY_TERMS ];
2013-08-03 00:12:24 +04:00
// used for timing
long long m_startTime;
// this is set if PosdbTable::addLists() had an error
long m_errno;
// always use top tree now
TopTree m_tt;
char m_boolFlag;
long m_firstResultNum;
long long m_numTotalHits;
long m_numCensored;
// for indexdb splitting
char m_paritySplit;
long m_bufSize;
char *m_buf;
long long *m_clusterDocIds;
char *m_clusterLevels;
key_t *m_clusterRecs;
long m_numClusterDocIds;
long m_numVisible;
long m_numDocIds;
Msg51 m_msg51;
bool m_gotClusterRecs;
bool controlLoop();
long m_phase;
void estimateHitsAndSendReply ();
2013-08-03 00:12:24 +04:00
bool setClusterRecs ();
bool gotClusterRecs ();
2013-08-03 00:12:24 +04:00
// hack stuff
void *m_tmp;
long m_tmp2;
bool m_blocked;
void (*m_callback)( void *state );
void *m_state;
long long m_topDocId;
float m_topScore;
long long m_topDocId2;
float m_topScore2;
// . for the top 50 algo in seo.cpp
// . will be the score of the last result if < 50 results
float m_topScore50;
long long m_topDocId50;
bool m_inUse;
};
#endif