// Msg40.h (open-source-search-engine)
// Matt Wells, copyright Jul 2001
// . gets the title/summary/docLen/url results from a query
#ifndef _MSG40_H_
#define _MSG40_H_
#define SAMPLE_VECTOR_SIZE (32*4)
#include "SearchInput.h"
#include "UdpServer.h" // UdpSlot type
#include "Multicast.h" // multicast send
#include "Query.h" // Query::set()
#include "Msg39.h" // getTermFreqs()
#include "Msg20.h" // for getting summary from docId
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
//#include "Msg2b.h" // for generating directories
#include "IndexReadInfo.h" // STAGE0,...
#include "Msg3a.h"
#include "PostQueryRerank.h"
// replace CollectionRec::m_maxDocIdsToCompute with this
//#define MAXDOCIDSTOCOMPUTE 500000
// make it 2B now. no reason to limit it so low.
#define MAXDOCIDSTOCOMPUTE 2000000000
// max number of word ids one Gigabit can hold in m_wordIds[]
#define MAX_GIGABIT_WORDS 10

// . one gigabit record
// . Msg40::getGigabit() reinterprets the raw bytes of Msg40::m_gigabitBuf
//   as an array of these, so keep this plain data: no ctors, no virtuals
class Gigabit {
public:
	// presumably the gigabit's text and its length
	// NOTE(review): who owns the memory m_term points at is not visible
	// in this header -- confirm in Msg40.cpp
	char *m_term;
	int32_t m_termLen;
	int64_t m_termId64;
	float m_gbscore;
	int32_t m_minPop;
	// how many entries of m_wordIds[] are in use
	int32_t m_numWords;
	int32_t m_numPages;
	int64_t m_lastDocId;
	// the wordids of the words in the gigabit (m_numWords of them)
	int64_t m_wordIds[MAX_GIGABIT_WORDS];
};
//
// TODO: add Gigabit::m_firstFastFactOffset..
//
// capacity of Fact::m_gigabitPtrs[]
#define MAX_GIGABIT_PTRS 10

// . one "fast fact": a sentence tied to one or more gigabits
// . plain data record; nothing here is initialized by a constructor
class Fact {
public:
	// offset of the gigabit in m_gigabitBuf we belong to
	int32_t m_gigabitOffset;
	// . the sentence containing the gigabit and a lot of the query terms
	// . ptr references into Msg20Reply::ptr_gigabitSample buffers
	char *m_fact;
	int32_t m_factLen;
	float m_gigabitModScore;
	float m_queryScore;
	float m_maxGigabitModScore; // gigabitscore * #pagesItIsOn
	// presumably how many entries of m_gigabitPtrs[] are in use -- confirm
	int32_t m_numGigabits;
	char m_printed;
	class Gigabit *m_gigabitPtrs[MAX_GIGABIT_PTRS];
	int32_t m_numQTerms;
	int64_t m_docId; // from where it came
	class Msg20Reply *m_reply; // reply from where it came
	// for deduping sentences
	char m_dedupVector[SAMPLE_VECTOR_SIZE]; // 128
};
// . older per-gigabit summary record
// . within this header it is only referenced from the commented-out
//   m_gigabitInfos[] accessors in Msg40
// . plain data record; nothing here is initialized by a constructor
class GigabitInfo {
public:
	int32_t m_pts;
	uint32_t m_hash;
	int32_t m_pop;
	int32_t m_count;
	int32_t m_numDocs;
	int64_t m_lastDocId;
	int32_t m_currentDocCount;
	// presumably the topic text and its length -- confirm
	char *m_ptr;
	int32_t m_len;
};
class Msg40 {
public:
Msg40();
~Msg40();
void resetBuf2 ( ) ;
static bool registerHandler ();
// . returns false if blocked, true otherwise
// . sets errno on error
// . uses Query class to parse query
// . uses Msg37 to retrieve term frequencies for each termId in query
// . uses Indexdb class to intersect the lists to get results
// . fills local buffer, m_docIds, with resulting docIds
// . set m_numDocIds to number of docIds in m_docIds
// . a useCache of -1 means default, 1 means use the cache,0 means dont
// . "displayMetas" is a space separated list of meta tag names
2014-11-18 05:13:36 +03:00
// that you want the content for along with the summary
2013-08-03 00:12:24 +04:00
bool getResults ( class SearchInput *si ,
bool forward ,
void *state ,
//void (* callback)(class Msg40 *THIS, void *state));
void (* callback)(void *state));
void makeCallback();
2013-08-03 00:12:24 +04:00
bool gotCacheReply();
// a continuation function of getResults() above
bool prepareToGetDocIds ( );
bool getDocIds ( bool recall );
bool gotExternalReply ( ) ;
bool postResultsProcessing();
bool computeGigabits( class TopicGroup *tg );
SafeBuf m_gigabitBuf;
// nuggabits...
2013-08-03 00:12:24 +04:00
bool computeFastFacts ( );
bool addFacts ( HashTableX *queryTable,
HashTableX *gbitTable ,
char *pstart,
char *pend,
bool debugGigabits ,
class Msg20Reply *reply,
SafeBuf *factBuf ) ;
2013-12-02 01:47:08 +04:00
2013-08-03 00:12:24 +04:00
SafeBuf m_factBuf;
// keep these public since called by wrapper functions
bool federatedLoop ( ) ;
2013-08-03 00:12:24 +04:00
bool gotDocIds ( ) ;
bool launchMsg20s ( bool recalled ) ;
class Msg20 *getAvailMsg20();
2014-11-11 01:45:11 +03:00
class Msg20 *getCompletedSummary ( int32_t ix );
2013-08-03 00:12:24 +04:00
bool getSummaries ( ) ;
bool gotSummary ( ) ;
bool reallocMsg20Buf ( ) ;
//bool printLocalTime ( class SafeBuf *sb );
2014-11-11 01:45:11 +03:00
void uncluster ( int32_t m ) ;
2013-08-03 00:12:24 +04:00
// serialization routines used for caching Msg40s by Msg17
2014-11-11 01:45:11 +03:00
int32_t getStoredSize ( ) ;
int32_t serialize ( char *buf , int32_t bufLen ) ;
int32_t deserialize ( char *buf , int32_t bufLen ) ;
2013-08-03 00:12:24 +04:00
// see Msg51.h for CR_* values of crId
2014-11-11 01:45:11 +03:00
int32_t getFilterStats ( int32_t crId ) { return m_filterStats[crId]; };
int32_t getNumCensored ( ) { return m_filterStats[CR_DIRTY]; };
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
int32_t getNumTopicGroups ( ) { return m_si->m_numTopicGroups; };
2013-08-03 00:12:24 +04:00
// . estimated # of total hits
// . this is now an EXACT count... since we read all posdb termlists
2014-10-30 22:36:39 +03:00
int64_t getNumTotalHits (){return m_msg3a.m_numTotalEstimatedHits; }
2013-08-03 00:12:24 +04:00
// . we copy query and coll to our own local buffer
// . these routines give us back our inputted parameters we saved
char *getQuery ( ) { return m_si->m_q.getQuery(); };
2014-11-11 01:45:11 +03:00
int32_t getQueryLen ( ) { return m_si->m_q.getQueryLen(); };
//char *getColl ( ) { return m_si->m_coll2; };
2014-11-11 01:45:11 +03:00
//int32_t getCollLen ( ) { return m_si->m_collLen2; };
int32_t getDocsWanted ( ) { return m_si->m_docsWanted; };
int32_t getFirstResultNum ( ) { return m_si->m_firstResultNum; };
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
int32_t getNumResults ( ){return m_msg3a.m_numDocIds; };
int32_t getNumDocIds ( ){return m_msg3a.m_numDocIds; };
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
char getClusterLevel(int32_t i){return m_msg3a.m_clusterLevels[i];};
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
int64_t getDocId ( int32_t i ){return m_msg3a.m_docIds[i]; };
2014-10-30 22:36:39 +03:00
int64_t *getDocIds( ){return m_msg3a.m_docIds; };
2014-11-11 01:45:11 +03:00
double getScore ( int32_t i ){return m_msg3a.m_scores[i]; };
class DocIdScore *getScoreInfo(int32_t i){
2014-03-07 05:01:41 +04:00
if ( ! m_msg3a.m_scoreInfos ) return NULL;
return m_msg3a.m_scoreInfos[i];
}
2014-11-11 01:45:11 +03:00
//LinkInfo *getLinkInfo( int32_t i){return m_msg20[i]->m_linkInfo; }
2013-08-03 00:12:24 +04:00
bool moreResultsFollow ( ) {return m_moreToCome; };
time_t getCachedTime ( ) {return m_cachedTime; };
/*
2014-11-11 01:45:11 +03:00
char *getTopicPtr ( int32_t i ){return m_gigabitInfos[i].m_ptr; };
int32_t getTopicLen ( int32_t i ){return m_gigabitInfos[i].m_len; };
int32_t getTopicScore ( int32_t i ){return m_gigabitInfos[i].m_pts; };
char getTopicGid ( int32_t i ){return 0; }; // temporarily
int32_t getNumTopics ( ){return m_numGigabitInfos; };
2013-08-03 00:12:24 +04:00
// advanced gigabit/topic attributes
2014-11-11 01:45:11 +03:00
int32_t getTopicDocIdCount(int32_t i){return m_gigabitInfos[i].m_numDocs; };
int32_t getTopicPop(int32_t i){return m_gigabitInfos[i].m_pop; };
2013-08-03 00:12:24 +04:00
// intersectGigabits() in Msg40.cpp fills these in when we call it
// from Msg40.cpp
GigabitInfo m_gigabitInfos[50];
2014-11-11 01:45:11 +03:00
int32_t m_numGigabitInfos;
2013-08-03 00:12:24 +04:00
*/
2014-11-11 01:45:11 +03:00
int32_t getNumGigabits (){return m_gigabitBuf.length()/sizeof(Gigabit);};
Gigabit *getGigabit ( int32_t i ) {
2013-08-03 00:12:24 +04:00
Gigabit *gbs = (Gigabit *)m_gigabitBuf.getBufStart();
return &gbs[i];
};
2014-10-30 22:36:39 +03:00
int64_t *getDocIdPtr() { return m_msg3a.m_docIds; }
2013-08-03 00:12:24 +04:00
// Msg39 and all Msg20s must use the same clock timestamp
time_t m_nowUTC;
2014-11-11 01:45:11 +03:00
int32_t m_lastHeartbeat;
2014-11-11 01:45:11 +03:00
bool printSearchResult9 ( int32_t ix , int32_t *numPrintedSoFar ,
2014-07-01 22:46:01 +04:00
class Msg20Reply *mr ) ;
SafeBuf m_unusedBuf;
2014-11-11 01:45:11 +03:00
int32_t m_numMsg20sOut ;
int32_t m_numMsg20sIn ;
int32_t m_j ;
int32_t m_i ;
bool m_doneWithLookup;
HashTableX m_facetTextTable;
SafeBuf m_facetTextBuf;
2014-07-30 06:32:27 +04:00
bool m_firstTime;
2014-11-11 01:45:11 +03:00
int32_t m_omitCount;
bool printFacetTables ( class SafeBuf *sb ) ;
bool printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
bool lookupFacets ( ) ;
2014-07-30 06:32:27 +04:00
void lookupFacets2 ( ) ;
void gotFacetText ( class Msg20 *msg20 ) ;
class Msg20 *getUnusedMsg20 ( ) ;
HashTableX m_columnTable;
bool printCSVHeaderRow ( class SafeBuf *sb );
2014-11-11 01:45:11 +03:00
bool printJsonItemInCSV ( class State0 *st , int32_t ix );
int32_t m_numCSVColumns;
HashTableX m_dedupTable;
2014-11-11 01:45:11 +03:00
int32_t m_msg3aRecallCnt;
// this goes into msg3a now so we can send multiple msg3as out,
// 1 per collection
//Msg39Request m_r;
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
int32_t m_docsToGet;
int32_t m_docsToGetVisible;
2013-08-03 00:12:24 +04:00
// incoming parameters
void *m_state;
void (* m_callback ) ( void *state );
2014-11-11 01:45:11 +03:00
int32_t m_needFirstReplies;
2013-08-03 00:12:24 +04:00
// max outstanding msg20s
2014-11-11 01:45:11 +03:00
//int32_t m_maxOutstanding;
2013-08-03 00:12:24 +04:00
// # of contiguous msg20 replies we have received (no gaps)
2014-11-11 01:45:11 +03:00
//int32_t m_numContiguous;
2013-08-03 00:12:24 +04:00
// of thos contiguous results, how many are visible? (unfiltered,.etc)
2014-11-11 01:45:11 +03:00
//int32_t m_visibleContiguous;
2013-08-03 00:12:24 +04:00
// . do not uncluster more than this many docids! it slows things down.
// . kind of a HACK until we do it right
2014-11-11 01:45:11 +03:00
int32_t m_unclusterCount;
2013-08-03 00:12:24 +04:00
// how many of the m_numContiguous have been checked for dups?
2014-11-11 01:45:11 +03:00
//int32_t m_numChecked;
2013-08-03 00:12:24 +04:00
// do we have enough visible docids? stop launch msg20s when we do
//bool m_gotEnough;
// a bunch of msg20's for getting summaries/titles/...
Msg20 **m_msg20;
2014-11-11 01:45:11 +03:00
int32_t m_numMsg20s;
2013-08-03 00:12:24 +04:00
char *m_msg20StartBuf;
2014-11-11 01:45:11 +03:00
int32_t m_numToFree;
2013-08-03 00:12:24 +04:00
bool m_hadPrintError ;
2014-11-11 01:45:11 +03:00
int32_t m_numPrinted ;
bool m_printedHeader ;
bool m_printedTail ;
bool m_lastChunk ;
2014-11-11 01:45:11 +03:00
int32_t m_sendsOut ;
int32_t m_sendsIn ;
int32_t m_printi ;
int32_t m_numDisplayed ;
int32_t m_numPrintedSoFar;
int32_t m_socketHadError;
2013-08-03 00:12:24 +04:00
// use msg3a to get docIds
Msg3a m_msg3a;
// use this for getting compressed, cached images of ourselves
Msg17 m_msg17;
char *m_cachePtr;
2014-11-11 01:45:11 +03:00
int32_t m_cacheSize;
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
//int32_t m_maxDocIdsToCompute;
2013-08-03 00:12:24 +04:00
// count summary replies (msg20 replies) we get
2014-11-11 01:45:11 +03:00
int32_t m_numRequests;
int32_t m_numReplies;
2013-08-03 00:12:24 +04:00
// we launched all docids from 0 to m_maxiLaunched
2014-11-11 01:45:11 +03:00
//int32_t m_maxiLaunched;
2013-08-03 00:12:24 +04:00
// true if more results follow these
bool m_moreToCome;
2014-11-11 01:45:11 +03:00
int32_t m_lastProcessedi;
2013-08-03 00:12:24 +04:00
bool m_didSummarySkip;
2013-08-03 00:12:24 +04:00
// a multicast class to send the request
Multicast m_mcast;
// for timing how long to get all summaries
2014-10-30 22:36:39 +03:00
int64_t m_startTime;
2013-08-03 00:12:24 +04:00
// was Msg40 cached? if so, at what time?
bool m_cachedResults;
time_t m_cachedTime;
// gigabits
//Msg24 m_msg24;
// references
//Msg1a m_msg1a;
2014-11-11 01:45:11 +03:00
int32_t m_tasksRemaining;
2013-08-03 00:12:24 +04:00
int32_t m_printCount;
2013-08-03 00:12:24 +04:00
// buffer we deserialize from, allocated by Msg17, but we free it
char *m_buf;
2014-11-11 01:45:11 +03:00
int32_t m_bufMaxSize;
2013-08-03 00:12:24 +04:00
// for holding the msg20s
char *m_buf2;
2014-11-11 01:45:11 +03:00
int32_t m_bufMaxSize2;
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
int32_t m_errno;
2013-08-03 00:12:24 +04:00
// was family filter on and query had dirty words?
bool m_queryCensored;
// did we have dups in the list of docids that we had to remove?
bool m_removedDupContent;
// up to 30 different CR_ values in Msg51.h
2014-11-11 01:45:11 +03:00
int32_t m_filterStats[30];
2013-08-03 00:12:24 +04:00
SearchInput *m_si;
// for topic clustering, saved from CollectionRec
2014-11-11 01:45:11 +03:00
int32_t m_topicSimilarCutoff;
int32_t m_docsToScanForTopics;
2013-08-03 00:12:24 +04:00
// Msg2b for generating a directory
2013-10-03 08:34:21 +04:00
//Msg2b m_msg2b;
2013-08-03 00:12:24 +04:00
bool mergeDocIdsIntoBaseMsg3a();
2014-11-11 01:45:11 +03:00
int32_t m_numCollsToSearch;
class Msg3a **m_msg3aPtrs;
SafeBuf m_msg3aPtrBuf;
2014-11-11 01:45:11 +03:00
int32_t m_num3aRequests;
int32_t m_num3aReplies;
collnum_t m_firstCollnum;
2013-08-03 00:12:24 +04:00
PostQueryRerank m_postQueryRerank;
HashTableT<uint64_t, uint64_t> m_urlTable;
};
#endif