open-source-search-engine/Stats.h
Matt f088e734f6 allow up to 3000 query terms. really we can allow
much more since we are mostly dynamically allocating,
only a few smaller arrays use the 3000 on the stack.
2015-07-10 19:02:30 -06:00

287 lines
8.5 KiB
C++

// Matt Wells, copyright Jul 2002
// . all disk accesses should use this global class to record their stats
// . this class can dump a gif graph to disk in the html directory with
// the filename "Stats.gif"
// . you may add stats to this class, but do not remove them as they may
// be necessary for Statsdb.
#ifndef _STATS_H_
#define _STATS_H_
#include "SafeBuf.h"
#include "UdpProtocol.h" // MAX_MSG_TYPES
//#include "IndexReadInfo.h"
class StatPoint {
public:
int32_t m_numBytes ;
int64_t m_startTime ;
int64_t m_endTime ;
int m_color ;
char m_rdbId ;
char m_type ;
};
#define MAX_POINTS 6000
#define MAX_WIDTH 6
//#define DY 1000 // pixels vertical
#define DY 500 // pixels vertical
#define DX 1000 // pixels across
#define DT (10*1000) // time window, 10 seconds
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
#define STAT_GENERIC 0
#define STAT_QUERY 1
#define MAX_BUCKETS 16
class Stats {
public:
// just clear our points array when we're born
Stats ( ) ;
// we take lower 3 bytes of "color" for the rgb color of this line/pt
void addStat_r ( int32_t numBytes ,
int64_t startTime ,
int64_t endTime ,
int32_t color = 0x00000000 , // black
char type = STAT_GENERIC,
char *fnName = NULL);
// . use -1 to see points from ALL times stored
// . dumps a bar graph
// . each bar represents a stat in time, from inception to completion
// . useful for seeing possible sources of contention
//void dumpGIF ( int64_t startTime = -1 , int64_t endTime = -1 );
void printGraphInHtml ( SafeBuf &sb );
// this graphs:
// 1. stats per second
// 2. avg time of completion
// 3. st dev highway around avg
void dumpPerSecGIF ( char type ,
int64_t start = -1 ,
int64_t end = -1 );
// store the points here
StatPoint m_pts [ MAX_POINTS ];
int32_t m_next;
// don't graph any points that start BEFORE this time because our
// sampling is not accurate because we had to throw points away
// due to the MAX_POINTS limit
//int64_t m_minWindowStartTime;
// . conglomerate scores
// . # seeks is estimated since we don't know the disk fragmentation
//int64_t m_numSeeks;
// sum of these 2 should equal m_numSeeks
//int64_t m_numReads;
//int64_t m_numWrites;
// total bytes read and written since server was started
//int64_t m_bytesRead;
//int64_t m_bytesWritten;
void addPoint ( StatPoint **points ,
int32_t *numPoints ,
StatPoint *p ) ;
//queries
void logAvgQueryTime(int64_t startTime);
void calcQueryStats();
void clearMsgStats();
int64_t m_startTime;
int64_t m_upTime;
int64_t m_lastQueryLogTime;
int64_t m_queryTimes;
float m_avgQueriesPerSec;
int32_t m_numQueries;
int32_t m_numSuccess;
int32_t m_numFails;
int32_t m_totalNumQueries;
int32_t m_totalNumSuccess;
int32_t m_totalNumFails;
float m_avgQueryTime;
float m_successRate;
//int64_t m_readSignals;
//int64_t m_writeSignals;
// set in BigFile.cpp
int32_t m_slowDiskReads;
// when we have to close a socket because too many are open.. count it
int32_t m_closedSockets;
// keep track of last 1000 urls spidered for reporting spider stats
void addSpiderPoint ( int32_t errCode, bool isNew ) ;
int32_t m_spiderSample;
int32_t m_spiderErrors;
int32_t m_spiderNew;
int32_t m_spiderErrorsNew;
int32_t m_errCodes[1000];
char m_isSampleNew[1000];
int64_t m_totalSpiderSuccessNew;
int64_t m_totalSpiderErrorsNew;
int64_t m_totalSpiderSuccessOld;
int64_t m_totalSpiderErrorsOld;
int64_t m_allErrorsNew[65536];
int64_t m_allErrorsOld[65536];
time_t m_uptimeStart;
// Deduped stats
int32_t m_msg3aRecallCnt;
// when we have to re-requrest docid lists for each split, inc this one
int32_t m_msg3aSlowRecalls;
// when we just request more docids from the same tier
int32_t m_msg3aFastRecalls;
// how many resolutions did we get on each tier
//int32_t m_tierHits [MAX_TIERS];
//int64_t m_tierTimes[MAX_TIERS];
// how many searches did not get enough results?
int32_t m_tier2Misses;
// one count for each CR_* defined in Msg51.h
int32_t m_filterStats[30];
//int32_t m_totalDedupCand;
//int32_t m_dedupedCand;
//int32_t m_bannedDups;
//int32_t m_bigHackDups;
//int32_t m_summaryDups;
//int32_t m_contentDups;
//int32_t m_clusteredTier1;
//int32_t m_clusteredTier2;
//int32_t m_errored;
int32_t m_msg3aRecalls[6];
SafeBuf m_keyCols;
//int32_t m_numTermsVsTier[14][MAX_TIERS];
//int32_t m_termsVsTierExp[14][MAX_TIERS][7];
// use m_start so we know what msg stats to clear with memset
char m_start;
// and stats for how long to send a request or reply from
// start to finish. the first "2" is the niceness, 0 or 1, and
// the second "2" is 0 if sending a reply and 1 if sending a request.
int64_t m_msgTotalOfSendTimes [MAX_MSG_TYPES][2][2];
int64_t m_msgTotalSent [MAX_MSG_TYPES][2][2];
int64_t m_msgTotalSentByTime [MAX_MSG_TYPES][2][2][MAX_BUCKETS];
// how long we wait after receiving the request until handler is called
int64_t m_msgTotalOfQueuedTimes [MAX_MSG_TYPES][2];
int64_t m_msgTotalQueued [MAX_MSG_TYPES][2];
int64_t m_msgTotalQueuedByTime [MAX_MSG_TYPES][2][MAX_BUCKETS];
// msg stats, from UdpServer.cpp and UdpSlot.cpp. all times in
// milliseconds. the second index is the niceness, 0 or 1. Buckets
// are for breaking down times by the time range: 0-1ms, 2-3ms,
// 4-7ms, 8-15ms, ...
int64_t m_msgTotalOfHandlerTimes [MAX_MSG_TYPES][2];
int64_t m_msgTotalHandlersCalled [MAX_MSG_TYPES][2];
int64_t m_msgTotalHandlersByTime [MAX_MSG_TYPES][2][MAX_BUCKETS];
int32_t m_packetsIn [MAX_MSG_TYPES][2];
int32_t m_packetsOut [MAX_MSG_TYPES][2];
int32_t m_acksIn [MAX_MSG_TYPES][2];
int32_t m_acksOut [MAX_MSG_TYPES][2];
int32_t m_reroutes [MAX_MSG_TYPES][2];
int32_t m_errors [MAX_MSG_TYPES][2];
int32_t m_timeouts [MAX_MSG_TYPES][2]; // specific error
int32_t m_nomem [MAX_MSG_TYPES][2]; // specific error
int32_t m_dropped [MAX_MSG_TYPES][2]; // dropped dgram
int32_t m_cancelRead [MAX_MSG_TYPES][2]; // dropped dgram
int32_t m_parsingInconsistencies;
int32_t m_totalOverflows;
// count ip and domain hammer for Msg13.cpp here
//int32_t m_numBackoffs;
// used by msg39
int32_t m_recomputeCacheMissess;
// if the msg3a advances to the next tier, of course, it will be
// a cache miss, so don't count those, they are justified
// recomputeCacheMisses
int32_t m_icacheTierJumps;
// when SpiderLoop calls SpiderCache::getSpiderRec() how many times
// does it actually get back a url to spider? and how many times does
// it miss (not get a url to spider)?
//int32_t m_spiderUrlsHit;
//int32_t m_spiderUrlsMissed;
int32_t m_compressedBytesIn;
int32_t m_uncompressedBytesIn;
// spider compression proxy stats set in Msg13.cpp
int64_t m_compressAllDocs;
int64_t m_compressAllBytesIn;
int64_t m_compressAllBytesOut;
int64_t m_compressMimeErrorDocs;
int64_t m_compressMimeErrorBytesIn;
int64_t m_compressMimeErrorBytesOut;
int64_t m_compressUnchangedDocs;
int64_t m_compressUnchangedBytesIn;
int64_t m_compressUnchangedBytesOut;
int64_t m_compressBadContentDocs;
int64_t m_compressBadContentBytesIn;
int64_t m_compressBadContentBytesOut;
int64_t m_compressBadLangDocs;
int64_t m_compressBadLangBytesIn;
int64_t m_compressBadLangBytesOut;
int64_t m_compressBadCharsetDocs;
int64_t m_compressBadCharsetBytesIn;
int64_t m_compressBadCharsetBytesOut;
int64_t m_compressBadCTypeDocs;
int64_t m_compressBadCTypeBytesIn;
int64_t m_compressBadCTypeBytesOut;
int64_t m_compressHasIframeDocs;
int64_t m_compressHasIframeBytesIn;
int64_t m_compressHasIframeBytesOut;
int64_t m_compressPlainLinkDocs;
int64_t m_compressPlainLinkBytesIn;
int64_t m_compressPlainLinkBytesOut;
int64_t m_compressEmptyLinkDocs;
int64_t m_compressEmptyLinkBytesIn;
int64_t m_compressEmptyLinkBytesOut;
int64_t m_compressFullPageDocs;
int64_t m_compressFullPageBytesIn;
int64_t m_compressFullPageBytesOut;
int64_t m_compressHasDateDocs;
int64_t m_compressHasDateBytesIn;
int64_t m_compressHasDateBytesOut;
int64_t m_compressRobotsTxtDocs;
int64_t m_compressRobotsTxtBytesIn;
int64_t m_compressRobotsTxtBytesOut;
int64_t m_compressUnknownTypeDocs;
int64_t m_compressUnknownTypeBytesIn;
int64_t m_compressUnknownTypeBytesOut;
// use m_end so we know what msg stats to clear with memset
char m_end;
//bool m_gotLock;
};
extern class Stats g_stats;
#endif