open-source-search-engine/RdbBase.h

// Matt Wells, copyright Sep 2000

// . the core database class, we have one of these for each collection and
//   pointers to them are stored in the new "Rdb" class
// . records stored on disk ordered by lowest key first
// . allows simple key-based record retrieval
// . uses non-blocking i/o with callbacks
// . thread UNsafe for maximum speed
// . has a "groupMask" that allows us to split db between multiple Rdb's
// . uses BigFile class to support files bigger than 2gb
// . can instantly delete records in memory
// . deletes records on disk by re-writing them to disk with key low bit CLEAR
// . we merge files (non-blocking operation) into 1 file to save on disk seeks
// . adding a record with the same key as an existing one replaces it,
//   unless you set dedup to false, which is not yet supported
// . when mem is low dumps records from tree to disk, frees as it dumps
// . has a key-to-diskOffset/dataSize map in memory (good for small records)
//   for mapping a file of records on disk
// . this key-to-offset map takes up sizeof(key_t)+  bytes per disk page
// . we can map .8 gigs of disk with 1 meg of mem (using page size of 8k)
// . memory is only freed by the Mem.h class when it finds it's running out
// . addRecord will only return false if there is a lack of memory
// . we can only dump the RdbTree to disk if it's using at least "minMem" or
//   we are shutting down and Rdb::close() was called

#ifndef _RDBBASE_H_
#define _RDBBASE_H_

#include "Conf.h"
#include "Mem.h"
#include "RdbScan.h"
#include "RdbDump.h"
#include "RdbTree.h"
#include "RdbBuckets.h"
#include "RdbCache.h"
#include "RdbMerge.h"
#include "Msg3.h"               // MAX_RDB_FILES definition
#include "Dir.h"
#include "RdbMem.h"

// how many rdbs are in "urgent merge" mode?
extern int32_t g_numUrgentMerges;

// . global RdbMerge instances, defined outside this header
// . NOTE(review): presumably two exist so that two rdbs can be merging at
//   the same time -- confirm against the definitions and their callers
extern RdbMerge g_merge;
extern RdbMerge g_merge2;

// . per-collection database core: owns the arrays of on-disk files
//   (m_files[]) and their in-memory maps (m_maps[]) and declares the
//   merge-related entry points (attemptMerge(), incorporateMerge(), ...)
// . the in-memory tree/buckets and the dump object are NOT owned here, we
//   only hold the pointers handed to init()
// . all members are public (see the commented-out "private:" below)
class RdbBase {

 public:

	 RdbBase ( );
	~RdbBase ( );

	// . the more memory the tree has the less file merging required
	// . when a slot's key is ANDed with "groupMask" the result must equal
	//   "groupId" in order to be in this database
	// . "minMem" is how much mem must be used before considering dumping
	//   the RdbTree (our unbalanced btree) to disk
	// . you can fix the dataSize of all records in this rdb by setting
	//   "fixedDataSize"
	// . if "maskKeyLowLong" we mask the lower int32_t of the key and then
	//   compare that to the groupId to see if the record belongs
	// . this is currently just used by Spiderdb
	// . otherwise, we mask the high int32_t in the key
	// . NOTE(review): several of the bullets above describe parameters
	//   (groupMask, groupId, minMem, maskKeyLowLong) that are commented
	//   out of, or absent from, the current signature
	bool init ( char  *dir             , // working directory
		    char  *dbname          , // "indexdb","tagdb",...
		    bool   dedup           , //= true ,
		    int32_t   fixedDataSize   , //= -1   ,
		    //uint32_t   groupMask       , //=  0   ,
		    //uint32_t   groupId         , //=  0   ,
		    int32_t   minToMerge      , //, //=  2   ,
		    //int32_t   maxTreeMem      , //=  1024*1024*32 ,
		    //int32_t   maxTreeNodes    ,
		    //bool   isTreeBalanced  ,
		    //int32_t   maxCacheMem     , //=  1024*1024*5 );
		    //int32_t   maxCacheNodes   ,
		    bool   useHalfKeys     ,
		    char   keySize         ,
		    int32_t   pageSize        ,
		    char                *coll    ,
		    collnum_t            collnum ,
		    RdbTree             *tree    ,
		    RdbBuckets          *buckets ,
		    RdbDump             *dump    ,
		    class Rdb           *rdb    ,
		    class DiskPageCache *pc = NULL ,
		    bool                 isTitledb = false , // use fileIds2[]?
		    bool                 preloadDiskPageCache = false ,
		    bool                 biasDiskPageCache    = false );

	void closeMaps ( bool urgent );
	void saveMaps  ( bool useThread );

	// . frees up all the memory and closes all files
	// . suspends any current merge (saves state to disk)
	// . calls reset() for each file
	// . will cause any open map files to dump
	// . will dump tables to backup or store
	// . calls close on each file
	// . returns false if blocked, true otherwise
	// . sets errno on error
	//bool close ( void *state ,
	//	     void (* callback)(void *state ) ,
	//	     bool urgent ,
	//	     bool exitAfterClosing );
	//bool close ( ) { return close ( NULL , NULL ); };
	// used by PageMaster.cpp to check to see if all rdb's are closed yet
	//bool isClosed ( ) { return m_isClosed; };

	// . returns false and sets g_errno on error
	// . caller should retry later on g_errno of ENOMEM or ETRYAGAIN
	// . returns the node # in the tree it added the record to
	// . key low bit must be set (otherwise it indicates a delete)
	//bool addRecord ( key_t &key, char *data, int32_t dataSize );

	// returns false if no room in tree or m_mem for a list to add
	//bool hasRoom ( RdbList *list );

	// . returns false on error and sets errno
	// . return true on success
	// . if we can't handle all records in list we don't add any and
	//   set errno to ETRYAGAIN or ENOMEM
	// . we copy all data so you can free your list when we're done
	//bool addList ( RdbList *list );

	// . add a record without any data, just a key (faster)
	// . returns the node # in the tree it added the record to
	//int32_t addKey ( key_t &key );

	// . uses the bogus data pointed to by "m_dummy" for record's data
	// . we clear the key low bit to signal a delete
	// . returns false and sets errno on error
	//bool deleteRecord ( key_t &key ) ;

	// TODO: this needs to support
	// . we split our data over rdbs across the network based on masks
	// . we now just use g_conf.m_groupMask, g_hostdb.m_groupId, ...
	//int32_t getGroupId ( key_t &key ) { return (key.n1 & m_groupMask); };
	//uint32_t getGroupMask ( ) { return m_groupMask; };
	//uint32_t getGroupId   ( ) { return m_groupId  ; };

	// . when a slot's key is ANDed with "groupMask" the result must equal
	//   "groupId" in order to be in this database
	// . used to split data across multiple rdbs
	//void setMask ( uint32_t groupMask , uint32_t groupId );

	// get the directory name where this rdb stores its files
	char *getDir ( ) { return m_dir.getDirname(); }
	// the stripe directory comes from the global conf, not from this rdb
	char *getStripeDir ( ) { return g_conf.m_stripeDir; }

	// -1 means the data size is variable (see getRecSize())
	int32_t getFixedDataSize ( ) { return m_fixedDataSize; }

	bool useHalfKeys ( ) { return m_useHalfKeys; }

	//RdbTree    *getTree    ( ) { return &m_tree; };
	//RdbCache   *getCache   ( ) { return &m_cache; };

	// raw access to the per-file arrays; m_numFiles entries are in use
	RdbMap   **getMaps  ( ) { return m_maps; }
	BigFile  **getFiles ( ) { return m_files; }

	// . per-file accessors, "n" is an index into the m_files[] array
	// . no bounds checking is done on "n"
	BigFile   *getFile   ( int32_t n ) { return m_files   [n]; }
	int32_t       getFileId ( int32_t n ) { return m_fileIds [n]; }
	int32_t       getFileId2( int32_t n ) { return m_fileIds2[n]; }
	RdbMap    *getMap    ( int32_t n ) { return m_maps    [n]; }

	int32_t getFileNumFromId  ( int32_t id  ) ; // for converting old titledbs
	int32_t getFileNumFromId2 ( int32_t id2 ) ; // map tfn to real file num (rfn)

	//RdbMem    *getRdbMem () { return &m_mem; };

	float getPercentNegativeRecsOnDisk ( int64_t *totalArg ) ;

	// how much mem is alloced for our maps?
	int64_t getMapMemAlloced ();

	int32_t       getNumFiles ( ) { return m_numFiles; }

	// sum of all parts of all big files
	int32_t      getNumSmallFiles ( ) ;
	int64_t getDiskSpaceUsed ( );

	// . size of a whole record: key size plus the fixed data size
	// . returns -1 if variable (variable dataSize)
	int32_t getRecSize ( ) {
		if ( m_fixedDataSize == -1 ) return -1;
		//return sizeof(key_t) + m_fixedDataSize; };
		return m_ks + m_fixedDataSize;
	}

	// use the maps and tree to estimate the size of this list
	//int32_t getListSize ( key_t startKey ,key_t endKey , key_t *maxKey ,
	int64_t getListSize ( char *startKey ,char *endKey , char *maxKey ,
			        int64_t oldTruncationLimit ) ;

	// positive minus negative
	int64_t getNumTotalRecs ( ) ;

	int64_t getNumRecsOnDisk ( );

	int64_t getNumGlobalRecs ( );

	/*
	// used for keeping track of stats
	void      didSeek       (            ) { m_numSeeks++; };
	void      didRead       ( int32_t bytes ) { m_numRead += bytes; };
	int64_t getNumSeeks   (            ) { return m_numSeeks; };
	int64_t getNumRead    (            ) { return m_numRead ; };

	// net stats for "get" requests
	void      readRequestGet ( int32_t bytes ) {
		m_numReqsGet++    ; m_numNetReadGet += bytes; };
	void      sentReplyGet     ( int32_t bytes ) {
		m_numRepliesGet++ ; m_numNetSentGet += bytes; };
	int64_t getNumRequestsGet ( ) { return m_numReqsGet;    };
	int64_t getNetReadGet     ( ) { return m_numNetReadGet; };
	int64_t getNumRepliesGet  ( ) { return m_numRepliesGet; };
	int64_t getNetSentGet     ( ) { return m_numNetSentGet; };

	// net stats for "add" requests
	void      readRequestAdd ( int32_t bytes ) {
		m_numReqsAdd++    ; m_numNetReadAdd += bytes; };
	void      sentReplyAdd     ( int32_t bytes ) {
		m_numRepliesAdd++ ; m_numNetSentAdd += bytes; };
	int64_t getNumRequestsAdd ( ) { return m_numReqsAdd;    };
	int64_t getNetReadAdd     ( ) { return m_numNetReadAdd; };
	int64_t getNumRepliesAdd  ( ) { return m_numRepliesAdd; };
	int64_t getNetSentAdd     ( ) { return m_numNetSentAdd; };

	// used by main.cpp to periodically save us if we haven't dumped
	// in a while
	int64_t getLastWriteTime   ( ) { return m_lastWrite; };
	*/

	// private:

	// . try to start a merge of our files
	// . "minToMergeOverride" of -1 means to not override m_minToMerge
	void attemptMerge ( int32_t niceness , bool forceMergeAll ,
			    bool doLog = true ,
			    // -1 means to not override it
			    int32_t minToMergeOverride = -1 );

	bool gotTokenForDump  ( ) ;
	void gotTokenForMerge ( ) ;

	// called after merge completed
	bool incorporateMerge ( );

	// . you'll lose your data in this class if you call this
	void reset();

	// . load the tree named "saved.dat", keys must be out of order because
	//   tree is not balanced
	//bool loadTree ( ) ;

	// . write out tree to a file with keys in order
	// . only shift.cpp/reindex.cpp programs set niceness to 0
	//bool dumpTree ( int32_t niceness ); //= MAX_NICENESS );

	// . set the m_files, m_maps, m_fileIds arrays and m_numFiles
	bool setFiles ( ) ;

	// . called when done saving a tree to disk (keys not ordered)
	//void doneSaving ( ) ;

	// . called when we've dumped the tree to disk w/ keys ordered
	//void doneDumping ( );

	void verifyDiskPageCache ( );

	bool verifyFileSharding ( );

	// . add a (new) file to the m_files/m_maps/m_fileIds arrays
	// . both return array position we added it to
	// . both return -1 and set errno on error
	int32_t addFile     ( int32_t fileId, bool isNew, int32_t mergeNum, int32_t id2 ,
			   bool converting = false ) ;
	int32_t addNewFile  ( int32_t id2 ) ;
	//int32_t getAvailId2 ( ); // used only by titledb

	// used by the high priority udp server to suspend merging for ALL
	// rdb's since we share a common merge class, s_merge
	//void suspendAllMerges ( ) ;
	// resume ANY merges
	//void resumeAllMerges ( ) ;

	//bool needsDump ( );

	// these are used by Msg34 class for computing load on a machine
	bool isMerging ( ) { return m_isMerging; }
	// dumping state lives in the RdbDump handed to init()
	bool isDumping ( ) { return m_dump->isDumping(); }

	bool hasMergeFile ( ) { return m_hasMergeFile; }

	// . used for translating titledb file # 255 (as read from new tfndb)
	//   into the real file number
	// . yields -1 when there are no files
	int32_t getNewestFileNum ( ) { return m_numFiles - 1; }

	// Msg22 needs the merge info so if the title file # of a read we are
	// doing is being merged, we have to include the start merge file num
	int32_t      getMergeStartFileNum ( ) { return m_mergeStartFileNum; }
	int32_t      getMergeNumFiles     ( ) { return m_numFilesToMerge; }

	// used by Sync.cpp to convert a file name to a file number in m_files
	int32_t getFileNumFromName ( char *filename );

	// bury m_files[] in [a,b)
	void buryFiles ( int32_t a , int32_t b );

	void doneWrapper2 ( ) ;
	void doneWrapper4 ( ) ;
	// NOTE(review): scratch state, presumably shared by the doneWrapper*()
	// callbacks above -- confirm in the implementation file
	int32_t m_x;
	int32_t m_a;

	// PageRepair indirectly calls this to move the map and data of this
	// rdb into the trash subdir after renaming them, because they will
	// be replaced by the rebuilt files.
	// moveToDir() is just an alias that forwards to moveToTrash().
	bool moveToDir   ( char *dstDir ) { return moveToTrash ( dstDir ); }
	bool moveToTrash ( char *dstDir ) ;
	// PageRepair indirectly calls this to rename the map and data files
	// of a secondary/rebuilt rdb to the filenames of the primary rdb.
	// after that, RdbBase::setFiles() is called to reload them into
	// the primary rdb. this is called after moveToTrash() is called for
	// the primary rdb.
	bool removeRebuildFromFilenames ( ) ;
	bool removeRebuildFromFilename  ( BigFile *f ) ;

	// keep a copy of these here so merge can use them to kick out
	// records whose key when, ANDed w/ m_groupMask, equals
	// m_groupId
	//uint32_t  m_groupMask;
	//uint32_t  m_groupId;

	// . we try to minimize the number of files to minimize disk seeks
	// . records that end up as not found will hit all these files
	// . when we get "m_minToMerge" or more files a merge kicks in
	// . TODO: merge should combine just the smaller files... kinda
	// . files are sorted by fileId
	// . older files are listed first (lower fileIds)
	// . filenames should include the directory (full filenames)
	// . TODO: RdbMgr should control what rdb gets merged?
	// . the four arrays are parallel: index n in each describes file n
	BigFile  *m_files     [ MAX_RDB_FILES ];
	int32_t      m_fileIds   [ MAX_RDB_FILES ];
	int32_t      m_fileIds2  [ MAX_RDB_FILES ]; // for titledb/tfndb linking
	RdbMap   *m_maps      [ MAX_RDB_FILES ];
	int32_t      m_numFiles;

	// this class contains a ptr to us
	class Rdb           *m_rdb;

	bool      m_dedup;
	// -1 means records have a variable data size
	int32_t      m_fixedDataSize;

	Dir       m_dir;
	char      m_dbname [32];
	int32_t      m_dbnameLen;

	char      *m_coll;
	collnum_t  m_collnum;

	bool m_didRepair;

	//RdbCache  m_cache;
	// for storing records in memory
	RdbTree    *m_tree;
	RdbBuckets *m_buckets;
	// for dumping a table to an rdb file
	RdbDump    *m_dump;
	// memory for us to use to avoid calling malloc()/mdup()/...
	//RdbMem    m_mem;

	// . this is now static in Rdb.cpp
	// . for merging many rdb files into one
	// . no we brought it back so tfndb can merge while titledb is merging
	//RdbMerge  m_merge;

	//BigFile   m_saveFile; // for saving the tree
	//bool      m_isClosing;
	//bool      m_isClosed;
	//bool      m_haveSavedFile; // we only unlink this file when we dump

	// this callback called when close is complete
	//void     *m_closeState;
	//void    (* m_closeCallback) (void *state );

	int32_t      m_maxTreeMem ; // max mem tree can use, dump at 90% of this

	int32_t      m_minToMergeArg;
	int32_t      m_minToMerge;  // need at least this many files b4 merging
	int32_t      m_absMaxFiles;
	int32_t      m_numFilesToMerge   ;
	int32_t      m_mergeStartFileNum ;

	// a dummy data string for deleting records when m_fixedDataSize > 0
	//char     *m_dummy;
	//int32_t      m_dummySize ; // size of that dummy data
	//int32_t      m_delRecSize; // size of the whole delete record

	/*
	// for keeping stats
	int64_t m_numSeeks;
	int64_t m_numRead;
	// network request/reply info for get requests
	int64_t m_numReqsGet    ;
	int64_t m_numNetReadGet ;
	int64_t m_numRepliesGet ;
	int64_t m_numNetSentGet ;
	// network request/reply info for add requests
	int64_t m_numReqsAdd    ;
	int64_t m_numNetReadAdd ;
	int64_t m_numRepliesAdd ;
	int64_t m_numNetSentAdd ;
	*/

	// should our next merge in waiting force itself?
	bool      m_nextMergeForced;

	// do we need to dump to disk?
	//bool      m_needsSave;

	// . when we dump list to an rdb file, can we use half keys?
	// . currently exclusively used by indexdb
	bool      m_useHalfKeys;

	// key size
	char      m_ks;

	bool m_checkedForMerge;

	int32_t      m_pageSize;

	// are we waiting on another merge/dump to complete before our turn?
	bool      m_inWaiting;

	// . is our merge urgent? (if so, it will starve spider disk reads)
	// . also see Threads.cpp for the starvation
	bool      m_mergeUrgent;

	// are we saving the tree urgently? like we cored...
	//bool      m_urgent;
	// after saving the tree in call to Rdb::close() should the tree
	// remain closed to writes?
	//bool      m_isReallyClosing;

	// . niceness level (NOTE(review): presumably saved from the
	//   "niceness" argument of attemptMerge() -- confirm in the .cpp)
	// . kept as an integer like everywhere else niceness is passed; a
	//   bool would collapse every non-zero level to 1
	int32_t   m_niceness;

	//bool      m_waitingForTokenForDump ;
	bool      m_waitingForTokenForMerge;

	// we now determine when in merge mode
	bool      m_isMerging;

	// have we created the merge file?
	bool      m_hasMergeFile;

	// rec counts for files being merged
	int64_t m_numPos ;
	int64_t m_numNeg ;

	// so only one save thread launches at a time
	//bool m_isSaving;

	class DiskPageCache *m_pc;

	bool m_isTitledb;

	int32_t m_numThreads;

	bool m_isUnlinking;

	// filename of merge file for passing to g_sync to unlink it from there
	char m_oldname [ 256 ];

	//BigFile m_dummyFile;

	int64_t m_lastWrite;

	// NOTE(review): presumably remembers the "doLog" argument of
	// attemptMerge() -- confirm in the implementation file
	char m_doLog;
};

// . global thread counter, defined outside this header
// . NOTE(review): what exactly it counts is not visible from here -- confirm
//   at its definition
extern int32_t g_numThreads;

// . global dump mode flag, defined outside this header
// . NOTE(review): the meaning of its values is not visible from here
extern char g_dumpMode;

#endif