open-source-search-engine/DiskPageCache.cpp
mwells 87285ba3cd use gbmemcpy not memcpy so we can get profiler working again
since memcpy can't be interrupted and backtrace() called.
2015-01-13 12:25:42 -07:00

1586 lines
52 KiB
C++

#undef _XOPEN_SOURCE // needed for pread and pwrite
#define _XOPEN_SOURCE 500
#include "gb-include.h"
#include "DiskPageCache.h"
#include "RdbMap.h" // GB_PAGE_SIZE
#include "Indexdb.h"
#include "Profiler.h"
// types.h uses key_t type that shmget uses
//#undef key_t
/*
#ifdef GBUSESHM
#include <sys/ipc.h> // shmget()
#include <sys/shm.h> // shmget()
#endif
*/
// FORMAT of a MEMORY PAGE representing a DISK PAGE
//
// HEADER:
//
// bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb # of disk data bytes stored in this page
// ffffffff ffffffff ffffffff ffffffff Offset into memory page they are stored
// pppppppp pppppppp pppppppp pppppppp Offset of prev mem page in linked list
// nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn Offset of next mem page in linked list
// dddddddd dddddddd dddddddd dddddddd Disk page # mem page is mapping.
// vvvvvvvv vvvvvvvv vvvvvvvv vvvvvvvv vfd of file page is mapping
//
// DDDDDDDD ........ raw disk data at that page...
// offsets in bytes in the header each entry has.
// should total HEADERSIZE bytes.
// # of disk data bytes stored in this memory page
#define OFF_SIZE 0
// offset into the page's data area at which those bytes start
#define OFF_SKIP (int)(sizeof(int32_t))
// memory offset of the previous page in the linked list (-1 if none)
#define OFF_PREV (int)(sizeof(int32_t)*2)
// memory offset of the next page in the linked list (-1 if none)
#define OFF_NEXT (int)(sizeof(int32_t)*3)
// disk page # this memory page is caching
#define OFF_DISKPAGENUM (int)(sizeof(int32_t)*4)
// vfd of the file that disk page belongs to
#define OFF_VFD (int)(sizeof(int32_t)*5)
// . total size of the header, i.e. the six int32_t fields above
// . store the disk data itself into the page at this offset
#define HEADERSIZE (int)(sizeof(int32_t)*6)
DiskPageCache::DiskPageCache () {
	// reset() frees whatever these members reference, so they have to
	// say "nothing allocated" before it runs for the first time
	m_availMemOff = NULL;
	m_numPageSets = 0;
	// . a db may pass a DiskPageCache that was never init()'ed to a
	//   BigFile, and BigFile::close calls DiskPageCache::rmVfd() on it
	// . make sure every per-vfd map pointer is NULL so that does not
	//   core on garbage
	int32_t vfd = MAX_NUM_VFDS2;
	while ( vfd-- > 0 )
		m_memOffFromDiskPage[vfd] = NULL;
	// everything else gets its default value here
	reset();
}
DiskPageCache::~DiskPageCache() {
	// reset() releases the page sets, the per-file maps and the
	// list of available memory offsets
	this->reset();
}
/*
#ifdef GBUSESHM
static char *s_mem = NULL;
static int s_shmid = -1;
#endif
*/
// . free all memory held by the cache and return every member that tracks
//   it to its default value
// . the cache is left enabled but with m_maxMem of 0, so getPages() and
//   addPages() do nothing until init() is called again
void DiskPageCache::reset() {
	// only announce the reset if there is actually a cache to drop
	if ( m_numPageSets > 0 )
		log("db: resetting page cache for %s",m_dbname);
	// . release the page sets
	// . "m_pageSet[]" holds the actual memory buffers for the cached
	//   disk pages; they are allocated one at a time, like pools
	int32_t set = 0;
	while ( set < m_numPageSets ) {
		mfree ( m_pageSet[set], m_pageSetSize[set], "DiskPageCache");
		m_pageSet    [set] = NULL;
		m_pageSetSize[set] = 0;
		set++;
	}
	// . release the per-file maps
	// . m_memOffFromDiskPage[vfd][DISKPAGENUM] -> MEMPAGEOFFSET
	// . each map has one int32_t per page the file may have
	for ( int32_t vfd = 0 ; vfd < MAX_NUM_VFDS2 ; vfd++ ) {
		// this vfd has no map
		if ( m_memOffFromDiskPage[vfd] == NULL ) continue;
		int32_t mapBytes = m_maxPagesInFile[vfd] * sizeof(int32_t);
		mfree ( m_memOffFromDiskPage[vfd] , mapBytes ,
			"DiskPageCache" );
		m_memOffFromDiskPage[vfd] = NULL;
	}
	// . release the list of available (recyclable) memory pages
	// . it holds m_numAvailMemOffs memory offsets, with room for
	//   m_maxAvailMemOffs of them
	// . kinda like m_memOffFromDiskPage[] but that one is for used pages
	if ( m_availMemOff != NULL ) {
		int32_t listBytes = m_maxAvailMemOffs * sizeof(int32_t);
		mfree ( m_availMemOff , listBytes , "DiskPageCache" );
	}
	// NOTE: the shared memory (GBUSESHM) teardown that used to live
	// here, shmdt() of the attached segment followed by
	// shmctl(IPC_RMID) on every shmid, is disabled along with the
	// rest of the shared memory support.

	// memory accounting
	m_maxMem          = 0;
	m_memAlloced      = 0;
	m_numPageSets     = 0;
	m_nextMemOff      = 0;
	m_upperMemOff     = 0;
	// list of available memory pages is empty
	m_availMemOff     = NULL;
	m_numAvailMemOffs = 0;
	m_maxAvailMemOffs = 0;
	// linked list of used pages is empty
	m_headOff         = -1;
	m_tailOff         = -1;
	// misc
	m_enabled         = true;
	m_nexti           = 0;
}
// . (re)initialize the cache for one rdb and allocate its memory
// . "dbname" is used for logging and for the malloc tag. only its first
//   8 chars are kept. it may be NULL, in which case the name is empty.
// . "rdbId" selects which g_conf.m_useDiskPageCache* switch can turn this
//   cache off at run time. rdbs without such a switch are always on.
// . "maxMem" is how many bytes to use for caching. it may go over by like
//   2% for header information. getPages() and addPages() do nothing if 0.
// . "pageSize" is the size in bytes of one disk page
// . "useRAMDisk" is currently ignored: the RAM disk and shared memory
//   back ends it used to select are disabled
// . "minimizeDiskSeeks" is stored in m_minimizeDiskSeeks. addPage() then
//   caps each file at m_maxPagesPerFile[vfd] cached pages and never steals
//   the tail page
// . returns true on success. if growCache() fails it logs the error from
//   g_errno and returns the value of log() (false, presumably).
bool DiskPageCache::init ( const char *dbname ,
			   char rdbId,
			   int32_t maxMem ,
			   int32_t pageSize,
			   bool useRAMDisk,
			   bool minimizeDiskSeeks ) {
	// drop whatever a previous init() left behind
	reset();
	// NOTE: seems like we lose data when it prints "Caught add breach".
	// happens while we are dumping i think and somehow the data seems
	// to get lost that we were dumping. setting maxMem to 0 here is the
	// way to stop using the cache until that is fixed.
	//maxMem = 0;
	m_rdbId = rdbId;
	// point m_switch at the master control for this rdb, if it has one
	bool *tog = NULL;
	if (m_rdbId==RDB_INDEXDB   ) tog=&g_conf.m_useDiskPageCacheIndexdb;
	if (m_rdbId==RDB_POSDB     ) tog=&g_conf.m_useDiskPageCachePosdb;
	if (m_rdbId==RDB_DATEDB    ) tog=&g_conf.m_useDiskPageCacheDatedb;
	if (m_rdbId==RDB_TITLEDB   ) tog=&g_conf.m_useDiskPageCacheTitledb;
	if (m_rdbId==RDB_SPIDERDB  ) tog=&g_conf.m_useDiskPageCacheSpiderdb;
	if (m_rdbId==RDB_TFNDB     ) tog=&g_conf.m_useDiskPageCacheTfndb;
	if (m_rdbId==RDB_TAGDB     ) tog=&g_conf.m_useDiskPageCacheTagdb;
	if (m_rdbId==RDB_CLUSTERDB ) tog=&g_conf.m_useDiskPageCacheClusterdb;
	if (m_rdbId==RDB_CATDB     ) tog=&g_conf.m_useDiskPageCacheCatdb;
	if (m_rdbId==RDB_LINKDB    ) tog=&g_conf.m_useDiskPageCacheLinkdb;
	m_switch = tog;
	// NOTE: shared memory support (GBUSESHM) is disabled. it was off by
	// default because it leaks easily (if u Ctrl+C the process), it only
	// supported a single page size, and the shared memory was counted
	// as being part of the process's memory space.

	// . build the malloc tag: "pgcache-" + 8 bytes of db name + NUL
	// . NOTE(review): that is 17 bytes, while the original note here
	//   said the tag must be LESS THAN 16 bytes including the NULL.
	//   confirm m_memTag is big enough.
	char *p = m_memTag;
	gbmemcpy ( p , "pgcache-" , 8 ); p += 8;
	// . strncpy() pads the rest of the 8 byte field with NULs when the
	//   name is shorter than that
	// . treat a NULL dbname as "" so the field never holds stale or
	//   uninitialized bytes, because m_dbname is printed in log messages
	strncpy ( p , dbname ? dbname : "" , 8 );
	// so we know what db we are caching for
	m_dbname = p;
	p += 8;
	// terminates names that are 8 chars or longer
	*p = '\0';
	// . do not use more than this much memory for caching
	// . it may go over by like 2% for header information
	m_maxMem = maxMem ;
	// . a page set holds as many whole pages (header + data) as fit in
	//   128MB so that no page is ever split across two page sets
	int32_t phsize = pageSize + HEADERSIZE;
	m_maxPageSetSize = (((128*1024*1024)/phsize)*phsize);
	m_diskPageSize = pageSize;
	m_minimizeDiskSeeks = minimizeDiskSeeks;
	// we need to keep a count memory of files being cached
	if ( m_minimizeDiskSeeks )
		m_memFree = m_maxMem;
	// NOTE: the RAM disk (initRAMDisk(), indexdb only) is disabled
	// . ask for all of the memory now
	// . addPage() never grows the cache; once the memory is used up it
	//   recycles the tail page instead
	if ( ! growCache ( maxMem ) )
		return log("db: pagecache init failed: %s.",
			   mstrerror(g_errno));
	// success
	return true;
}
// use Linux's ram disk for caching disk pages, in addition to the ram it
// already uses. I would like to be able to pass in a "maxMemForRamDisk" parm
// to its init() function and have it open a single, ram-disk file descriptor
// for writing up to that many bytes.
// then i would like only Indexdb (and later on Datedb) to pass in an 800MB
// "maxMemForRamDisk" value, and, furthermore, i do not want to cache disk
// pages from the indexdb root file, nor, any indexdb file that is larger than
// twice the "maxMemForRamDisk" value (in this case 1.6GB). this will be used
// exclusively for smaller indexdb files to eliminate excessive disk seeks and
// utilize ALL the 4GB of ram in each machine.
// lastly, we need some way to "force" a merge at around midnight when traffic
// is minimal, or when there are 3 or more indexdb files that are less than
// 80% in the indexdb disk page cache. because that means we are starting to
// do a lot of disk seeks.
/*
bool DiskPageCache::initRAMDisk( const char *dbname, int32_t maxMem ){
m_useRAMDisk = true;
if ( !dbname ) {char *xx=NULL; *xx=0;}
// open a file descriptor
char ff [1024];
sprintf ( ff, "/mnt/RAMDisk/%sPageCache", dbname );
// unlink it first
unlink (ff);
m_ramfd = open ( ff, O_RDWR | O_CREAT );
if ( m_ramfd < 0 )
return log ( LOG_WARN,"db: could not open fd in RAMdisk" );
return true;
}
*/
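// . copies as much of the requested read as it can from the page cache
//   into "buf", starting at "diskOffset" and stopping at the first gap
// . returns nothing: "*newOffset" and "*newNumBytes" are set to describe
//   the part of the read that still has to come from disk
// . it will move the used pages to the head of the linked list
// . if *buf is NULL we allocate here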
// . serve as much of a disk read as possible from the page cache
// . "vfd" is the virtual fd of the file. negative means the file has no
//   page map, so nothing can be served.
// . "buf" points to the caller's read buffer ptr. if *buf is NULL a buffer
//   of numBytes+allocOff bytes is allocated on the first cache hit, *buf is
//   set to point allocOff bytes into it, and the allocation is reported
//   through "allocBuf" and "allocSize" so the caller can free it.
// . "numBytes" and "diskOffset" describe the read
// . "newOffset" and "newNumBytes" receive the part of the read that still
//   has to come from disk. they equal diskOffset/numBytes if nothing was
//   served, and *newNumBytes is 0 if everything was.
// . data is only taken from the front of the read and we stop at the first
//   page that is missing or has a hole, so what is left is one contiguous
//   disk read
// . every page we copy from is moved to the head of the linked list
// . m_hits is bumped if the whole read was served, m_misses if we get
//   through all the checks but could not serve it all
void DiskPageCache::getPages ( int32_t vfd ,
			       char **buf ,
			       int32_t numBytes ,
			       int64_t diskOffset ,
			       int32_t *newNumBytes ,
			       int64_t *newOffset ,
			       char **allocBuf ,
			       int32_t *allocSize ,
			       int32_t allocOff ) {
	// return new disk offset, assume unchanged
	*newOffset = diskOffset;
	*newNumBytes = numBytes;
	// return if no pages allowed in page cache
	if ( m_maxMem == 0 ) return;
	// or disabled
	if ( ! m_enabled ) return;
	// disabled at the master controls?
	if ( m_switch && ! *m_switch ) return;
	// . or if we have no map for this file (minimizeDiskSeeks did not
	//   accept the vfd, for instance)
	// . this must be checked no matter what m_minimizeDiskSeeks is,
	//   like addPages() does, because vfd indexes the array below
	if ( vfd < 0 ) return;
	// or if no pages in this vfd
	if ( ! m_memOffFromDiskPage[vfd] ) return;
	// what is the page range of in-memory pages?
	int32_t sp = diskOffset / m_diskPageSize ;
	int32_t ep = (diskOffset + (numBytes-1)) / m_diskPageSize ;
	// . sanity check
	// . we establish the maxPagesInFile when BigFile::open is called
	//   by RdbDump. Rdb.cpp calls m_dump.set with a maxFileSize based on
	//   the mem occupied by the RdbTree. BUT, recs can be added to the
	//   tree WHILE we are dumping, so we end up with a bigger file, and
	//   this disk page cache is not prepared for it!
	if ( ep >= m_maxPagesInFile[vfd] ) {
		log("db: pagecache: Caught get breach. "
		    "ep=%"INT32" max=%"INT32" vfd=%"INT32""
		    , ep,m_maxPagesInFile[vfd] ,vfd);
		return;
	}
	char *bufPtr = *buf;
	char *bufEnd = *buf + numBytes;
	// . our offset into first page on disk (as well as memory page)
	// . do the multiply in 64 bits, sp * m_diskPageSize does not fit
	//   in 32 bits once the offset reaches 2GB
	int32_t start1 = (int32_t)(diskOffset -
				   (int64_t)sp * m_diskPageSize);
	// copy from consecutive pages, beginning with the first one
	while ( sp <= ep ) {
		// map disk page # sp into memory offset, "poff"
		int32_t poff = m_memOffFromDiskPage[vfd][sp];
		// break if we do not have page in memory
		if ( poff < 0 ) break;
		// first 4 bytes of page is how many bytes are used in page
		int32_t size = 0;
		readFromCache( &size, poff, OFF_SIZE, sizeof(int32_t));
		// second set of 4 bytes is offset of data from page boundary
		int32_t skip = 0;
		readFromCache( &skip, poff, OFF_SKIP, sizeof(int32_t));
		// if this mem page data starts AFTER our offset, it is no good
		if ( skip > start1 ) break;
		// adjust size by our page offset, we won't necessarily be
		// starting our read at "skip"
		size -= (start1 - skip);
		// if size is 0 or less all cached data was
		// below our disk offset and is useless
		if ( size <= 0 ) break;
		// promote this memory page in the linked list so the most
		// frequently used pages stay away from the tail
		promotePage ( poff , false );
		// allocate the read buffer if we need to
		if ( ! *buf ) {
			// allocate enough room for allocOff, too
			int32_t need = numBytes + allocOff;
			char *p = (char *) mmalloc ( need,"PageCacheReadBuf" );
			// let FileState know what needs to be freed
			*allocBuf  = p;
			*allocSize = need;
			// if couldn't allocate, return now, what's the point
			if ( ! p ) return;
			// let caller know his new read buffer
			*buf = p + allocOff;
			// assign the ptrs now
			bufPtr = *buf ;
			bufEnd = *buf + numBytes;
		}
		// don't store more than asked for
		if ( bufPtr + size > bufEnd ) size = bufEnd - bufPtr;
		// . read in "size" bytes from memory into "bufPtr"
		// . start reading at an offset of "HEADERSIZE+start1" into
		//   the memory page
		readFromCache(bufPtr, poff, HEADERSIZE + start1 , size);
		bufPtr       += size;
		*newOffset   += size;
		*newNumBytes -= size;
		// return if we got it all
		if ( bufPtr >= bufEnd ) { m_hits += 1; return; }
		// otherwise, advance to next page
		sp++;
		// and our page relative offset is zero now, iff ep > sp
		if ( sp <= ep ) start1 = 0;
		// if the memory page ended before the disk page, break out
		// because we don't want any holes
		readFromCache( &size, poff, OFF_SIZE, sizeof(int32_t));
		if ( skip + size < m_diskPageSize ) break;
	}
	// NOTE: a second pass that filled the buffer from the last page
	// backwards used to follow here but was disabled, so only the
	// front of a read is ever served from the cache.
	m_misses += 1;
}
// after you read/write from/to disk, copy into the page cache
// . copy "numBytes" bytes at "buf", which were just read from or written
//   to disk at "diskOffset" of file "vfd", into the page cache
// . the data is cut up at disk page boundaries and each piece is handed to
//   addPage()
// . QUICKPOLL(niceness) is called before each piece is added
void DiskPageCache::addPages ( int32_t vfd,
			       char *buf,
			       int32_t numBytes,
			       int64_t diskOffset ,
			       int32_t niceness ){
	// . if vfd is negative we were not able to add a map for this file
	// . and no NULL ptrs
	if ( vfd < 0 || ! buf ) return;
	// return if no pages allowed in page cache, or it is disabled
	if ( m_maxMem == 0 || ! m_enabled ) return;
	// disabled at the master controls?
	if ( m_switch && ! *m_switch ) return;
	// sometimes the file got unlinked on us
	if ( ! m_memOffFromDiskPage[vfd] ) return;
	// . "pageNum" is the DISK page # the next piece belongs to
	// . "m_diskPageSize" is the size of the disk pages
	int64_t pageNum = diskOffset / m_diskPageSize ;
	// . "pageOff" is how far into that page the piece starts
	// . only the first piece can start past a page boundary
	int32_t pageOff = diskOffset - pageNum * m_diskPageSize ;
	// the data not yet handed to addPage()
	char    *src  = buf;
	int32_t  left = numBytes;
	while ( left > 0 ) {
		// breathe
		QUICKPOLL(niceness);
		// . "piece" is how many bytes go into this page
		// . take what is left of the page, but not more than the
		//   data we have left
		int32_t piece = m_diskPageSize - pageOff;
		if ( piece > left ) piece = left;
		// . add the piece to memory
		// . "piece" is how many bytes to write into the memory page
		// . "pageOff" is where in the memory page to write them
		addPage ( vfd , pageNum , src , piece , pageOff );
		// advance over what we stored
		src  += piece;
		left -= piece;
		// every page after the first is written from its start
		pageNum++;
		pageOff = 0;
	}
}
// . convert our MEMORY offset into an actual ptr to a chunk of memory
// . this makes our memory pooling approach transparent
// . "off" is offset into the memory
// . "off" includes HEADERSIZE headers in it
// . translate memory offset "off" into a ptr into one of the page sets
// . returns NULL if "off" is negative, which means not in DiskPageCache
// . cores on purpose if "off" lies beyond the allocated page sets
char *DiskPageCache::getMemPtrFromMemOff ( int32_t off ) {
	// negative offsets are used to say "not cached"
	if ( off < 0 ) return NULL;
	// which page set is it in, and how far into that set?
	int32_t setNum = off / m_maxPageSetSize ;
	int32_t setOff = off - setNum * m_maxPageSetSize ;
	// . a page (header + data) is never split across two page sets
	// . if it would run past the end of this set then it is the first
	//   page of the next set
	if ( setOff + m_diskPageSize + HEADERSIZE > m_maxPageSetSize ) {
		setNum++;
		setOff = 0;
	}
	// sanity check, we should have allocated that set
	if ( setNum >= m_numPageSets ) { char *xx = NULL; *xx = 0; }
	// return the proper ptr
	return m_pageSet[setNum] + setOff;
}
// . "diskPageNum" is the disk page # of the file with "vfd"
// . "page" points to the disk data we read from disk
// . "size" is how many bytes to write into the memory page, #pageNum
// . "skip" is the offset into the memory page we will write the disk data into
void DiskPageCache::addPage(int32_t vfd,
int32_t diskPageNum,
char *pageData,
int32_t size,
int32_t skip){
// . cache "size" bytes of disk page # "diskPageNum" of file "vfd"
// . if the disk page already has a memory page we hand the data to
//   enhancePage() and are done
// . otherwise we take a memory page from, in this order:
//   1. the list of available (recycled) memory pages, m_availMemOff[]
//   2. the memory below m_upperMemOff that was never claimed
//   3. the tail of the linked list, which is evicted
//   in m_minimizeDiskSeeks mode we core on purpose rather than do 3
// . then we fill in the page's header and data, put it at the head of the
//   linked list and map the disk page to it
// . the order of the statements below matters: excisePage() changes
//   m_tailOff and the header is written before promotePage() is called
// . if pageNum is beyond the file size
// . see the explanation for this same error msg above
if ( diskPageNum >= m_maxPagesInFile[vfd] ) {
// this has happened during a merge before!! (at startup)
//log(LOG_LOGIC,"db: pagecache: addPage: Bad engineer. "
// happens because rdbdump did not get a high enough
// maxfilesize so we did not make enough pages! we endedup
// dumping more than what was end the tree because stuff was
// added to the tree while dumping!
log("db: pagecache: Caught add breach. "
"pageNum=%"INT32" max=%"INT32" db=%s",
diskPageNum,m_maxPagesInFile[vfd],m_dbname);
return;
}
// debug msg
// log("addPage: vfd=%"INT32" diskPageNum=%"INT32" pageData[0]=%hhx "
// "size=%"INT32" skip=%"INT32"",
// vfd,diskPageNum,pageData[0],size,(int32_t)skip);
// "poff" is the memory offset of the memory page that caches disk page
// # "diskPageNum" of "vfd" (virtual file descriptor). it is negative if
// the disk page is not in memory.
int32_t poff = m_memOffFromDiskPage [ vfd ] [ diskPageNum ] ;
// only used when we steal the tail page below
int32_t oldDiskPage;
// p will be NULL if page does not have any data in memory yet
//char *p = getMemPtrFromMemOff ( poff );
// if page already exists in cache and needs data on the boundaries
// we may be able to supply it
if ( poff >= 0 ) {
// debug msg
//log("ENHANCING off=%"INT32"",poff);
enhancePage ( poff , pageData , size , skip );
return;
}
// don't add any more if we're minimizing disk seeks and are full
if ( m_minimizeDiskSeeks &&
m_numPagesPresentOfFile[vfd] >= m_maxPagesPerFile[vfd] )
return;
// top:
// try to get an available memory spot from list
if ( m_numAvailMemOffs > 0 ) {
poff = m_availMemOff [ --m_numAvailMemOffs ] ;
// debug msg
//log("RECYCLING off=%"INT32" numAvailMemOffs-1=%"INT32""
// ,poff,m_numAvailMemOffs);
}
// can we grab a page from memory without having to grow?
else if ( m_nextMemOff + m_diskPageSize + HEADERSIZE < m_upperMemOff) {
poff = m_nextMemOff;
m_nextMemOff += m_diskPageSize + HEADERSIZE;
// debug msg
// log("CLAIMING off=%"INT32" (nextmemoff=%"INT32"",poff,
// m_nextMemOff);
}
// . we now grow everything at start
// . otherwise, try to grow the page cache by 200k
//else if ( m_nextMemOff + m_diskPageSize + HEADERSIZE < m_maxMem ) {
// // grow by 100k worth of pages each time
// if ( ! growCache ( m_upperMemOff + 200*1024 ) ) return;
// goto top;
//}
// this should never happen. Since in minimizeDiskSeek we have
// an exact number of pages per file
else if ( m_minimizeDiskSeeks ) {
char *xx = NULL; *xx = 0;
}
// if no freebies left, take over the tail page in memory
else {
// STEAL IT!!
poff = m_tailOff;
// remove it from linked list. it will be re-added below @ head
////
// CAUTION: THIS CHANGES m_tailOff!!!!!!
///
// it also sets the map entry of the disk page that the tail was
// caching to -1, which is what the sanity check below verifies
excisePage ( m_tailOff );
// . the file no longer owns him
// . this is a int32_t ptr to &m_bufOffs[vfd][pageNum]
// . if that vfd no longer exists it should have added all its
// pages to m_avail list
//int32_t tmp = -1;
// WHY DOING THIS?
//int32_t memOff = -1;//NULL;
//readFromCache(&memOff, poff, OFF_PTR, sizeof(int32_t));
// the tail may actualy belong to a separated file with
// a different vfd
// the header of the stolen page still describes its old owner
int oldVfd;
readFromCache (&oldVfd,poff,OFF_VFD,sizeof(int32_t));
readFromCache (&oldDiskPage,poff,OFF_DISKPAGENUM,
sizeof(int32_t));
// did excise work?
// this cored here from m_memOffFroMDiskPage[oldVfd] being
// NULL, so how could that happen?
// core on purpose if the old owner still maps to a memory page
if ( m_memOffFromDiskPage[oldVfd] &&
m_memOffFromDiskPage[oldVfd][oldDiskPage] != -1 ) {
char *xx=NULL;*xx=0; }
// did ex
// seg faultint here: mdw:
//*memOffPtr = -1;
// how can this be, we subverted a valid buffer
//if ( memOff == -1 ) { char *xx=NULL;*xx=0; }
//poff = memOff;
//m_cacheBuf.writeToCache(poff, OFF_PTR, &tmp, sizeof(int32_t));
// testing
//m_cacheBuf.readFromCache ( &tmp, poff+OFF_PTR, sizeof(int32_t) );
//if ( tmp != -1 ){
//char *xx=NULL; *xx=0;}
//**(int32_t **)(p+OFF_PTR) = -1;
// debug msg
//log("KICKINGTAIL off=%"INT32"",poff);
}
// sanity check
// we must have a memory page by now, core on purpose if not
if ( poff < 0 ) { char *xx = NULL; *xx = 0; }
// get ptr to the page in memory from the memory offset
//p = getMemPtrFromMemOff ( poff );
// store how many bytes we wrote into the memory page residing @ poff
writeToCache(poff, OFF_SIZE, &size, sizeof(int32_t));
// int32_t tmp = 0;
// m_cacheBuf.readFromCache ( &tmp, poff, OFF_SIZE, sizeof(int32_t) );
// if ( tmp != size ){
// char *xx=NULL; *xx=0;}
//*(int32_t *)(p+OFF_SIZE) = size;
// store "skip" which is the offset into the memory page we start
// storing the disk data into
writeToCache( poff, OFF_SKIP, &skip, sizeof(int32_t) );
//*(int32_t *)(p+OFF_SKIP) = skip;
// sanity check
// the data must not run past the end of the page, core on purpose
// if it does. note that the header was already written above.
if ( size + skip > m_diskPageSize ) { char *xx = NULL; *xx = 0; }
// then store a ptr to m_memOffFromDiskPage[vfd][pageNum] so we can set
// *ptr to -1 if they page gets replaced by another
// store the offset of this memory page
//int32_t *memOffPtr = &m_memOffFromDiskPage[ vfd ][ pageNum ];
// m_memOffFromDiskPage maps a vfd/pagenum to a memory page offset.
// -1 means none.
// why do we need to store the memory offset in the memory page???
//int32_t memOff = m_memOffFromDiskPage[ vfd ][ pageNum ];
//writeToCache( poff, OFF_PTR, &memOff, sizeof(int32_t));
//*(int32_t **)(p+OFF_PTR) = &m_memOffFromDiskPage [ vfd ] [ pageNum ];
// then the data from disk (skip over linked list info)
// "skip" is how far into the memory page we should write the
// disk data because it is not aligned perfectly with the mem page.
writeToCache( poff, HEADERSIZE + skip, pageData, size);
//gbmemcpy ( p + HEADERSIZE + skip , page , size );
// transform mem ptr to memory offset
//if ( !m_useRAMDisk && ! m_useSHM ) {
/*
int32_t off = -1;
char *p = getMemPtrFromMemOff ( poff );
for ( int32_t i = 0 ; i < m_numPageSets ; i++ ) {
if ( p < m_pageSet[i] ) continue;
if ( p > m_pageSet[i] + m_pageSetSize[i] )
continue;
off = p - m_pageSet[i] + i * m_maxPageSetSize ;
break;
}
*/
// gotta record this now too!
// excisePage() reads these two to find the map entry it has to reset
writeToCache( poff, OFF_DISKPAGENUM, &diskPageNum, sizeof(int32_t) );
writeToCache( poff, OFF_VFD, &vfd, sizeof(int32_t) );
// store the linked list information in the remaining header bytes
// that we use for promoting heaviliy hit pages to the top of
// thereby replacing the tail when adding new pages. this will
// insert our page into the linked list. it will set the prev/next
// mem page offsets in the header of this memory page.
promotePage ( poff , true/*isNew?*/ );
// update map. map disk page # to mem offset.
m_memOffFromDiskPage [ vfd ] [ diskPageNum ] = poff;
// sanity check
//if ( off != poff ) { char *xx=NULL; *xx=0; }
//}
//else
// m_memOffFromDiskPage [ vfd ] [ pageNum ] = poff;
// update the header of that page
// we have added the page!
if ( m_minimizeDiskSeeks )
m_numPagesPresentOfFile[vfd]++;
}
// . add data from "page" (we just read it from disk or wrote to disk)
// . "poff" is the memory page # that will receive the disk data
// . "page" points to the disk data we read from disk to be stored into mem pg
// . "size" is how many bytes to write into the memory page, #pageNum
// . "skip" is the offset into the memory page we will write the disk data into
// . grow the data cached in the memory page at memory offset "poff" using
//   "size" bytes at "page" that belong at offset "skip" into the page
// . the memory page caches one contiguous range, [OFF_SKIP value,
//   OFF_SKIP value + OFF_SIZE value). we only ever extend that range at
//   its front and/or its end, we never create a hole in it.
void DiskPageCache::enhancePage (int32_t poff, char *page, int32_t size,
				 int32_t skip) {
	// what does the memory page hold right now?
	int32_t haveSize = 0;
	readFromCache( &haveSize, poff, OFF_SIZE, sizeof(int32_t));
	int32_t haveSkip = 0;
	readFromCache( &haveSkip, poff, OFF_SKIP, sizeof(int32_t));
	// does the new data start before what we have? then we can add to
	// the front of the page
	if ( skip < haveSkip ) {
		// # of bytes missing in front of what we have
		int32_t front = haveSkip - skip;
		// . do not allow gap in between cached data, that is, the
		//   new data must reach what we already have
		// . this can happen when dumping a file, if your first read
		//   goes up to the file end (somewhere in the middle of the
		//   page) and your second read starts somewhere else
		bool gap = ( skip + size < haveSkip ) || ( front > size );
		if ( gap ) {
			log("db: Avoided cache gap in %s. diff=%"INT32" "
			    "size=%"INT32" pskip=%"INT32" skip=%"INT32".",
			    m_dbname,front,size,(int32_t)haveSkip,
			    (int32_t)skip);
			return;
		}
		// store just the bytes we were missing
		writeToCache(poff, HEADERSIZE + skip , page , front);
		// the cached range now starts where the new data starts
		haveSize += front;
		haveSkip  = skip;
		writeToCache(poff, OFF_SIZE, &haveSize, sizeof(int32_t));
		writeToCache(poff, OFF_SKIP, &haveSkip, sizeof(int32_t));
	}
	// can we add to end of page?
	int32_t haveEnd = haveSkip + haveSize;
	int32_t newEnd  = skip + size;
	// not if the new data ends at or before the end of what we have
	if ( newEnd <= haveEnd ) return;
	// # of bytes missing after what we have
	int32_t back = newEnd - haveEnd ;
	// if the read's starting point is beyond our ending point, bail,
	// we don't want any holes...
	if ( back > size ) return;
	// the bytes we are missing are the last "back" bytes of "page"
	writeToCache(poff, HEADERSIZE + haveEnd, page + size - back, back);
	// and the page holds that many more bytes now
	int32_t grown = haveSize + back;
	writeToCache(poff, OFF_SIZE, &grown, sizeof(int32_t));
}
// the link information is bytes 8-16 of each page in mem (next/prev mem ptrs)
// . make the memory page at memory offset "poff" the head of the linked
//   list, so it will be the last one to get stolen as the tail
// . "isNew" is true if the page is not in the linked list yet
// . a page that is already in the list is unlinked first. we do that here
//   and do NOT call excisePage() for it, because excisePage() also sets the
//   page's entry in m_memOffFromDiskPage[][] to -1. promoting a page must
//   leave it mapped, otherwise every cache hit in getPages() would make
//   the page unfindable until it is read from disk and added again.
void DiskPageCache::promotePage ( int32_t poff , bool isNew ) {
	if ( ! isNew ) {
		// nothing to do if we are the head already
		if ( poff == m_headOff ) return;
		// get our neighbors, -1 if none
		int32_t prev = -1;
		readFromCache(&prev, poff, OFF_PREV, sizeof(int32_t));
		int32_t next = -1;
		readFromCache(&next, poff, OFF_NEXT, sizeof(int32_t));
		// if we were the tail then pass that off to our neighbor
		if ( poff == m_tailOff ) m_tailOff = prev;
		// our prev's next becomes our old next
		if ( prev >= 0 )
			writeToCache(prev, OFF_NEXT, &next, sizeof(int32_t));
		// our next's prev becomes our old prev
		if ( next >= 0 )
			writeToCache(next, OFF_PREV, &prev, sizeof(int32_t));
	}
	// store a -1 to indicate previous page offset.
	// we are the head of the linked list now, so -1 means none.
	int32_t tmp = -1;
	writeToCache(poff, OFF_PREV, &tmp, sizeof(int32_t));
	// testing: read it back and core on purpose if it did not stick
	readFromCache ( &tmp, poff, OFF_PREV, sizeof(int32_t) );
	if ( tmp != -1 ) { char *xx=NULL; *xx=0; }
	// store the next page in the linked list who WAS the head
	// it could be -1 if we are the first entry into the linked list
	writeToCache(poff, OFF_NEXT, &m_headOff, sizeof(int32_t));
	// the old head's prev is us
	if ( m_headOff >= 0 )
		writeToCache(m_headOff, OFF_PREV, &poff, sizeof(int32_t));
	// and we're the new head
	m_headOff = poff;
	// if no tail, we become that, too, we must be the first
	if ( m_tailOff < 0 ) m_tailOff = poff;
}
// remove a page from the linked list
// . take the memory page at memory offset "poff" out of the linked list
// . also un-maps it: the entry in m_memOffFromDiskPage[][] of the disk page
//   named in the memory page's header is set to -1
void DiskPageCache::excisePage ( int32_t poff ) {
	// who comes before and after us in the list? -1 if nobody
	int32_t before = 0;
	readFromCache(&before, poff, OFF_PREV, sizeof(int32_t));
	int32_t after = 0;
	readFromCache(&after, poff, OFF_NEXT, sizeof(int32_t));
	// if we were the head or tail, then pass it off to our neighbor
	if ( m_headOff == poff ) m_headOff = after;
	if ( m_tailOff == poff ) m_tailOff = before;
	// hook our two neighbors up to each other
	if ( before >= 0 )
		writeToCache(before, OFF_NEXT, &after, sizeof(int32_t));
	if ( after >= 0 )
		writeToCache(after, OFF_PREV, &before, sizeof(int32_t));
	// which disk page of which file is this memory page caching?
	int32_t ownerPage;
	readFromCache ( &ownerPage,poff,OFF_DISKPAGENUM,sizeof(int32_t) );
	int ownerVfd;
	readFromCache ( &ownerVfd,poff,OFF_VFD,sizeof(int32_t) );
	// nothing to un-map if that file has no map
	if ( ! m_memOffFromDiskPage [ ownerVfd ] ) return;
	// the memory page we are commandeering should no longer be
	// mapped to from its disk page
	m_memOffFromDiskPage [ ownerVfd ] [ ownerPage ] = -1;
}
// . reserve a vfd slot and allocate m_memOffFromDiskPage[vfd], the array
//   that maps each disk page of the file to a memory offset (-1 = not cached)
// . "maxFileSize" is so we can alloc m_memOffFromDiskPage[vfd] big enough
//   for all pgs
// . "vfdAllowed" must be true to get a vfd when m_minimizeDiskSeeks is set
// . returns the vfd on success
// . returns -1 on error. g_errno is set to EBADENGINEER when every vfd slot
//   is taken.
// . the map is released again by rmVfd()
int32_t DiskPageCache::getVfd ( int64_t maxFileSize, bool vfdAllowed ) {
	// check for override function
	//if ( m_isOverriden ) {
	//	return m_getVfd2 ( this, maxFileSize );
	//}
	// for RAMDisks, do not cache disk
	// pages from the indexdb root file, nor, any indexdb file that is
	// larger than twice the "maxMemForRamDisk" value
	/*
	if ( m_useRAMDisk && maxFileSize > (m_maxMem * 2) ){
		log (LOG_INFO,"db: getvfd: cannot cache on RAMDisk files that "
		     "larger than twice the max mem value. fileSize=%"INT32"",
		     m_maxMem);
		return -1;
	}
	*/
	// one map entry per disk page, plus one for a trailing partial page
	int32_t numPages = (maxFileSize / m_diskPageSize) + 1;
	// RESTRICT to only the first m_maxMemOff worth of files,
	// starting with the SMALLEST file first. so if maxMemoff is 50MB, and
	// we have 5 files that are 10,20,30 & 40MB,
	// then we use 10MB for the first file, 20MB of the 2nd BUT only
	// 20MB for the 3rd file, and the 4th file does not get any page cache.
	// if doing "biased lookups" each file is virtually half the actual
	// size, and this allocates page cache appropriately.
	// don't do a page cache for an indexdb0001.dat that is 100GB
	// because we'd have to allocate too much mem for the
	// m_memOffFromDiskPage[] array
	// so for the partial file make sure it's less than 1 GB
	if ( m_minimizeDiskSeeks && !vfdAllowed ){
		log (LOG_INFO,"db: getVfd: cannot cache because minimizing "
		     "disk seeks. numPages=%"INT32"", numPages);
		return -1;
	}
	// . pick a vfd for this BigFile to use
	// . start AFTER last pick in case BigFile closed, released its
	//   m_vfd, a read thread returned and called addPages() using that
	//   old m_vfd!!!!!!! TODO: can we fix this better?
	// . probe each slot exactly once, wrapping around the end of the
	//   table. "vfd" stays -1 unless a free slot is actually found, so
	//   a full table can not be mistaken for success and a hit on the
	//   very last probe can not be mistaken for failure.
	int32_t vfd = -1;
	int32_t slot = m_nexti;
	for ( int32_t probes = 0 ; probes < MAX_NUM_VFDS2 ; probes++, slot++ ) {
		if ( slot >= MAX_NUM_VFDS2 ) slot = 0; // wrap
		// a slot with a map is in use
		if ( m_memOffFromDiskPage [ slot ] ) continue;
		vfd = slot;
		break;
	}
	// bail if none left
	if ( vfd < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,"db: pagecache: getvfd: no vfds remaining.");
		//char *xx = NULL; *xx = 0;
		return -1;
	}
	// . file size has to be below 2 gigs because m_memOffFromDiskPage is
	//   only a int32_t
	// . if we need to we could transform m_memOffFromDiskPage into
	//   m_memPageNum
	//if ( maxFileSize > 0x7fffffffLL ) {
	//	g_errno = EBADENGINEER;
	//	log("DiskPageCache::getVfd: maxFileSize too big");
	//	return -1;
	//}
	// start here next time. may equal MAX_NUM_VFDS2, the search above
	// wraps it.
	m_nexti = vfd + 1;
	// alloc the map space for this file
	int32_t need = numPages * sizeof(int32_t) ;
	int32_t *buf = (int32_t *)mmalloc ( need , m_memTag );
	if ( ! buf ) {
		log("db: Failed to allocate %"INT32" bytes for page cache "
		    "structures for caching pages for vfd %"INT32". "
		    "MaxfileSize=%"INT64". Not enough memory.",
		    need,vfd,maxFileSize);
		return -1;
	}
	m_memOffFromDiskPage [ vfd ] = buf;
	m_maxPagesInFile     [ vfd ] = numPages;
	// keep a tab on the number of pages we can store of the file
	if ( m_minimizeDiskSeeks ){
		m_numPagesPresentOfFile[vfd] = 0;
		// the whole file if it fits in what is left, otherwise as
		// many pages (header + data) as the free memory holds
		if ( m_memFree > numPages * ( HEADERSIZE + m_diskPageSize ) )
			m_maxPagesPerFile[vfd] = numPages;
		else
			m_maxPagesPerFile[vfd] = m_memFree / ( m_diskPageSize +
							       HEADERSIZE );
	}
	// add it in
	m_memAlloced += need;
	// debug msg
	//log("%s adding %"INT32"",m_dbname,need);
	// no pages are in memory yet, so set offsets to -1
	for ( int32_t pg = 0 ; pg < numPages ; pg++ )
		m_memOffFromDiskPage [ vfd ] [ pg ] = -1;
	// if minimizing disk seeks then calculate the memory used
	if ( m_minimizeDiskSeeks ){
		m_memFree -= maxFileSize;
		// if the file is bigger than the mem only partially store it
		if ( m_memFree < 0 )
			m_memFree = 0;
	}
	// debug msg
	//log("ALLOCINGFILE pages=%"INT32"",numPages);
	return vfd;
}
// when a file loses its vfd this is called
void DiskPageCache::rmVfd ( int32_t vfd ) {
// check for override function
//if ( m_isOverriden ) {
// m_rmVfd2 ( this, vfd );
// return;
//}
// ensure validity
if ( vfd < 0 ) return;
// if 0 bytes are allocated for disk cache, just skip this junk
if ( m_maxMem <= 0 ) return;
// this vfd may have already been nuked by call to unlink!
if ( ! m_memOffFromDiskPage [ vfd ] ) return;
// add valid offsets used by vfd into m_availMemOff
for ( int32_t i = 0 ; i < m_maxPagesInFile [ vfd ] ; i++ ) {
int32_t off = m_memOffFromDiskPage [ vfd ] [ i ];
// a -1 offset means empty
if ( off < 0 ) continue;
// sanity check
if ( m_numAvailMemOffs >= m_maxAvailMemOffs ) {
char *xx = NULL; *xx = 0; }
// debug msg
//log("MAKING off=%"INT32" available. na=%"INT32"",
// off,m_numAvailMemOffs+1);
// store it in list of available memory offsets so some other
// file can use it
m_availMemOff [ m_numAvailMemOffs++ ] = off;
//log("disk: m_numAvailMemOffs+1 -> %"INT32,m_numAvailMemOffs);
// set this to -1 i guess. it'll be freed below anyway.
m_memOffFromDiskPage [ vfd ] [i] = -1;
// remove that page from linked list, too
//char *p = getMemPtrFromMemOff ( off );
excisePage ( off );
}
// free the map that maps this files pages on disk to pages/offs in mem
int32_t size = m_maxPagesInFile[vfd] * sizeof(int32_t);
mfree ( m_memOffFromDiskPage [ vfd ] , size , "DiskPageCache" );
m_memOffFromDiskPage [ vfd ] = NULL;
// debug msg
//log("%s rmVfd: vfd=%"INT32" down %"INT32"",m_dbname,vfd,size);
m_memAlloced -= size;
if ( m_minimizeDiskSeeks ){
m_memFree += m_maxPagesPerFile[vfd] * m_diskPageSize;
m_maxPagesPerFile[vfd] = 0;
m_numPagesPresentOfFile[vfd] = 0;
}
}
// . use "mem" bytes of memory for the cache
// . "mem" is capped at m_maxMem. returns true without doing anything if
//   that would not raise m_upperMemOff.
// . grows m_availMemOff to one slot per page, then allocates page sets of
//   at most m_maxPageSetSize bytes each until "mem" bytes are covered
// . returns true on success. on failure returns whatever log() returns
//   (presumably false -- confirm against log()'s declaration)
bool DiskPageCache::growCache ( int32_t mem ) {
	// debug msg
	//log("GROWING PAGE CACHE from %"INT32" to %"INT32" bytes (%"XINT64")"
	//    ,m_upperMemOff, mem ,(uint64_t)this);
	// don't exceed the max
	if ( mem > m_maxMem ) mem = m_maxMem;
	// bail if we wouldn't be growing
	if ( mem <= m_upperMemOff ) return true;
	// how many pages? round up.
	int32_t npages = mem/(m_diskPageSize+HEADERSIZE) + 1;
	// . we need one "available" slot for each page in the cache
	// . this is a list of memory offsets that are available
	int32_t oldSize = m_maxAvailMemOffs * sizeof(int32_t) ;
	int32_t newSize = npages            * sizeof(int32_t) ;
	int32_t *a=(int32_t *)mrealloc(m_availMemOff,oldSize,newSize,m_memTag);
	if ( ! a ) return log("db: Failed to regrow page cache from %"INT32" to "
			      "%"INT32" bytes. Not enough memory.",oldSize,newSize);
	m_availMemOff     = a;
	m_maxAvailMemOffs = npages;
	m_memAlloced     += (newSize - oldSize);
	// debug msg
	//log("%s growCache: up %"INT32"",m_dbname,(newSize - oldSize));
	// how much more mem do we need to alloc?
	int32_t need = mem - m_upperMemOff ;
	// how big is our last page set?
	int32_t size = 0;
	char   *ptr  = NULL;
	int32_t i    = 0;
	if ( m_numPageSets > 0 ) {
		// since we allocate everything at init this shouldn't happen
		char *xx=NULL; *xx=0;
		i    = m_numPageSets - 1;
		ptr  = m_pageSet     [ i ];
		size = m_pageSetSize [ i ];
	}
	// realloc him, topping the last page set up to m_maxPageSetSize
	int32_t extra = m_maxPageSetSize - size ;
	if ( extra > need ) extra = need;
	/*
	if ( m_useRAMDisk ){
		// since RAMdisk it creates a file, no reason to alloc
		m_memAlloced = need;
		m_upperMemOff = need;
		return true;
	}
	// and shared mem already has the mem at this point
	if ( m_useSHM ) {
		m_memAlloced = need;
		m_upperMemOff = need;
		return true;
	}
	*/
	char *s = (char *)mrealloc ( ptr , size , size + extra,
				     m_memTag);
	if ( ! s ) return log("db: Failed to allocate %"INT32" bytes more "
			      "for pagecache.",extra);
	m_pageSet     [ i ] = s;
	m_pageSetSize [ i ] = size + extra;
	// if we are not adding to an existing, we are a new page set
	if ( ! ptr ) m_numPageSets++;
	// discount it
	need -= extra;
	// add to alloc count
	m_memAlloced  += extra;
	m_upperMemOff += extra;
	// debug msg
	//log("%s growCache2: up %"INT32"",m_dbname,extra);
	// if we do not need more, we are done
	if ( need == 0 ) return true;
	// otherwise, alloc new page sets until we hit it
	for ( i++ ; i < MAX_PAGE_SETS && need > 0 ; i++ ) {
		int32_t chunk = need;
		if ( chunk > m_maxPageSetSize ) chunk = m_maxPageSetSize;
		m_pageSet[i] = (char *) mmalloc ( chunk , m_memTag );
		// only count memory we actually got. discounting "need"
		// before the alloc would let a failure on the final page
		// set leave need at 0 and be reported as success.
		if ( ! m_pageSet[i] )
			return log("db: Failed to allocate %"INT32" bytes more "
				   "for pagecache.",chunk);
		need -= chunk;
		m_pageSetSize[i] = chunk;
		m_memAlloced  += chunk;
		m_upperMemOff += chunk;
		m_numPageSets++;
		// debug msg
		//log("%s growCache3: up %"INT32"",m_dbname,chunk);
	}
	// all covered?
	if ( need == 0 ) return true;
	// we ran out of page set slots before covering "mem"
	return log(LOG_LOGIC,"db: pagecache: Bad engineer. Weird problem.");
}
// . m_nextMemOff less one full page (header + disk data) for every offset
//   sitting on the available list
// . NOTE(review): presumably m_nextMemOff is the high-water mark of handed
//   out memory offsets -- confirm where it is advanced
int32_t DiskPageCache::getMemUsed ( ) {
	return m_nextMemOff - ( HEADERSIZE + m_diskPageSize ) * m_numAvailMemOffs;
}
// . sanity check the page headers against the disk-page map
// . NOTE: the "vfd" argument is not consulted, vfds 0 through 9 are always
//   scanned
// . cores on the first mismatch, otherwise returns true
bool DiskPageCache::verifyData2 ( int32_t vfd ) {
	for ( int32_t v = 0 ; v < 10 ; v++ ) {
		// this vfd may have already been nuked by call to unlink!
		int32_t *map = m_memOffFromDiskPage [ v ];
		if ( ! map ) continue;
		// walk every disk page of the file
		for ( int32_t pg = 0 ; pg < m_maxPagesInFile [ v ] ; pg++ ) {
			int32_t memOff = map [ pg ];
			// negative means the disk page is not cached
			if ( memOff < 0 ) continue;
			// the header must name the vfd that maps to it
			int32_t hdrVfd;
			readFromCache ( &hdrVfd , memOff , OFF_VFD ,
					sizeof(int32_t) );
			if ( hdrVfd != v ) { char *xx=NULL;*xx=0; }
			// and the disk page number that maps to it
			int32_t hdrPageNum;
			readFromCache ( &hdrPageNum , memOff , OFF_DISKPAGENUM ,
					sizeof(int32_t) );
			if ( hdrPageNum != pg ) { char *xx=NULL;*xx=0; }
		}
	}
	return true;
}
#include "BigFile.h"
#include "Threads.h"
// . compare every cached page of file "f" against what is on disk
// . threads and this cache are disabled while the file is re-read
// . cores on any mismatch or failed read, otherwise returns true
// . logs one line per cached page
bool DiskPageCache::verifyData ( BigFile *f ) {
	int32_t vfd = f->getVfd();
	// nothing to check without a valid vfd, or if the vfd's map has
	// already been nuked by a call to unlink
	if ( vfd < 0 || ! m_memOffFromDiskPage [ vfd ] ) return true;
	// debug msg
	//log("VERIFYING PAGECACHE vfd=%"INT32" fn=%s",vfd,f->getFilename());
	// scratch space for the copy read from disk and the cached copy
	char diskBuf  [ 32 * 1024 ];
	char cacheBuf [ 32 * 1024 ];
	// turn threads off for the duration if they are currently on
	bool threadsWereOn = ! g_threads.areThreadsDisabled();
	if ( threadsWereOn ) g_threads.disableThreads();
	// disable ourselves
	disableCache();
	for ( int32_t pg = 0 ; pg < m_maxPagesInFile [ vfd ] ; pg++ ) {
		int32_t memOff = m_memOffFromDiskPage [ vfd ] [ pg ];
		// if page not in use, skip it
		if ( memOff < 0 ) continue;
		// the header must agree with the map on the disk page number
		int32_t hdrPageNum;
		readFromCache ( &hdrPageNum , memOff , OFF_DISKPAGENUM ,
				sizeof(int32_t) );
		if ( hdrPageNum != pg ) { char *xx=NULL;*xx=0; }
		// and on the vfd
		int32_t hdrVfd;
		readFromCache ( &hdrVfd , memOff , OFF_VFD , sizeof(int32_t) );
		if ( hdrVfd != vfd ) { char *xx=NULL;*xx=0; }
		// how many bytes are cached and where in the page they start
		int32_t size = 0;
		int32_t skip = 0;
		readFromCache ( &size , memOff , OFF_SIZE , sizeof(int32_t) );
		readFromCache ( &skip , memOff , OFF_SKIP , sizeof(int32_t) );
		// must fit the scratch buffers
		if ( size > 32 * 1024 ){
			char *xx=NULL; *xx=0; }
		// fetch the same byte range from the file itself
		int64_t diskOff = ((int64_t)pg * (int64_t)m_diskPageSize) +
			(int64_t)skip;
		FileState fstate;
		bool done = f->read ( diskBuf ,
				      size    ,
				      diskOff ,
				      &fstate ,
				      NULL    , // state
				      NULL    , // callback
				      0       );// niceness
		// core if it did not complete
		if ( ! done ) { char *xx = NULL; *xx = 0; }
		// compare to what we have in mem
		log("checking vfd=%"INT32" "
		    "diskpage # %"INT32" size=%"INT32" skip=%"INT32""
		    , (int32_t)vfd , pg, size, skip);
		readFromCache ( cacheBuf , memOff , HEADERSIZE + skip , size );
		if ( memcmp ( diskBuf , cacheBuf , size ) != 0 ){
			char *xx = NULL; *xx = 0; }
	}
	// put things back the way they were
	if ( threadsWereOn ) g_threads.enableThreads();
	enableCache();
	// debug msg
	log("DONE VERIFYING PAGECACHE");
	return true;
}
// . copy "size" bytes from "inBuf" into the cache
// . "memOff" selects the memory page via getMemPtrFromMemOff() and
//   "memPageOff" is the byte offset within that page to write at
// . no bounds checking is done here
void DiskPageCache::writeToCache( int32_t memOff,
				  int32_t memPageOff ,
				  void *inBuf,
				  int32_t size ){
	// legacy shared memory and RAMDisk paths, disabled. they still use
	// the old parameter names bigOff/smallOff.
	/*
#ifdef GBUSESHM
	if ( m_useSHM ) {
		// what page are we on?
		int32_t page = ( bigOff + smallOff ) / m_maxAllocSize;
		// offset within that page
		int32_t poff = ( bigOff + smallOff ) % m_maxAllocSize;
		// sanity check
		if ( page >= m_numShmids ) { char *xx=NULL; *xx=0; }
		// sanity check
		if ( poff + size > m_shmidSize[page] ) { char *xx=NULL;*xx=0; }
		// get first byte
		int shmid = m_shmids[page];
		// assume we already have it loaded in
		char *mem = s_mem;
		// . is this the page we currently have loaded?
		// . th shmdt and shmat() seems to take about 12 microseconds
		//   on avg to execute. so about 100 times per milliseconds.
		// . seems like the writeToCache() is 3x slower than the
		//   readFromCache() perhaps because the dirty pages are
		//   COPIED back into system mem?
		if ( shmid != s_shmid ) {
			// time it
			//int64_t start = gettimeofdayInMicroseconds();
			// free current i guess
			if ( s_mem && shmdt ( s_mem ) == -1 ) {
				log("disk: shmdt: %s",mstrerror(errno));
				char *xx=NULL;*xx=0;
			}
			// load it in if not
			mem = (char *) shmat ( shmid , NULL, SHM_R|SHM_W );
			// if this happens at startup, try calling shmat
			// when we init this page cache above...
			if ( mem == (char *)-1 ) {
				log("disk: shmat: %s",mstrerror(errno));
				char *xx=NULL;*xx=0;
			}
			// store it
			s_mem   = mem;
			s_shmid = shmid;
			// time it
			//int64_t took = gettimeofdayInMicroseconds() -start;
			//if ( took > 1 )
			//	logf(LOG_DEBUG,"disk: took %"INT64" us to write "
			//	     "to shm page cache shmid=%"INT32".",took,
			//	     (int32_t)shmid);
		}
		// store it into the cache
		gbmemcpy ( mem + poff , inBuf , size );
		return;
	}
#endif
	if ( m_useRAMDisk ){
		int32_t numBytesWritten = pwrite( m_ramfd, inBuf, size,
					       bigOff + smallOff );
		if ( numBytesWritten != size ){
			char *xx=NULL; *xx=0;
		}
		return;
	}
	*/
	// the old fashioned way: find where the write lands inside the
	// memory page and copy the caller's bytes there
	char *dst = getMemPtrFromMemOff ( memOff ) + memPageOff;
	gbmemcpy ( dst , inBuf , size );
}
// . copy "bytesToCopy" bytes of cached data into "outBuf"
// . "memOff" selects the memory page via getMemPtrFromMemOff() and
//   "pageOffset" is the byte offset within that page to read from
// . no bounds checking is done here
void DiskPageCache::readFromCache( void *outBuf,
				   int32_t memOff,
				   int32_t pageOffset,
				   int32_t bytesToCopy ) {
	// legacy shared memory and RAMDisk paths, disabled. they still use
	// the old parameter names bigOff/smallOff/size.
	/*
#ifdef GBUSESHM
	if ( m_useSHM ) {
		// what page are we on?
		int32_t page = ( bigOff + smallOff ) / m_maxAllocSize;
		// offset within that page
		int32_t poff = ( bigOff + smallOff ) % m_maxAllocSize;
		// sanity check
		if ( page >= m_numShmids ) { char *xx=NULL; *xx=0; }
		// sanity check
		if ( poff + size > m_shmidSize[page] ) { char *xx=NULL;*xx=0; }
		// get first byte
		int shmid = m_shmids[page];
		// assume we already have it loaded in
		char *mem = s_mem;
		// . is this the page we currently have loaded?
		// . the shmdt() and shmat() seems to take about 2 MICROSECONDS
		//   on avg to execute here. about 3x faster than the
		//   writeToCache() above.
		if ( shmid != s_shmid ) {
			// time it
			//int64_t start = gettimeofdayInMilliseconds();
			// free current first so shmat has some room?
			if ( s_mem && shmdt ( s_mem ) == -1 ) {
				log("disk: shmdt: %s",mstrerror(errno));
				char *xx=NULL;*xx=0;
			}
			// load it in if not
			mem = (char *) shmat ( shmid , NULL, SHM_R|SHM_W );
			// if this happens at startup, try calling shmat
			// when we init this page cache above...
			if ( mem == (char *)-1 ) {
				log("disk: shmat: %s",mstrerror(errno));
				char *xx=NULL;*xx=0;
			}
			// store it
			s_mem   = mem;
			s_shmid = shmid;
			// time it
			//int64_t took = gettimeofdayInMilliseconds() -start;
			//if ( took > 1 )
			//	logf(LOG_DEBUG,"disk: took %"INT64" ms to read "
			//	     "to shm page cache shmid=%"INT32".",took,
			//	     (int32_t)shmid);
		}
		// store it in outBuf
		gbmemcpy ( outBuf , mem + poff , size );
		return;
	}
#endif
	if ( m_useRAMDisk ) {
		int32_t numBytesRead = pread( m_ramfd, outBuf, size,
					   bigOff + smallOff );
		if ( numBytesRead != size ){
			char *xx=NULL; *xx=0;
		}
		return;
	}
	*/
	// the old fashioned way: find where the read starts inside the
	// memory page and copy from there into the caller's buffer
	char *src = getMemPtrFromMemOff ( memOff ) + pageOffset;
	gbmemcpy ( outBuf , src , bytesToCopy );
}
// lastly, we need some way to "force" a merge at around midnight when traffic
// is minimal, or when there are 3 or more indexdb files that are less than
// 80% in the indexdb disk page cache. because that means we are starting to
// do a lot of disk seeks.
// checks if indexdb needs merge
/*
bool DiskPageCache::needsMerge( ){
if ( !m_useRAMDisk ) return false;
int32_t numVfds = 0;
for ( int32_t i = 0; i < MAX_NUM_VFDS2; i++ ){
if ( !m_memOffFromDiskPage[i] ) continue;
// check to see if a file is less than 80% in the indexdb
// disk page cache
int32_t numOffsUsed = 0;
for ( int32_t j = 0; j < m_maxPagesInFile[i]; j++ ){
if ( m_memOffFromDiskPage[i][j] >= 0 )
numOffsUsed++;
}
if ( (numOffsUsed * 100)/m_maxPagesInFile[i] < 80 )
numVfds++;
}
if ( numVfds >= 3 )
return true;
return false;
}
*/
// . 'ipcs -m' will show shared mem in linux
// . currently does nothing: the shared memory removal below is commented
//   out along with the rest of the GBUSESHM support
void freeAllSharedMem ( int32_t max ) {
	// "max" is only read by the disabled code below
	(void)max;
	// free shared mem whose pid no longer exists
	//struct shmid_ds buf;
	//shmctl ( 0 , SHM_STAT , &buf );
	//int shmctl(int shmid, int cmd, struct shmid_ds *buf);
	/*
#ifdef GBUSESHM
	// types.h uses key_t type that shmget uses
	// try to nuke it all
	for ( int32_t i = 0 ; i < max ; i++ ) {
		int shmid = i;
		int32_t status = shmctl ( shmid , IPC_RMID , NULL);
		if ( status == -1 ) {
			//if ( errno != EINVAL )
			//	log("db: shctlt %"INT32": %s",(int32_t)shmid,mstrerror(errno));
		}
		else
			log("db: Removed shmid %"INT32"",i);
	}
#endif
	*/
}
// types.h uses key_t type that shmget uses
#undef key_t