open-source-search-engine/DiskPageCache.cpp

#undef _XOPEN_SOURCE // needed for pread and pwrite
#define _XOPEN_SOURCE 500

#include "gb-include.h"

#include "DiskPageCache.h"
#include "RdbMap.h"    // GB_PAGE_SIZE
#include "Indexdb.h"

// types.h uses key_t type that shmget uses
#undef key_t

#ifdef GBUSESHM
#include <sys/ipc.h>  // shmget()
#include <sys/shm.h>  // shmget()
#endif

#define OFF_SIZE 0
#define OFF_SKIP 4
#define OFF_PREV 8
#define OFF_NEXT 12
#define OFF_PTR  16

#define oldshort long

// Builds an empty cache that owns nothing until init() is called.
// reset() doubles as the teardown routine, so every member it inspects is
// given a definite value here BEFORE it runs; otherwise it would act on
// indeterminate data (e.g. walk m_shmids[] when built with GBUSESHM).
DiskPageCache::DiskPageCache () {
	// no page sets and no free-offset list have been allocated yet
	m_numPageSets     = 0;
	m_availMemOff     = NULL;
	m_maxAvailMemOffs = 0;
	m_numAvailMemOffs = 0;
	// no shared memory segments are held
	m_useSHM    = false;
	m_numShmids = 0;
	// sometimes db may pass an uninitialized DiskPageCache to a BigFile
	// so make sure when BigFile::close calls DiskPageCache::rmVfd() our
	// m_memOff vector is all NULLed out, otherwise it will core
	for ( long i = 0 ; i < MAX_NUM_VFDS2 ; i++ )
		m_memOff[i] = NULL;
	// set every remaining member to its "not initialized" default
	reset();
}

// Tear-down simply delegates to reset(), which releases the page sets,
// the per-file maps and the free-offset list.
DiskPageCache::~DiskPageCache() {
	this->reset();
}

#ifdef GBUSESHM
static char *s_mem = NULL;
static int   s_shmid = -1;
#endif

// . frees everything the cache owns (page sets, per-file page maps, the
//   list of recyclable page offsets and, with GBUSESHM, the shared memory
//   segments) and puts all bookkeeping back to the "init() not called" state
// . called by the constructor, the destructor and at the top of init()
void DiskPageCache::reset() {
	if ( m_numPageSets > 0 )
		log("db: resetting page cache for %s",m_dbname);
	// release the chunks of memory that hold the cached pages
	for ( long i = 0 ; i < m_numPageSets ; i++ ) {
		mfree ( m_pageSet[i], m_pageSetSize[i], "DiskPageCache");
		m_pageSet    [i] = NULL;
		m_pageSetSize[i] = 0;
	}
	// free all the m_memOff[] arrays, i.e. the maps that map each file's
	// pages on disk to pages/offs in mem
	for ( long i = 0 ; i < MAX_NUM_VFDS2 ; i++ ) {
		if ( ! m_memOff [ i ] ) continue;
		long size = m_maxPagesInFile[i] * sizeof(long);
		mfree ( m_memOff [ i ] , size , "DiskPageCache" );
		m_memOff [ i ] = NULL;
	}
	// and the list of memory offsets that are available for reuse
	if ( m_availMemOff ) {
		long size = m_maxAvailMemOffs * sizeof(long);
		mfree ( m_availMemOff , size , "DiskPageCache" );
	}
	// the list is gone so its capacity must read as zero. growCache()
	// uses m_maxAvailMemOffs to compute the old size of the list, and a
	// stale (or never initialized) value would not match the NULL ptr.
	m_maxAvailMemOffs = 0;
#ifdef GBUSESHM
	// free current one, if exists
	if ( s_shmid >= 0 && s_mem ) {
		if ( shmdt ( s_mem ) == -1 )
			log("disk: shmdt: reset: %s",mstrerror(errno));
		s_mem   = NULL;
		s_shmid = -1;
	}
	// mark shared mem for destruction
	for ( long i = 0 ; m_useSHM && i < m_numShmids ; i++ ) {
		int shmid = m_shmids[i];
		if ( shmctl ( shmid , IPC_RMID , NULL) == -1 )
			log("db: shmctlt shmid=%li: %s",
			    (long)shmid,mstrerror(errno));
		else
			log("db: shmctl freed shmid=%li",(long)shmid);
	}
	// those ids are no longer ours, never hand them to shmctl() again
	m_numShmids = 0;
#endif
	// back to the defaults of a cache that holds nothing
	m_numPageSets     = 0;
	m_nextMemOff      = 0;
	m_upperMemOff     = 0;
	m_maxMemOff       = 0;
	m_memAlloced      = 0;
	m_availMemOff     = NULL;
	m_numAvailMemOffs = 0;
	// -1 means the linked list of cached pages is empty
	m_headOff         = -1;
	m_tailOff         = -1;
	m_enabled         = true;
	m_nexti           = 0;
	// NOTE(review): m_ramfd is dropped here without close(); it cannot be
	// closed safely in this function because the constructor calls
	// reset() before m_ramfd has been given a value.
	m_ramfd = -1;
	m_useRAMDisk = false;
	m_useSHM = false;
}

// . (re)initializes the page cache for one rdb
// . "dbname" is used for the malloc tag and for log messages; a NULL name
//   is treated as an empty one
// . "rdbId" selects which g_conf.m_useDiskPageCache* master switch, if any,
//   can disable this cache at run time
// . "maxMem" is the number of bytes to use for caching (0 disables caching)
// . "pageSize" is the size of a disk page; each cached page additionally
//   carries a HEADERSIZE byte header
// . "useRAMDisk" currently requests shared memory instead (see below), which
//   is refused with a deliberate core
// . "minimizeDiskSeeks" turns on the per-file page budget kept by getVfd()
// . returns false on failure, true otherwise
bool DiskPageCache::init ( const char *dbname ,
			   char rdbId,
			   long maxMem  ,
			   long pageSize,
			   bool useRAMDisk,
			   bool minimizeDiskSeeks ) {
	// throw away whatever a previous init() set up
	reset();

	// seems like we lose data when it prints "Caught add breach"
	// so let's stop using until we fix that... happens while we are
	// dumping i think and somehow the data seems to get lost that
	// we were dumping.
	//maxMem = 0;

	m_rdbId = rdbId;

	// find the master control that can switch this cache off, if any
	bool *tog = NULL;
	if (m_rdbId==RDB_INDEXDB   ) tog=&g_conf.m_useDiskPageCacheIndexdb;
	if (m_rdbId==RDB_POSDB   ) tog=&g_conf.m_useDiskPageCachePosdb;
	if (m_rdbId==RDB_DATEDB    ) tog=&g_conf.m_useDiskPageCacheDatedb;
	if (m_rdbId==RDB_TITLEDB   ) tog=&g_conf.m_useDiskPageCacheTitledb;
	if (m_rdbId==RDB_SPIDERDB  ) tog=&g_conf.m_useDiskPageCacheSpiderdb;
	if (m_rdbId==RDB_TFNDB     ) tog=&g_conf.m_useDiskPageCacheTfndb;
	if (m_rdbId==RDB_TAGDB     ) tog=&g_conf.m_useDiskPageCacheTagdb;
	if (m_rdbId==RDB_CLUSTERDB ) tog=&g_conf.m_useDiskPageCacheClusterdb;
	if (m_rdbId==RDB_CATDB     ) tog=&g_conf.m_useDiskPageCacheCatdb;
	if (m_rdbId==RDB_LINKDB    ) tog=&g_conf.m_useDiskPageCacheLinkdb;
	m_switch = tog;

	bool useSHM = false;
	// a quick hacky thing, force them to use shared mem instead of ram dsk
	if ( useRAMDisk ) {
		useRAMDisk = false;
		useSHM     = true;
	}
	// not for tmp cluster
	if ( g_hostdb.m_useTmpCluster ) useSHM = false;
	// it is off by default because it leaks easily (if u Ctrl+C the process)
	if ( ! g_conf.m_useSHM ) useSHM = false;
	// right now shared mem only supports a single page size because
	// we use s_mem/s_shmid, and if we have a small page size which
	// we free, then shmat() may get ENOMEM when trying to get the larger
	// of the two page sizes
	if ( useSHM && pageSize != GB_INDEXDB_PAGE_SIZE) {char *xx=NULL;*xx=0;}
	// don't use it until we figure out how to stop the memory from being
	// counted as being the process's memory space. i think we can make
	// shmat() use the same mem address each time...
	if ( useSHM ) {
		log("disk: shared mem currently not supported. Turn off "
		    "in gb.conf <useSharedMem>");
		char *xx=NULL;*xx=0;
	}
	// save it
	m_useSHM = useSHM;
	// no shared mem segments yet
	m_numShmids = 0;
	// the shared mem page size is a little more than the disk page size
	m_spageSize = pageSize + HEADERSIZE;
	// . this is /proc/sys/kernel/shmmax DIVIDED BY 2 on titan and gk0 now
	// . which is the max to get per call to shmat()
	// . making this smaller did not seem to have much effect on speed
	long max = 33554432/2;
	// make sure it is "pageSize" aligned so we don't split pages
	m_maxAllocSize = (max / m_spageSize) * m_spageSize;

#ifdef GBUSESHM
	// set it up
	if ( m_useSHM ) {
		// we can only use like 30MB shared mem pieces, so keep
		// grabbing segments until "maxMem" bytes are covered
		long need = maxMem;
		do {
			// how much to alloc now?
			long alloc = need;
			// this is /proc/sys/kernel/shmmax on titan and gk0
			if ( alloc > m_maxAllocSize ) alloc = m_maxAllocSize;
			// don't allow anything lower than this because we
			// always "swap out" one for another below. that is,
			// we call shmdt() to free it then shmat() to reclaim
			// it. otherwise, shmat() will run out of memory!!
			if ( alloc < m_maxAllocSize ) alloc = m_maxAllocSize;
			// get it     // SHM_R|SHM_W|SHM_R>>3|SHM_R>>6|...
			int shmid = shmget(IPC_PRIVATE, alloc,
					   SHM_R|SHM_W|IPC_CREAT);
			// on error, bail
			if ( shmid == -1 )
				return log("db: shmget: %s",mstrerror(errno));
			// log it
			log("db: allocated %li bytes shmid=%li",
			    alloc,(long)shmid);
			// add it to our list
			m_shmids    [ m_numShmids ] = shmid;
			m_shmidSize [ m_numShmids ] = alloc;
			m_numShmids++;
			// count it
			g_mem.m_sharedUsed += alloc;
			// subtract it and get more if needed
			need -= alloc;
		} while ( need > 0 );
	}
#endif

	// . build the malloc tag: "pgcache-" followed by up to 8 bytes of
	//   the db name and a terminating NUL, 17 bytes in all
	// . NOTE(review): m_memTag must hold at least 17 bytes; confirm
	//   against its declaration
	char *p = m_memTag;
	memcpy  ( p , "pgcache-" , 8 ); p += 8;
	// strncpy() NUL-pads to 8 bytes, so a short or missing name still
	// leaves a well defined string behind m_dbname
	strncpy ( p , dbname ? dbname : "" , 8 );
	// so we know what db we are caching for
	m_dbname = p;
	p += 8;
	*p = '\0';
	// . do not use more than this much memory for caching
	// . it may go over by like 2% for header information
	m_maxMemOff = maxMem ;
	// a page set holds a whole number of header+page slots, at most 128MB
	long phsize = pageSize + HEADERSIZE;
	m_maxPageSetSize = (((128*1024*1024)/phsize)*phsize);
	m_pageSize     = pageSize;

	m_minimizeDiskSeeks = minimizeDiskSeeks;

	// we need to keep a count memory of files being cached
	if ( m_minimizeDiskSeeks )
		m_memFree = m_maxMemOff;

	// . for now only indexdb will use the ramdisk
	// . useRAMDisk was cleared above, so this is currently never taken
	if ( useRAMDisk && dbname && strcmp ( dbname, "indexdb" ) == 0 ){
		if ( !initRAMDisk( dbname, maxMem ) )
			return log ( "db: failed to init RAM disk" );
	}

	// allocate the memory for the whole cache up front
	if ( ! growCache ( maxMem ) )
		return log("db: pagecache init failed: %s.",
			   mstrerror(g_errno));
	// success
	return true;
}

// use Linux's ram disk for caching disk pages, in addition to the ram it
// already uses. I would like to be able to pass in a "maxMemForRamDisk" parm
// to its init() function and have it open a single, ram-disk file descriptor
// for writing up to that many bytes.

// then i would like only Indexdb (and later on Datedb) to pass in an 800MB
// "maxMemForRamDisk" value, and, furthermore, i do not want to cache disk
// pages from the indexdb root file, nor, any indexdb file that is larger than
// twice the "maxMemForRamDisk" value (in this case 1.6GB). this will be used
// exclusively for smaller indexdb files to eliminate excessive disk seeks and
// utilize ALL the 4GB of ram in each machine.

// lastly, we need some way to "force" a merge at around midnight when traffic
// is minimal, or when there are 3 or more indexdb files that are less than
// 80% in the indexdb disk page cache. because that means we are starting to
// do a lot of disk seeks.
// . creates a fresh backing file for this db under /mnt/RAMDisk/ and stores
//   its descriptor in m_ramfd
// . "dbname" must not be NULL (we core if it is)
// . "maxMem" is currently unused here
// . returns false if the file could not be created; m_useRAMDisk is only
//   left set when a usable descriptor was obtained
bool DiskPageCache::initRAMDisk( const char *dbname, long maxMem ){
	if ( !dbname ) {char *xx=NULL; *xx=0;}
	// build the file name, never writing past the end of the buffer
	char ff [1024];
	int n = snprintf ( ff, sizeof(ff), "/mnt/RAMDisk/%sPageCache",
			   dbname );
	if ( n < 0 || n >= (int)sizeof(ff) ) {
		m_useRAMDisk = false;
		return log ( LOG_WARN,"db: RAMdisk file name too long" );
	}
	// unlink it first so we always start with an empty file
	unlink (ff);
	// open() requires the mode argument whenever O_CREAT is given;
	// leaving it out creates the file with arbitrary permission bits
	m_ramfd = open ( ff, O_RDWR | O_CREAT , 0644 );
	if ( m_ramfd < 0 ) {
		m_useRAMDisk = false;
		return log ( LOG_WARN,"db: could not open fd in RAMdisk" );
	}
	m_useRAMDisk = true;
	return true;
}

// . tries to serve the leading part of a disk read from the page cache
// . on return *newOffset / *newNumBytes describe what is still left to read
//   from disk; they equal offset / numBytes if nothing could be served
// . only a contiguous run of cached bytes starting at "offset" is copied,
//   we stop at the first missing page or hole
// . every page we copy from is moved to the head of the linked list
// . if *buf is NULL we allocate numBytes+allocOff bytes the first time a
//   usable page is found, report the allocation through *allocBuf and
//   *allocSize, and point *buf "allocOff" bytes into it
// . m_hits is bumped when the whole read was served, m_misses when the page
//   walk ended before that
void DiskPageCache::getPages   ( long       vfd         ,
				 char     **buf         ,
				 long       numBytes    ,
				 long long  offset      ,
				 long      *newNumBytes ,
				 long long *newOffset   ,
				 char     **allocBuf    ,
				 long      *allocSize   ,
				 long       allocOff    ) {
	// return new disk offset, assume unchanged
	*newOffset   = offset;
	*newNumBytes = numBytes;

	// return if no pages allowed in page cache
	if ( m_maxMemOff == 0 ) return;
	// or disabled
	if ( ! m_enabled ) return;
	// disabled at the master controls?
	if ( m_switch && ! *m_switch ) return;

	// . getVfd() returns -1 when it could not set up a map for the file
	// . that must never be used as an index into m_memOff[], whether or
	//   not we are minimizing disk seeks (addPages() does the same)
	if ( vfd < 0 )
		return;

	// or if no pages in this vfd
	if ( !m_memOff[vfd] )
		return;

	// what is the page range?
	long sp = offset / m_pageSize ;
	long ep = (offset + (numBytes-1)) / m_pageSize ;

	// . sanity check
	// . we establish the maxPagesInFile when BigFile::open is called
	//   by RdbDump. Rdb.cpp calls m_dump.set with a maxFileSize based on
	//   the mem occupied by the RdbTree. BUT, recs can be added to the tree
	//   WHILE we are dumping, so we end up with a bigger file, and this
	//   disk page cache is not prepared for it!
	if ( ep >= m_maxPagesInFile[vfd] ) {
		log("db: pagecache: Caught get breach. "
		    "ep=%li max=%li vfd=%li", ep,m_maxPagesInFile[vfd] ,vfd);
		return;
	}

	char *bufPtr = *buf;
	char *bufEnd = *buf + numBytes;

	// our offset into first page on disk
	oldshort start1 = offset - sp * m_pageSize;

	// copy page by page, starting with the first page of the read
	while ( sp <= ep ) {
		// the page offset in memory
		long poff = m_memOff[vfd][sp];
		// break if we do not have page in memory
		if ( poff < 0 ) break;
		// the header says how many bytes of the page are cached...
		oldshort size = 0;
		readFromCache( &size, poff, OFF_SIZE, sizeof(oldshort));
		// ...and where they start relative to the page boundary
		oldshort skip = 0;
		readFromCache( &skip, poff, OFF_SKIP, sizeof(oldshort));
		// if this page data starts AFTER our offset, it is no good
		if ( skip > start1 ) break;
		// adjust size by our page offset, we won't necessarily be
		// starting our read at "skip"
		size -= (start1 - skip);
		// if size is 0 or less all cached data was below our offset
		if ( size <= 0 ) break;
		// promote this page in the linked list
		promotePage ( poff , false );
		// allocate the read buffer if we need to
		if ( ! *buf ) {
			// allocate enough room for allocOff, too
			long need = numBytes + allocOff;
			char *p = (char *) mmalloc ( need,"PageCacheReadBuf" );
			// let FileState know what needs to be freed
			*allocBuf  = p;
			*allocSize = need;
			// if couldn't allocate, return now, what's the point
			if ( ! p ) return;
			// let caller know his new read buffer
			*buf       = p + allocOff;
			// assign the ptrs now
			bufPtr     = *buf ;
			bufEnd     = *buf + numBytes;
		}
		// don't store more than asked for
		if ( bufPtr + size > bufEnd ) size = bufEnd - bufPtr;
		readFromCache(bufPtr, poff, HEADERSIZE + start1 , size);
		bufPtr       += size;
		*newOffset   += size;
		*newNumBytes -= size;
		// return if we got it all
		if ( bufPtr >= bufEnd ) { m_hits += 1; return; }
		// otherwise, advance to next page
		sp++;
		// and our page relative offset is zero now, iff ep > sp
		if ( sp <= ep ) start1 = 0;
		// if the cached page ended before the physical page, break out
		// because we don't want any holes
		readFromCache( &size, poff, OFF_SIZE, sizeof(oldshort));
		if ( skip + size < m_pageSize ) break;
	}

	// we do not try to fill the read from its tail end as well, so
	// anything not served above has to come from disk
	m_misses += 1;
}

// after you read/write from/to disk, copy into the page cache
void DiskPageCache::addPages ( long vfd,
			       char *buf,
			       long numBytes,
			       long long offset ,
			       long niceness ){
	// check for override function
	//if ( m_isOverriden ) {
	//	m_addPages2 ( this,
	//		      vfd,
	//		      buf,
	//		      numBytes,
	//		      offset );
	//	return;
	//}
	// if vfd is -1, then we were not able to add a map for this file
	if ( vfd < 0 ) return;
	// no NULL ptrs
	if ( ! buf ) return;
	// return if no pages allowed in page cache
	if ( m_maxMemOff == 0 ) return;
	// or disabled
	if ( ! m_enabled ) return;
	// disabled at the master controls?
	if ( m_switch && ! *m_switch ) return;
	// sometimes the file got unlinked on us
	if ( ! m_memOff[vfd] ) return;
	// what is the page range?
	long long sp = offset / m_pageSize ;
	// point to it
	char *bufPtr = buf;
	char *bufEnd = buf + numBytes;
	// . do not add first page unless right on the boundary
	// . how much did we exceed the boundary by?
	oldshort skip = offset - sp * m_pageSize ;
	long  size = m_pageSize - skip;
	// now add the remaining pages
	while ( bufPtr < bufEnd ) {
		// breathe
		QUICKPOLL(niceness);
		// ensure "size" is not too big
		if ( bufPtr + size > bufEnd ) size = bufEnd - bufPtr;
		// add the page to memory
		addPage ( vfd , sp , bufPtr , size , skip );
		// advance
		bufPtr += size;
		sp++;
		size    = m_pageSize;
		skip    = 0;
	}
}

// . translates a memory offset, as stored in m_memOff[][], into a pointer
//   into one of the page sets
// . returns NULL for a negative offset, which means "not in DiskPageCache"
// . cores if the offset lies beyond the page sets we have
char *DiskPageCache::getMemPtrFromOff ( long off ) {
	if ( off < 0 ) return NULL;
	// which page set, and how far into it
	long setNum   = off / m_maxPageSetSize ;
	long inSetOff = off % m_maxPageSetSize ;
	// a header+page never straddles two sets; if it would not fit in
	// what is left of this set it lives at the start of the next one
	if ( inSetOff + m_pageSize + HEADERSIZE > m_maxPageSetSize ) {
		setNum++;
		inSetOff = 0;
	}
	// sanity check
	if ( setNum >= m_numPageSets ) { char *xx = NULL; *xx = 0; }
	// return the proper ptr
	return m_pageSet[setNum] + inSetOff;
}

// . stores "size" bytes of "page" as the cached copy of page "pageNum" of
//   file "vfd"; "skip" is the offset of "page" into the physical page
// . if the page is already cached we only try to extend it (enhancePage)
// . otherwise a slot is taken, in this order: from the list of recycled
//   offsets, from never used cache memory, or by evicting the page at the
//   tail of the linked list
// . each slot starts with a header: bytes cached (OFF_SIZE), their offset
//   into the page (OFF_SKIP), the prev/next links (OFF_PREV/OFF_NEXT) and a
//   ptr back to the owning m_memOff[vfd][pageNum] entry (OFF_PTR)
// . NOTE(review): the header fields are 4 bytes apart but are read and
//   written with sizeof(long); that only lines up where long is 4 bytes.
//   confirm the build target.
void DiskPageCache::addPage(long vfd,long pageNum,char *page,long size,
			    oldshort skip){
	// . if pageNum is beyond the file size
	// . happens because rdbdump did not get a high enough
	//   maxfilesize so we did not make enough pages! we ended up
	//   dumping more than what was in the tree because stuff was
	//   added to the tree while dumping!
	if ( pageNum >= m_maxPagesInFile[vfd] ) {
		// this has happened during a merge before!! (at startup)
		log("db: pagecache: Caught add breach. "
		    "pageNum=%li max=%li db=%s",
		    pageNum,m_maxPagesInFile[vfd],m_dbname);
		return;
	}

	long poff = m_memOff [ vfd ] [ pageNum ] ;
	// if page already exists in cache and needs data on the boundaries
	// we may be able to supply it
	if ( poff >= 0 ) {
		enhancePage ( poff , page , size , skip );
		return;
	}

	// don't add any more if we're minimizing disk seeks and are full
	if ( m_minimizeDiskSeeks &&
	     m_numPagesPresentOfFile[vfd] >= m_maxPagesPerFile[vfd] )
		return;

	// try to get an available memory spot from list
	if ( m_numAvailMemOffs > 0 ) {
		poff = m_availMemOff [ --m_numAvailMemOffs ] ;
	}
	// can we grab a page from memory without having to grow?
	else if ( m_nextMemOff + m_pageSize + HEADERSIZE < m_upperMemOff ) {
		poff = m_nextMemOff;
		m_nextMemOff += m_pageSize + HEADERSIZE;
	}
	// this should never happen. Since in minimizeDiskSeek we have
	// an exact number of pages per file
	else if ( m_minimizeDiskSeeks ) {
		char *xx = NULL; *xx = 0;
	}
	// if no freebies left, take over the tail page in memory
	else {
		// . with an empty list there is no page to take over, which
		//   happens if the cache is too small to hold even one page
		// . bail before excisePage() gets handed the -1 "offset"
		if ( m_tailOff < 0 ) return;
		poff = m_tailOff;
		excisePage ( poff );
		// . the file no longer owns him
		// . the header holds a ptr to the m_memOff[vfd][pageNum]
		//   entry of the previous owner
		// . if that vfd no longer exists it should have added all its
		//   pages to m_avail list
		long *memOffPtr = NULL;
		readFromCache(&memOffPtr, poff, OFF_PTR, sizeof(long*));
		*memOffPtr = -1;
	}
	// sanity check
	if ( poff < 0 ) { char *xx = NULL; *xx = 0; }
	// store how many bytes we cache and where they start in the page
	writeToCache(poff, OFF_SIZE, &size, sizeof(oldshort));
	writeToCache( poff, OFF_SKIP, &skip, sizeof(oldshort) );
	// sanity check
	if ( size + skip > m_pageSize ) { char *xx = NULL; *xx = 0; }
	// store the link information, we become the head of the list
	promotePage ( poff , true/*isNew?*/ );
	// then store a ptr to m_memOff[vfd][pageNum] so we can set *ptr
	// to -1 if the page gets replaced by another
	long *memOffPtr = &m_memOff[ vfd ][ pageNum ];
	writeToCache( poff, OFF_PTR, &memOffPtr, sizeof(long*));
	// then the data from disk (skip over link info)
	writeToCache( poff, HEADERSIZE + skip, page, size);
	// . when the pages live in our own page sets, double check that the
	//   offset maps to a ptr which maps back to the same offset
	if ( !m_useRAMDisk && ! m_useSHM ) {
		long off = -1;
		char *p = getMemPtrFromOff ( poff );
		for ( long i = 0 ; i < m_numPageSets ; i++ ) {
			// the set covers [start, start+size); the address
			// one past its end is NOT part of it and may just
			// as well be the start of another set
			if ( p <  m_pageSet[i] ) continue;
			if ( p >= m_pageSet[i] + m_pageSetSize[i] )
				continue;
			off = p - m_pageSet[i] + i * m_maxPageSetSize ;
			break;
		}
		// update map
		m_memOff [ vfd ] [ pageNum ] = off;
		// sanity check
		if ( off != poff ) { char *xx=NULL; *xx=0; }
	}
	else
		m_memOff [ vfd ] [ pageNum ] = poff;

	// we have added the page!
	if ( m_minimizeDiskSeeks )
		m_numPagesPresentOfFile[vfd]++;
}

// add data from "page" (we just read it from disk or wrote to disk)
// into "p" page in memory
void DiskPageCache::enhancePage (long poff, char *page, long size,
				 oldshort skip) {
	oldshort psize = 0;
	readFromCache( &psize, poff, OFF_SIZE, sizeof(oldshort));
	//oldshort psize = *(oldshort *)(p+OFF_SIZE);
	oldshort pskip = 0;
	readFromCache( &pskip, poff, OFF_SKIP, sizeof(oldshort));
	//oldshort pskip = *(oldshort *)(p+OFF_SKIP);
	// can we add to front of page?
	if ( skip < pskip ) {
		long diff = pskip - skip;
		// . we cored here because page[diff-1] was out of bounds. why?
		// . do not allow gap in between cached data, that is, we have
		//   cached bytes at the end of the page, then we try to cache
		//   some at the beginning, and it's not contiguous... we are
		//   not built for that... this can happen when dumping a file,
		//   if your first reads up to the file end (somewhere in the
		//   middle of the page) and your second read starts somewhere
		//   else.... mmmm... i dunno....
		if ( skip + size < pskip || diff > size ) {
			log("db: Avoided cache gap in %s. diff=%li "
			    "size=%li pskip=%li skip=%li.",
			    m_dbname,diff,size,(long)pskip,(long)skip);
			return;
		}
		writeToCache(poff, HEADERSIZE + skip , page , diff);
		//memcpy ( p + HEADERSIZE + skip , page , diff );
		psize += diff;
		pskip -= diff;
		writeToCache(poff, OFF_SIZE, &psize, sizeof(oldshort));
		//*(oldshort *)(p+OFF_SIZE) = psize ;
		writeToCache(poff, OFF_SKIP, &pskip, sizeof(oldshort));
		//*(oldshort *)(p+OFF_SKIP) = pskip ;
	}
	// can we add to end of page?
	long pend = pskip + psize;
	long  end = skip  +  size;
	if ( end <= pend ) return;
	long diff = end - pend ;
	// if the read's starting point is beyond our ending point, bail,
	// we don't want any holes...
	if ( diff > size ) return;
	writeToCache(poff, HEADERSIZE + pend, page + size - diff, diff);
	//memcpy ( p + HEADERSIZE + pend , page + size - diff , diff );
	oldshort tmp = psize+diff;
	writeToCache(poff, OFF_SIZE, &tmp, sizeof(oldshort));
	//*(oldshort *)(p+OFF_SIZE) = (oldshort)psize + diff;
}

// . makes the page at memory offset "poff" the head of the linked list
// . the links live in the page header at OFF_PREV and OFF_NEXT and are
//   memory offsets, -1 meaning "none"
// . pass isNew=true for a page that is not in the list yet; any other page
//   is unlinked from its current position first
void DiskPageCache::promotePage ( long poff , bool isNew ) {
	// a page that is already linked in has to come out first
	if ( ! isNew ) excisePage ( poff );
	// the head has nobody in front of it
	long none = -1;
	writeToCache(poff, OFF_PREV, &none, sizeof(long));
	// testing: read it back and core if the write did not stick
	readFromCache ( &none, poff, OFF_PREV, sizeof(long) );
	if ( none != -1 ) { char *xx=NULL; *xx=0; }
	// our next is the old head
	writeToCache(poff, OFF_NEXT, &m_headOff, sizeof(long));
	// and the old head's prev is us
	if ( m_headOff >= 0 )
		writeToCache(m_headOff, OFF_PREV, &poff, sizeof(long));
	// we're the new head
	m_headOff = poff;
	// if no tail, we become that, too, we must be the first
	if ( m_tailOff < 0 ) m_tailOff = poff;
}

// . unlinks the page at memory offset "poff" from the linked list
// . its neighbors get linked to each other and m_headOff / m_tailOff move
//   on if the page was the head / tail
// . the page's own OFF_PREV / OFF_NEXT fields are left as they were
void DiskPageCache::excisePage ( long poff ) {
	// our neighbors as memory offsets, negative if none
	long before = 0;
	readFromCache(&before, poff, OFF_PREV, sizeof(long));
	long after = 0;
	readFromCache(&after, poff, OFF_NEXT, sizeof(long));
	// if we were the head or tail, then pass it off to our neighbor
	if ( m_headOff == poff ) m_headOff = after;
	if ( m_tailOff == poff ) m_tailOff = before;
	// our prev's next becomes our old next
	if ( before >= 0 )
		writeToCache(before, OFF_NEXT, &after, sizeof(long));
	// our next's prev becomes our old prev
	if ( after >= 0 )
		writeToCache(after, OFF_PREV, &before, sizeof(long));
}

// . claims a free slot ("vfd") in m_memOff[] for a file and allocates the
//   map from the file's page numbers to memory offsets
// . "maxFileSize" is so we can alloc m_memOff[vfd] big enough for all pgs
// . "vfdAllowed" only matters when minimizing disk seeks: if it is false
//   the file gets no page cache
// . returns the vfd, or -1 if the file will not be cached; g_errno is set
//   only when all MAX_NUM_VFDS2 slots are in use
// . the slot is given back with rmVfd()
long DiskPageCache::getVfd ( long long maxFileSize, bool vfdAllowed ) {
	// for RAMDisks, do not cache disk
	// pages from the indexdb root file, nor, any indexdb file that is
	// larger than twice the "maxMemForRamDisk" value
	// (multiply as long long so a big m_maxMemOff can not overflow)
	if ( m_useRAMDisk && maxFileSize > 2LL * m_maxMemOff ){
		log (LOG_INFO,"db: getvfd: cannot cache on RAMDisk files that "
		     "larger than twice the max mem value. fileSize=%lli "
		     "maxMem=%li", maxFileSize, m_maxMemOff);
		return -1;
	}

	long  numPages = (maxFileSize / m_pageSize) + 1;

	// RESTRICT to only the first m_maxMemOff worth of files,
	// starting with the SMALLEST file first. so if maxMemoff is 50MB, and
	// we have 4 files that are 10,20,30 & 40MB,
	// then we use 10MB for the first file, 20MB of the 2nd BUT only
	// 20MB for the 3rd file, and the 4th file does not get any page cache.
	// if doing "biased lookups" each file is virtually half the actual
	// size, and this allocates page cache appropriately.

	// don't do a page cache for an indexdb0001.dat that is 100GB
	// because we'd have to allocate too much mem for the m_memOff[] array
	if ( m_minimizeDiskSeeks && !vfdAllowed ){
		log (LOG_INFO,"db: getVfd: cannot cache because minimizing "
		     "disk seeks. numPages=%li", numPages);
		return -1;
	}

	// . pick a vfd for this BigFile to use
	// . start AFTER last pick in case BigFile closed, released its
	//   m_vfd, a read thread returned and called addPages() using that
	//   old m_vfd!!!!!!! TODO: can we fix this better?
	// . look at each slot exactly once and remember the hit explicitly,
	//   so that "found in the last slot probed" and "none free" can not
	//   be mistaken for one another
	long vfd = -1;
	long i   = m_nexti;
	for ( long tries = 0 ; tries < MAX_NUM_VFDS2 ; tries++ , i++ ) {
		if ( i >= MAX_NUM_VFDS2 ) i = 0; // wrap
		if ( m_memOff [ i ] ) continue;
		vfd = i;
		break;
	}
	// bail if none left
	if ( vfd < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,"db: pagecache: getvfd: no vfds remaining.");
		return -1;
	}
	// start here next time
	m_nexti = vfd + 1;

	// alloc the map space for this file
	long  need     = numPages * sizeof(long) ;
	long *buf      = (long *)mmalloc ( need , m_memTag );
	if ( ! buf ) {
		log("db: Failed to allocate %li bytes for page cache "
		    "structures for caching pages for vfd %li. "
		    "MaxfileSize=%lli. Not enough memory.",
		    need,vfd,maxFileSize);
		return -1;
	}
	m_memOff         [ vfd ] = buf;
	m_maxPagesInFile [ vfd ] = numPages;

	// keep a tab on the number of pages we can store of the file
	if ( m_minimizeDiskSeeks ){
		m_numPagesPresentOfFile[vfd] = 0;
		// all of them if they fit into what is left of the budget,
		// otherwise as many as still fit
		if ( m_memFree > numPages * ( HEADERSIZE + m_pageSize ) )
			m_maxPagesPerFile[vfd] = numPages;
		else
			m_maxPagesPerFile[vfd] = m_memFree / ( m_pageSize +
							       HEADERSIZE );
	}

	// add it in
	m_memAlloced += need;
	// no pages are in memory yet, so set offsets to -1
	for ( long pg = 0 ; pg < numPages ; pg++ )
		m_memOff [ vfd ] [ pg ] = -1;

	// . if minimizing disk seeks then charge the file to the budget
	// . NOTE(review): this subtracts maxFileSize while rmVfd() gives
	//   back m_maxPagesPerFile[vfd] * m_pageSize, so the two do not
	//   cancel exactly. left as is.
	if ( m_minimizeDiskSeeks ){
		m_memFree -= maxFileSize;
		// if the file is bigger than the mem only partially store it
		if ( m_memFree < 0 )
			m_memFree = 0;
	}
	return vfd;
}

// . when a file loses its vfd this is called
// . every page of the file that is still cached is taken out of the linked
//   list and its memory offset is put on the list of available offsets
// . then the map that getVfd() allocated for the file is freed, which also
//   makes the vfd slot available again
void DiskPageCache::rmVfd  ( long vfd ) {
	// ensure validity
	if ( vfd < 0 ) return;
	// this vfd may have already been nuked by call to unlink!
	if ( ! m_memOff [ vfd ] ) return;
	// . we do NOT bail here just because the cache has no memory
	//   (m_maxMemOff <= 0): getVfd() allocates the map regardless, so
	//   it has to be freed regardless or the slot is lost until reset()
	// . in that case no page is cached and the loop below finds nothing
	// add valid offsets used by vfd into m_availMemOff
	for ( long i = 0 ; i < m_maxPagesInFile [ vfd ] ; i++ ) {
		long off = m_memOff [ vfd ] [ i ];
		if ( off < 0 ) continue;
		// sanity check: there must be room for one more entry
		// BEFORE we store it
		if ( m_numAvailMemOffs >= m_maxAvailMemOffs ) {
			char *xx = NULL; *xx = 0; }
		// store it in list of available memory offsets so some other
		// file can use it
		m_availMemOff [ m_numAvailMemOffs++ ] = off;
		// remove that page from linked list, too
		excisePage ( off );
	}
	// free the map that maps this files pages on disk to pages/offs in mem
	long size = m_maxPagesInFile[vfd] * sizeof(long);
	mfree ( m_memOff [ vfd ] , size , "DiskPageCache" );
	m_memOff [ vfd ] = NULL;
	m_memAlloced -= size;
	// give the file's share of the budget back
	if ( m_minimizeDiskSeeks ){
		m_memFree += m_maxPagesPerFile[vfd] * m_pageSize;
		m_maxPagesPerFile[vfd] = 0;
		m_numPagesPresentOfFile[vfd] = 0;
	}
}

// . use "mem" bytes of memory for the cache
// . "mem" is capped at m_maxMemOff and we return true right away if that
//   is not more than what we already have (m_upperMemOff)
// . first regrows m_availMemOff so it has one slot per page, then allocates
//   the page memory itself in page sets of at most m_maxPageSetSize bytes
// . in RAM disk or shared memory mode no page memory is allocated here,
//   only the counters are set
// . on an allocation failure we log and return the result of log()
//   (presumably false -- confirm against Log.h)
bool DiskPageCache::growCache ( long mem ) {
	// debug msg
	//log("GROWING PAGE CACHE from %li to %li bytes", m_upperMemOff, mem );
	// don't exceed the max
	if ( mem > m_maxMemOff ) mem = m_maxMemOff;
	// bail if we wouldn't be growing
	if ( mem <= m_upperMemOff ) return true;
	// how many pages? round up.
	long npages = mem/(m_pageSize+HEADERSIZE) + 1;

	// . we need one "available" slot for each page in the cache
	// . this is a list of memory offsets that are available
	long oldSize = m_maxAvailMemOffs * sizeof(long) ;
	long newSize = npages            * sizeof(long) ;
	long *a = (long *) mrealloc(m_availMemOff,oldSize,newSize,m_memTag);
	if ( ! a ) return log("db: Failed to regrow page cache from %li to "
			      "%li bytes. Not enough memory.",oldSize,newSize);
	m_availMemOff     = a;
	m_maxAvailMemOffs = npages;
	m_memAlloced += (newSize - oldSize);
	// debug msg
	//log("%s growCache: up %li",m_dbname,(newSize - oldSize));

	// how much more mem do we need to alloc?
	long need = mem - m_upperMemOff ;
	// how big is our last page set?
	long size = 0;
	char *ptr = NULL;
	long    i = 0;
	if ( m_numPageSets > 0 ) {
		// since we allocate everything at init this shouldn't happen
		char *xx=NULL; *xx=0;
		i    = m_numPageSets - 1;
		ptr  = m_pageSet     [ i ];
		size = m_pageSetSize [ i ];
	}
	// realloc him, but only up to the max page set size and never by
	// more than we still need
	long extra = m_maxPageSetSize - size ;
	if ( extra > need ) extra = need;
	// NOTE(review): both special modes below ASSIGN m_memAlloced, which
	// drops the m_availMemOff accounting added above -- confirm intended
	if ( m_useRAMDisk ){
		// since RAMdisk it creates a file, no reason to alloc
		m_memAlloced = need;
		m_upperMemOff = need;
		return true;
	}
	// and shared mem already has the mem at this point
	if ( m_useSHM ) {
		m_memAlloced = need;
		m_upperMemOff = need;
		return true;
	}

	char *s = (char *)mrealloc ( ptr , size , size + extra,
				     m_memTag);
	if ( ! s ) return log("db: Failed to allocate %li bytes more "
			      "for pagecache.",extra);
	m_pageSet     [ i ] = s;
	m_pageSetSize [ i ] = size + extra;
	// if we are not adding to an existing, we are a new page set
	if ( ! ptr ) m_numPageSets++;
	// discount it
	need -= extra;
	// add to alloc count
	m_memAlloced  += extra;
	m_upperMemOff += extra;
	// debug msg
	//log("%s growCache2: up %li",m_dbname,extra);
	// if we do not need more, we are done
	if ( need == 0 ) return true;
	// otherwise, alloc new page sets until we hit it
	for ( i++ ; i < MAX_PAGE_SETS && need > 0 ; i++ ) {
		long setSize = need;
		if ( setSize > m_maxPageSetSize ) setSize = m_maxPageSetSize;
		m_pageSet[i] = (char *) mmalloc ( setSize , m_memTag );
		// . stop if we could not get it
		// . "need" has not been discounted yet, so it stays above
		//   zero and the failure is reported below instead of being
		//   mistaken for success
		if ( ! m_pageSet[i] ) break;
		// only discount memory we actually obtained
		need             -= setSize;
		m_pageSetSize[i]  = setSize;
		m_memAlloced     += setSize;
		m_upperMemOff    += setSize;
		m_numPageSets++;
		// debug msg
		//log("%s growCache3: up %li",m_dbname,setSize);
	}
	// update upper bound
	if ( need == 0 ) return true;
	// ran out of page sets or out of memory before reaching "mem"
	return log(LOG_LOGIC,"db: pagecache: Bad engineer. Weird problem.");
}

// . bytes of page memory in use
// . that is the memory handed out so far (m_nextMemOff) minus the slots
//   sitting in the available list, each one page plus its header in size
long DiskPageCache::getMemUsed ( ) {
	long slotSize = m_pageSize + HEADERSIZE;
	long idle     = m_numAvailMemOffs * slotSize;
	return m_nextMemOff - idle;
}

#include "BigFile.h"
#include "Threads.h"

// . compare every page of file "f" that we hold in memory against the
//   bytes actually on disk
// . cores on any mismatch, on a header size above 32k or if the disk read
//   does not complete
// . always returns true
bool DiskPageCache::verify ( BigFile *f ) {
	// nothing to check without a valid vfd, or if the vfd's map is gone
	// (it may have already been nuked by a call to unlink)
	const long vfd = f->getVfd();
	if ( vfd < 0 || ! m_memOff [ vfd ] ) return true;
	// debug msg
	//log("VERIFYING PAGECACHE vfd=%li fn=%s",vfd,f->getFilename());
	// one buffer for the bytes from disk, one for the bytes from cache
	char fromDisk  [ 32 * 1024 ];
	char fromCache [ 32 * 1024 ];
	// turn threads off for the duration, remembering if they were on
	const bool threadsWereOn = ! g_threads.areThreadsDisabled();
	if ( threadsWereOn ) g_threads.disableThreads();
	// and disable ourselves, too
	disableCache();
	// visit each disk page of the file
	for ( long pageNum = 0 ; pageNum < m_maxPagesInFile [ vfd ] ;
	      pageNum++ ) {
		// skip pages that are not in memory
		const long memOff = m_memOff [ vfd ] [ pageNum ];
		if ( memOff < 0 ) continue;
		// pull the stored byte count and the skip out of the header
		oldshort numBytes = 0;
		readFromCache(&numBytes, memOff, OFF_SIZE, sizeof(oldshort));
		oldshort skipped = 0;
		readFromCache(&skipped, memOff, OFF_SKIP, sizeof(oldshort));
		// must fit in our buffers
		if ( numBytes > 32 * 1024 ){
			char *xx=NULL; *xx=0; }
		// the stored bytes start "skipped" bytes into the disk page
		const long long diskOff =
			((long long)pageNum * (long long)m_pageSize) +
			(long long)skipped;
		FileState fstate;
		const bool done = f->read ( fromDisk ,
					    numBytes ,
					    diskOff  ,
					    &fstate  ,
					    NULL     ,  // state
					    NULL     ,  // callback
					    0        ); // niceness
		// core if it did not complete
		if ( ! done ) {
			char *xx = NULL; *xx = 0; }
		log("checking page # %li size=%li skip=%li", pageNum,
		    numBytes, skipped);
		// fetch the same bytes from the cache, they follow the header
		readFromCache( fromCache, memOff, HEADERSIZE + skipped,
			       numBytes );
		// and the two copies must agree
		if ( memcmp ( fromDisk, fromCache, numBytes ) != 0 ){
			char *xx = NULL; *xx = 0; }
	}
	// put things back the way they were
	if ( threadsWereOn ) g_threads.enableThreads();
	enableCache();
	// debug msg
	log("DONE VERIFYING PAGECACHE");
	return true;
}

// . copy "size" bytes from "inBuf" into the cache
// . "bigOff" is used to get the mem ptr, "smallOff" is the offset from there
// . the destination is shared memory, the RAM disk file or the regular
//   page sets, depending on m_useSHM and m_useRAMDisk
void DiskPageCache::writeToCache( long bigOff, long smallOff,  void *inBuf,
				  long size ){

#ifdef GBUSESHM
	if ( m_useSHM ) {
		// absolute position of the first byte
		const long where = bigOff + smallOff;
		// which shared mem segment is that in, and how far into it?
		const long seg    = where / m_maxAllocSize;
		const long segOff = where % m_maxAllocSize;
		// sanity check: the segment must exist
		if ( seg >= m_numShmids ) { char *xx=NULL; *xx=0; }
		// sanity check: the write must fit inside the segment
		if ( segOff + size > m_shmidSize[seg] ) {
			char *xx=NULL;*xx=0; }
		const int wanted = m_shmids[seg];
		// . s_mem/s_shmid describe the segment currently attached
		// . if that is not the one we want, swap it
		// . the shmdt() and shmat() seems to take about 12
		//   microseconds on avg to execute. so about 100 times per
		//   millisecond.
		// . seems like the writeToCache() is 3x slower than the
		//   readFromCache() perhaps because the dirty pages are
		//   COPIED back into system mem?
		if ( wanted != s_shmid ) {
			// detach the current one, if any
			if ( s_mem && shmdt ( s_mem ) == -1 ) {
				log("disk: shmdt: %s",mstrerror(errno));
				char *xx=NULL;*xx=0;
			}
			// attach the one we need
			char *attached =
				(char *) shmat ( wanted , NULL, SHM_R|SHM_W );
			// if this happens at startup, try calling shmat
			// when we init this page cache above...
			if ( attached == (char *)-1 ) {
				log("disk: shmat: %s",mstrerror(errno));
				char *xx=NULL;*xx=0;
			}
			// remember what is attached now
			s_mem   = attached;
			s_shmid = wanted;
		}
		// store it into the cache
		memcpy ( s_mem + segOff , inBuf , size );
		return;
	}
#endif

	if ( m_useRAMDisk ){
		// write to the m_ramfd file at the absolute offset
		long wrote = pwrite( m_ramfd, inBuf, size, bigOff + smallOff );
		// core unless every byte made it
		if ( wrote != size ){ char *xx=NULL; *xx=0; }
		return;
	}

	// otherwise copy straight into the page's memory
	memcpy ( getMemPtrFromOff ( bigOff ) + smallOff , inBuf , size );
}

// . copy "size" bytes out of the cache into "outBuf"
// . "bigOff" is used to get the mem ptr, "smallOff" is the offset from there
// . the source is shared memory, the RAM disk file or the regular page
//   sets, depending on m_useSHM and m_useRAMDisk
void DiskPageCache::readFromCache( void *outBuf, long bigOff, long smallOff,
				   long size ){
#ifdef GBUSESHM
	if ( m_useSHM ) {
		// absolute position of the first byte
		const long where = bigOff + smallOff;
		// which shared mem segment is that in, and how far into it?
		const long seg    = where / m_maxAllocSize;
		const long segOff = where % m_maxAllocSize;
		// sanity check: the segment must exist
		if ( seg >= m_numShmids ) { char *xx=NULL; *xx=0; }
		// sanity check: the read must fit inside the segment
		if ( segOff + size > m_shmidSize[seg] ) {
			char *xx=NULL;*xx=0; }
		const int wanted = m_shmids[seg];
		// . s_mem/s_shmid describe the segment currently attached
		// . if that is not the one we want, swap it
		// . the shmdt() and shmat() seems to take about 2 MICROSECONDS
		//   on avg to execute here. about 3x faster than the
		//   writeToCache() above.
		if ( wanted != s_shmid ) {
			// detach current first so shmat has some room?
			if ( s_mem && shmdt ( s_mem ) == -1 ) {
				log("disk: shmdt: %s",mstrerror(errno));
				char *xx=NULL;*xx=0;
			}
			// attach the one we need
			char *attached =
				(char *) shmat ( wanted , NULL, SHM_R|SHM_W );
			// if this happens at startup, try calling shmat
			// when we init this page cache above...
			if ( attached == (char *)-1 ) {
				log("disk: shmat: %s",mstrerror(errno));
				char *xx=NULL;*xx=0;
			}
			// remember what is attached now
			s_mem   = attached;
			s_shmid = wanted;
		}
		// store it in outBuf
		memcpy ( outBuf , s_mem + segOff , size );
		return;
	}
#endif

	if ( m_useRAMDisk ) {
		// read from the m_ramfd file at the absolute offset
		long got = pread( m_ramfd, outBuf, size, bigOff + smallOff );
		// core unless we got every byte
		if ( got != size ){ char *xx=NULL; *xx=0; }
		return;
	}

	// the old fashioned way: copy straight out of the page's memory
	memcpy ( outBuf , getMemPtrFromOff ( bigOff ) + smallOff , size );
}

// lastly, we need some way to "force" a merge at around midnight when traffic
// is minimal, or when there are 3 or more indexdb files that are less than
// 80% in the indexdb disk page cache. because that means we are starting to
// do a lot of disk seeks.
// checks if indexdb needs merge
/*
bool DiskPageCache::needsMerge( ){
	if ( !m_useRAMDisk ) return false;
	long numVfds = 0;
	for ( long i = 0; i < MAX_NUM_VFDS2; i++ ){
		if ( !m_memOff[i] ) continue;
		// check to see if a file is less than 80% in the indexdb
		// disk page cache
		long numOffsUsed = 0;
		for ( long j = 0; j < m_maxPagesInFile[i]; j++ ){
			if ( m_memOff[i][j] >= 0 )
				numOffsUsed++;
		}
		if ( (numOffsUsed * 100)/m_maxPagesInFile[i] < 80 )
			numVfds++;
	}
	if ( numVfds >= 3 )
		return true;
	return false;
}
*/

// . try to remove every shared memory segment whose id is in [0,max)
// . 'ipcs -m' will show shared mem in linux
// . compiles to an empty function unless GBUSESHM is defined
void freeAllSharedMem ( long max ) {

	// free shared mem whose pid no longer exists
	//struct shmid_ds buf;
	//shmctl ( 0 , SHM_STAT , &buf );
	//int shmctl(int shmid, int cmd, struct shmid_ds *buf);

#ifdef GBUSESHM
	// types.h uses key_t type that shmget uses
	// try to nuke it all
	long id = 0;
	while ( id < max ) {
		// ids we fail to remove (status of -1) are skipped quietly
		//if ( errno != EINVAL )
		//	log("db: shctlt %li: %s",(long)shmid,mstrerror(errno));
		long status = shmctl ( (int)id , IPC_RMID , NULL );
		if ( status != -1 )
			log("db: Removed shmid %li",id);
		id++;
	}
#endif
}

// types.h uses key_t type that shmget uses
#undef key_t