#undef _XOPEN_SOURCE // needed for pread and pwrite #define _XOPEN_SOURCE 500 #include "gb-include.h" #include "DiskPageCache.h" #include "RdbMap.h" // GB_PAGE_SIZE #include "Indexdb.h" #include "Profiler.h" // types.h uses key_t type that shmget uses //#undef key_t /* #ifdef GBUSESHM #include // shmget() #include // shmget() #endif */ // FORMAT of a MEMORY PAGE representing a DISK PAGE // // HEADER: // // bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbb # of disk data bytes stored in this page // ffffffff ffffffff ffffffff fffffff Offset into memory page they are stored // pppppppp pppppppp pppppppp ppppppp Offset of prev mem page in linked list // nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnn Offset of next mem page in linked list // dddddddd dddddddd dddddddd ddddddd Disk page # mem page is mapping. // vvvvvvvv vvvvvvvv vvvvvvvv vvvvvvv vfd of file page is mapping // // DDDDDDDD ........ raw disk data at that page... // offsets in bytes in the header each entry has. // should total HEADERSIZE bytes. #define OFF_SIZE 0 #define OFF_SKIP (int)(sizeof(int32_t)) #define OFF_PREV (int)(sizeof(int32_t)*2) #define OFF_NEXT (int)(sizeof(int32_t)*3) #define OFF_DISKPAGENUM (int)(sizeof(int32_t)*4) #define OFF_VFD (int)(sizeof(int32_t)*5) // store disk data iteself into page at this offset #define HEADERSIZE (int)(sizeof(int32_t)*6) DiskPageCache::DiskPageCache () { m_numPageSets = 0; // sometimes db may pass an unitialized DiskPageCache to a BigFile // so make sure when BigFile::close calls DiskPageCache::rmVfd() our // m_memOffFromDiskPage vector is all NULLed out, otherwise // it will core //memset ( m_memOff , 0 , sizeof(int32_t *) * MAX_NUM_VFDS2 ); for ( int32_t i = 0 ; i < MAX_NUM_VFDS2 ; i++ ) m_memOffFromDiskPage[i] = NULL; m_availMemOff = NULL; //m_isOverriden = false; reset(); } DiskPageCache::~DiskPageCache() { reset(); } /* #ifdef GBUSESHM static char *s_mem = NULL; static int s_shmid = -1; #endif */ void DiskPageCache::reset() { if ( m_numPageSets > 0 ) log("db: resetting page cache for %s",m_dbname); // . "m_pageSet[]" the actual memory buffers for holding disk pages // . we allocate one m_pageSet[] at a time like pools for ( int32_t i = 0 ; i < m_numPageSets ; i++ ) { mfree ( m_pageSet[i], m_pageSetSize[i], "DiskPageCache"); m_pageSet [i] = NULL; m_pageSetSize[i] = 0; } // . free all the m_memOffs[] arrays // . free map that maps this files pages on disk to pages/offs in mem // . m_memOffs[DISKPAGENUM] -> MEMPAGEOFFSET for ( int32_t i = 0 ; i < MAX_NUM_VFDS2 ; i++ ) { if ( ! m_memOffFromDiskPage [ i ] ) continue; int32_t size = m_maxPagesInFile[i] * sizeof(int32_t); mfree ( m_memOffFromDiskPage [ i ] , size , "DiskPageCache" ); m_memOffFromDiskPage [ i ] = NULL; } // . and these contain offsets to available memory pages // . there are m_numAvailMemOffs of them // . m_availMemOff[0] would map to the memory offset of the next // available memory page. kinda like m_memOffFromDiskPage[] but that one is // for used pages if ( m_availMemOff ) { int32_t size = m_maxAvailMemOffs * sizeof(int32_t); mfree ( m_availMemOff , size , "DiskPageCache" ); } /* #ifdef GBUSESHM // free current one, if exists if ( s_shmid >= 0 && s_mem ) { if ( shmdt ( s_mem ) == -1 ) log("disk: shmdt: reset: %s",mstrerror(errno)); s_mem = NULL; s_shmid = -1; } // mark shared mem for destruction for ( int32_t i = 0 ; m_useSHM && i < m_numShmids ; i++ ) { int shmid = m_shmids[i]; if ( shmctl ( shmid , IPC_RMID , NULL) == -1 ) log("db: shmctlt shmid=%"INT32": %s", (int32_t)shmid,mstrerror(errno)); else log("db: shmctl freed shmid=%"INT32"",(int32_t)shmid); } #endif */ m_numPageSets = 0; m_nextMemOff = 0; m_upperMemOff = 0; m_maxMem = 0; m_memAlloced = 0; m_availMemOff = NULL; m_numAvailMemOffs = 0; m_maxAvailMemOffs = 0; m_headOff = -1; m_tailOff = -1; m_enabled = true; m_nexti = 0; //m_ramfd = -1; //m_useRAMDisk = false; //m_useSHM = false; } bool DiskPageCache::init ( const char *dbname , char rdbId, int32_t maxMem , int32_t pageSize, bool useRAMDisk, bool minimizeDiskSeeks ) { // int32_t maxMem , // void (*getPages2)(DiskPageCache*, int32_t, char*, // int32_t, int64_t, int32_t*, // int64_t*), // void (*addPages2)(DiskPageCache*, int32_t, char*, // int32_t, int64_t), // int32_t (*getVfd2)(DiskPageCache*, int64_t), // void (*rmVfd2)(DiskPageCache*, int32_t) ) { reset(); // seems like we lose data when it prints "Caught add breach" // so let's stop using until we fix that... happens while we are // dumping i think and somehow the data seems to get lost that // we were dumping. //maxMem = 0; m_rdbId = rdbId; bool *tog = NULL; if (m_rdbId==RDB_INDEXDB ) tog=&g_conf.m_useDiskPageCacheIndexdb; if (m_rdbId==RDB_POSDB ) tog=&g_conf.m_useDiskPageCachePosdb; if (m_rdbId==RDB_DATEDB ) tog=&g_conf.m_useDiskPageCacheDatedb; if (m_rdbId==RDB_TITLEDB ) tog=&g_conf.m_useDiskPageCacheTitledb; if (m_rdbId==RDB_SPIDERDB ) tog=&g_conf.m_useDiskPageCacheSpiderdb; if (m_rdbId==RDB_TFNDB ) tog=&g_conf.m_useDiskPageCacheTfndb; if (m_rdbId==RDB_TAGDB ) tog=&g_conf.m_useDiskPageCacheTagdb; if (m_rdbId==RDB_CLUSTERDB ) tog=&g_conf.m_useDiskPageCacheClusterdb; if (m_rdbId==RDB_CATDB ) tog=&g_conf.m_useDiskPageCacheCatdb; if (m_rdbId==RDB_LINKDB ) tog=&g_conf.m_useDiskPageCacheLinkdb; m_switch = tog; /* bool useSHM = false; // a quick hacky thing, force them to use shared mem instead of ram dsk if ( useRAMDisk ) { useRAMDisk = false; useSHM = true; } */ // not for tmp cluster //if ( g_hostdb.m_useTmpCluster ) useSHM = false; // it is off by default because it leaks easily (if u Ctrl+C the process) //if ( ! g_conf.m_useSHM ) useSHM = false; // right now shared mem only supports a single page size because // we use s_mem/s_shmid, and if we have a small page size which // we free, then shmat() may get ENOMEM when trying to get the larger // of the two page sizes //if(useSHM && pageSize != GB_INDEXDB_PAGE_SIZE) {char *xx=NULL;*xx=0;} // don't use it until we figure out how to stop the memory from being // counted as being the process's memory space. i think we can make // shmat() use the same mem address each time... // if ( useSHM ) { // log("disk: shared mem currently not supported. Turn off " // "in gb.conf "); // char *xx=NULL;*xx=0; // } // save it; //m_useSHM = useSHM; // clear it //m_numShmids = 0; // set this //m_maxAllocSize = 33554432; // the shared mem page size is a little more than the disk page size //m_spageSize = pageSize + HEADERSIZE; // . this is /proc/sys/kernel/shmmax DIVIDED BY 2 on titan and gk0 now // . which is the max to get per call to shmat() // . making this smaller did not seem to have much effect on speed //int32_t max = 33554432/2; // make sure it is "pageSize" aligned so we don't split pages //m_maxAllocSize = (max / m_spageSize) * m_spageSize; // max of ~16MB worth of pages //int32_t adjPageSize = pageSize + HEADERSIZE; //m_maxAllocSize = 2000000000; // 2GB (16000000 / adjPageSize) * adjPageSize; /* #ifdef GBUSESHM // set it up if ( m_useSHM ) { // we can only use like 30MB shared mem pieces int32_t need = maxMem; shmloop: // how much to alloc now? int32_t alloc = need; // this is /proc/sys/kernel/shmmax on titan and gk0 now if ( alloc > m_maxAllocSize ) alloc = m_maxAllocSize; // don't allow anything lower than this because we always // "swap out" one for another below. that is, we call shmdt() // to free it then shmat() to reclaim it. otherwise, shmat() // will run out of memory!! if ( alloc < m_maxAllocSize ) alloc = m_maxAllocSize; // get it // SHM_R|SHM_W|SHM_R>>3|SHM_R>>6|... int shmid = shmget(IPC_PRIVATE, alloc, SHM_R|SHM_W|IPC_CREAT); // on error, bail if ( shmid == -1 ) return log("db: shmget: %s",mstrerror(errno)); // don't swap it out (only 2.6 kernel i think) //if ( shmctl ( shmid , SHM_LOCK , NULL ) ) // return log("db: shmctl: %s",mstrerror(errno)); // log it log("db: allocated %"INT32" bytes shmid=%"INT32"",alloc,(int32_t)shmid); // add it to our list m_shmids [ m_numShmids ] = shmid; m_shmidSize [ m_numShmids ] = alloc; m_numShmids++; // count it g_mem.m_sharedUsed += alloc; // log it for now //logf(LOG_DEBUG,"db: new shmid id is %"INT32", size=%"INT32"", // (int32_t)shmid,(int32_t)alloc); // subtract it need -= alloc; // get more if ( need > 0 ) goto shmloop; } #endif */ // a malloc tag, must be LESS THAN 16 bytes including the NULL char *p = m_memTag; gbmemcpy ( p , "pgcache-" , 8 ); p += 8; if ( dbname ) strncpy ( p , dbname , 8 ); // so we know what db we are caching for m_dbname = p; p += 8; *p++ = '\0'; // sanity check, we store bytes used as a int16_t at top of page //if ( m_diskPageSize > 0x7fff ) { char *xx = NULL; *xx = 0; } // . do not use more than this much memory for caching // . it may go over by like 2% for header information m_maxMem = maxMem ; // set m_pageSetSize. use this now instead of m_maxPageSetSize #define int32_t phsize = pageSize + HEADERSIZE; m_maxPageSetSize = (((128*1024*1024)/phsize)*phsize); m_diskPageSize = pageSize; m_minimizeDiskSeeks = minimizeDiskSeeks; // we need to keep a count memory of files being cached if ( m_minimizeDiskSeeks ) m_memFree = m_maxMem; // check for overriding functions //if ( getPages2 && addPages2 && getVfd2 && rmVfd2 ) { // // set override flag // m_isOverriden = true; // // set override functions // m_getPages2 = getPages2; // m_addPages2 = addPages2; // m_getVfd2 = getVfd2; // m_rmVfd2 = rmVfd2; // // return here // return true; //} /* // for now only indexdb will use the ramdisk if ( strcmp ( dbname, "indexdb" ) == 0 && useRAMDisk ){ if ( !initRAMDisk( dbname, maxMem ) ) return log ( "db: failed to init RAM disk" ); } */ // . use up to 800k for starters // . it will grow more as needed if ( ! growCache ( maxMem ) ) return log("db: pagecache init failed: %s.", mstrerror(g_errno)); // success return true; } // use Linux's ram disk for caching disk pages, in addition to the ram it // already uses. I would like to be able to pass in a "maxMemForRamDisk" parm // to its init() function and have it open a single, ram-disk file descriptor // for writing up to that many bytes. // then i would like only Indexdb (and later on Datedb) to pass in an 800MB // "maxMemForRamDisk" value, and, furthermore, i do not want to cache disk // pages from the indexdb root file, nor, any indexdb file that is larger than // twice the "maxMemForRamDisk" value (in this case 1.6GB). this will be used // exclusively for smaller indexdb files to eliminate excessive disk seeks and // utilize ALL the 4GB of ram in each machine. // lastly, we need some way to "force" a merge at around midnight when traffic // is minimal, or when there are 3 or more indexdb files that are less than // 80% in the indexdb disk page cache. because that means we are starting to // do a lot of disk seeks. /* bool DiskPageCache::initRAMDisk( const char *dbname, int32_t maxMem ){ m_useRAMDisk = true; if ( !dbname ) {char *xx=NULL; *xx=0;} // open a file descriptor char ff [1024]; sprintf ( ff, "/mnt/RAMDisk/%sPageCache", dbname ); // unlink it first unlink (ff); m_ramfd = open ( ff, O_RDWR | O_CREAT ); if ( m_ramfd < 0 ) return log ( LOG_WARN,"db: could not open fd in RAMdisk" ); return true; } */ // . this returns true iff the entire read was copied into // "buf" from the page cache // . it will move the used pages to the head of the linked list // . if *buf is NULL we allocate here void DiskPageCache::getPages ( int32_t vfd , char **buf , int32_t numBytes , int64_t diskOffset , int32_t *newNumBytes , int64_t *newOffset , char **allocBuf , int32_t *allocSize , int32_t allocOff ) { // check for override function //if ( m_isOverriden ) { // //log ( LOG_INFO, "cache: Get Pages [%"INT32"] [%"INT32"][%"INT64"]", // // vfd, numBytes, offset ); // m_getPages2 ( this, // vfd, // buf, // numBytes, // offset, // newNumBytes, // newOffset ); // return; //} // return new disk offset, assume unchanged *newOffset = diskOffset; *newNumBytes = numBytes; // return if no pages allowed in page cache if ( m_maxMem == 0 ) return; // or disabled if ( ! m_enabled ) return; // disabled at the master controls? if ( m_switch && ! *m_switch ) return; // or if minimizeDiskSeeks did not accept the vfd if ( m_minimizeDiskSeeks && vfd < 0 ) return; // or if no pages in this vfd if ( ! m_memOffFromDiskPage[vfd] ) return; // debug point //if ( offset == 16386 && numBytes == 16386 ) // log("hey"); // what is the page range of in-memory pages? int32_t sp = diskOffset / m_diskPageSize ; int32_t ep = (diskOffset + (numBytes-1)) / m_diskPageSize ; // . sanity check // . we establish the maxPagesInFile when BigFile::open is called // by RdbDump. Rdb.cpp calls m_dump.set with a maxFileSize based on // the mem occupied by the RdbTree. BUT,recs can be added to the tree // WHILE we are dumping, so we end up with a bigger file, and this // disk page cache is not prepared for it! if ( ep >= m_maxPagesInFile[vfd] ) { // happens because rdbdump did not get a high enough // maxfilesize so we did not make enough pages! we endedup // dumping more than what was end the tree because stuff was // added to the tree while dumping! log("db: pagecache: Caught get breach. " "ep=%"INT32" max=%"INT32" vfd=%"INT32"" , ep,m_maxPagesInFile[vfd] ,vfd); return; //char *xx = NULL; *xx = 0; } char *bufPtr = *buf; char *bufEnd = *buf + numBytes; // our offset into first page on disk ( as well as memory page) int32_t start1 = diskOffset - sp * m_diskPageSize; // this is for second while loop int32_t start2 = 0; if ( ep == sp ) start2 = start1; // store start pages while ( sp <= ep ) { // map disk page # sp into memory offset, "poff" int32_t poff = m_memOffFromDiskPage[vfd][sp]; // get a ptr to it //char *s = getMemPtrFromMemOff ( poff ); //if ( ! s ) break; // break if we do not have page in memory if ( poff < 0 ) break; // first 4 bytes of page is how many bytes are used in page int32_t size = 0; readFromCache( &size, poff, OFF_SIZE, sizeof(int32_t)); //int32_t size = *(int32_t *)(s+OFF_SIZE); // second set of 4 bytes is offset of data from page boundary int32_t skip = 0; readFromCache( &skip, poff, OFF_SKIP, sizeof(int32_t)); //int32_t skip = *(int32_t *)(s+OFF_SKIP); // debug msg // log("getPage: pageNum=%"INT32" poff=%"INT32" size=%"INT32" " // "skip=%"INT32"", // sp,poff,(int32_t)size,(int32_t)skip); // if this mem page data starts AFTER our offset, it is no good if ( skip > start1 ) break; // adjust size by our page offset, we won't necessarily be // starting our read at "skip" size -= (start1 - skip); // if size is 0 or less all cached data was // below our disk offset and is useless if ( size <= 0 ) break; // . promote this memory page in the linked list // . 16 byte header of each memory page houses the // linked lists' next and prev ptrs to pages in memory // just for putting the most frequently used pages on top promotePage ( poff , false ); // allocate the read buffer if we need to if ( ! *buf ) { // allocate enough room for allocOff, too int32_t need = numBytes + allocOff; char *p = (char *) mmalloc ( need,"PageCacheReadBuf" ); // let FileState know what needs to be freed *allocBuf = p; *allocSize = need; // if couldn't allocate, return now, what's the point if ( ! p ) return; // let caller know his new read buffer *buf = p + allocOff; // assign the ptrs now bufPtr = *buf ; bufEnd = *buf + numBytes; } // don't store more than asked for if ( bufPtr + size > bufEnd ) size = bufEnd - bufPtr; // . read in "size" bytes from memory into "bufPtr" // . start reading at an offset of "HEADERSIZE+start1" into // the memory page readFromCache(bufPtr, poff, HEADERSIZE + start1 , size); //gbmemcpy ( bufPtr , s + HEADERSIZE + start1 , size ); bufPtr += size; *newOffset += size; *newNumBytes -= size; // return if we got it all if ( bufPtr >= bufEnd ) { m_hits += 1; return; } // otherwise, advance to next page sp++; // and our page relative offset is zero now, iff ep > sp if ( sp <= ep ) start1 = 0; // if the memory page ended before the disk page, break out // because we don't want any holes readFromCache( &size, poff, OFF_SIZE, sizeof(int32_t)); if ( skip + size < m_diskPageSize ) break; //if ( skip + *(int32_t *)(s+OFF_SIZE) < m_diskPageSize )break; } // now store from tail down /* while ( ep > sp ) { // the page offset in memory int32_t poff = m_memOffFromDiskPage[vfd][ep]; // get a ptr to it char *s = getMemPtrFromMemOff ( poff ); // break if we do not have page in memory if ( ! s ) break; // first 2 bytes of page is how many bytes are used int32_t size = *(int32_t *)s; // second set of 2 bytes is offset from boundary int32_t skip = *(int32_t *)(s+OFF_SKIP); // adjust size by our page offset, if not zero if ( start2 > skip ) size -= (start2 - skip); // his skip point could be beyond us, too if ( skip > // . promote this page in the linked list // . bytes 8-16 of each page in memory houses the // next and prev ptrs to pages in memory promotePage ( s , poff , false ); // don't store more than asked for if ( bufEnd - size < bufPtr ) size = bufEnd - bufPtr; gbmemcpy ( bufEnd - size , s + HEADERSIZE + start2 , size ); bufEnd -= size; *newNumBytes -= size; // return if we got it all if ( bufEnd <= bufPtr ) { m_hits += 1; return; } // if this page had a skip, break out, we don't wany any holes if ( skip > 0 ) break; // otherwise, advance to next page ep--; } */ m_misses += 1; } // after you read/write from/to disk, copy into the page cache void DiskPageCache::addPages ( int32_t vfd, char *buf, int32_t numBytes, int64_t diskOffset , int32_t niceness ){ // check for override function //if ( m_isOverriden ) { // m_addPages2 ( this, // vfd, // buf, // numBytes, // offset ); // return; //} // if vfd is -1, then we were not able to add a map for this file if ( vfd < 0 ) return; // no NULL ptrs if ( ! buf ) return; // return if no pages allowed in page cache if ( m_maxMem == 0 ) return; // or disabled if ( ! m_enabled ) return; // disabled at the master controls? if ( m_switch && ! *m_switch ) return; // sometimes the file got unlinked on us if ( ! m_memOffFromDiskPage[vfd] ) return; // for some reason profiler cores all the time in here //if ( g_profiler.m_realTimeProfilerRunning ) return; // . "diskPageNum" is the first DISK page # // . "offset" is the offset on disk the data was read from // . "m_diskPageSize" is the size of the disk pages int64_t diskPageNum = diskOffset / m_diskPageSize ; // point to the data that was read from disk char *bufPtr = buf; char *bufEnd = buf + numBytes; // . how much did we exceed the mem page boundary by? // . "skip" is offset into the memory page where we store the disk data int32_t skip = diskOffset - diskPageNum * m_diskPageSize ; // how many bytes of disk data should we store into the memory page? int32_t size = m_diskPageSize - skip; // now add the remaining data into memory pages while ( bufPtr < bufEnd ) { // breathe QUICKPOLL(niceness); // ensure "size" is not too big. // adjust "size" if so,so we won't exceed the mem page boundary if ( bufPtr + size > bufEnd ) size = bufEnd - bufPtr; // add the page to memory. // "bufPtr" is the data we read from disk. // "size" is where to start writing relative to this memory // page's start. // "skip" is how many bytes to write into this "page". addPage ( vfd , diskPageNum , bufPtr , size , skip ); // advance disk data buf over what we stored into the mem page bufPtr += size; // advance DISK page # diskPageNum++; // assume we will be filling up the next mem page fully size = m_diskPageSize; // skip is offset from beginning of the memory page skip = 0; } } // . convert our MEMORY offset into an actual ptr to a chunk of memory // . this makes our memory pooling approach transparent // . "off" is offset into the memory // . "off" includes HEADERSIZE headers in it char *DiskPageCache::getMemPtrFromMemOff ( int32_t off ) { if ( off < 0 ) return NULL; // NULL means not in DiskPageCache // for some reason profiler cores all the time in here // and m_numPageSets is 0 like we got reset //if ( g_profiler.m_realTimeProfilerRunning ) return NULL; // get set number int32_t sn = off / m_maxPageSetSize ; // get offset from within the chunk of memory (within the set) //int32_t poff = off & (m_maxPageSetSize-1); int32_t poff = off % (m_maxPageSetSize); // . sanity check // . offset must be multiple of m_diskPageSize+HEADERSIZE, no cuz we skip // ahead X bytes of a page set boundary... //int32_t off2 = off - sn * m_maxPageSetSize; //if ( off2 != 0 && (off2% (m_diskPageSize+HEADERSIZE)) != 0) { // char *xx = NULL; *xx = 0; } // if we are not in the first page set, advance by one chunk // because the first page is often mapped to by a truncated poff from // the previous page set //if ( sn > 0 && poff == 0 ) poff += m_diskPageSize + HEADER_SIZE; // if it would breech our PAGE_SET, up it if ( poff + m_diskPageSize + HEADERSIZE > m_maxPageSetSize) {poff=0; sn++;} // sanity check if ( sn >= m_numPageSets ) { char *xx = NULL; *xx = 0; } // return the proper ptr return (m_pageSet[sn]) + poff; } // . "diskPageNum" is the disk page # of the file with "vfd" // . "page" points to the disk data we read from disk // . "size" is how many bytes to write into the memory page, #pageNum // . "skip" is the offset into the memory page we will write the disk data into void DiskPageCache::addPage(int32_t vfd, int32_t diskPageNum, char *pageData, int32_t size, int32_t skip){ // . if pageNum is beyond the file size // . see the explanation for this same error msg above if ( diskPageNum >= m_maxPagesInFile[vfd] ) { // this has happened during a merge before!! (at startup) //log(LOG_LOGIC,"db: pagecache: addPage: Bad engineer. " // happens because rdbdump did not get a high enough // maxfilesize so we did not make enough pages! we endedup // dumping more than what was end the tree because stuff was // added to the tree while dumping! log("db: pagecache: Caught add breach. " "pageNum=%"INT32" max=%"INT32" db=%s", diskPageNum,m_maxPagesInFile[vfd],m_dbname); return; } // debug msg // log("addPage: vfd=%"INT32" diskPageNum=%"INT32" pageData[0]=%hhx " // "size=%"INT32" skip=%"INT32"", // vfd,diskPageNum,pageData[0],size,(int32_t)skip); // "poff" is the DISK page # for "vfd" (virtual file descriptor) and // it returns an offset to the page in memory. int32_t poff = m_memOffFromDiskPage [ vfd ] [ diskPageNum ] ; int32_t oldDiskPage; // p will be NULL if page does not have any data in memory yet //char *p = getMemPtrFromMemOff ( poff ); // if page already exists in cache and needs data on the boundaries // we may be able to supply it if ( poff >= 0 ) { // debug msg //log("ENHANCING off=%"INT32"",poff); enhancePage ( poff , pageData , size , skip ); return; } // don't add any more if we're minimizing disk seeks and are full if ( m_minimizeDiskSeeks && m_numPagesPresentOfFile[vfd] >= m_maxPagesPerFile[vfd] ) return; // top: // try to get an available memory spot from list if ( m_numAvailMemOffs > 0 ) { poff = m_availMemOff [ --m_numAvailMemOffs ] ; // debug msg //log("RECYCLING off=%"INT32" numAvailMemOffs-1=%"INT32"" // ,poff,m_numAvailMemOffs); } // can we grab a page from memory without having to grow? else if ( m_nextMemOff + m_diskPageSize + HEADERSIZE < m_upperMemOff) { poff = m_nextMemOff; m_nextMemOff += m_diskPageSize + HEADERSIZE; // debug msg // log("CLAIMING off=%"INT32" (nextmemoff=%"INT32"",poff, // m_nextMemOff); } // . we now grow everything at start // . otherwise, try to grow the page cache by 200k //else if ( m_nextMemOff + m_diskPageSize + HEADERSIZE < m_maxMem ) { // // grow by 100k worth of pages each time // if ( ! growCache ( m_upperMemOff + 200*1024 ) ) return; // goto top; //} // this should never happen. Since in minimizeDiskSeek we have // an exact number of pages per file else if ( m_minimizeDiskSeeks ) { char *xx = NULL; *xx = 0; } // if no freebies left, take over the tail page in memory else { // STEAL IT!! poff = m_tailOff; // remove it from linked list. it will be re-added below @ head //// // CAUTION: THIS CHANGES m_tailOff!!!!!! /// excisePage ( m_tailOff ); // . the file no longer owns him // . this is a int32_t ptr to &m_bufOffs[vfd][pageNum] // . if that vfd no longer exists it should have added all its // pages to m_avail list //int32_t tmp = -1; // WHY DOING THIS? //int32_t memOff = -1;//NULL; //readFromCache(&memOff, poff, OFF_PTR, sizeof(int32_t)); // the tail may actualy belong to a separated file with // a different vfd int oldVfd; readFromCache (&oldVfd,poff,OFF_VFD,sizeof(int32_t)); readFromCache (&oldDiskPage,poff,OFF_DISKPAGENUM, sizeof(int32_t)); // did excise work? // this cored here from m_memOffFroMDiskPage[oldVfd] being // NULL, so how could that happen? if ( m_memOffFromDiskPage[oldVfd] && m_memOffFromDiskPage[oldVfd][oldDiskPage] != -1 ) { char *xx=NULL;*xx=0; } // did ex // seg faultint here: mdw: //*memOffPtr = -1; // how can this be, we subverted a valid buffer //if ( memOff == -1 ) { char *xx=NULL;*xx=0; } //poff = memOff; //m_cacheBuf.writeToCache(poff, OFF_PTR, &tmp, sizeof(int32_t)); // testing //m_cacheBuf.readFromCache ( &tmp, poff+OFF_PTR, sizeof(int32_t) ); //if ( tmp != -1 ){ //char *xx=NULL; *xx=0;} //**(int32_t **)(p+OFF_PTR) = -1; // debug msg //log("KICKINGTAIL off=%"INT32"",poff); } // sanity check if ( poff < 0 ) { char *xx = NULL; *xx = 0; } // get ptr to the page in memory from the memory offset //p = getMemPtrFromMemOff ( poff ); // store how many bytes we wrote into the memory page residing @ poff writeToCache(poff, OFF_SIZE, &size, sizeof(int32_t)); // int32_t tmp = 0; // m_cacheBuf.readFromCache ( &tmp, poff, OFF_SIZE, sizeof(int32_t) ); // if ( tmp != size ){ // char *xx=NULL; *xx=0;} //*(int32_t *)(p+OFF_SIZE) = size; // store "skip" which is the offset into the memory page we start // storing the disk data into writeToCache( poff, OFF_SKIP, &skip, sizeof(int32_t) ); //*(int32_t *)(p+OFF_SKIP) = skip; // sanity check if ( size + skip > m_diskPageSize ) { char *xx = NULL; *xx = 0; } // then store a ptr to m_memOffFromDiskPage[vfd][pageNum] so we can set // *ptr to -1 if they page gets replaced by another // store the offset of this memory page //int32_t *memOffPtr = &m_memOffFromDiskPage[ vfd ][ pageNum ]; // m_memOffFromDiskPage maps a vfd/pagenum to a memory page offset. // -1 means none. // why do we need to store the memory offset in the memory page??? //int32_t memOff = m_memOffFromDiskPage[ vfd ][ pageNum ]; //writeToCache( poff, OFF_PTR, &memOff, sizeof(int32_t)); //*(int32_t **)(p+OFF_PTR) = &m_memOffFromDiskPage [ vfd ] [ pageNum ]; // then the data from disk (skip over linked list info) // "skip" is how far into the memory page we should write the // disk data because it is not aligned perfectly with the mem page. writeToCache( poff, HEADERSIZE + skip, pageData, size); //gbmemcpy ( p + HEADERSIZE + skip , page , size ); // transform mem ptr to memory offset //if ( !m_useRAMDisk && ! m_useSHM ) { /* int32_t off = -1; char *p = getMemPtrFromMemOff ( poff ); for ( int32_t i = 0 ; i < m_numPageSets ; i++ ) { if ( p < m_pageSet[i] ) continue; if ( p > m_pageSet[i] + m_pageSetSize[i] ) continue; off = p - m_pageSet[i] + i * m_maxPageSetSize ; break; } */ // gotta record this now too! writeToCache( poff, OFF_DISKPAGENUM, &diskPageNum, sizeof(int32_t) ); writeToCache( poff, OFF_VFD, &vfd, sizeof(int32_t) ); // store the linked list information in the remaining header bytes // that we use for promoting heaviliy hit pages to the top of // thereby replacing the tail when adding new pages. this will // insert our page into the linked list. it will set the prev/next // mem page offsets in the header of this memory page. promotePage ( poff , true/*isNew?*/ ); // update map. map disk page # to mem offset. m_memOffFromDiskPage [ vfd ] [ diskPageNum ] = poff; // sanity check //if ( off != poff ) { char *xx=NULL; *xx=0; } //} //else // m_memOffFromDiskPage [ vfd ] [ pageNum ] = poff; // update the header of that page // we have added the page! if ( m_minimizeDiskSeeks ) m_numPagesPresentOfFile[vfd]++; } // . add data from "page" (we just read it from disk or wrote to disk) // . "poff" is the memory page # that will receive the disk data // . "page" points to the disk data we read from disk to be stored into mem pg // . "size" is how many bytes to write into the memory page, #pageNum // . "skip" is the offset into the memory page we will write the disk data into void DiskPageCache::enhancePage (int32_t poff, char *page, int32_t size, int32_t skip) { int32_t psize = 0; readFromCache( &psize, poff, OFF_SIZE, sizeof(int32_t)); //int32_t psize = *(int32_t *)(p+OFF_SIZE); int32_t pskip = 0; readFromCache( &pskip, poff, OFF_SKIP, sizeof(int32_t)); //int32_t pskip = *(int32_t *)(p+OFF_SKIP); // can we add to front of page? if ( skip < pskip ) { int32_t diff = pskip - skip; // . we cored here because page[diff-1] was out of bounds. why? // . do not allow gap in between cached data, that is, we have // cached bytes at the end of the page, then we try to cache // some at the beginning, and it's not contiguous... we are // not built for that... this can happen when dumping a file, // if your first reads up to the file end (somewhere in the // middle of the page) and your second read starts somewhere // else.... mmmm... i dunno.... if ( skip + size < pskip || diff > size ) { log("db: Avoided cache gap in %s. diff=%"INT32" " "size=%"INT32" pskip=%"INT32" skip=%"INT32".", m_dbname,diff,size,(int32_t)pskip,(int32_t)skip); return; } writeToCache(poff, HEADERSIZE + skip , page , diff); //gbmemcpy ( p + HEADERSIZE + skip , page , diff ); psize += diff; pskip -= diff; writeToCache(poff, OFF_SIZE, &psize, sizeof(int32_t)); //*(int32_t *)(p+OFF_SIZE) = psize ; writeToCache(poff, OFF_SKIP, &pskip, sizeof(int32_t)); //*(int32_t *)(p+OFF_SKIP) = pskip ; } // can we add to end of page? int32_t pend = pskip + psize; int32_t end = skip + size; if ( end <= pend ) return; int32_t diff = end - pend ; // if the read's starting point is beyond our ending point, bail, // we don't want any holes... if ( diff > size ) return; writeToCache(poff, HEADERSIZE + pend, page + size - diff, diff); //gbmemcpy ( p + HEADERSIZE + pend , page + size - diff , diff ); int32_t tmp = psize+diff; writeToCache(poff, OFF_SIZE, &tmp, sizeof(int32_t)); //*(int32_t *)(p+OFF_SIZE) = (int32_t)psize + diff; } // the link information is bytes 8-16 of each page in mem (next/prev mem ptrs) void DiskPageCache::promotePage ( int32_t poff , bool isNew ) { if ( isNew ) { here: // store a -1 to indicate previous page offset. // we are the head of the linked list now, so -1 means none. int32_t tmp = -1; writeToCache(poff, OFF_PREV, &tmp, sizeof(int32_t)); // testing readFromCache ( &tmp, poff, OFF_PREV, sizeof(int32_t) ); if ( tmp != -1 ){ char *xx=NULL; *xx=0;} //*(int32_t *)(p + OFF_PREV) = -1 ;// our prev is -1 (none) // store the next page in the linked list who WAS the head // it could be -1 if we are the first entry intothe linked list writeToCache(poff, OFF_NEXT, &m_headOff, sizeof(int32_t)); //*(int32_t *)(p+OFF_NEXT)=m_headOff;//our next is the old head // the old head's prev is us if ( m_headOff >= 0 ) { writeToCache(m_headOff,OFF_PREV,&poff,sizeof(int32_t)); //char *headPtr = getMemPtrFromMemOff ( m_headOff ) ; //*(int32_t *)(headPtr + OFF_PREV) = poff; } // and we're the new head m_headOff = poff; // if no tail, we become that, too, we must be the first if ( m_tailOff < 0 ) m_tailOff = poff; return; } // otherwise, we have to excise excisePage ( poff ); // and add as new goto here; } // remove a page from the linked list void DiskPageCache::excisePage ( int32_t poff ) { // get our neighbors, NULL if none int32_t prev = 0; readFromCache(&prev, poff, OFF_PREV, sizeof(int32_t)); //int32_t prev = *(int32_t *)(p + OFF_PREV); int32_t next = 0; readFromCache(&next, poff, OFF_NEXT, sizeof(int32_t)); //int32_t next = *(int32_t *)(p + OFF_NEXT); // if we were the head or tail, then pass it off to our neighbor if ( poff == m_headOff ) m_headOff = next; if ( poff == m_tailOff ) m_tailOff = prev; // our prev's next becomes our old next if ( prev >= 0 ) { //char *prevPtr = getMemPtrFromMemOff ( prev ); writeToCache(prev, OFF_NEXT, &next, sizeof(int32_t)); //*(int32_t *)(prevPtr + OFF_NEXT ) = next; } // our next's prev becomes our old prev if ( next >= 0 ) { //char *nextPtr = getMemPtrFromMemOff ( next ); writeToCache(next, OFF_PREV, &prev, sizeof(int32_t)); //int32_t *)(nextPtr + OFF_PREV ) = prev; } // what is the tail's disk page # so we can update // m_memOffFromDiskPage[vfd][tailDiskPageNum] ? int32_t diskPageNum; readFromCache ( &diskPageNum,poff,OFF_DISKPAGENUM,sizeof(int32_t) ); int vfd; readFromCache ( &vfd,poff,OFF_VFD,sizeof(int32_t) ); // the memory page we are commandeering should no longer be // mapped to from its disk page if ( m_memOffFromDiskPage [ vfd ] ) m_memOffFromDiskPage [ vfd ] [ diskPageNum ] = -1; } // . grow/shrink m_memOffFromDiskPage[] which maps vfd/page to a mem offset // . returns false and sets g_errno on error // . called by DiskPageCache::open()/close() respectively // . fileSize is so we can alloc m_memOffFromDiskPage[vfd] big enough // for all pgs int32_t DiskPageCache::getVfd ( int64_t maxFileSize, bool vfdAllowed ) { // check for override function //if ( m_isOverriden ) { // return m_getVfd2 ( this, maxFileSize ); //} // for RAMDisks, do not cache disk // pages from the indexdb root file, nor, any indexdb file that is // larger than twice the "maxMemForRamDisk" value /* if ( m_useRAMDisk && maxFileSize > (m_maxMem * 2) ){ log (LOG_INFO,"db: getvfd: cannot cache on RAMDisk files that " "larger than twice the max mem value. fileSize=%"INT32"", m_maxMem); return -1; } */ int32_t numPages = (maxFileSize / m_diskPageSize) + 1; // RESTRICT to only the first m_maxMemOff worth of files, // starting with the SMALLEST file first. so if maxMemoff is 50MB, and // we have 5 files that are 10,20,30 & 40MB, // then we use 10MB for the first file, 20MB of the 2nd BUT only // 20MB for the 3rd file, and the 4th file does not get any page cache. // if doing "biased lookups" each file is virtually half the actual // size, and this allocates page cache appropriately. // don't to do a page cache for an indexdb0001.dat that is 100GB // because we'd have to allocate too much mem for the // m_memOffFromDiskPage[] array // so for the parital file make sure its less than 1 GB if ( m_minimizeDiskSeeks && !vfdAllowed ){ log (LOG_INFO,"db: getVfd: cannot cache because minimizing " "disk seeks. numPages=%"INT32"", numPages); return -1; } // . pick a vfd for this BigFile to use // . start AFTER last pick in case BigFile closed, released its // m_vfd, a read thread returned and called addPages() using that // old m_vfd!!!!!!! TODO: can we fix this better? int32_t i ; int32_t count = MAX_NUM_VFDS2; for ( i = m_nexti ; count-- > 0 ; i++ ) { if ( i >= MAX_NUM_VFDS2 ) i = 0; // wrap if ( ! m_memOffFromDiskPage [ i ] ) break; } // bail if none left if ( count == 0 ) { g_errno = EBADENGINEER; log(LOG_LOGIC,"db: pagecache: getvfd: no vfds remaining."); //char *xx = NULL; *xx = 0; return -1; } // . file size has to be below 2 gigs because m_memOffFromDiskPage is // only a int32_t // . if we need to we could transform m_memOffFromDiskPage into // m_memPageNum //if ( maxFileSize > 0x7fffffffLL ) { // g_errno = EBADENGINEER; // log("DiskPageCache::getVfd: maxFileSize too big"); // return -1; //} // assign it int32_t vfd = i; // start here next time m_nexti = i + 1; // say which cache it is // alloc the map space for this file int32_t need = numPages * sizeof(int32_t) ; int32_t *buf = (int32_t *)mmalloc ( need , m_memTag ); if ( ! buf ) { log("db: Failed to allocate %"INT32" bytes for page cache " "structures for caching pages for vfd %"INT32". " "MaxfileSize=%"INT64". Not enough memory.", need,i,maxFileSize); return -1; } m_memOffFromDiskPage [ vfd ] = buf; m_maxPagesInFile [ vfd ] = numPages; // keep a tab on the number of pages we can store of the file if ( m_minimizeDiskSeeks ){ m_numPagesPresentOfFile[vfd] = 0; if ( m_memFree > numPages * ( HEADERSIZE + m_diskPageSize ) ) m_maxPagesPerFile[vfd] = numPages; else m_maxPagesPerFile[vfd] = m_memFree / ( m_diskPageSize + HEADERSIZE ); } // add it in m_memAlloced += need; // debug msg //log("%s adding %"INT32"",m_dbname,need); // no pages are in memory yet, so set offsets to -1 for ( i = 0 ; i < numPages ; i++ ) m_memOffFromDiskPage [ vfd ] [ i ] = -1; // if minimizing disk seeks then calculate the memory used if ( m_minimizeDiskSeeks ){ m_memFree -= maxFileSize; // if the file is bigger than the mem only partially store it if ( m_memFree < 0 ) m_memFree = 0; } // debug msg //log("ALLOCINGFILE pages=%"INT32"",numPages); return vfd; } // when a file loses its vfd this is called void DiskPageCache::rmVfd ( int32_t vfd ) { // check for override function //if ( m_isOverriden ) { // m_rmVfd2 ( this, vfd ); // return; //} // ensure validity if ( vfd < 0 ) return; // if 0 bytes are allocated for disk cache, just skip this junk if ( m_maxMem <= 0 ) return; // this vfd may have already been nuked by call to unlink! if ( ! m_memOffFromDiskPage [ vfd ] ) return; // add valid offsets used by vfd into m_availMemOff for ( int32_t i = 0 ; i < m_maxPagesInFile [ vfd ] ; i++ ) { int32_t off = m_memOffFromDiskPage [ vfd ] [ i ]; // a -1 offset means empty if ( off < 0 ) continue; // sanity check if ( m_numAvailMemOffs >= m_maxAvailMemOffs ) { char *xx = NULL; *xx = 0; } // debug msg //log("MAKING off=%"INT32" available. na=%"INT32"", // off,m_numAvailMemOffs+1); // store it in list of available memory offsets so some other // file can use it m_availMemOff [ m_numAvailMemOffs++ ] = off; //log("disk: m_numAvailMemOffs+1 -> %"INT32,m_numAvailMemOffs); // set this to -1 i guess. it'll be freed below anyway. m_memOffFromDiskPage [ vfd ] [i] = -1; // remove that page from linked list, too //char *p = getMemPtrFromMemOff ( off ); excisePage ( off ); } // free the map that maps this files pages on disk to pages/offs in mem int32_t size = m_maxPagesInFile[vfd] * sizeof(int32_t); mfree ( m_memOffFromDiskPage [ vfd ] , size , "DiskPageCache" ); m_memOffFromDiskPage [ vfd ] = NULL; // debug msg //log("%s rmVfd: vfd=%"INT32" down %"INT32"",m_dbname,vfd,size); m_memAlloced -= size; if ( m_minimizeDiskSeeks ){ m_memFree += m_maxPagesPerFile[vfd] * m_diskPageSize; m_maxPagesPerFile[vfd] = 0; m_numPagesPresentOfFile[vfd] = 0; } } // use "mem" bytes of memory for the cache bool DiskPageCache::growCache ( int32_t mem ) { // debug msg //log("GROWING PAGE CACHE from %"INT32" to %"INT32" bytes (%"XINT64")" // ,m_upperMemOff, mem ,(uint64_t)this); // don't exceed the max if ( mem > m_maxMem ) mem = m_maxMem; // bail if we wouldn't be growing if ( mem <= m_upperMemOff ) return true; // how many pages? round up. int32_t npages = mem/(m_diskPageSize+HEADERSIZE) + 1; // . we need one "available" slot for each page in the cache // . this is a list of memory offsets that are available int32_t oldSize = m_maxAvailMemOffs * sizeof(int32_t) ; int32_t newSize = npages * sizeof(int32_t) ; int32_t *a=(int32_t *)mrealloc(m_availMemOff,oldSize,newSize,m_memTag); if ( ! a ) return log("db: Failed to regrow page cache from %"INT32" to " "%"INT32" bytes. Not enough memory.",oldSize,newSize); m_availMemOff = a; m_maxAvailMemOffs = npages; m_memAlloced += (newSize - oldSize); // debug msg //log("%s growCache: up %"INT32"",m_dbname,(newSize - oldSize)); // how much more mem do we need to alloc? int32_t need = mem - m_upperMemOff ; // how big is our last page set? int32_t size = 0; char *ptr = NULL; int32_t i = 0; if ( m_numPageSets > 0 ) { // since we allocate everything at init this shouldn't happen char *xx=NULL; *xx=0; i = m_numPageSets - 1; ptr = m_pageSet [ i ]; size = m_pageSetSize [ i ]; } // realloc him int32_t extra = m_maxPageSetSize - size ; if ( extra > need ) extra = need; /* if ( m_useRAMDisk ){ // since RAMdisk it creates a file, no reason to alloc m_memAlloced = need; m_upperMemOff = need; return true; } // and shared mem already has the mem at this point if ( m_useSHM ) { m_memAlloced = need; m_upperMemOff = need; return true; } */ char *s = (char *)mrealloc ( ptr , size , size + extra, m_memTag); if ( ! s ) return log("db: Failed to allocate %"INT32" bytes more " "for pagecache.",extra); m_pageSet [ i ] = s; m_pageSetSize [ i ] = size + extra; // if we are not adding to an existing, we are a new page set if ( ! ptr ) m_numPageSets++; // discount it need -= extra; // add to alloc count m_memAlloced += extra; m_upperMemOff += extra; // debug msg //log("%s growCache2: up %"INT32"",m_dbname,extra); // if we do not need more, we are done if ( need == 0 ) return true; // otherwise, alloc new page sets until we hit it for ( i++ ; i < MAX_PAGE_SETS && need > 0 ; i++ ) { int32_t size = need; if ( size > m_maxPageSetSize ) size = m_maxPageSetSize; need -= size; m_pageSet[i] = (char *) mmalloc ( size , m_memTag ); if ( ! m_pageSet[i] ) break; m_pageSetSize[i] = size; m_memAlloced += size; m_upperMemOff += size; m_numPageSets++; // debug msg //log("%s growCache3: up %"INT32"",m_dbname,size); } // update upper bound if ( need == 0 ) return true; return log(LOG_LOGIC,"db: pagecache: Bad engineer. Weird problem."); } int32_t DiskPageCache::getMemUsed ( ) { return m_nextMemOff - m_numAvailMemOffs * (m_diskPageSize+HEADERSIZE); } bool DiskPageCache::verifyData2 ( int32_t vfd ) { // ensure validity //if ( vfd < 0 ) return true; for ( int vfd = 0 ; vfd < 10 ; vfd++ ) { // this vfd may have already been nuked by call to unlink! if ( ! m_memOffFromDiskPage [ vfd ] ) continue;//return true; // debug msg //log("VERIFYING PAGECACHE vfd=%"INT32" fn=%s",vfd,f->getFilename()); // read into here // add valid offsets used by vfd into m_availMemOff for ( int32_t i = 0 ; i < m_maxPagesInFile [ vfd ] ; i++ ) { int32_t off = m_memOffFromDiskPage [ vfd ] [ i ]; // if page not in use, skip it if ( off < 0 ) continue; // check this now too int32_t storedvfd; readFromCache ( &storedvfd, off , OFF_VFD, sizeof(int32_t) ); if ( storedvfd != vfd ) { char *xx=NULL;*xx=0; } // ensure we are in sync with the map of diskpage to mem int32_t storedDiskPageNum; readFromCache ( &storedDiskPageNum , off , OFF_DISKPAGENUM, sizeof(int32_t) ); if ( storedDiskPageNum != i ) { char *xx=NULL;*xx=0; } } } return true; } #include "BigFile.h" #include "Threads.h" bool DiskPageCache::verifyData ( BigFile *f ) { int32_t vfd = f->getVfd(); // ensure validity if ( vfd < 0 ) return true; // this vfd may have already been nuked by call to unlink! if ( ! m_memOffFromDiskPage [ vfd ] ) return true; // debug msg //log("VERIFYING PAGECACHE vfd=%"INT32" fn=%s",vfd,f->getFilename()); // read into here char buf [ 32 * 1024 ];//GB_PAGE_SIZE ]; //m_diskPageSize ]; // ensure threads disabled bool on = ! g_threads.areThreadsDisabled(); if ( on ) g_threads.disableThreads(); // disable ourselves disableCache(); // add valid offsets used by vfd into m_availMemOff for ( int32_t i = 0 ; i < m_maxPagesInFile [ vfd ] ; i++ ) { int32_t off = m_memOffFromDiskPage [ vfd ] [ i ]; // if page not in use, skip it if ( off < 0 ) continue; // ensure we are in sync with the map of diskpage to mem int32_t storedDiskPageNum; readFromCache ( &storedDiskPageNum , off , OFF_DISKPAGENUM, sizeof(int32_t) ); if ( storedDiskPageNum != i ) { char *xx=NULL;*xx=0; } // check this now too int32_t storedvfd; readFromCache ( &storedvfd, off , OFF_VFD, sizeof(int32_t) ); if ( storedvfd != vfd ) { char *xx=NULL;*xx=0; } //char *p = getMemPtrFromMemOff ( off ); int32_t size = 0; readFromCache(&size, off, OFF_SIZE, sizeof(int32_t)); //int32_t size = *(int32_t *)(p+OFF_SIZE); int32_t skip = 0; readFromCache(&skip, off, OFF_SKIP, sizeof(int32_t)); if ( size > 32 * 1024 ){ char *xx=NULL; *xx=0; } //int32_t skip = *(int32_t *)(p+OFF_SKIP); FileState fstate; if ( ! f->read ( buf , size , ((int64_t)i * (int64_t)m_diskPageSize) + (int64_t)skip , &fstate , NULL , // state NULL , // callback 0 )){// niceness // core if it did not complete char *xx = NULL; *xx = 0; } // compare to what we have in mem log("checking vfd=%"INT32" " "diskpage # %"INT32" size=%"INT32" skip=%"INT32"" , (int32_t)vfd , i, size, skip); char buf2[32 * 1024]; readFromCache( buf2, off, HEADERSIZE + skip, size ); if ( memcmp ( buf, buf2, size ) != 0 ){ char *xx = NULL; *xx = 0; } //if ( memcmp ( buf , p + HEADERSIZE + skip, size ) != 0 ) { //char *xx = NULL; *xx = 0; } } if ( on ) g_threads.enableThreads(); enableCache(); // debug msg log("DONE VERIFYING PAGECACHE"); return true; } // bigOff is used to get the MemPtr, smallOff is the offset in the Mem void DiskPageCache::writeToCache( int32_t memOff, int32_t memPageOff , void *inBuf, int32_t size ){ /* #ifdef GBUSESHM if ( m_useSHM ) { // what page are we on? int32_t page = ( bigOff + smallOff ) / m_maxAllocSize; // offset within that page int32_t poff = ( bigOff + smallOff ) % m_maxAllocSize; // sanity check if ( page >= m_numShmids ) { char *xx=NULL; *xx=0; } // sanity check if ( poff + size > m_shmidSize[page] ) { char *xx=NULL;*xx=0; } // get first byte int shmid = m_shmids[page]; // assume we already have it loaded in char *mem = s_mem; // . is this the page we currently have loaded? // . th shmdt and shmat() seems to take about 12 microseconds // on avg to execute. so about 100 times per milliseconds. // . seems like the writeToCache() is 3x slower than the // readFromCache() perhaps because the dirty pages are // COPIED back into system mem? if ( shmid != s_shmid ) { // time it //int64_t start = gettimeofdayInMicroseconds(); // free current i guess if ( s_mem && shmdt ( s_mem ) == -1 ) { log("disk: shmdt: %s",mstrerror(errno)); char *xx=NULL;*xx=0; } // load it in if not mem = (char *) shmat ( shmid , NULL, SHM_R|SHM_W ); // if this happens at startup, try calling shmat // when we init this page cache above... if ( mem == (char *)-1 ) { log("disk: shmat: %s",mstrerror(errno)); char *xx=NULL;*xx=0; } // store it s_mem = mem; s_shmid = shmid; // time it //int64_t took = gettimeofdayInMicroseconds() -start; //if ( took > 1 ) // logf(LOG_DEBUG,"disk: took %"INT64" us to write " // "to shm page cache shmid=%"INT32".",took, // (int32_t)shmid); } // store it into the cache gbmemcpy ( mem + poff , inBuf , size ); return; } #endif if ( m_useRAMDisk ){ int32_t numBytesWritten = pwrite( m_ramfd, inBuf, size, bigOff + smallOff ); if ( numBytesWritten != size ){ char *xx=NULL; *xx=0; } return; } */ char *p = getMemPtrFromMemOff ( memOff ); gbmemcpy(p + memPageOff, inBuf, size); } // . store cached disk info into "outBuf". up to "size" bytes of it. void DiskPageCache::readFromCache( void *outBuf, int32_t memOff, int32_t pageOffset, int32_t bytesToCopy ) { /* #ifdef GBUSESHM if ( m_useSHM ) { // what page are we on? int32_t page = ( bigOff + smallOff ) / m_maxAllocSize; // offset within that page int32_t poff = ( bigOff + smallOff ) % m_maxAllocSize; // sanity check if ( page >= m_numShmids ) { char *xx=NULL; *xx=0; } // sanity check if ( poff + size > m_shmidSize[page] ) { char *xx=NULL;*xx=0; } // get first byte int shmid = m_shmids[page]; // assume we already have it loaded in char *mem = s_mem; // . is this the page we currently have loaded? // . the shmdt() and shmat() seems to take about 2 MICROSECONDS // on avg to execute here. about 3x faster than the // writeToCache() above. if ( shmid != s_shmid ) { // time it //int64_t start = gettimeofdayInMilliseconds(); // free current first so shmat has some room? if ( s_mem && shmdt ( s_mem ) == -1 ) { log("disk: shmdt: %s",mstrerror(errno)); char *xx=NULL;*xx=0; } // load it in if not mem = (char *) shmat ( shmid , NULL, SHM_R|SHM_W ); // if this happens at startup, try calling shmat // when we init this page cache above... if ( mem == (char *)-1 ) { log("disk: shmat: %s",mstrerror(errno)); char *xx=NULL;*xx=0; } // store it s_mem = mem; s_shmid = shmid; // time it //int64_t took = gettimeofdayInMilliseconds() -start; //if ( took > 1 ) // logf(LOG_DEBUG,"disk: took %"INT64" ms to read " // "to shm page cache shmid=%"INT32".",took, // (int32_t)shmid); } // store it in outBuf gbmemcpy ( outBuf , mem + poff , size ); return; } #endif if ( m_useRAMDisk ) { int32_t numBytesRead = pread( m_ramfd, outBuf, size, bigOff + smallOff ); if ( numBytesRead != size ){ char *xx=NULL; *xx=0; } return; } */ // the old fashioned way char *p = getMemPtrFromMemOff ( memOff ); gbmemcpy(outBuf, p + pageOffset, bytesToCopy ); } // lastly, we need some way to "force" a merge at around midnight when traffic // is minimal, or when there are 3 or more indexdb files that are less than // 80% in the indexdb disk page cache. because that means we are starting to // do a lot of disk seeks. // checks if indexdb needs merge /* bool DiskPageCache::needsMerge( ){ if ( !m_useRAMDisk ) return false; int32_t numVfds = 0; for ( int32_t i = 0; i < MAX_NUM_VFDS2; i++ ){ if ( !m_memOffFromDiskPage[i] ) continue; // check to see if a file is less than 80% in the indexdb // disk page cache int32_t numOffsUsed = 0; for ( int32_t j = 0; j < m_maxPagesInFile[i]; j++ ){ if ( m_memOffFromDiskPage[i][j] >= 0 ) numOffsUsed++; } if ( (numOffsUsed * 100)/m_maxPagesInFile[i] < 80 ) numVfds++; } if ( numVfds >= 3 ) return true; return false; } */ // 'ipcs -m' will show shared mem in linux void freeAllSharedMem ( int32_t max ) { // free shared mem whose pid no longer exists //struct shmid_ds buf; //shmctl ( 0 , SHM_STAT , &buf ); //int shmctl(int shmid, int cmd, struct shmid_ds *buf); /* #ifdef GBUSESHM // types.h uses key_t type that shmget uses // try to nuke it all for ( int32_t i = 0 ; i < max ; i++ ) { int shmid = i; int32_t status = shmctl ( shmid , IPC_RMID , NULL); if ( status == -1 ) { //if ( errno != EINVAL ) // log("db: shctlt %"INT32": %s",(int32_t)shmid,mstrerror(errno)); } else log("db: Removed shmid %"INT32"",i); } #endif */ } // types.h uses key_t type that shmget uses #undef key_t