Matt Wells 4e803210ee tons of changes from live github on neo.
lots of core fixes.
took out ppthtml powerpoint convert, it hangs.
dynamic rdbmap to save memory per coll.
fixed disk page cache logic and brought it
2014-01-17 21:01:43 -08:00

1642 lines
58 KiB

#include "gb-include.h"
#include "RdbMap.h"
#include "BigFile.h"
#include "IndexList.h"
// . constructor
// . zero the segment counters BEFORE calling reset() because reset() uses
//   m_numSegments, m_numSegmentPtrs and m_numSegmentOffs when it frees memory
// NOTE(review): the closing brace is not visible in this view of the file
RdbMap::RdbMap() {
m_numSegments = 0;
m_numSegmentPtrs = 0;
m_numSegmentOffs = 0;
reset ( );
// . destructor
// . don't save the map on deletion!
// NOTE(review): the destructor body is not visible in this view of the file
RdbMap::~RdbMap() {
// . configure this map for one data file
// . records the fixed data size, half-key flag, key size and page size and
//   points m_file at dir/mapFilename
// . deliberately cores (writes through a NULL ptr) if the page size cannot be
//   used, see the two sanity checks below
// NOTE(review): closing braces are not visible in this view of the file
void RdbMap::set ( char *dir , char *mapFilename,
long fixedDataSize , bool useHalfKeys , char keySize ,
long pageSize ) {
m_fixedDataSize = fixedDataSize;
m_file.set ( dir , mapFilename );
m_useHalfKeys = useHalfKeys;
m_ks = keySize;
m_pageSize = pageSize;
// . addRecord() shifts by m_pageSizeBits instead of dividing by m_pageSize
// . presumably this requires pageSize to be a power of 2 -- TODO confirm
m_pageSizeBits = getNumBitsOn32(pageSize-1);
// m_pageSize -1 must be able to be stored in m_offsets[][] (a short)
if ( m_pageSize > 32768 ) {
log(LOG_LOGIC,"db: rdbmap: m_pageSize too big for m_offsets.");
// force a core dump
char *xx = NULL; *xx = 0;
// . we remove the head part files of a BigFile when merging it
// . this keeps the required merge space down to a small amount
// . when we chop off a part file from a BigFile we must also
// chop off the corresponding segments in the map
// . the match must be EXACT
// . therefore, PAGES_PER_SEGMENT * m_pageSize must evenly divide
// MAX_PART_SIZE #define'd in BigFile.h
if ( (MAX_PART_SIZE % (PAGES_PER_SEGMENT*m_pageSize)) == 0 ) return;
log(LOG_LOGIC,"db: rdbmap: PAGES_PER_SEGMENT*"
"m_pageSize does not divide MAX_PART_SIZE. cannot do "
"space-saving merges due to this.");
// force a core dump
char *xx = NULL; *xx = 0;
// . write the map out if it is dirty (m_needToWrite), then free it
// . if "urgent" is true we skip the reset() and keep everything in memory
// . returns the status of writeMap(), or true if no write was needed
bool RdbMap::close ( bool urgent ) {
bool status = true;
if ( /*mdw m_numPages > 0 &&*/ m_needToWrite ) status = writeMap ( );
// clears and frees everything
if ( ! urgent ) reset ();
return status;
// . free the per-segment buffers and the segment ptr arrays, then put every
//   counter/offset back to its empty state
// . does NOT close m_file, see the note near the bottom
// NOTE(review): closing braces are not visible in this view of the file
void RdbMap::reset ( ) {
m_generatingMap = false;
for ( long i = 0 ; i < m_numSegments; i++ ) {
// NOTE(review): only m_offsets[i] is freed in the visible code, m_keys[i]
// is set to NULL below without a visible mfree -- confirm against the
// full file that the keys buffer is freed too
mfree(m_offsets[i], 2*PAGES_PER_SEGMENT,"RdbMap");
// set to NULL so we know if accessed illegally
m_keys [i] = NULL;
m_offsets[i] = NULL;
// the ptrs themselves are now a dynamic array to save mem
// when we have thousands of collections
mfree(m_keys,m_numSegmentPtrs*sizeof(char *),"MapPtrs");
mfree(m_offsets,m_numSegmentOffs*sizeof(short *),"MapPtrs");
m_numSegmentPtrs = 0;
m_numSegmentOffs = 0;
m_needToWrite = false;
m_fileStartOffset = 0LL;
m_numSegments = 0;
m_numPages = 0;
m_maxNumPages = 0;
m_offset = 0LL;
m_numPositiveRecs = 0LL;
m_numNegativeRecs = 0LL;
//m_lastKey.n1 = 0;
//m_lastKey.n0 = 0LL;
// set the last key to the minimum key, using the max key width
KEYMIN(m_lastKey,MAX_KEY_BYTES); // m_ks);
// close up shop
// m_file.close ( ); this causes an error in Rdb.cpp:317 (new RdbMap)
m_lastLogTime = 0;
m_badKeys = 0;
m_needVerify = false;
// . write the whole map to m_file if it needs writing
// . returns true right away in read-only mode or if m_needToWrite is false
// . clears m_needToWrite only if writeMap2() succeeded
// NOTE(review): closing braces are not visible in this view of the file
bool RdbMap::writeMap ( ) {
if ( g_conf.m_readOnlyMode ) return true;
// return true if nothing to write out
// mdw if ( m_numPages <= 0 ) return true;
if ( ! m_needToWrite ) return true;
// open a new file
// NOTE(review): the call that takes these open flags and the remaining
// log() arguments are not visible in this view. as shown, the condition
// only tests the flag constant itself -- confirm against the full file
if ( ! ( O_RDWR | O_CREAT | O_TRUNC ) )
return log("db: Could not open %s for writing: %s.",
// write map data
bool status = writeMap2 ( );
// on success, we don't need to write it anymore
if ( status ) m_needToWrite = false;
// . close map
// . no longer since we use BigFile
//m_file.close ( );
// return status
return status;
// . does the actual writing for writeMap()
// . layout written, in order: m_offset (8 bytes), m_fileStartOffset (8),
//   m_numPositiveRecs (8), m_numNegativeRecs (8), m_lastKey (m_ks bytes),
//   then each segment via writeSegment()
// . returns true on success, otherwise returns what log() returns
// NOTE(review): the trailing arguments of each log() call and the closing
// braces are not visible in this view of the file
bool RdbMap::writeMap2 ( ) {
// the current disk offset
long long offset = 0LL;
g_errno = 0;
// first 8 bytes are the size of the DATA file we're mapping
m_file.write ( &m_offset , 8 , offset );
if ( g_errno ) return log("db: Failed to write to %s: %s",
offset += 8;
// when a BigFile gets chopped, keep up a start offset for it
m_file.write ( &m_fileStartOffset , 8 , offset );
if ( g_errno ) return log("db: Failed to write to %s: %s",
offset += 8;
// store total number of non-deleted records
m_file.write ( &m_numPositiveRecs , 8 , offset );
if ( g_errno ) return log("db: Failed to write to %s: %s",
offset += 8;
// store total number of deleted records
m_file.write ( &m_numNegativeRecs , 8 , offset );
if ( g_errno ) return log("db: Failed to write to %s: %s",
offset += 8;
// store last key in map
//m_file.write ( &m_lastKey , 12 , offset );
m_file.write ( m_lastKey , m_ks , offset );
if ( g_errno ) return log("db: Failed to write to %s: %s",
//offset += 12;
offset += m_ks;
// . now store the map itself
// . write the segments (keys/offsets) from the map file
for ( long i = 0 ; i < m_numSegments ; i++ ) {
// writeSegment() hands back the new offset, <= 0 means it failed
offset = writeSegment ( i , offset );
if ( offset<=0 ) return log("db: Failed to write to "
"%s: %s",
// . make sure it happens now!
// . no, we use O_SYNC
return true;
// . write segment #seg to m_file starting at "offset"
// . writes that segment's keys (m_ks bytes per page) and then its 2-byte
//   relative offsets, for only the pages in use in that segment
// . returns the offset just past what was written
// . returns "offset" unchanged if this segment has no pages to write
// . on a write error returns false, which is 0 as a long long, and
//   writeMap2() treats any value <= 0 as failure
long long RdbMap::writeSegment ( long seg , long long offset ) {
// how many pages have we written?
long pagesWritten = seg * PAGES_PER_SEGMENT;
// how many pages are left to write?
long pagesLeft = m_numPages - pagesWritten;
// if none left to write return offset now
if ( pagesLeft <= 0 ) return offset;
// truncate to segment's worth of pages for writing purposes
if ( pagesLeft > PAGES_PER_SEGMENT ) pagesLeft = PAGES_PER_SEGMENT;
// determine writeSize for keys
//long writeSize = pagesLeft * sizeof(key_t);
long writeSize = pagesLeft * m_ks;
// write the keys segment
g_errno = 0;
m_file.write ( (char *)m_keys[seg] , writeSize , offset );
if ( g_errno ) return false;//log("RdbMapFile::writeSegment: failed");
offset += writeSize ;
// determine writeSize for relative 2-byte offsets
writeSize = pagesLeft * 2;
// write the offsets of segment
m_file.write ( (char *)m_offsets[seg] , writeSize , offset );
if ( g_errno ) return false;//log("RdbMapFile::writeSegment: failed");
offset += writeSize ;
// return the new offset
return offset ;
// . called by openOld()
// . returns true on success
// . returns false on i/o error.
// . calls setMapSize() to get memory for m_keys/m_offsets
// . The format of the map on disk is described in Map.h
// . sets "m_numPages", "m_keys", and "m_offsets"
// . reads the keys and offsets into buffers allocated during open().
// . now we pass in ptr to the data file we map so verifyMap() can use it
// . also returns false if verifyMap() rejects the map we read
// NOTE(review): the visible code gets its memory through addSegment() in
// readSegment(), no setMapSize() call is visible on this path -- confirm
bool RdbMap::readMap ( BigFile *dataFile ) {
// bail if does not exist
// NOTE(review): the remaining log() arguments are not visible in this view
if ( ! m_file.doesExist() )
return log("db: Map file %s does not exist.",
// . open the file
// . do not open O_RDONLY because if we are resuming a killed merge
// we will add to this map and write it back out.
// NOTE(review): the call that takes this open flag is not visible in this
// view. as shown, the condition only tests the flag constant itself
if ( ! ( O_RDWR ) )
return log("db: Could not open %s for reading: %s.",
bool status = readMap2 ( );
// . close map
// . no longer since we use BigFile
// . no, we have to close since we will hog all the fds
// . we cannot call BigFile::close() because then RdbMap::unlink() will
// not work because BigFile::m_maxParts gets set to 0, and that is
// used in the loop in BigFile::unlinkRename().
m_file.closeFds ( );
// verify and fix map, data on disk could be corrupted
if ( ! verifyMap ( dataFile ) ) return false;
// return status
return status;
// . check the map we hold against the data file it is supposed to map
// . returns false if the sizes disagree or the page size looks wrong
// . chops map heads to match part files missing from the data file
// . finishes by returning verifyMap2(), which repairs out-of-order keys
// NOTE(review): several lines are not visible in this view of the file:
// closing braces, the head of the first log() call, the tail of the
// "Missing part file" log() call and the bodies/increments of the two
// while loops near the bottom. comments below mark each spot.
bool RdbMap::verifyMap ( BigFile *dataFile ) {
// how far off is the size recorded in the map from the real file size?
long long diff = m_offset - m_fileStartOffset;
diff -= dataFile->getFileSize();
// make it positive
if ( diff < 0 ) diff = diff * -1LL;
// . return false if file size does not match
// . i've seen this happen before
if ( diff ) {
// NOTE(review): the start of this log() call is not visible here
"db: Map file %s says that file %s should be %lli bytes "
"long, but it is %lli bytes.",
dataFile->m_baseFilename ,
m_offset - m_fileStartOffset ,
dataFile->getFileSize() );
// we let headless files squeak by on this because we cannot
// generate a map for them yet. if power went out a key can be
// caught in the middle of a write... thus limit to 12 bytes
if ( dataFile->doesPartExist(0) || diff >= 12 ) return false;
// explain it
log("db: Datafile is headless (so the map can not be "
"regenerated right now) and the difference is < 12, so "
"we will let this one squeak by.");
//log("RdbMap::verifyMap: Regenerating map.");
//log("db: Please delete map file %s and restart. "
// "This will regenerate the map file.",
// //Continuing despite discrepancy.",
// m_file.getFilename());
//return false;
//return true;
// . are we a 16k page size map?
// . the data file size must be within one page of m_numPages * m_pageSize
long long maxSize =(long long)(m_numPages + 1)*(long long)m_pageSize;
long long minSize =(long long)(m_numPages - 1)*(long long)m_pageSize;
long long dfs = dataFile->getFileSize();
if ( dfs < minSize || dfs > maxSize ) {
//log("db: File is not mapped with PAGE_SIZE of %li. Please "
// "delete map file %s and restart in order to regenerate "
// "it. Chances are you are running a new version of gb on "
// "old data.", (long)PAGE_SIZE, m_file.getFilename());
log("db: File %s is not mapped with PAGE_SIZE of %li. "
"You may be running a new version of gb on "
"old data.", m_file.getFilename(),(long)m_pageSize);
//exit (-1);
return false;
// . first, if our data file is headless we may have to chop our heads
// because a merge was probably killed
// . how many head PARTs are missing?
//long numMissingParts = 0;
//while ( ! dataFile->doesPartExist ( numMissingParts ) )
// numMissingParts++;
// we should count backwards so we stop at the first gap from the top.
// power outages sometimes leave one file linked when it should have
// been unlinked... although a file after it was successfully recorded
// as being unlinked on the hard drive, it itself was never committed.
// thereby producing a gap in the contiguous sequence of part files.
// let's ignore such islands. these islands can be more than one file
// too. let's verify they are unlinked after the merge completes.
long numMissingParts = dataFile->m_maxParts;
// NOTE(review): this loop's body is not visible here, presumably it
// decrements numMissingParts -- confirm against the full file
while ( numMissingParts > 0 &&
dataFile->doesPartExist ( numMissingParts-1 ) )
if ( numMissingParts > 0 ) {
File *f = dataFile->getFile ( numMissingParts );
if ( f ) log("db: Missing part file before %s.",
// how many PARTs have been removed from map?
long removed = m_fileStartOffset / MAX_PART_SIZE;
// . balance it out
// . don't map to PARTs of data file that have been chopped
// NOTE(review): no update of "removed" is visible inside this loop,
// presumably it is incremented per chop -- confirm against the full file
while ( removed < numMissingParts ) {
log(LOG_INIT,"db: Removing part #%li from map.",removed);
chopHead ( MAX_PART_SIZE );
// now fix the map if it had out of order keys in it
return verifyMap2 ( );
// . this just fixes a bad map
// . scans the page keys in order and, at the first key that is smaller than
//   the key before it, logs it and patches the map, then jumps back to "top"
// . three repairs are tried in order: bump a too-small key, lower a too-big
//   previous key, or overwrite the smallest run of bad pages
// . returns true
// NOTE(review): several lines are not visible in this view of the file:
// the "top:" label the gotos jump to, closing braces, the head or tail of
// some log() calls, the initialization of lastKey, the statements that fill
// in and store "f", and the bodies of the left/right while loops
bool RdbMap::verifyMap2 ( ) {
//key_t lastKey ; lastKey.n0 = 0LL; lastKey.n1 = 0;
char lastKey[MAX_KEY_BYTES];
for ( long i = 0 ; i < m_numPages ; i++ ) {
//key_t k;
//k = getKey(i);
//if ( k >= lastKey ) { lastKey = k; continue; }
char *k = getKeyPtr(i);
// keys in order so far, remember this one and move on
if ( KEYCMP(k,lastKey,m_ks)>=0 ) {
KEYSET(lastKey,k,m_ks); continue; }
// just complain for now
// NOTE(review): the start of this log() call is not visible here
"db: Key out of order in map file %s. "
"page = %li. key offset = %lli. Map or data file is "
"corrupt, but it is probably the data file.",
m_file.getFilename() ,
i,(long long)m_pageSize*(long long)i+getOffset(i));
//log("db: oldk.n1=%08lx n0=%016llx",
// lastKey.n1,lastKey.n0);
//log("db: k.n1=%08lx n0=%016llx",k.n1 ,k.n0);
log("db: oldk.n1=%016llx n0=%016llx",
log("db: k.n1=%016llx n0=%016llx",KEY1(k,m_ks),KEY0(k));
log("db: m_numPages = %li",m_numPages);
//char *xx=NULL;*xx=0;
// was k too small?
//if ( i + 1 < m_numPages && lastKey <= getKey(i+1) ) {
if (i+1<m_numPages && KEYCMP(lastKey,getKeyPtr(i+1),m_ks)<=0){
//key_t f = lastKey ;
char f[MAX_KEY_BYTES];
//if ( lastKey != getKey(i+1) ) f += (unsigned long)1;
if (KEYCMP(lastKey,getKeyPtr(i+1),m_ks)!=0)
log("db: Key in map was too small. Fixed.");
goto top;
// was lastKey too big?
// NOTE(review): i < m_numPages inside this loop, so "i - 2 >= m_numPages"
// looks like it can never be true and this repair is never taken. the
// commented-out original has the same test. possibly "i - 2 >= 0" was
// intended -- confirm before changing
//if ( i - 2 >= m_numPages && getKey(i-2) <= k ) {
if ( i - 2 >= m_numPages && KEYCMP(getKeyPtr(i-2),k,m_ks)<=0) {
//key_t f = getKey(i-2);
char *f = getKeyPtr(i-2);
//if ( f != k ) f += (unsigned long)1;
if ( KEYCMP(f,k,m_ks)!=0) KEYADD(f,1,m_ks);
log("db: LastKey in map was too big. Fixed.");
goto top;
// otherwise it is a sequence of out-of-order keys
long left = i - 1;
long right = i;
// try removing left side
//while ( left > 0 && getKey(left-1) > k )
while ( left > 0 && KEYCMP(getKeyPtr(left-1),k,m_ks)>0 )
long leftCount = i - left;
// try removing the right side
//while ( right + 1 < m_numPages && getKey(right+1) < lastKey)
while ( right + 1 < m_numPages &&
long rightCount = right - i + 1;
// make [a,b] represent the smallest bad chunk that when
// removed will fix the map
long a , b ;
if ( leftCount <= rightCount ) { a = left ; b = i - 1 ; }
else { a = i ; b = right ; }
//key_t keya ; keya.n0 = 0LL; keya.n1 = 0;
// the key we overwrite the bad pages with: the key of the page just
// before the chunk, or the minimum key if the chunk starts at page 0
char *keya = KEYMIN();
if ( a > 0 ) keya = getKeyPtr(a-1);
// remove the smallest chunk
for ( long j = a ; j <= b ; j++ )
setKey ( j , keya );
// count it for reference
log("db: Removed bad block in map of %li pages. Data "
"may have been permanently lost. Consider "
"syncing from a twin.",b-a+1);
// try from the top
goto top;
return true;
// . does the actual reading for readMap()
// . reads the header fields in the same order writeMap2() writes them:
//   m_offset, m_fileStartOffset, m_numPositiveRecs, m_numNegativeRecs (8 bytes
//   each) and m_lastKey (m_ks bytes), then reads segments until the end of
//   the map file
// . returns true on success, otherwise returns what log() returns
// NOTE(review): in this view the read calls have lost their leading
// "object.method" text and their argument lists ended up fused onto the
// ends of the comment lines below. those argument lists are kept as-is.
// the trailing log() arguments and closing braces are not visible either
bool RdbMap::readMap2 ( ) {
// keep track of read offset
long long offset = 0;
g_errno = 0;
// first 8 bytes are the size of the DATA file we're mapping ( &m_offset , 8 , offset );
if ( g_errno ) return log("db: Had error reading %s: %s.",
offset += 8;
// when a BigFile gets chopped, keep up a start offset for it ( &m_fileStartOffset , 8 , offset );
if ( g_errno ) return log("db: Had error reading %s: %s.",
offset += 8;
// read total number of non-deleted records ( &m_numPositiveRecs , 8 , offset );
if ( g_errno ) return log("db: Had error reading %s: %s.",
offset += 8;
// read total number of deleted records ( &m_numNegativeRecs , 8 , offset );
if ( g_errno ) return log("db: Had error reading %s: %s.",
offset += 8;
// read the last key in the map, m_ks bytes
// ( &m_lastKey , 12 , offset ); ( m_lastKey , m_ks , offset );
if ( g_errno ) return log("db: Had error reading %s: %s.",
//offset += 12;
offset += m_ks;
// get the total size of this map file from our derived file class
long fileSize = m_file.getFileSize () ;
if ( fileSize < 0 ) return log("db: getFileSize failed on %s: %s.",
// read in the segments
for ( long i = 0 ; offset < fileSize ; i++ ) {
// . this advances offset past the read segment
// . it uses fileSize for reading the last partial segment
offset = readSegment ( i , offset , fileSize ) ;
if ( offset<=0 ) return log("db: Had error reading "
"%s: %s.",
return true;
// . read segment #seg from the map file starting at "offset"
// . adds a new in-memory segment first, then reads that segment's keys
//   followed by its 2-byte offsets
// . returns the offset just past what was read
// . returns -1 if addSegment() fails or the remaining file size is not a
//   whole number of slots
// . on a read error returns false, which is 0 as a long long, and
//   readMap2() treats any value <= 0 as failure
// NOTE(review): in this view the two read calls have lost their leading
// "object.method" text, leaving only the parenthesized argument lists
long long RdbMap::readSegment ( long seg , long long offset , long fileSize ) {
// . add a new segment for this
// . increments m_numSegments and increases m_maxNumPages
if ( ! addSegment () ) return -1;
// get the slot size, 1 key of m_ks bytes and 1 short offset per page
//long slotSize = sizeof(key_t) + 2;
long slotSize = m_ks + 2;
// how much will we read now?
long totalReadSize = PAGES_PER_SEGMENT * slotSize;
// how much left in the map file?
long long avail = fileSize - offset;
// . what's available MUST always be a multiple of slotSize
// . sanity check
if ( ( avail % slotSize ) != 0 ) {
log("db: Had error reading part of map: Bad map "
"size."); return -1; }
// truncate if not a full segment
if ( totalReadSize > avail ) totalReadSize = avail;
// get # of keys/offsets to read
long numKeys = totalReadSize / slotSize;
// calculate how many bytes to read of keys
//long readSize = numKeys * sizeof(key_t);
long readSize = numKeys * m_ks;
// do the read
g_errno = 0; ( (char *)m_keys[seg] , readSize , offset );
if ( g_errno ) return false; // log("RdbMapFile::readSegment: failed");
offset += readSize;
// read the offsets of segment
readSize = numKeys * 2; ( (char *)m_offsets[seg] , readSize , offset );
if ( g_errno ) return false; // log("RdbMapFile::readSegment: failed");
offset += readSize ;
// increase m_numPages based on the keys/pages read
m_numPages += numKeys;
// return the new offset
return offset ;
// . add a record to the map
// . returns false and sets g_errno on error
// . offset is the current offset of the rdb file where the key/data was added
// . TODO: speed this up
// . we pass in "data" so we can compute the crc of each page
// . "rec" is not referenced by the code visible here
// . advances m_offset by recSize and updates m_numPages and the
//   positive/negative record tallies
// NOTE(review): several lines are not visible in this view of the file:
// closing braces, the tails of most log() calls, the "skip:" label that
// "goto skip" jumps to and whatever stores "key" into m_lastKey (only the
// commented-out "m_lastKey = key" remains)
//bool RdbMap::addRecord ( key_t &key, char *rec , long recSize ) {
bool RdbMap::addRecord ( char *key, char *rec , long recSize ) {
// calculate size of the whole slot
//long size = sizeof(key_t) ;
// include the dataSize, 4 bytes, for each slot if it's not fixed
//if ( m_fixedDataSize == -1 ) size += 4;
// include the data
//size += dataSize;
// what page is first byte of key on?
//long pageNum = m_offset / m_pageSize;
long pageNum = m_offset >> m_pageSizeBits;
// what is the last page we touch?
//long lastPageNum = (m_offset + recSize - 1) / m_pageSize;
long lastPageNum = (m_offset + recSize - 1) >> m_pageSizeBits;
// . see if we need to reallocate/allocate more pages in the map.
// . g_errno should be set to ENOMEM
// . only do this if we're NOT adding to disk
// . should only change m_maxNumPages, not m_numPages
// . if the rec is HUGE it may span SEVERAL, so do a while()
while ( lastPageNum + 2 >= m_maxNumPages ) {
if ( ! addSegment() ) {
log("db: Failed to add segment3 to map file %s.",
// core dump until we revert to old values
char *xx = NULL; *xx = 0;
// we need to call writeMap() before we exit
m_needToWrite = true;
// debug
// NOTE(review): whatever condition guards this debug log is not
// visible in this view
log("db: addmap k=%s keysize=%li offset=%lli pagenum=%li",
// we now call RdbList::checkList_r() in RdbDump::dumpList()
// and that checks the order of the keys
//#ifdef _SANITYCHECK_
// . sanity check
// . a key of 0 is valid, so watch out for m_lastKey's sake
//if ( key <= m_lastKey && (m_lastKey.n0!=0 || m_lastKey.n1!=0)) {
if ( KEYCMP(key,m_lastKey,m_ks)<=0 &&
KEYCMP(m_lastKey,KEYMIN(),m_ks)!=0 ) {
// do not log more than once per second
if ( getTime() == m_lastLogTime ) goto skip;
m_lastLogTime = getTime();
//pageNum > 0 && getKey(pageNum-1) > getKey(pageNum) ) {
log(LOG_LOGIC,"build: RdbMap: added key out of order. "
//log(LOG_LOGIC,"build: k.n1=%lx %llx lastKey.n1=%lx %llx",
// key.n1,key.n0,m_lastKey.n1,m_lastKey.n0 );
log(LOG_LOGIC,"build: offset=%lli",
log(LOG_LOGIC,"build: k1=%s",
log(LOG_LOGIC,"build: k2=%s",
// when generating a map, refuse the out-of-order key
// NOTE(review): lines inside this if appear to be missing in this view
if ( m_generatingMap ) {
return false;
char *xx=NULL;*xx=0;
// . during a merge, corruption can happen, so let's core
// here until we figure out how to fix it.
// . and why wasn't the corruption discovered and patched
// with a twin? or at least excised... because the read
// list may have all keys in order, but be out of order
// with respect to the previously-read list?
//char *xx = NULL; *xx = 0;
// let's ignore it for now and just add the corrupt
// record (or maybe the one before was corrupted) but we
// need to verify the map afterwards to fix these problems
m_needVerify = true;
// sleep(50000);
// remember the lastKey in the whole file
//m_lastKey = key;
// debug msg
//log(LOG_LOGIC,"build: map add lastk.n1=%llx %llx",
// KEY1(m_lastKey,m_ks),KEY0(m_lastKey));
// set m_numPages to the last page num we touch plus one
m_numPages = lastPageNum + 1;
// keep a global tally on # of recs that are deletes (low bit cleared)
//if ( (key.n0 & 0x01) == 0 ) m_numNegativeRecs++;
if ( KEYNEG(key) ) m_numNegativeRecs++;
// keep a global tally on # of recs that are NOT deletes
else m_numPositiveRecs++;
// increment the size of the data file
m_offset += recSize ;
// . reset all pages above pageNum that we touch
// . store -1 in offset to indicate it's continuation of key which
// started on another page
// . store -1 on lastPageNum PLUS 1 in case we just take up lastPageNum
// ourselves and the next key will start on lastPageNum+1 at offset 0
// . also by storing -1 for offset this page becomes available for
// keys/recs to follow
// NOTE(review): only setKey() is visible in this loop, no store of -1
// into the offsets is visible in this view
for ( long i = pageNum + 1; i <= lastPageNum; i++ ) setKey ( i , key );
// . return now if we're NOT the first key wholly on page #pageNum
// . add crc of this rec
// . this offset will be -1 for unstarted pages
// . tally the crc until we hit a new page
if ( getOffset ( pageNum ) >= 0 ) return true;
// . if no key has claimed this page then we'll claim it
// . by claiming it we are the first key to be wholly on this page
// . m_offset was already advanced, so back out recSize to get the
//   record's start, then keep only the offset within the page
setOffset ( pageNum , ( m_offset - recSize ) & (m_pageSize-1) );
setKey ( pageNum , key );
// success!
return true;
// . for adding a data-less key very quickly
// . i don't use m_numPages here (should use m_offset!)
// . TODO: can quicken by pre-initializing map size
// . TODO: don't use until it counts the # of deleted keys, etc...
// . returns false if setMapSize() fails, true otherwise
// NOTE(review): this indexes m_offsets[]/m_keys[] directly by page number,
// while reset() treats them as arrays of per-segment ptrs. addList() calls
// this "disabled until addKey() works correctly" -- confirm before using.
// closing braces are not visible in this view of the file
bool RdbMap::addKey ( key_t &key ) {
// what page is first byte of key on?
long pageNum = m_offset / m_pageSize;
// increment the size of the data file
m_offset += sizeof(key_t);
// keep the number of pages up to date
m_numPages = m_offset / m_pageSize + 1;
// . see if we need to reallocate/allocate more pages in the map.
// . g_errno should be set to ENOMEM
// . only do this if we're NOT adding to disk
if ( m_numPages >= m_maxNumPages )
if ( ! setMapSize ( m_numPages + 8*1024 ) ) return false;
// if no key has claimed this page then we'll claim it
// NOTE(review): m_offset was already advanced past this key above, so the
// offset stored here is taken after the key, not at its start -- confirm
if ( m_offsets [ pageNum ] < 0 ) {
m_offsets [ pageNum ] = m_offset % m_pageSize;
m_keys [ pageNum ] = key;
// otherwise if current page already has a FIRST slot on it then return
return true;
// . pre-allocate enough segments to map "list" once it is appended at
//   m_offset, without adding any of its records
// . cores if the list's key size does not match ours
// . returns true if the list is empty or the segments were added
// . returns what log() returns if addSegment() fails
// NOTE(review): the trailing log() arguments and closing braces are not
// visible in this view of the file
bool RdbMap::prealloc ( RdbList *list ) {
// sanity check
if ( list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
// bail now if it's empty
if ( list->isEmpty() ) return true;
// what is the last page we touch?
long lastPageNum = (m_offset + list->getListSize() - 1) / m_pageSize;
// . need to pre-alloc up here so malloc does not fail mid stream
// . TODO: only do it if list is big enough
while ( lastPageNum + 2 >= m_maxNumPages ) {
if ( ! addSegment() )
return log("db: Failed to add segment to map file %s.",
return true;
// . call addRecord() or addKey() for each record in this list
// . cores if the list's key size does not match ours
// . pre-allocates segments for the whole list before adding anything
// . cores if addRecord() fails
// NOTE(review): several lines are not visible in this view of the file:
// closing braces, the "top1:" and "top2:" labels the gotos jump to, the
// declaration of "key" used in the dataless branch, the statement that
// fills key[] before addRecord() (only the commented-out getCurrentKey()
// remains) and the tails of two log() calls
bool RdbMap::addList ( RdbList *list ) {
// sanity check
if ( list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
// . reset list to beginning to make sure
// . no, because of HACK in RdbDump.cpp we set m_listPtrHi < m_list
// so our first key can be a half key, calling resetListPtr()
// will reset m_listPtrHi and corrupt it
// bail now if it's empty
if ( list->isEmpty() ) return true;
// what is the last page we touch?
long lastPageNum = (m_offset + list->getListSize() - 1) / m_pageSize;
// . need to pre-alloc up here so malloc does not fail mid stream
// . TODO: only do it if list is big enough
while ( lastPageNum + 2 >= m_maxNumPages ) {
if ( ! addSegment() )
return log("db: Failed to add segment to map file %s.",
// . index lists are very special cases
// . the keys may be the full 12 bytes or a compressed 6 bytes
// . disable for now! for new linkdb, posdb, etc.
//if ( list->useHalfKeys() )
// return addIndexList ( (IndexList *)list );
// disabled until addKey() works correctly
// NOTE(review): the comment above says disabled, but this branch is live
// in the visible code. whatever disabled it is not visible in this view
if ( list->isDataless() ) {
key = list->getCurrentKey ( );
if ( ! addKey ( key ) ) return false;
if ( list->skipCurrentRecord() ) goto top1;
return true;
// print the last key from last time
// NOTE(review): whatever condition guards this debug log is not visible
// in this view
log("map: lastkey=%s",KEYSTR(m_lastKey,m_ks));
//key_t key;
char key[MAX_KEY_BYTES];
long recSize;
char *rec;
//key = list->getCurrentKey ( );
recSize = list->getCurrentRecSize();
rec = list->getCurrentRec ();
if ( ! addRecord ( key , rec , recSize ) ) {
log("db: Failed to add record to map: %s.",
// force a core dump
char *xx = NULL; *xx = 0;
if ( list->skipCurrentRecord() ) goto top2;
// sanity check -- i added this for debug but i think it was
// corrupted buckets!!
return true;
// . a short list is a data-less list whose keys are 12 bytes or 6 bytes
// . the 6 byte keys are compressed 12 byte keys that actually have the
// same most significant 6 bytes as the closest 12 byte key before them
// . CAUTION: this list may have a 6 byte key as its first key because
// RdbDump does that hack so that on disk there are not unnecessary 12 byte
// keys because that would make IndexTable.cpp:addLists_r() inefficient
// . walks the list a page at a time: first finishes the page a previous
//   list already started, then claims each new page with setKey()/setOffset()
// . returns true when the list is empty or exhausted
// . the only call site visible here, in addList(), is commented out
// NOTE(review): several lines are not visible in this view of the file:
// closing braces, the "startNewPage:" label the gotos jump to, the
// statements that fill kp[] and k[] with the list's current key, the tails
// of two log() calls and any increment of m_numPages after a page is
// claimed ("we occupied that page, baby")
bool RdbMap::addIndexList ( IndexList *list ) {
// return now if empty
if ( list->isEmpty() ) return true;
// we need to call writeMap() before we exit
m_needToWrite = true;
// . reset list to beginning to make sure
// . no, because of HACK in RdbDump.cpp we set m_listPtrHi < m_list
// so our first key can be a half key, calling resetListPtr()
// will reset m_listPtrHi and corrupt it
// what page # will the first rec of this list be on?
long pageNum = m_offset / m_pageSize;
long end;
// convenience vars
char *rec;
char *recStart;
char *recMax;
char *recHi;
// what was the size of the last key we hit in the while loop? 6 or 12?
char size = 0;
// compare our start key to last list's endkey
char kp[MAX_KEY_BYTES];
if ( KEYCMP(kp,m_lastKey,m_ks) <= 0 &&
KEYCMP(m_lastKey,KEYMIN(),m_ks) != 0 ) {
log(LOG_LOGIC,"build: RdbMap: added key out of order "
"in addIndexList. ");
log(LOG_LOGIC,"build: k.n1=%llx %llx lastKey.n1=%llx %llx. ",
// force a core dump
char *xx = NULL; *xx = 0;
// if the current page DOES NOT have a starting key, we are it
if ( pageNum >= m_numPages ) goto startNewPage;
// what is the last offset that can be on this page?
end = m_offset + m_pageSize - (m_offset % m_pageSize) - 1;
// get the current record
rec = list->getListPtr();
recStart = rec;
// how far to advance rec?
recMax = rec + (end - m_offset);
// don't exceed list end
if ( recMax > list->getListEnd() ) recMax = list->getListEnd();
// . get hi ptr of record
// . subtract (m_ks-6) because we add it back later
//recHi = list->getListPtrHi() - 6;
recHi = list->getListPtrHi() - (m_ks-6);
// . is a record from the last list already starting on this page #?
// . if it is already claimed, add until we hit next page
while ( rec < recMax ) {
// keep a global tally on # of recs that are deletes and NOT
if ( (*rec & 0x01) == 0 ) m_numNegativeRecs++;
else m_numPositiveRecs++;
// . is half bit on?
// . a half key is (m_ks-6) bytes, a full key is m_ks bytes and also
//   becomes the new recHi
//if ( *rec & 0x02 ) { size = 6 ; rec += 6; }
//else { size = 12; recHi = rec; rec += 12; }
if ( *rec & 0x02 ) { size = m_ks-6 ; rec += size;}
else { size = m_ks; recHi = rec; rec += size;}
// update list current ptr
//list->setListPtrs ( rec , recHi + 6 );
list->setListPtrs ( rec , recHi + (m_ks-6) );
// and update m_offset, too
m_offset += rec - recStart;
// . if our list is done, return
// . otherwise, we filled up the whole page
if ( list->isExhausted() ) {
// set m_lastKey
//m_lastKey = list->getKey ( list->getListPtr() - size );
list->getKey ( list->getListPtr() - size , m_lastKey );
return true;
// do we need to add a segment?
// . see if we need to reallocate/allocate more pages in the map.
// . g_errno should be set to ENOMEM
// . only do this if we're NOT adding to disk
// . should only change m_maxNumPages, not m_numPages
if ( m_numPages >= m_maxNumPages && ! addSegment() ) {
log("db: Failed to add segment2 to map file %s.",
// core dump until we revert to old values
char *xx = NULL; *xx = 0;
// we are the first key fully on this page
//key_t k = list->getCurrentKey();
char k[MAX_KEY_BYTES];
// the half bit should be in the off position, since k is 12 bytes,
// even though the key in the list may only be 6 bytes (half key)
setKey ( m_numPages , k );
setOffset ( m_numPages , m_offset % m_pageSize );
// what is the last offset that can be on this page?
end = m_offset + m_pageSize - (m_offset % m_pageSize) - 1;
// get the current record
rec = list->getListPtr();
recStart = rec;
// how far to advance rec?
recMax = rec + (end - m_offset);
// don't exceed list end
if ( recMax > list->getListEnd() ) recMax = list->getListEnd();
// . get hi ptr of record
// . subtract (m_ks-6) because we add it back later
//recHi = list->getListPtrHi() - 6;
recHi = list->getListPtrHi() - (m_ks-6);
// . is a record from the last list already starting on this page #?
// . if it is already claimed, add until we hit next page
while ( rec < recMax ) {
// keep a global tally on # of recs that are deletes and NOT
if ( (*rec & 0x01) == 0 ) m_numNegativeRecs++;
else m_numPositiveRecs++;
// is half bit on?
//if ( *rec & 0x02 ) { size = 6 ; rec += 6; }
//else { size = 12; recHi = rec; rec += 12; }
if ( *rec & 0x02 ) { size = m_ks-6 ; rec += size;}
else { size = m_ks; recHi = rec; rec += size;}
// update list current ptr
//list->setListPtrs ( rec , recHi + 6 );
list->setListPtrs ( rec , recHi + (m_ks-6) );
// and update m_offset, too
m_offset += rec - recStart;
// we occupied that page, baby
// start on the next page
goto startNewPage;
// . narrow the page range [sp,ep] so that reading from the first key on sp
//   to the first key on ep gives recs whose keys are in [startKey,endKey],
//   then return getRecSizes() on that narrowed range
// . the relative offset (m_offset[sp]) may be -1
// . this can now return negative sizes
// . "subtract" is passed straight through to getRecSizes()
long long RdbMap::getMinRecSizes ( long sp ,
long ep ,
//key_t startKey ,
//key_t endKey ,
char *startKey ,
char *endKey ,
bool subtract ) {
// . calculate first page, "sp", whose key is >= startKey
// . NOTE: sp may have a relative offset of -1
// . in this case, just leave it be!
//while ( sp < ep && getKey(sp) < startKey ) sp++;
while ( sp < ep && KEYCMP(getKeyPtr(sp),startKey,m_ks)<0 ) sp++;
// . now calculate endpg whose key is <= endKey
// . remember the original end page so we never advance past it below
long ep1 = ep;
//while ( ep > sp && getKey(ep) > endKey ) ep--;
while ( ep > sp && KEYCMP(getKeyPtr(ep),endKey,m_ks)>0 ) ep--;
// . if ep has a relative offset of -1 we can advance it
// . we cannot have back-to-back -1 offset with DIFFERENT keys
while ( ep <= ep1 && ep < m_numPages && getOffset(ep) == -1 ) ep++;
// now getRecSizes on this constrained range
return getRecSizes ( sp , ep , subtract );
// . like getMinRecSizes(), but sets an upper bound for recs in
//   [startKey,endKey]
// . widens the page range instead of narrowing it: sp moves down, ep moves up
// . "subtract" is passed straight through to getRecSizes()
long long RdbMap::getMaxRecSizes ( long sp ,
long ep ,
//key_t startKey ,
//key_t endKey ,
char *startKey ,
char *endKey ,
bool subtract ) {
// . move "sp" back while its key is > startKey
// . NOTE: sp may have a relative offset of -1
// . in this case, just leave it be!
//while ( sp > 0 && getKey(sp) > startKey ) sp--;
while ( sp > 0 && KEYCMP(getKeyPtr(sp),startKey,m_ks)>0 ) sp--;
// now advance ep while its key is < endKey
//while ( ep < m_numPages && getKey(ep) < endKey ) ep++;
while ( ep < m_numPages && KEYCMP(getKeyPtr(ep),endKey,m_ks)<0 ) ep++;
// . if ep has a relative offset of -1 we can advance it
// . we cannot have back-to-back -1 offset with DIFFERENT keys
while ( ep < m_numPages && getOffset(ep) == -1 ) ep++;
// now getRecSizes on this constrained range
return getRecSizes ( sp , ep , subtract );
// . how many bytes in the range?
// . range is from first key on startPage UP TO first key on endPage
// . if endPage is >= m_numPages then range is UP TO the end of the file
// . this can now return negative sizes
// . if "subtract" is false this is just the difference of the two absolute
//   offsets
// . if "subtract" is true each page's bytes are added, or subtracted when
//   the page's key is a delete (KEYNEG)
// NOTE(review): closing braces are not visible in this view of the file
long long RdbMap::getRecSizes ( long startPage ,
long endPage ,
bool subtract ) {
// . assume a minimum of one page if key range not well mapped
// . no, why should we?
// . if pages are the same, there's no recs between them!
// . this seemed to cause a problem when startPage==endPage == lastPage
// and we started in the middle of a dump, so instead of reading
// 0 bytes, since offset was the end of the file, the dump dumped
// some and we read that. And the # of bytes we read was not
// divisible by sizeof(key_t) and RdbList::checkList_r() complained
// about the last key out of order, but that last key's last 8
// bytes were garbage we did NOT read from disk... phew!
if ( startPage == endPage ) return 0; // return (long)m_pageSize;
long long offset1;
long long offset2;
if ( ! subtract ) {
offset1 = getAbsoluteOffset ( startPage );
offset2 = getAbsoluteOffset ( endPage );
return offset2 - offset1;
// . but take into account delete keys, so we can have a negative size!
// . use random sampling
// NOTE(review): the visible loop visits every page in [startPage,endPage),
// no sampling is visible here
long long size = 0;
//key_t k;
char *k;
for ( long i = startPage ; i < endPage ; i++ ) {
// get current page size
offset1 = getAbsoluteOffset ( i );
offset2 = getAbsoluteOffset ( i + 1 );
// get startKey for this page
k = getKeyPtr ( i );
// if key is a delete assume all in page are deletes
//if ( (k.n0)&0x01 == 0LL) size -= (offset2 - offset1);
if ( KEYNEG(k) ) size -= (offset2 - offset1);
else size += (offset2 - offset1);
// return the size
return size;
// . return the absolute offset in the data file of the first key on "page"
// . that is the page's relative offset + page * m_pageSize + m_fileStartOffset
// . if page has relative offset of -1, use the next page
// . returns m_offset if "page" is at or past m_numPages
// NOTE(review): the "top:" label that "goto top" jumps to is not visible in
// this view of the file. presumably it sits at the start of the function
// body so the checks are redone with the advanced page -- confirm
long long RdbMap::getAbsoluteOffset ( long page ) {
if ( page >= m_numPages ) return m_offset; // fileSize
long long offset =
(long long)getOffset(page) +
(long long)m_pageSize * (long long)page;
if ( getOffset(page) != -1 ) return offset + m_fileStartOffset;
// just use end of page if in the middle of a record
while ( page < m_numPages && getOffset(page) == -1 ) page++;
goto top;
// . get offset of next known key after the one in page
// . do a while to skip rec on page "page" if it spans multiple pages
// . watch out for eof
long long RdbMap::getNextAbsoluteOffset ( long page ) {
// advance to next page
// NOTE(review): no statement that advances "page" is visible under the
// comment above in this view. as shown, a page whose offset is not -1 is
// returned as-is -- confirm against the full file
// inc page as long as we need to
while ( page < m_numPages && getOffset(page) == -1 ) page++;
// . if we hit eof then return m_offset
// . otherwise, we hit another key
return getAbsoluteOffset ( page );
// . [startPage,endPage] must cover every rec whose key is <= endKey
// . returns the endPage #, which may be m_numPages (meaning read to eof)
// . the returned page never has a relative offset of -1
long RdbMap::getEndPage ( long startPage , char *endKey ) {
	long page = startPage;
	// move past every page whose first key is still <= endKey
	for ( ; page < m_numPages ; page++ )
		if ( KEYCMP(getKeyPtr(page),endKey,m_ks) > 0 ) break;
	// a page in the middle of a record is no good as an end point, so
	// keep going even though its key is already > endKey
	for ( ; page < m_numPages ; page++ )
		if ( getOffset(page) != -1 ) break;
	return page;
}
// . convert a [startKey,endKey] range to a [startPage,endPage] range
// . this says that if you read from first key offset on *startPage UP TO
//   first key offset on *endPage you'll get the keys/recs you want
// . if *endPage equals m_numPages then you must read to the end of file
// . always returns true (the old "n1 == n2 means not found" early-out is
//   disabled below)
// . if "maxKey" is non-NULL it receives m_ks bytes: an APPROXIMATION of the
//   max key we have in the range, taken from the first key of some page
//   between the start page and the page getPage(endKey) returns
// . if "oldTruncationLimit" is >= 0 that page is additionally capped at
//   n1 + (oldTruncationLimit*6)/m_pageSize
bool RdbMap::getPageRange ( char      *startKey  ,
			    char      *endKey    ,
			    long      *startPage ,
			    long      *endPage   ,
			    char      *maxKey    ,
			    long long  oldTruncationLimit ) {
	// the first key on n1 is usually <= startKey, but can be > startKey
	// if the page (n-1) has only 1 rec whose key is < startKey
	long n1 = getPage ( startKey );
	// get the ending page for this scan
	long n2 = getPage ( endKey );
	// . set maxKey if we need to
	// . ensure that it is in [startKey,endKey] because it is used for
	//   determining what the length of an IndexList would have been
	//   if it was not truncated
	// . that is, we use maxKey for interpolation
	if ( maxKey ) {
		long n3 = n2;
		if ( oldTruncationLimit >= 0 ) {
			long nn = n1 + (oldTruncationLimit*6LL) / m_pageSize;
			if ( n3 > nn ) n3 = nn;
		}
		// NOTE(review): n3 can equal m_numPages here when
		// getPage(endKey) returned m_numPages; confirm getKeyPtr()
		// tolerates being asked for that page
		while ( n3 > n1 && KEYCMP(getKeyPtr(n3),endKey,m_ks)>0 ) n3--;
		// hand the chosen page's key back to the caller. this write
		// was missing, so callers never received a max key
		memcpy ( maxKey , getKeyPtr(n3) , m_ks );
	}
	// . if the first key appearing on this page is <= endKey we inc n2
	// . make m_keys[n2] > endKey since we read up to first key on n2
	// . n2 can be equal to m_numPages (means read to eof then)
	while ( n2 < m_numPages && KEYCMP(getKeyPtr(n2),endKey,m_ks)<=0 ) n2++;
	// skip n2 over any -1 offset
	while ( n2 < m_numPages && getOffset ( n2 ) == -1 ) n2++;
	// set our stuff and return true
	*startPage = n1;
	*endPage   = n2;
	return true;
}
// . key_t flavor of getPageRange() that also stops growing the range once
//   the tallied rec sizes reach "minRecSizes"
// . sets *startPage to getPage(startKey) and *endPage to the first page
//   at which the scan stopped; always returns true
// . NOTE(review): this body indexes m_keys[] and m_offsets[] directly and
//   compares the elements to a key_t and to -1, while addSegmentPtr()
//   allocates them as tables of per-segment pointers (char ** / short **).
//   it also hands a key_t to getPage(), which takes a char *. this looks
//   like a stale or disabled overload -- confirm before relying on it
bool RdbMap::getPageRange ( key_t startKey , key_t endKey ,
long minRecSizes ,
long *startPage , long *endPage ) {
// the first key on n1 is usually <= startKey, but can be > startKey
// if the page (n-1) has only 1 rec whose key is < startKey
long n1 = getPage ( startKey );
// set n2, the endpage
long n2 = n1;
// tally the recSizes
long recSizes = 0;
// . increase n2 until we are > endKey or meet minRecSizes requirement
// . n2 == m_numPages on exit means the range runs through the last page
while ( n2<m_numPages && m_keys[n2]<=endKey && recSizes<minRecSizes) {
// . find the next value for n2
// . m_offsets[n2] may not be -1
long next = n2 + 1;
while ( next < m_numPages && m_offsets[next] == -1 ) next++;
// . getRecSizes() returns size from the first key on page n2
// to the next key on the next page
// . next key may be more than 1 page away if key on page n2
// takes up more than 1 page (-1 == m_offsets[n2+1])
// . pages whose key is below startKey are walked but not tallied
if ( m_keys[n2] >= startKey ) recSizes += getRecSizes(n2,next);
n2 = next;
// set our stuff and return true
*startPage = n1;
*endPage = n2;
return true;
// . return a page number, N, at which a read for "startKey" should begin
// . if the first key on N is < startKey then the first key on N+1 is
//   > startKey
// . if the first key on N is > startKey then every key before it in the
//   rdb file is < startKey; the rec before it spans pages so that the key
//   immediately after it on disk is in fact the first key on N
// . returns m_numPages if startKey is beyond our last key
// . page N always has a record starting on it, unless N is 0
long RdbMap::getPage ( char *startKey ) {
	// nothing in this file can match a key past our last key
	if ( KEYCMP(startKey,m_lastKey,m_ks) > 0 ) return m_numPages;
	// . rough bisection over the page keys
	// . halving the stride each time does not always land exactly, so
	//   the two loops below clean up after it
	long pg = m_numPages / 2;
	for ( long stride = pg / 2 ; stride > 0 ; stride >>= 1 )
		pg += ( KEYCMP(startKey,getKeyPtr(pg),m_ks) <= 0 ) ?
			-stride : stride;
	// walk forward while the page key is still below startKey
	const long lastPage = m_numPages - 1;
	while ( pg < lastPage && KEYCMP(getKeyPtr(pg),startKey,m_ks) < 0 )
		pg++;
	// . walk back until the page key is LESS THAN OR EQUAL to startKey
	// . <= rather than < because if the positive key exists in this
	//   file then the negative one should not be in here too
	while ( pg > 0 && KEYCMP(getKeyPtr(pg),startKey,m_ks) > 0 )
		pg--;
	// do not start in the middle of a record: back up to a page that
	// has the start of a key on it
	while ( pg > 0 && getOffset(pg) == -1 )
		pg--;
	// this is the page we should start reading at
	return pg;
}
// . debugging aid: walks every page of the map, formats one text line per
//   page into a stack buffer and folds that line into a running 32-bit
//   hash, "h"
// . logs the final hash as the "map checksum" so two maps can be compared
void RdbMap::printMap () {
// running checksum over all the formatted page lines
long h = 0;
for ( int i = 0 ; i < m_numPages; i++ ) {
//log(LOG_INFO,"page=%i) key=%llu--%lu, offset=%hi\n",
// i,getKey(i).n0,getKey(i).n1,getOffset(i));
// for comparing
char buf[1000];
// NOTE(review): the argument list for this format string is not visible
// here; confirm it supplies an int, two 64-bit values and a short
sprintf(buf,"page=%i) key=%llx %llx, offset=%hi",
// chain the previous hash in as the seed so page order matters
h = hash32 ( buf , gbstrlen(buf) , h );
log(LOG_INFO,"map checksum = 0x%lx",h);
//long RdbMap::setMapSizeFromFileSize ( long fileSize ) {
// long n = fileSize / m_pageSize ;
// if ( (fileSize % m_pageSize) == 0 ) return setMapSize ( n );
// return setMapSize ( n + 1 );
// . returns false if malloc had problems
// . increases m_maxNumPages
// . increases m_numSegments
//bool RdbMap::setMapSize ( long numPages ) {
// . add the segments
// . addSegment() increases m_maxNumPages with each call
// . it returns false and sets g_errno on error
// for ( long i = 0 ; m_maxNumPages < numPages ; i++ )
// if ( ! addSegment ( ) ) return false;
// return true;
// . returns the bytes allocated for the key and offset segments
// . does not count the two tables of segment pointers themselves
long long RdbMap::getMemAlloced ( ) {
	// every page costs one key of m_ks bytes plus a 2 byte offset
	const long long bytesPerSegment = PAGES_PER_SEGMENT * (m_ks + 2);
	// times however many segments we are holding right now
	const long long segments = m_numSegments;
	return segments * bytesPerSegment;
}
bool RdbMap::addSegmentPtr ( long n ) {
// realloc
if ( n >= m_numSegmentPtrs ) {
char **k;
long nn = (long)((float)n * 1.20) + 1;
k = (char **) mrealloc (m_keys,
m_numSegmentPtrs * sizeof(char *) ,
nn * sizeof(char *) ,
"MapPtrs" );
// failed?
if ( ! k ) return false;
// succeeded
m_numSegmentPtrs = nn;
m_keys = k;
// try offsets
if ( n >= m_numSegmentOffs ) {
short **o;
long nn = (long)((float)n * 1.20) + 1;
o = (short **) mrealloc (m_offsets,
m_numSegmentOffs * sizeof(short *) ,
nn * sizeof(short *) ,
"MapPtrs" );
// failed?
if ( ! o ) return false;
// succeeded
m_numSegmentOffs = nn;
m_offsets = o;
return true;
// . add one segment (PAGES_PER_SEGMENT pages) to the end of the map
// . allocates the segment's key array (m_ks bytes per page) and offset
//   array (2 bytes per page) and marks every new offset as -1
// . bumps m_maxNumPages by PAGES_PER_SEGMENT and m_numSegments by one
// . returns false on allocation failure (via log()), leaving the segment
//   count unchanged and slot n of both tables set to NULL
bool RdbMap::addSegment ( ) {
	// bytes per key
	const long ks = m_ks;
	// the new segment goes in the first unused slot
	const long n  = m_numSegments;
	// the array of segment ptrs is dynamic too! because diffbot uses
	// thousands of collections, this will save over 1GB of ram!
	if ( ! addSegmentPtr ( n ) )
		return log("db: Failed to allocate memory for adding seg ptr "
			   "for map file %s.", m_file.getFilename());
	// sizes of the two per-segment arrays. these must agree with the
	// sizes handed to mfree() when segments are released
	const long keyBytes = ks * PAGES_PER_SEGMENT;
	const long offBytes = 2  * PAGES_PER_SEGMENT;
	// allocate the new segment now
	m_keys   [n] = (char  *) mmalloc ( keyBytes , "RdbMap" );
	m_offsets[n] = (short *) mmalloc ( offBytes , "RdbMap" );
	// free up the segment on any problem
	if ( ! m_keys[n] || ! m_offsets[n] ) {
		if ( m_keys   [n] ) mfree ( m_keys   [n] , keyBytes , "RdbMap" );
		if ( m_offsets[n] ) mfree ( m_offsets[n] , offBytes , "RdbMap" );
		// set to NULL so we know if accessed illegally
		m_keys   [n] = NULL;
		m_offsets[n] = NULL;
		return log(
			   "db: Failed to allocate memory for adding to "
			   "map file %s.", m_file.getFilename());
	}
	// set all new offsets to -1
	for ( long j = 0 ; j < PAGES_PER_SEGMENT ; j++ ) m_offsets[n][j] = -1;
	// account for the new segment. without the m_numSegments bump the
	// next call would reuse slot n and leak this segment
	m_maxNumPages += PAGES_PER_SEGMENT;
	m_numSegments++;
	return true;
}
// . chop off any segment COMPLETELY before pageNum
// . if pageNum is -1 free ALL segments
// . fileHeadSize should equal MAX_PART_SIZE #define'd in BigFile.h
// . MAX_PART_SIZE is the max size of a little file that is part of a BigFile
bool RdbMap::chopHead ( long fileHeadSize ) {
// ensure fileHeadSize is valid
if ( fileHeadSize != MAX_PART_SIZE )
return log(LOG_LOGIC,"db: rdbmap: chopHead: fileHeadSize of "
"%li is invalid.", fileHeadSize );
// what segment does this page fall on?
long segNum = (fileHeadSize / m_pageSize) / PAGES_PER_SEGMENT;
// . must match exactly
// . not any more i guess, we can still have a segment that
// corresponds in part to a PART file no longer with us
//if ( fileHeadSize * m_pageSize * PAGES_PER_SEGMENT != segNum )
//return log("RdbMap::chopHead: file head isn't multiple");
// return true if nothing to delete
if ( segNum == 0 ) return true;
// . we need to call writeMap() before we exit
// . not any more! if the merge is killed or saved in the middle then
// verifyMap() will now call chopHead() until the head of the map
// matches the head PART file of the data file we map
//m_needToWrite = true;
// a helper variable
//long ks = sizeof(key_t);
long ks = m_ks;
// remove segments before segNum
for ( long i = 0 ; i < segNum ; i++ ) {
mfree ( m_keys [i] , ks * PAGES_PER_SEGMENT , "RdbMap" );
mfree ( m_offsets[i] , 2 * PAGES_PER_SEGMENT , "RdbMap" );
// set to NULL so we know if accessed illegally
m_keys [i] = NULL;
m_offsets[i] = NULL;
// adjust # of segments down
m_numSegments -= segNum;
// same with max # of used pages
m_maxNumPages -= PAGES_PER_SEGMENT * segNum ;
// same with # of used pages, since the head was ALL used
m_numPages -= PAGES_PER_SEGMENT * segNum ;
// this could be below zero if last segment was chopped
if ( m_numPages < 0 ) m_numPages = 0;
// if 0 return now
// if ( m_numSegments == 0 ) return true;
// bury the stuff we chopped
//long sk = sizeof(key_t *);
long sk = sizeof(char *);
long ss = sizeof(short *);
memmove ( &m_keys [0] , &m_keys [segNum] , m_numSegments * sk );
memmove ( &m_offsets[0] , &m_offsets[segNum] , m_numSegments * ss );
// adjust the m_fileStartOffset so getAbsoluteOffset(),... is ok
m_fileStartOffset += segNum * PAGES_PER_SEGMENT * m_pageSize;
return true;
// . attempts to auto-generate from data file, f
// . returns false and sets g_errno on error
// . refuses to run in read-only mode or if part file #0 of "f" is missing
// . a zero length file needs no map and returns true right away
// . reads the file in chunks of at most 10MB, wraps each chunk in an
//   RdbList and hands every record to addRecord()
// . a record cut off by the end of a chunk makes the next read start at
//   that record; a record larger than the buffer makes the buffer grow
//   to the record's size
// . a record size below 6, or ECORRUPTDATA from addRecord(), triggers
//   truncateFile() to drop a bad tail instead of failing outright
// . NOTE(review): this function is driven by gotos (readLoop, nextRec,
//   done) whose labels are not visible in this view; statement order
//   matters here, do not reorder
bool RdbMap::generateMap ( BigFile *f ) {
if ( g_conf.m_readOnlyMode ) return false;
// we don't support headless datafiles right now
if ( ! f->doesPartExist(0) ) {
return log("db: Cannot generate map for "
"headless data files yet.");
// scan through all the recs in f
long long offset = 0;
long long fileSize = f->getFileSize();
// if file is length 0, we don't need to do much
if ( fileSize == 0 ) return true;
// g_errno should be set on error
if ( fileSize < 0 ) return false;
// don't read in more than 10 megs at a time initially
long long bufSize = fileSize;
if ( bufSize > 10*1024*1024 ) bufSize = 10*1024*1024;
// NOTE(review): unlike the re-allocation further down, no NULL check on
// this allocation is visible before "buf" is read into -- confirm
char *buf = (char *)mmalloc ( bufSize , "RdbMap" );
// use extremes
//key_t endKey;
//key_t startKey;
char *startKey = KEYMIN();
char *endKey = KEYMAX();
// a rec needs to be at least this big
long minRecSize = 0;
// negative keys do not have the dataSize field... so undo this
if ( m_fixedDataSize == -1 ) minRecSize += 0; // minRecSize += 4;
else minRecSize += m_fixedDataSize;
//if ( m_useHalfKeys ) minRecSize += 6;
//else minRecSize += 12;
// the smallest key on disk is the compressed form: 6 bytes for 18-byte
// keys, m_ks-6 bytes when half keys are in use, else the full m_ks
if ( m_ks == 18 ) minRecSize += 6; // POSDB
else if ( m_useHalfKeys ) minRecSize += m_ks-6;
else minRecSize += m_ks;
// for parsing the lists into records
//key_t key;
char key[MAX_KEY_BYTES];
long recSize = 0;
char *rec = buf;
// file offset at which we next log progress
long long next = 0LL;
m_generatingMap = true;
// read in at most "bufSize" bytes with each read
// keep track of how many bytes read in the log
if ( offset >= next ) {
if ( next != 0 ) logf(LOG_INFO,"db: Read %lli bytes.", next );
next += 500000000; // 500MB
// our reads should always block
long long readSize = fileSize - offset;
if ( readSize > bufSize ) readSize = bufSize;
// if the readSize is less than the minRecSize, we got a bad cutoff
// so we can't go any more
if ( readSize < minRecSize ) {
mfree ( buf , bufSize , "RdbMap");
return true;
// otherwise, read it in
if ( ! f->read ( buf , readSize , offset ) ) {
mfree ( buf , bufSize , "RdbMap");
return log("db: Failed to read %lli bytes of %s at "
"offset=%lli. Map generation failed.",
// set the list
RdbList list;
list.set ( buf ,
readSize ,
buf ,
readSize ,
startKey ,
endKey ,
m_fixedDataSize ,
false , // own data?
//m_useHalfKeys );
m_useHalfKeys ,
m_ks );
// . HACK to fix useHalfKeys compression thing from one read to the nxt
// . "key" should still be set to the last record we read last read
//if ( offset > 0 ) list.m_listPtrHi = ((char *)&key)+6;
if ( offset > 0 ) list.m_listPtrHi = key+(m_ks-6);
// ... fix for posdb!!!
if ( offset > 0 && m_ks == 18 ) list.m_listPtrLo = key+(m_ks-12);
// . parse through the records in the list
// . stolen from RdbMap::addList()
rec = list.getCurrentRec ();
// fewer than 64 bytes left in this chunk and more file to go: the rec
// may be cut off, so re-read starting at it
if ( rec+64 > list.getListEnd() && offset+readSize < fileSize ) {
// set up so next read starts at this rec that MAY have been
// cut off
offset += (rec - buf);
goto readLoop;
// WARNING: when data is corrupted these may cause segmentation faults?
// NOTE(review): no statement filling "key" from the list is visible in
// this view before addRecord() uses it; the commented-out call below
// suggests one is expected -- confirm
//key = list.getCurrentKey ( );
recSize = list.getCurrentRecSize();
//rec = list.getCurrentRec ();
// don't chop keys
//if ( recSize > 1000000 ) { char *xx = NULL; *xx = 0; }
// anything under 6 bytes is treated as corruption, not only negatives
if ( recSize < 6 ) {
log("db: Got negative recsize of %li at offset=%lli "
recSize , offset + (rec-buf), m_offset );
// it truncates to m_offset!
if ( truncateFile(f) ) goto done;
return false;
// do we have a breech?
if ( rec + recSize > buf + readSize ) {
// save old
long long oldOffset = offset;
// set up so next read starts at this rec that got cut off
offset += (rec - buf);
// . if we advanced nothing, then we'll end up looping forever
// . this will == 0 too, for big recs that did not fit in our
// read but we take care of that below
// . this can happen if merge dumped out half-ass
// . the write split a record...
if ( rec - buf == 0 && recSize <= bufSize ) {
"db: Map generation failed because last record "
"in data file was split. Power failure while "
"writing? Truncating file to %lli bytes. "
"(lost %lli bytes)", offset,fileSize-offset);
// when merge resumes it call our getFileSize()
// in RdbMerge.cpp::gotLock() to set the dump offset
// otherwise, if we don't do this and write data
// in the middle of a split record AND then we crash
// without saving the map again, the second call to
// generateMap() will choke on that boundary and
// we'll lose a massive amount of data like we did
// with newspaperarchive
m_offset = offset;
goto done;
// ...we can now have huge titlerecs...
// is it something absurd? (over 40 Megabytes?)
if ( recSize > 40*1024*1024 ) {
// now just cut it short
//g_errno = ECORRUPTDATA;
"RdbMap::generateMap: Insane rec size of "
"%li bytes encountered. off=%lli. "
"data corruption? ignoring.",
recSize, offset);
//log("RdbMap::generateMap: truncating the file.");
//goto done;
// is our buf big enough to hold this type of rec?
if ( recSize > bufSize ) {
mfree ( buf , bufSize , "RdbMap");
bufSize = recSize;
buf = (char *)mmalloc ( bufSize , "RdbMap" );
if ( ! buf )
return log("db: Got error while "
"generating the map file: %s. "
// read agin starting at the adjusted offset
goto readLoop;
if ( ! addRecord ( key , rec , recSize ) ) {
// if it was key out of order, it might be because the
// power went out and we ended up writing a a few bytes of
// garbage then a bunch of 0's at the end of the file.
// if the truncate works out then we are done.
if ( g_errno == ECORRUPTDATA && truncateFile(f) ) goto done;
// otherwise, give it up
mfree ( buf , bufSize , "RdbMap");
return log("db: Map generation failed: %s.",
// skip current good record now
if ( list.skipCurrentRecord() ) goto nextRec;
// advance offset
offset += readSize;
// loop if more to go
if ( offset < fileSize ) goto readLoop;
// don't forget to free this
mfree ( buf , bufSize , "RdbMap");
// if there was bad data we probably added out of order keys
if ( m_needVerify ) {
log("db: Fixing map. Added at least %lli bad keys.",
m_needVerify = false;
// otherwise, we're done
return true;
// 5MB is a typical write buffer size, so do a little more than that
#define MAX_TRUNC_SIZE 6000000
// . cuts the data file "f" back to m_offset, the end of the last record
//   the map knows about, to drop a tail damaged by e.g. a power outage
// . the tail is everything from m_offset to the end of the file. bytes in
//   it that are zero do not count against the limit; if more than
//   MAX_TRUNC_SIZE non-zero bytes remain we refuse to truncate
// . m_offset must fall in the last or the next-to-last part file of "f".
//   in the next-to-last case the last part must be no bigger than
//   MAX_TRUNC_SIZE and it is unlinked after the truncate
// . calls f->reset() afterwards so the BigFile re-learns its size/parts
// . returns true on success; every failure returns log()'s result
bool RdbMap::truncateFile ( BigFile *f ) {
// right now just use for indexdb, datedb, tfnb, etc.
//if ( m_fixedDataSize != 0 ) return false;
// how big is the big file
long long fileSize = f->getFileSize();
// # of bytes past the last good record
long long tail = fileSize - m_offset;
//if ( tail > 20*1024*1024 )
// return log("db: Cannot truncate data file because bad tail is "
// "%lli bytes > %li.",tail,(long)MAX_TRUNC_SIZE);
// zero bytes in the tail are free to remove; only the non-zero ones are
// held to the MAX_TRUNC_SIZE limit checked below
log("db: Counting bytes that are zeroes in the tail.");
long long count = 0;
// scan the tail 100000 bytes at a time
char buf [100000];
long long off = m_offset;
long readSize = fileSize - off;
if ( readSize > 100000 ) readSize = 100000;
// NOTE(review): the same read is issued twice in a row; the result of
// this first call is ignored and only the second one is checked
f->read ( buf , readSize , off );
if ( ! f->read ( buf , readSize , off ) ) {
return log("db: Failed to read %li bytes of %s at "
// count the zero bytes
for ( long i = 0 ; i < readSize ; i++ )
if ( buf[i] == 0 ) count++;
// read more if we can
off += readSize;
if ( off < fileSize ) goto loop;
// remove those from the size of the tail
tail -= count;
// if too much remains, do not truncate it
if ( tail > MAX_TRUNC_SIZE )
return log("db: Cannot truncate data file because bad tail is "
"%lli bytes > %li. That excludes bytes that are "
"zero.",tail, (long)MAX_TRUNC_SIZE);
// how many parts does it have?
long numParts = f->getNumParts();
// what part num are we on?
long partnum = f->getPartNum ( m_offset );
File *p = f->getFile ( partnum );
if ( ! p ) return log("db: Unable to get part file.");
// get offset relative to the part file
long newSize = m_offset % (long long)MAX_PART_SIZE;
// log what we are doing
long oldSize = p->getFileSize();
long lost = oldSize - newSize;
log("db: Removing %li bytes at the end of %s. Power outage "
"probably corrupted it.",lost,p->getFilename());
log("db: Doing a truncate(%s,%li).",p->getFilename(),newSize);
// we must always be the last part or next to last part
if ( partnum != numParts-1 && partnum != numParts-2 )
return log("db: This file is not the last part or next to "
"last part for this file. aborting truncation.");
// sanity check. if we are not the last part file, but are the next
// to last one, then the last part file must be no more than
// MAX_TRUNC_SIZE bytes big
File *p2 = NULL;
if ( partnum == numParts-2 ) {
p2 = f->getFile ( partnum + 1 );
if ( ! p2 ) return log("db: Could not get next part in line.");
if ( p2->getFileSize() > MAX_TRUNC_SIZE )
return log("db: Next part file is bigger than %li "
// do the truncation
if ( truncate ( p->getFilename() , newSize ) )
// return false if had an error
return log("db: truncate(%s,%li): %s.",
// if we are not the last part, remove it
if ( partnum == numParts-2 ) {
log("db: Removing tiny last part. unlink (%s).",
// its size was already checked against MAX_TRUNC_SIZE above
if ( ! p2->unlink() )
return log("db: Unlink of tiny last part failed.");
// reset file size, parts, etc on the big file since we truncated
// a part file and possibly removed another part file
if ( ! f->reset() )
return log("db: Failed to reset %s.",f->getFilename());
// success
return true;