// open-source-search-engine/Msg3.cpp
#include "gb-include.h"
#include "Msg3.h"
#include "Rdb.h"
#include "Threads.h"
#include "Stats.h" // for timing and graphing merge time
//#include "Sync.h" // incremental syncing
//#include "Tfndb.h" // makeMaxKey()
#include "PingServer.h"
#include "Process.h"
static void doneScanningWrapper ( void *state ) ;
//bool mainShutdown ( bool urgent );
int32_t g_numIOErrors = 0;
Msg3::Msg3() {
m_alloc = NULL;
m_numScansCompleted = 0;
m_numScansStarted = 0;
}
Msg3::~Msg3() {
reset();
}
void Msg3::reset() {
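// note: writing through a NULL pointer, here and below, is this codebase's
// idiom for forcing an immediate core dump on a failed sanity check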
if ( m_numScansCompleted < m_numScansStarted ) { char *xx=NULL;*xx=0; }
m_hadCorruption = false;
// reset # of lists to 0
m_numScansCompleted = 0;
m_numScansStarted = 0;
if ( ! m_alloc ) return;
// call destructors
for ( int32_t i = 0 ; i < m_numChunks ; i++ ) m_lists[i].destructor();
if ( m_alloc == m_buf ) return;
mfree ( m_alloc , m_allocSize , "Msg3" );
m_alloc = NULL;
}
// . return false if blocked, true otherwise
// . set g_errno on error
// . read list of keys in [startKey,endKey] range
// . read at least "minRecSizes" bytes of keys in that range
// . the "m_endKey" of resulting, merged list may have a smaller endKey
// than the argument, "endKey" due to limitation by "minRecSizes"
// . resulting list will contain ALL keys between ITS [m_startKey,m_endKey]
// . final merged list "should" try to have a size of at least "minRecSizes"
// but due to negative/positive rec elimination may be less
// . the endKey of the lists we read may be <= "endKey" provided
// . we try to shrink the endKey if minRecSizes is >= 0 in order to
// avoid excessive reading
// . by shrinking the endKey we cannot take into account the size of deleted
// records, so therefore we may fall short of "minRecSizes" in actuality,
// in fact, the returned list may even be empty with a shrunken endKey
// . we merge all lists read from disk into the provided "list"
// . caller should call Msg3::getList(int32_t i) and Msg3::getNumLists() to
// retrieve them
// . this makes the query engine faster since we don't need to merge the docIds
// and can just send them across the network separately and they will be
// hashed into IndexTable's table w/o having to do time-wasting merging.
// . caller can specify array of filenums to read from so incremental syncing
// in Sync class can just read from titledb*.dat files that were formed
// since the last sync point.
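// . typical calling convention (a sketch, not code from this file; the
//   callback name is hypothetical):
//     if ( ! m_msg3.readList ( rdbId , collnum ,
//                              startKey , endKey , minRecSizes ,
//                              startFileNum , numFiles ,
//                              this , gotListWrapper ,
//                              niceness , 0 /*retryNum*/ , -1 /*maxRetries*/ ,
//                              true /*compensateForMerge*/ , -1 /*syncPoint*/ ,
//                              false /*justGetEndKey*/ , true , true ) )
//             return false; // blocked; gotListWrapper will be called later
//     // did not block; check g_errno, then use getList(i)/getNumLists()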
bool Msg3::readList ( char rdbId ,
collnum_t collnum ,
//key_t startKey ,
//key_t endKey ,
char *startKeyArg ,
char *endKeyArg ,
int32_t minRecSizes , // max size of scan
int32_t startFileNum , // first file to scan
int32_t numFiles , // rel. to startFileNum
void *state , // for callback
void (* callback ) ( void *state ) ,
int32_t niceness ,
int32_t retryNum ,
int32_t maxRetries ,
bool compensateForMerge ,
int64_t syncPoint ,
bool justGetEndKey ,
bool allowPageCache ,
bool hitDisk ) {
// clear, this MUST be done so if we return true g_errno is correct
g_errno = 0;
// assume lists are not checked for corruption
m_listsChecked = false;
// warn
if ( minRecSizes < -1 ) {
log(LOG_LOGIC,"db: Msg3 got minRecSizes of %"INT32", changing "
"to -1.",minRecSizes);
minRecSizes = -1;
}
// reset m_alloc and data in all lists in case we are a re-call
reset();
// warning
if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg3.");
// remember the callback
m_rdbId = rdbId;
m_collnum = collnum;
m_callback = callback;
m_state = state;
m_niceness = niceness;
m_numScansCompleted = 0;
m_retryNum = retryNum;
m_maxRetries = maxRetries;
m_compensateForMerge = compensateForMerge;
m_allowPageCache = allowPageCache;
m_hitDisk = hitDisk;
m_hadCorruption = false;
// get keySize of rdb
m_ks = getKeySizeFromRdbId ( m_rdbId );
// reset the group error
m_errno = 0;
// . reset all our lists
// . these are reset in the call to RdbScan::setRead() below
//for ( int32_t i = 0 ; i < MAX_RDB_FILES ; i++ ) m_lists[i].reset();
// . ensure startKey last bit clear, endKey last bit set
// . no! this warning is now only in Msg5
// . if RdbMerge is merging some files, not involving the root
// file, then we can expect to get a lot of unmatched negative recs.
// . as a consequence, our endKeys may often be negative. This means
// it may not annihilate with the positive key, but we should only
// miss like this at the boundaries of the lists we fetch.
// . so in that case RdbList::merge will stop merging once the
// minRecSizes limit is reached even if it means ending on a negative
// rec key
//if ( (startKey.n0 & 0x01) == 0x01 )
if ( !KEYNEG(startKeyArg) )
log(LOG_REMIND,"net: msg3: StartKey lastbit set.");
if ( KEYNEG(endKeyArg) )
log(LOG_REMIND,"net: msg3: EndKey lastbit clear.");
// declare vars here because of 'goto skip' below
int32_t mergeFileNum = -1 ;
int32_t max ;
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
// if caller specified exactly
/*
m_syncPoint = syncPoint;
if ( syncPoint != -1 && syncPoint != 0 ) {
// . store them all
// . what if we merged one of these files (or are merging)???
// . then sync class should not discard syncpoints until no
// longer syncing and we'll know about it
// . this should compensate for merges by including any files
// that are merging a file in m_fileNums
m_numFileNums = g_sync.getFileNums ( m_rdbId ,
m_coll ,
m_syncPoint ,
m_fileNums ,
MAX_RDB_FILES );
log("NOOOOOO. we do not alloc if we go to skip!!");
char *xx = NULL; *xx = 0;
// bring back the comment below... i removed it because i added
// "int32_t chunk" et al below and didn't want to move them.
//if ( m_numFileNums > 0 ) goto skip;
log("net: Trying to read data in %s from files generated after"
" a sync point %"UINT64" in \"sync\" file, but none found.",
base->m_dbname,m_syncPoint);
return true;
}
// should we read all?
if ( m_syncPoint == 0 ) {
numFiles = -1;
startFileNum = 0;
}
*/
// store the file numbers in the array, these are the files we read
m_numFileNums = 0;
// save startFileNum here, just for recall
m_startFileNum = startFileNum;
m_numFiles = numFiles;
// . if we have a merge going on, we may have to change startFileNum
// . if some files get unlinked because merge completes then our
// reads will detect the error and loop back here
// . we launch our reads right after this without giving up the cpu
// and we use file descriptors, so any changes to Rdb::m_files[]
// should not hurt us
// . WARNING: just make sure you don't lose control of cpu until after
// you call RdbScan::set()
// . we use hasMergeFile() instead of isMerging() because the merge may
// be suspended, or the process may have restarted without calling
// attemptMerge() yet, while the merge file is still present
if ( g_conf.m_logDebugQuery )
log(LOG_DEBUG,
"net: msg3: "
"c=%"INT32" hmf=%"INT32" sfn=%"INT32" msfn=%"INT32" nf=%"INT32" db=%s.",
(int32_t)compensateForMerge,(int32_t)base->hasMergeFile(),
(int32_t)startFileNum,(int32_t)base->m_mergeStartFileNum-1,
(int32_t)numFiles,base->m_dbname);
int32_t pre = -10;
if ( compensateForMerge && base->hasMergeFile() &&
startFileNum >= base->m_mergeStartFileNum - 1 &&
(startFileNum > 0 || numFiles != -1) ) {
// now also include the file being merged into, but only
// if we are reading from a file being merged...
if ( startFileNum < base->m_mergeStartFileNum +
base->m_numFilesToMerge - 1 )
//m_fileNums [ m_numFileNums++ ] =
// base->m_mergeStartFileNum - 1;
pre = base->m_mergeStartFileNum - 1;
// debug msg
if ( g_conf.m_logDebugQuery )
log(LOG_DEBUG,
"net: msg3: startFileNum from %"INT32" to %"INT32" (mfn=%"INT32")",
startFileNum,startFileNum+1,mergeFileNum);
// if merge file was inserted before us, inc our file number
startFileNum++;
}
// adjust num files if we need to, as well
if ( compensateForMerge && base->hasMergeFile() &&
startFileNum < base->m_mergeStartFileNum - 1 &&
numFiles != -1 &&
startFileNum + numFiles - 1 >= base->m_mergeStartFileNum - 1 ) {
// debug msg
if ( g_conf.m_logDebugQuery )
log(LOG_DEBUG,"net: msg3: numFiles up one.");
// if merge file was inserted before us, inc our file number
numFiles++;
}
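// to illustrate the two adjustments above with hypothetical numbers: if
// base->m_mergeStartFileNum is 3, the merge target sits in slot 2 and the
// merging files in slots 3 and up. a caller who asked to start at slot 2
// also gets the merge target ("pre") and has startFileNum bumped to 3; a
// caller reading slots [0,3) gets numFiles bumped to 4 to span the
// inserted merge file.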
// . how many rdb files does this base have?
// . IMPORTANT: this can change since files are unstable because they
// might have all got merged into one!
// . so do this check to make sure we're safe... especially if
// there was an error before and we called readList() on ourselves
max = base->getNumFiles();
// -1 means we should scan ALL the files in the base
if ( numFiles == -1 ) numFiles = max;
// limit it by startFileNum, however
if ( numFiles > max - startFileNum ) numFiles = max - startFileNum;
// set g_errno and return true if it is < 0
if ( numFiles < 0 ) {
log(LOG_LOGIC,
"net: msg3: readList: numFiles = %"INT32" < 0 (max=%"INT32")(sf=%"INT32")",
numFiles , max , startFileNum );
g_errno = EBADENGINEER;
// force core dump
char *xx=NULL;*xx=0;
return true;
}
// . allocate buffer space
// . m_scans, m_startpg, m_endpg, m_hintKeys, m_hintOffsets,
// m_fileNums, m_lists, m_tfns
int32_t chunk = sizeof(RdbScan) + // m_scans
4 + // m_startpg
4 + // m_endpg
//sizeof(key_t) + // m_hintKeys
m_ks + // m_hintKeys
4 + // m_hintOffsets
4 + // m_fileNums
sizeof(RdbList) + // m_lists
4 ; // m_tfns
int32_t nn = numFiles;
if ( pre != -10 ) nn++;
m_numChunks = nn;
int32_t need = nn * (chunk);
m_alloc = m_buf;
if ( need > (int32_t)MSG3_BUF_SIZE ) {
m_allocSize = need;
m_alloc = (char *)mcalloc ( need , "Msg3" );
if ( ! m_alloc ) {
log("disk: Could not allocate %"INT32" bytes read "
"structures to read %s.",need,base->m_dbname);
return true;
}
}
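// m_alloc is carved into nn parallel arrays below, one element per file
// read: [nn RdbScans][nn startpgs][nn endpgs][nn hintKeys][nn hintOffsets]
// [nn fileNums][nn RdbLists][nn tfns]. for example, with m_ks=16 each
// per-file chunk is sizeof(RdbScan)+sizeof(RdbList)+36 bytes.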
char *p = m_alloc;
m_scans = (RdbScan *)p; p += nn * sizeof(RdbScan);
m_startpg = (int32_t *)p; p += nn * 4;
m_endpg = (int32_t *)p; p += nn * 4;
//m_hintKeys = (key_t *)p; p += nn * sizeof(key_t);
m_hintKeys = (char *)p; p += nn * m_ks;
m_hintOffsets = (int32_t *)p; p += nn * 4;
m_fileNums = (int32_t *)p; p += nn * 4;
m_lists = (RdbList *)p; p += nn * sizeof(RdbList);
m_tfns = (int32_t *)p; p += nn * 4;
// sanity check
if ( p - m_alloc != need ) {
log(LOG_LOGIC,"disk: Bad malloc in Msg3.cpp.");
char *xx = NULL; *xx = 0;
}
// call constructors
for ( int32_t i = 0 ; i < nn ; i++ ) m_lists[i].constructor();
// make fix from up top
if ( pre != -10 ) m_fileNums [ m_numFileNums++ ] = pre;
// store them all
for ( int32_t i = startFileNum ; i < startFileNum + numFiles ; i++ )
m_fileNums [ m_numFileNums++ ] = i;
// we skip down to here when a syncPoint was used to set the
// m_fileNums/m_numFileNums array of files to read from
// JAB: warning abatement
// skip:
// . remove file nums that are being unlinked after a merge now
// . keep it here (below skip: label) so sync point reads can use it
int32_t n = 0;
for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
// skip those that are being unlinked after the merge
if ( base->m_isUnlinking &&
m_fileNums[i] >= base->m_mergeStartFileNum &&
m_fileNums[i] < base->m_mergeStartFileNum +
base->m_numFilesToMerge )
continue;
// otherwise, keep it
m_fileNums[n++] = m_fileNums[i];
}
m_numFileNums = n;
// . if root file is being merged, he's file #0, & root file is file #1
// . this is a hack so caller gets what he wants
//if ( startFileNum == 0 && base->getFileId(0) == 0 && numFiles == 1 )
// numFiles = 2;
// remember the file range we should scan
m_numScansStarted = 0;
m_numScansCompleted = 0;
//m_startKey = startKey;
//m_endKey = endKey;
//m_constrainKey = endKey; // set in case justGetEndKey is true
KEYSET(m_startKey,startKeyArg,m_ks);
KEYSET(m_endKey,endKeyArg,m_ks);
KEYSET(m_constrainKey,endKeyArg,m_ks); // set in case justGetEndKey is true
m_minRecSizes = minRecSizes;
m_compensateForMerge = compensateForMerge;
// bail if 0 files to scan -- no! need to set startKey/endKey
if ( numFiles == 0 ) return true;
// don't read anything if endKey < startKey
//if ( m_startKey > m_endKey ) return true;
if ( KEYCMP(m_startKey,m_endKey,m_ks)>0 ) return true;
// keep the original intact in case g_errno == ETRYAGAIN
//m_endKeyOrig = endKey;
KEYSET(m_endKeyOrig,endKeyArg,m_ks);
m_minRecSizesOrig = minRecSizes;
// start reading at this key
m_fileStartKey = startKeyArg;
// start the timer, keep it fast for clusterdb though
if ( g_conf.m_logTimingDb ) m_startTime = gettimeofdayInMilliseconds();
// translate base to an id, for the sake of m_msg0
//char baseId = m_msg0->getRdbId ( base );
// map ptrs
RdbMap **maps = base->getMaps();
// . we now boost m_minRecSizes to account for negative recs
// . but not if only reading one list, cuz it won't get merged and
// it will be too big to send back
if ( m_numFileNums > 1 ) compensateForNegativeRecs ( base );
// . often endKey is too big for an efficient read of minRecSizes bytes
// because we end up reading too much from all the files
// . this will set m_startpg[i], m_endpg[i] for each RdbScan/RdbFile
// to ensure we read "minRecSizes" worth of records, not much more
// . returns the new endKey for all ranges
// . now this just overwrites m_endKey
//m_endKey = setPageRanges ( base ,
setPageRanges ( base ,
m_fileNums ,
m_numFileNums ,
m_fileStartKey , // start reading @ key
m_endKey , // stop reading @ key
m_minRecSizes );
// . NEVER let m_endKey be a negative key, because it will
// always be unmatched, since delbit is cleared
// . adjusting it here ensures our generated hints are valid
// . we will use this key to call constrain() with
//m_constrainKey = m_endKey;
//if ( ( m_constrainKey.n0 & 0x01) == 0x00 )
// m_constrainKey -= (uint32_t)1;
KEYSET(m_constrainKey,m_endKey,m_ks);
if ( KEYNEG(m_constrainKey) )
KEYSUB(m_constrainKey,1,m_ks);
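// (the low bit of a key is the delbit: set for positive keys, clear for
// negative ones, so the KEYSUB above turns a negative endKey into the
// positive key just beneath it)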
// if m_endKey splits some keys that should be together, we need to
// decrease it so such a split doesn't happen.
//if ( m_endKey != m_endKeyOrig && m_rdbId==RDB_TFNDB && numFiles > 0){
/*
if ( KEYCMP(m_endKey,m_endKeyOrig,m_ks)!=0 && m_rdbId==RDB_TFNDB &&
numFiles > 0 ) {
// . drop the docid down one and max out the tfn...
// . we may lose some recs when we call constrain, but at least
// we are guaranteed not to split a sequence with the same
// docid but different tfns... thus the disk merge will
// then work correctly. before we were splitting these
// sequence between successive disk reads and they were not
// getting annihilated together in the call to indexMerge_r()
int64_t d = g_tfndb.getDocId ( (key_t *)&m_endKey );
if ( d > 0 ) d = d - 1LL;
//m_constrainKey = g_tfndb.makeMaxKey(d);
*(key_t *)m_constrainKey = g_tfndb.makeMaxKey(d);
// set the half bit on
//m_constrainKey.n0 |= 0x02;
*m_constrainKey |= 0x02;
// note it
//logf(LOG_DEBUG,"oldukey.n1=%"XINT32" n0=%"XINT64" new.n1=%"XINT32" n0=%"XINT64"",
// m_endKey.n1,m_endKey.n0,
// m_constrainKey.n1,m_constrainKey.n0);
}
*/
// Msg5 likes to get the endkey for getting the list from the tree
if ( justGetEndKey ) return true;
// sanity check
if ( m_numFileNums > nn ) {
log(LOG_LOGIC,"disk: Failed sanity check in Msg3.");
char *xx = NULL; *xx = 0;
}
// debug msg
//log("msg3 getting list (msg5=%"UINT32")",m_state);
// . MDW removed this -- go ahead and end on a delete key
// . RdbMerge might not pick it up this round, but oh well
// . so we can have both positive and negative co-existing in same file
// make sure the last bit is set so we don't end on a delete key
//m_endKey.n0 |= 0x01LL;
// . now start reading/scanning the files
// . our m_scans array starts at 0
for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
// get the page range
//int32_t p1 = m_startpg [ i ];
//int32_t p2 = m_endpg [ i ];
//#ifdef GBSANITYCHECK
int32_t fn = m_fileNums[i];
// this can happen somehow!
if ( fn < 0 ) {
log(LOG_LOGIC,"net: msg3: fn=%"INT32". Bad engineer.",fn);
continue;
}
// sanity check
if ( i > 0 && m_fileNums[i-1] >= fn ) {
log(LOG_LOGIC,
"net: msg3: files must be read in order "
"from oldest to newest so RdbList::indexMerge_r "
"works properly. Otherwise, corruption will "
"result. ");
char *xx = NULL; *xx = 0;
return true;
}
// . sanity check?
// . no, we must get again since we turn on endKey's last bit
int32_t p1 , p2;
maps[fn]->getPageRange ( m_fileStartKey ,
m_endKey ,
&p1 ,
&p2 ,
NULL );
//if ( p1 != p1c || p2 != p2c ) {
// fprintf(stderr,"Msg3::bad page range\n");
// sleep(50000);
//}
// sanity check, each endpg's key should be > endKey
//if ( p2 < maps[fn]->getNumPages() &&
// maps[fn]->getKey ( p2 ) <= m_endKey ) {
// fprintf(stderr,"Msg3::bad page range 2\n");
// sleep(50000);
//}
//#endif
//int32_t p1 , p2;
//maps[fn]->getPageRange (startKey,endKey,minRecSizes,&p1,&p2);
// now get some read info
int64_t offset = maps[fn]->getAbsoluteOffset ( p1 );
int32_t bytesToRead = maps[fn]->getRecSizes ( p1, p2, false);
// max out the endkey for this list
// debug msg
//#ifdef _DEBUG_
//if ( minRecSizes == 2000000 )
//log("Msg3:: reading %"INT32" bytes from file #%"INT32"",bytesToRead,i);
//#endif
// inc our m_numScans
m_numScansStarted++;
// . keep stats on our disk accesses
// . count disk seeks (assuming no fragmentation)
// . count disk bytes read
if ( bytesToRead > 0 ) {
base->m_rdb->didSeek ( );
base->m_rdb->didRead ( bytesToRead );
}
// . the startKey may be different for each RdbScan class
// . RdbLists must have all keys within their [startKey,endKey]
// . therefore set startKey individually from first page in map
// . this endKey must be >= m_endKey
// . this startKey must be < m_startKey
//key_t startKey = maps[fn]->getKey ( p1 );
//key_t endKey = maps[fn]->getKey ( p2 );
char startKey2 [ MAX_KEY_BYTES ];
char endKey2 [ MAX_KEY_BYTES ];
maps[fn]->getKey ( p1 , startKey2 );
maps[fn]->getKey ( p2 , endKey2 );
//char *startKey = maps[fn]->getKeyPtr ( p1 );
//char *endKey = maps[fn]->getKeyPtr ( p2 );
// store in here
m_startpg [ i ] = p1;
m_endpg [ i ] = p2;
// . we read UP TO that endKey, so reduce by 1
// . but iff p2 is NOT the last page in the map/file
// . maps[fn]->getKey(lastPage) will return the LAST KEY
// and maps[fn]->getOffset(lastPage) the length of the file
//if ( maps[fn]->getNumPages()!=p2) endKey -=(uint32_t)1;
if ( maps[fn]->getNumPages() != p2 ) KEYSUB(endKey2,1,m_ks);
// otherwise, if we're reading all pages, then force the
// endKey to virtual infinity
//else endKey.setMax();
else KEYMAX(endKey2,m_ks);
// . set up the hints
// . these are only used if we are only reading from 1 file
// . these are used to call constrain() so we can constrain
// the end of the list w/o looping through all the recs
// in the list
int32_t h2 = p2 ;
// decrease by one page if we're on the last page
if ( h2 > p1 && maps[fn]->getNumPages() == h2 ) h2--;
// . decrease hint page until key is <= endKey on that page
// AND offset is NOT -1 because the old way would give
// us hints past the endkey
// . also decrease so we can constrain on minRecSizes in
// case we're the only list being read
// . use >= m_minRecSizes instead of >, otherwise we may
// never be able to set "size" in RdbList::constrain()
// because "p" could equal "maxPtr" right away
while ( h2 > p1 &&
//( maps[fn]->getKey (h2) > m_constrainKey ||
(KEYCMP(maps[fn]->getKeyPtr(h2),m_constrainKey,m_ks)>0||
maps[fn]->getOffset(h2) == -1 ||
maps[fn]->getAbsoluteOffset(h2) - offset >=
m_minRecSizes ) )
h2--;
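// h2 now sits at the last page at or after p1 whose first key is
// <= m_constrainKey, whose offset is valid, and whose distance from our
// read offset is under m_minRecSizes, so RdbList::constrain() can seek
// to the hint instead of walking every rec in the list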
// now set the hint
m_hintOffsets [ i ] = maps[fn]->getAbsoluteOffset ( h2 ) -
maps[fn]->getAbsoluteOffset ( p1 ) ;
//m_hintKeys [ i ] = maps[fn]->getKey ( h2 );
KEYSET(&m_hintKeys[i*m_ks],maps[fn]->getKeyPtr(h2),m_ks);
// reset g_errno before calling setRead()
g_errno = 0;
// . this fix is now in RdbList::checklist_r()
// . we can now have dup keys, so, we may read in
// a rec with key "lastMinKey" even though we don't read
// in the first key on the end page, so don't subtract 1...
//if ( endKey != m_endKeyOrig )
// endKey += (uint32_t) 1;
// timing debug
if ( g_conf.m_logTimingDb )
log(LOG_TIMING,
"net: msg: reading %"INT32" bytes from %s file #%"INT32" "
"(niceness=%"INT32")",
bytesToRead,base->m_dbname,i,m_niceness);
// set the tfn
if ( m_rdbId == RDB_TITLEDB )
m_tfns[i] = base->getFileId2(m_fileNums[i]);
// log huge reads, those hurt us
if ( bytesToRead > 150000000 ) {
logf(LOG_INFO,"disk: Reading %"INT32" bytes at offset %"INT64" "
"from %s.",
bytesToRead,offset,base->m_dbname);
}
// if any keys in the map are the same report corruption
char tmpKey [16];
char lastTmpKey[16];
int32_t ccount = 0;
if ( bytesToRead > 10000000 &&
bytesToRead / 2 > m_minRecSizes &&
base->m_fixedDataSize >= 0 ) {
for ( int32_t pn = p1 ; pn <= p2 ; pn++ ) {
maps[fn]->getKey ( pn , tmpKey );
// (skip the first page: lastTmpKey is uninitialized until then)
if ( pn > p1 && KEYCMP(tmpKey,lastTmpKey,m_ks) == 0 )
ccount++;
gbmemcpy(lastTmpKey,tmpKey,m_ks);
}
}
if ( ccount > 10 ) {
logf(LOG_INFO,"disk: Reading %"INT32" bytes from %s file #"
"%"INT32" when min "
"required is %"INT32". Map is corrupt and has %"INT32" "
"identical consecutive page keys because the "
"map was \"repaired\" because of out of order keys "
"in the index.",
(int32_t)bytesToRead,
base->m_dbname,fn,
(int32_t)m_minRecSizes,
(int32_t)ccount);
m_numScansCompleted++;
m_errno = ECORRUPTDATA;
m_hadCorruption = true;
//m_maxRetries = 0;
break;
}
// . do the scan/read of file #i
// . this returns false if blocked, true otherwise
// . this will set g_errno on error
bool done = m_scans[i].setRead (base->getFile(m_fileNums[i]),
base->m_fixedDataSize ,
offset ,
bytesToRead ,
startKey2 ,
endKey2 ,
m_ks ,
&m_lists[i] ,
this ,
doneScanningWrapper ,
base->useHalfKeys() ,
m_rdbId,
m_niceness ,
m_allowPageCache ,
m_hitDisk ) ;
// . damn, usually the above will indirectly launch a thread
// to do the reading, but it sets g_errno to EINTR,
// "interrupted system call"!
// . i guess the thread does the read w/o blocking and then
// queues the signal on g_loop's queue before it exits
// . try ignoring, and keep going
if ( g_errno == EINTR ) {
log("net: Interrupted system call while reading file. "
"Ignoring.");
g_errno = 0;
}
// debug msg
//fprintf(stderr,"Msg3:: reading %"INT32" bytes from file #%"INT32","
// "done=%"INT32",offset=%"INT64",g_errno=%s,"
// "startKey=n1=%"UINT32",n0=%"UINT64", "
// "endKey=n1=%"UINT32",n0=%"UINT64"\n",
// bytesToRead,i,(int32_t)done,offset,mstrerror(g_errno),
// m_startKey,m_endKey);
//if ( bytesToRead == 0 )
// fprintf(stderr,"shit\n");
// if it did not block then it completed, so count it
if ( done ) m_numScansCompleted++;
// break on an error, and remember g_errno in case we block
if ( g_errno && g_errno != ENOTHREADSLOTS ) {
int32_t tt = LOG_WARN;
if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
log(tt,"disk: Reading %s had error: %s.",
base->m_dbname, mstrerror(g_errno));
m_errno = g_errno;
break;
}
}
// debug test
//if ( rand() % 100 <= 10 ) m_errno = EIO;
// if we blocked, return false
if ( m_numScansCompleted < m_numScansStarted ) return false;
// . if all scans completed without blocking then wrap it up & ret true
// . doneScanning may now block if it finds data corruption and must
// get the list remotely
return doneScanning();
}
void doneScanningWrapper ( void *state ) {
Msg3 *THIS = (Msg3 *) state;
// inc the scan count
THIS->m_numScansCompleted++;
// we decided to try to ignore these errors
if ( g_errno == EINTR ) {
log("net: Interrupted system call while reading file. "
"Ignoring.");
g_errno = 0;
}
// if we had an error, remember it
if ( g_errno ) {
// get base, returns NULL and sets g_errno to ENOCOLLREC on err
RdbBase *base; base=getRdbBase(THIS->m_rdbId,THIS->m_collnum);
char *dbname = "NOT FOUND";
if ( base ) dbname = base->m_dbname;
int32_t tt = LOG_WARN;
if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
log(tt,"net: Reading %s had error: %s.",
dbname,mstrerror(g_errno));
THIS->m_errno = g_errno;
g_errno = 0;
}
// return now if we're awaiting more scan completions
if ( THIS->m_numScansCompleted < THIS->m_numScansStarted ) return;
// . give control to doneScanning
// . return if it blocks
if ( ! THIS->doneScanning() ) return;
// if one of our lists was *huge* and could not alloc mem, it was
// due to corruption
if ( THIS->m_hadCorruption ) g_errno = ECORRUPTDATA;
// if it doesn't block call the callback, g_errno may be set
THIS->m_callback ( THIS->m_state );
}
static void doneSleepingWrapper3 ( int fd , void *state ) ;
// . but now that we may get a list remotely to fix data corruption,
// this may indeed block
bool Msg3::doneScanning ( ) {
QUICKPOLL(m_niceness);
// . did we have any error on any scan?
// . if so, repeat ALL of the scans
g_errno = m_errno;
// 2 retries is the default
int32_t max = 2;
// see if explicitly provided by the caller
if ( m_maxRetries >= 0 ) max = m_maxRetries;
// now use -1 (no max) as the default no matter what
max = -1;
// ENOMEM is particularly contagious, so watch out with it...
if ( g_errno == ENOMEM && m_maxRetries == -1 ) max = 0;
// msg0 sets maxRetries to 2, don't let max stay set to -1
if ( g_errno == ENOMEM && m_maxRetries != -1 ) max = m_maxRetries;
// when thread cannot alloc enough read buf it keeps the read buf
// set to NULL and BigFile.cpp sets g_errno to EBUFTOOSMALL
if ( g_errno == EBUFTOOSMALL && m_maxRetries == -1 ) max = 0;
// msg0 sets maxRetries to 2, don't let max stay set to -1
if ( g_errno == EBUFTOOSMALL && m_maxRetries != -1 ) max = m_maxRetries;
// . if no thread slots available, that hogs up serious memory.
// the size of Msg3 is 82k, so having just 5000 of them is ~410MB.
// . i just made Msg3 alloc mem when it needs more than about 2k
// so this problem is greatly reduced, therefore let's keep
// retrying... forever if no thread slots in thread queue since
// we become the thread queue in a way.
if ( g_errno == ENOTHREADSLOTS ) max = -1;
// this is set above if the map has the same consecutive key repeated
// and the read is enormous
if ( g_errno == ECORRUPTDATA ) max = 0;
// usually bad disk failures, don't retry those forever
//if ( g_errno == EIO ) max = 3;
// no, now our hitachis return these even when they're good so
// we have to keep retrying forever
if ( g_errno == EIO ) max = -1;
// count these so we do not take drives offline just because
// kernel ring buffer complains...
if ( g_errno == EIO ) g_numIOErrors++;
// bail early on high priority reads for these errors
if ( g_errno == EDISKSTUCK && m_niceness == 0 ) max = 0;
if ( g_errno == EIO && m_niceness == 0 ) max = 0;
// how does this happen? we should never bail out on a low priority
// disk read... we just wait for it to complete...
if ( g_errno == EDISKSTUCK && m_niceness != 0 ) { char *xx=NULL;*xx=0;}
// on I/O, give up and call it corrupt after a while. some hitachis
// have I/O errors on little spots, like gk88, maybe we can fix him
if ( g_errno == EIO && m_retryNum >= 5 ) {
m_errno = ECORRUPTDATA;
m_hadCorruption = true;
// do not do any retries any more
max = 0;
}
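// at this point "max" encodes the retry policy: -1 means retry forever,
// 0 means give up (or report corruption) right away, and any positive
// value is an explicit cap on retries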
// convert m_errno to ECORRUPTDATA if it is EBUFTOOSMALL and the
// max of the bytesToRead is over 500MB.
// if bytesToRead was ludicrous, then assume that the data file
// was corrupted, the map was regenerated and it patched
// over the corrupted bits which were 500MB or more in size.
// we cannot practically allocate that much, so let's just
// give back an empty buffer. treat it like corruption...
// the way it patches is to store the same key over all the corrupted
// pages, which can get pretty big. so if you read a range with that
// key you will be hurting!!
// this may be the same scenario as when the rdbmap has consecutive
// same keys. see above where we set m_errno to ECORRUPTDATA...
if ( g_errno == EBUFTOOSMALL ) {
int32_t biggest = 0;
for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
if ( m_scans[i].m_bytesToRead < biggest ) continue;
biggest = m_scans[i].m_bytesToRead;
}
if ( biggest > 500000000 ) {
log("db: Max read size was %"INT32" > 500000000. Assuming "
"corrupt data in data file.",biggest);
m_errno = ECORRUPTDATA;
m_hadCorruption = true;
// do not do any retries on this, the read was > 500MB
max = 0;
}
}
// if shutting down gb then force retries to 0 so we can shutdown, because
// it can't shutdown until all threads are out of the queue i think
if ( g_process.m_mode == EXIT_MODE && max < 0 ) {
//log("msg3: forcing retries to 0 because shutting down");
max = 0;
}
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
// this really slows things down because it blocks the cpu so
// leave it out for now
#ifdef GBSANITYCHECK
// check for corruption here, do not do it again in Msg5 if we pass
if ( ! g_errno ) { // && g_conf.m_doErrorCorrection ) {
int32_t i;
for ( i = 0 ; i < m_numFileNums ; i++ )
if ( ! m_lists[i].checkList_r ( false, false ) ) break;
if ( i < m_numFileNums ) {
g_errno = ECORRUPTDATA;
m_errno = ECORRUPTDATA;
max = g_conf.m_corruptRetries; // try 100 times
log("db: Encountered corrupt list in file %s.",
base->getFile(m_fileNums[i])->getFilename());
}
else
m_listsChecked = true;
}
#endif
// . if we had a ETRYAGAIN error, then try again now
// . it usually means the whole file or a part of it was deleted
// before we could finish reading it, so we should re-read all now
// . RdbMerge deletes BigFiles after it merges them and also chops
// off file heads
// . now that we have threads i'd imagine we'd get EBADFD or something
// . i've also seen "illegal seek" as well
if ( m_errno && (m_retryNum < max || max < 0) &&
// this will complete in due time, we can't call a sleep wrapper
// on it because the read is really still pending...
m_errno != EDISKSTUCK ) {
// print the error
static time_t s_time = 0;
time_t now = getTime();
if ( now - s_time > 5 || g_errno != ENOTHREADSLOTS ) {
log("net: Had error reading %s: %s. Retrying. "
"(retry #%"INT32")",
base->m_dbname,mstrerror(g_errno) , m_retryNum );
s_time = now;
}
// send email alert if in an infinite loop, but don't send
// more than once every 2 hours
static int32_t s_lastSendTime = 0;
if ( m_retryNum == 100 && getTime() - s_lastSendTime > 3600*2){
// remove this for now it is going off all the time
//g_pingServer.sendEmail(NULL,//g_hostdb.getMyHost(),
// "100 read retries",true);
s_lastSendTime = getTime();
}
// clear g_errno for the call to readList()
g_errno = 0;
// free the list buffer since if we have 1000 Msg3s retrying
// it will totally use all of our memory
for ( int32_t i = 0 ; i < m_numChunks ; i++ )
m_lists[i].destructor();
// count retries
m_retryNum++;
// backoff scheme, wait 200ms more each time
int32_t wait ;
if ( m_retryNum == 1 ) wait = 10;
else wait = 200 * m_retryNum;
// . don't wait more than 10 secs between tries
// . i've seen gf0 and gf16 get mega saturated
if ( wait > 10000 ) wait = 10000;
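// so the wait sequence runs 10ms, 400ms, 600ms, 800ms, ... and hits the
// 10 second ceiling at retry #50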
// register a sleep callback for "wait" ms
if ( g_loop.registerSleepCallback ( wait , // ms
this ,
doneSleepingWrapper3,
m_niceness))
return false;
// otherwise, registration failed
log(
"net: Failed to register sleep callback for retry. "
"Abandoning read. This is bad.");
// return, g_errno should be set
g_errno = EBUFTOOSMALL;
m_errno = EBUFTOOSMALL;
return true;
}
// if we got an error and should not retry any more then give up
if ( g_errno ) {
log(
"net: Had error reading %s: %s. Giving up after %"INT32" "
"retries.",
base->m_dbname,mstrerror(g_errno) , m_retryNum );
return true;
}
// note it if the retry finally worked
if ( m_retryNum > 0 )
log(LOG_INFO,"disk: Read succeeded after retrying %"INT32" times.",
(int32_t)m_retryNum);
// count total bytes for logging
int32_t count = 0;
// . constrain all lists to make merging easier
// . if we have only one list, then that's nice cuz the constrain
// will allow us to send it right away w/ zero copying
// . if we have only 1 list, it won't be merged into a final list,
// that is, we'll just set m_list = &m_lists[i]
for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
QUICKPOLL(m_niceness);
// count total bytes for logging
count += m_lists[i].getListSize();
// . hint offset is relative to the offset of first key we read
// . if that key was only 6 bytes RdbScan shifts the list buf
// down 6 bytes to make the first key 12 bytes... a
// requirement for all RdbLists
// . don't inc it, though, if it was 0, pointing to the start
// of the list because our shift won't affect that
if ( m_scans[i].m_shifted == 6 && m_hintOffsets[i] > 0 )
m_hintOffsets[i] += 6;
// posdb double compression
if ( m_scans[i].m_shifted == 12 && m_hintOffsets[i] > 0 )
m_hintOffsets[i] += 12;
// . don't constrain on minRecSizes here because it may
// make our endKey smaller, which will cause problems
// when Msg5 merges these lists.
// . If all lists have different endKeys RdbList's merge
// chooses the min and will merge in recs beyond that
// causing a bad list BECAUSE we don't check to make
// sure that recs we are adding are below the endKey
// . if we only read from one file then constrain based
// on minRecSizes so we can send the list back w/o merging
// OR if just merging with RdbTree's list
int32_t mrs ;
// . constrain to m_minRecSizesOrig, not m_minRecSizes cuz
// that could be adjusted by compensateForNegativeRecs()
// . but, really, they should be the same if we only read from
// the root file
if ( m_numFileNums == 1 ) mrs = m_minRecSizesOrig;
else mrs = -1;
// . this returns false and sets g_errno on error
// . like if data is corrupt
BigFile *ff = base->getFile(m_fileNums[i]);
if ( ! m_lists[i].constrain ( m_startKey ,
m_constrainKey , // m_endKey
mrs , // m_minRecSizes
m_hintOffsets[i] ,
//m_hintKeys [i] ,
&m_hintKeys [i*m_ks] ,
ff->getFilename() ,
m_niceness ) ) {
log("net: Had error while constraining list read from "
"%s: %s/%s. vfd=%"INT32" parts=%"INT32". "
"This is likely caused by corrupted "
"data on disk.",
mstrerror(g_errno), ff->m_dir ,
ff->getFilename(), ff->m_vfd ,
(int32_t)ff->m_numParts );
}
}
// print the time
if ( g_conf.m_logTimingDb ) {
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_startTime;
log(LOG_TIMING,
"net: Took %"INT64" ms to read %"INT32" lists of %"INT32" bytes total"
" from %s (niceness=%"INT32").",
took,m_numFileNums,count,base->m_dbname,m_niceness);
}
return true;
}
void doneSleepingWrapper3 ( int fd , void *state ) {
Msg3 *THIS = (Msg3 *)state;
// now try reading again
if ( ! THIS->doneSleeping ( ) ) return;
// if it doesn't block call the callback, g_errno may be set
THIS->m_callback ( THIS->m_state );
}
bool Msg3::doneSleeping ( ) {
// unregister
g_loop.unregisterSleepCallback(this,doneSleepingWrapper3);
// read again
if ( ! readList ( m_rdbId ,
m_collnum ,
m_startKey ,
m_endKeyOrig ,
m_minRecSizesOrig ,
m_startFileNum ,
m_numFiles ,
m_state ,
m_callback ,
m_niceness ,
m_retryNum ,
m_maxRetries ,
m_compensateForMerge ,
-1,//m_syncPoint ,
false ,
m_allowPageCache ,
m_hitDisk ) ) return false;
return true;
}
// . returns a new, smaller endKey
// . shrinks endKey while still preserving the minRecSizes requirement
// . this is the most confusing subroutine in the project
// . this now OVERWRITES endKey with the new one
//key_t Msg3::setPageRanges ( RdbBase *base ,
void Msg3::setPageRanges ( RdbBase *base ,
int32_t *fileNums ,
int32_t numFileNums ,
//key_t startKey ,
//key_t endKey ,
char *startKey ,
char *endKey ,
int32_t minRecSizes ) {
// sanity check
//if ( m_ks != 12 && m_ks != 16 ) { char *xx=NULL;*xx=0; }
// get the file maps from the rdb
RdbMap **maps = base->getMaps();
// . initialize the startpg/endpg for each file
// . we read from the first offset on m_startpg to offset on m_endpg
// . since we set them equal that means an empty range for each file
for ( int32_t i = 0 ; i < numFileNums ; i++ ) {
int32_t fn = fileNums[i];
if ( fn < 0 ) { char *xx = NULL; *xx = 0; }
m_startpg[i] = maps[fn]->getPage( startKey );
m_endpg [i] = m_startpg[i];
}
// just return if minRecSizes 0 (no reading needed)
//if ( minRecSizes <= 0 ) return endKey ;
if ( minRecSizes <= 0 ) return;
// calculate minKey minus one
//key_t lastMinKey ;
char lastMinKey[MAX_KEY_BYTES];
char lastMinKeyIsValid = 0;
// loop until we find the page ranges that barely satisfy "minRecSizes"
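// each pass of the loop below runs a "race" across the maps: find the
// file whose next map page has the smallest key, advance every file's
// m_endpg up to (but not past) that key, and tally the bytes covered so
// far; once the tally reaches minRecSizes we clamp endKey down to
// lastMinKey and return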
loop:
// find the map whose next page has the lowest key
int32_t minpg = -1;
//key_t minKey;
char minKey[MAX_KEY_BYTES];
for ( int32_t i = 0 ; i < numFileNums ; i++ ) {
int32_t fn = fileNums[i];
// this guy is out of race if his end key > "endKey" already
//if ( maps[fn]->getKey ( m_endpg[i] ) > endKey ) continue;
if(KEYCMP(maps[fn]->getKeyPtr(m_endpg[i]),endKey,m_ks)>0)
continue;
// get the next page after m_endpg[i]
int32_t nextpg = m_endpg[i] + 1;
// if endpg[i]+1 == m_numPages then we maxed out this range
if ( nextpg > maps[fn]->getNumPages() ) continue;
// . but this may have an offset of -1
// . which means the page has no key starting on it and
// it's occupied by a rec which starts on a previous page
while ( nextpg < maps[fn]->getNumPages() &&
maps[fn]->getOffset ( nextpg ) == -1 ) nextpg++;
// . continue if his next page doesn't have the minimum key
// . if nextpg == getNumPages() then it returns the LAST KEY
// contained in the corresponding RdbFile
//if ( minpg != -1 && maps[fn]->getKey ( nextpg ) > minKey )
if (minpg != -1 &&
KEYCMP(maps[fn]->getKeyPtr(nextpg),minKey,m_ks)>0)continue;
// . we got a winner, his next page has the current min key
// . if m_endpg[i]+1 == getNumPages() then getKey() returns the
// last key in the mapped file
// . minKey should never equal the key on m_endpg[i] UNLESS
// it's on page #m_numPages
//minKey = maps[fn]->getKey ( nextpg );
KEYSET(minKey,maps[fn]->getKeyPtr(nextpg),m_ks);
minpg = i;
// if minKey is same as the current key on this endpg, inc it
// so we cause some advancement, otherwise, we'll loop forever
//if ( minKey != maps[fn]->getKey ( m_endpg[i] ) ) continue;
if ( KEYCMP(minKey,maps[fn]->getKeyPtr(m_endpg[i]),m_ks)!=0)
continue;
//minKey += (uint32_t) 1;
KEYADD(minKey,1,m_ks);
}
// . we're done if we hit the end of all maps in the race
// . return the max end key
// key_t maxEndKey; maxEndKey.setMax(); return maxEndKey; }
// . no, just the endKey
//if ( minpg == -1 ) return endKey;
if ( minpg == -1 ) return;
// sanity check
if ( lastMinKeyIsValid && KEYCMP(minKey,lastMinKey,m_ks)<=0 ) {
g_errno = ECORRUPTDATA;
log("db: Got corrupted map in memory for %s. This is almost "
"always because of bad memory. Please replace your RAM.",
base->m_dbname);
// do not wait for any merge to complete... otherwise
// Rdb.cpp will not close until the merge is done
g_merge.m_isMerging = false;
g_merge2.m_isMerging = false;
// shutdown with urgent=true so threads are disabled.
//mainShutdown(true);
g_process.shutdown(true);
//g_numCorrupt++;
// sleep for now until we make sure this works
//sleep(2000);
return;
}
// don't let minKey exceed endKey, however
//if ( minKey > endKey ) {
if ( KEYCMP(minKey,endKey,m_ks)>0 ) {
//minKey = endKey ;
//minKey += (uint32_t) 1;
//lastMinKey = endKey;
KEYSET(minKey,endKey,m_ks);
KEYADD(minKey,1,m_ks);
KEYSET(lastMinKey,endKey,m_ks);
}
else {
//lastMinKey = minKey ;
//lastMinKey -= (uint32_t) 1;
KEYSET(lastMinKey,minKey,m_ks);
KEYSUB(lastMinKey,1,m_ks);
}
// it is now valid
lastMinKeyIsValid = 1;
// . advance m_endpg[i] so that next page < minKey
// . we want to read UP TO the first key on m_endpg[i]
for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
int32_t fn = fileNums[i];
m_endpg[i] = maps[fn]->getEndPage ( m_endpg[i], lastMinKey );
}
// . if the minKey is BIGGER than the provided endKey we're done
// . we don't necessarily include records whose key is "minKey"
//if ( minKey > endKey ) return endKey;
if ( KEYCMP(minKey,endKey,m_ks)>0) return;
// . calculate recSizes per page within [startKey,minKey-1]
// . compute bytes of records in [startKey,minKey-1] for each map
// . this includes negative records so we may have annihilations
// when merging into "diskList" and get less than what we wanted
// but endKey should be shortened, so our caller will know to call
// again if he wants more
int32_t recSizes = 0;
for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
int32_t fn = fileNums[i];
recSizes += maps[fn]->getMinRecSizes ( m_startpg[i] ,
m_endpg [i] ,
startKey ,
lastMinKey ,
false );
}
// if we hit it then return minKey -1 so we only read UP TO "minKey"
// not including "minKey"
//if ( recSizes >= minRecSizes )
if ( recSizes >= minRecSizes ) {
// . sanity check
// . this sanity check fails sometimes, but leave it
// out for now... causes the Illegal endkey msgs in
// RdbList::indexMerge_r()
//if ( KEYNEG(lastMinKey) ) { char *xx=NULL;*xx=0; }
KEYSET(endKey,lastMinKey,m_ks);
//return lastMinKey;
return;
}
// keep on truckin'
goto loop;
}
// . we now boost m_minRecSizes to account for negative recs in certain files
// . TODO: use floats for averages, not ints
void Msg3::compensateForNegativeRecs ( RdbBase *base ) {
// get the file maps from the rdb
RdbMap **maps = base->getMaps();
// add up counts from each map
int64_t totalNegatives = 0;
int64_t totalPositives = 0;
int64_t totalFileSize = 0;
for (int32_t i = 0 ; i < m_numFileNums ; i++) {
int32_t fn = m_fileNums[i];
// . this cored on me before when fn was -1, how'd that happen?
// . it happened right after startup before a merge should
// have been attempted
if ( fn < 0 ) {
log(LOG_LOGIC,"net: msg3: fn=%"INT32". bad engineer.",fn);
continue;
}
totalNegatives += maps[fn]->getNumNegativeRecs();
totalPositives += maps[fn]->getNumPositiveRecs();
totalFileSize += maps[fn]->getFileSize();
}
// add em all up
int64_t totalNumRecs = totalNegatives + totalPositives;
// if we have no records on disk, why are we reading from disk?
if ( totalNumRecs == 0 ) return ;
// what is the size of a negative record?
//int32_t negRecSize = sizeof(key_t);
int32_t negRecSize = m_ks;
if ( base->getFixedDataSize() == -1 ) negRecSize += 4;
// what is the size of all positive recs combined?
int64_t posFileSize = totalFileSize - negRecSize * totalNegatives;
// . we often overestimate the size of the negative recs for indexdb
// because it uses half keys...
// . this can make posFileSize go negative and ultimately result in
// a newMin of 0x7fffffff which really fucks us up
if ( posFileSize < 0 ) posFileSize = 0;
// what is the average size of a positive record?
int32_t posRecSize = 0;
if ( totalPositives > 0 ) posRecSize = posFileSize / totalPositives;
// we annihilate the negative recs and their positive pairs
int64_t loss = totalNegatives * (negRecSize + posRecSize);
// what is the percentage lost?
int64_t lostPercent = (100LL * loss) / totalFileSize;
// how much more should we read to compensate?
int32_t newMin = ((int64_t)m_minRecSizes * (lostPercent + 100LL))/100LL;
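// e.g. if annihilations cost ~10% of the file (lostPercent = 10) then
// newMin = m_minRecSizes * 110 / 100, i.e. we read about 10% extra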
// newMin will never be smaller unless it overflows
if ( newMin < m_minRecSizes ) newMin = 0x7fffffff;
// print msg if we changed m_minRecSizes
//if ( newMin != m_minRecSizes )
// log("Msg3::compensated minRecSizes from %"INT32" to %"INT32"",
// m_minRecSizes, newMin );
// set the new min
m_minRecSizes = newMin;
}