#include "gb-include.h" #include "RdbDump.h" #include "Rdb.h" //#include "Tfndb.h" //#include "Sync.h" #include "Collectiondb.h" //#include "CollectionRec.h" #include "Tagdb.h" //#include "Catdb.h" #include "Statsdb.h" #include "Accessdb.h" extern void dumpDatedb ( char *coll,long sfn,long numFiles,bool includeTree, long long termId , bool justVerify ) ; extern void dumpPosdb ( char *coll,long sfn,long numFiles,bool includeTree, long long termId , bool justVerify ) ; void doneReadingForVerifyWrapper ( void *state ) ; //void gotTfndbListWrapper ( void *state , RdbList *list, Msg5 *msg5 ) ; // . return false if blocked, true otherwise // . sets g_errno on error bool RdbDump::set ( //char *coll , collnum_t collnum , BigFile *file , long id2 , // in Rdb::m_files[] array bool isTitledb , RdbBuckets *buckets , // optional buckets to dump RdbTree *tree , // optional tree to dump RdbMap *map , RdbCache *cache , long maxBufSize , bool orderedDump , // dump in order of keys? bool dedup , // 4 RdbCache::incorporateList() long niceness , void *state , void (* callback) ( void *state ) , bool useHalfKeys , long long startOffset , //key_t prevLastKey , char *prevLastKey , char keySize , class DiskPageCache *pc , long long maxFileSize , Rdb *rdb ) { if ( ! orderedDump ) { log(LOG_LOGIC,"db: RdbDump does not support non-ordered."); char *xx = NULL; *xx = 0; } //if ( ! coll && //if ( ! coll && rdb->m_isCollectionLess ) // strcpy(m_coll,rdb->m_dbname); m_collnum = collnum; // use 0 for collectionless if ( rdb && rdb->m_isCollectionLess ) m_collnum = 0; /* if ( ! coll && g_catdb.getRdb() == rdb ) strcpy(m_coll, "catdb"); else if ( ! coll && g_statsdb.getRdb() == rdb ) strcpy(m_coll, "statsdb"); else if ( ! coll && g_accessdb.getRdb() == rdb ) strcpy(m_coll, "accessdb"); */ //else // strcpy ( m_coll , coll ); m_file = file; m_id2 = id2; m_isTitledb = isTitledb; m_buckets = buckets; m_tree = tree; m_map = map; m_cache = cache; m_orderedDump = orderedDump; m_dedup = dedup; m_state = state; m_callback = callback; m_list = NULL; m_niceness = niceness; m_tried = false; m_isSuspended = false; m_ks = keySize; m_addToMap = true; // reset this in case we run out of mem, it doesn't get set properly // and needs to be NULL for RdbMem's call to getLastKeyinQueue() m_lastKeyInQueue = NULL; KEYMIN(m_firstKeyInQueue,m_ks); m_isDumping = false; m_writing = false; m_buf = NULL; m_verifyBuf = NULL; m_maxBufSize = maxBufSize; m_offset = startOffset ; m_rolledOver = false; // true if m_nextKey rolls over back to 0 //m_nextKey = 0 ; // used in dumpTree() KEYMIN(m_nextKey,m_ks); m_nextNode = 0 ; // used in dumpTree() // if we're dumping indexdb, allow half keys m_useHalfKeys = useHalfKeys; //m_prevLastKey = prevLastKey; KEYSET(m_prevLastKey,prevLastKey,m_ks); // for setting m_rdb->m_needsSave after deleting the dump list m_rdb = rdb; // . don't dump to a pre-existing file // . seems like Rdb.cpp makes a new BigFile before calling this // . now we can resume merges, so we can indeed dump to the END // of a pre-exiting file, but not when dumping a tree! //if ( m_file->doesExist() > 0 ) { if ( (m_tree || m_buckets) && m_file->getFileSize() > 0 ) { g_errno = EEXIST; log("db: Could not dump to %s. File exists.", m_file->getFilename()); return true; } // . 
	// . NOTE: MAX_PART_SIZE in BigFile must be defined to be bigger
	//   than anything we actually dump since we only anticipate
	//   spanning one file and so only register the first file's fd
	//   for write callbacks
	//if ( m_tree && m_tree->getMaxMem() > MAX_PART_SIZE )
	//	return log("RdbDump::dump: tree bigger than file part size");

	// . open the file nonblocking, sync with disk, read/write
	// . NOTE: O_SYNC doesn't work too well over NFS
	// . we need O_SYNC when dumping trees only because we delete the
	//   nodes/records as we dump them
	// . ensure this sets g_errno for us
	// . TODO: open might not block! fix that!
	long flags = O_RDWR | O_CREAT ;
	// a niceness bigger than 0 means to do non-blocking dumps
	if ( niceness > 0 ) flags |= O_ASYNC | O_NONBLOCK ;
	if ( ! m_file->open ( flags , pc , maxFileSize ) ) return true;

	// . get the file descriptor of the first real file in the BigFile
	// . we should only dump to the first file in the BigFile, otherwise
	//   we'd have to juggle fd registration
	m_fd = m_file->getfd ( 0 , false /*for reading?*/ );
	if ( m_fd < 0 ) {
		log(LOG_LOGIC,"db: dump: Bad fd of first file in BigFile.");
		return true;
	}

	// debug test
	//char buf1[10*1024];
	//long n1 = m_file->write ( buf1 , 10*1024 , 0 );
	//log("bytes written=%li\n",n1);

	// we're now considered to be in dumping state
	m_isDumping = true;

	// . if no tree (or buckets) was provided to dump, it must be
	//   RdbMerge calling us
	// . he'll want to call dumpList() on his own
	if ( ! m_tree && ! m_buckets ) return true;

	// how many recs in the tree?
	long  nr            = 0;
	char *structureName = NULL;
	if ( m_tree ) {
		nr            = m_tree->getNumUsedNodes();
		structureName = "tree";
	}
	else if ( m_buckets ) {
		nr            = m_buckets->getNumKeys();
		structureName = "buckets";
	}
	// debug msg
	log(LOG_INFO,"db: Dumping %li recs from %s to files.",
	    nr , structureName );
	// nr , m_file->getFilename() );

	// keep a total count for reporting when done
	m_totalPosDumped = 0;
	m_totalNegDumped = 0;

	// we have our own flag here since m_dump::m_isDumping gets
	// set to true between collection dumps. RdbMem.cpp needs
	// a flag that doesn't do that... see RdbDump.cpp.
	// this was in Rdb.cpp but when threads were turned off it was
	// NEVER getting set and resulted in corruption in RdbMem.cpp.
	m_rdb->m_inDumpLoop = true;

	// . start dumping the tree
	// . return false if it blocked
	if ( ! dumpTree ( false ) ) return false;

	// no longer dumping
	doneDumping();

	// return true since we didn't block
	return true;
}

void RdbDump::reset ( ) {
	// free the verify buf if there
	if ( m_verifyBuf ) {
		mfree ( m_verifyBuf , m_verifyBufSize , "RdbDump4" );
		m_verifyBuf = NULL;
	}
}

void RdbDump::doneDumping ( ) {
	long saved = g_errno;
	m_isDumping = false;
	// print stats
	log(LOG_INFO,
	    "db: Dumped %li positive and %li negative recs. Total = %li.",
	    m_totalPosDumped , m_totalNegDumped ,
	    m_totalPosDumped + m_totalNegDumped );
	// . map verify
	// . if continueDumping called us with no collectionrec, it got
	//   deleted so RdbBase::m_map is nuked too i guess
	if ( saved != ENOCOLLREC )
		log("db: map # pos=%lli neg=%lli",
		    m_map->getNumPositiveRecs(),
		    m_map->getNumNegativeRecs() );
	// free the list's memory
	if ( m_list ) m_list->freeList();
	// reset the verify buffer
	reset();
	// did the collection get deleted/reset from under us?
	if ( saved == ENOCOLLREC ) return;
	// save the map to disk
	m_map->writeMap();
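	// ------------------------------------------------------------------
	// Illustrative note, not from the original source: conceptually the
	// RdbMap saved above stores roughly one (key,offset) pair per page
	// of the dumped file, so readers can seek near a start key without
	// scanning the whole file. A minimal model of such a lookup, with
	// hypothetical pageKey/pageOffset arrays:
	//
	//	// find the offset of the last page whose first key is
	//	// <= startKey; a linear scan shown for clarity
	//	long long seekOffset = 0;
	//	for ( long i = 0 ; i < numPages ; i++ )
	//		if ( KEYCMP ( pageKey[i] , startKey , ks ) <= 0 )
	//			seekOffset = pageOffset[i];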
#ifdef _SANITYCHECK_
	// sanity check
	log("DOING SANITY CHECK FOR MAP -- REMOVE ME");
	if ( ! m_map->verifyMap ( m_file ) ) {
		char *xx = NULL; *xx = 0;
	}
	// now check the whole file for consistency
	if ( m_ks == 18 ) { // map->m_rdbId == RDB_POSDB ) {
		collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
		class RdbBase *base = m_rdb->m_bases[collnum];
		long startFileNum = base->getNumFiles() - 1;
		log("sanity: startfilenum=%li",startFileNum);
		dumpPosdb ( m_coll ,
			    startFileNum , // startFileNum
			    1            , // numFiles
			    false        , // includeTree
			    -1           , // termId
			    true         );// justVerify?
	}
#endif
	// . append it to the "sync" state we have in memory
	// . when host #0 sends an OP_SYNCTIME signal we dump to disk
	//g_sync.addOp ( OP_CLOSE , m_file , 0 );
}

static void tryAgainWrapper2 ( int fd , void *state ) ;

void tryAgainWrapper2 ( int fd , void *state ) {
	// debug msg
	log(LOG_INFO,"db: Trying to get data again.");
	// stop waiting
	g_loop.unregisterSleepCallback ( state , tryAgainWrapper2 );
	// bitch about errors
	if ( g_errno ) log("db: Had error: %s.",mstrerror(g_errno));
	// get THIS ptr from state
	RdbDump *THIS = (RdbDump *)state;
	// continue dumping the tree or give control back to the caller
	THIS->continueDumping ( );
}

// . returns false if blocked, true otherwise
// . sets g_errno on error
// . dumps the RdbTree, m_tree, into m_file
// . also sets and writes the RdbMap for m_file
// . we methodically get RdbLists from the RdbTree
// . dumped recs are ordered by key if "orderedDump" was true in the call
//   to set(), otherwise lists are ordered by node #
// . we write each list of recs to the file until the whole tree is done
// . we delete all records in the list from the tree after we've written
//   the list
// . if a cache was provided we incorporate the list into the cache before
//   deleting it from the tree to keep the cache in sync. NO we do NOT!
// . called again by writeBuf() when it's done writing the whole list
bool RdbDump::dumpTree ( bool recall ) {
	// set up some vars
	//long nextNode;
	//key_t maxEndKey;
	//maxEndKey.setMax();
	char maxEndKey[MAX_KEY_BYTES];
	KEYMAX(maxEndKey,m_ks);

	// . if dumping statsdb, we can only dump records 30 or more seconds
	//   old because Statsdb.cpp can "back modify" younger records in
	//   the tree: a query that took 10 seconds may come in and need to
	//   add a partial stat to each of the last 10 one-second stats
	// . we use global time at this juncture
	if ( m_rdb->m_rdbId == RDB_STATSDB ) {
		long nowSecs = getTimeGlobal();
		StatKey *sk = (StatKey *)maxEndKey;
		sk->m_zero      = 0x01;
		sk->m_labelHash = 0xffffffff;
		// leave the last 60 seconds in there just to be safe
		sk->m_time1     = nowSecs - 60;
	}

	// this list will hold the list of nodes/recs from m_tree
	m_list = &m_ourList;

	// convert coll to collnum
	//collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	// a collnum of -1 is for collectionless rdbs
	//if ( collnum < 0 ) {
	//	//if ( g_catdb->getRdb() == m_rdb )
	//	if ( ! m_rdb->m_isCollectionLess ) {
	//		char *xx=NULL;*xx=0; //return true;
	//	}
	//	g_errno = 0;
	//	collnum = 0;
	//}

	// getMemOccupiedForList2() can take some time, so breathe
	long niceness = 1;

 loop:
	// if the lastKey was the max end key last time then we're done
	if ( m_rolledOver ) return true;
	// this is set to -1 when we're done with our unordered dump
	if ( m_nextNode == -1 ) return true;
	// . NOTE: the list's buffer space should be re-used!! (TODO)
	// . "lastNode" is set to the last node # in the list
	bool status = true;
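	// ------------------------------------------------------------------
	// Illustrative sketch, not from the original source: each pass of
	// this loop pulls one size-bounded slice of the keyspace out of the
	// tree (or buckets), writes it, then advances the cursor one key
	// past the slice:
	//
	//	// while records remain in [ m_nextKey , maxEndKey ]:
	//	//   getList ( m_nextKey..maxEndKey , <= m_maxBufSize bytes )
	//	//   dumpList ( list )          // write + map + tree delete
	//	//   m_nextKey = lastKey + 1    // wrap sets m_rolledOver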
	//if ( ! m_orderedDump ) {
	//	status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode ,
	//							 m_maxBufSize ,
	//							 m_list ,
	//							 &nextNode );
	//	// this is -1 when no more nodes are left
	//	m_nextNode = nextNode;
	//}
	// "lastKey" is set to the last key in the list
	//else {
	{
		if ( recall ) goto skip;

		// debug msg
		//log("RdbDump:: getting list");
		m_t1 = gettimeofdayInMilliseconds();
		if ( m_tree )
			status = m_tree->getList ( m_collnum ,
						   m_nextKey ,
						   maxEndKey ,
						   m_maxBufSize , // max recSizes
						   m_list ,
						   &m_numPosRecs ,
						   &m_numNegRecs ,
						   m_useHalfKeys ,
						   niceness );
		else if ( m_buckets )
			status = m_buckets->getList ( m_collnum ,
						      m_nextKey ,
						      maxEndKey ,
						      m_maxBufSize , // max recSizes
						      m_list ,
						      &m_numPosRecs ,
						      &m_numNegRecs ,
						      m_useHalfKeys );

		// if(!m_list->checkList_r ( false , // removeNegRecs?
		//			  false , // sleep on problem?
		//			  m_rdb->m_rdbId )) {
		//	log("db: list to dump is not sane!");
		//	char *xx=NULL;*xx=0;
		// }

	skip:
		long long t2;
		//key_t lastKey;
		char *lastKey;
		// if error getting the list (out of memory?)
		if ( ! status ) goto hadError;
		// debug msg
		t2 = gettimeofdayInMilliseconds();
		log(LOG_INFO,"db: Get list took %lli ms. "
		    "%li positive. %li negative.",
		    t2 - m_t1 , m_numPosRecs , m_numNegRecs );

		// keep a total count for reporting when done
		m_totalPosDumped += m_numPosRecs;
		m_totalNegDumped += m_numNegRecs;

		// . check the list we got from the tree for problems
		// . ensures keys are ordered from lowest to highest as well
#ifdef _SANITYCHECK_
		log("dump: verifying list before dumping");
		m_list->checkList_r ( false , // removeNegRecs?
				      false , // sleep on problem?
				      m_rdb->m_rdbId );
#endif

		// if the list is empty, we're done!
		if ( status && m_list->isEmpty() ) {
			// consider that a rollover?
			if ( m_rdb->m_rdbId == RDB_STATSDB )
				m_rolledOver = true;
			return true;
		}

		// get the last key of the list
		lastKey = m_list->getLastKey();
		// advance m_nextKey
		//m_nextKey  = lastKey ;
		//m_nextKey += (unsigned long)1;
		//if ( m_nextKey < lastKey ) m_rolledOver = true;
		KEYSET(m_nextKey,lastKey,m_ks);
		KEYADD(m_nextKey,1,m_ks);
		if ( KEYCMP(m_nextKey,lastKey,m_ks) < 0 ) m_rolledOver = true;
		// debug msg
		//log(0,"RdbDump:lastKey.n1=%lu,n0=%llu",lastKey.n1,lastKey.n0);
		//log(0,"RdbDump:next.n1=%lu,n0=%llu",m_nextKey.n1,m_nextKey.n0);
	}

	// . return true on error, g_errno should have been set
	// . this is probably an out of memory error
	if ( ! status ) {
	hadError:
		log("db: Had error getting data for dump: %s. Retrying.",
		    mstrerror(g_errno));
		// debug msg
		//log("RdbDump::getList: sleeping and retrying");
		// retry for the remaining two types of errors
		if ( ! g_loop.registerSleepCallback(1000,this,tryAgainWrapper2) ) {
			log("db: Retry failed. Could not register callback.");
			return true;
		}
		// wait for the sleep callback
		return false;
	}

	// if the list is empty, we're done!
	if ( m_list->isEmpty() ) return true;

	// . set m_firstKeyInQueue and m_lastKeyInQueue
	// . this doesn't work if you're doing an unordered dump, but we
	//   should not allow adds when closing
	m_lastKeyInQueue = m_list->getLastKey();
	//m_firstKeyInQueue = m_list->getCurrentKey();
	m_list->getCurrentKey(m_firstKeyInQueue);

	// . write this list to disk
	// . returns false if blocked, true otherwise
	// . sets g_errno on error
	// . if this blocks it should call us (dumpTree()) back
	if ( ! dumpList ( m_list , m_niceness , false ) ) return false;

	// close up shop on a write/dumpList error
	if ( g_errno ) return true;

	// . if dumpList() did not block then keep on truckin'
	// . otherwise, wait for the callback to re-enter dumpTree()
	goto loop;
}

static void doneWritingWrapper ( void *state ) ;
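// ---------------------------------------------------------------------
// Illustrative overview, not from the original source: how the routines
// in this file chain together when a write blocks. Anything that blocks
// returns false and resumes through the named wrapper:
//
//	dumpTree() --> dumpList() --> BigFile::write()
//	    ^                              |
//	    |                    doneWritingWrapper()
//	    |                              |
//	    |                      doneDumpingList()
//	    |                              |
//	    |          (optional read-back verify, resumed by
//	    |           doneReadingForVerifyWrapper())
//	    |                              |
//	    +------ continueDumping() <----+
//
// when dumpTree() finally returns true, doneDumping() writes the map.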
// . return false if blocked, true otherwise
// . sets g_errno on error
// . this one is also called by RdbMerge to dump lists
bool RdbDump::dumpList ( RdbList *list , long niceness , bool recall ) {
	// if we had a write error and are being recalled...
	if ( recall ) {
		m_offset -= m_bytesToWrite;
		goto recallskip;
	}
	// assume we don't hack the list
	m_hacked   = false;
	m_hacked12 = false;
	// save a ptr to the list... why?
	m_list = list;
	// nothing to do if the list is empty
	if ( m_list->isEmpty() ) return true;
	// we're now in dump mode again
	m_isDumping = true;
#ifdef _SANITYCHECK_
	// don't check the list if we're dumping an unordered list from tree!
	if ( m_orderedDump ) {
		m_list->checkList_r ( false /*removeNegRecs?*/ );
		// print list stats
		log("dump: sk=%s ",KEYSTR(m_list->m_startKey,m_ks));
		log("dump: ek=%s ",KEYSTR(m_list->m_endKey  ,m_ks));
	}
#endif
	// before calling RdbMap::addList(), always reset the list ptr
	// since we no longer call this in RdbMap::addList(), so we don't
	// mess up the possible HACK below
	m_list->resetListPtr();

	// . SANITY CHECK
	// . ensure the first key is >= the last key added to the map
	if ( m_offset > 0 ) {
		//key_t k = m_list->getCurrentKey();
		char k[MAX_KEY_BYTES];
		m_list->getCurrentKey(k);
		//key_t lastKey = m_map->getLastKey (); // m_lastKey
		char lastKey[MAX_KEY_BYTES];
		m_map->getLastKey(lastKey);
		//char *lastKey = m_map->getLastKey();
		//if ( k <= lastKey ) {
		if ( KEYCMP(k,lastKey,m_ks) <= 0 ) {
			log(LOG_LOGIC,"db: Dumping list key out of order. "
			    //"lastKey.n1=%lx n0=%llx k.n1=%lx n0=%llx",
			    //lastKey.n1,lastKey.n0,k.n1,k.n0);
			    "lastKey=%s k=%s",
			    KEYSTR(lastKey,m_ks),
			    KEYSTR(k,m_ks));
			g_errno = EBADENGINEER;
			//return true;
			char *xx = NULL; *xx = 0;
		}
	}

	if ( m_ks == 18 ) {
		m_list->checkList_r(false,false,RDB_POSDB);
		m_list->resetListPtr();
	}

	// HACK! POSDB
	if ( m_ks == 18 && m_orderedDump && m_offset > 0 ) {
		char k[MAX_KEY_BYTES];
		m_list->getCurrentKey(k);
		// . same top 12 bytes as the last key we added?
		// . if so, we should only add 6 bytes from this key, not
		//   the full 18, so on disk it is compressed consistently
		if ( memcmp ( (k             ) + (m_ks-12) ,
			      (m_prevLastKey ) + (m_ks-12) , 12 ) == 0 ) {
			char  tmp[MAX_KEY_BYTES];
			char *p = m_list->getList();
			// swap the high 12 bytes with the low 6 bytes for
			// the first key
			memcpy ( tmp    , p             , m_ks-12 );
			memcpy ( p      , p + (m_ks-12) , 12      );
			memcpy ( p + 12 , tmp           , m_ks-12 );
			// big hack here
			m_list->m_list      = p + 12;
			m_list->m_listPtr   = p + 12;
			m_list->m_listPtrLo = p     ;
			m_list->m_listPtrHi = p + 6 ;
			m_list->m_listSize -= 12 ;
			// turn on both bits to indicate double compression
			*(p+12) |= 0x06;
			m_hacked12 = true;
		}
	}
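	// ------------------------------------------------------------------
	// Illustrative sketch, not from the original source: what the swap
	// above does to the first key, assuming the 18-byte posdb key is
	// stored low bytes first ( [ lo 6 | mid 6 | hi 6 ] ):
	//
	//	before : p -> [ lo 6 ][ mid 6 ][ hi 6 ] ...rest of list...
	//	after  : p -> [ mid 6 ][ hi 6 ][ lo 6 ] ...rest of list...
	//	                                ^
	//	                           m_list = p + 12
	//
	// only the distinct low 6 bytes get written; the previous write to
	// this file ended with a key sharing the same top 12 bytes, so the
	// 0x06 compression bits tell readers to take those from context.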
	// . HACK
	// . if we're doing an ordered dump then hack the list's first
	//   12-byte key into a 6-byte key iff the last key we dumped last
	//   time shares the same top 6 bytes as the first key of this list
	// . this way we maintain compression consistency on the disk
	//   so IndexTable.cpp can expect all 6-byte keys for the same
	//   termid and RdbList::checkList_r() can expect the half bits to
	//   always be on when they can be on
	// . IMPORTANT: calling m_list->resetListPtr() will mess this HACK
	//   up!!
	if ( m_useHalfKeys && m_orderedDump && m_offset > 0 && ! m_hacked12 ) {
		//key_t k = m_list->getCurrentKey();
		char k[MAX_KEY_BYTES];
		m_list->getCurrentKey(k);
		// . same top 6 bytes as the last key we added?
		// . if so, we should only add 6 bytes from this key, not 12,
		//   so on disk it is compressed consistently
		//if ( memcmp ( ((char *)&k             ) + 6 ,
		//	      ((char *)&m_prevLastKey ) + 6 , 6 ) == 0 ) {
		if ( memcmp ( (k             ) + (m_ks-6) ,
			      (m_prevLastKey ) + (m_ks-6) , 6 ) == 0 ) {
			m_hacked = true;
			//char tmp[6];
			char  tmp[MAX_KEY_BYTES];
			char *p = m_list->getList();
			//memcpy ( tmp   , p     , 6 );
			//memcpy ( p     , p + 6 , 6 );
			//memcpy ( p + 6 , tmp   , 6 );
			memcpy ( tmp   , p            , m_ks-6 );
			memcpy ( p     , p + (m_ks-6) , 6      );
			memcpy ( p + 6 , tmp          , m_ks-6 );
			// big hack here
			m_list->m_list      = p + 6;
			m_list->m_listPtr   = p + 6;
			// make this work for POSDB, too
			m_list->m_listPtrLo = p + 6 + 6;
			m_list->m_listPtrHi = p ;
			m_list->m_listSize -= 6 ;
			// hack on the half bit, too
			*(p+6) |= 0x02;
		}
	}

	// update the old last key
	//m_prevLastKey = m_list->getLastKey();
	m_list->getLastKey(m_prevLastKey);

	// now write it to disk
	m_buf          = m_list->getList     ();
	m_bytesToWrite = m_list->getListSize ();

	//#ifdef _SANITYCHECK_
	//if (m_list->getListSize()!=m_list->getListEnd()-m_list->getList()){
	//	log("RdbDump::dumpList: major problem here!");
	//	sleep(50000);
	//}
	//#endif

 recallskip:
	// make sure we have enough mem to add to the map after a successful
	// dump up here; otherwise, if we write the list but fail to add it
	// to the map, the map is out of sync if we core thereafter
	if ( m_addToMap && m_map && ! m_map->prealloc ( m_list ) ) {
		log("db: Failed to prealloc list into map: %s.",
		    mstrerror(g_errno));
		// g_errno should be set to something if that failed
		if ( ! g_errno ) { char *xx = NULL; *xx = 0; }
		return true;
	}

	// tab to the old offset
	long long offset = m_offset;
	// might as well update the offset now, even before the write is done
	m_offset += m_bytesToWrite ;
	// the write thread is out
	m_writing = true;
	//m_bytesWritten = 0;

	// sanity check
	//log("dump: writing %li bytes at offset %lli",m_bytesToWrite,offset);

	// . if we're called by RdbMerge directly use m_callback/m_state
	// . otherwise, use doneWritingWrapper() which will call dumpTree()
	// . BigFile::write() returns 0 if blocked, -1 on error, >0 on
	//   completion
	// . it also sets g_errno on error
	bool isDone = m_file->write ( m_buf ,
				      m_bytesToWrite ,
				      offset ,
				      &m_fstate ,
				      this ,
				      doneWritingWrapper ,
				      niceness );
	// debug msg
	//log("RdbDump dumped %li bytes, done=%li\n",m_bytesToWrite,isDone);

	// return false if it blocked
	if ( ! isDone ) return false;
	// done writing
	m_writing = false;
	// return true on error
	if ( g_errno ) return true;
	// . delete the list from the tree, incorporate it into the cache,
	//   add it to the map
	// . returns false if blocked, true otherwise, sets g_errno on error
	// . will only block in calling updateTfndb()
	return doneDumpingList ( true );
}
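// ---------------------------------------------------------------------
// Illustrative note, not from the original source: the offset accounting
// behind the recall=true retry path in dumpList() above. dumpList()
// advances m_offset optimistically before the write completes, so a
// retry must first roll it back to re-write the same span:
//
//	first attempt : offset = m_offset; m_offset += m_bytesToWrite;
//	retry         : m_offset -= m_bytesToWrite; goto recallskip;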
// . delete the list from the tree, incorporate it into the cache, add it
//   to the map
// . returns false if blocked, true otherwise, sets g_errno on error
bool RdbDump::doneDumpingList ( bool addToMap ) {
	// we can get suspended when gigablast is shutting down, in which
	// case the map may have been deleted. only RdbMerge suspends its
	// m_dump class, not Rdb::m_dump. return false so the caller never
	// gets called back. we can not resume from this suspension!
	//if ( m_isSuspended ) return false;

	// . if the error was EFILECLOSED (the file got closed before we
	//   wrote to it) then try again. a file can close because the fd
	//   pool needed more fds
	// . we cannot do this retry in BigFile.cpp because the BigFile
	//   may have been deleted/unlinked from a merge, but we could move
	//   this check to Msg3... and do it for writes, too...
	// . seem to be getting EBADFD errors now, too (what code is it?)
	//   i don't remember, just do it on *all* errors for now!
	//if ( g_errno == EFILECLOSED || g_errno == EBADFD ) {
	if ( g_errno && ! m_isSuspended ) {
		log(LOG_INFO,"db: Had error dumping data: %s. Retrying.",
		    mstrerror(g_errno));
		// . deal with the EBADF bug, it will loop forever on this
		// . i still don't know how the fd gets closed and s_fds[vfd]
		//   is not set to -1?!?!?!
		if ( g_errno == EBADF ) {
			// note it
			log(LOG_LOGIC,"db: setting fd for vfd to -1.");
			// mark our fd as not there...
			long i = (m_offset - m_bytesToWrite) / MAX_PART_SIZE;
			// sets s_fds[vfd] to -1
			if ( m_file->m_files[i] )
				releaseVfd ( m_file->m_files[i]->m_vfd );
		}
		//log("RdbDump::doneDumpingList: retrying.");
		return dumpList ( m_list , m_niceness , true );
	}

	// bail on error
	if ( g_errno ) {
		log("db: Had error dumping data: %s.",mstrerror(g_errno));
		//log("RdbDump::doneDumpingList: %s",mstrerror(g_errno));
		return true;
	}

	// . don't delete the list if we were dumping an unordered list
	// . we only dump unordered lists when we do a save
	// . it saves time not having to delete the list and it also allows
	//   us to do saves without deleting our data! good!
	if ( ! m_orderedDump ) return true; //--turn this off until save works

	// save for the verify routine
	m_addToMap = addToMap;

	// should we verify what we wrote? useful for preventing disk
	// corruption from those pesky Western Digitals and Maxtors
	if ( g_conf.m_verifyWrites ) {
		// a debug message, if logging disk debug messages is enabled
		log(LOG_DEBUG,"disk: Verifying %li bytes written.",
		    m_bytesToWrite);
		// make a read buf
		if ( m_verifyBuf && m_verifyBufSize < m_bytesToWrite ) {
			mfree ( m_verifyBuf , m_verifyBufSize , "RdbDump3" );
			m_verifyBuf     = NULL;
			m_verifyBufSize = 0;
		}
		if ( ! m_verifyBuf ) {
			m_verifyBuf = (char *)mmalloc ( m_bytesToWrite ,
							"RdbDump3" );
			m_verifyBufSize = m_bytesToWrite;
		}
		// out of mem? if so, skip the write verify
		if ( ! m_verifyBuf ) return doneReadingForVerify();
		// read back what we wrote
		bool isDone = m_file->read ( m_verifyBuf ,
					     m_bytesToWrite ,
					     m_offset - m_bytesToWrite ,
					     &m_fstate ,
					     this ,
					     doneReadingForVerifyWrapper ,
					     m_niceness );
		// debug msg
		//log("RdbDump dumped %li bytes, done=%li\n",
		//    m_bytesToWrite,isDone);
		// return false if it blocked
		if ( ! isDone ) return false;
	}

	return doneReadingForVerify();
}

void doneReadingForVerifyWrapper ( void *state ) {
	RdbDump *THIS = (RdbDump *)state;
	// return if this blocks
	if ( ! THIS->doneReadingForVerify() ) return;
	// delete list from tree, incorporate list into cache, add to map
	//if ( ! THIS->doneDumpingList( true ) ) return;
	// continue
	THIS->continueDumping ( );
}

bool RdbDump::doneReadingForVerify ( ) {
	// see if what we wrote is the same as what we read back
	if ( m_verifyBuf && ! g_errno &&
	     memcmp ( m_verifyBuf , m_buf , m_bytesToWrite ) != 0 ) {
		log("disk: Write verification of %li bytes to file %s "
		    "failed at offset=%lli. Retrying.",
		    m_bytesToWrite,
		    m_file->getFilename(),
		    m_offset - m_bytesToWrite);
		// try writing again
		return dumpList ( m_list , m_niceness , true );
	}

	// time the add to the map (and tfndb bins)
	long long t = 0;
	// start timing on the first call only
	if ( m_addToMap ) t = gettimeofdayInMilliseconds();

	// sanity check
	if ( m_list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
	// . register this list with the map now
	// . only register AFTER it's ALL on disk so we don't get partial
	//   record reads and we don't read stuff on disk that's also in
	//   the tree
	// . add the list to the rdb map if we have one
	// . we don't have maps when we do unordered dumps
	// . careful, the map is NULL if we're doing an unordered dump
	if ( m_addToMap && m_map && ! m_map->addList ( m_list ) ) {
		g_errno = ENOMEM;
		log("db: Failed to add data to map.");
		// undo the offset update, the write failed, the parent
		// should retry. i know RdbMerge.cpp does, but not sure
		// what happens when Rdb.cpp is dumping an RdbTree
		//m_offset -= m_bytesToWrite ;
		// this should never happen now since we call prealloc()
		// above
		char *xx = NULL; *xx = 0;
		return true;
	}

	// debug msg
	long long now = gettimeofdayInMilliseconds();
	log(LOG_TIMING,"db: adding to map took %llu ms", now - t );

	// . Msg5.cpp and RdbList::merge_r() should remove titleRecs
	//   that are not supported by tfndb, so we only need to add tfndb
	//   records at this point to update the tfndb recs to point to the
	//   new tfn we are dumping into for the existing titlerecs
	// . we just add one tfndb rec per positive titleRec in m_list
	// . negative TitleRec keys should have had a negative tfndb key
	//   added to tfndb in Rdb.cpp::addRecord() already, and ...
	// . RdbList::indexMerge_r() will take care of merging properly
	//   so as to not treat the tfn bits as part of the key when
	//   comparing
	// . this will re-call doneDumpingList(false) if it blocks
	// . returns false if it blocks, true otherwise
	//if ( ! updateTfndbLoop() ) return false;

	// . HACK: fix hacked lists before deleting from the tree
	// . iff the first key has the half bit set
	if ( m_hacked ) {
		//char tmp[6];
		char  tmp[MAX_KEY_BYTES];
		char *p = m_list->getList() - 6 ;
		//memcpy ( tmp   , p     , 6 );
		//memcpy ( p     , p + 6 , 6 );
		//memcpy ( p + 6 , tmp   , 6 );
		memcpy ( tmp          , p     , 6      );
		memcpy ( p            , p + 6 , m_ks-6 );
		memcpy ( p + (m_ks-6) , tmp   , 6      );
		// undo the big hack
		m_list->m_list      = p ;
		m_list->m_listPtr   = p ;
		// make this work for POSDB...
		m_list->m_listPtrLo = p + m_ks - 12;
		m_list->m_listPtrHi = p + m_ks - 6;
		m_list->m_listSize += 6 ;
		// hack off the half bit, we're a full key again
		*p &= 0xfd ;
		// turn it off again just in case
		m_hacked = false;
	}

	if ( m_hacked12 ) {
		char  tmp[MAX_KEY_BYTES];
		char *p = m_list->getList() - 12 ;
		// swap the high 12 bytes back with the low 6 bytes for the
		// first key
		memcpy ( tmp   , p      , 12 );
		memcpy ( p     , p + 12 , 6  );
		memcpy ( p + 6 , tmp    , 12 );
		// undo the big hack
		m_list->m_list      = p ;
		m_list->m_listPtr   = p ;
		m_list->m_listPtrLo = p + 6;
		m_list->m_listPtrHi = p + 12;
		m_list->m_listSize += 12 ;
		// hack off both compression bits, we're a full key again
		*p &= 0xf9 ;
		m_hacked12 = false;
	}

	// verify keys are in order after we hack it back
	//if ( m_orderedDump ) m_list->checkList_r ( false , true );

	// if we're NOT dumping a tree then return control to RdbMerge
	if ( ! m_tree && ! m_buckets ) return true;

	// . merge the writeBuf into the cache at this point or after
	//   deleting
	// . m_list should have its m_lastKey set since we got called from
	//   RdbMerge if m_cache is non-NULL and it called RdbList::merge()
	//   through Msg5 at one point to form this list
	// . right now i just made this clear the cache... it's easier
	//if ( m_cache ) m_cache->incorporateList ( m_list , m_dedup ,
	//					  m_list->getLastKey() );

	// . delete these nodes from the tree now that they're on disk
	//   and can be read from the list since addList() was called
	// . however, while we were writing to disk, a key that we were
	//   writing could have been deleted from the tree. to prevent
	//   problems we should only delete nodes that are present in the
	//   tree...
	// . actually i fixed that problem by not deleting any nodes that
	//   might be in the middle of being dumped
	// . i changed Rdb::addNode() and Rdb::deleteNode() to do this
	// . since we made it here m_list MUST be ordered, therefore
	//   let's try the new, faster deleteOrderedList and let's not do
	//   balancing to make it even faster
	// . balancing will be restored once we're done deleting this list
	// debug msg
	//log("RdbDump:: deleting list");
	long long t1 = gettimeofdayInMilliseconds();
	// convert to a number, this is -1 if it no longer exists
	//collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	//if ( collnum < 0 && m_rdb->m_isCollectionLess ) {
	//	collnum = 0;
	//	g_errno = 0;
	//}
	//m_tree->deleteOrderedList ( m_list , false /*do balancing?*/ );
	// tree delete is slow due to checking for leaks, not balancing
	bool s = true;
	if ( m_tree )
		s = m_tree->deleteList(m_collnum,m_list,true/*do balancing?*/);
	else if ( m_buckets )
		s = m_buckets->deleteList(m_collnum,m_list);
	// problem?
	if ( ! s && ! m_tried ) {
		m_tried = true;
		log("db: Corruption in tree detected when dumping to %s. "
		    "Fixing. Your memory had an error. "
		    "Consider replacing it.",
		    m_file->getFilename());
		if ( m_rdb && m_rdb->m_rdbId != RDB_DOLEDB ) {
			// core now to debug this for sectiondb
			char *xx=NULL;*xx=0;
			((RdbTree *)m_tree)->fixTree ( );
		}
	}
	// tell the rdb it needs saving now
	//if ( m_rdb ) m_rdb->m_needsSave = true;
	// debug msg
	long long t2 = gettimeofdayInMilliseconds();
	log(LOG_TIMING,"db: dump: deleteList: took %lli",t2-t1);
	return true;
}
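// ---------------------------------------------------------------------
// Illustrative note, not from the original source: the ordering above is
// deliberate. Records leave the in-memory tree only AFTER they are
// (1) written to the file, (2) optionally read back and verified, and
// (3) registered in the map:
//
//	write list --> (verify) --> map->addList() --> tree->deleteList()
//
// so if anything fails before the delete, the records are still present
// in the tree and nothing is lost.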
/*
static void tryAgainWrapper ( int fd , void *state ) ;

// returns false if it blocks, true otherwise
bool RdbDump::updateTfndbLoop () {
	// only if dumping titledb
	if ( ! m_isTitledb ) return true;
	// . start from the beginning in case the last add failed
	// . this may result in some dups if we get re-called, but that's ok
	m_list->resetListPtr();
	// point to it
	Rdb *tdb = g_tfndb.getRdb();
	// is it the secondary/repair rdb used by Repair.cpp?
	if ( m_rdb == g_titledb2.getRdb() ) tdb = g_tfndb2.getRdb();
	// get the collection number
	collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	// bail if the collection is gone
	if ( collnum < (collnum_t)0 ) {
		//if ( g_catdb->getRdb() == m_rdb )
		if      ( strcmp ( m_coll, "catdb"   ) == 0 ) collnum = 0;
		else if ( strcmp ( m_coll, "statsdb" ) == 0 ) collnum = 0;
		else {
			log("Collection \"%s\" removed during dump.",m_coll);
			return true;
		}
	}
 loop:
	// get the next rec
	if ( m_list->isExhausted() ) return true;
	// get the TitleRec key
	//key_t k = m_list->getCurrentKey();
	char k[MAX_KEY_BYTES];
	m_list->getCurrentKey(k);
	//char *rec     = m_list->getCurrentRec();
	//long  recSize = m_list->getCurrentRecSize();
	// advance for the next call
	m_list->skipCurrentRecord();
	// skip if a delete
	if ( KEYNEG(k) ) goto loop;
	// . otherwise, this is the "final" titleRec for this docid because
	//   Msg5/RdbList::merge_r() should have removed it if it is not the
	//   ultimate titleRec for this docid, because RdbList::merge_r()
	//   takes a "tfndbList" as input just to weed out titleRecs that
	//   are not supported by a tfndb record
	// . make the tfndb key
	long long d    = g_titledb.getDocIdFromKey ( (key_t *)k );
	//long e = g_titledb.getHostHash ( (key_t *)k );
	long long uh48 = g_titledb.getUrlHash48    ( (key_t *)k );
	long tfn = m_id2;
	// delete=false
	key_t tk = g_tfndb.makeKey ( d , uh48 , tfn , false );
	KEYSET(m_tkey,(char *)&tk,sizeof(key_t));
	// debug msg
	//logf(LOG_DEBUG,"db: rdbdump: updateTfndbLoop: tbadd docId=%lli "
	//     "tfn=%03li", g_tfndb.getDocId((key_t *)m_tkey ),
	//     (long)g_tfndb.getTitleFileNum((key_t *)m_tkey));
	// . add it, returns false and sets g_errno on error
	// . this will override any existing tfndb record for this docid
	//   because RdbList.cpp uses a special key compare function (cmp2)
	//   to ignore the tfn bits on tfndb keys, so we get the
	//   newest/latest tfndb key after the merge
	if ( tdb->addRecord ( collnum , m_tkey , NULL , 0 , 0 ) ) goto loop;
	// return true with g_errno set for most errors, that's bad
	if ( g_errno != ETRYAGAIN && g_errno != ENOMEM ) {
		log("db: Had error adding record to tfndb: %s.",
		    mstrerror(g_errno));
		return true;
	}
	// try starting a dump. Rdb::addRecord() does not do this like it
	// should, only Rdb::addList() does
	if ( tdb->needsDump() ) {
		log(LOG_INFO,"db: Dumping tfndb while merging titledb.");
		// . CAUTION! must use niceness one because if we go into
		//   urgent mode all niceness 2 stuff will freeze up until
		//   we exit urgent mode! so when tfndb dumps out too much
		//   stuff he'll go into urgent mode and freeze himself
		if ( ! tdb->dumpTree ( 1 ) ) // niceness
			log("db: Error dumping tfndb to disk: %s.",
			    mstrerror(g_errno));
	}
	// debug msg
	//log("db: Had error when trying to dump tfndb: %s. Retrying.",
	//    mstrerror(g_errno));
	// retry for the remaining two types of errors
	if ( ! g_loop.registerSleepCallback(1000,this,tryAgainWrapper) ) {
		log("db: Failed to retry. Very bad.");
		return true;
	}
	// wait for the sleep callback
	return false;
}

void tryAgainWrapper ( int fd , void *state ) {
	// debug msg
	log(LOG_INFO,"db: Trying to update tfndb again.");
	// stop waiting
	g_loop.unregisterSleepCallback ( state , tryAgainWrapper );
	// bitch about errors
	if ( g_errno ) log(LOG_LOGIC,"db: dump: Could not unregister "
			   "retry callback: %s.",mstrerror(g_errno));
	// get THIS ptr from state
	RdbDump *THIS = (RdbDump *)state;
	// continue the loop, this returns false if it blocks
	if ( ! THIS->updateTfndbLoop() ) return;
	// don't add to the map, we already did
	if ( ! THIS->doneDumpingList ( false ) ) return;
	// continue dumping the tree or give control back to the caller
	THIS->continueDumping ( );
}
*/

// continue dumping the tree
void doneWritingWrapper ( void *state ) {
	// get THIS ptr from state
	RdbDump *THIS = (RdbDump *)state;
	// done writing
	THIS->m_writing = false;
	// bitch about errors
	if ( g_errno ) log("db: Dump to %s had write error: %s.",
			   THIS->m_file->getFilename(),
			   mstrerror(g_errno));
	// delete the list from the tree, incorporate it into the cache,
	// add it to the map
	if ( ! THIS->doneDumpingList ( true ) ) return;
	// continue
	THIS->continueDumping ( );
}

void RdbDump::continueDumping() {
	// if someone reset/deleted the collection we were dumping...
	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		// m_file is invalid if the collrec got nuked, because so
		// did the RdbBase which owns the files
		log("db: continue dumping lost collection");
	}
	// bitch about errors
	else if ( g_errno ) log("db: Dump to %s had error writing: %s.",
				m_file->getFilename(),mstrerror(g_errno));
	// go back now if we were NOT dumping a tree
	if ( ! (m_tree || m_buckets) ) {
		m_isDumping = false;
		m_callback ( m_state );
		return;
	}
	// . continue dumping the tree
	// . return if this blocks
	// . if the collrec was deleted or reset then g_errno will be
	//   ENOCOLLREC and we want to skip the call to dumpTree()
	if ( g_errno != ENOCOLLREC && ! dumpTree ( false ) ) return;
	// close it up
	doneDumping ( );
	// call the callback
	m_callback ( m_state );
}
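// ---------------------------------------------------------------------
// Illustrative sketch, not from the original source: how a caller with
// no tree or buckets (i.e. RdbMerge) might drive this class. set() is
// passed NULL for both the tree and the buckets, so it returns right
// after opening the file and the caller feeds lists itself. Variable
// names below are hypothetical:
//
//	// dump.set ( collnum , &bigFile , id2 , isTitledb ,
//	//            NULL /*buckets*/ , NULL /*tree*/ , &map , NULL ,
//	//            maxBufSize , true /*ordered*/ , false /*dedup*/ ,
//	//            niceness , state , wrapper , useHalfKeys ,
//	//            startOffset , prevLastKey , ks , pc ,
//	//            maxFileSize , &rdb );
//	// for each merged list:
//	//	if ( ! dump.dumpList ( &list , niceness , false ) )
//	//		return; // blocked; resumes via "wrapper"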
// . load the table from a dumped btree (unordered dump only!)
// . must NOT have been an ordered dump cuz the tree will be seriously
//   skewed
// . this is completely blocking cuz it is used on init to recover a
//   saved table
// . used for recovering a table that was too small to dump to an rdb
//   file
// . returns true if "filename" does not exist
// . stored in key/dataSize/data fashion
// . TODO: this load() routine and the m_orderedDump stuff above are
//   just hacks until we make the tree balanced. then we can use RdbScan
//   to load the tree. also, we may not have enough mem to load the tree
//   because it loads it all in at once!!!!!
/*
bool RdbDump::load ( Rdb *rdb , long fixedDataSize , BigFile *file ,
		     class DiskPageCache *pc ) {
	//m_tree = tree;
	// return true if the file does not exist
	if ( file->doesExist() <= 0 ) return true;
	// open the file read only
	if ( ! file->open ( O_RDONLY , pc ) )
		return log("db: Could not open %s: %s.",
			   file->getFilename(),mstrerror(g_errno));
	// a harmless note
	log(LOG_INFO,"db: Loading data from %s",file->getFilename());
	// read in all the data at once since this should only be run at
	// startup when we still have plenty of memory
	long bufSize = file->getFileSize();
	// return true if the filesize is 0
	if ( bufSize == 0 ) return true;
	// otherwise, alloc space to read the WHOLE file
	char *buf = (char *) mmalloc ( bufSize , "RdbDump" );
	if ( ! buf )
		return log("db: Could not allocate %li bytes to load %s",
			   bufSize,file->getFilename());
	//long n = file->read ( buf , bufSize , m_offset );
	file->read ( buf , bufSize , m_offset );
	if ( g_errno ) {
		mfree ( buf , bufSize , "RdbDump" );
		return log("db: Had error reading %s: %s.",
			   file->getFilename(),mstrerror(g_errno));
	}
	char *p    = buf;
	char *pend = buf + bufSize;
	// now let 'er rip
	while ( p < pend ) {
		// get the key
		key_t key = *(key_t *)p;
		// advance the buf ptr
		p += sizeof(key_t);
		// get the dataSize
		long dataSize = fixedDataSize;
		// we may have a dataSize
		if ( fixedDataSize == -1 ) {
			dataSize = *(long *)p;
			p += 4;
		}
		// point to the data, if any
		char *data;
		if ( dataSize > 0 ) data = p;
		else                data = NULL;
		// skip p over the data
		p += dataSize;
		// add to the rdb
		if ( ! rdb->addRecord ( key , data , dataSize ) ) {
			mfree ( buf , bufSize , "RdbDump" );
			return log("db: Could not add record from %s: %s.",
				   file->getFilename(),mstrerror(g_errno));
		}
		// we must dup the data so the tree can free it
		//char *copy = mdup ( p , dataSize , "RdbDump" );
		// add the node
		//if ( m_tree->addNode ( key , copy , dataSize ) < 0 ) {
		//	mfree ( buf , bufSize , "RdbDump" );
		//	return log("RdbDump::load: addNode failed");
		//}
	}
	// free the buffer we used
	mfree ( buf , bufSize , "RdbDump" );
	file->close();
	return true;
}
*/