#include "gb-include.h"

#include "Syncdb.h"
#include "Threads.h"
#include "Msg4.h"
#include "Process.h"

// a global class extern'd in .h file
Syncdb g_syncdb;

static void gotListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) ;
static void gotListWrapper4 ( void *state ) ;
static void addedListWrapper5 ( void *state , UdpSlot *slot ) ;
static void handleRequest5d ( UdpSlot *slot , int32_t netnice ) ;
static void handleRequest55 ( UdpSlot *slot , int32_t netnice ) ;
static void gotReplyWrapper55 ( void *state , UdpSlot *slot ) ;
static void syncDoneWrapper ( void *state , ThreadEntry *te ) ;
static void *syncStartWrapper_r ( void *state , ThreadEntry *te ) ;

// . entry point for a meta list request that arrived in a udp slot
// . pulls the request buffer and the sender hostid out of "slot" and hands
//   them to the three-argument overload below
// . returns false with g_errno set on error
bool Syncdb::gotMetaListRequest ( UdpSlot *slot ) {
	// . the sender must be a known host
	// . check this while the hostid is still signed. the old sanity
	//   check tested an unsigned value for "< 0" and could never fire
	int32_t hostId = slot->m_hostId;
	if ( hostId < 0 ) {
		g_errno = EBADENGINEER;
		log("sync: got meta list request from unknown hostid");
		return false;
	}
	// get the request buffer
	char    *req     = slot->m_readBuf;
	int32_t  reqSize = slot->m_readBufSize;
	// get sender hostid
	uint32_t sid = (uint32_t)hostId;
	return gotMetaListRequest ( req, reqSize , sid );
}

// . store the request "req" in syncdb and add its bookkeeping keys to the
//   quick tree (m_qt): one "c" key, one "d" key and, for every other host
//   in our shard, an "a" key and a "b" key
// . "sid" is the hostid of the host that originally sent the request
// . the zid is read from the 8 bytes at offset 4 of the request
// . returns false with g_errno set on error
bool Syncdb::gotMetaListRequest ( char *req , int32_t reqSize , uint32_t sid ) {
	// we read 8 bytes at offset 4 below, so make sure they are there
	if ( ! req || reqSize < 12 ) {
		g_errno = EBADENGINEER;
		log("sync: meta list request too small to hold a zid");
		return false;
	}
	// use 0 for this
	uint32_t tid = 0;
	// point to it
	uint64_t zid = *(uint64_t *)(req + 4);
	// get the key of the request. use 0 for tid. this is the "c" key.
	key128_t k = makeKey ( 0,0,1,0,tid,sid,zid,1);

	logf(LOG_DEBUG,"syncdb: got meta list request");

	// if tree is too full, deny it!
	if ( m_qt.is90PercentFull() ) {
		g_errno = ETRYAGAIN;
		return false;
	}

	// . see if there is a msg4 we already added after it from same sid
	// . there may have been if we just woke up from a slumber...
	// . so allow out of order meta lists to be added still
	/*
	key128_t ak = makeKey (0,0,1,0,tid,sid,zid);
	key128_t ek = makeKey (0,0,1,0,tid,sid,0xffffffffffffffff);
	int32_t nn = m_qt.getNextNode ( &ak );
	// get key
	if ( nn >= 0 ) {
		key128_t ck = *(key128_t *)m_qt.getKey ( nn );
		// if in range, that is strange
		if ( k <= ck ) {
			log("sync: got msg4 meta list with expired key: "
			    "k .n1=0x%"XINT64" k .n0=0x%"XINT64" "
			    "ck.n1=0x%"XINT64" ck.n0=0x%"XINT64" " ,
			    k.n1,k.n0,ck.n1,ck.n0 );
			// pretend we added it
			return true;
		}
	}
	*/

	// . add the record (which contains a list)
	// . it starts with the syncdb key
	// . see below for the definition of the bits in the key
	// . might set g_errno to ETRYAGAIN
	if ( ! m_rdb.addRecord ((collnum_t)0,(char *)&k , req , reqSize ,
				MAX_NICENESS ) )
		return false;

	// . ADD THE C KEY
	// . the key with "c" set to 1 indicates we need to add the meta list
	// . it is the same key we stored the record under, so reuse "k"
	// . we checked for room above, so a failed add is a sanity failure
	if ( m_qt.addKey ( &k ) < 0 ) { char *xx=NULL;*xx=0; }

	// . ADD THE D KEY
	// . and a key with the "d" set to indicate we need to delete the meta
	//   list (after we got all requests, replies and added it ourselves)
	key128_t k3 = makeKey (0,0,0,1,tid,sid,zid,1);
	if ( m_qt.addKey ( &k3 ) < 0 ) { char *xx=NULL;*xx=0; }

	// . add the individual checkoff keys
	// . 1 key per hostid in our mirror group
	// . we delete these keys using msg1 on successful transmissions
	int32_t  nh    = g_hostdb.getNumHostsPerShard();
	Host    *hosts = g_hostdb.getMyShard();
	for ( int32_t i = 0 ; i < nh ; i++ ) {
		// get it
		Host *h = &hosts[i];
		// skip if us
		if ( h == g_hostdb.m_myHost ) continue;
		// these actually have a twin id (tid)
		int32_t tid2 = h->m_hostId;
		// set "a" to 1 to indicate we need to send a checkoff request
		// set "b" to 1 to indicate we need to recv a checkoff request
		key128_t k4 = makeKey ( 1,0,0,0,tid2,sid,zid,1 );
		key128_t k5 = makeKey ( 0,1,0,0,tid2,sid,zid,1 );
		// . ADD THE A KEY
		if ( m_qt.addKey ( &k4 ) < 0 ) { char *xx=NULL;*xx=0; }
		// . if we already did recv the check off request, delete it!
		key128_t ck = makeKey ( 0,1,0,0,tid2,sid,zid,0 );
		int32_t  dn = m_qt.getNode ( 0, (char *)&ck ) ;
		// hey, we annihilated with the positive key. i guess our
		// twin is ahead of us!
		if ( dn >= 0 ) m_qt.deleteNode( dn , false );
		// . ADD the B KEY
		// . we did not annihilate with a negative key
		else if ( m_qt.addKey ( &k5 ) < 0 ) { char *xx=NULL;*xx=0; }
	}

	logf(LOG_DEBUG,"syncdb: added a b c and d keys to quick tree "
	     "tid=%"UINT32" sid=%"UINT32" zid=%"UINT64"",(int32_t)tid,(int32_t)sid,(int64_t)zid);

	// success
	return true;
}

// . QUICKTREE KEY =
//
// . abcd0000 iiiiiiii iiiiiiii iiiiiiii  i = twinHostId
// . 00000000 ssssssss ssssssss ssssssss  s = original msg4 sender hostid
// . zzzzzzzz zzzzzzzz zzzzzzzz zzzzzzzz  z = zid
// . zzzzzzzz zzzzzzzz zzzzzzzz zzzzzzzD
// .
// . "a" is 1 to indicate we need to send a checkoff request to host "i"
// . "b" is 1 to indicate we need to recv a checkoff request from host "i"
// . "c" is 1 to indicate we need to add the meta list to our rdbs
// . "d" is 1 to indicate we need to delete the meta list from syncdb
// . "D" is 0 to indicate a delete key (negative key) (task completed)
// . "s" = "sid", hostid that sent the msg request originally = "sender id"
// . "z" = "zid", the sender's "transaction id" for that msg4. always increases
// . "i" = "tid", hostid of the twin in our group that we are chatting with
//   to make sure he adds this msg4 add request reliably
// . only the low 24 bits of "tid" and of "sid" are stored in the key
// . crashes on purpose if "zid" has its low bit set while "delBit" is 0,
//   since that bit of n0 is reserved for the del bit
key128_t Syncdb::makeKey ( char a , char b , char c , char d ,
			   uint32_t tid , uint32_t sid , uint64_t zid ,
			   char delBit ) {
	// sanity check, low bit must be reserved for del bit
	if ( (zid & 0x01) && ! delBit ) { char *xx=NULL;*xx=0; }
	// pack the four flag bits as "abcd"
	uint64_t flags = 0;
	if ( a ) flags |= 0x08;
	if ( b ) flags |= 0x04;
	if ( c ) flags |= 0x02;
	if ( d ) flags |= 0x01;
	// flags land in the top nibble of the upper 32 bits, tid below them
	uint64_t hi = ( flags << (4+24) ) | ( tid & 0x00ffffff );
	// make room for sid in the lower 32 bits
	hi = ( hi << 32 ) | ( sid & 0x00ffffff );
	key128_t key;
	key.n1 = hi;
	// zid, with the del bit in the lowest bit
	key.n0 = zid;
	if ( delBit ) key.n0 |= 0x01;
	return key;
}

// . ***** META LIST PRE-ADD LOOP *****
// . scan for meta lists to add
// . scan m_qt (quickTree) for keys with the "c" bit set
// . the "c" bit means we need to add the meta list for that sid/zid
// . add meta list keys to the m_addMe[] array so loop2() can loop over syncdb
//   itself and add the meta lists corresponding to the keys in m_addMe[]
//   since the meta list may be on disk cuz they are very big (32k+)
// . call this once per second or so, or back-to-back if we "did something"
// . this loop will set the m_addMe[] array to keys in syncdb of metalists
//   that we can finally add!
void Syncdb::loop1 ( ) {
	// do not re-call this loop this round in bigLoop() function
	m_calledLoop1 = true;
	// get the group we are in and count how many of its hosts are alive
	Host    *shard     = g_hostdb.getMyShard();
	int32_t  shardSize = g_hostdb.getNumHostsPerShard();
	int32_t  numAlive  = 0;
	for ( int32_t h = 0 ; h < shardSize ; h++ ) {
		if ( g_hostdb.isDead ( &shard[h] ) ) continue;
		numAlive++;
	}
	// reset the schedule that loop2() consumes
	m_na = 0;
	m_ia = 0;
	// . if we do NOT have 2+ alive hosts, including ourselves, bail
	// . at least two twins a group must be alive to do an add otherwise
	//   we risk serious data loss, since hard drives tend to die mostly
	//   when writing!
	if ( numAlive < 2 ) return;
	// . loop over the meta lists we need to add ("c" keys)
	// . use a "tid" of 0
	const int32_t tid = 0;
	key128_t firstKey = makeKey(0,0,1,0,tid,0,0,0);
	key128_t lastKey  = makeKey(0,0,1,0,tid,0xffffffff,0xffffffffffffffffLL,1);
	// get the first node in sequence, if any. 0 = collnum
	int32_t node = m_qt.getNextNode ( 0 , (char *)&firstKey );
	for ( ; node >= 0 ; node = m_qt.getNextNode ( node ) ) {
		// breathe
		QUICKPOLL ( MAX_NICENESS );
		key128_t cur = *(key128_t *)m_qt.getKey ( node );
		// stop when we hit the end
		if ( cur > lastKey ) break;
		uint64_t zid = getZid ( &cur );
		uint32_t sid = getSid ( &cur );
		// all checkoff requests for it were acknowledged?
		if ( sentAllCheckoffRequests ( sid , zid ) ) {
			// schedule it for an add in loop2() below
			m_addMe [ m_na++ ] = cur;
			// crap no room! wait for it to be used up!
			if ( m_na >= MAX_TO_ADD ) return;
			continue;
		}
		// . a "need to send checkoff request" key is still in the
		//   quick tree for an alive twin, so we never got a good
		//   reply for it, and we require that
		// . no use banging away at this sid any more. seek to the
		//   FIRST meta list key of the next sender id
		key128_t skipTo = makeKey ( 0,0,1,0,tid,sid+1,0,0 );
		// undo the m_qt.getNextNode(node) the for loop will do
		node = m_qt.getPrevNode ( 0 , (char *)&skipTo );
		// sanity check
		if ( node < 0 ) { char *xx=NULL; *xx=0; }
	}
}

// . have we sent all the checkoff requests (and got replies) for this sid/zid?
// .
we cannot add a meta list until we have bool Syncdb::sentAllCheckoffRequests ( uint32_t sid , uint64_t zid ) { // get group we are in Host *group = g_hostdb.getMyShard(); // number hosts in group int32_t nh = g_hostdb.getNumHostsPerShard(); // loop over our twins for ( int32_t i = 0 ; i < nh ; i++ ) { // get host Host *h = &group[i]; // skip if us if ( h == g_hostdb.m_myHost ) continue; // skip if dead if ( g_hostdb.isDead ( h ) ) continue; // make the key key128_t k = makeKey(1,0,0,0,h->m_hostId,sid,zid,1); // get it int32_t nn = m_qt.getNode ( 0 , (char *)&k ); // if it is there, that means we have yet to get a reply // for this checkoff request! because once we get a successful // reply for it, we delete it from quick tree if ( nn >= 0 ) return false; } // i guess we are good to go! return true; } // . ***** META LIST ADD LOOP ***** // . add meta lists corresponding to the keys in m_addme[] // . call msg5 in case some got dumped to disk! // . never add zids out of order for the same sid // . returns false if blocked, sets g_errno on error and returns true bool Syncdb::loop2 ( ) { // sanity check if ( ! m_calledLoop1 ) { char *xx=NULL;*xx=0; } loop: // breathe just in case QUICKPOLL ( MAX_NICENESS ); // return if done! if ( m_ia >= m_na ) { m_calledLoop2 = true ; return true; } // . check the tree for the next key to add first!!! // . most of the time this should be the case! RdbTree *stree = &m_rdb.m_tree; // get the key key128_t k = m_addMe[m_ia]; // is it there? int32_t n = stree->getNode ( 0 , (char *)&k ); // yes! easy add... if ( n >= 0 ) { //int32_t reqSize = stree->getDataSize ( n ); char *req = stree->getData ( n ); // get zid from key uint64_t zid1 = getZid ( &k ); // get zid from request uint64_t zid2 = *(uint64_t *)(req+4); // must match! if ( zid1 != zid2 ) { char *xx=NULL;*xx=0; } // . add away using Msg4.cpp // . return with g_errno set on error if ( ! 
addMetaList ( req ) ) return true; // success, delete the key from QUICK TREE m_qt.deleteNode ( 0 , (char *)&k , true ); // . and delete that node (freeData = true) // . no! not until we got all the checkoff requests in! // stree->deleteNode ( n , true ); // advance on success m_ia++; // go back for more goto loop; } // make the key range key128_t sk = k; key128_t ek = k; // make negative sk.n0 &= 0xfffffffffffffffeLL; // do not let sleep ticker call bigLoop m_outstanding = true; // get the meta list from on disk i guess, if not in tree if ( ! m_msg5.getList ( RDB_SYNCDB , (collnum_t)0 , // coll &m_list , (char *)&sk , // startKey (char *)&ek , // endKey 999 , // minRecSizes true , // includeTree? false , // addToCache 0 , // maxCacheAge 0 , // startFielNum -1 , // numFiles NULL , // state gotListWrapper , MAX_NICENESS , true )) // do error correction? return false; // process it. loop back up for another on success! if ( gotList ( ) ) goto loop; // bad. g_errno must be set return false; } void gotListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) { // do not re-loop on error if ( ! g_syncdb.gotList ( ) ) return; // loop backup! g_syncdb.loop2(); } // . send as many zids with a msg4d request as you can // . that way we check a bunch off all at once bool Syncdb::gotList ( ) { // bigLoop() can be called again now m_outstanding = false; // error? if ( g_errno ) { log("sync: had error in msg5: %s",mstrerror(g_errno)); return false; } // int16_tcut RdbList *m = &m_list; // just in case m->resetListPtr(); // get the rec char *rec = m->getCurrentRec(); // get key key128_t k = *(key128_t *)rec; // sanity check if ( k != m_addMe[m_ia] ) { char *xx=NULL;*xx=0;} // . add it using msg4.cpp::addMetaList() // . sets g_errno and returns false on error if ( ! addMetaList ( rec ) ) return false; // we no longer have to add it! m_qt.deleteNode ( 0 , (char *)&k , true ); // point to next m_ia++; // free it m_list.reset(); // success return true; } // . 
***** META LIST DELETE LOOP ***** // . scan for meta lists to remove from syncdb // . check every D KEY // . must NOT have any "need to send request" keys (a bit set) // . must NOT have any "need to recv request" keys (b bit set) // . must NOT have our "need to add" key (c bit set) void Syncdb::loop3 ( ) { // . loop over the meta lists we need to delete // . these are "d" keys // . use a "tid" of 0 key128_t sk = makeKey ( 0,0,0,1,0,0,0,0 ); key128_t ek = makeKey ( 0,0,0,1,0,0xffffffff,0xffffffffffffffffLL,1 ); // get the first node in sequence, if any int32_t nn = m_qt.getNextNode ( 0 , (char *)&sk ); // do the loop for ( ; nn >= 0 ; nn = m_qt.getNextNode ( nn ) ) { // breathe QUICKPOLL ( MAX_NICENESS ); // get key key128_t k = *(key128_t *)m_qt.getKey ( nn ); // stop when we hit the end if ( k > ek ) break; // get zid uint64_t zid = getZid ( &k ); // get sid uint32_t sid = getSid ( &k ); // have we sent/recvd all checkoff requests required? have // we added the meta list? if so, we can nuke it from syncdb if ( ! canDeleteMetaList ( sid, zid ) ) { // no use banging away at this sid any more since we // are missing another action for this one sid++; // find the key of the FIRST meta list we need to add // for this new senderId, "sid" key128_t nk = makeKey ( 0,0,0,1,0,sid,0,0 ); // undo the m_qt.getNextNode(nn) we call in for loop nn = m_qt.getPrevNode ( 0 , (char *)&nk ); // sanity check if ( nn < 0 ) { char *xx=NULL;*xx=0; } // get next key from this new sid continue; } // . make the negative key for syncdb // . it just uses a negative "c" key, with a tid of 0 key128_t dk = makeKey ( 0,0,1,0,0,sid,zid,0); // . add it to syncdb to signifiy a delete // . this returns false and sets g_errno on error if(!m_rdb.addRecord((collnum_t)0,(char *)&dk,NULL,0, MAX_NICENESS)) return; // delete it from quick tree now that we added the negative // key successfully to syncdb int32_t dn = m_qt.getNode ( 0, (char *)&k ); // must be there! if ( ! 
dn ) { char *xx=NULL;*xx=0; } // nuke it m_qt.deleteNode ( dn , true ); } // . success // . do not recall until big loop completes a round m_calledLoop3 = true; } // . have we added the meta list to our rdbs yet? // . can not delete the actual meta list from syncdb until so bool Syncdb::canDeleteMetaList ( uint32_t sid , uint64_t zid ) { // make the "c" key to see if we still need to add the meta list key128_t k = makeKey(0,0,1,0,0,sid,zid,1); // get it int32_t nn = m_qt.getNode ( 0 , (char *)&k ); // if "c" key is there, that means we need to add to our rdb still // so we can not delete it yet! if ( nn >= 0 ) return false; // get group we are in Host *group = g_hostdb.getMyShard(); // . if we need to send some requests still can not delete // . number hosts in group int32_t nh = g_hostdb.getNumHostsPerShard(); // loop over our twins for ( int32_t i = 0 ; i < nh ; i++ ) { // get host Host *h = &group[i]; // skip if us if ( h == g_hostdb.m_myHost ) continue; // check the need to send req key now. the "a" key. k = makeKey(1,0,0,0,h->m_hostId,sid,zid,1); // get it nn = m_qt.getNode ( 0 , (char *)&k ); // if there, we cannot delete it yet if ( nn >= 0 ) return false; // check the recv key now. the "b" key. k = makeKey(0,1,0,0,h->m_hostId,sid,zid,1); // get it nn = m_qt.getNode ( 0 , (char *)&k ); // if there, we cannot delete it yet if ( nn >= 0 ) return false; } // i guess we can delete it, all keys are gone return true; } // . ***** SYNC LOOP ***** // . loop over the checkoff requests we are waiting for. "b" keys. // . send out msg0 requests for meta lists we need // . if we got a 60+ sec old checkoff recvd but never got the meta list // ourselves, then request it!! bool Syncdb::loop4 ( ) { // make the start key. 
a "b" key key128_t k = m_syncKey; // end key key128_t ek; ek = makeKey(0,1,0,0,0xffffffff,0xffffffff,0xffffffffffffffffLL,1); // get next node int32_t nn = m_qt.getNode ( 0 , (char *)&k ); // get group we are in //Host *group = g_hostdb.getMyShard(); // use this for determining approximate age of meta lists int64_t nowms = gettimeofdayInMilliseconds(); // do the loop for ( ; nn >= 0 ; nn = m_qt.getNextNode ( nn ) ) { // breathe QUICKPOLL ( MAX_NICENESS ); // give other loops some time so we can digest these // meta lists we added if ( m_addCount >= 100 ) break; // get key key128_t k = *(key128_t *)m_qt.getKey ( nn ); // stop when we hit the end if ( k > ek ) break; // update m_syncKey = k; // get twin id uint32_t tid = getTid ( &k ); // skip if tid is dead if ( g_hostdb.isDead ( tid ) ) { // skip to next tid! tid++; // . find the key of the FIRST meta list we need to add // for this new senderId, "sid" // . mdw: i reset sid to 0 key128_t nk = makeKey ( 0,1,0,0,tid,0,0,0 ); // undo the m_qt.getNextNode(nn) we call in for loop nn = m_qt.getPrevNode ( 0 , (char *)&nk ); // sanity check if ( nn < 0 ) { char *xx=NULL;*xx=0; } // get next key from this new sid continue; } // . skip if delbit set // . the delbit set means we got the meta list and are // waiting to receive the checkoff request from twin #sid // . when it comes in it will annihilate with this key! // . right now we are only interested in *negative* keys, // those with their delbit cleared if ( (k.n0 & 0x01) == 0x01 ) continue; // . ok we got checkoff request from tid for this sid/zid // . and we have not got the meta list ourselves yet cuz // we would have had k with the delbit set. // . 
get zid uint64_t zid = getZid ( &k ); // get sid uint32_t sid = getSid ( &k ); // get its approximate age int64_t age = nowms - zid; // go to next sid if not 60 seconds yet for this one if ( age < 60000 ) { // no use banging away at this sid any more since we // are missing another action for this one sid++; // find the key of the FIRST meta list we need to add // for this new senderId, "sid". make the "b" key. key128_t nk = makeKey ( 0,1,0,0,tid,sid,0,0 ); // undo the m_qt.getNextNode(nn) we call in for loop nn = m_qt.getPrevNode ( 0 , (char *)&nk ); // sanity check if ( nn < 0 ) { char *xx=NULL;*xx=0; } // get next key from this new sid continue; } // make the "d" key to see if we got the meta list just // to make sure! we do not want to re-add a meta list! key128_t dk = makeKey(0,0,0,1,0,sid,zid,1); if ( m_qt.getNode ( 0 , (char *)&dk ) >= 0 ) continue; // note it log("sync: requesting meta list sid=%"UINT32" zid=%"UINT64" age=%"UINT64" " "from twin hostid #%"INT32"", (uint32_t)sid, (uint64_t)zid, (uint64_t)age, (int32_t)tid); // i guess we are out of sync g_hostdb.m_myHost->m_inSync = false; // do not let sleep ticker call bigLoop m_outstanding = true; // record the sid m_requestedSid = sid; // make the c key range key128_t sk2 = makeKey(0,0,1,0,0,sid,zid,0); key128_t ek2 = makeKey(0,0,1,0,0,sid,zid,1); // ok, ask tid for this meta list if ( ! m_msg0.getList ( tid , // hostId 0 , // ip 0 , // port 0 , // maxCacheAge false , // addToCache RDB_SYNCDB , 0 , // collnum &m_list , (char *)&sk2 , (char *)&ek2 , 999 , // minRecSizes NULL , gotListWrapper4 , MAX_NICENESS , false ))// err correction? return false; // stop on error. return true with g_errno set on error if ( ! gotList4() ) return true; } // we completed m_calledLoop4 = true; return true; } void gotListWrapper4 ( void *state ) { // returns false if this had an error if ( ! 
g_syncdb.gotList4() ) return; // otherwise, resume the loop g_syncdb.loop4(); } // returns false and sets g_errno on error bool Syncdb::gotList4 ( ) { // bigLoop() can call us again now m_outstanding = false; // return false on error if ( g_errno ) return false; // should NOT be empty if ( m_list.isEmpty() ) { // maybe this host is not in sync? g_errno = EBADENGINEER; log("sync: twin did not have zid but we had checkoff " "request from twin."); return false; } // get the reply, should be a msg4 request char *req = m_list.getCurrentData(); int32_t reqSize = m_list.getCurrentDataSize(); // . return false if this had an error, g_errno should be set // . this will be just like we got the request from msg4 directly! if ( ! gotMetaListRequest ( req , reqSize , m_requestedSid ) ) return false; // . count adds // . after 100 we move on to another loop so that we can digest // these meta lists! m_addCount++; // advance the key. do not repeat this key since we were successful m_syncKey += 1; return true; } // . ***** CHECKOFF LOOP ***** // . loop over the checkoff requests TO SEND // . send checkoff requests out that we need to send and have not yet got // a reply for. when we get a reply for them the "need to send checkoff // request" key is immediately deleted // . make a big list then call msg1 with an rdbId of RDB_SYNCDB // . 
do not include dead hosts void Syncdb::loop5 ( ) { // make the start key key128_t k = m_nextk; // end key key128_t ek; ek = makeKey(1,0,0,0,0xffffffff,0xffffffff,0xffffffffffffffffLL,1); // reset m_nk = 0; // get it int32_t nn = m_qt.getNextNode ( 0 , (char *)&k ); // do one tid at a time uint32_t startTid = getTid ( &k ); // do the loop for ( ; nn >= 0 ; nn = m_qt.getNextNode ( nn ) ) { // breathe QUICKPOLL ( MAX_NICENESS ); // get key key128_t k = *(key128_t *)m_qt.getKey ( nn ); // save it m_nextk = k; // add one m_nextk += 1; // stop when we hit the end if ( k > ek ) break; // get twin id uint32_t tid = getTid ( &k ); // stop if tid changed if ( tid != startTid && m_nk >= 0 ) break; // skip if tid is dead if ( g_hostdb.isDead ( tid ) ) { // skip to next tid! tid++; // . find the key of the FIRST meta list we need to add // for this new senderId, "sid" // . mdw: i reset sid to 0 key128_t nk = makeKey ( 1,0,0,0,tid,0,0,0 ); // undo the m_qt.getNextNode(nn) we call in for loop nn = m_qt.getPrevNode ( 0 , (char *)&nk ); // bail if no good if ( nn < 0 ) break; // get next key from this new sid continue; } // get zid uint64_t zid = getZid ( &k ); // get sid uint32_t sid = getSid ( &k ); // note it log("sync: storing key for meta request list sid=%"UINT32" zid=%"UINT64" " "from twin hostid #%"INT32"",(uint32_t)sid, (uint64_t)zid,(int32_t)tid); // make the key. make NEGATIVE "b" keys. m_keys [ m_nk++ ] = makeKey ( 0,1,0,0,0,sid,zid,0 ); // stop if full if ( m_nk >= MAX_CHECKOFF_KEYS ) break; } // all done? if ( k <= ek && m_nk <= 0 ) { m_calledLoop5 = true; return; } // . add the whole list at once // . 
make our own msg1 request to send to just one host if ( g_udpServer.sendRequest ( (char *)m_keys , m_nk * 16 , 0x5d , // SYNCDB REQUEST 0 , // ip 0 , // port startTid , // hostId NULL , // retSlot NULL , // state addedListWrapper5 , // wrapper 60 , // timeout -1 , // backoff -1 , // maxWait NULL , // replyBuf 0 , // replyBufSize MAX_NICENESS )) // it blocked, return now return; // i guess we had an error... if ( ! addedList5() ) return; } void addedListWrapper5 ( void *state , UdpSlot *slot ) { // return on error if ( ! g_syncdb.addedList5() ) return; // otherwise, loop back up g_syncdb.loop5(); } // yay, we got a reply to the checkoff request we sent bool Syncdb::addedList5 ( ) { // are we missing a previous zid? then we are not in sync! //if ( g_errno == ENOSYNC ) g_hostdb.m_myHost->m_inSync = false; // error? if ( g_errno ) return false; // . remove from tree, we successfully added! // . make a fake list if ( ! m_qt.deleteKeys ( 0 , (char *)m_keys , m_nk ) ) { // sanity check - must all be in quicktree char *xx=NULL; *xx=0; } // success return true; } // . did we receive a checkoff request from a fellow twin? // . request is a list of checkoff request keys ("a" keys) void handleRequest5d ( UdpSlot *slot , int32_t netnice ) { // get the sending hostid int32_t sid = slot->m_hostId; // sanity check if ( sid < 0 ) { char *xx=NULL; *xx=0; } // get the request buffer //key128_t *keys = (key128_t *)slot->m_readBuf; int32_t nk = slot->m_readBufSize / 16; // int16_tcut UdpServer *us = &g_udpServer; // if tree gets full, then return false forever if ( ! g_syncdb.m_qt.hasRoomForKeys ( nk ) ) { us->sendErrorReply ( slot , ETRYAGAIN ); return; } for ( int32_t i = 0 ; i < nk ; i++ ) { // get the key key128_t k = g_syncdb.m_keys[i]; // sanity check. must be a negative key. if ( (k.n0 & 0x1) != 0x0 ) { char *xx=NULL;*xx=0; } // get the anti key. 
the "need to recv checkoff request" // key which is the positive key128_t pk = k; // make it positive pk.n0 |= 0x01; // is it in there? int32_t nn = g_syncdb.m_qt.getNode ( 0 , (char *)&pk ); // if yes, nuke it. they annihilate. if ( nn >= 0 ) { g_syncdb.m_qt.deleteNode ( nn , true ); continue; } // . otherwise, add right to the tree // . should always succeed! if ( g_syncdb.m_qt.addKey(&k)<0) { char *xx=NULL;*xx=0; } } // return empty reply to mean success us->sendReply_ass ( NULL , 0 , NULL , 0 , slot ); } static void sleepWrapper ( int fd , void *state ) { g_syncdb.bigLoop(); } void Syncdb::bigLoop ( ) { // try to launch our sync if ( m_doRcp ) rcpFiles(); // we got a msg0 or msg5 outstanding. wait for it to come back first. if ( m_outstanding ) return; if ( ! m_calledLoop1 ) { loop1(); return; } if ( ! m_calledLoop2 ) { loop2(); return; } if ( ! m_calledLoop3 ) { loop3(); return; } if ( ! m_calledLoop4 ) { loop4(); return; } if ( ! m_calledLoop5 ) { loop5(); return; } // if done calling loop4(), reset this loopReset(); } void Syncdb::loopReset() { m_outstanding = false; // reset flags m_calledLoop1 = false; m_calledLoop2 = false; m_calledLoop3 = false; m_calledLoop4 = false; m_calledLoop5 = false; m_syncKey = makeKey(0,1,0,0,0,0,0,0); m_addCount = 0; // if done calling loop5(), reset this m_nextk = makeKey ( 1,0,0,0,0,0,0,0 ); } // reset rdb void Syncdb::reset() { m_rdb.reset(); m_qt.reset(); } bool Syncdb::registerHandlers ( ) { // register ourselves with the udp server if ( ! g_udpServer.registerHandler ( 0x5d, handleRequest5d ) ) return false; // register ourselves with the udp server if ( ! g_udpServer.registerHandler ( 0x59, handleRequest55 ) ) return false; return true; } bool Syncdb::init ( ) { // reset loopReset(); m_doRcp = false; m_rcpStarted = false; // setup quick tree if ( ! m_qt.set ( 0 , // fixedDataSize 300000 , // 300k nodes true , // balance? -1 , // maxmem, no max false , // ownData? "tresyncdb" , false , // dataInPtrs? 
"quicktree" , // dbname 16 , // keySize false ))// useProtection? return false; BigFile f; f.set ( g_hostdb.m_dir , "quicktree.dat" ); // only load if it exists bool exists = f.doesExist(); // load it if ( exists && ! m_qt.fastLoad( &f , &m_stack ) ) return log("sync: quicktree.dat load failed: %s", mstrerror(g_errno)); // done f.close(); // assume permanently out of sync int32_t val = 2; // load the insync.dat file f.set ( g_hostdb.m_dir , "insync.dat" ); // fail on open failure if ( ! f.open ( O_RDONLY ) ) return false; // if not there, permanently out of sync if ( ! f.doesExist() ) log("sync: insync.dat does not exist. Assuming host is " "unrecoverable."); else { // get the value char buf[20]; int32_t n = f.read ( &buf , 10 , 0 ) ; if ( n <= 0 ) return log("sync: read insync.dat: %s", mstrerror(g_errno)); // must be digit if ( ! is_digit ( buf[0] ) ) return log("sync: insync.dat has no number in it."); // unlink it if ( ! f.unlink() ) return log("sync: failed to unlink insync.dat: %s", mstrerror(g_errno)); // get the value val = atol ( buf ); } // bad val? if ( val < 0 || val > 2 ) return log("sync: insync.dat had bad value of %"INT32"",val); // report if in sync or not if ( val == 0 ) log("sync: insync.dat says out of sync"); if ( val == 1 ) log("sync: insync.dat says in sync"); if ( val == 2 ) log("sync: insync.dat says PERMANENTLY out of sync"); // set it Host *h = g_hostdb.m_myHost; if ( val == 1 ) h->m_inSync = 1; if ( val == 2 ) h->m_isPermanentOutOfSync = 1; // call this once per second if ( ! g_loop.registerSleepCallback ( 1000 , NULL , sleepWrapper ) ) return false; // 10 MB int32_t maxTreeMem = 10000000; // . what's max # of tree nodes? // . key+4+left+right+parents+dataPtr = 12+4 +4+4+4+4 = 32 // . 28 bytes per record when in the tree int32_t maxTreeNodes = maxTreeMem / ( 16 + 1000 ); // . initialize our own internal rdb // . records are actual msg4 requests received from Msg4 // . 
the key is formed calling Syncdb::makeKey() which is based on // the tid, sid and zid of the msg4 request, where tid is the // twin hostid we are chatting with in our group, sid is the // ORIGINAL sending hostid of the msg4 request, and zid is the // kinda transaction #, and is unique. if ( ! m_rdb.init ( g_hostdb.m_dir , "syncdb" , true , // dedup -1 , // dataSize is variable 50 , // min files to merge maxTreeMem , maxTreeNodes , // maxTreeNodes , true , // balance tree? 50000 , // maxCacheMem , 100 , // maxCacheNodes , false , // half keys? false , // save cache? NULL , // page cache false , // is titledb false , // preload disk page cache 16 , // key size false , // bias disk page cache? true ))// is collectionless? return false; // add the coll //if ( ! g_syncdb.m_rdb.addColl ( "dummy" ) ) return true; // reset quick tree? if ( ! h->m_isPermanentOutOfSync ) return true; // clear it all! m_qt.clear(); // add the base since it is a collectionless rdb return m_rdb.addRdbBase1 ( NULL ); } // . save our crap // . returns false and sets g_errno on error bool Syncdb::save ( ) { g_errno = 0; // save the quick tree m_qt.fastSave ( g_hostdb.m_dir , "synctree" , false , NULL , NULL ); // error? if ( g_errno ) return log("sync: save: %s",mstrerror(g_errno)); // save the insync.dat file Host *h = g_hostdb.m_myHost; File f; f.set ( g_hostdb.m_dir , "insync.dat" ); // open it if ( ! 
f.open ( O_RDWR | O_CREAT ) ) return log("sync: open insync.dat: %s",mstrerror(g_errno)); char buf[2]; buf[0] = '0'; buf[1] = '\0'; if ( h->m_inSync ) buf[0] = '1'; if ( h->m_isPermanentOutOfSync ) buf[0] = '2'; if ( f.write ( buf , 2 , 0 ) != 2 ) return log("sync: write insync.dat: %s",mstrerror(g_errno)); return true; } bool Syncdb::verify ( char *coll ) { log ( LOG_INFO, "db: Verifying Syncdb for coll %s...", coll ); g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; RdbList list; key_t startKey; key_t endKey; startKey.setMin(); endKey.setMax(); CollectionRec *cr = g_collectiondb.getRec(coll); if ( ! msg5.getList ( RDB_SYNCDB , cr->m_collnum , &list , startKey , endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , &msg5b , true )) { g_threads.enableThreads(); return log("db: HEY! it did not block"); } int32_t count = 0; int32_t got = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); count++; //uint32_t groupId = getGroupId ( RDB_SYNCDB , &k ); //if ( groupId == g_hostdb.m_groupId ) got++; uint32_t shardNum = getShardNum ( RDB_SYNCDB , (char *)&k ); if ( shardNum == getMyShardNum() ) got++; } if ( got != count ) { log ("db: Out of first %"INT32" records in syncdb, " "only %"INT32" belong to our group.",count,got); // exit if NONE, we probably got the wrong data if ( got == 0 ) log("db: Are you sure you have the " "right " "data in the right directory? " "Exiting."); log ( "db: Exiting due to Syncdb inconsistency." ); g_threads.enableThreads(); return g_conf.m_bypassValidation; } log ( LOG_INFO, "db: Syncdb passed verification successfully for " "%"INT32" recs.", count ); // DONE g_threads.enableThreads(); return true; } // . returns false on error // . called from PageHosts.cpp!!! 
bool Syncdb::syncHost ( int32_t syncHostId ) { Host *sh = g_hostdb.getHost ( syncHostId ); if ( ! sh ) return log("sync: bad host id %"INT32"",syncHostId); // get its group //Host *hosts = g_hostdb.getGroup ( sh->m_groupId ); Host *hosts = g_hostdb.getShard ( sh->m_shardNum ); // get the best twin for it to sync from for ( int32_t i = 0 ; i < g_hostdb.getNumHostsPerShard() ; i++ ) { // get host Host *h = &hosts[i]; // skip if dead if ( g_hostdb.isDead ( h ) ) continue; // skip if permanent out of sync if ( h->m_isPermanentOutOfSync ) continue; // not itself! it must be dead... wtf!? if ( h == sh ) continue; // save it int32_t tmp = syncHostId; // log it log("sync: sending sync request to host id #%"INT32"",h->m_hostId); // int16_tcut UdpServer *us = &g_udpServer; // use that guy if ( us->sendRequest ( (char *)&tmp , 4 , 0x59 , // SYNCDB REQUEST 0 , // ip 0 , // port h->m_hostId , // hostId NULL , // retSlot NULL , // state gotReplyWrapper55 , // wrapper 15 , // timeout -1 , // backoff -1 , // maxWait NULL , // replyBuf 0 , // replyBufSize MAX_NICENESS )) // success return true; // note it log("sync: had error sending sync request to host id #%"INT32": %s", h->m_hostId,mstrerror(g_errno)); // error! return false; } // none to sync from return log("sync: could not find adequate twin to sync from!"); } void gotReplyWrapper55 ( void *state , UdpSlot *slot ) { if ( g_errno ) log("sync: got error reply from sync host: %s", mstrerror(g_errno)); else log("sync: got success reply from sync host"); } // rcp our files to a new host (was a spare) void handleRequest55 ( UdpSlot *slot , int32_t netnice ) { // who should we copy to? char *p = slot->m_readBuf; int32_t size = slot->m_readBufSize; UdpServer *us = &g_udpServer; // get hostid int32_t hostId = -1; if ( size == 4 ) hostId = *(int32_t *)p; // panic? if ( hostId < 0 || hostId >= g_hostdb.getNumHosts() ) { us->sendErrorReply ( slot , EBADENGINEER ); return; } // set this. 
RdbBase will turn off all merging Host *h = g_hostdb.getHost ( hostId ); // only one at a time now! if ( g_hostdb.m_syncHost ) { log ( "sync: Already syncing with another host. Aborting."); us->sendErrorReply ( slot , EBADENGINEER ); return; } // can't sync with ourselves if ( h == g_hostdb.m_myHost ) { log ( "sync: Cannot sync with yourself. Aborting."); us->sendErrorReply ( slot , EBADENGINEER ); return; } if ( ! g_hostdb.isDead ( h ) ) { log ( "sync: Cannot sync live host. Aborting."); us->sendErrorReply ( slot , EBADENGINEER ); return; } Host *me = g_hostdb.m_myHost; if ( me->m_isPermanentOutOfSync ) { log ( "sync: Src host is permanently out of sync. Aborting."); us->sendErrorReply ( slot , EBADENGINEER ); return; } // now check it for a clean directory int32_t ip = h->m_ip; // if that is dead use ip #2 if ( h->m_ping >= g_conf.m_deadHostTimeout && h->m_pingShotgun < g_conf.m_deadHostTimeout ) ip = h->m_ipShotgun; char *ips = iptoa(ip); char cmd[1024]; sprintf ( cmd, "ssh %s \"cd %s; du -b | tail -n 1\" > ./synccheck.txt", ips, h->m_dir ); log ( LOG_INFO, "init: %s", cmd ); system(cmd); int32_t fd = open ( "./synccheck.txt", O_RDONLY ); if ( fd < 0 ) { log( "sync: Unable to open synccheck.txt. Aborting."); us->sendErrorReply ( slot , EBADENGINEER ); return; } int32_t len = read ( fd, cmd, 1023 ); cmd[len] = '\0'; close(fd); // delete the file to make sure we don't reuse it system ( "rm ./synccheck.txt" ); // check the size int32_t checkSize = atol(cmd); if ( checkSize > 4096 || checkSize <= 0 ) { log("sync: Detected %"INT32" bytes in directory to " "sync. Must be empty. 
Is .antiword dir in " "there?", checkSize); us->sendErrorReply ( slot , EBADENGINEER ); return; } // RdbBase will not start any new merges if this m_syncHost is set g_hostdb.m_syncHost = h; // set this flag g_hostdb.m_syncHost->m_doingSync = 1; } void Syncdb::rcpFiles ( ) { // wait for all merging to finish if ( g_merge.isMerging() || g_merge2.isMerging() ) return; // if first time, save it if ( ! m_rcpStarted ) { // do not re-call it this time m_rcpStarted = true; // note it log("sync: Saving all files before rcp'ing to twin."); // save us. return if blocked. we will be re-called if ( ! g_process.save() ) return; } // log the start log ( LOG_INFO, "sync: Copying our data files to host %"INT32".", g_hostdb.m_syncHost->m_hostId ); // start the sync in a thread, complete when it's done if ( g_threads.call ( GENERIC_THREAD , MAX_NICENESS , this , syncDoneWrapper , syncStartWrapper_r ) ) // return on successful launch return ; // error. g_hostdb.m_syncHost->m_doingSync = 0; g_hostdb.m_syncHost = NULL; m_doRcp = false; m_rcpStarted = false; log ( "sync: Could not spawn thread for call to sync " "host. Aborting." 
); } void syncDoneWrapper ( void *state , ThreadEntry *te ) { Syncdb *THIS = (Syncdb*)state; THIS->syncDone(); } void *syncStartWrapper_r ( void *state , ThreadEntry *te ) { Syncdb *THIS = (Syncdb*)state; THIS->syncStart_r(true); return NULL; } //int my_system_r ( char *cmd , int32_t timeout ); int startUp ( void *cmd ); void Syncdb::syncStart_r ( bool amThread ) { // turn this off g_process.m_suspendAutoSave = true; char cmd[1024]; // get synchost best ip char *ips = iptoa ( g_hostdb.getAliveIp ( g_hostdb.m_syncHost ) ); // his dir char *dir = g_hostdb.m_syncHost->m_dir; // use Host *me = g_hostdb.m_myHost; // ours char *mydir = me->m_dir; // generic int32_t err; // loop over every rdb and every data and map file in each rdb for ( int32_t i = 0 ; i < RDB_END ; i++ ) { // skip SYNCDB if ( i == RDB_SYNCDB ) continue; // get that rdb Rdb *rdb = getRdbFromId ( i ); // skip if none if ( ! rdb ) continue; // get coll for ( int32_t j = 0 ; j < rdb->getNumBases() ; j++ ) { // get that base RdbBase *base = rdb->getBase(j);//m_bases[j]; if ( ! base ) continue; // get coll char *coll = base->m_coll; // and num int32_t collnum = base->m_collnum; // make the dir sprintf ( cmd , "ssh %s 'mkdir %scoll.%s.%"INT32"'", ips,dir,coll,collnum); // excecute log ( LOG_INFO, "sync: %s", cmd ); //int err = my_system_r ( cmd, 3600*24 ); //if ( err != 0 ) goto hadError; // copy the files for ( int32_t k = 0 ; k < base->m_numFiles ; k++ ) { // sleep while dumping. we are in a thread. 
if ( base->isDumping() ) sleep ( 1 ); // get map RdbMap *map = base->m_maps[k]; // copy the map file sprintf ( cmd , "rcp %s %s:%scoll.%s.%"INT32"/'", map->getFilename(),ips,dir,coll,collnum); log ( LOG_INFO, "sync: %s", cmd ); // MDW: take out for now //if ( ( err = my_system_r ( cmd, 3600*24 ) ) ) goto hadError; // get the file BigFile *f = base->m_files[k]; // loop over each little part file for ( int32_t m = 0 ; m < f->m_numParts ; m++ ) { // get part file File *p = f->m_files[m]; // copy that sprintf ( cmd , "rcp %s %s:%scoll.%s.%"INT32"/'", p->m_filename,ips,dir,coll,collnum); // excecute log ( LOG_INFO, "sync: %s", cmd ); // MDW: take out for now //if ( ( err = my_system_r ( cmd, 3600*24 ) ) ) goto hadError; } } } } // make the dirs sprintf ( cmd , "ssh %s '" "mkdir %s/dict/ ;" "mkdir %s/dict/en/ ;" "mkdir %s/ucdata/ ;" "mkdir %s/.antiword/ ;" "'" , ips, dir, dir, dir, dir ); // excecute log ( LOG_INFO, "sync: %s", cmd ); // MDW: take out for now //if ( ( err = my_system_r ( cmd, 3600*24 ) ) ) goto hadError; // loop over the files in Process.cpp for ( int32_t i = 0 ; i < 99999 ; i++ ) { // null means end if ( ! g_files[i] ) break; sprintf ( cmd , "rcp %s%s %s:%s", mydir,g_files[i],ips,dir); // excecute log ( LOG_INFO, "sync: %s", cmd ); // MDW: take out for now //if ( ( err = my_system_r ( cmd, 3600*24 ) ) ) goto hadError; } // new guy is NOT in sync sprintf ( cmd , "ssh %s 'echo 0 > %sinsync.dat", ips,dir); // excecute log ( LOG_INFO, "sync: %s", cmd ); // MDW: take out for now //if ( ( err = my_system_r ( cmd, 3600*24 ) ) ) goto hadError; // saved files sprintf ( cmd , "rcp %s*-saved.dat %s:%sinsync.dat", mydir,ips,dir); // excecute log ( LOG_INFO, "sync: %s", cmd ); // MDW: take out for now //if ( ( err = my_system_r ( cmd, 3600*24 ) ) ) goto hadError; // completed! 
return; // hadError: log ( "sync: Call to system(\"%s\") had error %s.",cmd,strerror(err)); g_hostdb.m_syncHost->m_doingSync = 0; g_hostdb.m_syncHost = NULL; return; } void Syncdb::syncDone ( ) { // allow autosaves again g_process.m_suspendAutoSave = false; // now make a call to startup the newly synced host if ( ! g_hostdb.m_syncHost ) { log ( "sync: SyncHost is invalid. Most likely a problem " "during the sync. Ending synchost." ); return; } log ( LOG_INFO, "init: Sync copy done. Starting host." ); g_hostdb.m_syncHost->m_doingSync = 0; char cmd[1024]; sprintf(cmd, "./gb start %"INT32"", g_hostdb.m_syncHost->m_hostId); log ( LOG_INFO, "init: %s", cmd ); system(cmd); g_hostdb.m_syncHost = NULL; m_doRcp = false; m_rcpStarted = false; } /* // TODO: Provide verification. bool Syncdb::addColl ( char *coll, bool doVerify ) { if ( ! m_rdb.addColl ( coll ) ) return false; return true; } */