#include "gb-include.h" // i guess both msg0 send requests failed with no route to host, //and they got retired... why didnt they switch to eth1???? #include "Multicast.h" #include "Rdb.h" // RDB_TITLEDB #include "Msg20.h" #include "Profiler.h" #include "Stats.h" #include "Process.h" // up to 10 twins in a group //#define MAX_HOSTS_PER_GROUP 10 // TODO: if we're ordered to close and we still are waiting on stuff // to send we should send as much as we can and save the remaining // slots to disk for sending later?? static void sleepWrapper1 ( int bogusfd , void *state ) ; static void sleepWrapper1b ( int bogusfd , void *state ) ; static void sleepWrapper2 ( int bogusfd , void *state ) ; static void gotReplyWrapperM1 ( void *state , UdpSlot *slot ) ; static void gotReplyWrapperM2 ( void *state , UdpSlot *slot ) ; void Multicast::constructor ( ) { m_msg = NULL; m_readBuf = NULL; m_replyBuf = NULL; m_inUse = false; } void Multicast::destructor ( ) { reset(); } Multicast::Multicast ( ) { constructor(); } Multicast::~Multicast ( ) { reset(); } // free the send/read (request/reply) bufs we pirated from a UdpSlot or // got from the caller void Multicast::reset ( ) { // if this is called while we are shutting down and Scraper has a // MsgE out it cores if ( m_inUse && ! g_process.m_exiting ) { log("net: Resetting multicast which is in use. msgType=0x%hhx", m_msgType); char *xx = NULL; *xx = 0; // destroy the outstanding slots destroySlotsInProgress(NULL); // and undo any sleepwrapper if ( m_registeredSleep ) { g_loop.unregisterSleepCallback ( this , sleepWrapper1); g_loop.unregisterSleepCallback ( this , sleepWrapper2); m_registeredSleep = false; } if ( m_registeredSleep2 ) { g_loop.unregisterSleepCallback ( this ,sleepWrapper1b); m_registeredSleep2 = false; } } if ( m_msg && m_ownMsg ) mfree ( m_msg , m_msgSize , "Multicast" ); if ( m_readBuf && m_ownReadBuf && m_freeReadBuf ) mfree ( m_readBuf , m_readBufMaxSize , "Multicast" ); // . replyBuf can be separate from m_readBuf if g_errno gets set // and sets the slot's m_readBuf to NULL, then calls closeUpShop() // which sets m_readBuf to the slot's readBuf, which is now NULL! // . this was causing the "bad engineer" errors from Msg22 to leak mem if ( m_replyBuf && m_ownReadBuf && m_freeReadBuf && m_replyBuf != m_readBuf ) mfree ( m_replyBuf , m_replyBufMaxSize , "Multicast" ); m_msg = NULL; m_readBuf = NULL; m_replyBuf = NULL; m_inUse = false; m_replyingHost = NULL; } // . an individual transaction's udpSlot is not be removed because we might // get it a reply from it later after it's timeout // . returns false and sets g_errno on error // . caller can now pass in his own reply buffer // . if "freeReplyBuf" is true that means it needs to be freed at some point // otherwise, it's probably on the stack or part of a larger allocate class. bool Multicast::send ( char *msg , long msgSize , uint8_t msgType , bool ownMsg , //unsigned long groupId , unsigned long shardNum, bool sendToWholeGroup , long key , void *state , void *state2 , void (*callback) (void *state , void *state2), long totalTimeout , // in seconds long niceness , bool realtime , long firstHostId , char *replyBuf , long replyBufMaxSize , bool freeReplyBuf , bool doDiskLoadBalancing , long maxCacheAge , key_t cacheKey , char rdbId , long minRecSizes , bool sendToSelf , bool retryForever , class Hostdb *hostdb , long redirectTimeout , class Host *firstHost ) { // make sure not being re-used! 
if ( m_inUse ) { log("net: Attempt to re-use active multicast"); char *xx = NULL; *xx = 0; } // reset to free "m_msg" in case we are being re-used (like by Msg14) //log(LOG_DEBUG, "Multicast: send() 0x%hhx",msgType); reset(); // it is now in use m_inUse = true; // MDW: force this off for now, i'm not sure it helps and i'm tired // of seeing the msg34 timed out msgs. // . crap, but seems like the indexdb lookups are getting biased!! //doDiskLoadBalancing = false; // set the parameters in this class m_msg = msg; m_ownMsg = ownMsg; m_ownReadBuf = true; m_freeReadBuf = freeReplyBuf; m_msgSize = msgSize; m_msgType = msgType; //m_groupId = groupId; m_shardNum = shardNum; m_sendToWholeGroup = sendToWholeGroup; m_state = state; m_state2 = state2; m_callback = callback; m_totalTimeout = totalTimeout; // in seconds m_niceness = niceness; m_realtime = realtime; // this can't be -1 i guess if ( totalTimeout <= 0 ) { char *xx=NULL;*xx=0; } // don't use this anymore! if ( m_realtime ) { char *xx = NULL; *xx = 0; } m_replyBuf = replyBuf; m_replyBufMaxSize = replyBufMaxSize; m_startTime = getTime(); m_numReplies = 0; m_readBuf = NULL; m_readBufSize = 0; m_readBufMaxSize = 0; m_registeredSleep = false; m_registeredSleep2 = false; m_sendToSelf = sendToSelf; m_retryForever = retryForever; m_sentToTwin = false; // turn it off until it is debugged m_retryForever = false; m_hostdb = hostdb; if ( ! m_hostdb ) m_hostdb = &g_hostdb; m_retryCount = 0; m_key = key; // reset Msg34's m_numRequests/m_numReplies since this may be // the second time send() was called for this particular class instance //m_msg34.reset(); // keep track of how many outstanding requests to a host m_numLaunched = 0; // variables for doing disk load balancing //m_doDiskLoadBalancing = doDiskLoadBalancing; m_maxCacheAge = maxCacheAge; m_cacheKey = cacheKey; m_rdbId = rdbId; m_minRecSizes = minRecSizes; // amount we try to read from disk m_redirectTimeout = redirectTimeout; // clear m_retired, m_errnos, m_slots memset ( m_retired , 0 , sizeof(char ) * MAX_HOSTS_PER_GROUP ); memset ( m_errnos , 0 , sizeof(long ) * MAX_HOSTS_PER_GROUP ); memset ( m_slots , 0 , sizeof(UdpSlot *) * MAX_HOSTS_PER_GROUP ); memset ( m_inProgress , 0 , sizeof(char ) * MAX_HOSTS_PER_GROUP ); // breathe QUICKPOLL(m_niceness); long hostNumToTry = -1; if ( ! firstHost ) { // . get the list of hosts in this group // . returns false if blocked, true otherwise // . sets g_errno on error //Host *hostList = m_hostdb->getGroup ( groupId , &m_numHosts); Host *hostList = g_hostdb.getShard ( shardNum , &m_numHosts ); if ( ! hostList ) { log("mcast: no group");g_errno=ENOHOSTS;return false;} // now copy the ptr into our array for ( long i = 0 ; i < m_numHosts ; i++ ) m_hostPtrs[i] = &hostList[i]; } // // if we are sending to an scproxy then put all scproxies into the // list of hosts // else { // if ( firstHost && (firstHost->m_type & HT_SCPROXY) ) { long np = 0; for ( long i = 0 ; i < g_hostdb.m_numProxyHosts ; i++ ) { // shortcut Host *h = g_hostdb.getProxy(i); if ( ! (h->m_type & HT_SCPROXY ) ) continue; // stop breaching if ( np >= 32 ) { char *xx=NULL;*xx=0; } // assign this if ( h == firstHost ) hostNumToTry = np; // set our array of ptrs of valid hosts to send to m_hostPtrs[np++] = h; } // assign m_numHosts = np; firstHostId = -1; // panic if ( ! np ) { char *xx=NULL;*xx=0; } } // . pick the fastest host in the group // . this should pick the fastest one we haven't already sent to yet if ( ! 
	     m_sendToWholeGroup ) {
		bool retVal = sendToHostLoop (key,hostNumToTry,firstHostId) ;
		// on error, un-use this class
		if ( ! retVal ) m_inUse = false;
		return retVal;
	}
	//if ( ! m_sendToWholeGroup ) return sendToHostLoop ( key , -1 );
	// . send to ALL hosts in this group if sendToWholeGroup is true
	// . blocks forever until sends to all hosts are successful
	sendToGroup ( );
	// . sendToGroup() always blocks, but we return true if no g_errno
	// . we actually keep looping until all hosts get the msg w/o error
	return true;
}

///////////////////////////////////////////////////////
//                                                   //
//                   GROUP SEND                      //
//                                                   //
///////////////////////////////////////////////////////

// . keeps calling itself back on any error
// . resends to host/ip's that had an error, forever
// . callback is only called when transmission to all hosts is successful
// . it does not re-send to hosts whose m_errnos[] entry is 0
// . TODO: deal with errors from g_udpServer::sendRequest() better
// . returns false and sets g_errno on error
void Multicast::sendToGroup ( ) {
	// see if anyone gets an error
	bool hadError = false;
	// . cast the msg to ALL hosts in the m_hosts group of hosts
	for ( long i = 0 ; i < m_numHosts ; i++ ) {
		// cancel any errors
		g_errno = 0;
		// get the host
		Host *h = m_hostPtrs[i];//&m_hosts[i];
		// if we got a nice reply from him skip him
		//slots[i] && m_slots[i]->doneReading() ) continue;
		if ( m_retired[i] ) continue;
		// sometimes msg1.cpp is able to add the data to the tree
		// without problems and will save us a network trans here
		if ( ! m_sendToSelf &&
		     h->m_hostId == g_hostdb.m_hostId &&
		     m_hostdb == &g_hostdb &&
		     ! g_conf.m_interfaceMachine ) {
			m_retired[i] = true;
			m_errnos [i] = 0;
			m_numReplies++;
			continue;
		}
		// . timeout is in seconds
		// . timeout is just the time remaining for the whole groupcast
		// long timeout = m_startTime + m_totalTimeout - getTime();
		// . since we now must get non-error replies from ALL hosts
		//   in the group we no longer have a "totalTimeout" per se
		// reset the g_errno for host #i
		m_errnos [i] = 0;
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us = &g_udpServer;
		if ( m_realtime ) us = &g_udpServer2;
		// send to the same port as us!
		short destPort = h->m_port;
		//if ( m_realtime ) destPort = h->m_port2;
		// if from hosts2.conf pick the best ip!
		long bestIp = h->m_ip;
		if ( m_hostdb == &g_hostdb2 )
			bestIp = g_hostdb.getBestHosts2IP ( h );
		// retire the host to prevent resends
		m_retired [ i ] = true;
#ifdef _GLOBALSPEC_
		// debug message for global spec
		//logf(LOG_DEBUG,"net: mcast state=%08lx",(long)this);
#endif
		long hid = h->m_hostId;
		if ( m_hostdb != &g_hostdb ) hid = -1;
		// . send to a single host
		// . this creates a transaction control slot, "udpSlot"
		// . returns false and sets g_errno on error
		if ( us->sendRequest ( m_msg             ,
				       m_msgSize         ,
				       m_msgType         ,
				       bestIp            , // h->m_ip ,
				       destPort          ,
				       hid               ,
				       &m_slots[i]       ,
				       this              , // state
				       gotReplyWrapperM2 ,
				       m_totalTimeout    ,
				       -1                , // backoff
				       -1                , // max wait in ms
				       m_replyBuf        ,
				       m_replyBufMaxSize ,
				       m_niceness        )) { // cback niceness
#ifdef _GLOBALSPEC_
			// note the slot ptr for reference
			//logf(LOG_DEBUG, "net: mcast slotPtr=%08lx",
			//     (long)&m_slots[i]);
#endif
			continue;
		}
		// g_errno must have been set, remember it
		m_errnos [ i ] = g_errno;
		// we had an error
		hadError = true;
		// bring him out of retirement to try again later in time
		m_retired[i] = false;
		// log the error
		log("net: Got error sending add data request (0x%hhx) "
		    "to host #%li: %s. "
		    "Sleeping one second and retrying.",
		    m_msgType,h->m_hostId,mstrerror(g_errno) );
		// . clear it, we'll try again
		// . if we don't clear it, Msg1::addList(), which returns
		//   true if it did not block and false if it did, will pick
		//   up on it and weird things might happen.
		g_errno = 0;
		// continue if we're already registered for sleep callbacks
		if ( m_registeredSleep ) continue;
		// otherwise register for sleep callback to try again
		g_loop.registerSleepCallback (5000/*ms*/,this,sleepWrapper2,
					      m_niceness);
		m_registeredSleep = true;
	}
	// if we had an error then we'll be called again in a second
	if ( hadError ) return;
	// otherwise, unregister sleep callback if we had no error
	if ( m_registeredSleep ) {
		g_loop.unregisterSleepCallback ( this , sleepWrapper2 );
		m_registeredSleep = false;
	}
}

void sleepWrapper2 ( int bogusfd , void *state ) {
	Multicast *THIS = (Multicast *)state;
	// try another round of sending to see if hosts had errors or not
	THIS->sendToGroup ( );
}

// C wrapper for the C++ callback
void gotReplyWrapperM2 ( void *state , UdpSlot *slot ) {
	Multicast *THIS = (Multicast *)state;
	THIS->gotReply2 ( slot );
}

// . otherwise, we were sending to a whole group so ALL HOSTS must produce a
//   successful reply
// . we keep re-trying forever until they do
void Multicast::gotReply2 ( UdpSlot *slot ) {
	// don't ever let UdpServer free this send buf (it is m_msg)
	slot->m_sendBufAlloc = NULL;
	// save this for msg4 logic that calls injection callback
	m_slot = slot;
	// . log the error
	// . ETRYAGAIN often happens when we are falling too far behind in
	//   our merging (see Rdb.cpp) and we enter urgent merge mode
	// . it may also happen if tree is too full and is being dumped to disk
	//if ( g_errno && g_errno != ETRYAGAIN )
	//	log("net: Got error reply sending to a host during a "
	//	    "group send: %s.", mstrerror(g_errno) );
	// set m_errnos for this slot
	long i;
	for ( i = 0 ; i < m_numHosts ; i++ )
		if ( m_slots[i] == slot ) break;
	// if it matched no slot that's weird
	if ( i == m_numHosts ) {
		//log("not our slot: mcast=%lu",(long)this);
		log(LOG_LOGIC,"net: multicast: Not our slot.");
		return;
	}
	// clear a timeout error on dead hosts
	if ( g_conf.m_giveupOnDeadHosts &&
	     g_hostdb.isDead ( m_hostPtrs[i]->m_hostId ) ) {
		log ( "net: GIVING UP ON DEAD HOST! This will not "
		      "return an error." );
		g_errno = 0;
	}
	// set m_errnos to g_errno, if any
	m_errnos[i] = g_errno;
	// if g_errno was not set we have a legit reply
	if ( ! g_errno ) m_numReplies++;
	// reset g_errno in case we do more sending
	g_errno = 0;
	// . if we got all the legit replies we're done, call the callback
	// . all slots should be destroyed by UdpServer in this case
	if ( m_numReplies >= m_numHosts ) {
		// allow us to be re-used now, callback might relaunch
		m_inUse = false;
		if ( m_callback ) {
			// unsigned long long profilerStart,profilerEnd;
			// unsigned long long statStart,statEnd;
			//if(g_conf.m_profilingEnabled){
			//	address=(long)m_callback;
			//	g_profiler.startTimer(address,
			//			      __PRETTY_FUNCTION__);
			//}
			//g_loop.startBlockedCpuTimer();
			m_callback ( m_state , m_state2 );
			//if(g_conf.m_profilingEnabled){
			//	if(!g_profiler.endTimer(address,
			//				__PRETTY_FUNCTION__))
			//		log(LOG_WARN,"admin: Couldn't add the"
			//		    "fn %li", (long)address);
			//}
		}
		return;
	}
	// if this guy had no error then wait for more callbacks
	if ( !
m_errnos[i] ) return; // bring this slot out of retirement so we can send to him again m_retired[i] = false; // do indeed log the try again things, cuz we have gotten into a // nasty loop with them that took me a while to track down bool logIt = false; static long s_elastTime = 0; if ( m_errnos[i] != ETRYAGAIN ) logIt = true; // log it every 10 seconds even if it was a try again else { long now = getTime(); if (now - s_elastTime > 10) {s_elastTime = now; logIt=true;} } // don't log ETRYAGAIN, may come across as bad when it is normal if ( m_errnos[i] == ETRYAGAIN ) logIt = false; // log a failure msg if ( logIt ) { // m_errnos[i] != ETRYAGAIN ) { Host *h = m_hostdb->getHost ( slot->m_ip ,slot->m_port ); if ( h ) log("net: Got error sending request to hostId %li " "(msgType=0x%hhx transId=%li net=%s): " "%s. Retrying.", h->m_hostId, slot->m_msgType, slot->m_transId, m_hostdb->getNetName(),mstrerror(m_errnos[i]) ); else log("net: Got error sending request to %s:%li " "(msgType=0x%hhx transId=%li net=%s): " "%s. Retrying.", iptoa(slot->m_ip), (long)slot->m_port, slot->m_msgType, slot->m_transId, m_hostdb->getNetName(),mstrerror(m_errnos[i]) ); } // . let's sleep for a second before retrying the send // . the g_errno could be ETRYAGAIN which happens if we're trying to // add data but the other host is temporarily full // . continue if we're already registered for sleep callbacks if ( m_registeredSleep ) return ; // . otherwise register for sleep callback to try again // . sleepWrapper2() will call sendToGroup() for us g_loop.registerSleepCallback (5000/*ms*/,this,sleepWrapper2, m_niceness); m_registeredSleep = true; // . this was bad cause it looped incessantly quickly! // . when we finally return, udpServer destroy this slot // . try to re-send this guy again on error // . this should always block // sendToGroup (); } /////////////////////////////////////////////////////// // // // PICK & SEND // // // /////////////////////////////////////////////////////// //static void gotBestHostWrapper ( void *state ) ; // . returns false and sets g_errno on error // . returns true if managed to send to one host ok (initially at least) // . uses key to pick the first host to send to (for consistency) // . after we pick a host and launch the request to him the sleepWrapper1 // will call this at regular intervals, so be careful, if msg34 is in // progress, then just skip it and use pickBestHost! bool Multicast::sendToHostLoop ( long key , long hostNumToTry , long firstHostId ) { // erase any errors we may have got g_errno = 0 ; // ALL multicast classes share "s_lastPing" //static time_t s_lastPing = 0; // . every 10 seconds we send to a random host in this group in // addition to the best host to act like a ping // . this keeps the ping times of all the hosts fresh // . but we should only use msgs NOT of type 0 so we don't send over // huge indexLists! /* if ( m_msgType == 0x36 && getTime() - s_lastPing > 10 ) { // . pick a host other than the best at random // . 
if there's multiple dead hosts we should hit a random one long i = pickRandomHost(); // if we got one, try sending to it if ( i >= 0 ) sendToHost(i); // erase any errors we may have got g_errno = 0 ; // note time of our last ping for ALL multicast classes s_lastPing = getTime(); // the best host long bh = pickBestHost ( key ); // if we sent to the one we were going to already, return now if ( i >= 0 && i == bh ) return true; // if we sent to the one and only host, bail if ( i >= 0 && bh < 0 ) return true; } */ loop: // if it is already in progress... wait for it to get back //if ( m_msg34.isInProgress() ) return true ; long i; // what if this host is dead?!?!? if ( hostNumToTry >= 0 ) // && ! g_hostdb.isDead(hostNumToTry) ) i = hostNumToTry; // try to stay on the same switch if the twin groups are on // different switches //else if ( g_conf.m_splitTwins && m_msgType == 0x00 ) // i = pickBestHost2 (key,-1,true); // . gigablast will often fetch data across the network even if it // is available locally because it thinks it will be faster than // hitting the local disk too much. This is often bad, because local // disk can be a fast scsi and network can be slow. so override // gigablast here // . only use this for msg0s, like for reading termlists, stuff like // msg39 should still be routed without any problems // . this is messing up the biasing logic in Msg51 which calls Msg0 // to get a cluster rec and want to bias the page cache to save mem // . also, Msg0 should handle preferLocalReads itself. it has logic // in there just for that //else if ( g_conf.m_preferLocalReads && // ( m_msgType == 0x00 ) ) { // || m_msgType == 0x22 ) ) { // i = pickBestHost (key,-1/*firstHostId*/,true/*preferLocal?*/); //} // . only requests that typically read from disk should set // m_doDiskLoadBalancing to true // . Msg39 should not do it otherwise host#16 ends up being starved // since host0 is the gateway // . this is called at regular intervals by sleepWrapper1 with // hostNumToTry set to -1 so don't call Msg34 if its already going // . NOTE: this is essentially an advanced replacement for // pickBestHost() so it should return essentially the same values // . pickBestHost() doesn't have a problem returning retired host #s //else if ( m_doDiskLoadBalancing ) { // && hostNumToTry == -1 ) { /* else if ( m_doDiskLoadBalancing && firstHostId < 0 ) { // debug msg //log("getting best host (this=%li)",(long)&m_msg34); // . if multiple hosts have it in memory,prefers the local host // . return true if this blocks if ( ! m_msg34.getBestHost ( m_hosts , m_retired , m_numHosts , m_niceness , m_maxCacheAge , m_cacheKey , m_rdbId , this , gotBestHostWrapper ) ) return true; // . if we did not block then get the winning host // . best hostNum is -1 if none untried or all had errors // . this can return a retired host number if all its twins // and itself are dead i = m_msg34.getBestHostNum (); // if no candidate, try pickBestHost if ( i < 0 ) i = pickBestHost ( key , -1 , false ); } */ // . otherwise we had an error on this host // . pick the next best host we haven't sent to yet // . 
returns -1 if we've sent to all of them in our group, m_hosts //else i = pickBestHost ( (unsigned long)key , firstHostId ); //else i = pickBestHost ( key , -1 , false ); // firstHostId else i = pickBestHost ( key , firstHostId , false ); // firstHostId // do not resend to retired hosts if ( m_retired[i] ) i = -1; // debug msg //if ( m_msgType == 0x39 || m_msgType == 0x00 ) // log("Multicast:: routing msgType=0x%hhx to hostId %li", // m_msgType,m_hosts[i].m_hostId); // . if no more hosts return FALSE // . we need to return false to the caller of us below if ( i < 0 ) { // debug msg //log("Multicast:: no hosts left to send to"); g_errno = ENOHOSTS; return false; } // . send to this guy, if we haven't yet // . returns false and sets g_errno on error // . if it returns true, we sent ok, so we should return true // . will return false if the whole thing is timed out and g_errno // will be set to ETIMEDOUT // . i guess ENOSLOTS means the udp server has no slots available // for sending, so its pointless to try to send to another host if ( sendToHost ( i ) ) return true; // if no more slots, we're done, don't loop! if ( g_errno == ENOSLOTS ) return false; // pointless as well if no time left in the multicast if ( g_errno == EUDPTIMEDOUT ) return false; // otherwise try another host and hope for the best g_errno = 0; key = 0 ; // what kind of error leads us here? EBUFTOOSMALL or EBADENGINEER... hostNumToTry = -1; goto loop; } /* void gotBestHostWrapper ( void *state ) { Multicast *THIS = (Multicast *)state; //long i = THIS->m_msg34.getBestHostNum ( ); // . if we could select none, go with non-intelligent load balancing // . this should still return -1 if all hosts retired though //if ( i < 0 ) i = THIS->pickBestHost ( 0 , -1 , false ); long i = THIS->pickBestHost ( 0 , -1 , false ); // . if we got a candidate to try to send to him // . i is -1 if we could get none // . this also returns false on ENOSLOTS, if no slots available // for sending on. if ( i >= 0 && THIS->sendToHostLoop ( 0 , i , -1 ) ) return; // if i was -1 or sendToHostLoop failed return now if we are still // awaiting a reply... gotReplyWrapperM1() will be called when that // reply comes back and that will call closeUpShop(). if ( THIS->m_numLaunched > 0 ) return; // EUREKA! if the Msg34 replies timeout, that // sets Msg34's LoadPoints m_errno var, and we end up with // no host to try AFTER blocking, which means we're responsible // for closing up shop and calling the callback. // just call the closeUpShop() routine. THIS->closeUpShop ( NULL ); } */ /* long Multicast::pickBestHost2 ( unsigned long key , long firstHostId , bool preferLocal ) { // now select the host on our same network switch long hpg = m_hostdb->m_numHostsPerShard; // . get the hostid range on our switch // . a segment is all the hosts on the same switch long segmentSize = m_hostdb->m_numHosts / hpg; // get our segment long segment = m_hostdb->m_hostId / segmentSize; long i; for ( i = 0 ; i < m_numHosts ; i++ ) { // skip if he's dead if ( m_hostdb->isDead ( &m_hosts[i] ) ) continue; // skip if he's reporting system errors if ( m_hostdb->kernelErrors(&m_hosts[i]) ) continue; // skip if he's not on our segment if ( m_hosts[i].m_hostId / segmentSize != segment ) continue; break; } // return if we got someone in our group if ( i < m_numHosts ) { if ( g_conf.m_logDebugNet ) log(LOG_DEBUG,"net: Splitting request to hostid %li", m_hosts[i].m_hostId); return i; } // if we got nothing, default to this one return pickBestHost ( key , firstHostId , preferLocal ); } */ // . 
pick the fastest host from m_hosts based on avg roundtrip time for ACKs // . skip hosts in our m_retired[] list of hostIds // . returns -1 if none left to pick long Multicast::pickBestHost ( unsigned long key , long firstHostId , bool preferLocal ) { // debug msg //log("pickBestHost manually"); // bail if no hosts if ( m_numHosts == 0 ) return -1; // . should we always pick host on same machine first? // . we now only run one instance of gb per physical server, not like // the old days... so this is somewhat obsolete... MDW /* if ( preferLocal && !g_conf.m_interfaceMachine ) { for ( long i = 0 ; i < m_numHosts ; i++ ) if ( m_hosts[i].m_machineNum == m_hostdb->getMyMachineNum() && ! m_hostdb->isDead ( &m_hosts[i] ) && ! m_hostdb->kernelErrors( &m_hosts[i] ) && ! m_retired[i] ) return i; } */ // . if firstHostId not -1, try it first // . Msg0 uses this only to select hosts on same machine for now // . Msg20 now uses this to try to make sure the lower half of docids // go to one twin and the upper half to the other. this makes the // tfndb page cache twice as effective when getting summaries. if ( firstHostId >= 0 ) { //log("got first hostId!!!!"); // find it in group long i; for ( i = 0 ; i < m_numHosts ; i++ ) if ( m_hostPtrs[i]->m_hostId == firstHostId ) break; // if not found bitch if ( i >= m_numHosts ) { log(LOG_LOGIC,"net: multicast: HostId %li not " "in group.", firstHostId ); char *xx = NULL; *xx = 0; } // if we got a match and it's not dead, and not reporting // system errors, return it if ( i < m_numHosts && ! m_hostdb->isDead ( m_hostPtrs[i] ) && ! m_hostdb->kernelErrors ( m_hostPtrs[i] ) ) return i; } // round robin selection //static long s_lastGroup = 0; //long count = 0; //long i ; //long slow = -1; long numDead = 0; long dead = -1; long n = 0; //long count = 0; bool balance = g_conf.m_doStripeBalancing; // always turn off stripe balancing for all but these 3 msgTypes if ( m_msgType != 0x39 && m_msgType != 0x37 && m_msgType != 0x36 ) balance = false; // . pick the guy in our "stripe" first if we are doing these msgs // . this will prevent a ton of msg39s from hitting one host and // "spiking" it. if ( balance ) n = g_hostdb.m_myHost->m_stripe; // . if key is not zero, use it to select a host in this group // . if the host we want is dead then do it the old way // . ignore the key if balance is true though! MDW if ( key != 0 && ! balance ) { // often the groupId was selected based on the key, so lets // randomize everything up a bit unsigned long i = hashLong ( key ) % m_numHosts; // if he's not dead or retired use him right away if ( ! m_retired[i] && ! m_hostdb->isDead ( m_hostPtrs[i] ) && ! m_hostdb->kernelErrors( m_hostPtrs[i] ) ) return i; } // no no no we need to randomize the order that we try them Host *fh = m_hostPtrs[n]; // if this host is not dead, and not reporting system errors, use him if ( ! m_retired[n] && ! m_hostdb->isDead(fh) && ! m_hostdb->kernelErrors(fh) ) return n; // . ok now select the kth available host // . 
make a list of the candidates long cand[32]; long nc = 0; for ( long i = 0 ; i < m_numHosts ; i++ ) { // get the host Host *h = m_hostPtrs[i]; // count those that are dead or are reporting system errors if ( m_hostdb->isDead ( h ) || m_hostdb->kernelErrors(h) ) numDead++; // skip host if we've retired it if ( m_retired[i] ) continue; // if this host is not dead, and not reporting system errors, // use him if ( !m_hostdb->isDead(h) && !m_hostdb->kernelErrors(h) ) cand[nc++] = i; // pick a dead that isn't retired dead = i; } // if a host was alive and untried, use him next if ( nc > 0 ) { long k = ((unsigned long)m_key) % nc; return cand[k]; } // . come here if all hosts were DEAD // . try sending to a host that is dead, but not retired now // . if all deadies are retired this will return -1 // . sometimes a host can appear to be dead even though it was // just under severe load if ( numDead == m_numHosts ) return dead; // otherwise, they weren't all dead so don't send to a deadie return -1; // . if no host we sent to had an error then we should send to deadies // . TODO: we should only send to a deadie if we haven't got back a // reply from any live hosts!! //if ( numErrors == 0 ) return dead; // . now alive host was found that we haven't tried, so return -1 // . before we were returning hosts that were marked as dead!! This // caused problems when the only alive host returned an error code // because it would take forever for the dead host to timeout... //return -1; // update lastGroup //if ( ++s_lastGroup >= m_numHosts ) s_lastGroup = 0; // return i if we got it //if ( count >= m_numHosts ) return slow; // otherwise return i //return i; } // . pick the fastest host from m_hosts based on avg roundtrip time for ACKs // . skip hosts in our m_retired[] list of hostIds // . returns -1 if none left to pick /* long Multicast::pickBestHost ( ) { long mini = -1; long minPing = 0x7fffffff; // TODO: reset the sublist ptr???? // cast the msg to "hostsPerGroup" hosts in group "groupId" for ( long i = 0 ; i < m_numHosts ; i++ ) { // skip host if we've retired it if ( m_retired[i] ) continue; // get the host Host *h = &m_hosts[i]; // see if we got a new fastest host, continue if not if ( h->m_pingAvg > minPing ) continue; minPing = h->m_pingAvg; mini = i; } // return our candidate, may be -1 if all were picked before return mini; } */ // . pick a random host // . returns -1 if we already sent to that host (he's retired) long Multicast::pickRandomHost ( ) { // select one of the dead hosts at random long i = rand() % m_numHosts ; // if he's retired return -1 if ( m_retired[i] ) return -1; // otherwise, he's a valid candidate return i; } // . returns false and sets error on g_errno // . returns true if kicked of the request (m_msg) // . sends m_msg to host "h" bool Multicast::sendToHost ( long i ) { // sanity check if ( i >= m_numHosts ) { char *xx=NULL;*xx=0; } // sanity check , bitch if retired if ( m_retired [ i ] ) { log(LOG_LOGIC,"net: multicast: Host #%li is retired. " "Bad engineer.",i); //char *xx = NULL; *xx = 0; return true; } // debug msg //log("sending to host #%li (this=%li)",i,(long)&m_msg34); // . add this host to our retired list so we don't try again // . only used by pickBestHost() and sendToHost() m_retired [ i ] = true; // what time is it now? long long nowms = gettimeofdayInMilliseconds(); time_t now = nowms / 1000; // save the time m_launchTime [ i ] = nowms; // sometimes clock is updated on us if ( m_startTime > now ) m_startTime = now ; // . timeout is in seconds // . 
	//   timeout is just the time remaining for the whole groupcast
	long timeRemaining = m_startTime + m_totalTimeout - now;
	// . if timeout is negative then reset start time so we try forever
	// . no, this could be called by a re-route in sleepWrapper1 in which
	//   case we really should timeout.
	// . this can happen if sleepWrapper found a timeout before UdpServer
	//   got its timeout.
	if ( timeRemaining <= 0 ) {
		//m_startTime = getTime();; timeout = m_totalTimeout;}
		//g_errno = ETIMEDOUT;
		// this can happen if the udp reply timed out!!! like if a
		// host is under severe load... with Msg23::getLinkText()
		// or Msg22::getTitleRec() timing out on us. basically, our
		// msg23 request tried to send a msg22 request which timed out
		// on it so it sent us back this error.
		if ( g_errno != EUDPTIMEDOUT )
		  log(LOG_INFO,"net: multicast: had negative timeout, %li. "
		      "startTime=%li totalTimeout=%li now=%li. msgType=0x%hhx "
		      "niceness=%li clock updated?",
		      timeRemaining,m_startTime,m_totalTimeout,now,m_msgType,
		      (long)m_niceness);
		// we are timed out so do not bother re-routing
		//g_errno = ETIMEDOUT;
		//return false;
		// give it a fighting chance of 2 seconds then
		//timeout = 2;
		timeRemaining = m_totalTimeout;
	}
	// get the host
	Host *h = m_hostPtrs[i];
	// if niceness is 0, use the higher priority udpServer
	UdpServer *us = &g_udpServer;
	if ( m_realtime ) us = &g_udpServer2;
	// send to the same port as us!
	short destPort = h->m_port;
	//if ( m_realtime ) destPort = h->m_port2;
	// if from hosts2.conf pick the best ip!
	long bestIp = h->m_ip;
	if ( m_hostdb == &g_hostdb2 ) bestIp = g_hostdb.getBestHosts2IP ( h );
#ifdef _GLOBALSPEC_
	// debug message for global spec
	//logf(LOG_DEBUG,"net: mcast2 state=%08lx",(long)this);
#endif
	// sanity check
	//if ( g_hostdb.isDead(h) ) {
	//	log("net: trying to send to dead host.");
	//	char *xx = NULL;
	//	*xx = 0;
	//}
	// don't set hostid if we're sending to a remote cluster
	long hid = h->m_hostId;
	if ( m_hostdb != &g_hostdb ) hid = -1;
	// if sending to a proxy keep this set to -1
	if ( h->m_type != HT_GRUNT ) hid = -1;
	// max resends. if we resend a request dgram this many times and
	// got no ack, bail out with g_errno set to ENOACK. this is better
	// than the timeout because it takes like 20 seconds to mark a
	// host as dead and takes "timeRemaining" seconds to timeout the
	// request
	long maxResends = -1;
	// . only used for niceness 0
	// . it uses a backoff scheme, increments delay for first few resends:
	// . it starts off at 33ms, then 66, then 132, then 200 from there out
	if ( m_niceness == 0 ) maxResends = 4;
	// . send to a single host
	// . this creates a transaction control slot, "udpSlot"
	// . returns false and sets g_errno on error
	// . returns true on successful launch and calls callback on completion
	if ( ! us->sendRequest ( m_msg             ,
				 m_msgSize         ,
				 m_msgType         ,
				 bestIp            , // h->m_ip ,
				 destPort          ,
				 hid               ,
				 &m_slots[i]       ,
				 this              , // state
				 gotReplyWrapperM1 ,
				 timeRemaining     , // timeout
				 -1                , // backoff
				 -1                , // max wait in ms
				 m_replyBuf        ,
				 m_replyBufMaxSize ,
				 m_niceness        , // cback niceness
				 maxResends        )) {
		log("net: Had error sending msgtype 0x%hhx to host "
		    "#%li: %s. Not retrying.",
		    m_msgType,h->m_hostId,mstrerror(g_errno));
		// i've seen ENOUDPSLOTS available msg here along with oom
		// condition...
//char *xx=NULL;*xx=0; return false; } // mark it as outstanding m_inProgress[i] = 1; #ifdef _GLOBALSPEC_ // note the slot ptr for reference //logf(LOG_DEBUG,"net: mcast2 slotPtr=%08lx",(long)&m_slots[i]); #endif // set our last launch date m_lastLaunch = nowms ; // gettimeofdayInMilliseconds(); // save the host, too m_lastLaunchHost = h; /* // assume this host has more disk load now if ( m_doDiskLoadBalancing && ! m_msg34.isInBestHostCache() ) { // . add the disk load right after sending it in case we have // many successive sends right after this one // . in the case of titledb use a low avg read size, otherwise // this would be like over 1 Meg (the max titleRec size) long avg = m_minRecSizes; if ( m_rdbId == RDB_TITLEDB ) avg = 32*1024; // titledb read? m_msg34.addLoad ( avg , h->m_hostId , nowms ); } */ // count it as launched m_numLaunched++; // timing debug //log("Multicast sent to hostId %li, this=%li, transId=%li", // h->m_hostId, (long)this , m_slots[i]->m_transId ); // . let's sleep so we have a chance to launch to another host in // the same group in case this guy takes too long // . don't re-register if we already did if ( m_registeredSleep ) return true; // . otherwise register for sleep callback to try again // . sleepWrapper1() will call sendToHostLoop() for us g_loop.registerSleepCallback (50/*ms*/,this,sleepWrapper1,m_niceness); m_registeredSleep = true; #ifdef _GLOBALSPEC_ // debug msg //logf(LOG_DEBUG,"net: mcast registered1 this=%08lx",(long)this); #endif // successful launch return true; } // this is called every 50 ms so we have the chance to launch our request // to a more responsive host void sleepWrapper1 ( int bogusfd , void *state ) { Multicast *THIS = (Multicast *) state; // . if our last launch was less than X seconds ago, wait another tick // . we often send out 2+ requests and end up getting one reply before // the others and that results in us getting unwanted dgrams... // . increasing this delay here results in fewer wasted requests but // if a host goes down you don't want a user to wait too long // . after a host goes down it's ping takes a few secs to decrease // . if a host is shutdown properly it will broadcast a msg to // all hosts using Hostdb::broadcast() informing them that it's // going down so they know to stop sending to it and mark him as // dead long long now = gettimeofdayInMilliseconds(); // watch out for someone advancing the system clock if ( THIS->m_lastLaunch > now ) THIS->m_lastLaunch = now; // get elapsed time since we started the send long elapsed = now - THIS->m_lastLaunch; long docsWanted; long firstResultNum; long nqterms; long rerankRuleset; long wait; char exact; //long hid = -1; Host *hd; //log("elapsed = %li type=0x%hhx",elapsed,THIS->m_msgType); // . don't relaunch any niceness 1 stuff for a while // . it often gets suspended due to query traffic //if ( THIS->m_niceness > 0 && elapsed < 800000 ) return; if ( THIS->m_niceness > 0 ) return; // TODO: if the host went dead on us, re-route // . Msg36 is used to get the length of an IndexList (termFreq) // and is very fast, all in memory, don't wait more than 50ms // . if we do re-route this is sucks cuz we'll get slightly different // termFreqs which impact the total results count as well as summary // generation since it's based on termFreq, not too mention the // biggest impact being ordering of search results since the // score weight is based on termFreq as well // . but unfortunately, this scheme doesn't increase the ping time // of dead hosts that much!! // . 
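	// the logic below picks a per-msgType patience threshold (in ms);
	// only after that much time has elapsed since the last launch do we
	// consider rerouting the request to another host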
	//   NOTE: 2/26/04: i put most everything to 8000 ms since rerouting
	//   too much on an already saturated network of drives just
	//   exacerbates the problem. this stuff was originally put here
	//   to reroute for when a host went down... let's keep it that way
	//long ta , nb;
	if ( THIS->m_redirectTimeout != -1 ) {
		if ( elapsed < THIS->m_redirectTimeout ) return;
		goto redirectTimedout;
	}
	switch ( THIS->m_msgType ) {
	// term freqs are msg 0x36 and don't hit disk, so reroute all the time
	case 0x36:
		exact = 0;
		// first byte is 1 if caller wants an *exact* termlist size
		// lookup which requires hitting disk and is *much* slower
		if ( THIS->m_msg ) exact = *(char *)(THIS->m_msg);
		// these don't take any resources... unless exact i guess,
		// so let them fly... 10ms or more to reroute
		if ( ! exact && elapsed < 10 ) return;
		//if ( exact && elapsed < 20000 ) return;
		// never re-route these, they may be incrementing/decrementing
		// a count, and we only store that count on one host!!
		return;
		break;
	// msg to get a summary from a query (calls msg22)
	// buzz takes extra long! it calls Msg25 sometimes.
	// no more buzz.. put back to 8 seconds.
	// put to 5 seconds now since some hosts freeze up still it seems
	// and i haven't seen a summary generation of 5 seconds
	case 0x20: if ( elapsed <   5000 ) return; break;
	// msg 0x20 calls this to get the title rec
	case 0x22: if ( elapsed <   1000 ) return; break;
	// Msg23 niceness 0 is only for doing &rerank=X queries
	//case 0x23: if ( elapsed < 100000 ) return; break;
	// a request to get the score of a docid, can be *very* intensive
	case 0x3b: if ( elapsed < 500000 ) return; break;
	// related topics request, calls many Msg22 to get titlerecs...
	case 0x24: if ( elapsed <   2000 ) return; break;
	// . msg to get an index list over the net
	// . this limit should really be based on length of the index list
	// . this was 15 then 12 now it is 4
	case 0x00:
		// this should just be for when a host goes down, not for
		// performance reasons, cuz we do pretty good load balancing
		// and when things get saturated, rerouting exacerbates it
		if ( elapsed < 8000 ) return;
		break;
		// how many bytes were requested?
		/*
		if ( THIS->m_msg ) nb=*(long *)(THIS->m_msg + sizeof(key_t)*2);
		else               nb=2000000;
		// . give them 300ms + 1ms per 5000 bytes
		// . a 6M read would be allowed 1500ms before re-routing
		// . a 1M read would be allowed 500ms
		// . a 100k read would be allowed 320ms
		ta = 300 + nb / 5000;
		// limit it
		if ( ta < 100  ) ta = 100;
		if ( ta > 9000 ) ta = 9000;
		// could this hurt us?
		if ( elapsed < ta ) return;
		break;
		*/
	// msg to get a clusterdb rec
	case 0x38: if ( elapsed < 2000 ) return; break;
	// msg to get docIds from a query, may take a while
	case 0x39:
		// how many docids requested? first 4 bytes of request.
		docsWanted     = 10;
		firstResultNum = 0;
		nqterms        = 0;
		rerankRuleset  = -1;
		if ( THIS->m_msg ) docsWanted     = *(long *)(THIS->m_msg);
		if ( THIS->m_msg ) firstResultNum = *(long *)(THIS->m_msg+4);
		if ( THIS->m_msg ) nqterms        = *(long *)(THIS->m_msg+8);
		// never re-route if it has a rerank, those take forever
		if ( THIS->m_msg ) rerankRuleset  = *(long *)(THIS->m_msg+12);
		if ( rerankRuleset >= 0 ) return;
		// . how many milliseconds of waiting before we re-route?
		// . 100 ms per doc wanted, but if they all end up
		//   clustering then docsWanted is no indication of the
		//   actual number of titleRecs (or title keys) read
		// . it may take a while to do dup removal on 1 million docs
		wait = 5000 + 100 * (docsWanted+firstResultNum);
		// those big UOR queries should not get re-routed all the time
		if ( nqterms > 0 ) wait += 1000 * nqterms;
		if ( wait < 8000 ) wait = 8000;
		// seems like buzz is hammering the cluster and 0x39's are
		// timing out too much because of huge title recs taking
		// forever with Msg20
		//if ( wait < 120000 ) wait = 120000;
		if ( elapsed < wait ) return;
		break;
	// these tagdb lookups are usually lickety split, should all be in mem
	case 0x08: if ( elapsed < 10 ) return; break;
	// this no longer exists! it uses msg0
	//case 0x8a: if ( elapsed < 200 ) return; break;
	case 0x8b: if ( elapsed < 10 ) return; break;
	// don't relaunch anything else unless over 8 secs
	default:   if ( elapsed < 8000 ) return; break;
	}
	// find out which host timed out
	//hid = -1;
	hd = NULL;
	//if ( THIS->m_retired[0] && THIS->m_hosts && THIS->m_numHosts >= 1 )
	if ( THIS->m_retired[0] && THIS->m_numHosts >= 1 )
		hd = THIS->m_hostPtrs[0];
	//if ( THIS->m_retired[1] && THIS->m_hosts && THIS->m_numHosts >= 2 )
	if ( THIS->m_retired[1] && THIS->m_numHosts >= 2 )
		hd = THIS->m_hostPtrs[1];
	// 11/21/06: now we only reroute if the host we sent to is marked as
	// dead unless it is a msg type that takes little reply generation time
	if ( hd &&
	     // hid >= 0 &&
	     //! g_hostdb.isDead(hid) &&
	     ! g_hostdb.isDead(hd) &&
	     //m_msgType != 0x36 && (see above)
	     //m_msgType != 0x17 &&
	     // hosts freeze up sometimes and we don't get a summary in time...
	     // no! we got EDISKSTUCK now and this was causing a problem
	     // dumping core in the parse cache logic
	     //THIS->m_msgType != 0x20 &&
	     THIS->m_msgType != 0x08 &&
	     //THIS->m_msgType != 0x8a &&
	     THIS->m_msgType != 0x8b )
		return;
 redirectTimedout:
	// cancel any outstanding transactions iff we have a m_replyBuf
	// that we must read the reply into because we cannot share!!
	if ( THIS->m_readBuf )
		THIS->destroySlotsInProgress ( NULL );
	//if ( THIS->m_replyBuf )
	//	THIS->destroySlotsInProgress ( NULL );
	// . do a loop over all hosts in the group
	// . if a whole group of twins is down this will loop forever here
	//   every Xms, based on the sleepWrapper timer for the msgType
	if ( g_conf.m_logDebugQuery ) {
		for (long i = 0 ; i < THIS->m_numHosts ; i++ ) {
			if ( ! THIS->m_slots[i] ) continue;
			// transaction is not in progress if m_errnos[i] is set
			char *ee = "";
			if ( THIS->m_errnos[i] )
				ee = mstrerror(THIS->m_errnos[i]);
			log("net: Multicast::sleepWrapper1: tried host "
			    "%s:%li %s"
			    ,iptoa(THIS->m_slots[i]->m_ip),
			    (long)THIS->m_slots[i]->m_port ,
			    ee );
		}
	}
	// log msg that we are trying to re-route
	//log("Multicast::sleepWrapper1: trying to re-route msgType=0x%hhx "
	//    "to new host", THIS->m_msgType );
	// if we were trying to contact a host in the secondary cluster,
	// mark the host as dead. this is our passive monitoring system.
	if ( THIS->m_hostdb == &g_hostdb2 ) {
		log("net: Marking hostid %li in secondary cluster as dead.",
		    (long)THIS->m_lastLaunchHost->m_hostId);
		THIS->m_lastLaunchHost->m_ping = g_conf.m_deadHostTimeout;
	}
	// . otherwise, launch another request if we can
	// . returns true if we successfully sent to another host
	// . returns false and sets g_errno if no hosts left or other error
	if ( THIS->sendToHostLoop(0,-1,-1) ) {
		// msgtype 0x36 is always rerouted because the timeout is so
		// low because it is an easy request to satisfy...
so don't // flood the logs with it long logtype = LOG_WARN; if ( THIS->m_msgType == 0x36 ) logtype = LOG_DEBUG; // same goes for msg8 if ( THIS->m_msgType == 0x08 ) logtype = LOG_DEBUG; //if ( THIS->m_msgType == 0x8a ) logtype = LOG_DEBUG; if ( THIS->m_msgType == 0x8b ) logtype = LOG_DEBUG; // log msg that we were successful long hid = -1; if ( hd ) hid = hd->m_hostId; log(logtype, "net: Multicast::sleepWrapper1: rerouted msgType=0x%hhx " "from host #%li " "to new host after waiting %li ms", THIS->m_msgType, hid,elapsed); // . mark it in the stats for PageStats.cpp // . this is timeout based rerouting g_stats.m_reroutes[(int)THIS->m_msgType][THIS->m_niceness]++; return; } // if we registered the sleep callback we must have launched a // request to a host so let gotReplyWrapperM1() deal with closeUpShop() // . let replyWrapper1 be called if we got one launched // . it should then call closeUpShop() //if ( THIS->m_numLaunched ) return; // otherwise, no outstanding requests and we failed to send to another // host, probably because : // 1. Msg34 timed out on all hosts // 2. there were no udp slots available (which is bad) //log("Multicast:: re-route failed for msgType=%hhx. abandoning.", // THIS->m_msgType ); // . the next send failed to send to a host, so close up shop // . this is probably because the Msg34s timed out and we could not // find a next "best host" to send to because of that //THIS->closeUpShop ( NULL ); // . we were not able to send to another host, maybe it was dead or // there are no hosts left! // . i guess keep sleeping until host comes back up or transaction // is cancelled //log("Multicast::sleepWrapper1: re-route of msgType=0x%hhx failed", // THIS->m_msgType); } // C wrapper for the C++ callback void gotReplyWrapperM1 ( void *state , UdpSlot *slot ) { Multicast *THIS = (Multicast *)state; // debug msg //log("gotReplyWrapperM1 for msg34=%li",(long)(&THIS->m_msg34)); THIS->gotReply1 ( slot ); } // come here if we've got a reply from a host that's not part of a group send void Multicast::gotReply1 ( UdpSlot *slot ) { // debug msg //log("gotReply1: this=%li should exit",(long)&m_msg34); // count it as returned m_numLaunched--; // don't ever let UdpServer free this send buf (it is m_msg) slot->m_sendBufAlloc = NULL; // remove the slot from m_slots so it doesn't get nuked in // gotSlot(slot) routine above long i = 0; // careful! we might have recycled a slot!!! start with top and go down // because UdpServer might give us the same slot ptr on our 3rd try // that we had on our first try! for ( i = 0 ; i < m_numHosts ; i++ ) { // skip if not in progress if ( ! 
m_inProgress[i] ) continue; // slot must match if ( m_slots[i] == slot ) break; } // if it matched no slot that's wierd if ( i >= m_numHosts ) { log(LOG_LOGIC,"net: multicast: Not our slot 2."); char *xx = NULL; *xx = 0; return; } // set m_errnos[i], if any if ( g_errno ) m_errnos[i] = g_errno; // mark it as no longer in progress m_inProgress[i] = 0; // if he was marked as dead on the secondary cluster, mark him as up Host *h = m_hostPtrs[i]; if ( m_hostdb == &g_hostdb2 && h && m_hostdb->isDead(h) ) { log("net: Marking hostid %li in secondary cluster as alive.", (long)h->m_hostId); h->m_ping = 0; } // save the host we got a reply from m_replyingHost = h; m_replyLaunchTime = m_launchTime[i]; if ( m_sentToTwin ) log("net: Twin msgType=0x%lx (this=0x%lx) reply: %s.", (long)m_msgType,(long)this,mstrerror(g_errno)); // on error try sending the request to another host // return if we kicked another request off ok if ( g_errno ) { Host *h; char logIt = true; // do not log not found on an external network if ( g_errno == ENOTFOUND && m_hostdb != &g_hostdb ) goto skip; // log the error h = m_hostdb->getHost ( slot->m_ip ,slot->m_port ); // do not log if not expected msg20 if ( slot->m_msgType == 0x20 && g_errno == ENOTFOUND && ! ((Msg20 *)m_state)->m_expected ) logIt = false; if ( slot->m_msgType == 0x20 && g_errno == EMISSINGQUERYTERMS ) logIt = false; if ( h && logIt ) log("net: Multicast got error in reply from hostId %li" " (msgType=0x%hhx transId=%li nice=%li net=%s): " "%s.", h->m_hostId, slot->m_msgType, slot->m_transId, m_niceness, m_hostdb->getNetName(),mstrerror(g_errno )); else if ( logIt ) log("net: Multicast got error in reply from %s:%li " "(msgType=0x%hhx transId=%li nice =%li net=%s): " "%s.", iptoa(slot->m_ip), (long)slot->m_port, slot->m_msgType, slot->m_transId, m_niceness, m_hostdb->getNetName(),mstrerror(g_errno) ); skip: // if this slot had an error we may have to tell UdpServer // not to free the read buf if ( m_replyBuf == slot->m_readBuf ) slot->m_readBuf = NULL; // . try to send to another host // . on successful sending return, we'll be called on reply // . this also returns false if no new hosts left to send to // . only try another host if g_errno is NOT ENOTFOUND cuz // we have quite a few missing clustRecs and titleRecs // and doing a second lookup will decrease query response // . if the Msg22 lookup cannot find the titleRec for indexing // purposes, it should check any twin hosts because this // is very important... if this is for query time, however, // then accept the ENOTFOUND without spawning another request // . but if the record is really not there we waste seeks! // . EBADENGINEER is now used by titledb's Msg22 when a docid // is in tfndb but not in titledb (or id2 is invalid) // . it is more important that we serve the title rec than // the performance gain. if you want the performance gain // then you should repair your index to avoid this. therefore // send to twin on ENOTFOUND // . often, though, we are restring to indexdb root so after // doing a lot of deletes there will be a lot of not founds // that are really not found (not corruption) so don't do it // anymore // . let's go for accuracy even for queries // . until i fix the bug of losing titlerecs for some reason // probably during merges now, we reroute on ENOTFOUND. 
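		// decide whether this error is worth retrying on a twin host;
		// the cases below suppress the retry (permanent errors,
		// expected not-founds, or no time left in the multicast)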
bool sendToTwin = true; if ( g_errno == EBADENGINEER ) sendToTwin = false; if ( g_errno == EMISSINGQUERYTERMS ) sendToTwin = false; if ( g_errno == EMSGTOOBIG ) sendToTwin = false; // "Argument list too long" if ( g_errno == 7 ) sendToTwin = false; // i guess msg50 calls msg25 with no ip sometimes? if ( g_errno == EURLHASNOIP ) sendToTwin = false; if ( g_errno == EUNCOMPRESSERROR ) sendToTwin = false; // ok, let's give up on ENOTFOUND, because the vast majority // of time it seems it is really not on the twin either... if ( g_errno == ENOTFOUND && m_msgType == 0x020 ) sendToTwin = false; if ( g_errno == ENOTFOUND && m_msgType == 0x022 ) sendToTwin = false; // do not worry if it was a not found msg20 for a titleRec // which was not expected to be there if ( ! logIt ) sendToTwin = false; // or a notfound on the external/secondary cluster if ( g_errno == ENOTFOUND && m_hostdb == &g_hostdb2 ) sendToTwin = false; // no longer do this for titledb, too common since msg4 // cached stuff can make us slightly out of sync //if ( g_errno == ENOTFOUND ) // sendToTwin = false; // or a problem with a tfndb lookup, those are different for // each twin right now if ( m_msgType == 0x00 && m_rdbId == RDB_TFNDB ) sendToTwin = false; // do not send to twin if we are out of time time_t now = getTime(); long timeRemaining = m_startTime + m_totalTimeout - now; if ( timeRemaining <= 0 ) sendToTwin = false; // send to the twin if ( sendToTwin && sendToHostLoop(0,-1,-1) ) { log("net: Trying to send request msgType=0x%lx " "to a twin. (this=0x%lx)", (long)m_msgType,(long)this); m_sentToTwin = true; // . keep stats // . this is error based rerouting // . this can be timeouts as well, if the // receiver sent a request itself and that // timed out... g_stats.m_reroutes[(int)m_msgType][m_niceness]++; return; } // . otherwise we've failed on all hosts // . re-instate g_errno,might have been set by sendToHostLoop() g_errno = m_errnos[i]; // unregister our sleep wrapper if we did //if ( m_registeredSleep ) { // g_loop.unregisterSleepCallback ( this, sleepWrapper1 ); // m_registeredSleep = false; //} // destroy all slots that may be in progress (except "slot") //destroySlotsInProgress ( slot ); // call callback with g_errno set //if ( m_callback ) m_callback ( m_state ); // we're done, all slots should be destroyed by UdpServer //return; } closeUpShop ( slot ); } void Multicast::closeUpShop ( UdpSlot *slot ) { // sanity check if ( ! m_inUse ) { char *xx=NULL;*xx=0; } // debug msg //log("Multicast exiting (this=%li)",(long)&m_msg34); // destroy the OTHER slots we've spawned that are in progress destroySlotsInProgress ( slot ); // if we have no slot per se, skip this stuff if ( ! slot ) goto skip; // . now we have a good reply... but not if g_errno is set // . save the reply of this slot here // . this is bad if we got an g_errno above, it will set the slot's // readBuf to NULL up there, and that will make m_readBuf NULL here // causing a mem leak. i fixed by adding an mfree on m_replyBuf // in Multicast::reset() routine. // . i fixed again by ensuring we do not set m_ownReadBuf to false // in getBestReply() below if m_readBuf is NULL m_readBuf = slot->m_readBuf; m_readBufSize = slot->m_readBufSize; m_readBufMaxSize = slot->m_readBufMaxSize; // . if the slot had an error, propagate it so it will be set when // we call the callback. if(!g_errno) g_errno = slot->m_errno; // . sometimes UdpServer will read the reply into a temporary buffer // . 
	//   this happens if the udp server is hot (async signal based) and
	//   m_replyBuf is NULL because he cannot malloc a buf to read into
	//   because malloc is not async signal safe
	if ( slot->m_tmpBuf == slot->m_readBuf ) m_freeReadBuf = false;
	// don't let UdpServer free the readBuf now that we point to it
	slot->m_readBuf = NULL;
	// save slot so msg4 knows what slot replied in udpserver
	// for doing its flush callback logic
	m_slot = slot;
 skip:
	// unregister our sleep wrapper if we did
	if ( m_registeredSleep ) {
		g_loop.unregisterSleepCallback ( this , sleepWrapper1 );
		m_registeredSleep = false;
#ifdef _GLOBALSPEC_
		// debug msg
		//logf(LOG_DEBUG,"net: mcast unregistered1 this= %08lx",
		//     (long)this);
#endif
	}
	// unregister our sleep wrapper if we did
	if ( m_registeredSleep2 ) {
		g_loop.unregisterSleepCallback ( this , sleepWrapper1b );
		m_registeredSleep2 = false;
	}
	// ok, now if m_retryForever is true, we must retry. this helps us
	// a lot if one host is under such severe load that any disk read
	// returns ETRYAGAIN because there is no room in the thread queue.
	// do not retry on persistent error, only on temporary ones.
	if ( m_retryForever && g_errno &&
	     g_errno != ENOTFOUND &&
	     g_errno != EMISSINGQUERYTERMS &&
	     g_errno != EBADENGINEER ) {
		log("net: Multicast retrying request send in 2 seconds.");
		m_retryCount++;
		// bail if already registered
		if ( m_registeredSleep2 ) return;
		// try the whole shebang again every 2 seconds
		if ( ! g_loop.registerSleepCallback (2000,this,sleepWrapper1b,
						     m_niceness))
			log("net: Failed to register sleep callback to "
			    "resend multicast request. Giving up. Retry "
			    "failed.");
		else
			m_registeredSleep2 = true;
		return;
	}
	if ( ! g_errno && m_retryCount > 0 )
		log("net: Multicast succeeded after %li retries.",
		    m_retryCount);
	// allow us to be re-used now, callback might relaunch
	m_inUse = false;
	// now call the user callback if it exists
	if ( m_callback ) {
		// unsigned long long profilerStart;
		//unsigned long long profilerEnd;
		//unsigned long long statStart,statEnd;
		//if (g_conf.m_profilingEnabled){
		//	//profilerStart=gettimeofdayInMillisecondsLocal();
		//	address=(long)m_callback;
		//	g_profiler.startTimer(address, __PRETTY_FUNCTION__);
		//}
		//g_loop.startBlockedCpuTimer();
		m_callback ( m_state , m_state2 );
		//if (g_conf.m_profilingEnabled) {
		//	if(!g_profiler.endTimer(address,__PRETTY_FUNCTION__))
		//		log(LOG_WARN,"admin: Couldn't add the fn %li",
		//		    (long)address);
		//}
	}
}

void sleepWrapper1b ( int bogusfd , void *state ) {
	Multicast *THIS = (Multicast *)state;
	// clear m_retired, m_errnos, m_slots
	memset ( THIS->m_retired   , 0 , sizeof(char    ) *MAX_HOSTS_PER_GROUP);
	memset ( THIS->m_errnos    , 0 , sizeof(long    ) *MAX_HOSTS_PER_GROUP);
	memset ( THIS->m_slots     , 0 , sizeof(UdpSlot *)*MAX_HOSTS_PER_GROUP);
	memset ( THIS->m_inProgress, 0 , sizeof(char    ) *MAX_HOSTS_PER_GROUP);
	// retry the whole shebang
	if ( THIS->sendToHostLoop ( THIS->m_key , -1 , -1 ) ) {
		// if call succeeded, unregister our sleep callback
		g_loop.unregisterSleepCallback ( THIS , sleepWrapper1b );
		return;
	}
	// otherwise, retry forever
	log("net: Failed to launch multicast request. THIS=%lu. Waiting "
	    "and retrying.",(long)THIS);
}

// destroy all slots that may be in progress (except "slot")
void Multicast::destroySlotsInProgress ( UdpSlot *slot ) {
	// . always destroy any msg34 slots in progress
	// . if we re-route then we spawn new msg34 requests, and if we get
	//   back an original reply we need to take out those msg34 requests
	//   because if they get a reply they may try to access a Multicast
	//   class that no longer exists
	//if ( m_doDiskLoadBalancing ) m_msg34.destroySlotsInProgress ( );
	// do a loop over all hosts in the group
	for (long i = 0 ; i < m_numHosts ; i++ ) {
		// . destroy all slots but this one that are in progress
		// . we'll be destroyed when we return from the cback
		if ( ! m_slots[i] ) continue;
		// transaction is not in progress if m_errnos[i] is set
		if ( m_errnos[i] ) continue;
		// don't destroy us, it'll happen when we return
		if ( m_slots[i] == slot ) continue;
		// must be in progress
		if ( ! m_inProgress[i] ) continue;
		// sometimes the slot is recycled from under us because
		// we already got a reply from it
		//if ( m_slots[i]->m_state != this ) continue;
		// don't free his sendBuf, readBuf is ok to free, however
		m_slots[i]->m_sendBufAlloc = NULL;
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us = &g_udpServer;
		if ( m_realtime ) us = &g_udpServer2;
		// . stamp him so he doesn't have a better ping than host #i
		// . timedOut=true --> only stamp him if it makes his ping worse
		//long hostId = m_slots[i]->m_hostId;
		//long long lastSendTime = m_slots[i]->m_lastSendTime;
		//long long now = gettimeofdayInMilliseconds() ;
		//long long tripTime = now - lastSendTime;
		// . we no longer stamp hosts here, leave that up to
		//   Hostdb::pingHost()
		// tripTime is always in milliseconds
		//m_hostdb->stampHost ( hostId , tripTime , true/*timedOut?*/);
		//#ifdef _DEBUG_
		//fprintf(stderr,"stamping host #%li w/ tripTime=%llims\n",
		//        hostId, tripTime);
		//#endif
		// if caller provided the buffer, don't free it cuz "slot"
		// contains it (or m_readBuf)
		if ( m_replyBuf == m_slots[i]->m_readBuf )
			m_slots[i]->m_readBuf = NULL;
		// destroy this slot that's in progress
		us->destroySlot ( m_slots[i] );
		// do not re-destroy. consider no longer in progress.
		m_inProgress[i] = 0;
	}
}

// we set *freeReply to true if you'll need to free it
char *Multicast::getBestReply ( long *replySize    ,
				long *replyMaxSize ,
				bool *freeReply    ,
				bool  steal        ) {
	*replySize    = m_readBufSize;
	*replyMaxSize = m_readBufMaxSize;
	if ( steal ) m_freeReadBuf = false;
	*freeReply = m_freeReadBuf;
	// this can be NULL if we destroyed the slot in progress only to
	// try another host who was dead!
	if ( m_readBuf ) m_ownReadBuf = false;
	return m_readBuf;
}
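
// (Illustrative only: a hypothetical caller's reply callback would typically
//  retrieve the winning reply like
//     long  replySize , replyMaxSize ;
//     bool  freeit ;
//     char *reply = m_mcast.getBestReply ( &replySize , &replyMaxSize ,
//                                          &freeit , false /*steal*/ );
//     ... use reply ...
//     if ( freeit ) mfree ( reply , replyMaxSize , "caller" );
//  the variable names above are made up; only getBestReply() and mfree()
//  are from this codebase.)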