mononoke/blobstore_healer: handle missing regions gracefully

Summary:
There are a few things broken with common/rust/sql and the blobstore healer's handling of replication lag right now:

- If Seconds_Behind_Master isn't an int (it'll be NULL if replication is paused), it just panics.
- If it's talking to a server that it expected to be a replica but is a master, it returns None for the replication lag, but 0 would be more appropriate.
- If a region no longer has a replica, it errors out.

This diff fixes that:

- If replication is paused, we return None for lag.
- If we're talking to a master, we return 0.
- If a region has no replica, we ignore it.

Reviewed By: StanislavGlebik

Differential Revision: D17787580

fbshipit-source-id: 9e5e7682456870b88910afec12e1c409fd8c5ba6
This commit is contained in:
Thomas Orozco 2019-10-07 07:32:49 -07:00 committed by Facebook Github Bot
parent 91fd7e2035
commit 6c29aad4a0

View File

@ -27,6 +27,7 @@ use futures_ext::{spawn_future, BoxFuture, FutureExt};
use healer::Healer;
use manifoldblob::ThriftManifoldBlob;
use metaconfig_types::{BlobConfig, MetadataDBConfig, StorageConfig};
use mysql_async::error::Error as MysqlAsyncError;
use prefixblob::PrefixBlobstore;
use slog::{error, info, o, Logger};
use sql::{myrouter, Connection};
@ -192,14 +193,30 @@ fn ensure_small_db_replication_lag(
.iter()
.map(|(region, conn)| {
cloned!(region);
conn.show_replica_lag_secs().and_then(|maybe_secs| {
maybe_secs
.ok_or(format_err!(
"Could not fetch db replication lag for {}. Failing to avoid overloading db",
region
))
.map(|lag_secs| (region, lag_secs))
})
conn.show_replica_lag_secs()
.or_else(|err| match err.downcast_ref::<MysqlAsyncError>() {
Some(MysqlAsyncError::Server(inner)) => {
// 1918 is discovery failed (i.e. there is no server matching the
// constraints). This is fine, that means we don't need to monitor it.
if inner.code == 1918 {
Ok(Some(0))
} else {
Err(err)
}
},
_ => Err(err),
})
.and_then(|maybe_secs| {
let err = format_err!(
"Could not fetch db replication lag for {}. Failing to avoid overloading db",
region
);
maybe_secs
.ok_or(err)
.map(|lag_secs| (region, lag_secs))
})
})
.collect();