// Copyright (c) 2004-present, Facebook, Inc.
// All Rights Reserved.
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.
use blobstore::Blobstore;
|
|
|
|
use blobstore_sync_queue::{BlobstoreSyncQueue, BlobstoreSyncQueueEntry};
|
|
|
|
use chrono::Duration as ChronoDuration;
|
2019-03-20 15:11:53 +03:00
|
|
|
use cloned::cloned;
|
2019-01-15 21:08:00 +03:00
|
|
|
use context::CoreContext;
|
2019-08-05 13:46:26 +03:00
|
|
|
use failure_ext::{err_msg, prelude::*};
|
2019-01-15 21:08:00 +03:00
|
|
|
use futures::{
|
2019-03-25 23:38:07 +03:00
|
|
|
self,
|
2019-01-15 21:08:00 +03:00
|
|
|
future::{join_all, loop_fn, Loop},
|
|
|
|
prelude::*,
|
|
|
|
};
|
|
|
|
use futures_ext::FutureExt;
|
2019-08-05 13:46:26 +03:00
|
|
|
use itertools::{Either, Itertools};
|
2019-03-20 15:11:53 +03:00
|
|
|
use lazy_static::lazy_static;
|
2019-01-31 11:31:11 +03:00
|
|
|
use metaconfig_types::BlobstoreId;
|
2019-08-05 13:46:26 +03:00
|
|
|
use mononoke_types::{BlobstoreBytes, DateTime};
|
2019-08-05 13:46:26 +03:00
|
|
|
use slog::{info, warn, Logger};
|
2019-01-15 21:08:00 +03:00
|
|
|
use std::collections::{HashMap, HashSet};
|
|
|
|
use std::sync::Arc;
|
|
|
|
|
|
|
|
lazy_static! {
    /// Minimal age of a queue entry before it is considered for healing.
    /// `heal_blob` returns `None` (skips the blob) unless at least one of its
    /// entries is older than this.
    static ref ENTRY_HEALING_MIN_AGE: ChronoDuration = ChronoDuration::minutes(2);
}
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
/// Heals blobs across a set of blobstores: entries on the sync queue record
/// which stores already hold a blob, and the healer copies the blob onto the
/// stores that are missing it.
pub struct Healer {
    logger: Logger,
    /// Maximum number of queue entries fetched per `heal` batch.
    blobstore_sync_queue_limit: usize,
    /// Queue of (key, blobstore) write records driving the healing.
    sync_queue: Arc<dyn BlobstoreSyncQueue>,
    /// All configured blobstores, keyed by id; every blob should eventually
    /// exist in each of them.
    blobstores: Arc<HashMap<BlobstoreId, Arc<dyn Blobstore>>>,
    /// Optional key filter passed through to the queue iterator
    /// (presumably a SQL LIKE pattern — TODO(review): confirm against the
    /// BlobstoreSyncQueue implementation).
    blobstore_key_like: Option<String>,
}
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
impl Healer {
    /// Construct a `Healer`; see the field documentation on the struct for
    /// the meaning of each argument.
    pub fn new(
        logger: Logger,
        blobstore_sync_queue_limit: usize,
        sync_queue: Arc<dyn BlobstoreSyncQueue>,
        blobstores: Arc<HashMap<BlobstoreId, Arc<dyn Blobstore>>>,
        blobstore_key_like: Option<String>,
    ) -> Self {
        Self {
            logger,
            blobstore_sync_queue_limit,
            sync_queue,
            blobstores,
            blobstore_key_like,
        }
    }

    /// Heal one batch of entries. It selects a set of entries which are not too young (bounded
    /// by ENTRY_HEALING_MIN_AGE) up to `blobstore_sync_queue_limit` at once.
    pub fn heal(&self, ctx: CoreContext) -> impl Future<Item = (), Error = Error> {
        cloned!(
            self.logger,
            self.blobstore_sync_queue_limit,
            self.sync_queue,
            self.blobstores,
        );

        // Entries newer than this deadline are not yet eligible for healing.
        let now = DateTime::now().into_chrono();
        let healing_deadline = DateTime::new(now - *ENTRY_HEALING_MIN_AGE);

        sync_queue
            .iter(
                ctx.clone(),
                self.blobstore_key_like.clone(),
                healing_deadline.clone(),
                blobstore_sync_queue_limit,
            )
            .and_then(move |queue_entries: Vec<BlobstoreSyncQueueEntry>| {
                // Group entries by blob key; each group is healed by a single
                // heal_blob call. heal_blob returns None for blobs that are too
                // young, and filter_map drops those here.
                //
                // NOTE(review): itertools' group_by only groups *adjacent*
                // entries — this assumes the queue iterator returns entries
                // ordered so that equal keys are adjacent; confirm.
                let healing_futures: Vec<_> = queue_entries
                    .into_iter()
                    .group_by(|entry| entry.blobstore_key.clone())
                    .into_iter()
                    .filter_map(|(key, entries)| {
                        cloned!(ctx, sync_queue, blobstores, healing_deadline);
                        heal_blob(
                            ctx,
                            sync_queue,
                            blobstores,
                            healing_deadline,
                            key,
                            entries.collect(),
                        )
                    })
                    .collect();

                info!(
                    logger,
                    "Found {} blobs to be healed... Doing it",
                    healing_futures.len()
                );

                // Run all heals concurrently; each resolves to the queue entries
                // it dealt with, which are then deleted from the queue in one go.
                futures::stream::futures_unordered(healing_futures)
                    .collect()
                    .and_then(move |cleaned_entries: Vec<Vec<BlobstoreSyncQueueEntry>>| {
                        let cleaned = cleaned_entries.into_iter().flatten().collect();
                        cleanup_after_healing(ctx, sync_queue, cleaned)
                    })
            })
    }
}
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
/// Heal an individual blob. The `entries` are the blobstores which have successfully stored
|
|
|
|
/// this blob; we need to replicate them onto the remaining `blobstores`. If the blob is not
|
|
|
|
/// yet eligable (too young), then just return None, otherwise we return the healed entries
|
|
|
|
/// which have now been dealt with.
|
2019-01-15 21:08:00 +03:00
|
|
|
fn heal_blob(
|
|
|
|
ctx: CoreContext,
|
2019-03-26 13:45:32 +03:00
|
|
|
sync_queue: Arc<dyn BlobstoreSyncQueue>,
|
|
|
|
blobstores: Arc<HashMap<BlobstoreId, Arc<dyn Blobstore>>>,
|
2019-01-15 21:08:00 +03:00
|
|
|
healing_deadline: DateTime,
|
|
|
|
key: String,
|
|
|
|
entries: Vec<BlobstoreSyncQueueEntry>,
|
2019-03-25 23:38:07 +03:00
|
|
|
) -> Option<impl Future<Item = Vec<BlobstoreSyncQueueEntry>, Error = Error>> {
|
2019-01-15 21:08:00 +03:00
|
|
|
let seen_blobstores: HashSet<_> = entries
|
|
|
|
.iter()
|
|
|
|
.filter_map(|entry| {
|
2019-08-05 13:46:26 +03:00
|
|
|
let id = entry.blobstore_id;
|
2019-01-15 21:08:00 +03:00
|
|
|
if blobstores.contains_key(&id) {
|
|
|
|
Some(id)
|
|
|
|
} else {
|
|
|
|
None
|
|
|
|
}
|
|
|
|
})
|
|
|
|
.collect();
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
let mut missing_blobstores: HashSet<BlobstoreId> = blobstores
|
2019-01-15 21:08:00 +03:00
|
|
|
.iter()
|
|
|
|
.filter_map(|(key, _)| {
|
|
|
|
if seen_blobstores.contains(key) {
|
|
|
|
None
|
|
|
|
} else {
|
|
|
|
Some(key.clone())
|
|
|
|
}
|
|
|
|
})
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
if missing_blobstores.is_empty() {
|
|
|
|
// All blobstores have been synchronized
|
2019-03-25 23:38:07 +03:00
|
|
|
return Some(futures::future::ok(entries).left_future());
|
2019-01-15 21:08:00 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if !entries
|
|
|
|
.iter()
|
|
|
|
.any(|entry| entry.timestamp < healing_deadline)
|
|
|
|
{
|
|
|
|
// The oldes entry is not old enough to be eligible for healing
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
let heal_future = fetch_blob(
|
|
|
|
ctx.clone(),
|
|
|
|
blobstores.clone(),
|
|
|
|
key.clone(),
|
2019-08-05 13:46:26 +03:00
|
|
|
seen_blobstores.clone(),
|
2019-01-15 21:08:00 +03:00
|
|
|
)
|
2019-08-05 13:46:26 +03:00
|
|
|
.and_then(
|
|
|
|
move |(blob, known_good_blobstore, missing_source_blobstores)| {
|
|
|
|
if !missing_source_blobstores.is_empty() {
|
|
|
|
warn!(
|
|
|
|
ctx.logger(),
|
|
|
|
"Source Blobstores {:?} of {:?} returned None even though they\
|
|
|
|
should contain data",
|
|
|
|
missing_source_blobstores,
|
|
|
|
seen_blobstores
|
|
|
|
);
|
|
|
|
for bid in missing_source_blobstores {
|
|
|
|
missing_blobstores.insert(bid);
|
|
|
|
}
|
2019-08-05 13:46:26 +03:00
|
|
|
}
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
let heal_puts: Vec<_> = missing_blobstores
|
|
|
|
.into_iter()
|
|
|
|
.map(|bid| {
|
|
|
|
let blobstore = blobstores
|
|
|
|
.get(&bid)
|
|
|
|
.expect("missing_blobstores contains unknown blobstore?");
|
|
|
|
blobstore
|
|
|
|
.put(ctx.clone(), key.clone(), blob.clone())
|
|
|
|
.then(move |result| Ok((bid, result.is_ok())))
|
|
|
|
})
|
|
|
|
.collect();
|
2019-01-15 21:08:00 +03:00
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
// If any puts fail make sure we put a good source blobstore_id for that blob
|
|
|
|
// back on the queue
|
|
|
|
join_all(heal_puts).and_then(move |heal_results| {
|
|
|
|
let (mut healed_stores, unhealed_stores): (HashSet<_>, HashSet<_>) =
|
|
|
|
heal_results.into_iter().partition_map(|(id, put_ok)| {
|
|
|
|
if put_ok {
|
|
|
|
Either::Left(id)
|
|
|
|
} else {
|
|
|
|
Either::Right(id)
|
|
|
|
}
|
|
|
|
});
|
|
|
|
if !unhealed_stores.is_empty() {
|
|
|
|
// Add known_good_blobstore to the healed_stores as we should write all
|
|
|
|
// known good blobstores so that the missing_blobstores logic run on read
|
|
|
|
// has the full data for the blobstore_key
|
|
|
|
//
|
|
|
|
// This also ensures we requeue at least one good source store in the case
|
|
|
|
// where all heal puts fail
|
|
|
|
healed_stores.insert(known_good_blobstore);
|
|
|
|
warn!(
|
|
|
|
ctx.logger(),
|
|
|
|
"Adding source blobstores {:?} to the queue so that failed \
|
|
|
|
destination blob stores {:?} will be retried later",
|
|
|
|
healed_stores,
|
|
|
|
unhealed_stores
|
|
|
|
);
|
|
|
|
requeue_partial_heal(ctx, sync_queue, key, healed_stores)
|
|
|
|
.map(|()| entries)
|
|
|
|
.left_future()
|
|
|
|
} else {
|
|
|
|
futures::future::ok(entries).right_future()
|
|
|
|
}
|
|
|
|
})
|
|
|
|
},
|
|
|
|
);
|
2019-01-15 21:08:00 +03:00
|
|
|
|
|
|
|
Some(heal_future.right_future())
|
|
|
|
}
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
/// Fetch a blob by `key` from one of the `seen_blobstores`. This tries them one at a time
/// sequentially, returning the known good store plus those found missing, or an error
fn fetch_blob(
    ctx: CoreContext,
    blobstores: Arc<HashMap<BlobstoreId, Arc<dyn Blobstore>>>,
    key: String,
    seen_blobstores: HashSet<BlobstoreId>,
) -> impl Future<Item = (BlobstoreBytes, BlobstoreId, Vec<BlobstoreId>), Error = Error> {
    // Candidate stores, popped one at a time below. HashSet iteration order is
    // unspecified, so which store ends up "known good" is not deterministic
    // when several stores hold the blob (see the disabled test in this file).
    let blobstores_to_fetch: Vec<_> = seen_blobstores.iter().cloned().collect();
    let err_context = format!(
        "While fetching blob '{}', seen in blobstores: {:?}",
        key, seen_blobstores
    );
    // Stores that returned Ok(None) even though the queue said they hold the blob.
    let missing_blobstores: Vec<BlobstoreId> = vec![];

    loop_fn(
        (blobstores_to_fetch, missing_blobstores),
        move |(mut blobstores_to_fetch, mut missing_blobstores)| {
            let bid = match blobstores_to_fetch.pop() {
                None => {
                    // Every candidate either errored or was missing the blob.
                    return Err(err_msg("None of the blobstores to fetch responded"))
                        .into_future()
                        .left_future();
                }
                Some(bid) => bid,
            };

            let blobstore = blobstores
                .get(&bid)
                .expect("blobstores_to_fetch contains only existing blobstores");

            blobstore
                .get(ctx.clone(), key.clone())
                .then(move |result| match result {
                    // A fetch error is treated like a miss, except the store is
                    // not recorded as missing (we don't know either way).
                    Err(_) => return Ok(Loop::Continue((blobstores_to_fetch, missing_blobstores))),
                    Ok(None) => {
                        missing_blobstores.push(bid);
                        return Ok(Loop::Continue((blobstores_to_fetch, missing_blobstores)));
                    }
                    // First store that actually has the blob wins.
                    Ok(Some(blob)) => Ok(Loop::Break((blob, bid, missing_blobstores))),
                })
                .right_future()
        },
    )
    .chain_err(err_context)
    .from_err()
}
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
/// Remove healed entries from the queue.
fn cleanup_after_healing(
    ctx: CoreContext,
    sync_queue: Arc<dyn BlobstoreSyncQueue>,
    entries: Vec<BlobstoreSyncQueueEntry>,
) -> impl Future<Item = (), Error = Error> {
    sync_queue.del(ctx, entries)
}
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
/// Write new queue items with a populated source blobstore for unhealed entries
|
|
|
|
/// Uses a current timestamp so we'll get around to trying them again for the destination
|
|
|
|
/// blobstores eventually without getting stuck on them permanently.
|
|
|
|
/// Uses a new queue entry id so the delete of original entry is safe.
|
|
|
|
fn requeue_partial_heal(
|
2019-01-15 21:08:00 +03:00
|
|
|
ctx: CoreContext,
|
2019-03-26 13:45:32 +03:00
|
|
|
sync_queue: Arc<dyn BlobstoreSyncQueue>,
|
2019-01-15 21:08:00 +03:00
|
|
|
blobstore_key: String,
|
2019-08-05 13:46:26 +03:00
|
|
|
source_blobstores: impl IntoIterator<Item = BlobstoreId>,
|
2019-01-15 21:08:00 +03:00
|
|
|
) -> impl Future<Item = (), Error = Error> {
|
|
|
|
let timestamp = DateTime::now();
|
|
|
|
|
2019-08-05 13:46:26 +03:00
|
|
|
join_all(source_blobstores.into_iter().map({
|
2019-01-15 21:08:00 +03:00
|
|
|
move |blobstore_id| {
|
2019-08-05 13:46:26 +03:00
|
|
|
cloned!(ctx, blobstore_key, timestamp);
|
2019-01-15 21:08:00 +03:00
|
|
|
sync_queue.add(
|
|
|
|
ctx,
|
|
|
|
BlobstoreSyncQueueEntry {
|
|
|
|
blobstore_key,
|
|
|
|
blobstore_id,
|
|
|
|
timestamp,
|
|
|
|
id: None,
|
|
|
|
},
|
|
|
|
)
|
|
|
|
}
|
|
|
|
}))
|
2019-08-05 13:46:26 +03:00
|
|
|
.map(|_: Vec<()>| ())
|
2019-01-15 21:08:00 +03:00
|
|
|
}
|
2019-08-05 13:46:26 +03:00
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use memblob::EagerMemblob;
    use std::iter::FromIterator;

    /// Create `n` empty in-memory blobstores with sequential ids.
    fn make_empty_stores(n: usize) -> (Vec<BlobstoreId>, HashMap<BlobstoreId, Arc<dyn Blobstore>>) {
        let mut test_bids = Vec::new();
        let mut test_stores = HashMap::new();
        for i in 0..n {
            let bid = BlobstoreId::new(i as u64);
            let store: Arc<dyn Blobstore> = Arc::new(EagerMemblob::new());
            test_bids.push(bid);
            test_stores.insert(bid, store);
        }
        (test_bids, test_stores)
    }

    /// Wrap a str as blobstore payload bytes.
    fn make_value(value: &str) -> BlobstoreBytes {
        BlobstoreBytes::from_bytes(value.as_bytes())
    }

    /// Store `value` under `key`; a `None` store is a no-op.
    fn put_value(ctx: &CoreContext, store: Option<&Arc<dyn Blobstore>>, key: &str, value: &str) {
        // NOTE(review): the returned put future is dropped unpolled; this relies
        // on EagerMemblob performing the write eagerly — confirm.
        if let Some(s) = store {
            let _ = s.put(ctx.clone(), key.to_string(), make_value(value));
        }
    }

    #[test]
    fn fetch_blob_missing_all() {
        // No store holds "specialk", so the fetch must exhaust every candidate
        // and report the terminal error.
        let ctx = CoreContext::test_mock();
        let (bids, stores) = make_empty_stores(3);
        for bid in &bids {
            put_value(&ctx, stores.get(bid), "dummyk", "dummyv");
        }
        let fut = fetch_blob(
            ctx,
            Arc::new(stores),
            "specialk".to_string(),
            HashSet::from_iter(bids.into_iter()),
        );
        let msg = fut
            .wait()
            .err()
            .and_then(|e| e.as_fail().cause().map(|f| format!("{}", f)));
        assert_eq!(
            Some("None of the blobstores to fetch responded".to_string()),
            msg
        );
    }

    #[test]
    fn fetch_blob_missing_none() {
        // Every store holds "specialk", so the fetch succeeds with its value.
        let ctx = CoreContext::test_mock();
        let (bids, stores) = make_empty_stores(3);
        for bid in &bids {
            put_value(&ctx, stores.get(bid), "specialk", "specialv");
        }
        let fut = fetch_blob(
            ctx,
            Arc::new(stores),
            "specialk".to_string(),
            HashSet::from_iter(bids.into_iter()),
        );
        let (foundv, _, _) = fut.wait().ok().unwrap();
        assert_eq!(make_value("specialv"), foundv);
    }

    // TODO enable as test once fetch_blob gives repeatable results in the some case
    // fn fetch_blob_missing_some() {
    //     let ctx = CoreContext::test_mock();
    //     let (bids, stores) = make_empty_stores(3);
    //     put_value(&ctx, stores.get(&bids[0]), "specialk", "specialv");
    //     put_value(&ctx, stores.get(&bids[1]), "dummyk", "dummyv");
    //     put_value(&ctx, stores.get(&bids[2]), "dummyk", "dummyv");
    //     let fut = fetch_blob(
    //         ctx,
    //         Arc::new(stores),
    //         "specialk".to_string(),
    //         HashSet::from_iter(bids.clone().into_iter()),
    //     );
    //     let r = fut.wait();
    //     let (foundblob, mut missing_bids) = r.ok().unwrap();
    //     assert_eq!(make_value("specialv"), foundblob);
    //     missing_bids.sort();
    //     assert_eq!(missing_bids, &bids[1..3]);
    // }
}
|