mononoke/blobstore_healer: check for replication lag _before_ starting work

Summary:
We had crashloops on this (which I'm fixing earlier in this stack), which
resulted in overloading our queue as we repeatedly tried to clear out 100K
entries at a time, rebooted, and tried again.

We can fix the root cause of the crash, but we should also make sure that
crashloops don't result in ignoring replication lag altogether: by checking lag
before each healing batch rather than after it, a freshly restarted process
still waits for replicas to catch up before doing more work.
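
Roughly, the new loop has this shape (a minimal sketch rather than the real
code; `wait_for_replication_lag` and `heal_one_batch` are hypothetical
stand-ins for the actual healer functions):

    use anyhow::Result;

    // Hypothetical stand-ins for the real lag check and healing step (sketch only).
    async fn wait_for_replication_lag(last_batch_was_full_size: bool) -> Result<()> {
        let _ = last_batch_was_full_size;
        Ok(())
    }

    async fn heal_one_batch() -> Result<bool> {
        Ok(false) // pretend the batch was not full size
    }

    async fn healing_loop(iter_limit: Option<u64>) -> Result<()> {
        let mut count = 0;
        let mut last_batch_was_full_size = true;
        loop {
            count += 1;
            if let Some(limit) = iter_limit {
                if count > limit {
                    return Ok(());
                }
            }
            // Check lag *before* healing: even if the previous iteration (or the
            // whole process) died, a restart can never skip the lag check and
            // immediately hammer the queue again.
            wait_for_replication_lag(last_batch_was_full_size).await?;
            // Only then do the actual work for this batch.
            last_batch_was_full_size = heal_one_batch().await?;
        }
    }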

Also, while in there, convert some of this code to async / await to make it
easier to work on.
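
The conversion follows the usual futures 0.1 -> async/await shape: the
`loop_fn` state machine becomes a plain `loop` inside an `async` block, and the
remaining 0.1 futures are bridged with `.compat()`. A generic sketch of that
shape (not the healer code itself; it assumes futures 0.1 is available under
the name `futures01`):

    // Cargo.toml (assumed): futures01 = { package = "futures", version = "0.1" }
    use futures01::future::{self, Future, Loop};

    // Before: control flow threaded through a futures-0.1 `loop_fn` state machine.
    fn count_up_01(limit: u64) -> impl Future<Item = u64, Error = ()> {
        future::loop_fn(0u64, move |count| {
            future::ok::<_, ()>(if count >= limit {
                Loop::Break(count)
            } else {
                Loop::Continue(count + 1)
            })
        })
    }

    // After: the same control flow as an ordinary loop in an async fn.
    async fn count_up(limit: u64) -> u64 {
        let mut count = 0u64;
        loop {
            if count >= limit {
                break count;
            }
            count += 1;
        }
    }

The healer itself still has to hand back a futures-0.1 `BoxFuture`, which is
why the new async block below ends with `.boxed().compat().boxify()`.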

Reviewed By: HarveyHunt

Differential Revision: D19997589

fbshipit-source-id: 20747e5a37758aee68b8af2e95786430de55f7b1
Thomas Orozco 2020-02-20 12:24:32 -08:00 committed by Facebook Github Bot
parent 6da3dc939a
commit be5d7343ce
3 changed files with 40 additions and 36 deletions


@@ -30,8 +30,9 @@ use futures::{
future::{self, join_all, loop_fn, ok, Loop},
prelude::*,
};
use futures_ext::{spawn_future, BoxFuture, FutureExt};
use futures_ext::{BoxFuture, FutureExt};
use futures_preview::compat::Future01CompatExt;
use futures_preview::future::{FutureExt as _, TryFutureExt};
use healer::Healer;
use lazy_static::lazy_static;
use metaconfig_types::{BlobConfig, MetadataDBConfig, StorageConfig};
@@ -273,36 +274,39 @@ fn schedule_healing(
iter_limit: Option<u64>,
heal_min_age: ChronoDuration,
) -> BoxFuture<(), Error> {
let regional_conns = Arc::new(regional_conns);
let fut = loop_fn(0, move |count: u64| {
match iter_limit {
Some(limit) => {
if count >= limit {
return future::ok(Loop::Break(())).left_future();
async move {
let mut count = 0;
let mut last_batch_was_full_size = true;
let regional_conns = Arc::new(regional_conns);
loop {
count += 1;
if let Some(iter_limit) = iter_limit {
if count > iter_limit {
break Ok(());
}
}
_ => (),
}
let max_replication_lag_fn = max_replication_lag(regional_conns.clone());
let now = DateTime::now().into_chrono();
let healing_deadline = DateTime::new(now - heal_min_age);
multiplex_healer
.heal(ctx.clone(), healing_deadline)
.and_then({
let logger = ctx.logger().clone();
move |last_batch_full_sized| {
ensure_small_db_replication_lag(
logger,
max_replication_lag_fn,
last_batch_full_sized,
)
.map(move |_lag| Loop::Continue(count + 1))
}
})
.right_future()
});
spawn_future(fut).boxify()
ensure_small_db_replication_lag(
ctx.logger().clone(),
max_replication_lag(regional_conns.clone()),
last_batch_was_full_size,
)
.compat()
.await?;
let now = DateTime::now().into_chrono();
let healing_deadline = DateTime::new(now - heal_min_age);
last_batch_was_full_size = multiplex_healer
.heal(ctx.clone(), healing_deadline)
.compat()
.await?;
}
}
.boxed()
.compat()
.boxify()
}
fn max_replication_lag(


@@ -23,11 +23,11 @@ Check that healer queue has all items
Run the heal
$ mononoke_blobstore_healer -q --iteration-limit=1 --heal-min-age-secs=0 --storage-id=blobstore --sync-queue-limit=100 2>&1 | strip_glog
Max replication lag is sqlite_region, 0s
As there are items remaining and lag < bound, carry on without pause
Found 30 blobs to be healed... Doing it
For 30 blobs did HealStats { queue_add: 0, queue_del: 90, put_success: 0, put_failure: 0 }
Deleting 90 actioned queue entries
Max replication lag is sqlite_region, 0s
As the last batch was not full sized, wait at least one second
Check that healer queue has drained
$ sqlite3 "$TESTTMP/monsql/sqlite_dbs" "select count(*) FROM blobstore_sync_queue";
@@ -57,12 +57,12 @@ Run the heal, with write errors injected, simulating store still bad
> uniq -c | sed 's/^ *//'
> }
$ mononoke_blobstore_healer --blobstore-write-chaos-rate 1 -q --iteration-limit=1 --heal-min-age-secs=0 --storage-id=blobstore --sync-queue-limit=100 2>&1 | strip_glog | count_log
1 Max replication lag is sqlite_region, 0s
1 As there are items remaining and lag < bound, carry on without pause
1 Found 30 blobs to be healed... Doing it
30 Adding source blobstores [BlobstoreId(1), BlobstoreId(2)] to the queue so that failed destination blob stores [BlobstoreId(0)] will be retried later
1 For 30 blobs did HealStats { queue_add: 60, queue_del: 60, put_success: 60, put_failure: 30 }
1 Deleting 60 actioned queue entries
1 Max replication lag is sqlite_region, 0s
1 As the last batch was not full sized, wait at least one second
Check that healer queue still has the items, should not have drained
$ sqlite3 "$TESTTMP/monsql/sqlite_dbs" "select count(*) FROM blobstore_sync_queue";
@@ -70,11 +70,11 @@ Check that healer queue still has the items, should not have drained
Healer run again now store recovered
$ mononoke_blobstore_healer -q --iteration-limit=1 --heal-min-age-secs=0 --storage-id=blobstore --sync-queue-limit=100 2>&1 | strip_glog | count_log
1 Max replication lag is sqlite_region, 0s
1 As there are items remaining and lag < bound, carry on without pause
1 Found 30 blobs to be healed... Doing it
1 For 30 blobs did HealStats { queue_add: 0, queue_del: 60, put_success: 30, put_failure: 0 }
1 Deleting 60 actioned queue entries
1 Max replication lag is sqlite_region, 0s
1 As the last batch was not full sized, wait at least one second
Check that healer queue has drained
$ sqlite3 "$TESTTMP/monsql/sqlite_dbs" "select count(*) FROM blobstore_sync_queue";


@@ -19,11 +19,11 @@ setup configuration
Drain the healer queue
$ mononoke_blobstore_healer -q --iteration-limit=1 --heal-min-age-secs=0 --storage-id=blobstore --sync-queue-limit=100 2>&1 | strip_glog | sed -re '/^(Could not connect to a replica in)|(Monitoring regions:)/d'
Max replication lag is *, 0s (glob)
As there are items remaining and lag < bound, carry on without pause
Found 30 blobs to be healed... Doing it
For 30 blobs did HealStats { queue_add: 0, queue_del: 60, put_success: 0, put_failure: 0 }
Deleting 60 actioned queue entries
Max replication lag is *, *s (glob)
As the last batch was not full sized, wait at least one second
Check bookmarks
$ mononoke_admin bookmarks log master_bookmark 2>&1 | grep master_bookmark