add a flag to report start-up success before finishing mounts

Summary:
Add a flag to tell edenfs to report successful start-up as soon as the thrift
server is running, without waiting for all mount points to finish being
remounted.

In the future I plan to have edenfs automatically perform an fsck scan of the
overlay for checkouts that were not shut down cleanly.  This may cause the
remount to take a significant amount of extra start-up time in some cases.
(This is already true today in some cases even with the simpler scan we do to
re-compute the max inode number.)

I think we will probably want to have systemd invoke edenfs with this option,
so that we do not time out during system start up if some mount points need to
be rescanned.

Reviewed By: strager

Differential Revision: D13522040

fbshipit-source-id: 6f183770c25efee34c4805c9bad42a9cce51039e
This commit is contained in:
Adam Simpkins 2019-03-06 20:30:45 -08:00 committed by Facebook Github Bot
parent ce1c8019a0
commit b47184adc4
7 changed files with 239 additions and 103 deletions

View File

@ -24,6 +24,11 @@ DEFINE_bool(
enable_fault_injection,
false,
"Enable the fault injection framework.");
DEFINE_bool(
fault_injection_block_mounts,
false,
"Block mount attempts via the fault injection framework. "
"Requires --enable_fault_injection.");
namespace facebook {
namespace eden {
@ -56,7 +61,19 @@ ServerState::ServerState(
kUserIgnoreMinPollSeconds}},
systemIgnoreFileMonitor_{CachedParsedFileMonitor<GitIgnoreFileParser>{
edenConfig->getSystemIgnoreFile(),
kSystemIgnoreMinPollSeconds}} {}
kSystemIgnoreMinPollSeconds}} {
// It would be nice if we eventually built a more generic mechanism for
// defining faults to be configured on start up. (e.g., loading this from the
// EdenConfig).
//
// For now, blocking mounts is the main thing we want to be able to control on
// startup (since mounting occurs automatically during startup). Add a
// one-off command line flag to control this for now, until we build a more
// generic mechanism.
if (FLAGS_fault_injection_block_mounts) {
faultInjector_->injectBlock("mount", ".*");
}
}
ServerState::~ServerState() {}

View File

@ -372,8 +372,10 @@ void EdenServer::scheduleInodeUnload(std::chrono::milliseconds timeout) {
}
#endif // !EDEN_WIN
Future<Unit> EdenServer::prepare(std::shared_ptr<StartupLogger> logger) {
return prepareImpl(std::move(logger))
Future<Unit> EdenServer::prepare(
std::shared_ptr<StartupLogger> logger,
bool waitForMountCompletion) {
return prepareImpl(std::move(logger), waitForMountCompletion)
.ensure(
// Mark the server state as RUNNING once we finish setting up the
// mount points. Even if an error occurs we still transition to the
@ -382,7 +384,9 @@ Future<Unit> EdenServer::prepare(std::shared_ptr<StartupLogger> logger) {
[this] { runningState_.wlock()->state = RunState::RUNNING; });
}
Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
Future<Unit> EdenServer::prepareImpl(
std::shared_ptr<StartupLogger> logger,
bool waitForMountCompletion) {
bool doingTakeover = false;
if (!acquireEdenLock()) {
// Another edenfs process is already running.
@ -429,8 +433,8 @@ Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
// receive its lock, thrift socket, and mount points now.
// This will shut down the old process.
const auto takeoverPath = edenDir_ + PathComponentPiece{kTakeoverSocketName};
TakeoverData takeoverData;
#endif
TakeoverData takeoverData;
if (doingTakeover) {
#ifndef EDEN_WIN
logger->log(
@ -494,96 +498,120 @@ Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
takeoverServer_->start();
#endif // !EDEN_WIN
// Trigger remounting of existing mount points
// If doingTakeover is true, use the mounts received in TakeoverData
std::vector<Future<Unit>> mountFutures;
if (doingTakeover) {
#ifndef EDEN_WIN
for (auto& info : takeoverData.mountPoints) {
const auto stateDirectory = info.stateDirectory;
auto mountFuture =
makeFutureWith([&] {
auto initialConfig = ClientConfig::loadFromClientDirectory(
AbsolutePathPiece{info.mountPath},
AbsolutePathPiece{info.stateDirectory});
return mount(std::move(initialConfig), std::move(info));
})
.thenTry([logger, mountPath = info.mountPath](
folly::Try<std::shared_ptr<EdenMount>>&& result) {
if (result.hasValue()) {
logger->log("Successfully took over mount ", mountPath);
return makeFuture();
} else {
logger->warn(
"Failed to perform takeover for ",
mountPath,
": ",
result.exception().what());
return makeFuture<Unit>(std::move(result).exception());
}
});
mountFutures.push_back(std::move(mountFuture));
}
mountFutures =
prepareMountsTakeover(logger, std::move(takeoverData.mountPoints));
#else
NOT_IMPLEMENTED();
#endif
#endif // !EDEN_WIN
} else {
folly::dynamic dirs = folly::dynamic::object();
try {
dirs = ClientConfig::loadClientDirectoryMap(edenDir_);
} catch (const std::exception& ex) {
logger->warn(
"Could not parse config.json file: ",
ex.what(),
"\nSkipping remount step.");
return std::move(thriftRunningFuture)
.thenValue(
[ew = folly::exception_wrapper(std::current_exception(), ex)](
auto&&) { return makeFuture<Unit>(ew); });
}
if (dirs.empty()) {
logger->log("No mount points currently configured.");
return thriftRunningFuture;
}
logger->log("Remounting ", dirs.size(), " mount points...");
for (const auto& client : dirs.items()) {
auto mountFuture =
makeFutureWith([&] {
MountInfo mountInfo;
mountInfo.mountPoint = client.first.c_str();
auto edenClientPath = edenDir_ + PathComponent("clients") +
PathComponent(client.second.c_str());
mountInfo.edenClientPath = edenClientPath.stringPiece().str();
auto initialConfig = ClientConfig::loadFromClientDirectory(
AbsolutePathPiece{mountInfo.mountPoint},
AbsolutePathPiece{mountInfo.edenClientPath});
return mount(std::move(initialConfig));
})
.thenTry([logger, mountPath = client.first.asString()](
folly::Try<std::shared_ptr<EdenMount>>&& result) {
if (result.hasValue()) {
logger->log("Successfully remounted ", mountPath);
return makeFuture();
} else {
logger->warn(
"Failed to remount ",
mountPath,
": ",
result.exception().what());
return makeFuture<Unit>(std::move(result).exception());
}
});
mountFutures.push_back(std::move(mountFuture));
}
mountFutures = prepareMounts(logger);
}
// Return a future that will complete only when all mount points have started
// and the thrift server is also running.
return folly::collectAll(mountFutures)
.thenValue([thriftFuture = std::move(thriftRunningFuture)](
auto&&) mutable { return std::move(thriftFuture); });
if (waitForMountCompletion) {
// Return a future that will complete only when all mount points have
// started and the thrift server is also running.
mountFutures.emplace_back(std::move(thriftRunningFuture));
return folly::collectAll(mountFutures).unit();
} else {
// Don't wait for the mount futures.
// Only return the thrift future.
return thriftRunningFuture;
}
}
std::vector<Future<Unit>> EdenServer::prepareMountsTakeover(
shared_ptr<StartupLogger> logger,
std::vector<TakeoverData::MountInfo>&& takeoverMounts) {
// Trigger remounting of existing mount points
// If doingTakeover is true, use the mounts received in TakeoverData
std::vector<Future<Unit>> mountFutures;
#ifndef EDEN_WIN
for (auto& info : takeoverMounts) {
const auto stateDirectory = info.stateDirectory;
auto mountFuture =
makeFutureWith([&] {
auto initialConfig = ClientConfig::loadFromClientDirectory(
AbsolutePathPiece{info.mountPath},
AbsolutePathPiece{info.stateDirectory});
return mount(std::move(initialConfig), std::move(info));
})
.thenTry([logger, mountPath = info.mountPath](
folly::Try<std::shared_ptr<EdenMount>>&& result) {
if (result.hasValue()) {
logger->log("Successfully took over mount ", mountPath);
return makeFuture();
} else {
logger->warn(
"Failed to perform takeover for ",
mountPath,
": ",
result.exception().what());
return makeFuture<Unit>(std::move(result).exception());
}
});
mountFutures.push_back(std::move(mountFuture));
}
#else
NOT_IMPLEMENTED();
#endif
return mountFutures;
}
std::vector<Future<Unit>> EdenServer::prepareMounts(
shared_ptr<StartupLogger> logger) {
std::vector<Future<Unit>> mountFutures;
folly::dynamic dirs = folly::dynamic::object();
try {
dirs = ClientConfig::loadClientDirectoryMap(edenDir_);
} catch (const std::exception& ex) {
logger->warn(
"Could not parse config.json file: ",
ex.what(),
"\nSkipping remount step.");
mountFutures.emplace_back(
folly::exception_wrapper(std::current_exception(), ex));
return mountFutures;
}
if (dirs.empty()) {
logger->log("No mount points currently configured.");
return mountFutures;
}
logger->log("Remounting ", dirs.size(), " mount points...");
for (const auto& client : dirs.items()) {
auto mountFuture =
makeFutureWith([&] {
MountInfo mountInfo;
mountInfo.mountPoint = client.first.c_str();
auto edenClientPath = edenDir_ + PathComponent("clients") +
PathComponent(client.second.c_str());
mountInfo.edenClientPath = edenClientPath.stringPiece().str();
auto initialConfig = ClientConfig::loadFromClientDirectory(
AbsolutePathPiece{mountInfo.mountPoint},
AbsolutePathPiece{mountInfo.edenClientPath});
return mount(std::move(initialConfig));
})
.thenTry([logger, mountPath = client.first.asString()](
folly::Try<std::shared_ptr<EdenMount>>&& result) {
if (result.hasValue()) {
logger->log("Successfully remounted ", mountPath);
return makeFuture();
} else {
logger->warn(
"Failed to remount ",
mountPath,
": ",
result.exception().what());
return makeFuture<Unit>(std::move(result).exception());
}
});
mountFutures.push_back(std::move(mountFuture));
}
return mountFutures;
}
void EdenServer::run(void (*runThriftServer)(const EdenServer&)) {

View File

@ -115,15 +115,19 @@ class EdenServer : private TakeoverHandler {
* however a few steps complete asynchronously. The status of the
* asynchronous preparation steps is tracked in the returned Future object.
*
* The returned future does not complete until all configured mount points
* have been remounted and until the thrift server is accepting connections.
* The returned future will complete once the EdenServer is running
* successfully and accepting thrift connections.
*
* If waitForMountCompletion is true the returned future will also not
* become ready until all configured mount points have been remounted.
* If an error occurs remounting some mount points the Future will complete
* with an exception, but the server will still continue to run. Everything
* will be running normally except for the mount points that failed to be
* remounted.
*/
FOLLY_NODISCARD folly::Future<folly::Unit> prepare(
std::shared_ptr<StartupLogger> logger);
std::shared_ptr<StartupLogger> logger,
bool waitForMountCompletion = true);
/**
* Run the EdenServer.
@ -342,6 +346,12 @@ class EdenServer : private TakeoverHandler {
* prepareImpl() contains the bulk of the implementation of prepare()
*/
FOLLY_NODISCARD folly::Future<folly::Unit> prepareImpl(
std::shared_ptr<StartupLogger> logger,
bool waitForMountCompletion);
FOLLY_NODISCARD std::vector<folly::Future<folly::Unit>> prepareMountsTakeover(
std::shared_ptr<StartupLogger> logger,
std::vector<TakeoverData::MountInfo>&& takeoverMounts);
FOLLY_NODISCARD std::vector<folly::Future<folly::Unit>> prepareMounts(
std::shared_ptr<StartupLogger> logger);
// Called when a mount has been unmounted and has stopped.

View File

@ -45,6 +45,10 @@ class StartupLogger {
/**
* Log an informational message.
*
* Note that it is valid to call log() even after success() has been called.
* This can occur if edenfs has been asked to report successful startup
* without waiting for all mount points to be remounted.
*/
template <typename... Args>
void log(Args&&... args) {

View File

@ -44,6 +44,11 @@ DEFINE_string(
logPath,
"",
"If set, redirects stdout and stderr to the log file given.");
DEFINE_bool(
noWaitForMounts,
false,
"Report successful startup without waiting for all configured mounts "
"to be remounted.");
constexpr folly::StringPiece kDefaultUserConfigFile{".edenrc"};
constexpr folly::StringPiece kEdenfsConfigFile{"edenfs.rc"};
@ -250,7 +255,7 @@ int main(int argc, char** argv) {
server.emplace(
std::move(identity), std::move(privHelper), std::move(edenConfig));
prepareFuture = server->prepare(startupLogger);
prepareFuture = server->prepare(startupLogger, !FLAGS_noWaitForMounts);
} catch (const std::exception& ex) {
startupLogger->exitUnsuccessfully(
EX_SOFTWARE, "error starting edenfs: ", folly::exceptionStr(ex));

View File

@ -151,7 +151,12 @@ class EdenFS(object):
cmd.extend(args)
return cmd
def start(self, timeout: float = 60, takeover_from: Optional[int] = None) -> None:
def start(
self,
timeout: float = 60,
takeover_from: Optional[int] = None,
extra_args: Optional[List[str]] = None,
) -> None:
"""
Run "eden daemon" to start the eden daemon.
"""
@ -165,7 +170,7 @@ class EdenFS(object):
timeout += 90
takeover = takeover_from is not None
self._spawn(gdb=use_gdb, takeover=takeover)
self.spawn_nowait(gdb=use_gdb, takeover=takeover, extra_args=extra_args)
assert self._process is not None
util.wait_for_daemon_healthy(
@ -214,7 +219,15 @@ class EdenFS(object):
return extra_daemon_args
def _spawn(self, gdb: bool = False, takeover: bool = False) -> None:
def spawn_nowait(
self,
gdb: bool = False,
takeover: bool = False,
extra_args: Optional[List[str]] = None,
) -> None:
"""
Start edenfs but do not wait for it to become healthy.
"""
if self._process is not None:
raise Exception("cannot start an already-running eden client")
@ -226,6 +239,8 @@ class EdenFS(object):
)
extra_daemon_args = self.get_extra_daemon_args()
if extra_args:
extra_daemon_args.extend(extra_args)
if takeover:
args.append("--takeover")

View File

@ -15,7 +15,10 @@ from pathlib import Path
from typing import Optional, Set
from eden.cli.util import poll_until
from eden.thrift import EdenClient, EdenNotRunningError
from facebook.eden.ttypes import FaultDefinition, MountState, UnblockFaultArg
from fb303.ttypes import fb_status
from thrift.Thrift import TException
from .lib import testcase
@ -142,17 +145,71 @@ class MountTest(testcase.EdenRepoTest):
# Unblock mounting and wait for the mount to transition to running
client.unblockFault(UnblockFaultArg(keyClass="mount", keyValueRegex=".*"))
def mount_running() -> Optional[bool]:
if (
self.eden.get_mount_state(Path(self.mount), client)
== MountState.RUNNING
):
return True
self._wait_for_mount_running(client)
self.assertEqual({self.mount: "RUNNING"}, self.eden.list_cmd_simple())
def test_start_blocked_mount_init(self) -> None:
self.eden.shutdown()
self.eden.spawn_nowait(
extra_args=["--enable_fault_injection", "--fault_injection_block_mounts"]
)
# Wait for eden to report the mount point in the listMounts() output
def is_initializing() -> Optional[bool]:
try:
with self.eden.get_thrift_client() as client:
if self.eden.get_mount_state(Path(self.mount), client) is not None:
return True
assert self.eden._process is not None
if self.eden._process.poll():
self.fail("eden exited before becoming healthy")
return None
except (EdenNotRunningError, TException):
return None
poll_until(mount_running, timeout=30)
self.assertEqual({self.mount: "RUNNING"}, self.eden.list_cmd_simple())
self.assertEqual(0, mount_proc.wait())
poll_until(is_initializing, timeout=60)
with self.eden.get_thrift_client() as client:
# Since we blocked mount initialization the mount should still
# report as INITIALIZING, and edenfs should report itself STARTING
self.assertEqual({self.mount: "INITIALIZING"}, self.eden.list_cmd_simple())
self.assertEqual(fb_status.STARTING, client.getStatus())
# Unblock mounting and wait for the mount to transition to running
client.unblockFault(UnblockFaultArg(keyClass="mount", keyValueRegex=".*"))
self._wait_for_mount_running(client)
self.assertEqual(fb_status.ALIVE, client.getStatus())
self.assertEqual({self.mount: "RUNNING"}, self.eden.list_cmd_simple())
def test_start_no_mount_wait(self) -> None:
self.eden.shutdown()
self.eden.start(
extra_args=[
"--noWaitForMounts",
"--enable_fault_injection",
"--fault_injection_block_mounts",
]
)
self.assertEqual({self.mount: "INITIALIZING"}, self.eden.list_cmd_simple())
# Unblock mounting and wait for the mount to transition to running
with self.eden.get_thrift_client() as client:
self.assertEqual(fb_status.ALIVE, client.getStatus())
client.unblockFault(UnblockFaultArg(keyClass="mount", keyValueRegex=".*"))
self._wait_for_mount_running(client)
self.assertEqual({self.mount: "RUNNING"}, self.eden.list_cmd_simple())
def _wait_for_mount_running(self, client: EdenClient) -> None:
def mount_running() -> Optional[bool]:
if (
self.eden.get_mount_state(Path(self.mount), client)
== MountState.RUNNING
):
return True
return None
poll_until(mount_running, timeout=60)
def test_remount_creates_bind_mounts_if_needed(self) -> None:
# Add a repo definition to the config that includes some bind mounts.