diff --git a/eden/cli/main.py b/eden/cli/main.py index ce0e03d2db..42505e887d 100644 --- a/eden/cli/main.py +++ b/eden/cli/main.py @@ -23,6 +23,7 @@ import eden.thrift from eden.thrift import EdenNotRunningError from facebook.eden import EdenService from facebook.eden.ttypes import GlobParams +from fb303.ttypes import fb_status from . import ( buck, @@ -730,6 +731,7 @@ def stop_aux_processes(client: eden.thrift.EdenClient) -> None: RESTART_MODE_FULL = "full" RESTART_MODE_GRACEFUL = "graceful" +RESTART_MODE_FORCE = "force" @subcmd("restart", "Restart the edenfs daemon") @@ -755,66 +757,159 @@ class RestartCmd(Subcmd): "disruption to clients. Open file handles will continue to work " "across the restart.", ) + mode_group.add_argument( + "--force", + action="store_const", + const=RESTART_MODE_FORCE, + dest="restart_type", + help="Force a full restart, even if the existing edenfs daemon is " + "still in the middle of starting or stopping.", + ) + parser.add_argument( + "--daemon-binary", help="Path to the binary for the Eden daemon." + ) parser.add_argument( "--shutdown-timeout", type=float, - default=30, + default=None, help="How long to wait for the old edenfs process to exit when " "performing a full restart.", ) def run(self, args: argparse.Namespace) -> int: self.args = args - - if self.args.restart_type is None: + if args.restart_type is None: # Default to a full restart for now - self.args.restart_type = RESTART_MODE_FULL + args.restart_type = RESTART_MODE_FULL - self.config = create_config(args) - stopping = False - pid = None - try: - with self.config.get_thrift_client() as client: - pid = client.getPid() - if self.args.restart_type == RESTART_MODE_FULL: - stop_aux_processes(client) - # Ask the old edenfs daemon to shutdown - self.msg("Stopping the existing edenfs daemon (pid {})...", pid) - client.initiateShutdown( - f"`eden restart` requested by pid={os.getpid()} " - f"uid={os.getuid()}" - ) - stopping = True - except EdenNotRunningError: - pass + config = create_config(self.args) - if stopping: - assert isinstance(pid, int) - daemon.wait_for_shutdown( - self.config, pid, timeout=self.args.shutdown_timeout - ) - self._start() - elif pid is None: - self.msg("edenfs is not currently running.") - self._start() + health = config.check_health() + if health.is_healthy(): + assert health.pid is not None + if self.args.restart_type == RESTART_MODE_GRACEFUL: + return self._graceful_restart(config) + else: + return self._full_restart(config, health.pid) + elif health.pid is None: + # The daemon is not running + return self._start(config) else: - self._graceful_start() + if health.status == fb_status.STARTING: + print( + f"The current edenfs daemon (pid {health.pid}) is still starting." + ) + # Give the daemon a little extra time to hopefully finish starting + # before we time out and kill it. + stop_timeout = 30 + elif health.status == fb_status.STOPPING: + print( + f"The current edenfs daemon (pid {health.pid}) is in the middle " + "of stopping." + ) + # Use a reduced stopping timeout. If the user is using --force + # then the daemon is probably stuck or something, and we'll likely need + # to kill it anyway. + stop_timeout = 5 + else: + # The only other status value we generally expect to receive here is + # fb_status.STOPPED. This is returned if we found an existing edenfs + # process but it is not responding to thrift calls. + print( + f"Found an existing edenfs daemon (pid {health.pid} that does not " + "seem to be responding to thrift calls." + ) + # Don't attempt to ask the daemon to stop at all in this case; + # just kill it. + stop_timeout = 0 + + if self.args.restart_type != RESTART_MODE_FORCE: + print(f"Use --force if you want to forcibly restart the current daemon") + return 1 + return self._force_restart(config, health.pid, stop_timeout) + + def _graceful_restart(self, config: config_mod.Config) -> int: + print("Performing a graceful restart...") + daemon.exec_daemon(config, daemon_binary=self.args.daemon_binary, takeover=True) + return 1 # never reached + + def _start(self, config: config_mod.Config) -> int: + print("Eden is not currently running. Starting it...") + daemon.exec_daemon(config, daemon_binary=self.args.daemon_binary) + return 1 # never reached + + def _full_restart(self, config: config_mod.Config, old_pid: int) -> int: + print( + """\ +About to perform a full restart of Eden. +Note: this will temporarily disrupt access to your Eden-managed repositories. +Any programs using files or directories inside the Eden mounts will need to +re-open these files after Eden is restarted. +""" + ) + if self.args.restart_type != RESTART_MODE_FORCE and sys.stdin.isatty(): + if not prompt_confirmation("Proceed?"): + print("Not confirmed.") + return 1 + + self._do_stop(config, old_pid, timeout=15) + return self._finish_restart(config) + + def _force_restart( + self, config: config_mod.Config, old_pid: int, stop_timeout: int + ) -> int: + print("Forcing a full restart...") + if stop_timeout <= 0: + print("Sending SIGTERM...") + os.kill(old_pid, signal.SIGTERM) + self._wait_for_stop(config, old_pid, timeout=5) + else: + self._do_stop(config, old_pid, stop_timeout) + + return self._finish_restart(config) + + def _wait_for_stop(self, config: config_mod.Config, pid: int, timeout: int) -> None: + # If --shutdown-timeout was specified on the command line that always takes + # precedence over the default timeout passed in by our caller. + if self.args.shutdown_timeout is not None: + timeout = self.args.shutdown_timeout + daemon.wait_for_shutdown(config, pid, timeout=timeout) + + def _do_stop(self, config: config_mod.Config, pid: int, timeout: int) -> None: + with config.get_thrift_client() as client: + try: + stop_aux_processes(client) + except Exception as ex: + pass + try: + client.initiateShutdown( + f"`eden restart --force` requested by pid={os.getpid()} " + f"uid={os.getuid()}" + ) + except Exception as ex: + print("Sending SIGTERM...") + os.kill(pid, signal.SIGTERM) + self._wait_for_stop(config, pid, timeout) + + def _finish_restart(self, config: config_mod.Config) -> int: + exit_code = daemon.start_daemon(config, daemon_binary=self.args.daemon_binary) + if exit_code != 0: + print("Failed to start edenfs!", file=sys.stderr) + return exit_code + + print( + """\ + +Successfully restarted edenfs. +Note: any programs running inside of an Eden-managed directory will need to cd +out of and back into the repository to pick up the new working directory state. +If you see "Transport endpoint not connected" errors from any program this +means it is still attempting to use the old mount point from the previous Eden +process.""" + ) return 0 - def msg(self, msg: str, *args: Any, **kwargs: Any) -> None: - if args or kwargs: - msg = msg.format(*args, **kwargs) - print(msg) - - def _start(self) -> None: - self.msg("Starting edenfs...") - daemon.exec_daemon(self.config) - - def _graceful_start(self) -> None: - self.msg("Performing a graceful restart...") - daemon.exec_daemon(self.config, takeover=True) - @subcmd("rage", "Gather diagnostic information about eden") class RageCmd(Subcmd): diff --git a/eden/cli/util.py b/eden/cli/util.py index 187f6d1595..fecb93db1f 100644 --- a/eden/cli/util.py +++ b/eden/cli/util.py @@ -122,7 +122,7 @@ def _check_health_using_lockfile(config_dir: str) -> HealthStatus: comm = stdout.rstrip().decode("utf8") # Note that the command may be just "edenfs" rather than a path, but it # works out fine either way. - if os.path.basename(comm) == "edenfs": + if os.path.basename(comm) in ("edenfs", "fake_edenfs"): return HealthStatus( fb_status.STOPPED, int(pid), diff --git a/eden/integration/helpers/fake_edenfs.cpp b/eden/integration/helpers/fake_edenfs.cpp index c47abf6e6e..bbbe810f1d 100644 --- a/eden/integration/helpers/fake_edenfs.cpp +++ b/eden/integration/helpers/fake_edenfs.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -24,10 +25,13 @@ using namespace facebook::eden; using apache::thrift::ThriftServer; using facebook::fb303::cpp2::fb_status; using folly::EventBase; +using folly::StringPiece; using std::make_shared; +using std::string; DEFINE_bool(allowRoot, false, "Allow running eden directly as root"); DEFINE_bool(foreground, false, "Run edenfs in the foreground"); +DEFINE_bool(ignoreStop, false, "Ignore attempts to stop edenfs"); DEFINE_string(edenDir, "", "The path to the .eden directory"); DEFINE_string( etcEdenDir, @@ -39,15 +43,72 @@ DEFINE_string( "", "If set, redirects stdout and stderr to the log file given."); -FOLLY_INIT_LOGGING_CONFIG("eden=DBG2"); +FOLLY_INIT_LOGGING_CONFIG(".=INFO,eden=DBG2"); namespace { + +class FakeEdenServiceHandler; + +class FakeEdenServer { + public: + FakeEdenServer() {} + + void run(folly::SocketAddress thriftAddress, StartupLogger& startupLogger); + void stop(StringPiece reason) { + if (!honorStop_) { + XLOG(INFO) << "ignoring stop attempt: " << reason; + return; + } + + XLOG(INFO) << "stopping: " << reason; + eventBase_->terminateLoopSoon(); + } + + void setHonorStop(bool honorStop) { + honorStop_ = honorStop; + } + + private: + EventBase* eventBase_{nullptr}; + ThriftServer server_; + std::shared_ptr handler_; + bool honorStop_{true}; +}; + class FakeEdenServiceHandler : virtual public StreamingEdenServiceSvIf { public: - FakeEdenServiceHandler() {} + explicit FakeEdenServiceHandler(FakeEdenServer* server) : server_{server} {} fb_status getStatus() override { - return fb_status::ALIVE; + return status_; + } + + void setOption(std::unique_ptr name, std::unique_ptr value) + override { + auto badOption = [&]() { + auto errMsg = folly::to( + "invalid value for ", *name, " setting: \"", *value, "\""); + XLOG(ERR) << errMsg; + throw std::invalid_argument(errMsg); + }; + + if (*name == "honor_stop") { + auto boolValue = folly::tryTo(*value); + if (boolValue.hasError()) { + badOption(); + } + server_->setHonorStop(boolValue.value()); + } else if (*name == "status") { + if (*value == "starting") { + status_ = fb_status::STARTING; + } else if (*value == "alive") { + status_ = fb_status::ALIVE; + } else if (*value == "stopping") { + status_ = fb_status::STOPPING; + } else { + badOption(); + } + } } int64_t getPid() override { @@ -59,39 +120,66 @@ class FakeEdenServiceHandler : virtual public StreamingEdenServiceSvIf { } void shutdown() override { - printf("received shutdown() thrift request\n"); + server_->stop("received shutdown() thrift request"); } - void initiateShutdown(std::unique_ptr reason) override { - printf( - "received initiateShutdown() thrift requested: %s\n", reason->c_str()); + void initiateShutdown(std::unique_ptr reason) override { + server_->stop(folly::to( + "received initiateShutdown() thrift requested: ", reason->c_str())); } + + private: + FakeEdenServer* server_{nullptr}; + fb_status status_{fb_status::ALIVE}; }; class SignalHandler : public folly::AsyncSignalHandler { public: - explicit SignalHandler(EventBase* eventBase) : AsyncSignalHandler(eventBase) { + SignalHandler(EventBase* eventBase, FakeEdenServer* server) + : AsyncSignalHandler(eventBase), server_{server} { registerSignalHandler(SIGINT); registerSignalHandler(SIGTERM); } void signalReceived(int sig) noexcept override { - // We just print a message when we receive a signal, - // but ignore it otherwise switch (sig) { case SIGINT: - printf("received SIGINT\n"); + server_->stop("received SIGINT"); break; case SIGTERM: - printf("received SIGTERM\n"); + server_->stop("received SIGTERM"); break; default: - printf("received signal %d\n", sig); + XLOG(INFO) << "received signal " << sig; break; } } + + private: + FakeEdenServer* server_{nullptr}; }; +void FakeEdenServer::run( + folly::SocketAddress thriftAddress, + StartupLogger& startupLogger) { + eventBase_ = folly::EventBaseManager::get()->getEventBase(); + + // Create the ThriftServer object + auto handler = make_shared(this); + server_.setInterface(handler); + server_.setAddress(thriftAddress); + + // Set up a signal handler to ignore SIGINT and SIGTERM + // This lets our integration tests exercise the case where edenfs does not + // shut down on its own. + SignalHandler signalHandler(eventBase_, this); + + // Run the thrift server + server_.setup(); + startupLogger.success(); + eventBase_->loopForever(); +} + bool acquireLock(AbsolutePathPiece edenDir) { const auto lockPath = edenDir + "lock"_pc; auto lockFile = folly::File(lockPath.value(), O_WRONLY | O_CREAT); @@ -108,6 +196,7 @@ bool acquireLock(AbsolutePathPiece edenDir) { lockFile.release(); return true; } + } // namespace int main(int argc, char** argv) { @@ -132,6 +221,8 @@ int main(int argc, char** argv) { startupLogger.exitUnsuccessfully(1, "Failed to acquire lock file"); } + startupLogger.log("Starting fake edenfs daemon"); + // Get the path to the thrift socket. auto thriftSocketPath = edenDir + "socket"_pc; folly::SocketAddress thriftAddress; @@ -149,21 +240,10 @@ int main(int argc, char** argv) { folly::errnoStr(errnum)); } - // Create the ThriftServer object - auto handler = make_shared(); - ThriftServer server; - server.setInterface(handler); - server.setAddress(thriftAddress); - - // Set up a signal handler to ignore SIGINT and SIGTERM - // This lets our integration tests exercise the case where edenfs does not - // shut down on its own. - SignalHandler signalHandler(server.getEventBaseManager()->getEventBase()); - - // Run the thrift server - server.setup(); - startupLogger.success(); - folly::EventBaseManager::get()->getEventBase()->loopForever(); - + FakeEdenServer server; + if (FLAGS_ignoreStop) { + server.setHonorStop(false); + } + server.run(thriftAddress, startupLogger); return 0; } diff --git a/eden/integration/restart_test.py b/eden/integration/restart_test.py new file mode 100644 index 0000000000..969aa429da --- /dev/null +++ b/eden/integration/restart_test.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2016-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +import os +import shutil +import subprocess +import sys +import tempfile +import unittest +from typing import Any + +import eden.thrift +import eden.thrift.client +import pexpect + +from .lib.find_executables import FindExe + + +class RestartTest(unittest.TestCase): + def setUp(self) -> None: + def cleanup_tmp_dir() -> None: + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + self.tmp_dir = tempfile.mkdtemp(prefix="eden_test.") + self.addCleanup(cleanup_tmp_dir) + + def ensure_stopped() -> None: + stop_cmd = [FindExe.EDEN_CLI, "--config-dir", self.tmp_dir, "stop"] + subprocess.call(stop_cmd) + + self.addCleanup(ensure_stopped) + + def _spawn_restart(self, *args: str, **kwargs: Any) -> pexpect.spawn: + restart_cmd = [ + FindExe.EDEN_CLI, + "--config-dir", + self.tmp_dir, + "restart", + "--daemon-binary", + FindExe.FAKE_EDENFS, + ] + restart_cmd.extend(args) + + kwargs.setdefault("logfile", sys.stdout.buffer) + kwargs.setdefault("timeout", 5) + + print("Retarting eden: %r" % (restart_cmd,)) + return pexpect.spawn(restart_cmd[0], restart_cmd[1:], **kwargs) + + def _start_fake_edenfs(self) -> int: + # Run "eden restart". It should start it without prompting since edenfs is not + # already running. + p = self._spawn_restart() + p.expect_exact("Eden is not currently running. Starting it...") + p.expect_exact("Starting fake edenfs daemon") + p.expect(r"Started edenfs \(pid ([0-9]+)\)") + pid = int(p.match.group(1)) + p.wait() + self.assertEqual(p.exitstatus, 0) + return pid + + def _get_thrift_client(self) -> eden.thrift.EdenClient: + return eden.thrift.create_thrift_client(self.tmp_dir) + + def test_restart(self) -> None: + self._start_fake_edenfs() + + # Run "eden restart" + # It should prompt since we are about to do a non-graceful restart. + p = self._spawn_restart() + p.expect_exact("About to perform a full restart of Eden") + p.expect_exact( + "Note: this will temporarily disrupt access to your Eden-managed " + "repositories" + ) + p.expect_exact("Proceed? [y/N] ") + p.sendline("y") + p.expect_exact("Starting fake edenfs daemon") + p.expect(r"Started edenfs \(pid [0-9]+\)") + p.expect_exact("Successfully restarted edenfs.") + p.expect_exact( + "Note: any programs running inside of an Eden-managed " + "directory will need to cd" + ) + p.wait() + self.assertEqual(p.exitstatus, 0) + + def test_restart_sigkill(self) -> None: + self._start_fake_edenfs() + + # Tell the fake edenfs binary to ignore attempts to stop it + with self._get_thrift_client() as client: + client.setOption("honor_stop", "false") + + # Run "eden restart". It should have to kill eden with SIGKILL during the + # restart operation. + # Explicitly pass in a shorter than normal shutdown timeout just to reduce the + # amount of time required for the test. + p = self._spawn_restart("--shutdown-timeout=1") + p.expect_exact("About to perform a full restart of Eden") + p.expect_exact( + "Note: this will temporarily disrupt access to your Eden-managed " + "repositories" + ) + p.expect_exact("Proceed? [y/N] ") + p.sendline("y") + p.expect( + r"sent shutdown request, but edenfs did not exit within " + r"[.0-9]+ seconds. Attempting SIGKILL." + ) + p.expect_exact("Starting fake edenfs daemon") + p.expect(r"Started edenfs \(pid [0-9]+\)") + p.expect_exact("Successfully restarted edenfs.") + p.expect_exact( + "Note: any programs running inside of an Eden-managed " + "directory will need to cd" + ) + p.wait() + self.assertEqual(p.exitstatus, 0) + + def test_restart_force(self) -> None: + self._start_fake_edenfs() + + # "eden restart --force" should not prompt if the user wants to proceed + p = self._spawn_restart("--force") + p.expect_exact("About to perform a full restart of Eden") + p.expect_exact( + "Note: this will temporarily disrupt access to your Eden-managed " + "repositories" + ) + p.expect_exact("Starting fake edenfs daemon") + p.expect(r"Started edenfs \(pid [0-9]+\)") + p.expect_exact("Successfully restarted edenfs.") + p.expect_exact( + "Note: any programs running inside of an Eden-managed " + "directory will need to cd" + ) + p.wait() + self.assertEqual(p.exitstatus, 0) + + def test_restart_while_starting(self) -> None: + orig_pid = self._start_fake_edenfs() + + # Tell the fake edenfs daemon to report its status as "starting" + with self._get_thrift_client() as client: + client.setOption("status", "starting") + + # "eden restart" should not restart if edenfs is still starting + p = self._spawn_restart() + p.expect_exact(f"The current edenfs daemon (pid {orig_pid}) is still starting") + p.expect_exact("Use --force if you want to forcibly restart the current daemon") + p.wait() + self.assertEqual(p.exitstatus, 1) + + # "eden restart --force" should force the restart anyway + p = self._spawn_restart("--force") + p.expect_exact(f"The current edenfs daemon (pid {orig_pid}) is still starting") + p.expect_exact("Forcing a full restart...") + p.expect_exact("Starting fake edenfs daemon") + p.expect(r"Started edenfs \(pid [0-9]+\)") + p.expect_exact("Successfully restarted edenfs.") + p.expect_exact( + "Note: any programs running inside of an Eden-managed " + "directory will need to cd" + ) + p.wait() + self.assertEqual(p.exitstatus, 0) + + def test_restart_unresponsive_thrift(self) -> None: + orig_pid = self._start_fake_edenfs() + + # Rename the thrift socket so that "eden restart" will not be able to + # communicate with the existing daemon. + os.rename( + os.path.join(self.tmp_dir, eden.thrift.client.SOCKET_PATH), + os.path.join(self.tmp_dir, "old.socket"), + ) + + # "eden restart" should not restart if it cannot confirm the current health of + # edenfs. + p = self._spawn_restart() + p.expect_exact( + f"Found an existing edenfs daemon (pid {orig_pid} that does not " + "seem to be responding to thrift calls." + ) + p.expect_exact("Use --force if you want to forcibly restart the current daemon") + p.wait() + self.assertEqual(p.exitstatus, 1) + + # "eden restart --force" should force the restart anyway + p = self._spawn_restart("--force") + p.expect_exact( + f"Found an existing edenfs daemon (pid {orig_pid} that does not " + "seem to be responding to thrift calls." + ) + p.expect_exact("Forcing a full restart...") + p.expect_exact("Starting fake edenfs daemon") + p.expect(r"Started edenfs \(pid [0-9]+\)") + p.expect_exact("Successfully restarted edenfs.") + p.expect_exact( + "Note: any programs running inside of an Eden-managed " + "directory will need to cd" + ) + p.wait() + self.assertEqual(p.exitstatus, 0) diff --git a/eden/integration/stop_test.py b/eden/integration/stop_test.py index 3264e50128..f283839c91 100644 --- a/eden/integration/stop_test.py +++ b/eden/integration/stop_test.py @@ -38,6 +38,8 @@ class StopTest(unittest.TestCase): "start", "--daemon-binary", FindExe.FAKE_EDENFS, + "--", + "--ignoreStop", ] print("Starting eden: %r" % (start_cmd,)) subprocess.check_call(start_cmd)