improve behavior of "eden restart"

Summary:
Update `eden restart` to explain that the restart will be disruptive when
performing a full restart, and prompt the user for confirmation.  This prompt
can be overridden with the `--force` option.  Also print more messaging after
the restart completes telling users what to do if they see "Transport endpoint
not connected" errors from programs still using the old mount points.

This also updates `eden restart` to more carefully handle cases where there is
an existing edenfs daemon running but it is not fully healthy.

Reviewed By: wez

Differential Revision: D8731004

fbshipit-source-id: 05762187b47057b2930d0a6b71b0a6fdbd4aa7e5
This commit is contained in:
Adam Simpkins 2018-07-11 19:01:03 -07:00 committed by Facebook Github Bot
parent 891a889bb5
commit 3d27bdca1b
5 changed files with 462 additions and 74 deletions

View File

@ -23,6 +23,7 @@ import eden.thrift
from eden.thrift import EdenNotRunningError
from facebook.eden import EdenService
from facebook.eden.ttypes import GlobParams
from fb303.ttypes import fb_status
from . import (
buck,
@ -730,6 +731,7 @@ def stop_aux_processes(client: eden.thrift.EdenClient) -> None:
RESTART_MODE_FULL = "full"
RESTART_MODE_GRACEFUL = "graceful"
RESTART_MODE_FORCE = "force"
@subcmd("restart", "Restart the edenfs daemon")
@ -755,66 +757,159 @@ class RestartCmd(Subcmd):
"disruption to clients. Open file handles will continue to work "
"across the restart.",
)
mode_group.add_argument(
"--force",
action="store_const",
const=RESTART_MODE_FORCE,
dest="restart_type",
help="Force a full restart, even if the existing edenfs daemon is "
"still in the middle of starting or stopping.",
)
parser.add_argument(
"--daemon-binary", help="Path to the binary for the Eden daemon."
)
parser.add_argument(
"--shutdown-timeout",
type=float,
default=30,
default=None,
help="How long to wait for the old edenfs process to exit when "
"performing a full restart.",
)
def run(self, args: argparse.Namespace) -> int:
self.args = args
if self.args.restart_type is None:
if args.restart_type is None:
# Default to a full restart for now
self.args.restart_type = RESTART_MODE_FULL
args.restart_type = RESTART_MODE_FULL
self.config = create_config(args)
stopping = False
pid = None
try:
with self.config.get_thrift_client() as client:
pid = client.getPid()
if self.args.restart_type == RESTART_MODE_FULL:
stop_aux_processes(client)
# Ask the old edenfs daemon to shutdown
self.msg("Stopping the existing edenfs daemon (pid {})...", pid)
client.initiateShutdown(
f"`eden restart` requested by pid={os.getpid()} "
f"uid={os.getuid()}"
)
stopping = True
except EdenNotRunningError:
pass
config = create_config(self.args)
if stopping:
assert isinstance(pid, int)
daemon.wait_for_shutdown(
self.config, pid, timeout=self.args.shutdown_timeout
)
self._start()
elif pid is None:
self.msg("edenfs is not currently running.")
self._start()
health = config.check_health()
if health.is_healthy():
assert health.pid is not None
if self.args.restart_type == RESTART_MODE_GRACEFUL:
return self._graceful_restart(config)
else:
return self._full_restart(config, health.pid)
elif health.pid is None:
# The daemon is not running
return self._start(config)
else:
self._graceful_start()
if health.status == fb_status.STARTING:
print(
f"The current edenfs daemon (pid {health.pid}) is still starting."
)
# Give the daemon a little extra time to hopefully finish starting
# before we time out and kill it.
stop_timeout = 30
elif health.status == fb_status.STOPPING:
print(
f"The current edenfs daemon (pid {health.pid}) is in the middle "
"of stopping."
)
# Use a reduced stopping timeout. If the user is using --force
# then the daemon is probably stuck or something, and we'll likely need
# to kill it anyway.
stop_timeout = 5
else:
# The only other status value we generally expect to receive here is
# fb_status.STOPPED. This is returned if we found an existing edenfs
# process but it is not responding to thrift calls.
print(
f"Found an existing edenfs daemon (pid {health.pid} that does not "
"seem to be responding to thrift calls."
)
# Don't attempt to ask the daemon to stop at all in this case;
# just kill it.
stop_timeout = 0
if self.args.restart_type != RESTART_MODE_FORCE:
print(f"Use --force if you want to forcibly restart the current daemon")
return 1
return self._force_restart(config, health.pid, stop_timeout)
def _graceful_restart(self, config: config_mod.Config) -> int:
print("Performing a graceful restart...")
daemon.exec_daemon(config, daemon_binary=self.args.daemon_binary, takeover=True)
return 1 # never reached
def _start(self, config: config_mod.Config) -> int:
print("Eden is not currently running. Starting it...")
daemon.exec_daemon(config, daemon_binary=self.args.daemon_binary)
return 1 # never reached
def _full_restart(self, config: config_mod.Config, old_pid: int) -> int:
print(
"""\
About to perform a full restart of Eden.
Note: this will temporarily disrupt access to your Eden-managed repositories.
Any programs using files or directories inside the Eden mounts will need to
re-open these files after Eden is restarted.
"""
)
if self.args.restart_type != RESTART_MODE_FORCE and sys.stdin.isatty():
if not prompt_confirmation("Proceed?"):
print("Not confirmed.")
return 1
self._do_stop(config, old_pid, timeout=15)
return self._finish_restart(config)
def _force_restart(
self, config: config_mod.Config, old_pid: int, stop_timeout: int
) -> int:
print("Forcing a full restart...")
if stop_timeout <= 0:
print("Sending SIGTERM...")
os.kill(old_pid, signal.SIGTERM)
self._wait_for_stop(config, old_pid, timeout=5)
else:
self._do_stop(config, old_pid, stop_timeout)
return self._finish_restart(config)
def _wait_for_stop(self, config: config_mod.Config, pid: int, timeout: int) -> None:
# If --shutdown-timeout was specified on the command line that always takes
# precedence over the default timeout passed in by our caller.
if self.args.shutdown_timeout is not None:
timeout = self.args.shutdown_timeout
daemon.wait_for_shutdown(config, pid, timeout=timeout)
def _do_stop(self, config: config_mod.Config, pid: int, timeout: int) -> None:
with config.get_thrift_client() as client:
try:
stop_aux_processes(client)
except Exception as ex:
pass
try:
client.initiateShutdown(
f"`eden restart --force` requested by pid={os.getpid()} "
f"uid={os.getuid()}"
)
except Exception as ex:
print("Sending SIGTERM...")
os.kill(pid, signal.SIGTERM)
self._wait_for_stop(config, pid, timeout)
def _finish_restart(self, config: config_mod.Config) -> int:
exit_code = daemon.start_daemon(config, daemon_binary=self.args.daemon_binary)
if exit_code != 0:
print("Failed to start edenfs!", file=sys.stderr)
return exit_code
print(
"""\
Successfully restarted edenfs.
Note: any programs running inside of an Eden-managed directory will need to cd
out of and back into the repository to pick up the new working directory state.
If you see "Transport endpoint not connected" errors from any program this
means it is still attempting to use the old mount point from the previous Eden
process."""
)
return 0
def msg(self, msg: str, *args: Any, **kwargs: Any) -> None:
if args or kwargs:
msg = msg.format(*args, **kwargs)
print(msg)
def _start(self) -> None:
self.msg("Starting edenfs...")
daemon.exec_daemon(self.config)
def _graceful_start(self) -> None:
self.msg("Performing a graceful restart...")
daemon.exec_daemon(self.config, takeover=True)
@subcmd("rage", "Gather diagnostic information about eden")
class RageCmd(Subcmd):

View File

@ -122,7 +122,7 @@ def _check_health_using_lockfile(config_dir: str) -> HealthStatus:
comm = stdout.rstrip().decode("utf8")
# Note that the command may be just "edenfs" rather than a path, but it
# works out fine either way.
if os.path.basename(comm) == "edenfs":
if os.path.basename(comm) in ("edenfs", "fake_edenfs"):
return HealthStatus(
fb_status.STOPPED,
int(pid),

View File

@ -10,6 +10,7 @@
#include <folly/init/Init.h>
#include <folly/io/async/AsyncSignalHandler.h>
#include <folly/logging/Init.h>
#include <folly/logging/xlog.h>
#include <gflags/gflags.h>
#include <signal.h>
#include <thrift/lib/cpp2/server/ThriftServer.h>
@ -24,10 +25,13 @@ using namespace facebook::eden;
using apache::thrift::ThriftServer;
using facebook::fb303::cpp2::fb_status;
using folly::EventBase;
using folly::StringPiece;
using std::make_shared;
using std::string;
DEFINE_bool(allowRoot, false, "Allow running eden directly as root");
DEFINE_bool(foreground, false, "Run edenfs in the foreground");
DEFINE_bool(ignoreStop, false, "Ignore attempts to stop edenfs");
DEFINE_string(edenDir, "", "The path to the .eden directory");
DEFINE_string(
etcEdenDir,
@ -39,15 +43,72 @@ DEFINE_string(
"",
"If set, redirects stdout and stderr to the log file given.");
FOLLY_INIT_LOGGING_CONFIG("eden=DBG2");
FOLLY_INIT_LOGGING_CONFIG(".=INFO,eden=DBG2");
namespace {
class FakeEdenServiceHandler;
class FakeEdenServer {
public:
FakeEdenServer() {}
void run(folly::SocketAddress thriftAddress, StartupLogger& startupLogger);
void stop(StringPiece reason) {
if (!honorStop_) {
XLOG(INFO) << "ignoring stop attempt: " << reason;
return;
}
XLOG(INFO) << "stopping: " << reason;
eventBase_->terminateLoopSoon();
}
void setHonorStop(bool honorStop) {
honorStop_ = honorStop;
}
private:
EventBase* eventBase_{nullptr};
ThriftServer server_;
std::shared_ptr<FakeEdenServiceHandler> handler_;
bool honorStop_{true};
};
class FakeEdenServiceHandler : virtual public StreamingEdenServiceSvIf {
public:
FakeEdenServiceHandler() {}
explicit FakeEdenServiceHandler(FakeEdenServer* server) : server_{server} {}
fb_status getStatus() override {
return fb_status::ALIVE;
return status_;
}
void setOption(std::unique_ptr<string> name, std::unique_ptr<string> value)
override {
auto badOption = [&]() {
auto errMsg = folly::to<string>(
"invalid value for ", *name, " setting: \"", *value, "\"");
XLOG(ERR) << errMsg;
throw std::invalid_argument(errMsg);
};
if (*name == "honor_stop") {
auto boolValue = folly::tryTo<bool>(*value);
if (boolValue.hasError()) {
badOption();
}
server_->setHonorStop(boolValue.value());
} else if (*name == "status") {
if (*value == "starting") {
status_ = fb_status::STARTING;
} else if (*value == "alive") {
status_ = fb_status::ALIVE;
} else if (*value == "stopping") {
status_ = fb_status::STOPPING;
} else {
badOption();
}
}
}
int64_t getPid() override {
@ -59,39 +120,66 @@ class FakeEdenServiceHandler : virtual public StreamingEdenServiceSvIf {
}
void shutdown() override {
printf("received shutdown() thrift request\n");
server_->stop("received shutdown() thrift request");
}
void initiateShutdown(std::unique_ptr<std::string> reason) override {
printf(
"received initiateShutdown() thrift requested: %s\n", reason->c_str());
void initiateShutdown(std::unique_ptr<string> reason) override {
server_->stop(folly::to<string>(
"received initiateShutdown() thrift requested: ", reason->c_str()));
}
private:
FakeEdenServer* server_{nullptr};
fb_status status_{fb_status::ALIVE};
};
class SignalHandler : public folly::AsyncSignalHandler {
public:
explicit SignalHandler(EventBase* eventBase) : AsyncSignalHandler(eventBase) {
SignalHandler(EventBase* eventBase, FakeEdenServer* server)
: AsyncSignalHandler(eventBase), server_{server} {
registerSignalHandler(SIGINT);
registerSignalHandler(SIGTERM);
}
void signalReceived(int sig) noexcept override {
// We just print a message when we receive a signal,
// but ignore it otherwise
switch (sig) {
case SIGINT:
printf("received SIGINT\n");
server_->stop("received SIGINT");
break;
case SIGTERM:
printf("received SIGTERM\n");
server_->stop("received SIGTERM");
break;
default:
printf("received signal %d\n", sig);
XLOG(INFO) << "received signal " << sig;
break;
}
}
private:
FakeEdenServer* server_{nullptr};
};
void FakeEdenServer::run(
folly::SocketAddress thriftAddress,
StartupLogger& startupLogger) {
eventBase_ = folly::EventBaseManager::get()->getEventBase();
// Create the ThriftServer object
auto handler = make_shared<FakeEdenServiceHandler>(this);
server_.setInterface(handler);
server_.setAddress(thriftAddress);
// Set up a signal handler to ignore SIGINT and SIGTERM
// This lets our integration tests exercise the case where edenfs does not
// shut down on its own.
SignalHandler signalHandler(eventBase_, this);
// Run the thrift server
server_.setup();
startupLogger.success();
eventBase_->loopForever();
}
bool acquireLock(AbsolutePathPiece edenDir) {
const auto lockPath = edenDir + "lock"_pc;
auto lockFile = folly::File(lockPath.value(), O_WRONLY | O_CREAT);
@ -108,6 +196,7 @@ bool acquireLock(AbsolutePathPiece edenDir) {
lockFile.release();
return true;
}
} // namespace
int main(int argc, char** argv) {
@ -132,6 +221,8 @@ int main(int argc, char** argv) {
startupLogger.exitUnsuccessfully(1, "Failed to acquire lock file");
}
startupLogger.log("Starting fake edenfs daemon");
// Get the path to the thrift socket.
auto thriftSocketPath = edenDir + "socket"_pc;
folly::SocketAddress thriftAddress;
@ -149,21 +240,10 @@ int main(int argc, char** argv) {
folly::errnoStr(errnum));
}
// Create the ThriftServer object
auto handler = make_shared<FakeEdenServiceHandler>();
ThriftServer server;
server.setInterface(handler);
server.setAddress(thriftAddress);
// Set up a signal handler to ignore SIGINT and SIGTERM
// This lets our integration tests exercise the case where edenfs does not
// shut down on its own.
SignalHandler signalHandler(server.getEventBaseManager()->getEventBase());
// Run the thrift server
server.setup();
startupLogger.success();
folly::EventBaseManager::get()->getEventBase()->loopForever();
FakeEdenServer server;
if (FLAGS_ignoreStop) {
server.setHonorStop(false);
}
server.run(thriftAddress, startupLogger);
return 0;
}

View File

@ -0,0 +1,211 @@
#!/usr/bin/env python3
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
import os
import shutil
import subprocess
import sys
import tempfile
import unittest
from typing import Any
import eden.thrift
import eden.thrift.client
import pexpect
from .lib.find_executables import FindExe
class RestartTest(unittest.TestCase):
def setUp(self) -> None:
def cleanup_tmp_dir() -> None:
shutil.rmtree(self.tmp_dir, ignore_errors=True)
self.tmp_dir = tempfile.mkdtemp(prefix="eden_test.")
self.addCleanup(cleanup_tmp_dir)
def ensure_stopped() -> None:
stop_cmd = [FindExe.EDEN_CLI, "--config-dir", self.tmp_dir, "stop"]
subprocess.call(stop_cmd)
self.addCleanup(ensure_stopped)
def _spawn_restart(self, *args: str, **kwargs: Any) -> pexpect.spawn:
restart_cmd = [
FindExe.EDEN_CLI,
"--config-dir",
self.tmp_dir,
"restart",
"--daemon-binary",
FindExe.FAKE_EDENFS,
]
restart_cmd.extend(args)
kwargs.setdefault("logfile", sys.stdout.buffer)
kwargs.setdefault("timeout", 5)
print("Retarting eden: %r" % (restart_cmd,))
return pexpect.spawn(restart_cmd[0], restart_cmd[1:], **kwargs)
def _start_fake_edenfs(self) -> int:
# Run "eden restart". It should start it without prompting since edenfs is not
# already running.
p = self._spawn_restart()
p.expect_exact("Eden is not currently running. Starting it...")
p.expect_exact("Starting fake edenfs daemon")
p.expect(r"Started edenfs \(pid ([0-9]+)\)")
pid = int(p.match.group(1))
p.wait()
self.assertEqual(p.exitstatus, 0)
return pid
def _get_thrift_client(self) -> eden.thrift.EdenClient:
return eden.thrift.create_thrift_client(self.tmp_dir)
def test_restart(self) -> None:
self._start_fake_edenfs()
# Run "eden restart"
# It should prompt since we are about to do a non-graceful restart.
p = self._spawn_restart()
p.expect_exact("About to perform a full restart of Eden")
p.expect_exact(
"Note: this will temporarily disrupt access to your Eden-managed "
"repositories"
)
p.expect_exact("Proceed? [y/N] ")
p.sendline("y")
p.expect_exact("Starting fake edenfs daemon")
p.expect(r"Started edenfs \(pid [0-9]+\)")
p.expect_exact("Successfully restarted edenfs.")
p.expect_exact(
"Note: any programs running inside of an Eden-managed "
"directory will need to cd"
)
p.wait()
self.assertEqual(p.exitstatus, 0)
def test_restart_sigkill(self) -> None:
self._start_fake_edenfs()
# Tell the fake edenfs binary to ignore attempts to stop it
with self._get_thrift_client() as client:
client.setOption("honor_stop", "false")
# Run "eden restart". It should have to kill eden with SIGKILL during the
# restart operation.
# Explicitly pass in a shorter than normal shutdown timeout just to reduce the
# amount of time required for the test.
p = self._spawn_restart("--shutdown-timeout=1")
p.expect_exact("About to perform a full restart of Eden")
p.expect_exact(
"Note: this will temporarily disrupt access to your Eden-managed "
"repositories"
)
p.expect_exact("Proceed? [y/N] ")
p.sendline("y")
p.expect(
r"sent shutdown request, but edenfs did not exit within "
r"[.0-9]+ seconds. Attempting SIGKILL."
)
p.expect_exact("Starting fake edenfs daemon")
p.expect(r"Started edenfs \(pid [0-9]+\)")
p.expect_exact("Successfully restarted edenfs.")
p.expect_exact(
"Note: any programs running inside of an Eden-managed "
"directory will need to cd"
)
p.wait()
self.assertEqual(p.exitstatus, 0)
def test_restart_force(self) -> None:
self._start_fake_edenfs()
# "eden restart --force" should not prompt if the user wants to proceed
p = self._spawn_restart("--force")
p.expect_exact("About to perform a full restart of Eden")
p.expect_exact(
"Note: this will temporarily disrupt access to your Eden-managed "
"repositories"
)
p.expect_exact("Starting fake edenfs daemon")
p.expect(r"Started edenfs \(pid [0-9]+\)")
p.expect_exact("Successfully restarted edenfs.")
p.expect_exact(
"Note: any programs running inside of an Eden-managed "
"directory will need to cd"
)
p.wait()
self.assertEqual(p.exitstatus, 0)
def test_restart_while_starting(self) -> None:
orig_pid = self._start_fake_edenfs()
# Tell the fake edenfs daemon to report its status as "starting"
with self._get_thrift_client() as client:
client.setOption("status", "starting")
# "eden restart" should not restart if edenfs is still starting
p = self._spawn_restart()
p.expect_exact(f"The current edenfs daemon (pid {orig_pid}) is still starting")
p.expect_exact("Use --force if you want to forcibly restart the current daemon")
p.wait()
self.assertEqual(p.exitstatus, 1)
# "eden restart --force" should force the restart anyway
p = self._spawn_restart("--force")
p.expect_exact(f"The current edenfs daemon (pid {orig_pid}) is still starting")
p.expect_exact("Forcing a full restart...")
p.expect_exact("Starting fake edenfs daemon")
p.expect(r"Started edenfs \(pid [0-9]+\)")
p.expect_exact("Successfully restarted edenfs.")
p.expect_exact(
"Note: any programs running inside of an Eden-managed "
"directory will need to cd"
)
p.wait()
self.assertEqual(p.exitstatus, 0)
def test_restart_unresponsive_thrift(self) -> None:
orig_pid = self._start_fake_edenfs()
# Rename the thrift socket so that "eden restart" will not be able to
# communicate with the existing daemon.
os.rename(
os.path.join(self.tmp_dir, eden.thrift.client.SOCKET_PATH),
os.path.join(self.tmp_dir, "old.socket"),
)
# "eden restart" should not restart if it cannot confirm the current health of
# edenfs.
p = self._spawn_restart()
p.expect_exact(
f"Found an existing edenfs daemon (pid {orig_pid} that does not "
"seem to be responding to thrift calls."
)
p.expect_exact("Use --force if you want to forcibly restart the current daemon")
p.wait()
self.assertEqual(p.exitstatus, 1)
# "eden restart --force" should force the restart anyway
p = self._spawn_restart("--force")
p.expect_exact(
f"Found an existing edenfs daemon (pid {orig_pid} that does not "
"seem to be responding to thrift calls."
)
p.expect_exact("Forcing a full restart...")
p.expect_exact("Starting fake edenfs daemon")
p.expect(r"Started edenfs \(pid [0-9]+\)")
p.expect_exact("Successfully restarted edenfs.")
p.expect_exact(
"Note: any programs running inside of an Eden-managed "
"directory will need to cd"
)
p.wait()
self.assertEqual(p.exitstatus, 0)

View File

@ -38,6 +38,8 @@ class StopTest(unittest.TestCase):
"start",
"--daemon-binary",
FindExe.FAKE_EDENFS,
"--",
"--ignoreStop",
]
print("Starting eden: %r" % (start_cmd,))
subprocess.check_call(start_cmd)