add a flag to report start-up success before finishing mounts

Summary:
Add a flag to tell edenfs to report successful start-up as soon as the thrift
server is running, without waiting for all mount points to finish being
remounted.

In the future I plan to have edenfs automatically perform an fsck scan of the
overlay for checkouts that were not shut down cleanly.  This may cause the
remount to take a significant amount of extra start-up time in some cases.
(This is already true today in some cases even with the simpler scan we do to
re-compute the max inode number.)

I think we will probably want to have systemd invoke edenfs with this option,
so that we do not time out during system start up if some mount points need to
be rescanned.

Reviewed By: strager

Differential Revision: D13522040

fbshipit-source-id: 6f183770c25efee34c4805c9bad42a9cce51039e
This commit is contained in:
Adam Simpkins 2019-03-06 20:30:45 -08:00 committed by Facebook Github Bot
parent ce1c8019a0
commit b47184adc4
7 changed files with 239 additions and 103 deletions

View File

@ -24,6 +24,11 @@ DEFINE_bool(
enable_fault_injection,
false,
"Enable the fault injection framework.");
DEFINE_bool(
fault_injection_block_mounts,
false,
"Block mount attempts via the fault injection framework. "
"Requires --enable_fault_injection.");
namespace facebook {
namespace eden {
@ -56,7 +61,19 @@ ServerState::ServerState(
kUserIgnoreMinPollSeconds}},
systemIgnoreFileMonitor_{CachedParsedFileMonitor<GitIgnoreFileParser>{
edenConfig->getSystemIgnoreFile(),
kSystemIgnoreMinPollSeconds}} {}
kSystemIgnoreMinPollSeconds}} {
// It would be nice if we eventually built a more generic mechanism for
// defining faults to be configured on start up. (e.g., loading this from the
// EdenConfig).
//
// For now, blocking mounts is the main thing we want to be able to control on
// startup (since mounting occurs automatically during startup). Add a
// one-off command line flag to control this for now, until we build a more
// generic mechanism.
if (FLAGS_fault_injection_block_mounts) {
faultInjector_->injectBlock("mount", ".*");
}
}
ServerState::~ServerState() {}

View File

@ -372,8 +372,10 @@ void EdenServer::scheduleInodeUnload(std::chrono::milliseconds timeout) {
}
#endif // !EDEN_WIN
Future<Unit> EdenServer::prepare(std::shared_ptr<StartupLogger> logger) {
return prepareImpl(std::move(logger))
Future<Unit> EdenServer::prepare(
std::shared_ptr<StartupLogger> logger,
bool waitForMountCompletion) {
return prepareImpl(std::move(logger), waitForMountCompletion)
.ensure(
// Mark the server state as RUNNING once we finish setting up the
// mount points. Even if an error occurs we still transition to the
@ -382,7 +384,9 @@ Future<Unit> EdenServer::prepare(std::shared_ptr<StartupLogger> logger) {
[this] { runningState_.wlock()->state = RunState::RUNNING; });
}
Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
Future<Unit> EdenServer::prepareImpl(
std::shared_ptr<StartupLogger> logger,
bool waitForMountCompletion) {
bool doingTakeover = false;
if (!acquireEdenLock()) {
// Another edenfs process is already running.
@ -429,8 +433,8 @@ Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
// receive its lock, thrift socket, and mount points now.
// This will shut down the old process.
const auto takeoverPath = edenDir_ + PathComponentPiece{kTakeoverSocketName};
TakeoverData takeoverData;
#endif
TakeoverData takeoverData;
if (doingTakeover) {
#ifndef EDEN_WIN
logger->log(
@ -494,96 +498,120 @@ Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
takeoverServer_->start();
#endif // !EDEN_WIN
// Trigger remounting of existing mount points
// If doingTakeover is true, use the mounts received in TakeoverData
std::vector<Future<Unit>> mountFutures;
if (doingTakeover) {
#ifndef EDEN_WIN
for (auto& info : takeoverData.mountPoints) {
const auto stateDirectory = info.stateDirectory;
auto mountFuture =
makeFutureWith([&] {
auto initialConfig = ClientConfig::loadFromClientDirectory(
AbsolutePathPiece{info.mountPath},
AbsolutePathPiece{info.stateDirectory});
return mount(std::move(initialConfig), std::move(info));
})
.thenTry([logger, mountPath = info.mountPath](
folly::Try<std::shared_ptr<EdenMount>>&& result) {
if (result.hasValue()) {
logger->log("Successfully took over mount ", mountPath);
return makeFuture();
} else {
logger->warn(
"Failed to perform takeover for ",
mountPath,
": ",
result.exception().what());
return makeFuture<Unit>(std::move(result).exception());
}
});
mountFutures.push_back(std::move(mountFuture));
}
mountFutures =
prepareMountsTakeover(logger, std::move(takeoverData.mountPoints));
#else
NOT_IMPLEMENTED();
#endif
#endif // !EDEN_WIN
} else {
folly::dynamic dirs = folly::dynamic::object();
try {
dirs = ClientConfig::loadClientDirectoryMap(edenDir_);
} catch (const std::exception& ex) {
logger->warn(
"Could not parse config.json file: ",
ex.what(),
"\nSkipping remount step.");
return std::move(thriftRunningFuture)
.thenValue(
[ew = folly::exception_wrapper(std::current_exception(), ex)](
auto&&) { return makeFuture<Unit>(ew); });
}
if (dirs.empty()) {
logger->log("No mount points currently configured.");
return thriftRunningFuture;
}
logger->log("Remounting ", dirs.size(), " mount points...");
for (const auto& client : dirs.items()) {
auto mountFuture =
makeFutureWith([&] {
MountInfo mountInfo;
mountInfo.mountPoint = client.first.c_str();
auto edenClientPath = edenDir_ + PathComponent("clients") +
PathComponent(client.second.c_str());
mountInfo.edenClientPath = edenClientPath.stringPiece().str();
auto initialConfig = ClientConfig::loadFromClientDirectory(
AbsolutePathPiece{mountInfo.mountPoint},
AbsolutePathPiece{mountInfo.edenClientPath});
return mount(std::move(initialConfig));
})
.thenTry([logger, mountPath = client.first.asString()](
folly::Try<std::shared_ptr<EdenMount>>&& result) {
if (result.hasValue()) {
logger->log("Successfully remounted ", mountPath);
return makeFuture();
} else {
logger->warn(
"Failed to remount ",
mountPath,
": ",
result.exception().what());
return makeFuture<Unit>(std::move(result).exception());
}
});
mountFutures.push_back(std::move(mountFuture));
}
mountFutures = prepareMounts(logger);
}
// Return a future that will complete only when all mount points have started
// and the thrift server is also running.
return folly::collectAll(mountFutures)
.thenValue([thriftFuture = std::move(thriftRunningFuture)](
auto&&) mutable { return std::move(thriftFuture); });
if (waitForMountCompletion) {
// Return a future that will complete only when all mount points have
// started and the thrift server is also running.
mountFutures.emplace_back(std::move(thriftRunningFuture));
return folly::collectAll(mountFutures).unit();
} else {
// Don't wait for the mount futures.
// Only return the thrift future.
return thriftRunningFuture;
}
}
std::vector<Future<Unit>> EdenServer::prepareMountsTakeover(
shared_ptr<StartupLogger> logger,
std::vector<TakeoverData::MountInfo>&& takeoverMounts) {
// Trigger remounting of existing mount points
// If doingTakeover is true, use the mounts received in TakeoverData
std::vector<Future<Unit>> mountFutures;
#ifndef EDEN_WIN
for (auto& info : takeoverMounts) {
const auto stateDirectory = info.stateDirectory;
auto mountFuture =
makeFutureWith([&] {
auto initialConfig = ClientConfig::loadFromClientDirectory(
AbsolutePathPiece{info.mountPath},
AbsolutePathPiece{info.stateDirectory});
return mount(std::move(initialConfig), std::move(info));
})
.thenTry([logger, mountPath = info.mountPath](
folly::Try<std::shared_ptr<EdenMount>>&& result) {
if (result.hasValue()) {
logger->log("Successfully took over mount ", mountPath);
return makeFuture();
} else {
logger->warn(
"Failed to perform takeover for ",
mountPath,
": ",
result.exception().what());
return makeFuture<Unit>(std::move(result).exception());
}
});
mountFutures.push_back(std::move(mountFuture));
}
#else
NOT_IMPLEMENTED();
#endif
return mountFutures;
}
std::vector<Future<Unit>> EdenServer::prepareMounts(
shared_ptr<StartupLogger> logger) {
std::vector<Future<Unit>> mountFutures;
folly::dynamic dirs = folly::dynamic::object();
try {
dirs = ClientConfig::loadClientDirectoryMap(edenDir_);
} catch (const std::exception& ex) {
logger->warn(
"Could not parse config.json file: ",
ex.what(),
"\nSkipping remount step.");
mountFutures.emplace_back(
folly::exception_wrapper(std::current_exception(), ex));
return mountFutures;
}
if (dirs.empty()) {
logger->log("No mount points currently configured.");
return mountFutures;
}
logger->log("Remounting ", dirs.size(), " mount points...");
for (const auto& client : dirs.items()) {
auto mountFuture =
makeFutureWith([&] {
MountInfo mountInfo;
mountInfo.mountPoint = client.first.c_str();
auto edenClientPath = edenDir_ + PathComponent("clients") +
PathComponent(client.second.c_str());
mountInfo.edenClientPath = edenClientPath.stringPiece().str();
auto initialConfig = ClientConfig::loadFromClientDirectory(
AbsolutePathPiece{mountInfo.mountPoint},
AbsolutePathPiece{mountInfo.edenClientPath});
return mount(std::move(initialConfig));
})
.thenTry([logger, mountPath = client.first.asString()](
folly::Try<std::shared_ptr<EdenMount>>&& result) {
if (result.hasValue()) {
logger->log("Successfully remounted ", mountPath);
return makeFuture();
} else {
logger->warn(
"Failed to remount ",
mountPath,
": ",
result.exception().what());
return makeFuture<Unit>(std::move(result).exception());
}
});
mountFutures.push_back(std::move(mountFuture));
}
return mountFutures;
}
void EdenServer::run(void (*runThriftServer)(const EdenServer&)) {

View File

@ -115,15 +115,19 @@ class EdenServer : private TakeoverHandler {
* however a few steps complete asynchronously. The status of the
* asynchronous preparation steps is tracked in the returned Future object.
*
* The returned future does not complete until all configured mount points
* have been remounted and until the thrift server is accepting connections.
* The returned future will complete once the EdenServer is running
* successfully and accepting thrift connections.
*
* If waitForMountCompletion is true the returned future will also not
* become ready until all configured mount points have been remounted.
* If an error occurs remounting some mount points the Future will complete
* with an exception, but the server will still continue to run. Everything
* will be running normally except for the mount points that failed to be
* remounted.
*/
FOLLY_NODISCARD folly::Future<folly::Unit> prepare(
std::shared_ptr<StartupLogger> logger);
std::shared_ptr<StartupLogger> logger,
bool waitForMountCompletion = true);
/**
* Run the EdenServer.
@ -342,6 +346,12 @@ class EdenServer : private TakeoverHandler {
* prepareImpl() contains the bulk of the implementation of prepare()
*/
FOLLY_NODISCARD folly::Future<folly::Unit> prepareImpl(
std::shared_ptr<StartupLogger> logger,
bool waitForMountCompletion);
FOLLY_NODISCARD std::vector<folly::Future<folly::Unit>> prepareMountsTakeover(
std::shared_ptr<StartupLogger> logger,
std::vector<TakeoverData::MountInfo>&& takeoverMounts);
FOLLY_NODISCARD std::vector<folly::Future<folly::Unit>> prepareMounts(
std::shared_ptr<StartupLogger> logger);
// Called when a mount has been unmounted and has stopped.

View File

@ -45,6 +45,10 @@ class StartupLogger {
/**
* Log an informational message.
*
* Note that it is valid to call log() even after success() has been called.
* This can occur if edenfs has been asked to report successful startup
* without waiting for all mount points to be remounted.
*/
template <typename... Args>
void log(Args&&... args) {

View File

@ -44,6 +44,11 @@ DEFINE_string(
logPath,
"",
"If set, redirects stdout and stderr to the log file given.");
DEFINE_bool(
noWaitForMounts,
false,
"Report successful startup without waiting for all configured mounts "
"to be remounted.");
constexpr folly::StringPiece kDefaultUserConfigFile{".edenrc"};
constexpr folly::StringPiece kEdenfsConfigFile{"edenfs.rc"};
@ -250,7 +255,7 @@ int main(int argc, char** argv) {
server.emplace(
std::move(identity), std::move(privHelper), std::move(edenConfig));
prepareFuture = server->prepare(startupLogger);
prepareFuture = server->prepare(startupLogger, !FLAGS_noWaitForMounts);
} catch (const std::exception& ex) {
startupLogger->exitUnsuccessfully(
EX_SOFTWARE, "error starting edenfs: ", folly::exceptionStr(ex));

View File

@ -151,7 +151,12 @@ class EdenFS(object):
cmd.extend(args)
return cmd
def start(self, timeout: float = 60, takeover_from: Optional[int] = None) -> None:
def start(
self,
timeout: float = 60,
takeover_from: Optional[int] = None,
extra_args: Optional[List[str]] = None,
) -> None:
"""
Run "eden daemon" to start the eden daemon.
"""
@ -165,7 +170,7 @@ class EdenFS(object):
timeout += 90
takeover = takeover_from is not None
self._spawn(gdb=use_gdb, takeover=takeover)
self.spawn_nowait(gdb=use_gdb, takeover=takeover, extra_args=extra_args)
assert self._process is not None
util.wait_for_daemon_healthy(
@ -214,7 +219,15 @@ class EdenFS(object):
return extra_daemon_args
def _spawn(self, gdb: bool = False, takeover: bool = False) -> None:
def spawn_nowait(
self,
gdb: bool = False,
takeover: bool = False,
extra_args: Optional[List[str]] = None,
) -> None:
"""
Start edenfs but do not wait for it to become healthy.
"""
if self._process is not None:
raise Exception("cannot start an already-running eden client")
@ -226,6 +239,8 @@ class EdenFS(object):
)
extra_daemon_args = self.get_extra_daemon_args()
if extra_args:
extra_daemon_args.extend(extra_args)
if takeover:
args.append("--takeover")

View File

@ -15,7 +15,10 @@ from pathlib import Path
from typing import Optional, Set
from eden.cli.util import poll_until
from eden.thrift import EdenClient, EdenNotRunningError
from facebook.eden.ttypes import FaultDefinition, MountState, UnblockFaultArg
from fb303.ttypes import fb_status
from thrift.Thrift import TException
from .lib import testcase
@ -142,17 +145,71 @@ class MountTest(testcase.EdenRepoTest):
# Unblock mounting and wait for the mount to transition to running
client.unblockFault(UnblockFaultArg(keyClass="mount", keyValueRegex=".*"))
def mount_running() -> Optional[bool]:
if (
self.eden.get_mount_state(Path(self.mount), client)
== MountState.RUNNING
):
return True
self._wait_for_mount_running(client)
self.assertEqual({self.mount: "RUNNING"}, self.eden.list_cmd_simple())
def test_start_blocked_mount_init(self) -> None:
self.eden.shutdown()
self.eden.spawn_nowait(
extra_args=["--enable_fault_injection", "--fault_injection_block_mounts"]
)
# Wait for eden to report the mount point in the listMounts() output
def is_initializing() -> Optional[bool]:
try:
with self.eden.get_thrift_client() as client:
if self.eden.get_mount_state(Path(self.mount), client) is not None:
return True
assert self.eden._process is not None
if self.eden._process.poll():
self.fail("eden exited before becoming healthy")
return None
except (EdenNotRunningError, TException):
return None
poll_until(mount_running, timeout=30)
self.assertEqual({self.mount: "RUNNING"}, self.eden.list_cmd_simple())
self.assertEqual(0, mount_proc.wait())
poll_until(is_initializing, timeout=60)
with self.eden.get_thrift_client() as client:
# Since we blocked mount initialization the mount should still
# report as INITIALIZING, and edenfs should report itself STARTING
self.assertEqual({self.mount: "INITIALIZING"}, self.eden.list_cmd_simple())
self.assertEqual(fb_status.STARTING, client.getStatus())
# Unblock mounting and wait for the mount to transition to running
client.unblockFault(UnblockFaultArg(keyClass="mount", keyValueRegex=".*"))
self._wait_for_mount_running(client)
self.assertEqual(fb_status.ALIVE, client.getStatus())
self.assertEqual({self.mount: "RUNNING"}, self.eden.list_cmd_simple())
def test_start_no_mount_wait(self) -> None:
self.eden.shutdown()
self.eden.start(
extra_args=[
"--noWaitForMounts",
"--enable_fault_injection",
"--fault_injection_block_mounts",
]
)
self.assertEqual({self.mount: "INITIALIZING"}, self.eden.list_cmd_simple())
# Unblock mounting and wait for the mount to transition to running
with self.eden.get_thrift_client() as client:
self.assertEqual(fb_status.ALIVE, client.getStatus())
client.unblockFault(UnblockFaultArg(keyClass="mount", keyValueRegex=".*"))
self._wait_for_mount_running(client)
self.assertEqual({self.mount: "RUNNING"}, self.eden.list_cmd_simple())
def _wait_for_mount_running(self, client: EdenClient) -> None:
def mount_running() -> Optional[bool]:
if (
self.eden.get_mount_state(Path(self.mount), client)
== MountState.RUNNING
):
return True
return None
poll_until(mount_running, timeout=60)
def test_remount_creates_bind_mounts_if_needed(self) -> None:
# Add a repo definition to the config that includes some bind mounts.