2016-05-12 23:43:17 +03:00
|
|
|
/*
|
2017-01-21 09:02:33 +03:00
|
|
|
* Copyright (c) 2016-present, Facebook, Inc.
|
2016-05-12 23:43:17 +03:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* This source code is licensed under the BSD-style license found in the
|
|
|
|
* LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
* of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
*
|
|
|
|
*/
|
2017-06-22 23:39:57 +03:00
|
|
|
#include "eden/fs/service/EdenServer.h"
|
2016-05-12 23:43:17 +03:00
|
|
|
|
2017-11-20 02:18:31 +03:00
|
|
|
#include <folly/Exception.h>
|
Write the PID to the lockfile and update `eden health` to use it.
Summary:
We have encountered cases where `eden health` reported
`"edenfs not healthy: edenfs not running"` even though the `edenfs` process is
still running. Because the existing implementation of `eden health` bases its
health check on the output of a `getStatus()` Thrift call, it will erroneously
report `"edenfs not running"` even if Eden is running but its Thrift server is
not running. This type of false negative could occur if `edenfs` has shut down
the Thrift server, but not the rest of the process (quite possibly, its
shutdown is blocked on calls to `umount2()`).
This is further problematic because `eden daemon` checks `eden health`
before attempting to start the daemon. If it gets a false negative, then
`eden daemon` will forge ahead, trying to launch a new instance of the daemon,
but it will fail with a nasty error like the following:
```
I1017 11:59:25.188414 3064499 main.cpp:81] Starting edenfs. UID=5256, GID=100, PID=3064499
terminate called after throwing an instance of 'std::runtime_error'
what(): another instance of Eden appears to be running for /home/mbolin/local/.eden
*** Aborted at 1508266765 (Unix time, try 'date -d 1508266765') ***
*** Signal 6 (SIGABRT) (0x1488002ec2b3) received by PID 3064499 (pthread TID 0x7fd0d3787d40) (linux TID 3064499) (maybe from PID 30644
99, UID 5256), stack trace: ***
@ 000000000290d3cd folly::symbolizer::(anonymous namespace)::signalHandler(int, siginfo_t*, void*)
@ 00007fd0d133cacf (unknown)
@ 00007fd0d093e7c8 __GI_raise
@ 00007fd0d0940590 __GI_abort
@ 00007fd0d1dfeecc __gnu_cxx::__verbose_terminate_handler()
@ 00007fd0d1dfcdc5 __cxxabiv1::__terminate(void (*)())
@ 00007fd0d1dfce10 std::terminate()
@ 00007fd0d1dfd090 __cxa_throw
@ 00000000015fe8ca facebook::eden::EdenServer::acquireEdenLock()
@ 000000000160f27b facebook::eden::EdenServer::prepare()
@ 00000000016107d5 facebook::eden::EdenServer::run()
@ 000000000042c4ee main
@ 00007fd0d0929857 __libc_start_main
@ 0000000000548ad8 _start
Aborted
```
By providing more accurate information to `eden daemon`, if the user tries to
run it while the daemon is already running, they will get a more polite error
like the following:
```
error: edenfs is already running (pid 274205)
```
This revision addresses this issue by writing the PID of `edenfs` in the
lockfile. It updated the implementation of `eden health` to use the PID in the
lockfile to assess the health of Eden if the call to `getStatus()` fails. It
does this by running:
```
ps -p PID -o comm=
```
and applying some heuristics on the output to assess whether the command
associated with that process is the `edenfs` command. If it is, then
`eden health` reports the status as `STOPPED` whereas previously it would report
it as `DEAD`.
Reviewed By: wez
Differential Revision: D6086473
fbshipit-source-id: 825421a6818b56ddd7deea257a92c070c2232bdd
2017-10-18 21:18:43 +03:00
|
|
|
#include <folly/FileUtil.h>
|
2016-05-12 23:43:17 +03:00
|
|
|
#include <folly/SocketAddress.h>
|
|
|
|
#include <folly/String.h>
|
2018-01-25 02:18:45 +03:00
|
|
|
#include <folly/chrono/Conv.h>
|
2017-06-27 05:27:34 +03:00
|
|
|
#include <folly/io/async/AsyncSignalHandler.h>
|
2018-05-01 07:20:51 +03:00
|
|
|
#include <folly/logging/xlog.h>
|
2018-06-26 22:05:27 +03:00
|
|
|
#include <folly/stop_watch.h>
|
2016-05-12 23:43:17 +03:00
|
|
|
#include <gflags/gflags.h>
|
2018-06-07 00:32:41 +03:00
|
|
|
#include <signal.h>
|
2017-10-18 21:18:36 +03:00
|
|
|
#include <thrift/lib/cpp/concurrency/ThreadManager.h>
|
2016-05-12 23:43:17 +03:00
|
|
|
#include <thrift/lib/cpp2/server/ThriftServer.h>
|
|
|
|
|
2018-04-10 22:11:21 +03:00
|
|
|
#include "common/stats/ServiceData.h"
|
2016-05-24 07:32:12 +03:00
|
|
|
#include "eden/fs/config/ClientConfig.h"
|
2018-01-10 09:01:00 +03:00
|
|
|
#include "eden/fs/fuse/DirHandle.h"
|
|
|
|
#include "eden/fs/fuse/FileHandle.h"
|
|
|
|
#include "eden/fs/fuse/FileHandleBase.h"
|
2018-01-10 09:00:50 +03:00
|
|
|
#include "eden/fs/fuse/FuseChannel.h"
|
2017-04-14 21:31:48 +03:00
|
|
|
#include "eden/fs/fuse/privhelper/PrivHelper.h"
|
2018-01-10 09:00:50 +03:00
|
|
|
#include "eden/fs/inodes/EdenDispatcher.h"
|
2016-05-20 20:33:42 +03:00
|
|
|
#include "eden/fs/inodes/EdenMount.h"
|
2017-08-25 22:41:41 +03:00
|
|
|
#include "eden/fs/inodes/InodeMap.h"
|
2017-08-01 06:49:35 +03:00
|
|
|
#include "eden/fs/inodes/TreeInode.h"
|
2017-12-12 23:23:57 +03:00
|
|
|
#include "eden/fs/service/EdenCPUThreadPool.h"
|
2017-06-22 23:39:57 +03:00
|
|
|
#include "eden/fs/service/EdenServiceHandler.h"
|
2018-06-26 22:05:27 +03:00
|
|
|
#include "eden/fs/service/StartupLogger.h"
|
2016-12-14 05:11:05 +03:00
|
|
|
#include "eden/fs/store/EmptyBackingStore.h"
|
2016-05-12 23:43:17 +03:00
|
|
|
#include "eden/fs/store/LocalStore.h"
|
2018-02-09 06:54:14 +03:00
|
|
|
#include "eden/fs/store/MemoryLocalStore.h"
|
2017-09-13 18:26:53 +03:00
|
|
|
#include "eden/fs/store/ObjectStore.h"
|
2018-02-07 22:45:41 +03:00
|
|
|
#include "eden/fs/store/RocksDbLocalStore.h"
|
2018-02-09 06:54:14 +03:00
|
|
|
#include "eden/fs/store/SqliteLocalStore.h"
|
2016-06-14 01:15:32 +03:00
|
|
|
#include "eden/fs/store/git/GitBackingStore.h"
|
|
|
|
#include "eden/fs/store/hg/HgBackingStore.h"
|
2017-11-20 22:34:37 +03:00
|
|
|
#include "eden/fs/takeover/TakeoverClient.h"
|
2017-11-20 02:18:29 +03:00
|
|
|
#include "eden/fs/takeover/TakeoverData.h"
|
|
|
|
#include "eden/fs/takeover/TakeoverServer.h"
|
2017-12-05 20:55:27 +03:00
|
|
|
#include "eden/fs/utils/Clock.h"
|
2018-06-20 18:56:26 +03:00
|
|
|
#include "eden/fs/utils/ProcUtil.h"
|
2018-09-10 23:42:11 +03:00
|
|
|
#include "eden/fs/utils/ProcessNameCache.h"
|
2016-05-12 23:43:17 +03:00
|
|
|
|
2018-01-03 03:25:03 +03:00
|
|
|
// Command-line flags controlling FUSE debugging, graceful-takeover behavior,
// local storage engine selection, thrift server tuning, and the background
// inode unloading schedule.

DEFINE_bool(
    debug,
    false,
    "run fuse in debug mode"); // TODO: remove; no longer needed
DEFINE_bool(
    takeover,
    false,
    "If another edenfs process is already running, "
    "attempt to gracefully takeover its mount points.");

DEFINE_string(
    local_storage_engine_unsafe,
    "rocksdb",
    "Select storage engine. rocksdb is the default. "
    "possible choices are (rocksdb|sqlite|memory). "
    "memory is currently very dangerous as you will "
    "lose state across restarts and graceful restarts! "
    "It is unsafe to change this between edenfs invocations!");

// Thrift server tuning knobs.
DEFINE_int32(
    thrift_num_workers,
    std::thread::hardware_concurrency(),
    "The number of thrift worker threads");
DEFINE_int32(
    thrift_max_requests,
    apache::thrift::concurrency::ThreadManager::DEFAULT_MAX_QUEUE_SIZE,
    "Maximum number of active thrift requests");
DEFINE_bool(thrift_enable_codel, false, "Enable Codel queuing timeout");
DEFINE_int32(thrift_min_compress_bytes, 0, "Minimum response compression size");

// Background inode unloading schedule.  An interval of 0 disables the
// periodic unload (see scheduleInodeUnload()/unloadInodes() below).
DEFINE_int64(
    unload_interval_minutes,
    0,
    "Frequency in minutes of background inode unloading");
DEFINE_int64(
    start_delay_minutes,
    10,
    "Initial delay before first background inode unload");
DEFINE_int64(
    unload_age_minutes,
    6 * 60,
    "Minimum age of the inodes to be unloaded in background");
|
2016-05-12 23:43:17 +03:00
|
|
|
|
|
|
|
// Using-declarations for the thrift, folly, and std names used throughout
// this file.
using apache::thrift::ThriftServer;
using facebook::eden::FuseChannelData;
using folly::File;
using folly::Future;
using folly::makeFuture;
using folly::makeFutureWith;
using folly::Optional;
using folly::StringPiece;
using folly::Unit;
using std::make_shared;
using std::shared_ptr;
using std::string;
using std::unique_ptr;
|
2016-05-12 23:43:17 +03:00
|
|
|
|
|
|
|
namespace {

using namespace facebook::eden;

// Well-known file names created inside the eden state directory:
// the daemon lock file, the thrift server's unix socket, and the socket
// used for graceful-takeover handoff.
constexpr StringPiece kLockFileName{"lock"};
constexpr StringPiece kThriftSocketName{"socket"};
constexpr StringPiece kTakeoverSocketName{"takeover"};
// Relative paths (under the state directory) for the two on-disk
// LocalStore backends.
constexpr StringPiece kRocksDBPath{"storage/rocks-db"};
constexpr StringPiece kSqlitePath{"storage/sqlite.db"};

} // namespace
|
2016-05-12 23:43:17 +03:00
|
|
|
|
|
|
|
namespace facebook {
|
|
|
|
namespace eden {
|
|
|
|
|
2017-06-27 05:27:34 +03:00
|
|
|
/**
 * Hooks into the thrift server lifecycle.
 *
 * Once the thrift server is about to start serving, this installs SIGINT and
 * SIGTERM handlers on the server's EventBase (so shutdown signals stop the
 * EdenServer) and fulfills a promise that callers can use to learn when the
 * thrift server is up.
 */
class EdenServer::ThriftServerEventHandler
    : public apache::thrift::server::TServerEventHandler,
      public folly::AsyncSignalHandler {
 public:
  explicit ThriftServerEventHandler(EdenServer* edenServer)
      : AsyncSignalHandler{nullptr}, edenServer_{edenServer} {}

  void preServe(const folly::SocketAddress* /*address*/) override {
    // preServe() will be called from the thrift server thread once when it is
    // about to start serving.
    //
    // Register for SIGINT and SIGTERM.  We do this in preServe() so we can use
    // the thrift server's EventBase to process the signal callbacks.
    auto eventBase = folly::EventBaseManager::get()->getEventBase();
    attachEventBase(eventBase);
    registerSignalHandler(SIGINT);
    registerSignalHandler(SIGTERM);
    runningPromise_.setValue();
  }

  void signalReceived(int sig) noexcept override {
    // Stop the server.
    // Unregister for this signal first, so that we will be terminated
    // immediately if the signal is sent again before we finish stopping.
    // This makes it easier to kill the daemon if graceful shutdown hangs or
    // takes longer than expected for some reason.  (For instance, if
    // unmounting the mount points hangs for some reason.)
    XLOG(INFO) << "stopping due to signal " << sig;
    unregisterSignalHandler(sig);
    edenServer_->stop();
  }

  /**
   * Return a Future that will be fulfilled once the thrift server is bound to
   * its socket and is ready to accept connections.
   */
  Future<Unit> getThriftRunningFuture() {
    return runningPromise_.getFuture();
  }

 private:
  // Non-owning back-pointer to the server this handler stops on signal.
  EdenServer* edenServer_{nullptr};
  // Fulfilled from preServe() once the thrift server is ready.
  folly::Promise<Unit> runningPromise_;
};
|
|
|
|
|
2016-07-26 20:15:43 +03:00
|
|
|
/**
 * Construct the EdenServer.
 *
 * Builds the shared ServerState from the supplied user identity, privileged
 * helper, and configuration, then caches the configuration values this class
 * reads frequently (state directory, user config path, client certificate,
 * and Mononoke settings) into member fields.
 */
EdenServer::EdenServer(
    UserInfo userInfo,
    std::unique_ptr<PrivHelper> privHelper,
    std::shared_ptr<const EdenConfig> edenConfig)
    : serverState_{make_shared<ServerState>(
          std::move(userInfo),
          std::move(privHelper),
          std::make_shared<EdenCPUThreadPool>(),
          std::make_shared<UnixClock>(),
          std::make_shared<ProcessNameCache>(),
          edenConfig)} {
  // Cache config values locally so later code does not need to go back
  // through the EdenConfig object.
  edenDir_ = edenConfig->getEdenDir();
  configPath_ = edenConfig->getUserConfigPath();
  clientCertificate_ = edenConfig->getClientCertificate();
  useMononoke_ = edenConfig->getUseMononoke();
  mononokeTierName_ = edenConfig->getMononokeTierName();
}
|
2016-05-12 23:43:17 +03:00
|
|
|
|
2017-12-06 04:43:07 +03:00
|
|
|
// Defined out-of-line rather than in the header — presumably so that member
// types that are only forward-declared in EdenServer.h are complete at the
// point of destruction (TODO: confirm against EdenServer.h).  `= default` is
// behaviorally identical to the previous empty body `{}` and is the idiomatic
// modern spelling (clang-tidy: modernize-use-equals-default).
EdenServer::~EdenServer() = default;
|
2017-03-21 22:57:07 +03:00
|
|
|
|
2018-04-05 05:40:22 +03:00
|
|
|
/**
 * Unmount every currently-mounted EdenMount and wait for their cleanup to
 * complete.
 *
 * For each mount this asks the privhelper to perform the kernel unmount, then
 * waits on the mount's unmountPromise (fulfilled when EdenMount shutdown
 * finishes).  The returned Future completes only after every unmount attempt
 * has finished; if any of them failed, it completes with that error.
 */
Future<Unit> EdenServer::unmountAll() {
  std::vector<Future<Unit>> futures;
  {
    // Hold the mountPoints_ lock only long enough to kick off the unmounts;
    // the futures are waited on after the lock is released.
    const auto mountPoints = mountPoints_.wlock();
    for (auto& entry : *mountPoints) {
      const auto& mountPath = entry.first;
      auto& info = entry.second;

      auto future =
          makeFutureWith([this, &mountPath] {
            return serverState_->getPrivHelper()->fuseUnmount(mountPath);
          })
              // After the kernel unmount, wait for EdenMount cleanup to
              // finish (signalled via the mount's unmountPromise).
              .then(
                  [unmountFuture = info.unmountPromise.getFuture()]() mutable {
                    return std::move(unmountFuture);
                  })
              // Log the failure here (with the mount path) and re-propagate
              // the error so the caller still sees it.
              .onError(
                  [path = entry.first.str()](folly::exception_wrapper&& ew) {
                    XLOG(ERR) << "Failed to perform unmount for \"" << path
                              << "\": " << folly::exceptionStr(ew);
                    return makeFuture<Unit>(ew);
                  });
      futures.push_back(std::move(future));
    }
  }

  // Use collectAll() rather than collect() to wait for all of the unmounts
  // to complete, and only check for errors once everything has finished.
  return folly::collectAllSemiFuture(futures).toUnsafeFuture().thenValue(
      [](std::vector<folly::Try<Unit>> results) {
        for (const auto& result : results) {
          result.throwIfFailed();
        }
      });
}
|
|
|
|
|
|
|
|
/**
 * Stop all mount points in preparation for a graceful takeover, collecting
 * the TakeoverData::MountInfo for each mount so its state (including the FUSE
 * device fd) can be handed to the new edenfs process.
 *
 * Mounts that fail to stop, or that were unmounted while we were stopping
 * them, are logged and skipped rather than failing the whole takeover.
 */
Future<TakeoverData> EdenServer::stopMountsForTakeover() {
  std::vector<Future<Optional<TakeoverData::MountInfo>>> futures;
  {
    // Hold the mountPoints_ lock only while initiating the stops.
    const auto mountPoints = mountPoints_.wlock();
    for (auto& entry : *mountPoints) {
      const auto& mountPath = entry.first;
      auto& info = entry.second;

      try {
        // Create the takeover promise and tell the FUSE channel to stop in
        // takeover mode; the promise is fulfilled with the mount's takeover
        // state once the channel has stopped.
        info.takeoverPromise.emplace();
        auto future = info.takeoverPromise->getFuture();
        info.edenMount->getFuseChannel()->takeoverStop();
        futures.emplace_back(std::move(future).thenValue(
            [self = this,
             edenMount = info.edenMount](TakeoverData::MountInfo takeover)
                -> Future<Optional<TakeoverData::MountInfo>> {
              // No FUSE fd means there is nothing to hand over for this
              // mount; report an empty Optional so it is skipped below.
              if (!takeover.fuseFD) {
                return folly::none;
              }
              // Tell the privhelper this mount is being shut down for
              // takeover, then yield the collected takeover state.
              return self->serverState_->getPrivHelper()
                  ->fuseTakeoverShutdown(edenMount->getPath().stringPiece())
                  .thenValue([takeover = std::move(takeover)](auto&&) mutable {
                    return std::move(takeover);
                  });
            }));
      } catch (const std::exception& ex) {
        XLOG(ERR) << "Error while stopping \"" << mountPath
                  << "\" for takeover: " << folly::exceptionStr(ex);
        // Record the failure as an errored future so collectAll() below still
        // sees one entry per mount.
        futures.push_back(makeFuture<Optional<TakeoverData::MountInfo>>(
            folly::exception_wrapper(std::current_exception(), ex)));
      }
    }
  }

  // Use collectAll() rather than collect() to wait for all of the unmounts
  // to complete, and only check for errors once everything has finished.
  return folly::collectAllSemiFuture(futures).toUnsafeFuture().thenValue(
      [](std::vector<folly::Try<Optional<TakeoverData::MountInfo>>> results) {
        TakeoverData data;
        data.mountPoints.reserve(results.size());
        for (auto& result : results) {
          // If something went wrong shutting down a mount point,
          // log the error but continue trying to perform graceful takeover
          // of the other mount points.
          if (!result.hasValue()) {
            XLOG(ERR) << "error stopping mount during takeover shutdown: "
                      << result.exception().what();
            continue;
          }

          // result might be a successful Try with an empty Optional.
          // This could happen if the mount point was unmounted while we were
          // in the middle of stopping it for takeover.  Just skip this mount
          // in this case.
          if (!result.value().hasValue()) {
            XLOG(WARN) << "mount point was unmounted during "
                          "takeover shutdown";
            continue;
          }

          data.mountPoints.emplace_back(std::move(result.value().value()));
        }
        return data;
      });
}
|
2016-05-12 23:43:17 +03:00
|
|
|
|
2017-09-13 18:26:50 +03:00
|
|
|
void EdenServer::scheduleFlushStats() {
|
|
|
|
mainEventBase_->timer().scheduleTimeoutFn(
|
|
|
|
[this] {
|
|
|
|
flushStatsNow();
|
2018-06-20 18:56:26 +03:00
|
|
|
reportProcStats();
|
2017-09-13 18:26:50 +03:00
|
|
|
scheduleFlushStats();
|
|
|
|
},
|
|
|
|
std::chrono::seconds(1));
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Perform one background pass of unloading inodes that have not been accessed
 * for at least FLAGS_unload_age_minutes, across all current mounts, then
 * schedule the next pass FLAGS_unload_interval_minutes from now.
 *
 * The running total of unloaded inodes is tracked in the
 * kPeriodicUnloadCounterKey service counter.
 */
void EdenServer::unloadInodes() {
  struct Root {
    std::string mountName;
    TreeInodePtr rootInode;
  };
  // Snapshot the mount roots while holding the lock, then do the actual
  // (potentially slow) unload work after releasing it.
  std::vector<Root> roots;
  {
    const auto mountPoints = mountPoints_.wlock();
    for (auto& entry : *mountPoints) {
      roots.emplace_back(Root{std::string{entry.first},
                              entry.second.edenMount->getRootInode()});
    }
  }

  if (!roots.empty()) {
    auto serviceData = stats::ServiceData::get();

    // Carry forward the running total so the counter is cumulative.
    uint64_t totalUnloaded = serviceData->getCounter(kPeriodicUnloadCounterKey);
    // Only inodes last accessed before this cutoff are eligible for unload.
    auto cutoff = std::chrono::system_clock::now() -
        std::chrono::minutes(FLAGS_unload_age_minutes);
    auto cutoff_ts = folly::to<timespec>(cutoff);
    for (auto& [name, rootInode] : roots) {
      auto unloaded = rootInode->unloadChildrenLastAccessedBefore(cutoff_ts);
      if (unloaded) {
        XLOG(INFO) << "Unloaded " << unloaded
                   << " inodes in background from mount " << name;
      }
      totalUnloaded += unloaded;
    }
    serviceData->setCounter(kPeriodicUnloadCounterKey, totalUnloaded);
  }

  // Re-arm the timer for the next periodic pass.
  scheduleInodeUnload(std::chrono::minutes(FLAGS_unload_interval_minutes));
}
|
|
|
|
|
|
|
|
void EdenServer::scheduleInodeUnload(std::chrono::milliseconds timeout) {
|
|
|
|
mainEventBase_->timer().scheduleTimeoutFn(
|
2018-09-26 03:50:36 +03:00
|
|
|
[this] {
|
|
|
|
XLOG(DBG4) << "Beginning periodic inode unload";
|
|
|
|
unloadInodes();
|
|
|
|
},
|
|
|
|
timeout);
|
2017-09-13 18:26:50 +03:00
|
|
|
}
|
|
|
|
|
2018-06-26 22:05:27 +03:00
|
|
|
Future<Unit> EdenServer::prepare(std::shared_ptr<StartupLogger> logger) {
  // Mark the server state as RUNNING once we finish setting up the mount
  // points. Even if an error occurs we still transition to the running
  // state. The prepare() code will log an error with more details if we do
  // fail to set up some of the mount points.
  auto markRunning = [this] {
    runningState_.wlock()->state = RunState::RUNNING;
  };
  auto prepareFuture = prepareImpl(std::move(logger));
  return std::move(prepareFuture).ensure(std::move(markRunning));
}
|
|
|
|
|
|
|
|
// Perform the main startup sequence: acquire (or take over) the on-disk
// lock, start the thrift server and privhelper, open the local store
// selected by --local_storage_engine_unsafe, start the takeover server,
// and (re)mount all configured mount points. Returns a future that
// completes once all mounts have started and the thrift server is running.
Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
  bool doingTakeover = false;
  if (!acquireEdenLock()) {
    // Another edenfs process is already running.
    //
    // If --takeover was specified, fall through and attempt to gracefully
    // takeover mount points from the existing daemon.
    //
    // If --takeover was not specified, fail now.
    if (!FLAGS_takeover) {
      throw std::runtime_error(
          "another instance of Eden appears to be running for " +
          edenDir_.stringPiece().str());
    }
    doingTakeover = true;
  }

  // Store a pointer to the EventBase that will be used to drive
  // the main thread. The runServer() code will end up driving this EventBase.
  mainEventBase_ = folly::EventBaseManager::get()->getEventBase();
  auto thriftRunningFuture = createThriftServer();

  // Start the PrivHelper client, using our main event base to drive its I/O
  serverState_->getPrivHelper()->attachEventBase(mainEventBase_);

  // Start stats aggregation
  scheduleFlushStats();

  // Set the ServiceData counter for tracking number of inodes unloaded by
  // periodic job for unloading inodes to zero on EdenServer start.
  stats::ServiceData::get()->setCounter(kPeriodicUnloadCounterKey, 0);

  // Schedule a periodic job to unload unused inodes based on the last access
  // time. currently Eden does not have accurate timestamp tracking for inodes,
  // so using unloadChildrenNow just to validate the behaviour. We will have to
  // modify current unloadChildrenNow function to unload inodes based on the
  // last access time.
  if (FLAGS_unload_interval_minutes > 0) {
    // First run is delayed by --start_delay_minutes; subsequent runs use
    // --unload_interval_minutes (see unloadInodes()).
    scheduleInodeUnload(std::chrono::minutes(FLAGS_start_delay_minutes));
  }

  // If we are gracefully taking over from an existing edenfs process,
  // receive its lock, thrift socket, and mount points now.
  // This will shut down the old process.
  const auto takeoverPath = edenDir_ + PathComponentPiece{kTakeoverSocketName};
  TakeoverData takeoverData;
  if (doingTakeover) {
    logger->log(
        "Requesting existing edenfs process to gracefully "
        "transfer its mount points...");
    takeoverData = takeoverMounts(takeoverPath);
    logger->log(
        "Received takeover information for ",
        takeoverData.mountPoints.size(),
        " mount points");

    // Take over the eden lock file and the thrift server socket.
    lockFile_ = std::move(takeoverData.lockFile);
    server_->useExistingSocket(takeoverData.thriftSocket.release());
  } else {
    // Remove any old thrift socket from a previous (now dead) edenfs daemon.
    prepareThriftAddress();
  }

  // Open the local store backend selected by --local_storage_engine_unsafe.
  if (FLAGS_local_storage_engine_unsafe == "memory") {
    logger->log("Creating new memory store.");
    localStore_ = make_shared<MemoryLocalStore>();
  } else if (FLAGS_local_storage_engine_unsafe == "sqlite") {
    const auto path = edenDir_ + RelativePathPiece{kSqlitePath};
    const auto parentDir = path.dirname();
    ensureDirectoryExists(parentDir);
    logger->log("Opening local SQLite store ", path, "...");
    folly::stop_watch<std::chrono::milliseconds> watch;
    localStore_ = make_shared<SqliteLocalStore>(path);
    logger->log(
        "Opened SQLite store in ",
        watch.elapsed().count() / 1000.0,
        " seconds.");
  } else if (FLAGS_local_storage_engine_unsafe == "rocksdb") {
    logger->log("Opening local RocksDB store...");
    folly::stop_watch<std::chrono::milliseconds> watch;
    const auto rocksPath = edenDir_ + RelativePathPiece{kRocksDBPath};
    ensureDirectoryExists(rocksPath);
    localStore_ = make_shared<RocksDbLocalStore>(rocksPath);
    logger->log(
        "Opened RocksDB store in ",
        watch.elapsed().count() / 1000.0,
        " seconds.");
  } else {
    throw std::runtime_error(folly::to<string>(
        "invalid --local_storage_engine_unsafe flag: ",
        FLAGS_local_storage_engine_unsafe));
  }

  // Start listening for graceful takeover requests
  takeoverServer_.reset(
      new TakeoverServer(getMainEventBase(), takeoverPath, this));
  takeoverServer_->start();

  // Trigger remounting of existing mount points
  // If doingTakeover is true, use the mounts received in TakeoverData
  std::vector<Future<Unit>> mountFutures;
  if (doingTakeover) {
    for (auto& info : takeoverData.mountPoints) {
      const auto stateDirectory = info.stateDirectory;
      auto mountFuture =
          makeFutureWith([&] {
            auto initialConfig = ClientConfig::loadFromClientDirectory(
                AbsolutePathPiece{info.mountPath},
                AbsolutePathPiece{info.stateDirectory});
            return mount(std::move(initialConfig), std::move(info));
          })
              .then([logger, mountPath = info.mountPath](
                        folly::Try<std::shared_ptr<EdenMount>>&& result) {
                if (result.hasValue()) {
                  logger->log("Successfully took over mount ", mountPath);
                  return makeFuture();
                } else {
                  logger->warn(
                      "Failed to perform takeover for ",
                      mountPath,
                      ": ",
                      result.exception().what());
                  return makeFuture<Unit>(std::move(result).exception());
                }
              });
      mountFutures.push_back(std::move(mountFuture));
    }
  } else {
    // No takeover: remount the mount points listed in config.json.
    folly::dynamic dirs = folly::dynamic::object();
    try {
      dirs = ClientConfig::loadClientDirectoryMap(edenDir_);
    } catch (const std::exception& ex) {
      // Config parse failures are non-fatal: skip remounting but still
      // report the error through the returned future.
      logger->warn(
          "Could not parse config.json file: ",
          ex.what(),
          "\nSkipping remount step.");
      return std::move(thriftRunningFuture)
          .thenValue(
              [ew = folly::exception_wrapper(std::current_exception(), ex)](
                  auto&&) { return makeFuture<Unit>(ew); });
    }

    if (dirs.empty()) {
      logger->log("No mount points currently configured.");
      return thriftRunningFuture;
    }
    logger->log("Remounting ", dirs.size(), " mount points...");

    for (const auto& client : dirs.items()) {
      auto mountFuture =
          makeFutureWith([&] {
            MountInfo mountInfo;
            mountInfo.mountPoint = client.first.c_str();
            auto edenClientPath = edenDir_ + PathComponent("clients") +
                PathComponent(client.second.c_str());
            mountInfo.edenClientPath = edenClientPath.stringPiece().str();
            auto initialConfig = ClientConfig::loadFromClientDirectory(
                AbsolutePathPiece{mountInfo.mountPoint},
                AbsolutePathPiece{mountInfo.edenClientPath});
            return mount(std::move(initialConfig));
          })
              .then([logger, mountPath = client.first.asString()](
                        folly::Try<std::shared_ptr<EdenMount>>&& result) {
                if (result.hasValue()) {
                  logger->log("Successfully remounted ", mountPath);
                  return makeFuture();
                } else {
                  logger->warn(
                      "Failed to remount ",
                      mountPath,
                      ": ",
                      result.exception().what());
                  return makeFuture<Unit>(std::move(result).exception());
                }
              });
      mountFutures.push_back(std::move(mountFuture));
    }
  }

  // Return a future that will complete only when all mount points have started
  // and the thrift server is also running.
  return folly::collectAll(mountFutures)
      .then([thriftFuture = std::move(thriftRunningFuture)]() mutable {
        return std::move(thriftFuture);
      });
}
|
|
|
|
|
2018-10-09 20:01:07 +03:00
|
|
|
// Run the server's main loop: serve thrift until it stops, then perform
// either a graceful takeover shutdown or a normal shutdown, driving the
// main EventBase until the shutdown future completes. prepare() must have
// been called first (lockFile_ must be held).
void EdenServer::run(void (*runThriftServer)(const EdenServer&)) {
  if (!lockFile_) {
    throw std::runtime_error(
        "prepare() must be called before EdenServer::run()");
  }

  // Run the thrift server
  runThriftServer(*this);

  // The thrift server has returned; decide which shutdown path to take.
  // Read the takeover flag (and steal the takeover thrift socket, if any)
  // under the runningState_ lock, then mark the state SHUTTING_DOWN.
  bool takeover;
  folly::File thriftSocket;
  {
    auto state = runningState_.wlock();
    takeover = state->takeoverShutdown;
    if (takeover) {
      thriftSocket = std::move(state->takeoverThriftSocket);
    }
    state->state = RunState::SHUTTING_DOWN;
  }
  auto shutdownFuture = takeover
      ? performTakeoverShutdown(std::move(thriftSocket))
      : performNormalShutdown();

  // Drive the main event base until shutdownFuture completes
  CHECK_EQ(mainEventBase_, folly::EventBaseManager::get()->getEventBase());
  while (!shutdownFuture.isReady()) {
    mainEventBase_->loopOnce();
  }
  // Re-raise any shutdown error on the calling thread.
  std::move(shutdownFuture).get();
}
|
|
|
|
|
2017-11-20 02:18:31 +03:00
|
|
|
// Shut down for a graceful takeover: stop FUSE request processing on all
// mounts, release the local/backing stores and the privhelper, then hand
// the lock file, thrift socket, and mount data to the new edenfs process
// via takeoverPromise_. The returned future completes when the new
// process signals that the takeover finished.
Future<Unit> EdenServer::performTakeoverShutdown(folly::File thriftSocket) {
  // stop processing new FUSE requests for the mounts,
  return stopMountsForTakeover().thenValue(
      [this,
       socket = std::move(thriftSocket)](TakeoverData&& takeover) mutable {
        // Destroy the local store and backing stores.
        // We shouldn't access the local store any more after giving up our
        // lock, and we need to close it to release its lock before the new
        // edenfs process tries to open it.
        backingStores_.wlock()->clear();
        // Explicit close the LocalStore before we reset our pointer, to
        // ensure we release the RocksDB lock. Since this is managed with a
        // shared_ptr it is somewhat hard to confirm if we really have the
        // last reference to it.
        localStore_->close();
        localStore_.reset();

        // Stop the privhelper process.
        shutdownPrivhelper();

        takeover.lockFile = std::move(lockFile_);
        // Grab the completion future before the TakeoverData (and its
        // promise) is moved into takeoverPromise_.
        auto future = takeover.takeoverComplete.getFuture();
        takeover.thriftSocket = std::move(socket);

        takeoverPromise_.setValue(std::move(takeover));
        return future;
      });
}
|
|
|
|
|
|
|
|
Future<Unit> EdenServer::performNormalShutdown() {
  // Stop accepting graceful-takeover connections first.
  takeoverServer_.reset();

  // Clean up all the server mount points before shutting down the
  // privhelper. The privhelper is stopped whether or not unmounting
  // succeeded; any unmount error is then re-raised to the caller.
  return unmountAll().thenTry([this](folly::Try<Unit>&& unmountResult) {
    shutdownPrivhelper();
    unmountResult.throwIfFailed();
  });
}
|
|
|
|
|
|
|
|
void EdenServer::shutdownPrivhelper() {
|
2017-08-30 23:43:39 +03:00
|
|
|
// Explicitly stop the privhelper process so we can verify that it
|
|
|
|
// exits normally.
|
2018-04-23 23:10:31 +03:00
|
|
|
const auto privhelperExitCode = serverState_->getPrivHelper()->stop();
|
2017-08-30 23:43:39 +03:00
|
|
|
if (privhelperExitCode != 0) {
|
|
|
|
if (privhelperExitCode > 0) {
|
|
|
|
XLOG(ERR) << "privhelper process exited with unexpected code "
|
|
|
|
<< privhelperExitCode;
|
|
|
|
} else {
|
|
|
|
XLOG(ERR) << "privhelper process was killed by signal "
|
|
|
|
<< privhelperExitCode;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-13 18:26:53 +03:00
|
|
|
void EdenServer::addToMountPoints(std::shared_ptr<EdenMount> edenMount) {
|
2016-05-20 20:33:42 +03:00
|
|
|
auto mountPath = edenMount->getPath().stringPiece();
|
2016-07-01 07:00:01 +03:00
|
|
|
{
|
2018-01-18 00:29:59 +03:00
|
|
|
const auto mountPoints = mountPoints_.wlock();
|
|
|
|
const auto ret = mountPoints->emplace(mountPath, EdenMountInfo(edenMount));
|
2016-05-12 23:43:17 +03:00
|
|
|
if (!ret.second) {
|
|
|
|
// This mount point already exists.
|
|
|
|
throw EdenError(folly::to<string>(
|
|
|
|
"mount point \"", mountPath, "\" is already mounted"));
|
|
|
|
}
|
|
|
|
}
|
2017-09-13 18:26:53 +03:00
|
|
|
}
|
2016-05-12 23:43:17 +03:00
|
|
|
|
2017-09-13 18:26:53 +03:00
|
|
|
void EdenServer::registerStats(std::shared_ptr<EdenMount> edenMount) {
|
|
|
|
auto counters = stats::ServiceData::get()->getDynamicCounters();
|
|
|
|
// Register callback for getting Loaded inodes in the memory
|
|
|
|
// for a mountPoint.
|
|
|
|
counters->registerCallback(
|
|
|
|
edenMount->getCounterName(CounterName::LOADED),
|
|
|
|
[edenMount] { return edenMount->getInodeMap()->getLoadedInodeCount(); });
|
|
|
|
// Register callback for getting Unloaded inodes in the
|
|
|
|
// memory for a mountpoint
|
|
|
|
counters->registerCallback(
|
|
|
|
edenMount->getCounterName(CounterName::UNLOADED), [edenMount] {
|
|
|
|
return edenMount->getInodeMap()->getUnloadedInodeCount();
|
|
|
|
});
|
|
|
|
}
|
2017-09-13 18:26:52 +03:00
|
|
|
|
2017-09-13 18:26:53 +03:00
|
|
|
void EdenServer::unregisterStats(EdenMount* edenMount) {
  // Remove the dynamic inode counters registered in registerStats().
  auto dynamicCounters = stats::ServiceData::get()->getDynamicCounters();
  for (const auto counter : {CounterName::LOADED, CounterName::UNLOADED}) {
    dynamicCounters->unregisterCallback(edenMount->getCounterName(counter));
  }
}
|
|
|
|
|
2018-01-11 00:01:19 +03:00
|
|
|
folly::Future<folly::Unit> EdenServer::performFreshFuseStart(
    std::shared_ptr<EdenMount> edenMount) {
  // No takeover data available: start the FUSE workers for this mount
  // from scratch.
  return edenMount->startFuse();
}
|
2018-01-10 09:00:58 +03:00
|
|
|
|
2018-06-12 03:54:24 +03:00
|
|
|
// Start FUSE for a mount received via graceful takeover: tell the
// privhelper about the mount path and its bind mounts, then finish the
// takeover by restoring file handles and the FUSE channel
// (completeTakeoverFuseStart).
Future<Unit> EdenServer::performTakeoverFuseStart(
    std::shared_ptr<EdenMount> edenMount,
    TakeoverData::MountInfo&& info) {
  // Flatten the bind-mount paths into plain strings for the privhelper call.
  std::vector<std::string> bindMounts;
  for (const auto& bindMount : info.bindMounts) {
    bindMounts.emplace_back(bindMount.value());
  }
  auto future = serverState_->getPrivHelper()->fuseTakeoverStartup(
      info.mountPath.stringPiece(), bindMounts);
  // Note: info is moved into the continuation only after the call above
  // has read info.mountPath and info.bindMounts.
  return std::move(future).thenValue([this,
                                      edenMount = std::move(edenMount),
                                      info = std::move(info)](auto&&) mutable {
    return completeTakeoverFuseStart(std::move(edenMount), std::move(info));
  });
}
|
2018-01-11 00:01:19 +03:00
|
|
|
|
2018-06-12 03:54:24 +03:00
|
|
|
// Finish FUSE takeover for one mount: re-open every file handle recorded
// in the serialized handle map (directories via opendir, files via open),
// re-register each under its original handle id, and then hand the
// transferred FUSE channel fd to the EdenMount.
Future<Unit> EdenServer::completeTakeoverFuseStart(
    std::shared_ptr<EdenMount> edenMount,
    TakeoverData::MountInfo&& info) {
  // (re)open file handles for each entry in info.fileHandleMap
  std::vector<Future<Unit>> futures;
  auto dispatcher = edenMount->getDispatcher();

  for (const auto& handleEntry : info.fileHandleMap.entries) {
    if (handleEntry.isDir) {
      futures.emplace_back(
          // TODO: we should record the opendir() flags in the
          // SerializedFileHandleMap so that we can restore
          // the correct flags here.
          dispatcher
              ->opendir(InodeNumber::fromThrift(handleEntry.inodeNumber), 0)
              .thenValue([dispatcher,
                          inodeNumber = handleEntry.inodeNumber,
                          number = handleEntry.handleId](
                             std::shared_ptr<DirHandle> handle) {
                // Re-register the new handle under the handle id the
                // old process had assigned.
                dispatcher->getFileHandles().recordHandle(
                    std::static_pointer_cast<FileHandleBase>(handle),
                    InodeNumber::fromThrift(inodeNumber),
                    number);
              }));
    } else {
      futures.emplace_back(
          // TODO: we should record the open() flags in the
          // SerializedFileHandleMap so that we can restore
          // the correct flags here.
          dispatcher
              ->open(InodeNumber::fromThrift(handleEntry.inodeNumber), O_RDWR)
              .thenValue([dispatcher,
                          inodeNumber = handleEntry.inodeNumber,
                          number = handleEntry.handleId](
                             std::shared_ptr<FileHandle> handle) {
                // Same re-registration as the directory case above,
                // for a regular file handle.
                dispatcher->getFileHandles().recordHandle(
                    std::static_pointer_cast<FileHandleBase>(handle),
                    InodeNumber::fromThrift(inodeNumber),
                    number);
              }));
    }
  }

  // Package up the transferred FUSE device fd and connection info.
  FuseChannelData channelData;
  channelData.fd = std::move(info.fuseFD);
  channelData.connInfo = info.connInfo;

  // Start up the fuse workers.
  return folly::collectAllSemiFuture(futures).toUnsafeFuture().thenValue(
      [edenMount, chData = std::move(channelData)](auto&&) mutable {
        return edenMount->takeoverFuse(std::move(chData));
      });
}
|
|
|
|
|
2017-09-13 18:26:53 +03:00
|
|
|
// Create and start an EdenMount from the given client config. If takeover
// data is supplied, the FUSE channel and inode map are adopted from the
// previous edenfs process instead of being started fresh. The returned
// future yields the mount once FUSE (and, for fresh mounts, the bind
// mounts) are up; on failure, mountFinished()/unmount() clean up.
folly::Future<std::shared_ptr<EdenMount>> EdenServer::mount(
    std::unique_ptr<ClientConfig> initialConfig,
    Optional<TakeoverData::MountInfo>&& optionalTakeover) {
  auto backingStore = getBackingStore(
      initialConfig->getRepoType(), initialConfig->getRepoSource());
  auto objectStore =
      std::make_unique<ObjectStore>(getLocalStore(), backingStore);
  const bool doTakeover = optionalTakeover.hasValue();

  auto edenMount = EdenMount::create(
      std::move(initialConfig), std::move(objectStore), serverState_);

  // In the takeover case, initialize from the transferred inode map.
  auto initFuture = edenMount->initialize(
      optionalTakeover ? folly::make_optional(optionalTakeover->inodeMap)
                       : folly::none);
  return std::move(initFuture)
      .thenValue([this,
                  doTakeover,
                  edenMount,
                  optionalTakeover =
                      std::move(optionalTakeover)](auto&&) mutable {
        addToMountPoints(edenMount);

        return (optionalTakeover ? performTakeoverFuseStart(
                                       edenMount, std::move(*optionalTakeover))
                                 : performFreshFuseStart(edenMount))
            // If an error occurs we want to call mountFinished and throw the
            // error here. Once the pool is up and running, the finishFuture
            // will ensure that this happens.
            .onError([this, edenMount](folly::exception_wrapper ew) {
              mountFinished(edenMount.get(), folly::none);
              return makeFuture<folly::Unit>(ew);
            })
            .then([edenMount, doTakeover, this]() mutable {
              // Now that we've started the workers, arrange to call
              // mountFinished once the pool is torn down.
              auto finishFuture = edenMount->getFuseCompletionFuture().then(
                  [this,
                   edenMount](folly::Try<TakeoverData::MountInfo>&& takeover) {
                    folly::Optional<TakeoverData::MountInfo> optTakeover;
                    if (takeover.hasValue()) {
                      optTakeover = std::move(takeover.value());
                    }
                    mountFinished(edenMount.get(), std::move(optTakeover));
                  });

              registerStats(edenMount);

              if (doTakeover) {
                // The bind mounts are already mounted in the takeover case
                return makeFuture<std::shared_ptr<EdenMount>>(
                    std::move(edenMount));
              } else {
                // Perform all of the bind mounts associated with the
                // client. We don't need to do this for the takeover
                // case as they are already mounted.
                return edenMount->performBindMounts()
                    .thenValue([edenMount](auto&&) { return edenMount; })
                    .onError([this,
                              edenMount,
                              finishFuture = std::move(finishFuture)](
                                 folly::exception_wrapper ew) mutable {
                      // Creating a bind mount failed. Trigger an unmount.
                      return unmount(edenMount->getPath().stringPiece())
                          .thenTry([finishFuture = std::move(finishFuture)](
                                       auto&&) mutable {
                            // Wait for mountFinished() cleanup before
                            // reporting the original bind-mount error.
                            return std::move(finishFuture);
                          })
                          .thenTry([ew = std::move(ew)](auto&&) {
                            return makeFuture<shared_ptr<EdenMount>>(ew);
                          });
                    });
              }
            });
      });
}
|
|
|
|
|
fix EdenServer::unmount() to fully wait for mount point cleanup
Summary:
This fixes EdenServer::unmount() to actually wait for all EdenMount cleanup
to complete, and fixes unmountAll() to return a Future that correctly waits for
all mount points to be cleaned up.
Previously `unmount()` waited for the mount point to be unmounted from the
kernel, but did not wait for EdenMount shutdown to complete. Previously
EdenMount shutdown was not triggered until the last reference to the
shared_ptr<EdenMount> was released. This often happened in the FUSE channel
thread that triggered the mountFinished() call--it would still hold a
reference to this pointer, and would not release it until after
mountFinished() returns. As a result, when the main thread was shutting down,
`main()` would call `unmountAll()`, and then return soon after it completed.
Some FUSE channel threads may still be running at this point, still performing
`EdenMount` shutdown while the main thread was exiting. This could result in
crashes and deadlocks as shutdown tried to access objects already destroyed by
the main thread.
With this change `EdenMount::shutdown()` is triggered explicitly during
`mountFinished()`, and `unmount()` will not complete until this finishes.
The `EdenMount` object may still exist at this point, and could still be
deleted by the FUSE channel thread, but the deletion now only requires freeing
the memory and does not require accessing other data that may have been cleaned
up by the main thread.
We should still clean up the FUSE channel thread handling in the future, to
make sure these threads are joined before the main thread exits. However, that
cleanup can wait until a separate diff. Ideally I would like to move more of
the mount and unmount logic from EdenServer and EdenServiceHandler and put that
code in EdenMount instead.
Reviewed By: bolinfest
Differential Revision: D5541318
fbshipit-source-id: 470332478357a85c314bc40458373cb0f827f62b
2017-08-03 02:52:18 +03:00
|
|
|
// Unmount the given mount point. Asks the privhelper to unmount it from
// the kernel, then waits on the mount's unmountPromise, which is only
// fulfilled once mountFinished()/EdenMount shutdown completes — so the
// returned future covers full mount-point cleanup, not just the kernel
// unmount. Fails with std::out_of_range if the path is not mounted.
Future<Unit> EdenServer::unmount(StringPiece mountPath) {
  return makeFutureWith([&] {
    auto future = Future<Unit>::makeEmpty();
    {
      // Look up the mount's cleanup future under the mountPoints_ lock;
      // the privhelper call below runs without holding it.
      const auto mountPoints = mountPoints_.wlock();
      const auto it = mountPoints->find(mountPath);
      if (it == mountPoints->end()) {
        return makeFuture<Unit>(
            std::out_of_range("no such mount point " + mountPath.str()));
      }
      future = it->second.unmountPromise.getFuture();
    }

    return serverState_->getPrivHelper()
        ->fuseUnmount(mountPath)
        .thenValue([f = std::move(future)](auto&&) mutable {
          return std::move(f);
        });
  })
      .onError([path = mountPath.str()](folly::exception_wrapper&& ew) {
        XLOG(ERR) << "Failed to perform unmount for \"" << path
                  << "\": " << folly::exceptionStr(ew);
        return makeFuture<Unit>(std::move(ew));
      });
}
|
|
|
|
|
2018-01-10 09:00:50 +03:00
|
|
|
void EdenServer::mountFinished(
|
|
|
|
EdenMount* edenMount,
|
2018-01-10 09:01:00 +03:00
|
|
|
folly::Optional<TakeoverData::MountInfo> takeover) {
|
2018-01-18 00:29:59 +03:00
|
|
|
const auto mountPath = edenMount->getPath().stringPiece();
|
2017-06-22 23:39:57 +03:00
|
|
|
XLOG(INFO) << "mount point \"" << mountPath << "\" stopped";
|
2017-09-13 18:26:53 +03:00
|
|
|
unregisterStats(edenMount);
|
fix EdenServer::unmount() to fully wait for mount point cleanup
Summary:
This fixes EdenServer::unmount() to actually wait for all EdenMount cleanup
to complete, and fixes unmountAll() to return a Future that correctly waits for
all mount points to be cleaned up.
Previously `unmount()` waited for the mount point to be unmounted from the
kernel, but did not wait for EdenMount shutdown to complete. Previously
EdenMount shutdown was not triggered until the last reference to the
shared_ptr<EdenMount> was released. This often happened in the FUSE channel
thread that triggered the mountFinished() call--it would still hold a
reference to this pointer, and would not release it until after
mountFinished() returns. As a result, when the main thread was shutting down,
`main()` would call `unmountAll()`, and then return soon after it completed.
Some FUSE channel threads may still be running at this point, still performing
`EdenMount` shutdown while the main thread was exiting. This could result in
crashes and deadlocks as shutdown tried to access objects already destroyed by
the main thread.
With this change `EdenMount::shutdown()` is triggered explicitly during
`mountFinished()`, and `unmount()` will not complete until this finishes.
The `EdenMount` object may still exist at this point, and could still be
deleted by the FUSE channel thread, but the deletion now only requires freeing
the memory and does not require accessing other data that may have been cleaned
up by the main thread.
We should still clean up the FUSE channel thread handling in the future, to
make sure these threads are joined before the main thread exits. However, that
cleanup can wait until a separate diff. Ideally I would like to move more of
the mount and unmount logic from EdenServer and EdenServiceHandler and put that
code in EdenMount instead.
Reviewed By: bolinfest
Differential Revision: D5541318
fbshipit-source-id: 470332478357a85c314bc40458373cb0f827f62b
2017-08-03 02:52:18 +03:00
|
|
|
|
|
|
|
// Erase the EdenMount from our mountPoints_ map
|
|
|
|
folly::SharedPromise<Unit> unmountPromise;
|
2018-01-10 09:01:00 +03:00
|
|
|
folly::Optional<folly::Promise<TakeoverData::MountInfo>> takeoverPromise;
|
2016-07-01 07:00:01 +03:00
|
|
|
{
|
2018-01-18 00:29:59 +03:00
|
|
|
const auto mountPoints = mountPoints_.wlock();
|
|
|
|
const auto it = mountPoints->find(mountPath);
|
2017-08-03 03:40:57 +03:00
|
|
|
CHECK(it != mountPoints->end());
|
fix EdenServer::unmount() to fully wait for mount point cleanup
Summary:
This fixes EdenServer::unmount() to actually wait for all EdenMount cleanup
to complete, and fixes unmountAll() to return a Future that correctly waits for
all mount points to be cleaned up.
Previously `unmount()` waited for the mount point to be unmounted from the
kernel, but did not wait for EdenMount shutdown to complete. Previously
EdenMount shutdown was not triggered until the last reference to the
shared_ptr<EdenMount> was released. This often happened in the FUSE channel
thread that triggered the mountFinished() call--it would still hold a
reference to this pointer, and would not release it until after
mountFinished() returns. As a result, when the main thread was shutting down,
`main()` would call `unmountAll()`, and then return soon after it completed.
Some FUSE channel threads may still be running at this point, still performing
`EdenMount` shutdown while the main thread was exiting. This could result in
crashes and deadlocks as shutdown tried to access objects already destroyed by
the main thread.
With this change `EdenMount::shutdown()` is triggered explicitly during
`mountFinished()`, and `unmount()` will not complete until this finishes.
The `EdenMount` object may still exist at this point, and could still be
deleted by the FUSE channel thread, but the deletion now only requires freeing
the memory and does not require accessing other data that may have been cleaned
up by the main thread.
We should still clean up the FUSE channel thread handling in the future, to
make sure these threads are joined before the main thread exits. However, that
cleanup can wait until a separate diff. Ideally I would like to move more of
the mount and unmount logic from EdenServer and EdenServiceHandler and put that
code in EdenMount instead.
Reviewed By: bolinfest
Differential Revision: D5541318
fbshipit-source-id: 470332478357a85c314bc40458373cb0f827f62b
2017-08-03 02:52:18 +03:00
|
|
|
unmountPromise = std::move(it->second.unmountPromise);
|
2018-01-10 09:00:50 +03:00
|
|
|
takeoverPromise = std::move(it->second.takeoverPromise);
|
2017-08-03 03:40:57 +03:00
|
|
|
mountPoints->erase(it);
|
2016-05-12 23:43:17 +03:00
|
|
|
}
|
fix EdenServer::unmount() to fully wait for mount point cleanup
Summary:
This fixes EdenServer::unmount() to actually wait for all EdenMount cleanup
to complete, and fixes unmountAll() to return a Future that correctly waits for
all mount points to be cleaned up.
Previously `unmount()` waited for the mount point to be unmounted from the
kernel, but did not wait for EdenMount shutdown to complete. Previously
EdenMount shutdown was not triggered until the last reference to the
shared_ptr<EdenMount> was released. This often happened in the FUSE channel
thread that triggered the mountFinished() call--it would still hold a
reference to this pointer, and would not release it until after
mountFinished() returns. As a result, when the main thread was shutting down,
`main()` would call `unmountAll()`, and then return soon after it completed.
Some FUSE channel threads may still be running at this point, still performing
`EdenMount` shutdown while the main thread was exiting. This could result in
crashes and deadlocks as shutdown tried to access objects already destroyed by
the main thread.
With this change `EdenMount::shutdown()` is triggered explicitly during
`mountFinished()`, and `unmount()` will not complete until this finishes.
The `EdenMount` object may still exist at this point, and could still be
deleted by the FUSE channel thread, but the deletion now only requires freeing
the memory and does not require accessing other data that may have been cleaned
up by the main thread.
We should still clean up the FUSE channel thread handling in the future, to
make sure these threads are joined before the main thread exits. However, that
cleanup can wait until a separate diff. Ideally I would like to move more of
the mount and unmount logic from EdenServer and EdenServiceHandler and put that
code in EdenMount instead.
Reviewed By: bolinfest
Differential Revision: D5541318
fbshipit-source-id: 470332478357a85c314bc40458373cb0f827f62b
2017-08-03 02:52:18 +03:00
|
|
|
|
2018-01-18 00:29:59 +03:00
|
|
|
const bool doTakeover = takeoverPromise.hasValue();
|
2018-01-10 09:01:00 +03:00
|
|
|
|
fix EdenServer::unmount() to fully wait for mount point cleanup
Summary:
This fixes EdenServer::unmount() to actually wait for all EdenMount cleanup
to complete, and fixes unmountAll() to return a Future that correctly waits for
all mount points to be cleaned up.
Previously `unmount()` waited for the mount point to be unmounted from the
kernel, but did not wait for EdenMount shutdown to complete. Previously
EdenMount shutdown was not triggered until the last reference to the
shared_ptr<EdenMount> was released. This often happened in the FUSE channel
thread that triggered the mountFinished() call--it would still hold a
reference to this pointer, and would not release it until after
mountFinished() returns. As a result, when the main thread was shutting down,
`main()` would call `unmountAll()`, and then return soon after it completed.
Some FUSE channel threads may still be running at this point, still performing
`EdenMount` shutdown while the main thread was exiting. This could result in
crashes and deadlocks as shutdown tried to access objects already destroyed by
the main thread.
With this change `EdenMount::shutdown()` is triggered explicitly during
`mountFinished()`, and `unmount()` will not complete until this finishes.
The `EdenMount` object may still exist at this point, and could still be
deleted by the FUSE channel thread, but the deletion now only requires freeing
the memory and does not require accessing other data that may have been cleaned
up by the main thread.
We should still clean up the FUSE channel thread handling in the future, to
make sure these threads are joined before the main thread exits. However, that
cleanup can wait until a separate diff. Ideally I would like to move more of
the mount and unmount logic from EdenServer and EdenServiceHandler and put that
code in EdenMount instead.
Reviewed By: bolinfest
Differential Revision: D5541318
fbshipit-source-id: 470332478357a85c314bc40458373cb0f827f62b
2017-08-03 02:52:18 +03:00
|
|
|
// Shutdown the EdenMount, and fulfill the unmount promise
|
|
|
|
// when the shutdown completes
|
2018-01-10 09:01:00 +03:00
|
|
|
edenMount->shutdown(doTakeover)
|
|
|
|
.then([unmountPromise = std::move(unmountPromise),
|
|
|
|
takeoverPromise = std::move(takeoverPromise),
|
|
|
|
takeoverData = std::move(takeover)](
|
2018-03-16 03:26:08 +03:00
|
|
|
folly::Try<
|
|
|
|
std::tuple<SerializedFileHandleMap, SerializedInodeMap>>&&
|
|
|
|
result) mutable {
|
2018-01-10 09:01:00 +03:00
|
|
|
if (takeoverPromise) {
|
|
|
|
takeoverPromise.value().setWith([&]() mutable {
|
2018-03-16 03:26:08 +03:00
|
|
|
takeoverData.value().fileHandleMap =
|
|
|
|
std::move(std::get<0>(result.value()));
|
|
|
|
takeoverData.value().inodeMap =
|
|
|
|
std::move(std::get<1>(result.value()));
|
2018-01-10 09:01:00 +03:00
|
|
|
return std::move(takeoverData.value());
|
|
|
|
});
|
|
|
|
}
|
|
|
|
unmountPromise.setTry(
|
|
|
|
folly::makeTryWith([result = std::move(result)]() {
|
|
|
|
result.throwIfFailed();
|
|
|
|
return Unit{};
|
|
|
|
}));
|
|
|
|
});
|
2017-08-02 06:45:57 +03:00
|
|
|
}
|
|
|
|
|
2016-05-20 20:33:42 +03:00
|
|
|
EdenServer::MountList EdenServer::getMountPoints() const {
  // Return a snapshot of the EdenMount objects for all active mount points.
  MountList mounts;
  {
    const auto lockedMounts = mountPoints_.rlock();
    for (const auto& pathAndInfo : *lockedMounts) {
      mounts.emplace_back(pathAndInfo.second.edenMount);
    }
  }
  return mounts;
}
|
|
|
|
|
2016-06-14 01:15:31 +03:00
|
|
|
shared_ptr<EdenMount> EdenServer::getMount(StringPiece mountPath) const {
  // Look up the mount for mountPath, converting "not found" into an
  // EdenError so thrift callers get a meaningful failure.
  if (auto edenMount = getMountOrNull(mountPath)) {
    return edenMount;
  }
  throw EdenError(folly::to<string>(
      "mount point \"", mountPath, "\" is not known to this eden instance"));
}
|
|
|
|
|
|
|
|
shared_ptr<EdenMount> EdenServer::getMountOrNull(StringPiece mountPath) const {
  // Return the EdenMount for mountPath, or nullptr if nothing is mounted
  // there.
  const auto lockedMounts = mountPoints_.rlock();
  const auto entry = lockedMounts->find(mountPath);
  return entry != lockedMounts->end() ? entry->second.edenMount : nullptr;
}
|
|
|
|
|
2016-06-14 01:15:31 +03:00
|
|
|
shared_ptr<BackingStore> EdenServer::getBackingStore(
    StringPiece type,
    StringPiece name) {
  // Return the cached BackingStore for (type, name), creating and caching
  // a new one on first use.
  //
  // The backingStores_ lock is held across createBackingStore() so that
  // concurrent callers cannot create duplicate stores for the same key.
  //
  // Note: this previously used the legacy SYNCHRONIZED() macro, which
  // forced an unreachable XLOG(FATAL) after the block just to satisfy the
  // compiler's return-path analysis.  Using wlock() (as the rest of this
  // file already does for mountPoints_) removes that dead code.
  BackingStoreKey key{type.str(), name.str()};
  const auto lockedStores = backingStores_.wlock();
  const auto it = lockedStores->find(key);
  if (it != lockedStores->end()) {
    return it->second;
  }

  const auto store = createBackingStore(type, name);
  lockedStores->emplace(key, store);
  return store;
}
|
|
|
|
|
|
|
|
shared_ptr<BackingStore> EdenServer::createBackingStore(
    StringPiece type,
    StringPiece name) {
  // Construct a new BackingStore of the requested type.  For the "hg" and
  // "git" store types "name" is the path to the on-disk repository.
  if (type == "null") {
    return make_shared<EmptyBackingStore>();
  }
  if (type == "hg") {
    const auto repoPath = realpath(name);
    return make_shared<HgBackingStore>(
        repoPath,
        localStore_.get(),
        serverState_->getThreadPool().get(),
        clientCertificate_,
        useMononoke_,
        mononokeTierName_);
  }
  if (type == "git") {
    const auto repoPath = realpath(name);
    return make_shared<GitBackingStore>(repoPath, localStore_.get());
  }
  throw std::domain_error(
      folly::to<string>("unsupported backing store type: ", type));
}
|
|
|
|
|
2018-06-26 22:05:27 +03:00
|
|
|
Future<Unit> EdenServer::createThriftServer() {
  // Build and configure the thrift server.  The returned future becomes
  // ready once the server has actually started serving requests.
  server_ = make_shared<ThriftServer>();
  server_->setMaxRequests(FLAGS_thrift_max_requests);
  server_->setNumIOWorkerThreads(FLAGS_thrift_num_workers);
  server_->setEnableCodel(FLAGS_thrift_enable_codel);
  server_->setMinCompressBytes(FLAGS_thrift_min_compress_bytes);

  handler_ = make_shared<EdenServiceHandler>(this);
  server_->setInterface(handler_);

  // Serve over a unix-domain socket inside the eden directory.
  const auto socketPath = edenDir_ + PathComponentPiece{kThriftSocketName};
  folly::SocketAddress address;
  address.setFromPath(socketPath.stringPiece());
  server_->setAddress(address);
  serverState_->setSocketPath(socketPath);

  // The event handler's future fires once the thrift server is up.
  serverEventHandler_ = make_shared<ThriftServerEventHandler>(this);
  server_->setServerEventHandler(serverEventHandler_);
  return serverEventHandler_->getThriftRunningFuture();
}
|
|
|
|
|
2017-11-20 22:34:37 +03:00
|
|
|
bool EdenServer::acquireEdenLock() {
|
2018-01-18 00:29:59 +03:00
|
|
|
const auto lockPath = edenDir_ + PathComponentPiece{kLockFileName};
|
2018-08-30 03:17:31 +03:00
|
|
|
lockFile_ = folly::File(lockPath.value(), O_WRONLY | O_CREAT | O_CLOEXEC);
|
2016-05-12 23:43:17 +03:00
|
|
|
if (!lockFile_.try_lock()) {
|
2017-11-20 22:34:37 +03:00
|
|
|
lockFile_.close();
|
|
|
|
return false;
|
2016-05-12 23:43:17 +03:00
|
|
|
}
|
Write the PID to the lockfile and update `eden health` to use it.
Summary:
We have encountered cases where `eden health` reported
`"edenfs not healthy: edenfs not running"` even though the `edenfs` process is
still running. Because the existing implementation of `eden health` bases its
health check on the output of a `getStatus()` Thrift call, it will erroneously
report `"edenfs not running"` even if Eden is running but its Thrift server is
not running. This type of false negative could occur if `edenfs` has shutdown
the Thrift server, but not the rest of the process (quite possibly, its
shutdown is blocked on calls to `umount2()`).
This is further problematic because `eden daemon` checks `eden health`
before attempting to start the daemon. If it gets a false negative, then
`eden daemon` will forge ahead, trying to launch a new instance of the daemon,
but it will fail with a nasty error like the following:
```
I1017 11:59:25.188414 3064499 main.cpp:81] Starting edenfs. UID=5256, GID=100, PID=3064499
terminate called after throwing an instance of 'std::runtime_error'
what(): another instance of Eden appears to be running for /home/mbolin/local/.eden
*** Aborted at 1508266765 (Unix time, try 'date -d 1508266765') ***
*** Signal 6 (SIGABRT) (0x1488002ec2b3) received by PID 3064499 (pthread TID 0x7fd0d3787d40) (linux TID 3064499) (maybe from PID 30644
99, UID 5256), stack trace: ***
@ 000000000290d3cd folly::symbolizer::(anonymous namespace)::signalHandler(int, siginfo_t*, void*)
@ 00007fd0d133cacf (unknown)
@ 00007fd0d093e7c8 __GI_raise
@ 00007fd0d0940590 __GI_abort
@ 00007fd0d1dfeecc __gnu_cxx::__verbose_terminate_handler()
@ 00007fd0d1dfcdc5 __cxxabiv1::__terminate(void (*)())
@ 00007fd0d1dfce10 std::terminate()
@ 00007fd0d1dfd090 __cxa_throw
@ 00000000015fe8ca facebook::eden::EdenServer::acquireEdenLock()
@ 000000000160f27b facebook::eden::EdenServer::prepare()
@ 00000000016107d5 facebook::eden::EdenServer::run()
@ 000000000042c4ee main
@ 00007fd0d0929857 __libc_start_main
@ 0000000000548ad8 _start
Aborted
```
By providing more accurate information to `eden daemon`, if the user tries to
run it while the daemon is already running, they will get a more polite error
like the following:
```
error: edenfs is already running (pid 274205)
```
This revision addresses this issue by writing the PID of `edenfs` in the
lockfile. It updated the implementation of `eden health` to use the PID in the
lockfile to assess the health of Eden if the call to `getStatus()` fails. It
does this by running:
```
ps -p PID -o comm=
```
and applying some heuristics on the output to assess whether the command
associated with that process is the `edenfs` command. If it is, then
`eden health` reports the status as `STOPPED` whereas previously it would report
it as `DEAD`.
Reviewed By: wez
Differential Revision: D6086473
fbshipit-source-id: 825421a6818b56ddd7deea257a92c070c2232bdd
2017-10-18 21:18:43 +03:00
|
|
|
|
|
|
|
// Write the PID (with a newline) to the lockfile.
|
2018-01-18 00:29:59 +03:00
|
|
|
const int fd = lockFile_.fd();
|
Write the PID to the lockfile and update `eden health` to use it.
Summary:
We have encountered cases where `eden health` reported
`"edenfs not healthy: edenfs not running"` even though the `edenfs` process is
still running. Because the existing implementation of `eden health` bases its
health check on the output of a `getStatus()` Thrift call, it will erroneously
report `"edenfs not running"` even if Eden is running but its Thrift server is
not running. This type of false negative could occur if `edenfs` has shutdown
the Thrift server, but not the rest of the process (quite possibly, its
shutdown is blocked on calls to `umount2()`).
This is further problematic because `eden daemon` checks `eden health`
before attempting to start the daemon. If it gets a false negative, then
`eden daemon` will forge ahead, trying to launch a new instance of the daemon,
but it will fail with a nasty error like the following:
```
I1017 11:59:25.188414 3064499 main.cpp:81] Starting edenfs. UID=5256, GID=100, PID=3064499
terminate called after throwing an instance of 'std::runtime_error'
what(): another instance of Eden appears to be running for /home/mbolin/local/.eden
*** Aborted at 1508266765 (Unix time, try 'date -d 1508266765') ***
*** Signal 6 (SIGABRT) (0x1488002ec2b3) received by PID 3064499 (pthread TID 0x7fd0d3787d40) (linux TID 3064499) (maybe from PID 30644
99, UID 5256), stack trace: ***
@ 000000000290d3cd folly::symbolizer::(anonymous namespace)::signalHandler(int, siginfo_t*, void*)
@ 00007fd0d133cacf (unknown)
@ 00007fd0d093e7c8 __GI_raise
@ 00007fd0d0940590 __GI_abort
@ 00007fd0d1dfeecc __gnu_cxx::__verbose_terminate_handler()
@ 00007fd0d1dfcdc5 __cxxabiv1::__terminate(void (*)())
@ 00007fd0d1dfce10 std::terminate()
@ 00007fd0d1dfd090 __cxa_throw
@ 00000000015fe8ca facebook::eden::EdenServer::acquireEdenLock()
@ 000000000160f27b facebook::eden::EdenServer::prepare()
@ 00000000016107d5 facebook::eden::EdenServer::run()
@ 000000000042c4ee main
@ 00007fd0d0929857 __libc_start_main
@ 0000000000548ad8 _start
Aborted
```
By providing more accurate information to `eden daemon`, if the user tries to
run it while the daemon is already running, they will get a more polite error
like the following:
```
error: edenfs is already running (pid 274205)
```
This revision addresses this issue by writing the PID of `edenfs` in the
lockfile. It updated the implementation of `eden health` to use the PID in the
lockfile to assess the health of Eden if the call to `getStatus()` fails. It
does this by running:
```
ps -p PID -o comm=
```
and applying some heuristics on the output to assess whether the command
associated with that process is the `edenfs` command. If it is, then
`eden health` reports the status as `STOPPED` whereas previously it would report
it as `DEAD`.
Reviewed By: wez
Differential Revision: D6086473
fbshipit-source-id: 825421a6818b56ddd7deea257a92c070c2232bdd
2017-10-18 21:18:43 +03:00
|
|
|
folly::ftruncateNoInt(fd, /* len */ 0);
|
2018-01-18 00:29:59 +03:00
|
|
|
const auto pidContents = folly::to<std::string>(getpid(), "\n");
|
Write the PID to the lockfile and update `eden health` to use it.
Summary:
We have encountered cases where `eden health` reported
`"edenfs not healthy: edenfs not running"` even though the `edenfs` process is
still running. Because the existing implementation of `eden health` bases its
health check on the output of a `getStatus()` Thrift call, it will erroneously
report `"edenfs not running"` even if Eden is running but its Thrift server is
not running. This type of false negative could occur if `edenfs` has shutdown
the Thrift server, but not the rest of the process (quite possibly, its
shutdown is blocked on calls to `umount2()`).
This is further problematic because `eden daemon` checks `eden health`
before attempting to start the daemon. If it gets a false negative, then
`eden daemon` will forge ahead, trying to launch a new instance of the daemon,
but it will fail with a nasty error like the following:
```
I1017 11:59:25.188414 3064499 main.cpp:81] Starting edenfs. UID=5256, GID=100, PID=3064499
terminate called after throwing an instance of 'std::runtime_error'
what(): another instance of Eden appears to be running for /home/mbolin/local/.eden
*** Aborted at 1508266765 (Unix time, try 'date -d 1508266765') ***
*** Signal 6 (SIGABRT) (0x1488002ec2b3) received by PID 3064499 (pthread TID 0x7fd0d3787d40) (linux TID 3064499) (maybe from PID 30644
99, UID 5256), stack trace: ***
@ 000000000290d3cd folly::symbolizer::(anonymous namespace)::signalHandler(int, siginfo_t*, void*)
@ 00007fd0d133cacf (unknown)
@ 00007fd0d093e7c8 __GI_raise
@ 00007fd0d0940590 __GI_abort
@ 00007fd0d1dfeecc __gnu_cxx::__verbose_terminate_handler()
@ 00007fd0d1dfcdc5 __cxxabiv1::__terminate(void (*)())
@ 00007fd0d1dfce10 std::terminate()
@ 00007fd0d1dfd090 __cxa_throw
@ 00000000015fe8ca facebook::eden::EdenServer::acquireEdenLock()
@ 000000000160f27b facebook::eden::EdenServer::prepare()
@ 00000000016107d5 facebook::eden::EdenServer::run()
@ 000000000042c4ee main
@ 00007fd0d0929857 __libc_start_main
@ 0000000000548ad8 _start
Aborted
```
By providing more accurate information to `eden daemon`, if the user tries to
run it while the daemon is already running, they will get a more polite error
like the following:
```
error: edenfs is already running (pid 274205)
```
This revision addresses this issue by writing the PID of `edenfs` in the
lockfile. It updated the implementation of `eden health` to use the PID in the
lockfile to assess the health of Eden if the call to `getStatus()` fails. It
does this by running:
```
ps -p PID -o comm=
```
and applying some heuristics on the output to assess whether the command
associated with that process is the `edenfs` command. If it is, then
`eden health` reports the status as `STOPPED` whereas previously it would report
it as `DEAD`.
Reviewed By: wez
Differential Revision: D6086473
fbshipit-source-id: 825421a6818b56ddd7deea257a92c070c2232bdd
2017-10-18 21:18:43 +03:00
|
|
|
folly::writeNoInt(fd, pidContents.data(), pidContents.size());
|
2017-11-20 22:34:37 +03:00
|
|
|
|
|
|
|
return true;
|
2016-05-12 23:43:17 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void EdenServer::prepareThriftAddress() {
|
|
|
|
// If we are serving on a local Unix socket, remove any old socket file
|
|
|
|
// that may be left over from a previous instance.
|
|
|
|
// We have already acquired the mount point lock at this time, so we know
|
|
|
|
// that any existing socket is unused and safe to remove.
|
|
|
|
const auto& addr = server_->getAddress();
|
|
|
|
if (addr.getFamily() != AF_UNIX) {
|
|
|
|
return;
|
|
|
|
}
|
2018-01-18 00:29:59 +03:00
|
|
|
const int rc = unlink(addr.getPath().c_str());
|
2016-05-12 23:43:17 +03:00
|
|
|
if (rc != 0 && errno != ENOENT) {
|
|
|
|
// This might happen if we don't have permission to remove the file.
|
|
|
|
folly::throwSystemError(
|
|
|
|
"unable to remove old Eden thrift socket ", addr.getPath());
|
|
|
|
}
|
|
|
|
}
|
2016-06-11 00:15:26 +03:00
|
|
|
|
2018-07-17 01:59:23 +03:00
|
|
|
// Stop the EdenServer's thrift service.
void EdenServer::stop() {
  // Cancel outstanding journal subscribers first: live subscription
  // streams would otherwise block server_->stop() from completing.
  shutdownSubscribers();
  server_->stop();
}
|
2017-08-01 06:49:35 +03:00
|
|
|
|
2017-11-20 02:18:29 +03:00
|
|
|
// Begin a graceful-restart ("takeover") shutdown.  Returns a future that is
// fulfilled with the TakeoverData to hand to the new edenfs process, or an
// error if a takeover cannot be started in the current state.
folly::Future<TakeoverData> EdenServer::startTakeoverShutdown() {
  // Make sure we aren't already shutting down, then update our state
  // to indicate that we should perform mount point takeover shutdown
  // once runServer() returns.
  {
    auto state = runningState_.wlock();
    if (state->state != RunState::RUNNING) {
      // We are either still in the process of starting,
      // or already shutting down.
      return makeFuture<TakeoverData>(std::runtime_error(folly::to<string>(
          "can only perform graceful restart when running normally; "
          "current state is ",
          static_cast<int>(state->state))));
    }
    if (state->takeoverShutdown) {
      // This can happen if startTakeoverShutdown() is called twice
      // before runServer() exits.
      return makeFuture<TakeoverData>(std::runtime_error(
          "another takeover shutdown has already been started"));
    }

    // Checked-and-set under the same lock, so only one caller wins.
    state->takeoverShutdown = true;

    // Make a copy of the thrift server socket so we can transfer it to the
    // new edenfs process. Our local thrift will close its own socket when we
    // stop the server. The easiest way to avoid completely closing the
    // server socket for now is simply by duplicating the socket to a new fd.
    // We will transfer this duplicated FD to the new edenfs process.
    const int takeoverThriftSocket = dup(server_->getListenSocket());
    folly::checkUnixError(
        takeoverThriftSocket,
        "error duplicating thrift server socket during graceful takeover");
    // folly::File takes ownership of the duplicated fd.
    state->takeoverThriftSocket =
        folly::File{takeoverThriftSocket, /* ownsFd */ true};
  }

  // Cancel journal subscribers; live streams would block server_->stop().
  shutdownSubscribers();

  // Stop the thrift server. We will fulfill takeoverPromise_ once it stops.
  server_->stop();
  return takeoverPromise_.getFuture();
}
|
2017-08-18 21:43:57 +03:00
|
|
|
|
2018-07-17 01:59:23 +03:00
|
|
|
void EdenServer::shutdownSubscribers() {
|
2018-03-17 00:35:46 +03:00
|
|
|
// TODO: Set a flag in handler_ to reject future subscription requests.
|
|
|
|
// Alternatively, have them seamless transfer through takeovers.
|
|
|
|
|
2018-01-11 00:01:22 +03:00
|
|
|
// If we have any subscription sessions from watchman, we want to shut
|
|
|
|
// those down now, otherwise they will block the server_->stop() call
|
|
|
|
// below
|
|
|
|
XLOG(DBG1) << "cancel all subscribers prior to stopping thrift";
|
2018-07-17 01:59:23 +03:00
|
|
|
auto mountPoints = mountPoints_.wlock();
|
2018-01-11 00:01:22 +03:00
|
|
|
for (auto& entry : *mountPoints) {
|
|
|
|
auto& info = entry.second;
|
|
|
|
info.edenMount->getJournal().cancelAllSubscribers();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-02-09 03:33:32 +03:00
|
|
|
void EdenServer::flushStatsNow() {
|
2018-04-23 23:10:31 +03:00
|
|
|
for (auto& stats : serverState_->getStats().accessAllThreads()) {
|
2017-08-18 21:43:57 +03:00
|
|
|
stats.aggregate();
|
|
|
|
}
|
|
|
|
}
|
2018-06-20 18:56:26 +03:00
|
|
|
|
|
|
|
void EdenServer::reportProcStats() {
|
|
|
|
auto now = std::chrono::system_clock::now().time_since_epoch();
|
|
|
|
// Throttle stats collection to every kMemoryPollSeconds
|
|
|
|
if (std::chrono::duration_cast<std::chrono::seconds>(
|
|
|
|
now - lastProcStatsRun_.load()) > kMemoryPollSeconds) {
|
|
|
|
auto privateBytes = facebook::eden::proc_util::calculatePrivateBytes();
|
|
|
|
if (privateBytes) {
|
|
|
|
stats::ServiceData::get()->addStatValue(
|
|
|
|
kPrivateBytes, privateBytes.value(), stats::AVG);
|
|
|
|
}
|
|
|
|
|
2018-06-21 00:51:50 +03:00
|
|
|
auto rssKBytes = facebook::eden::proc_util::getUnsignedLongLongValue(
|
2018-06-20 18:56:26 +03:00
|
|
|
proc_util::loadProcStatus(), kVmRSSKey.data(), kKBytes.data());
|
2018-06-21 00:51:50 +03:00
|
|
|
if (rssKBytes) {
|
2018-06-20 18:56:26 +03:00
|
|
|
stats::ServiceData::get()->addStatValue(
|
2018-06-21 00:51:50 +03:00
|
|
|
kRssBytes, rssKBytes.value() * 1024, stats::AVG);
|
2018-06-20 18:56:26 +03:00
|
|
|
}
|
|
|
|
lastProcStatsRun_.store(now);
|
|
|
|
}
|
|
|
|
}
|
2018-07-10 02:53:17 +03:00
|
|
|
|
2017-10-18 21:18:36 +03:00
|
|
|
} // namespace eden
|
|
|
|
} // namespace facebook
|