2017-11-20 22:34:37 +03:00
|
|
|
#!/usr/bin/env python3
|
2019-06-20 02:58:25 +03:00
|
|
|
# Copyright (c) Facebook, Inc. and its affiliates.
|
2017-11-20 22:34:37 +03:00
|
|
|
#
|
2019-06-20 02:58:25 +03:00
|
|
|
# This software may be used and distributed according to the terms of the
|
|
|
|
# GNU General Public License version 2.
|
2017-11-20 22:34:37 +03:00
|
|
|
|
|
|
|
import os
|
2018-01-10 09:01:00 +03:00
|
|
|
import resource
|
2020-04-20 22:26:18 +03:00
|
|
|
import signal
|
2017-11-20 22:34:37 +03:00
|
|
|
import sys
|
2018-03-29 08:10:46 +03:00
|
|
|
import threading
|
2020-04-20 22:26:18 +03:00
|
|
|
from multiprocessing import Process
|
2019-03-09 06:02:42 +03:00
|
|
|
from pathlib import Path
|
2020-04-20 22:26:18 +03:00
|
|
|
from typing import Dict, Optional
|
2017-11-20 22:34:37 +03:00
|
|
|
|
2020-04-20 22:26:18 +03:00
|
|
|
import pexpect
|
|
|
|
from eden.fs.cli.util import get_pid_using_lockfile, poll_until
|
2020-06-09 20:07:15 +03:00
|
|
|
from eden.thrift.legacy import EdenClient
|
2020-04-20 22:26:18 +03:00
|
|
|
from facebook.eden.ttypes import FaultDefinition, UnblockFaultArg
|
|
|
|
from fb303_core.ttypes import fb303_status
|
2019-01-22 21:44:24 +03:00
|
|
|
|
2017-11-20 22:34:37 +03:00
|
|
|
from .lib import testcase
|
2020-04-20 22:26:18 +03:00
|
|
|
from .lib.find_executables import FindExe
|
2017-11-20 22:34:37 +03:00
|
|
|
|
|
|
|
|
|
|
|
@testcase.eden_repo_test
# pyre-ignore[13]: T62487924
class TakeoverTest(testcase.EdenRepoTest):
    """Integration tests for edenfs graceful restart (takeover).

    Most tests open files or directories inside the mount, call
    ``self.eden.graceful_restart()``, and then verify that the handles,
    inode numbers and contents are still usable afterwards.
    """

    # Set by populate_repo().
    pagesize: int
    page1: str
    page2: str
    commit1: str
    commit2: str
    enable_fault_injection: bool = True

    def populate_repo(self) -> None:
        """Create two commits; ``tree/hello`` is exactly two pages long."""
        self.pagesize = resource.getpagesize()
        self.page1 = "1" * self.pagesize
        self.page2 = "2" * self.pagesize
        two_pages = self.page1 + self.page2
        self.repo.write_file("tree/hello", two_pages)
        self.repo.write_file("tree/deleted", two_pages)
        self.repo.write_file("src/main.c", "hello world")
        self.commit1 = self.repo.commit("Initial commit.")

        self.repo.write_file("src/main.c", "hello world v2")
        self.repo.write_file("src/test/test1.py", "test1")
        self.repo.write_file("src/test/test2.py", "test2")
        self.commit2 = self.repo.commit("Initial commit.")

    def select_storage_engine(self) -> str:
        """Return "sqlite": we need to persist data across restarts."""
        return "sqlite"

    def edenfs_logging_settings(self) -> Dict[str, str]:
        """Return the log category -> level mapping to use for this test."""
        if self._testMethodName == "test_takeover_with_io":
            # test_takeover_with_io causes lots of I/O, so do not enable
            # verbose logging of I/O operations in this test.
            return {}
        return {
            "eden.strace": "DBG7",
            "eden.fs.fuse": "DBG7",
            "eden.fs.inodes.InodeMap": "DBG6",
            "eden.fs.takeover": "DBG7",
            "eden.fs.service": "DBG4",
        }

    def _assert_src_test_listing(self) -> None:
        """Assert that ``src/test`` in the mount lists exactly the two test files."""
        self.assertEqual(
            ["test1.py", "test2.py"],
            sorted(os.listdir(os.path.join(self.mount, "src", "test"))),
        )

    def do_takeover_test(self) -> None:
        """Hold handles to live and unlinked files/dirs across a graceful restart.

        Shared body of test_takeover and test_takeover_after_diff_revisions.
        """
        hello = os.path.join(self.mount, "tree/hello")
        deleted = os.path.join(self.mount, "tree/deleted")
        deleted_local = os.path.join(self.mount, "deleted-local")

        # To test our handling of unlinked inodes, in addition
        # to unlinking something that is in the manifest we
        # need to check that we handle the case of a local
        # file being deleted to make sure that we cover both
        # code paths for FileInode.
        with open(deleted_local, "w") as dl:
            dl.write(self.page1)
            dl.write(self.page2)

        # We'd like to make sure that we do something reasonable
        # for directories that have been unlinked and that are
        # still referenced via a file descriptor.  Ideally we'd call
        # opendir() here and then readdir() it after we've performed
        # the graceful restart, but we can't directly call those
        # functions from python.  The approach used here is to
        # open a file descriptor to the directory and then try
        # to stat() it after the restart.  Since the directory
        # has to be empty in order to be unlinked, a readdir
        # from it wouldn't return any interesting results anyway.
        deleted_dir = os.path.join(self.mount, "deleted-dir")
        os.mkdir(deleted_dir)
        deleted_dir_fd = os.open(deleted_dir, os.O_RDONLY)
        # From here on the descriptor must be closed even if an assertion
        # or the restart itself fails, otherwise it leaks into later tests.
        try:
            os.rmdir(deleted_dir)

            with open(hello, "r") as f, open(deleted, "r") as d, open(
                deleted_local, "r"
            ) as dl:
                # Read the first page only (rather than the whole file)
                # before we restart the process.
                # This is so that we can check that the kernel really
                # does call in to us for the second page and that we're
                # really servicing the read for the second page and that
                # it isn't just getting served from the kernel buffer cache
                self.assertEqual(self.page1, f.read(self.pagesize))

                # Let's make sure that unlinked inodes continue to
                # work appropriately too.  We've opened the file
                # handles and are holding them alive in `d` and `dl`,
                # so now let's unlink it from the filesystem
                os.unlink(deleted)
                os.unlink(deleted_local)

                print("=== beginning restart ===", file=sys.stderr)
                self.eden.graceful_restart()
                print("=== restart complete ===", file=sys.stderr)

                # Ensure that our file handle is still live across
                # the restart boundary
                f.seek(0)
                self.assertEqual(self.page1, f.read(self.pagesize))
                self.assertEqual(self.page2, f.read(self.pagesize))

                # We should be able to read from the `d` file handle
                # even though we deleted the file from the tree
                self.assertEqual(self.page1, d.read(self.pagesize))
                self.assertEqual(self.page2, d.read(self.pagesize))
                # Likewise for the `dl` file handle
                self.assertEqual(self.page1, dl.read(self.pagesize))
                self.assertEqual(self.page2, dl.read(self.pagesize))

            # Now check that the unlinked directory handle still seems
            # connected.  This is difficult to do directly in python;
            # the directory had to be empty in order to be removed
            # so even if we could read its directory entries there
            # wouldn't be anything to read.
            # Note that os.stat() will throw if the fd is deemed
            # bad either by the kernel or the eden instance,
            # so we're just calling it and discarding the return
            # value.
            os.stat(deleted_dir_fd)
        finally:
            os.close(deleted_dir_fd)

        # Let's also test opening the same file up again,
        # just to make sure that that is still working after
        # the graceful restart.
        with open(hello, "r") as f:
            self.assertEqual(self.page1, f.read(self.pagesize))
            self.assertEqual(self.page2, f.read(self.pagesize))

    def test_takeover(self) -> None:
        """Run the basic takeover scenario."""
        self.do_takeover_test()

    def test_takeover_after_diff_revisions(self) -> None:
        """Run the takeover scenario after a getScmStatusBetweenRevisions() call."""
        # Make a getScmStatusBetweenRevisions() call to Eden.
        # Previously this thrift call caused Eden to create temporary inode
        # objects outside of the normal root inode tree, and this would cause
        # Eden to crash when shutting down afterwards.
        with self.get_thrift_client() as client:
            client.getScmStatusBetweenRevisions(
                os.fsencode(self.mount),
                self.commit1.encode("utf-8"),
                self.commit2.encode("utf-8"),
            )

        self.do_takeover_test()

    def test_takeover_with_io(self) -> None:
        """Gracefully restart edenfs while several threads write to the mount."""
        num_threads = 4
        write_chunk_size = 1024 * 1024
        max_file_length = write_chunk_size * 100

        # TODO: Setting this higher than 1 currently makes it likely that
        # edenfs will crash during restart.
        # There are still some other bugs we need to track down in the restart
        # ordering.
        num_restarts = 1

        stop = threading.Event()
        bufs = [b"x" * write_chunk_size, b"y" * write_chunk_size]

        def do_io(thread_id: int, running_event: threading.Event) -> None:
            """Rewrite one file in a loop until ``stop`` is set."""
            path = os.path.join(self.mount, "src", "test", f"data{thread_id}.log")
            with open(path, "wb") as f:
                # Use raw file descriptors to avoid going through python's I/O
                # buffering code.
                fd = f.fileno()

                buf_idx = 0
                buf = bufs[buf_idx]
                offset = 0

                # Repeatedly write and rewrite the same file,
                # alternating between two different data buffers.
                running_event.set()
                while True:
                    os.pwrite(fd, buf, offset)
                    if stop.is_set():
                        return
                    offset += len(buf)
                    if offset >= max_file_length:
                        buf_idx += 1
                        buf = bufs[buf_idx % len(bufs)]
                        offset = 0

        # Log the mount points device ID at the start of the test
        # (Just in case anything hangs and we need to abort the mount
        # using /sys/fs/fuse/connections/<dev>/)
        st = os.lstat(self.mount)
        print(f"=== eden mount device={st.st_dev} ===", file=sys.stderr)

        # Start several threads doing I/O while we perform a takeover
        threads = []
        try:
            running_events = []
            for n in range(num_threads):
                running = threading.Event()
                thread = threading.Thread(target=do_io, args=(n, running))
                thread.start()
                threads.append(thread)
                running_events.append(running)

            # Wait until all threads have started and are doing I/O
            for event in running_events:
                event.wait()

            # Restart edenfs
            for n in range(num_restarts):
                print(f"=== beginning restart {n} ===", file=sys.stderr)
                self.eden.graceful_restart()
                print(f"=== restart {n} complete ===", file=sys.stderr)
        finally:
            # Always stop and reap the writer threads, even on failure.
            stop.set()
            for thread in threads:
                thread.join()

    def test_takeover_updates_process_id_in_lock_file(self) -> None:
        """The lock file pid matches the thrift-reported pid before and after."""
        self.assertEqual(
            self.eden.get_pid_via_thrift(),
            get_pid_using_lockfile(Path(self.eden.eden_dir)),
        )
        self.eden.graceful_restart()
        self.assertEqual(
            self.eden.get_pid_via_thrift(),
            get_pid_using_lockfile(Path(self.eden.eden_dir)),
        )

    def test_takeover_preserves_inode_numbers_for_open_nonmaterialized_files(
        self,
    ) -> None:
        """An open file keeps its inode number across restart and on re-open."""
        hello = os.path.join(self.mount, "tree/hello")

        fd = os.open(hello, os.O_RDONLY)
        try:
            inode_number = os.fstat(fd).st_ino

            self.eden.graceful_restart()

            self.assertEqual(inode_number, os.fstat(fd).st_ino)
        finally:
            os.close(fd)

        fd = os.open(hello, os.O_RDONLY)
        try:
            self.assertEqual(inode_number, os.fstat(fd).st_ino)
        finally:
            os.close(fd)

    def test_contents_are_the_same_if_handle_is_held_open(self) -> None:
        """Handles opened before the restart still read the expected contents."""
        with open(os.path.join(self.mount, "tree", "hello")) as c2_hello_file, open(
            os.path.join(self.mount, "src", "main.c")
        ) as c2_mainc_file:

            self.eden.graceful_restart()
            self.eden.run_cmd(
                "debug", "flush_cache", os.path.join("tree", "hello"), cwd=self.mount
            )
            self.eden.run_cmd(
                "debug", "flush_cache", os.path.join("src", "main.c"), cwd=self.mount
            )

            self.assertEqual(self.page1 + self.page2, c2_hello_file.read())
            self.assertEqual("hello world v2", c2_mainc_file.read())

    def test_readdir_after_graceful_restart(self) -> None:
        """Listing a directory works after a graceful restart."""
        # Ensure capability flags (e.g. FUSE_NO_OPENDIR_SUPPORT) survive
        # graceful restart
        self.eden.graceful_restart()
        self._assert_src_test_listing()

    def test_readdir_before_and_after_graceful_restart(self) -> None:
        """Listing a directory gives the same result before and after restart."""
        self._assert_src_test_listing()
        self.eden.graceful_restart()
        self._assert_src_test_listing()

    def test_takeover_doesnt_send_ping(self) -> None:
        """
        tests that if we try a takeover with a version that doesn't know
        how to accept a ping, we don't send one. This test should not fail
        in either case since it is running against a client that knows how
        to listen for a ping. It just is used to look at logs to make sure
        the correct code path is entered.
        """
        self.eden.fake_takeover_with_version(3)

    def test_takeover_failure(self) -> None:
        """The daemon reports healthy after a takeover with no ping response."""
        print("=== beginning restart ===", file=sys.stderr)
        self.eden.takeover_without_ping_response()
        print("=== restart complete ===", file=sys.stderr)
        self.assertTrue(self.eden.wait_for_is_healthy())

    def run_restart(self) -> "pexpect.spawn[bytes]":
        """Spawn ``<eden cli> restart --daemon-binary <fake edenfs>`` under pexpect.

        Returns the spawned pexpect child; its output is echoed to stdout.
        """
        restart_cmd = [
            FindExe.EDEN_CLI,
            "--config-dir",
            str(self.eden_dir),
            "--etc-eden-dir",
            str(self.etc_eden_dir),
            "--home-dir",
            str(self.home_dir),
            "restart",
            "--daemon-binary",
            FindExe.FAKE_EDENFS,
        ]

        print(f"Restarting eden: {restart_cmd!r}")
        return pexpect.spawn(
            restart_cmd[0], restart_cmd[1:], logfile=sys.stdout.buffer, timeout=5
        )

    def assert_restart_fails_with_in_progress_graceful_restart(
        self, client: EdenClient
    ) -> None:
        """Assert a plain restart is refused (exit status 4) and the daemon stays STOPPING."""
        pid = self.eden.get_pid_via_thrift()
        p = self.run_restart()
        p.expect_exact(
            f"The current edenfs daemon (pid {pid}) is in the middle of stopping."
            "\r\nUse --force if you want to forcibly restart the current daemon\r\n"
        )
        p.wait()
        self.assertEqual(p.exitstatus, 4)

        self.assertEqual(client.getStatus(), fb303_status.STOPPING)

    def assert_shutdown_fails_with_in_progress_graceful_restart(
        self, client: EdenClient
    ) -> None:
        """Assert initiateShutdown() does not raise and the daemon stays STOPPING."""
        # call initiateShutdown. This should not throw.
        try:
            client.initiateShutdown("shutdown requested during graceful restart")
        except Exception:
            self.fail(
                "initiateShutdown should not throw when graceful restart is in progress"
            )

        self.assertEqual(client.getStatus(), fb303_status.STOPPING)

    def assert_sigkill_fails_with_in_progress_graceful_restart(
        self, client: EdenClient
    ) -> None:
        """Assert a termination signal leaves the daemon in the STOPPING state.

        Note: despite the method name, the signal sent is SIGTERM, not SIGKILL.
        """
        # send SIGTERM to process. This should not throw.
        pid = self.eden.get_pid_via_thrift()
        try:
            os.kill(pid, signal.SIGTERM)
        except Exception:
            self.fail(
                "sending SIGTERM should not throw when graceful restart is in progress"
            )

        self.assertEqual(client.getStatus(), fb303_status.STOPPING)

    def test_stop_during_takeover(self) -> None:
        """Restart, shutdown and SIGTERM are all refused during a blocked takeover."""
        # block graceful restart
        with self.eden.get_thrift_client() as client:
            client.injectFault(
                FaultDefinition(
                    keyClass="takeover", keyValueRegex="server_shutdown", block=True
                )
            )

            self.eden.wait_for_is_healthy()

            # Run a graceful restart
            # This won't succeed until we unblock the shutdown.
            p = Process(target=self.eden.graceful_restart)
            p.start()

            # Wait for the state to be shutting down
            def state_shutting_down() -> Optional[bool]:
                if not p.is_alive():
                    raise Exception(
                        "eden restart --graceful command finished while "
                        "graceful restart was still blocked"
                    )
                # Compare by value, as the assertEqual checks in the helper
                # methods do; identity comparison is not a reliable test here.
                if client.getStatus() == fb303_status.STOPPING:
                    return True
                return None

            poll_until(state_shutting_down, timeout=60)

            # Normal restart should be rejected while a graceful restart
            # is in progress
            self.assert_restart_fails_with_in_progress_graceful_restart(client)

            # Normal shutdown should be rejected while a graceful restart
            # is in progress
            self.assert_shutdown_fails_with_in_progress_graceful_restart(client)

            # Getting SIGTERM should not kill process while a graceful restart is in
            # progress
            self.assert_sigkill_fails_with_in_progress_graceful_restart(client)

            # Unblock the server shutdown and wait for the graceful restart to complete.
            client.unblockFault(
                UnblockFaultArg(keyClass="takeover", keyValueRegex="server_shutdown")
            )

            p.join()
|
|
|
|
|
2019-03-13 05:25:54 +03:00
|
|
|
|
|
|
|
@testcase.eden_repo_test
class TakeoverRocksDBStressTest(testcase.EdenRepoTest):
    """Graceful-restart test that runs with the "rocksdb" storage engine."""

    enable_fault_injection: bool = True

    def populate_repo(self) -> None:
        """Commit a single empty file under ``test-directory``."""
        self.repo.write_file("test-directory/file", "")
        # pyre-fixme[16]: `TakeoverRocksDBStressTest` has no attribute `commit1`.
        self.commit1 = self.repo.commit("Initial commit.")

    def select_storage_engine(self) -> str:
        """Select the "rocksdb" storage engine for this test class."""
        return "rocksdb"

    def test_takeover_with_tree_inode_loading_from_local_store(self) -> None:
        """
        Restart edenfs while a tree inode is being loaded asynchronously. Ensure
        restarting does not deadlock.
        """
        graceful_restart_startup_time = 5.0
        delay_ms = int(graceful_restart_startup_time * 1000)
        delayed_key_classes = ("local store get single", "local store get batch")

        # Delay local store reads so a load is still pending during the restart.
        with self.eden.get_thrift_client() as client:
            for fault_key_class in delayed_key_classes:
                fault = FaultDefinition(
                    keyClass=fault_key_class,
                    keyValueRegex=".*",
                    delayMilliseconds=delay_ms,
                    count=100,
                )
                client.injectFault(fault)

        # Make edenfs start loading "/test-directory" from the local store.
        #
        # To ensure that the local store is in use during takeover, load the tree
        # inode using a prefetch.
        #
        # At the time of writing, os.listdir("foo") causes edenfs to prefetch
        # the tree inodes of foo/*. Exploit this to load the tree inode for
        # "/test-directory".
        #
        # Other options considered:
        #
        # * At the time of writing, if we load the tree inode using a FUSE
        #   request (e.g. os.stat), edenfs would wait for the FUSE request to
        #   finish before starting the inode shutdown procedure.
        #
        # * At the time of writing, 'edenfsctl prefetch' does not prefetch
        #   tree inodes asynchronously.
        os.listdir(self.mount)

        self.eden.graceful_restart()
|