fix eden doctor to avoid interfering with other edenfs instances

Summary:
Only treat edenfs mount points as stale if attempting to access them fails with
ENOTCONN.  If we can successfully access the mount point then it cannot be
stale and does not need to be unmounted.

This prevents `eden doctor` from incorrectly unmounting mount points owned by
another edenfs process running on this system.

Reviewed By: bolinfest

Differential Revision: D7683503

fbshipit-source-id: 412cc17442fac1e8030dd40de0e0f642c15d1253
This commit is contained in:
Adam Simpkins 2018-04-25 17:02:19 -07:00 committed by Facebook Github Bot
parent 2812d73d2c
commit c21571f23a
2 changed files with 54 additions and 45 deletions

View File

@ -93,8 +93,8 @@ def cure_what_ails_you(
if mount_path not in config.get_mount_paths():
# TODO: if there are mounts in active_mount_points that aren't in
# config.get_mount_paths(), should we try to add them to the config?
# I've only seen this happen in the wild if a clone fails partway, for
# example, if a post-clone hook fails.
# I've only seen this happen in the wild if a clone fails partway,
# for example, if a post-clone hook fails.
continue
# For now, we assume that each mount_path is actively mounted. We should
@ -178,10 +178,9 @@ class StaleMountsCheck(Check):
self._mount_table = mount_table
def do_check(self, dry_run: bool) -> CheckResult:
active_mount_devices: List[int] = []
for amp in self._active_mount_points:
try:
active_mount_devices.append(self._mount_table.lstat(amp).st_dev)
self._mount_table.lstat(amp).st_dev
except OSError as e:
# If dry_run, should this return NOT_FIXED_BECAUSE_DRY_RUN?
return CheckResult(
@ -189,14 +188,13 @@ class StaleMountsCheck(Check):
f'Failed to lstat active eden mount {amp}\n'
)
stale_mounts = self.get_all_stale_eden_mount_points(
active_mount_devices
)
stale_mounts = self.get_all_stale_eden_mount_points()
if not stale_mounts:
return CheckResult(CheckResultType.NO_ISSUE, '')
if dry_run:
message = f'Found {len(stale_mounts)} stale edenfs mount point{"s" if len(stale_mounts) != 1 else ""}:\n'
message = (f'Found {len(stale_mounts)} stale edenfs mount '
f'point{"s" if len(stale_mounts) != 1 else ""}:\n')
for mp in stale_mounts:
message += f' {printable_bytes(mp)}\n'
message += 'Not unmounting because dry run.\n'
@ -216,7 +214,7 @@ class StaleMountsCheck(Check):
# Use a refreshed list -- it's possible MNT_DETACH succeeded on some of
# the points.
for mp in self.get_all_stale_eden_mount_points(active_mount_devices):
for mp in self.get_all_stale_eden_mount_points():
if self._mount_table.unmount_force(mp):
unmounted.append(mp)
else:
@ -225,27 +223,33 @@ class StaleMountsCheck(Check):
if failed_to_unmount:
message = ''
if len(unmounted):
message += f'Successfully unmounted {len(unmounted)} mount point{"s" if len(unmounted) != 1 else ""}:\n'
message += (f'Successfully unmounted {len(unmounted)} mount '
f'point{"s" if len(unmounted) != 1 else ""}:\n')
for mp in sorted(unmounted):
message += f' {printable_bytes(mp)}\n'
message += f'Failed to unmount {len(failed_to_unmount)} mount point{"s" if len(failed_to_unmount) != 1 else ""}:\n'
message += (f'Failed to unmount {len(failed_to_unmount)} mount '
f'point{"s" if len(failed_to_unmount) != 1 else ""}:\n')
for mp in sorted(failed_to_unmount):
message += f' {printable_bytes(mp)}\n'
return CheckResult(CheckResultType.FAILED_TO_FIX, message)
else:
message = f'Unmounted {len(stale_mounts)} stale edenfs mount point{"s" if len(stale_mounts) != 1 else ""}:\n'
message = (f'Unmounted {len(stale_mounts)} stale edenfs mount '
f'point{"s" if len(stale_mounts) != 1 else ""}:\n')
for mp in sorted(unmounted):
message += f' {printable_bytes(mp)}\n'
return CheckResult(CheckResultType.FIXED, message)
def get_all_stale_eden_mount_points(self, active_mount_devices: List[int]
) -> List[bytes]:
me = os.getuid()
def get_all_stale_eden_mount_points(self) -> List[bytes]:
stale_eden_mount_points: Set[bytes] = set()
for mount_point in self.get_all_eden_mount_points():
try:
st = self._mount_table.lstat(mount_point)
# All eden mounts should have a .eden directory.
# If the edenfs daemon serving this mount point has died we
# will get ENOTCONN when trying to access it. (Simply calling
# lstat() on the root directory itself can succeed even in this
# case.)
eden_dir = os.path.join(mount_point, b'.eden')
self._mount_table.lstat(eden_dir)
except OSError as e:
if e.errno == errno.ENOTCONN:
stale_eden_mount_points.add(mount_point)
@ -254,13 +258,6 @@ class StaleMountsCheck(Check):
f"Unclear whether {printable_bytes(mount_point)} "
f"is stale or not. lstat() failed: {e}"
)
else:
# Exclude any mounts that aren't owned by the current user and whose
# device ID matches the device ID of any existing mounts. Avoid
# excluding by path because the same FUSE mount can show up in the
# mount table multiple times if it's underneath a bind mount.
if st.st_uid == me and st.st_dev not in active_mount_devices:
stale_eden_mount_points.add(mount_point)
return sorted(stale_eden_mount_points)

View File

@ -615,26 +615,19 @@ class StaleMountsCheckTest(unittest.TestCase):
self.assertEqual([], self.mount_table.unmount_lazy_calls)
self.assertEqual([], self.mount_table.unmount_force_calls)
def test_stale_nonactive_mount_is_unmounted(self):
def test_working_nonactive_mount_is_not_unmounted(self):
# Add a working edenfs mount that is not part of our active list
self.mount_table.add_mount('/mnt/stale1')
self.mount_table.add_mount('/mnt/other1')
result = self.check.do_check(dry_run=False)
self.assertEqual(doctor.CheckResultType.FIXED, result.result_type)
self.assertEqual(
dedent(
'''\
Unmounted 1 stale edenfs mount point:
/mnt/stale1
'''
), result.message
)
self.assertEqual([b'/mnt/stale1'], self.mount_table.unmount_lazy_calls)
self.assertEqual('', result.message)
self.assertEqual(doctor.CheckResultType.NO_ISSUE, result.result_type)
self.assertEqual([], self.mount_table.unmount_lazy_calls)
self.assertEqual([], self.mount_table.unmount_force_calls)
def test_force_unmounts_if_lazy_fails(self):
self.mount_table.add_mount('/mnt/stale1')
self.mount_table.add_mount('/mnt/stale2')
self.mount_table.add_stale_mount('/mnt/stale1')
self.mount_table.add_stale_mount('/mnt/stale2')
self.mount_table.fail_unmount_lazy(b'/mnt/stale1')
result = self.check.do_check(dry_run=False)
@ -655,8 +648,8 @@ class StaleMountsCheckTest(unittest.TestCase):
self.assertEqual([b'/mnt/stale1'], self.mount_table.unmount_force_calls)
def test_dry_run_prints_stale_mounts_and_does_not_unmount(self):
self.mount_table.add_mount('/mnt/stale1')
self.mount_table.add_mount('/mnt/stale2')
self.mount_table.add_stale_mount('/mnt/stale1')
self.mount_table.add_stale_mount('/mnt/stale2')
result = self.check.do_check(dry_run=True)
self.assertEqual(
@ -676,8 +669,8 @@ class StaleMountsCheckTest(unittest.TestCase):
self.assertEqual([], self.mount_table.unmount_force_calls)
def test_fails_if_unmount_fails(self):
self.mount_table.add_mount('/mnt/stale1')
self.mount_table.add_mount('/mnt/stale2')
self.mount_table.add_stale_mount('/mnt/stale1')
self.mount_table.add_stale_mount('/mnt/stale2')
self.mount_table.fail_unmount_lazy(b'/mnt/stale1', b'/mnt/stale2')
self.mount_table.fail_unmount_force(b'/mnt/stale1')
@ -713,7 +706,9 @@ class StaleMountsCheckTest(unittest.TestCase):
self.assertEqual([], self.mount_table.unmount_force_calls)
def test_gives_up_if_cannot_stat_active_mount(self):
del self.mount_table.stats['/mnt/active1']
self.mount_table.fail_access('/mnt/active1', errno.ENOENT)
self.mount_table.fail_access('/mnt/active1/.eden', errno.ENOENT)
result = self.check.do_check(dry_run=False)
self.assertEqual(
doctor.CheckResultType.FAILED_TO_FIX, result.result_type
@ -727,6 +722,7 @@ class StaleMountsCheckTest(unittest.TestCase):
def test_does_not_unmount_if_cannot_stat_stale_mount(self, warning):
self.mount_table.add_mount('/mnt/stale1')
self.mount_table.fail_access('/mnt/stale1', errno.EACCES)
self.mount_table.fail_access('/mnt/stale1/.eden', errno.EACCES)
result = self.check.do_check(dry_run=False)
self.assertEqual(doctor.CheckResultType.NO_ISSUE, result.result_type)
@ -740,8 +736,7 @@ class StaleMountsCheckTest(unittest.TestCase):
)
def test_does_unmount_if_stale_mount_is_unconnected(self):
self.mount_table.add_mount('/mnt/stale1')
self.mount_table.fail_access('/mnt/stale1', errno.ENOTCONN)
self.mount_table.add_stale_mount('/mnt/stale1')
result = self.check.do_check(dry_run=False)
self.assertEqual(doctor.CheckResultType.FIXED, result.result_type)
@ -883,6 +878,23 @@ class FakeMountTable(mtab.MountTable):
self._add_mount_info(path, device=device, vfstype=vfstype)
self.stats[path] = mtab.MTStat(st_uid=uid, st_dev=dev)
if device == 'edenfs':
self.stats[os.path.join(path, '.eden')] = mtab.MTStat(
st_uid=uid, st_dev=dev
)
def add_stale_mount(
self,
path: str,
uid: Optional[int] = None,
dev: Optional[int] = None,
) -> None:
# Stale mounts are always edenfs FUSE mounts
self.add_mount(path, uid=uid, dev=dev)
# Stale mounts still successfully respond to stat() calls for the root
# directory itself, but fail stat() calls to any other path with
# ENOTCONN
self.fail_access(os.path.join(path, '.eden'), errno.ENOTCONN)
def fail_access(self, path: str, errnum: int) -> None:
self.stats[path] = OSError(errnum, os.strerror(errnum))