sapling/eden/cli/doctor.py
Chad Austin 916f069b91 bring back eden doctor's stale mounts check - filter by st_uid and st_dev instead of path
Summary:
The prior implementation of StaleMountsCheck filtered by path
and did not correctly handle seeing the same FUSE mount multiple times
in the mount table. This occurred when an Eden mount was created
underneath a bind mount.

Now it only unmounts mounts where st_dev does not match the st_dev of
any active mounts, and where st_uid matches the current user.

Reviewed By: simpkins

Differential Revision: D6787618

fbshipit-source-id: 24e0f156cb74822500d91205349c0e6638c0340c
2018-01-25 15:14:58 -08:00

465 lines
17 KiB
Python

#!/usr/bin/env python3
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
import abc
import binascii
import json
import logging
import os
import subprocess
import sys
import eden.dirstate
from enum import Enum, auto
from textwrap import dedent
from typing import Dict, List, Set, TextIO, Union
from . import config as config_mod
from . import version
from . import mtab
# Module-level logger for the doctor command; used below to report lstat()
# failures encountered while scanning the mount table for stale mounts.
log = logging.getLogger('eden.cli.doctor')
class CheckResultType(Enum):
    """Outcome categories that a single doctor check can report."""

    # The check ran and found nothing that needs attention.
    NO_ISSUE = auto()

    # A problem was found and the check repaired it.
    FIXED = auto()

    # A problem was found, but nothing was changed because of --dry-run.
    NOT_FIXED_BECAUSE_DRY_RUN = auto()

    # A problem was found and is still present after the check finished.
    FAILED_TO_FIX = auto()

    # The check was skipped because the Eden daemon was not healthy.
    NO_CHECK_BECAUSE_EDEN_WAS_NOT_RUNNING = auto()
class CheckResult:
    """Pairs the outcome category of a check with the text to show the user.

    `message` is written to the output stream verbatim by the code that runs
    the checks, so it should end with a newline when it is non-empty.
    """

    __slots__ = ('result_type', 'message')

    def __init__(self, result_type: CheckResultType, message: str) -> None:
        self.result_type, self.message = result_type, message
class Check(abc.ABC):
    """Interface implemented by every individual doctor check."""

    @abc.abstractmethod
    def do_check(self, dry_run: bool) -> CheckResult:
        """Run the check and report what was found.

        When `dry_run` is true the implementation must only diagnose and must
        not attempt any repair.
        """
def cure_what_ails_you(
    config: config_mod.Config,
    dry_run: bool,
    out: TextIO,
    mount_table: mtab.MountTable,
) -> int:
    """Run every doctor check, report the findings, and return an exit code.

    Args:
        config: used to query daemon health, list mounts over Thrift, and look
            up per-mount client info.
        dry_run: when true, checks only diagnose; they do not repair anything.
        out: stream that receives all human-readable output.
        mount_table: mount-table accessor handed to the stale-mount check.

    Returns:
        1 if at least one check reported FAILED_TO_FIX, otherwise 0.
    """
    is_healthy = config.check_health().is_healthy()
    active_mount_points: List[str] = []
    if is_healthy:
        with config.get_thrift_client() as client:
            active_mount_points = [
                entry.mountPoint
                for entry in client.listMounts()
                if entry.mountPoint is not None
            ]
    else:
        out.write(
            dedent(
                '''\
                Eden is not running: cannot perform all checks.
                To start Eden, run:
                eden daemon
                '''
            )
        )

    # Each entry is either literal text to echo to `out` or a check to execute.
    checks_and_messages: List[Union[str, Check]] = [
        StaleMountsCheck(active_mount_points, mount_table)
    ]
    if is_healthy:
        checks_and_messages.append(EdenfsIsLatest(config))
    else:
        out.write(
            'Cannot check if running latest edenfs because '
            'the daemon is not running.\n'
        )

    watchman_roots = _get_watch_roots_for_watchman()
    for mount_path in active_mount_points:
        # Every path from listMounts() is treated as actively mounted for now.
        # Ideally the Thrift API would say whether a mount point is active so
        # that information could be used here.
        per_mount_checks: List[Union[str, Check]] = [
            WatchmanUsingEdenSubscriptionCheck(
                mount_path, watchman_roots, is_healthy
            ),
            NuclideHasExpectedWatchmanSubscriptions(
                mount_path, watchman_roots, is_healthy
            ),
        ]
        client_info = config.get_client_info(mount_path)
        if client_info['scm_type'] == 'hg':
            per_mount_checks.append(
                SnapshotDirstateConsistencyCheck(
                    mount_path, client_info['snapshot'], is_healthy
                )
            )
        checks_and_messages.append(
            f'Performing {len(per_mount_checks)} checks for {mount_path}.\n'
        )
        checks_and_messages.extend(per_mount_checks)

    # Only these three outcomes influence the summary and the exit code.
    tally = {
        CheckResultType.FIXED: 0,
        CheckResultType.FAILED_TO_FIX: 0,
        CheckResultType.NOT_FIXED_BECAUSE_DRY_RUN: 0,
    }
    for entry in checks_and_messages:
        if isinstance(entry, str):
            out.write(entry)
        else:
            outcome = entry.do_check(dry_run)
            if outcome.result_type in tally:
                tally[outcome.result_type] += 1
            out.write(outcome.message)

    fixed = tally[CheckResultType.FIXED]
    failed = tally[CheckResultType.FAILED_TO_FIX]
    skipped_for_dry_run = tally[CheckResultType.NOT_FIXED_BECAUSE_DRY_RUN]

    if skipped_for_dry_run:
        out.write(
            'Number of issues discovered during --dry-run: '
            f'{skipped_for_dry_run}.\n'
        )
    if fixed:
        out.write(f'Number of fixes made: {fixed}.\n')
    if failed:
        out.write(
            'Number of issues that '
            f'could not be fixed: {failed}.\n'
        )
    if not failed and not skipped_for_dry_run:
        out.write('All is well.\n')

    return 1 if failed else 0
def printable_bytes(b: bytes) -> str:
    """Decode `b` as UTF-8 for display.

    Bytes that are not valid UTF-8 are rendered as backslash escape sequences
    instead of raising, so any byte string can be shown to the user.
    """
    return b.decode(encoding='utf-8', errors='backslashreplace')
class StaleMountsCheck(Check):
    """Find edenfs FUSE mounts that no active Eden mount accounts for.

    A mount is considered stale when it is owned by the current user and its
    device ID (st_dev) does not match the device ID of any active mount point.
    Unless running in dry-run mode, stale mounts are unmounted.
    """

    def __init__(self, active_mount_points: List[str],
                 mount_table: mtab.MountTable) -> None:
        self._active_mount_points = active_mount_points
        self._mount_table = mount_table

    @staticmethod
    def _plural(count: int) -> str:
        """Return the suffix that pluralizes "mount point" for `count`."""
        return 's' if count != 1 else ''

    @staticmethod
    def _format_mount_lines(mount_points: List[bytes]) -> str:
        """Render one line per mount point, in the order given."""
        return ''.join(f' {printable_bytes(mp)}\n' for mp in mount_points)

    def do_check(self, dry_run: bool) -> CheckResult:
        """Detect stale edenfs mounts and, unless `dry_run`, unmount them.

        Returns FAILED_TO_FIX if an active mount point cannot be lstat()ed or
        if any stale mount could not be unmounted, NOT_FIXED_BECAUSE_DRY_RUN
        if stale mounts exist but `dry_run` is set, FIXED if every remaining
        stale mount was unmounted, and NO_ISSUE if none were found.
        """
        active_mount_devices: List[int] = []
        for amp in self._active_mount_points:
            try:
                active_mount_devices.append(self._mount_table.lstat(amp).st_dev)
            except OSError:
                # If dry_run, should this return NOT_FIXED_BECAUSE_DRY_RUN?
                return CheckResult(
                    CheckResultType.FAILED_TO_FIX,
                    f'Failed to lstat active eden mount {amp}\n')

        stale_mounts = self.get_all_stale_eden_mount_points(active_mount_devices)
        if not stale_mounts:
            return CheckResult(CheckResultType.NO_ISSUE, '')

        if dry_run:
            message = (
                f'Found {len(stale_mounts)} stale edenfs mount '
                f'point{self._plural(len(stale_mounts))}:\n'
                + self._format_mount_lines(stale_mounts)
                + 'Not unmounting because dry run.\n'
            )
            return CheckResult(
                CheckResultType.NOT_FIXED_BECAUSE_DRY_RUN, message)

        unmounted = []
        failed_to_unmount = []

        # Attempt to lazy unmount all of them first. For some reason,
        # lazy unmount can sometimes release any bind mounts inside.
        for mp in stale_mounts:
            if self._mount_table.unmount_lazy(mp):
                unmounted.append(mp)

        # Use a refreshed list -- it's possible MNT_DETACH succeeded on some of
        # the points.
        for mp in self.get_all_stale_eden_mount_points(active_mount_devices):
            if self._mount_table.unmount_force(mp):
                unmounted.append(mp)
            else:
                failed_to_unmount.append(mp)

        if failed_to_unmount:
            message = ''
            if unmounted:
                message += (
                    f'Successfully unmounted {len(unmounted)} mount '
                    f'point{self._plural(len(unmounted))}:\n'
                    + self._format_mount_lines(sorted(unmounted))
                )
            message += (
                f'Failed to unmount {len(failed_to_unmount)} mount '
                f'point{self._plural(len(failed_to_unmount))}:\n'
                + self._format_mount_lines(sorted(failed_to_unmount))
            )
            return CheckResult(CheckResultType.FAILED_TO_FIX, message)

        # NOTE: the headline counts the stale mounts found initially, while the
        # list shows the mount points whose unmount call reported success.
        message = (
            f'Unmounted {len(stale_mounts)} stale edenfs mount '
            f'point{self._plural(len(stale_mounts))}:\n'
            + self._format_mount_lines(sorted(unmounted))
        )
        return CheckResult(CheckResultType.FIXED, message)

    def get_all_stale_eden_mount_points(
            self, active_mount_devices: List[int]) -> List[bytes]:
        """Return the sorted mount points of all stale edenfs mounts.

        Mount points that cannot be lstat()ed are logged and skipped.
        """
        me = os.getuid()
        active_devices = set(active_mount_devices)
        stale_eden_mount_points: Set[bytes] = set()
        for mount_point in self.get_all_eden_mount_points():
            try:
                st = self._mount_table.lstat(mount_point)
            except OSError as e:
                # mount_point is bytes: decode it for display rather than
                # letting the formatter emit its b'...' repr.
                log.warning(
                    'lstat("%s") failed: %s', printable_bytes(mount_point), e)
                continue

            # Keep only mounts owned by the current user whose device ID does
            # not match the device ID of any active mount. Avoid filtering by
            # path because the same FUSE mount can show up in the mount table
            # multiple times if it's underneath a bind mount.
            if st.st_uid == me and st.st_dev not in active_devices:
                stale_eden_mount_points.add(mount_point)
        return sorted(stale_eden_mount_points)

    def get_all_eden_mount_points(self) -> Set[bytes]:
        """Return the mount points of every edenfs FUSE entry in the mount table."""
        return {
            mount.mount_point
            for mount in self._mount_table.read()
            if mount.device == b'edenfs' and mount.vfstype == b'fuse'
        }
class WatchmanUsingEdenSubscriptionCheck(Check):
    """Verify that Watchman watches an Eden mount with its "eden" watcher.

    If Watchman is watching the path with some other watcher, the watch is
    deleted and re-created in the hope that the "eden" watcher is selected.
    """

    def __init__(self, path: str, watchman_roots: Set[str],
                 is_healthy: bool) -> None:
        self._path = path
        self._watchman_roots = watchman_roots
        self._is_healthy = is_healthy
        # Watcher name reported by Watchman before any repair; None until
        # do_check() has queried it.
        self._watcher = None

    def do_check(self, dry_run: bool) -> CheckResult:
        """Inspect (and, unless `dry_run`, repair) the Watchman watcher."""
        if not self._is_healthy:
            return self._report(
                CheckResultType.NO_CHECK_BECAUSE_EDEN_WAS_NOT_RUNNING
            )

        # Nothing to verify when Watchman is not watching this path at all.
        if self._path not in self._watchman_roots:
            return self._report(CheckResultType.NO_ISSUE)

        self._watcher = _call_watchman(
            ['watch-project', self._path]
        ).get('watcher')
        if self._watcher == 'eden':
            return self._report(CheckResultType.NO_ISSUE)

        # From here on there is a known problem.
        if dry_run:
            return self._report(CheckResultType.NOT_FIXED_BECAUSE_DRY_RUN)

        # Drop the existing watch and establish a new one; with luck the new
        # watch uses the Eden watcher.
        _call_watchman(['watch-del', self._path])
        new_watcher = _call_watchman(
            ['watch-project', self._path]
        ).get('watcher')
        outcome = (
            CheckResultType.FIXED
            if new_watcher == 'eden'
            else CheckResultType.FAILED_TO_FIX
        )
        return self._report(outcome)

    def _report(self, result_type: CheckResultType) -> CheckResult:
        """Build the CheckResult, with a message for the noteworthy outcomes."""
        old_watcher = self._watcher or '(unknown)'
        messages = {
            CheckResultType.FIXED: (
                f'Previous Watchman watcher for {self._path} was '
                f'"{old_watcher}" but is now "eden".\n'
            ),
            CheckResultType.FAILED_TO_FIX: (
                f'Watchman Watcher for {self._path} was {old_watcher} '
                'and we failed to replace it with an "eden" watcher.\n'
            ),
            CheckResultType.NOT_FIXED_BECAUSE_DRY_RUN: (
                f'Watchman Watcher for {self._path} was {old_watcher} '
                'but nothing was done because --dry-run was specified.\n'
            ),
        }
        return CheckResult(result_type, messages.get(result_type, ''))
class NuclideHasExpectedWatchmanSubscriptions(Check):
    """Detect a Nuclide session that lacks its file-watcher subscription.

    This check can only diagnose the problem; it has no automatic repair.
    """

    def __init__(self, path: str, watchman_roots: Set[str],
                 is_healthy: bool) -> None:
        self._path = path
        self._watchman_roots = watchman_roots
        self._is_healthy = is_healthy

    def do_check(self, dry_run: bool) -> CheckResult:
        """Inspect the Watchman subscriptions registered for this mount.

        Returns NO_ISSUE when Nuclide does not appear to be in use or when the
        expected subscription exists; otherwise NOT_FIXED_BECAUSE_DRY_RUN or
        FAILED_TO_FIX depending on `dry_run`.
        """
        if not self._is_healthy:
            return self._report(
                CheckResultType.NO_CHECK_BECAUSE_EDEN_WAS_NOT_RUNNING
            )

        if self._path not in self._watchman_roots:
            return self._report(CheckResultType.NO_ISSUE)

        subscriptions = _call_watchman(['debug-get-subscriptions', self._path])
        subscribers = subscriptions.get('subscribers', [])
        # A subscriber may have no 'info' or no 'name'. Skip those: a None in
        # `names` would make the startswith() scan below raise AttributeError.
        names = set()
        for subscriber in subscribers:
            name = subscriber.get('info', {}).get('name')
            if isinstance(name, str):
                names.add(name)

        # We use the presence of this in `names` as a heuristic as to whether
        # files from self._path are being edited in Nuclide.
        if 'hg-repository-watchman-subscription-primary' not in names:
            return self._report(CheckResultType.NO_ISSUE)

        # Either a subscription for the whole repo or one for any directory
        # beneath it counts as the expected subscription.
        whole_repo_subscription = f'filewatcher-{self._path}'
        subrepo_subscription_prefix = whole_repo_subscription + '/'
        has_proper_watchman_subscription = (
            whole_repo_subscription in names
            or any(s.startswith(subrepo_subscription_prefix) for s in names)
        )

        if has_proper_watchman_subscription:
            return self._report(CheckResultType.NO_ISSUE)
        if dry_run:
            return self._report(CheckResultType.NOT_FIXED_BECAUSE_DRY_RUN)
        return self._report(CheckResultType.FAILED_TO_FIX)

    def _report(self, result_type: CheckResultType) -> CheckResult:
        """Build the CheckResult; only problem outcomes carry a message."""
        if result_type in (
            CheckResultType.FAILED_TO_FIX,
            CheckResultType.NOT_FIXED_BECAUSE_DRY_RUN,
        ):
            msg = (
                f'Nuclide appears to be used to edit {self._path},\n'
                'but a key Watchman subscription appears to be missing.\n'
                'This can cause file changes to fail to show up in Nuclide.\n'
                'Currently, the only workaround this is to run\n'
                '"Nuclide Remote Projects: Kill And Restart" from the\n'
                'command palette in Atom.\n'
            )
        else:
            msg = ''
        return CheckResult(result_type, msg)
class SnapshotDirstateConsistencyCheck(Check):
    """Compare the first parent in .hg/dirstate with Eden's snapshot hash.

    A mismatch is reported but never repaired by this check.
    """

    def __init__(self, path: str, snapshot_hex: str, is_healthy: bool) -> None:
        self._path = path
        self._snapshot_hex = snapshot_hex
        self._is_healthy = is_healthy

    def do_check(self, dry_run: bool) -> CheckResult:
        """Read the dirstate parents and compare p1 against the snapshot."""
        if not self._is_healthy:
            return self._report(
                CheckResultType.NO_CHECK_BECAUSE_EDEN_WAS_NOT_RUNNING
            )

        dirstate_path = os.path.join(self._path, '.hg', 'dirstate')
        with open(dirstate_path, 'rb') as dirstate_file:
            parents, _entries, _copies = eden.dirstate.read(
                dirstate_file, dirstate_path
            )

        # Stored on the instance because _report() quotes it in its message.
        self._p1_hex = binascii.hexlify(parents[0]).decode('utf-8')

        is_consistent = self._snapshot_hex == self._p1_hex
        return self._report(
            CheckResultType.NO_ISSUE
            if is_consistent
            else CheckResultType.FAILED_TO_FIX
        )

    def _report(self, result_type: CheckResultType) -> CheckResult:
        """Build the CheckResult; only a mismatch carries a message."""
        if result_type != CheckResultType.FAILED_TO_FIX:
            return CheckResult(result_type, '')
        return CheckResult(
            result_type,
            f'p1 for {self._path} is {self._p1_hex}, but Eden\'s internal\n'
            f'hash in its SNAPSHOT file is {self._snapshot_hex}.\n'
        )
class EdenfsIsLatest(Check):
    """Report when the running edenfs differs from the installed RPM version."""

    def __init__(self, config) -> None:
        self._config = config

    def do_check(self, dry_run: bool) -> CheckResult:
        """Compare the running Eden version with the installed one.

        A mismatch is reported as FAILED_TO_FIX together with advice on how to
        restart; this check never changes anything itself.
        """
        rver, release = version.get_running_eden_version_parts(self._config)
        if not (rver and release):
            # A dev build may report the empty string for both values, in
            # which case there is nothing to compare.
            return CheckResult(CheckResultType.NO_ISSUE, '')

        running_version = version.format_running_eden_version((rver, release))
        installed_version = version.get_installed_eden_rpm_version()
        if running_version == installed_version:
            return CheckResult(CheckResultType.NO_ISSUE, '')

        advice = dedent(
            f'''\
            The version of Eden that is installed on your machine is:
            fb-eden-{installed_version}.x86_64
            but the version of Eden that is currently running is:
            fb-eden-{running_version}.x86_64
            Consider running `eden daemon --takeover` to migrate
            to the newer version. If that doesn't work out, you
            may wish to run `eden shutdown` followed by `eden daemon`
            to restart with the installed version, which may have
            important bug fixes or performance improvements.
            '''
        )
        return CheckResult(CheckResultType.FAILED_TO_FIX, advice)
def _get_watch_roots_for_watchman() -> Set[str]:
    """Return the set of root paths that Watchman is currently watching.

    Returns an empty set when the `watch-list` response has no 'roots' entry,
    which is what happens when the watchman call itself failed and an
    error-only dict was returned.
    """
    js = _call_watchman(['watch-list'])
    return set(js.get('roots', []))
def _call_watchman(args: List[str]) -> Dict:
full_args = ['watchman']
full_args.extend(args)
try:
output = subprocess.check_output(full_args)
return json.loads(output)
except OSError as e:
sys.stderr.write(
f'Calling `{" ".join(full_args)}`'
f' failed with: {str(e) if e.strerror is None else e.strerror}\n'
)
return {'error': str(e)}