lock: use flock on POSIX

Summary:
We recently ran into issues with locks in pid namespaces [1]. Let's fix that
by using flock.

flock is more reliable in Linux's pid namespace use-case than file-existence
test, because it works without a /proc filesystem and does not have deadlock
issue if an hg process is killed unexpectedly (ex. OOM or SIGKILL).

The transition should be transparent:
- If the new code saw a symlink lock file generated by the old code.
  `open(..., O_NOFOLLOW)` will fail and it's considered lock taken by the old
  process correctly.
- If the old code saw a new lock file. It will treat it as system without
  symlink support and it's considered lock taken by the new process correctly.

A non-symlink stale lock (regardless of whether it contains pid information or not)
will be confidently removed automatically by the new code.

The change is complicated because it works when both new and old hg
run at the same time. If we have migrated most users to the new code path,
the code can be cleaned up significantly.

[1]: https://fburl.com/85fxjisi

Reviewed By: DurhamG

Differential Revision: D9004614

fbshipit-source-id: d501c4f3a7bc8ad73c9556be1c6a265ffd0d0686
This commit is contained in:
Jun Wu 2018-08-08 16:07:25 -07:00 committed by Facebook Github Bot
parent 4ed4bcf69a
commit 9740116626
12 changed files with 267 additions and 160 deletions

View File

@ -49,7 +49,7 @@ class looselock(object):
try:
self.vfs.makelock(lockcontents, self.lockname)
except (OSError, IOError) as ex:
if ex.errno == errno.EEXIST:
if ex.errno in (errno.EEXIST, errno.EAGAIN):
raise error.LockHeld(
ex.errno,
self.vfs.join(self.lockname),

View File

@ -592,11 +592,16 @@ def notbackedup(repo, subset, x):
@templatekeyword("backingup")
def backingup(repo, ctx, **args):
def backingup(repo, **args):
"""Whether infinitepush is currently backing up commits."""
# If the backup lock exists then a backup should be in progress.
return _islocked(repo)
def _islocked(repo):
srcrepo = shareutil.getsrcrepo(repo)
return srcrepo.vfs.lexists(_backuplockname)
path = srcrepo.vfs.join(_backuplockname)
return util.islocked(path)
def _smartlogbackupsuggestion(ui, repo):
@ -663,8 +668,7 @@ def smartlogsummary(ui, repo):
_smartlogbackuphealthcheckmsg(ui, repo)
# Don't output the summary if a backup is currently in progress.
srcrepo = shareutil.getsrcrepo(repo)
if srcrepo.vfs.lexists(_backuplockname):
if _islocked(repo):
return
unbackeduprevs = repo.revs("notbackedup()")

View File

@ -1594,15 +1594,23 @@ def debuglocks(ui, repo, **opts):
ull = None
if repo.vfs.exists("undolog"):
ull = os.path.join("undolog", "lock")
if opts.get(r"force_lock"):
repo.svfs.unlink("lock")
done = True
if opts.get(r"force_wlock"):
repo.vfs.unlink("wlock")
done = True
if opts.get(r"force_undolog_lock"):
repo.vfs.unlink(ull)
done = True
if pycompat.iswindows:
if opts.get(r"force_lock"):
repo.svfs.unlink("lock")
done = True
if opts.get(r"force_wlock"):
repo.vfs.unlink("wlock")
done = True
if opts.get(r"force_undolog_lock"):
repo.vfs.unlink(ull)
done = True
else:
if (
opts.get(r"force_lock")
or opts.get(r"force_wlock")
or opts.get(r"force_undolog_lock")
):
raise error.Abort(_("cannot force release lock on POSIX"))
if done:
return 0

View File

@ -226,6 +226,7 @@ class lock(object):
self.showspinner = showspinner
self.spinnermsg = spinnermsg
self._debugmessagesprinted = set([])
self._lockfd = None
if dolock:
self.delay = self.lock()
if self.acquirefn:
@ -291,14 +292,18 @@ class lock(object):
if self.held:
self.held += 1
return
assert self._lockfd is None
retry = 5
while not self.held and retry:
retry -= 1
try:
self.vfs.makelock(self._getlockname(), self.f)
self._lockfd = self.vfs.makelock(self._getlockname(), self.f)
self.held = 1
except (OSError, IOError) as why:
if why.errno == errno.EEXIST:
# EEXIST: lockfile exists (Windows)
# ELOOP: lockfile exists as a symlink (POSIX)
# EAGAIN: lockfile flock taken by other process (POSIX)
if why.errno in {errno.EEXIST, errno.ELOOP, errno.EAGAIN}:
lockfilecontents = self._readlock()
if lockfilecontents is None:
continue
@ -317,6 +322,7 @@ class lock(object):
raise error.LockHeld(
errno.EAGAIN, self.vfs.join(self.f), self.desc, info
)
else:
raise error.LockUnavailable(
why.errno, why.strerror, why.filename, self.desc
@ -365,9 +371,19 @@ class lock(object):
m %= (info.uniqueid,)
self._debugprintonce(m)
return info
# POSIX util.makelock removes stale lock based on flock information.
# So it's unnecessary to remove stale lock here. It's also less
# reliable to remove stale lock based on pid due to pid namespaces.
if not pycompat.iswindows:
m = _("some process that claims itself as %r is holding the lock\n")
m %= (info.uniqueid,)
self._debugprintonce(m)
return info
# if lockinfo dead, break lock. must do this with another lock
# held, or can race and break valid lock.
try:
# The "remove dead lock" logic is done by posix.makelock, not here.
assert pycompat.iswindows
msg = _(
"trying to removed the stale lock file " "(will acquire %s for that)\n"
)
@ -446,6 +462,9 @@ class lock(object):
if not self._parentheld:
try:
self.vfs.unlink(self.f)
if self._lockfd is not None:
os.close(self._lockfd)
self._lockfd = None
except OSError:
pass
# The postrelease functions typically assume the lock is not held

View File

@ -7,6 +7,7 @@
from __future__ import absolute_import
import contextlib
import errno
import fcntl
import getpass
@ -71,18 +72,171 @@ def split(p):
return ht[0] + "/", ht[1]
def makelock(info, pathname):
try:
return os.symlink(info, pathname)
except OSError as why:
if why.errno == errno.EEXIST:
raise
except AttributeError: # no symlink in os
pass
@contextlib.contextmanager
def _locked(pathname):
"""Context manager locking on a path. Use this to make short decisions
in an "atomic" way across multiple processes.
ld = os.open(pathname, os.O_CREAT | os.O_WRONLY | os.O_EXCL)
os.write(ld, info)
os.close(ld)
pathname must already exist.
"""
fd = os.open(pathname, os.O_RDONLY | os.O_NOFOLLOW)
fcntl.flock(fd, fcntl.LOCK_EX)
try:
yield
finally:
os.close(fd)
def makelock(info, pathname):
"""Try to make a lock at given path. Write info inside it.
Stale non-symlink locks are removed automatically. Symlink locks
are only used by legacy code.
Return file descriptor on success. The file descriptor must be kept
for the lock to be effective.
Raise EEXIST if another process is holding the lock.
Raise ELOOP if another legacy hg process is holding the lock.
Can also raise other errors.
"""
# This is a bit complex, since it aims to support old lock code where the
# lock file is removed when the lock is released. The simpler version
# where the lock file does not get unlinked when releasing the lock is:
#
# # Open the file. Create on demand. Fail if it's a symlink.
# fd = os.open(pathname, os.O_CREAT | os.O_RDWR | os.O_NOFOLLOW)
# try:
# fcntl.flock(fd, fcntl.LOCK_NB | fcntl.LOCK_EX)
# os.write(fd, info)
# except (OSError, IOError):
# os.close(fd)
# raise
# else:
# return fd
#
# With "unlink" on release, the above simple logic can break in this way:
#
# [process 1] got fd.
# [process 2] got fd pointing to a same file.
# [process 1] .... release lock. file unlinked.
# [process 2] flock on fd. (broken lock - file was gone)
#
# A direct fix is to use O_EXCL to make sure the file is created by the
# current process, then use "flock". That means there needs to be a way to
# remove stale lock, and that is not easy. A naive check and delete can
# break subtly:
#
# [process 1] to check stale lock - got fd.
# [process 2] ... release lock. file unlinked.
# [process 1] flock taken, decided to remove file.
# [process 3] create a new lock.
# [process 1] unlink lock file. (wrong - removed the wrong lock)
#
# Instead of figuring out how to handle all corner cases carefully, we just
# always lock the parent directory when doing "racy" write operations
# (creating a lock, or removing a stale lock). So they become "atomic" and
# safe. There are 2 kinds of write operations that can happen without
# taking the directory lock:
#
# - Legacy symlink lock creation or deletion. The new code errors out
# when it saw a symlink lock (os.open(..., O_NOFOLLOW) and os.rename).
# So they play well with each other.
# - Unlinking lock file when when releasing. The release logic is holding
# the flock. So it knows nobody else has the lock. Therefore it can do
# the unlink without extra locking.
dirname = os.path.dirname(pathname)
with _locked(dirname or "."):
# Check and remove stale lock
try:
fd = os.open(pathname, os.O_RDONLY | os.O_NOFOLLOW)
except (OSError, IOError) as ex:
# ELOOP: symlink, lock taken by legacy process - return directly
if ex.errno != errno.ENOENT:
raise
else:
try:
fcntl.flock(fd, fcntl.LOCK_NB | fcntl.LOCK_EX)
os.unlink(pathname)
except (OSError, IOError) as ex:
# EAGAIN: lock taken - return directly
# ENOENT: lock removed already - continue
if ex.errno != errno.ENOENT:
raise
finally:
os.close(fd)
# Create symlink placeholder. Make sure the file replaced by
# "os.rename" can only be this symlink. This avoids race condition
# when legacy code creates the symlink lock without locking the
# parent directory.
#
# This is basically the legacy lock logic.
placeholdercreated = False
try:
os.symlink(info, pathname)
placeholdercreated = True
except (IOError, OSError) as ex:
if ex.errno == errno.EEXIST:
raise
except AttributeError:
pass
if not placeholdercreated:
# No symlink support. Suboptimal. Create a placeholder by using an
# empty file. Other legacy process might see a "malformed lock"
# temporarily. New processes won't see this because both "readlock"
# and "islocked" take the directory lock.
fd = os.open(pathname, os.O_CREAT | os.O_WRONLY | os.O_EXCL)
os.close(fd)
# Create new lock
fd, tmppath = tempfile.mkstemp(prefix="makelock", dir=dirname)
try:
fcntl.flock(fd, fcntl.LOCK_NB | fcntl.LOCK_EX)
os.write(fd, info)
os.rename(tmppath, pathname)
return fd
except Exception:
unlink(tmppath)
os.close(fd)
raise
def readlock(pathname):
with _locked(os.path.dirname(pathname) or "."):
try:
return os.readlink(pathname)
except OSError as why:
if why.errno not in (errno.EINVAL, errno.ENOSYS):
raise
except AttributeError: # no symlink in os
pass
fp = posixfile(pathname)
r = fp.read()
fp.close()
return r
def islocked(pathname):
with _locked(os.path.dirname(pathname) or "."):
try:
fd = os.open(pathname, os.O_RDONLY | os.O_NOFOLLOW)
except OSError as ex:
# ELOOP, ENOENT, EPERM, ...
# Only treat ENOENT as "not locked".
return ex.errno != errno.ENOENT
try:
fcntl.flock(fd, fcntl.LOCK_NB | fcntl.LOCK_EX)
return False
except IOError as ex:
if ex.errno == errno.EAGAIN:
return True
else:
raise
finally:
os.close(fd)
def openhardlinks():

View File

@ -119,6 +119,7 @@ groupmembers = platform.groupmembers
groupname = platform.groupname
hidewindow = platform.hidewindow
isexec = platform.isexec
islocked = platform.islocked
isowner = platform.isowner
listdir = osutil.listdir
localpath = platform.localpath
@ -138,6 +139,7 @@ poll = platform.poll
popen = platform.popen
posixfile = platform.posixfile
quotecommand = platform.quotecommand
readlock = platform.readlock
readpipe = platform.readpipe
rename = platform.rename
removedirs = platform.removedirs
@ -1498,20 +1500,6 @@ if safehasattr(time, "perf_counter"):
timer = time.perf_counter
def readlock(pathname):
try:
return os.readlink(pathname)
except OSError as why:
if why.errno not in (errno.EINVAL, errno.ENOSYS):
raise
except AttributeError: # no symlink in os
pass
fp = posixfile(pathname)
r = fp.read()
fp.close()
return r
def fstat(fp):
"""stat file object that may not have fileno method."""
try:

View File

@ -582,3 +582,21 @@ def makelock(info, pathname):
except WindowsError:
os.unlink(tname)
raise
def readlock(pathname):
try:
return os.readlink(pathname)
except OSError as why:
if why.errno not in (errno.EINVAL, errno.ENOSYS):
raise
except AttributeError: # no symlink in os
pass
fp = posixfile(pathname)
r = fp.read()
fp.close()
return r
def islocked(pathname):
return os.path.exists(pathname)

View File

@ -153,120 +153,6 @@ Test debuglocks command:
lock: free
wlock: free
* Test setting the lock
waitlock <file> will wait for file to be created. If it isn't in a reasonable
amount of time, displays error message and returns 1
$ waitlock() {
> start=`date +%s`
> timeout=5
> while [ \( ! -f $1 \) -a \( ! -L $1 \) ]; do
> now=`date +%s`
> if [ "`expr $now - $start`" -gt $timeout ]; then
> echo "timeout: $1 was not created in $timeout seconds"
> return 1
> fi
> sleep 0.1
> done
> }
$ dolock() {
> {
> waitlock .hg/unlock
> rm -f .hg/unlock
> echo y
> } | hg debuglocks "$@" > /dev/null
> }
$ dolock -s &
$ waitlock .hg/store/lock
$ hg debuglocks
lock: user *, process * (*s) (glob)
wlock: free
[1]
$ touch .hg/unlock
$ wait
$ [ -f .hg/store/lock ] || echo "There is no lock"
There is no lock
* Test setting the wlock
$ dolock -S &
$ waitlock .hg/wlock
$ hg debuglocks
lock: free
wlock: user *, process * (*s) (glob)
[1]
$ touch .hg/unlock
$ wait
$ [ -f .hg/wlock ] || echo "There is no wlock"
There is no wlock
* Test setting both locks
$ dolock -Ss &
$ waitlock .hg/wlock && waitlock .hg/store/lock
$ hg debuglocks
lock: user *, process * (*s) (glob)
wlock: user *, process * (*s) (glob)
[2]
* Test failing to set a lock
$ hg debuglocks -s
abort: lock is already held
[255]
$ hg debuglocks -S
abort: wlock is already held
[255]
$ touch .hg/unlock
$ wait
$ hg debuglocks
lock: free
wlock: free
* Test forcing the lock
$ dolock -s &
$ waitlock .hg/store/lock
$ hg debuglocks
lock: user *, process * (*s) (glob)
wlock: free
[1]
$ hg debuglocks -L
$ hg debuglocks
lock: free
wlock: free
$ touch .hg/unlock
$ wait
* Test forcing the wlock
$ dolock -S &
$ waitlock .hg/wlock
$ hg debuglocks
lock: free
wlock: user *, process * (*s) (glob)
[1]
$ hg debuglocks -W
$ hg debuglocks
lock: free
wlock: free
$ touch .hg/unlock
$ wait
Test WdirUnsupported exception
$ hg debugdata -c ffffffffffffffffffffffffffffffffffffffff

View File

@ -101,7 +101,7 @@ Add an extension that logs whenever `manifest.readmf()` is called when the lock
> def readmf(orig, self, nodeorrev, **kwargs):
> haslock = False
> try:
> haslock = os.path.islink(os.path.join(self.opener.join(''), "../wlock"))
> haslock = os.path.lexists(os.path.join(self.opener.join(''), "../wlock"))
> except Exception as e:
> print >> sys.stderr, 'manifest: %s' % e
> pass

View File

@ -1,3 +1,4 @@
#require symlink
$ setup() {
> cat << EOF >> .hg/hgrc
@ -186,7 +187,8 @@ Check backup status with an unbacked up changeset that is disjoint from existing
Test template keyword for when a backup is in progress
$ hg log -T '{if(backingup,"Yes","No")}\n' -r .
No
$ echo fakelock > .hg/infinitepushbackup.lock
$ rm -f .hg/infinitepushbackup.lock
$ ln -s fakelock .hg/infinitepushbackup.lock
$ hg log -T '{if(backingup,"Yes","No")}\n' -r .
Yes
$ rm -f .hg/infinitepushbackup.lock

View File

@ -130,6 +130,7 @@ On processs waiting on another, warning disabled, (debug output on)
$ cat stdout
adding f
#if windows
Pushing to a local read-only repo that can't be locked
$ chmod 100 a/.hg/store
@ -163,4 +164,31 @@ Having an undolog lock file
lock: free
wlock: free
undolog/lock: free
#else
Having an empty lock file
$ cd a
$ touch .hg/wlock
$ hg backout # a command which always acquires a lock
abort: please specify a revision to backout
[255]
Non-symlink stale lock is removed automatically.
Having an empty undolog lock file
$ mkdir .hg/undolog && touch .hg/undolog/lock
$ hg debuglocks
lock: free
wlock: free
undolog/lock: free
$ hg debuglocks --force-undolog-lock
abort: cannot force release lock on POSIX
[255]
$ hg debuglocks
lock: free
wlock: free
undolog/lock: free
#endif
$ cd ..

View File

@ -1,4 +1,4 @@
#require no-fsmonitor
#require no-fsmonitor symlink
setup
@ -152,8 +152,8 @@ Tag cache debug info written to blackbox log
Failure to acquire lock results in no write
$ rm -f .hg/cache/tags2-visible .hg/cache/hgtagsfnodes1
$ echo 'foo:1' > .hg/wlock
$ rm -f .hg/cache/tags2-visible .hg/cache/hgtagsfnodes1 .hg/wlock
$ ln -s 'foo:1' .hg/wlock
$ hg identify
b9154636be93 tip
$ hg blackbox -l 6