locks: add logging for invalid lock order taking

Summary:
We've been seeing some issues where non-write processes (like clone) are taking
the wlock/lock without taking the sqllock. This causes processes which do want
the sqllock (like pushes) to hang waiting. Streaming clones are one known case
of this, but this extra logging will help us catch other cases.

Once we're confident that nothing takes the wrong lock ordering, we can change
this to throw an exception instead of logging the error.

This required a minor refactor the synclimit code, so we can detect if we're in
a sync lock (which is a case where it's ok to take the wlock/lock without taking
the sqlock). And it configures the tests to use streaming clones when possible,
which trigger the current known bug.

Test Plan: Added a test

Reviewers: #mercurial, quark

Reviewed By: quark

Subscribers: quark

Differential Revision: https://phabricator.intern.facebook.com/D4287384

Signature: t1:4287384:1481065929:dfac78e3537c3cdf44c810bc99c5f07ed5b12f4c
This commit is contained in:
Durham Goode 2016-12-06 16:04:20 -08:00
parent 593af04245
commit a6491a0b9c
3 changed files with 60 additions and 2 deletions

View File

@ -473,6 +473,35 @@ def wraprepo(repo):
callback()
self.sqlpostrelease = []
def lock(self, *args, **kwargs):
wl = self._wlockref and self._wlockref()
if (not self._synclock and
not (wl is not None and wl.held) and
not self.hassqllock(writelock, checkserver=False)):
self._recordbadlockorder()
return super(sqllocalrepo, self).lock(*args, **kwargs)
def wlock(self, *args, **kwargs):
if (not self._synclock and
not self.hassqllock(writelock, checkserver=False)):
self._recordbadlockorder()
return super(sqllocalrepo, self).wlock(*args, **kwargs)
def _recordbadlockorder(self):
self.ui.debug("hgsql: invalid lock order\n")
outputdir = self.ui.config('hgsql', 'profileoutput')
if outputdir:
import random, traceback
pid = os.getpid()
rand = random.random()
timestamp = time.time()
path = os.path.join(outputdir, 'hgsql-lockorder-%s-%s-%s' %
(timestamp, pid, rand))
with open(path, 'a+') as f:
stack = traceback.format_stack()
f.write(''.join(stack))
def transaction(self, *args, **kwargs):
tr = super(sqllocalrepo, self).transaction(*args, **kwargs)
if tr.count > 1:
@ -528,9 +557,12 @@ def wraprepo(repo):
is returned, it means the limiter was not acquired, and readonly
clients should not attempt to perform a sync."""
try:
def release():
self._synclock = None
wait = False
return self._lock(self.svfs, "synclimiter", wait, None,
None, _('repository %s') % self.origroot)
self._synclock = self._lock(self.svfs, "synclimiter", wait, release,
None, _('repository %s') % self.origroot)
return self._synclock
except error.LockHeld:
return None
@ -1050,6 +1082,7 @@ def wraprepo(repo):
repo.pendingrevs = []
repo.heldlocks = set()
repo.sqlpostrelease = []
repo._synclock = None
repo.__class__ = sqllocalrepo

View File

@ -58,6 +58,10 @@ password = $DBPASS
port = $DBPORT
reponame = $2
[server]
preferuncompressed=True
uncompressed=True
[ui]
ssh=python "$TESTDIR/dummyssh"
EOF

View File

@ -9,6 +9,10 @@
$ cd ..
$ initserver master masterrepo
$ cat >> master/.hg/hgrc <<EOF
> [hgsql]
> profileoutput=$TESTTMP/
> EOF
$ hg -R master log
$ hg -R master pull -q client
@ -45,6 +49,8 @@
$ printf "[hooks]\npre-changegroup.sleep = sleep 2\n" >> master/.hg/hgrc
$ initclient client2
$ hg pull -q -R client2 ssh://user@dummy/master
$ ls -l $TESTTMP/hgsql-lockorder*
* $TESTTMP/hgsql-lockorder-* (glob)
$ cd client
$ hg up -q 1
@ -70,3 +76,18 @@
$ sed -i '/hooks/d' master/.hg/hgrc
$ sed -i '/sleep/d' master/.hg/hgrc
# Verify incorrect lock order taking is logged.
# We know that streaming clones currently hit this code path, and the hg pull in
# the above test used that code path.
$ cat $TESTTMP/hgsql-lockorder* | tail -n 10
File *, line *, in getstream (glob)
for chunk in it:
File *, line *, in generatev1wireproto (glob)
filecount, bytecount, it = generatev1(repo)
File *, line *, in generatev1 (glob)
with repo.lock():
File *, line *, in lock (glob)
self._recordbadlockorder()
File *, line *, in _recordbadlockorder (glob)
stack = traceback.format_stack()