locks: add logging for invalid lock order taking

Summary: We've been seeing some issues where non-write processes (like clone) are taking the wlock/lock without taking the sqllock. This causes processes which do want the sqllock (like pushes) to hang waiting. Streaming clones are one known case of this, but this extra logging will help us catch other cases. Once we're confident that nothing takes the wrong lock ordering, we can change this to throw an exception instead of logging the error. This required a minor refactor the synclimit code, so we can detect if we're in a sync lock (which is a case where it's ok to take the wlock/lock without taking the sqlock). And it configures the tests to use streaming clones when possible, which trigger the current known bug. Test Plan: Added a test Reviewers: #mercurial, quark Reviewed By: quark Subscribers: quark Differential Revision: https://phabricator.intern.facebook.com/D4287384 Signature: t1:4287384:1481065929:dfac78e3537c3cdf44c810bc99c5f07ed5b12f4c
2024-10-09 16:31:02 +03:00 · 2016-12-06 16:04:20 -08:00 · 2016-12-06 16:04:20 -08:00 · a6491a0b9c
commit a6491a0b9c
parent 593af04245
3 changed files with 60 additions and 2 deletions
--- a/hgsql.py
+++ b/hgsql.py
@ -473,6 +473,35 @@ def wraprepo(repo):
                callback()
            self.sqlpostrelease = []

+        def lock(self, *args, **kwargs):
+            wl = self._wlockref and self._wlockref()
+            if (not self._synclock and
+                not (wl is not None and wl.held) and
+                not self.hassqllock(writelock, checkserver=False)):
+                self._recordbadlockorder()
+            return super(sqllocalrepo, self).lock(*args, **kwargs)
+
+        def wlock(self, *args, **kwargs):
+            if (not self._synclock and
+                not self.hassqllock(writelock, checkserver=False)):
+                self._recordbadlockorder()
+            return super(sqllocalrepo, self).wlock(*args, **kwargs)
+
+        def _recordbadlockorder(self):
+            self.ui.debug("hgsql: invalid lock order\n")
+
+            outputdir = self.ui.config('hgsql', 'profileoutput')
+            if outputdir:
+                import random, traceback
+                pid = os.getpid()
+                rand = random.random()
+                timestamp = time.time()
+                path = os.path.join(outputdir, 'hgsql-lockorder-%s-%s-%s' %
+                                               (timestamp, pid, rand))
+                with open(path, 'a+') as f:
+                    stack = traceback.format_stack()
+                    f.write(''.join(stack))
+
        def transaction(self, *args, **kwargs):
            tr = super(sqllocalrepo, self).transaction(*args, **kwargs)
            if tr.count > 1:
@ -528,9 +557,12 @@ def wraprepo(repo):
            is returned, it means the limiter was not acquired, and readonly
            clients should not attempt to perform a sync."""
            try:
+                def release():
+                    self._synclock = None
                wait = False
-                return self._lock(self.svfs, "synclimiter", wait, None,
-                               None, _('repository %s') % self.origroot)
+                self._synclock = self._lock(self.svfs, "synclimiter", wait, release,
+                                           None, _('repository %s') % self.origroot)
+                return self._synclock
            except error.LockHeld:
                return None

@ -1050,6 +1082,7 @@ def wraprepo(repo):
    repo.pendingrevs = []
    repo.heldlocks = set()
    repo.sqlpostrelease = []
+    repo._synclock = None

    repo.__class__ = sqllocalrepo

--- a/tests/library.sh
+++ b/tests/library.sh
@ -58,6 +58,10 @@ password = $DBPASS
 port = $DBPORT
 reponame = $2

+[server]
+preferuncompressed=True
+uncompressed=True
+
 [ui]
 ssh=python "$TESTDIR/dummyssh"
 EOF
--- a/tests/test-race-conditions.t
+++ b/tests/test-race-conditions.t
@ -9,6 +9,10 @@
  $ cd ..

  $ initserver master masterrepo
+  $ cat >> master/.hg/hgrc <<EOF
+  > [hgsql]
+  > profileoutput=$TESTTMP/
+  > EOF
  $ hg -R master log
  $ hg -R master pull -q client

@ -45,6 +49,8 @@
  $ printf "[hooks]\npre-changegroup.sleep = sleep 2\n" >> master/.hg/hgrc
  $ initclient client2
  $ hg pull -q -R client2 ssh://user@dummy/master
+  $ ls -l $TESTTMP/hgsql-lockorder*
+  * $TESTTMP/hgsql-lockorder-* (glob)

  $ cd client
  $ hg up -q 1
@ -70,3 +76,18 @@
  
  $ sed -i '/hooks/d' master/.hg/hgrc
  $ sed -i '/sleep/d' master/.hg/hgrc
+
+# Verify incorrect lock order taking is logged.
+# We know that streaming clones currently hit this code path, and the hg pull in
+# the above test used that code path.
+  $ cat $TESTTMP/hgsql-lockorder* | tail -n 10
+    File *, line *, in getstream (glob)
+      for chunk in it:
+    File *, line *, in generatev1wireproto (glob)
+      filecount, bytecount, it = generatev1(repo)
+    File *, line *, in generatev1 (glob)
+      with repo.lock():
+    File *, line *, in lock (glob)
+      self._recordbadlockorder()
+    File *, line *, in _recordbadlockorder (glob)
+      stack = traceback.format_stack()