treemanifest: add known argument to getancestor api

Summary:
During a repack we often want to access the ancestry of a bunch of nodes that
might be ancestors of each other. Using getancestors for that results in a lot
of duplicated work. For instance, getancestors(0) returns [0], getancestors(1)
returns [0, 1], getancestors(2) returns [0, 1, 2], etc., which is O(n^2).

This patch adds an optional `known` argument for getancestors that lets the
caller tell getancestors what ancestors it's already aware of. Then getancestors
can short circuit when it reaches that level. This avoids duplicate work during
repack.

Test Plan:
Ran treemanifest repack in our large repo and verified it made
progress over the nodes much faster than before

Reviewers: #mercurial, quark

Reviewed By: quark

Subscribers: mjpieters

Differential Revision: https://phabricator.intern.facebook.com/D4901308

Signature: t1:4901308:1492640896:27d4a90c2993cd1fefbd8dbc211f2ec181178bce
This commit is contained in:
Durham Goode 2017-04-19 21:14:04 -07:00
parent dae50fc99e
commit 68030f1dd7
4 changed files with 60 additions and 23 deletions

View File

@ -185,14 +185,29 @@ class manifestrevlogstore(object):
revision = self.get(name, node)
return [(name, node, None, nullid, revision)]
def getancestors(self, name, node):
def getancestors(self, name, node, known=None):
if known is None:
known = set()
if node in known:
return []
rl = self._revlog(name)
ancestors = {}
missing = set((node,))
for ancrev in rl.ancestors([rl.rev(node)], inclusive=True):
ancnode = rl.node(ancrev)
missing.discard(ancnode)
p1, p2 = rl.parents(ancnode)
if p1 != nullid and p1 not in known:
missing.add(p1)
if p2 != nullid and p2 not in known:
missing.add(p2)
linknode = self._cl.node(rl.linkrev(ancrev))
ancestors[rl.node(ancrev)] = (p1, p2, linknode, '')
if not missing:
break
return ancestors
def add(self, *args):

View File

@ -1,6 +1,6 @@
import hashlib, struct
from mercurial import util
from mercurial.node import hex
from mercurial.node import hex, nullid
import basepack, constants
# (filename hash, offset, size)
@ -30,16 +30,16 @@ class historypackstore(basepack.basepackstore):
def getpack(self, path):
return historypack(path)
def getancestors(self, name, node):
def getancestors(self, name, node, known=None):
for pack in self.packs:
try:
return pack.getancestors(name, node)
return pack.getancestors(name, node, known=known)
except KeyError:
pass
for pack in self.refresh():
try:
return pack.getancestors(name, node)
return pack.getancestors(name, node, known=known)
except KeyError:
pass
@ -64,7 +64,7 @@ class historypack(basepack.basepack):
return missing
def getancestors(self, name, node):
def getancestors(self, name, node, known=None):
"""Returns as many ancestors as we're aware of.
return value: {
@ -72,12 +72,19 @@ class historypack(basepack.basepack):
...
}
"""
if known is None:
known = set()
if node in known:
return []
filename, offset, size = self._findsection(name)
ancestors = set((node,))
pending = set((node,))
data = self._data[offset:offset + size]
results = {}
o = 0
while o < len(data):
if not pending:
break
entry = struct.unpack(PACKFORMAT, data[o:o + PACKENTRYLENGTH])
o += PACKENTRYLENGTH
copyfrom = None
@ -86,11 +93,18 @@ class historypack(basepack.basepack):
copyfrom = data[o:o + copyfromlen]
o += copyfromlen
if entry[ANC_NODE] in ancestors:
ancestors.add(entry[ANC_P1NODE])
ancestors.add(entry[ANC_P2NODE])
result = (entry[ANC_P1NODE],
entry[ANC_P2NODE],
ancnode = entry[ANC_NODE]
if ancnode in pending:
pending.remove(ancnode)
p1node = entry[ANC_P1NODE]
p2node = entry[ANC_P2NODE]
if p1node != nullid and p1node not in known:
pending.add(p1node)
if p2node != nullid and p2node not in known:
pending.add(p2node)
result = (p1node,
p2node,
entry[ANC_LINKNODE],
copyfrom)
results[entry[ANC_NODE]] = result

View File

@ -11,7 +11,7 @@ class unionmetadatastore(object):
# history can't be found.
self.allowincomplete = kwargs.get('allowincomplete', False)
def getancestors(self, name, node):
def getancestors(self, name, node, known=None):
"""Returns as many ancestors as we're aware of.
return value: {
@ -19,6 +19,11 @@ class unionmetadatastore(object):
...
}
"""
if known is None:
known = set()
if node in known:
return []
ancestors = {}
def traverse(curname, curnode):
# TODO: this algorithm has the potential to traverse parts of
@ -37,9 +42,9 @@ class unionmetadatastore(object):
missing.append((name, node))
continue
p1, p2, linknode, copyfrom = value
if p1 != nullid:
if p1 != nullid and p1 not in known:
queue.append((copyfrom or curname, p1))
if p2 != nullid:
if p2 != nullid and p2 not in known:
queue.append((curname, p2))
return missing
@ -47,7 +52,8 @@ class unionmetadatastore(object):
while missing:
curname, curnode = missing.pop()
try:
ancestors.update(self._getpartialancestors(curname, curnode))
ancestors.update(self._getpartialancestors(curname, curnode,
known=known))
newmissing = traverse(curname, curnode)
missing.extend(newmissing)
except KeyError:
@ -61,10 +67,10 @@ class unionmetadatastore(object):
# TODO: ancestors should probably be (name, node) -> (value)
return ancestors
def _getpartialancestors(self, name, node):
def _getpartialancestors(self, name, node, known=None):
for store in self.stores:
try:
return store.getancestors(name, node)
return store.getancestors(name, node, known=known)
except KeyError:
pass
@ -86,7 +92,7 @@ class unionmetadatastore(object):
store.markledger(ledger)
class remotefilelogmetadatastore(basestore.basestore):
def getancestors(self, name, node):
def getancestors(self, name, node, known=None):
"""Returns as many ancestors as we're aware of.
return value: {
@ -107,10 +113,10 @@ class remotemetadatastore(object):
self._fileservice = fileservice
self._shared = shared
def getancestors(self, name, node):
def getancestors(self, name, node, known=None):
self._fileservice.prefetch([(name, hex(node))], force=True,
fetchdata=False, fetchhistory=True)
return self._shared.getancestors(name, node)
return self._shared.getancestors(name, node, known=known)
def add(self, name, node, data):
raise RuntimeError("cannot add to a remote store")

View File

@ -290,7 +290,8 @@ class repacker(object):
if node in ancestors:
continue
try:
ancestors.update(self.history.getancestors(filename, node))
ancestors.update(self.history.getancestors(filename, node,
known=ancestors))
except KeyError:
# Since we're packing data entries, we may not have the
# corresponding history entries for them. It's not a big
@ -376,7 +377,8 @@ class repacker(object):
for node in nodes:
if node in ancestors:
continue
ancestors.update(self.history.getancestors(filename, node))
ancestors.update(self.history.getancestors(filename, node,
known=ancestors))
# Order the nodes children first
orderednodes = reversed(self._toposort(ancestors))