remotefilelog: move getancestors into remotefilelog

Summary:
The getancestors call is _very_ expensive, as it requires walking all the
ancestors of the given node, which potentially requires O(N) network round
trips. Since we want to discourage/remove such behavior, let's move it out of
the store layer, and move it one layer up. The handful of places that call
into ancestormap will need to be optimized later to avoid calling it.

Reviewed By: DurhamG

Differential Revision: D17946852

fbshipit-source-id: 93eb7873b685ee88f8af5d4ceca500738d779396
This commit is contained in:
Xavier Deguillard 2019-11-07 08:49:58 -08:00 committed by Facebook Github Bot
parent d47be1f2fa
commit 230af20935
3 changed files with 65 additions and 100 deletions

View File

@ -12,70 +12,9 @@ from . import shallowutil
class unionmetadatastore(object):
def __init__(self, *args, **kwargs):
def __init__(self, *args):
self.stores = list(args)
# If allowincomplete==True then the union store can return partial
# ancestor lists, otherwise it will throw a KeyError if a full
# history can't be found.
self.allowincomplete = kwargs.get("allowincomplete", False)
# Removed by this commit: the full-history walk moves out of the union
# store and into remotefilelog._getancestors (see the second hunk).
def getancestors(self, name, node, known=None):
"""Returns as many ancestors as we're aware of.
return value: {
node: (p1, p2, linknode, copyfrom),
...
}
"""
# "known" nodes act as traversal boundaries: history beyond them is
# neither fetched nor returned.
if known is None:
known = set()
if node in known:
return []
ancestors = {}
# Returns the (name, node) pairs reachable from curnode whose info is
# not yet present in "ancestors" and therefore still needs fetching.
def traverse(curname, curnode):
# TODO: this algorithm has the potential to traverse parts of
# history twice. Ex: with A->B->C->F and A->B->D->F, both D and C
# may be queued as missing, then B and A are traversed for both.
queue = [(curname, curnode)]
missing = []
seen = set()
while queue:
name, node = queue.pop()
if (name, node) in seen:
continue
seen.add((name, node))
value = ancestors.get(node)
if not value:
missing.append((name, node))
continue
p1, p2, linknode, copyfrom = value
# p1 history continues under the pre-copy name, if any.
if p1 != nullid and p1 not in known:
queue.append((copyfrom or name, p1))
if p2 != nullid and p2 not in known:
queue.append((name, p2))
return missing
# Alternate between fetching node info and discovering newly-missing
# ancestors until the reachable history is fully resolved.
missing = [(name, node)]
while missing:
curname, curnode = missing.pop()
try:
ancestors.update({curnode: self.getnodeinfo(curname, curnode)})
newmissing = traverse(curname, curnode)
missing.extend(newmissing)
except KeyError:
# If we allow incomplete histories, don't throw.
if not self.allowincomplete:
raise
# If the requested name+node doesn't exist, always throw.
if (curname, curnode) == (name, node):
raise
# TODO: ancestors should probably be (name, node) -> (value)
return ancestors
def getnodeinfo(self, name, node):
for store in self.stores:
try:

View File

@ -312,8 +312,64 @@ class remotefilelog(object):
validatehash = validatehash and vhash
return text, validatehash
# Added by this commit: the history walk formerly living in
# unionmetadatastore.getancestors, now private to remotefilelog.
def _getancestors(self, node):
"""Returns as many ancestors as we're aware of.
return value: {
node: (p1, p2, linknode, copyfrom),
...
}
This is a very expensive operation as it requires the entire history
for the node, potentially requiring O(N) server roundtrips.
"""
# Unlike the old store-level version there is no "known" boundary set
# here; it stays empty, so the whole history is always walked.
known = set()
ancestors = {}
# Returns the (name, node) pairs reachable from curnode whose info is
# not yet present in "ancestors" and therefore still needs fetching.
def traverse(curname, curnode):
# TODO: this algorithm has the potential to traverse parts of
# history twice. Ex: with A->B->C->F and A->B->D->F, both D and C
# may be queued as missing, then B and A are traversed for both.
queue = [(curname, curnode)]
missing = []
seen = set()
while queue:
name, node = queue.pop()
if (name, node) in seen:
continue
seen.add((name, node))
value = ancestors.get(node)
if not value:
missing.append((name, node))
continue
p1, p2, linknode, copyfrom = value
# p1 history continues under the pre-copy name, if any.
if p1 != nullid and p1 not in known:
queue.append((copyfrom or name, p1))
if p2 != nullid and p2 not in known:
queue.append((name, p2))
return missing
# Alternate between fetching node info and discovering newly-missing
# ancestors until the reachable history is fully resolved.
missing = [(self.filename, node)]
while missing:
curname, curnode = missing.pop()
try:
ancestors.update(
{
curnode: self.repo.fileslog.metadatastore.getnodeinfo(
curname, curnode
)
}
)
newmissing = traverse(curname, curnode)
missing.extend(newmissing)
except KeyError:
# No allowincomplete escape hatch any more: a missing entry
# always propagates to the caller.
raise
# TODO: ancestors should probably be (name, node) -> (value)
return ancestors
# NOTE(review): this diff view shows both bodies of ancestormap on
# adjacent lines — the first return is the old (removed) delegation to
# the metadata store, the second is its replacement calling the new
# private _getancestors helper.
def ancestormap(self, node):
return self.repo.fileslog.metadatastore.getancestors(self.filename, node)
return self._getancestors(node)
# Single-node lookup in the shared metadata store: returns the
# (p1, p2, linknode, copyfrom) tuple for this file's node, without
# walking any history.
def getnodeinfo(self, node):
return self.repo.fileslog.metadatastore.getnodeinfo(self.filename, node)
@ -503,9 +559,7 @@ class remotefileslog(filelog.fileslog):
os.umask(mask)
sunioncontentstore = unioncontentstore(*sharedcontentstores)
sunionmetadatastore = unionmetadatastore(
*sharedmetadatastores, allowincomplete=True
)
sunionmetadatastore = unionmetadatastore(*sharedmetadatastores)
remotecontent, remotemetadata = self.makeremotestores(
sunioncontentstore, sunionmetadatastore
)

View File

@ -115,32 +115,6 @@ class histpacktestsbase(object):
self.assertEquals(linknode, actual[2])
self.assertEquals(copyfrom, actual[3])
# Removed by this commit: this test exercised the store-level
# getancestors API, which no longer exists on unionmetadatastore.
def testAddAncestorChain(self):
"""Test putting multiple revisions in into a pack and read the ancestor
chain.
"""
revisions = []
filename = "foo"
lastnode = nullid
# Build a linear 10-revision chain: each node's p1 is the previous
# node, p2 stays null, no copy info.
for i in range(10):
node = self.getFakeHash()
revisions.append((filename, node, lastnode, nullid, nullid, None))
lastnode = node
# revisions must be added in topological order, newest first
revisions = list(reversed(revisions))
pack = self.createPack(revisions)
store = unionmetadatastore(pack)
# Test that the chain has all the entries
# Walk from the newest revision; the result must contain every node
# in the chain with the exact parents/linknode/copyfrom stored above.
ancestors = store.getancestors(revisions[0][0], revisions[0][1], known=None)
for filename, node, p1, p2, linknode, copyfrom in revisions:
ap1, ap2, alinknode, acopyfrom = ancestors[node]
self.assertEquals(ap1, p1)
self.assertEquals(ap2, p2)
self.assertEquals(alinknode, linknode)
self.assertEquals(acopyfrom, copyfrom)
def testPackMany(self):
"""Pack many related and unrelated ancestors.
"""
@ -175,14 +149,12 @@ class histpacktestsbase(object):
# Verify the pack contents
for (filename, node), (p1, p2, lastnode) in allentries.iteritems():
ancestors = store.getancestors(filename, node, known=None)
self.assertEquals(ancestorcounts[(filename, node)], len(ancestors))
for anode, (ap1, ap2, alinknode, copyfrom) in ancestors.iteritems():
ep1, ep2, elinknode = allentries[(filename, anode)]
self.assertEquals(ap1, ep1)
self.assertEquals(ap2, ep2)
self.assertEquals(alinknode, elinknode)
self.assertEquals(copyfrom, None)
ap1, ap2, alinknode, acopyfrom = store.getnodeinfo(filename, node)
ep1, ep2, elinknode = allentries[(filename, node)]
self.assertEquals(ap1, ep1)
self.assertEquals(ap2, ep2)
self.assertEquals(alinknode, elinknode)
self.assertEquals(acopyfrom, None)
def testGetNodeInfo(self):
revisions = []