setdiscovery: avoid a full changelog graph traversal

We were definitely being suboptimal here: we were constructing two full sets,
one with the full set of common nodes (i.e. a graph traversal) and one with all
nodes. Then we subtract one set from the other. This whole process is
O(commits) and causes discovery to be significantly slower than it should be.

Instead, keep track of common incrementally and keep undecided as small as
possible.

This makes discovery massively faster on large repos: on one such repo, 'hg
debugdiscovery' over SSH with one commit missing on the client and five on the
server went from 4.5 seconds to 1.5. (An 'hg debugdiscovery' with no commits
missing on the client, i.e. connection startup time, was 1.2 seconds.)
This commit is contained in:
Siddharth Agarwal 2014-11-16 00:40:29 -08:00
parent 1a87e8b8c3
commit fba9f14547

View File

@ -40,7 +40,7 @@ nodes that will maximize the number of nodes that will be
classified with it (since all ancestors or descendants will be marked as well).
"""
from node import nullid
from node import nullid, nullrev
from i18n import _
import random
import util, dagutil
@ -177,27 +177,23 @@ def findcommonheads(ui, local, remote,
# own nodes where I don't know if remote knows them
undecided = dag.nodeset()
# own nodes I know we both know
common = set()
# own nodes I know remote lacks
missing = set()
# treat remote heads (and maybe own heads) as a first implicit sample
# response
common.update(dag.ancestorset(srvheads))
undecided.difference_update(common)
common = cl.incrementalmissingrevs(srvheads)
commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
common.addbases(commoninsample)
undecided = set(common.missingancestors(ownheads))
# own nodes I know remote lacks
missing = set()
full = False
while undecided:
if sample:
commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
common.update(dag.ancestorset(commoninsample, common))
missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]
missing.update(dag.descendantset(missinginsample, missing))
undecided.difference_update(missing)
undecided.difference_update(common)
if not undecided:
break
@ -206,7 +202,7 @@ def findcommonheads(ui, local, remote,
ui.note(_("sampling from both directions\n"))
sample = _takefullsample(dag, undecided, size=fullsamplesize)
targetsize = fullsamplesize
elif common:
elif common.hasbases():
# use cheapish initial sample
ui.debug("taking initial sample\n")
sample = _takefullsample(dag, undecided, size=fullsamplesize)
@ -228,7 +224,17 @@ def findcommonheads(ui, local, remote,
yesno = remote.known(dag.externalizeall(sample))
full = True
result = dag.headsetofconnecteds(common)
if sample:
commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
common.addbases(commoninsample)
common.removeancestorsfrom(undecided)
# heads(common) == heads(common.bases) since common represents common.bases
# and all its ancestors
result = dag.headsetofconnecteds(common.bases)
# common.bases can include nullrev, but our contract requires us to not
# return any heads in that case, so discard that
result.discard(nullrev)
ui.progress(_('searching'), None)
ui.debug("%d total queries\n" % roundtrips)