fastannotate: implement the annotate algorithm

Summary:
This diff implements the `annotate` algorithm. Unlike the vanilla one, the
annotate method takes 2 revisions: the revision specified for annotating,
and the head of the main branch. The algorithm will do a "hybrid" annotate:
incrementally update the linelog (the cache) so it can answer queries of
any revision in the main branch. And use the traditional algorithm to deal
with revisions not in the main branch: like a side branch of a merge commit,
or the revision the user specified not in the main branch.

The main branch is supposed to be something like `master` or `@`, and their
p1s.

Building up the linelog for the main branch (with merges handled reasonably)
and the non-linelog part that produces the final result share a lot of
internal state and logic, so they are deeply coupled. Splitting them would
probably reduce performance, or leave no clean way to share that internal
state. If the caller only wants to build the linelog without annotating
anything, just pass `rev = master`.

While some attempts are made to support "merge" changesets, the result can
still differ from the vanilla one sometimes. In those cases, both results
make sense. It's really hard, if not impossible, to make the new
implementation 100% identical to the vanilla one because of the linear
history restriction of linelog, so I guess it's good enough for now. The
differences will be covered by a `.t` test later.


Test Plan: Code Review. A `.t` file will be added.

Reviewers: #sourcecontrol, stash

Reviewed By: stash

Subscribers: stash, mjpieters

Differential Revision: https://phabricator.intern.facebook.com/D3836438

Signature: t1:3836438:1473778829:27978479a01920833fa146f427178292ea1f5306
This commit is contained in:
Jun Wu 2016-09-08 16:54:21 +01:00
parent 943e37217d
commit 60eb8a2c22

View File

@ -12,12 +12,17 @@ import os
from fastannotate import (
revmap as revmapmod,
error as faerror,
)
from mercurial import (
lock as lockmod,
mdiff,
node,
scmutil,
util,
)
from mercurial.i18n import _
import linelog as linelogmod
@ -116,6 +121,302 @@ class _annotatecontext(object):
self.revmap = revmap
self.opts = opts
def annotate(self, rev, master=None, showpath=False, showlines=False):
    """incrementally update the cache so it includes revisions in the main
    branch till 'master'. and run annotate on 'rev', which may or may not be
    included in the main branch.

    if master is None, do not update linelog. if master is a callable, call
    it to get the actual master, which can save some time if we don't need
    to resolve the master.

    the first value returned is the annotate result, it is [(node, linenum)]
    by default. [(node, linenum, path)] if showpath is True.

    if showlines is True, a second value will be returned, it is a list of
    corresponding line contents.

    _checklastmasterhead is called when the linelog needs updating; it
    raises faerror.CannotReuseError if the existing linelog cannot be
    extended incrementally.
    """
    # fast path: if rev is in the main branch already
    directly, revfctx = self.canannotatedirectly(rev)
    if directly:
        if self.ui.debugflag:
            self.ui.debug('fastannotate: %s: no need to update linelog\n'
                          % self.path)
        return self.annotatedirectly(revfctx, showpath, showlines)

    # resolve master
    masterfctx = None
    if master:
        if callable(master):
            master = master()
        masterfctx = _getbase(scmutil.revsingle(self.repo,
                                                master)[self.path])
        # master is already recorded in revmap: nothing to append, so treat
        # it the same as "no master given"
        if masterfctx in self.revmap:
            masterfctx = None

    #                  ... - @ <- rev (can be an arbitrary changeset,
    #                       /         not necessarily a descendant
    #      master -> o                of master)
    #                |
    #     a merge -> o         'o': new changesets in the main branch
    #                |\        '#': revisions in the main branch that
    #                o *            exist in linelog / revmap
    #                | .       '*': changesets in side branches, or
    # last master -> # .            descendants of master
    #                | .
    #                # *       joint: '#', and is a parent of a '*'
    #                |/
    #     a joint -> #         ^^^^ --- side branches
    #                |
    #                ^ --- main branch (in linelog)

    # these DFSes are similar to the traditional annotate algorithm.
    # we cannot really reuse the code for perf reason.

    # 1st DFS calculates merges, joint points, and needed.
    # "needed" is a simple reference counting dict to free items in
    # "hist", reducing its memory usage otherwise could be huge.
    initvisit = [revfctx]
    if masterfctx:
        initvisit.append(masterfctx)
    visit = initvisit[:]
    pcache = {}  # {fctx: [parent fctx]}, filled by this DFS
    needed = {revfctx: 1}  # revfctx is read once more to build the result
    hist = {} # {fctx: ([(llrev or fctx, linenum)], text)}
    while visit:
        f = visit.pop()
        if f in pcache or f in hist:
            continue
        if f in self.revmap: # in the old main branch, it's a joint
            # joints are answered by linelog directly; do not walk further
            llrev = self.revmap.hsh2rev(f.node())
            self.linelog.annotate(llrev)
            result = self.linelog.annotateresult
            hist[f] = (result, f.data())
            continue
        pl = self._parentfunc(f)
        pcache[f] = pl
        for p in pl:
            # one reference per child that will consume hist[p]
            needed[p] = needed.get(p, 0) + 1
            if p not in pcache:
                visit.append(p)

    # 2nd (simple) DFS calculates new changesets in the main branch
    # ('o' nodes in the above graph), so we know when to update linelog.
    # it only follows the first parent, starting from master.
    newmainbranch = set()
    f = masterfctx
    while f and f not in self.revmap:
        newmainbranch.add(f)
        pl = pcache[f]
        if pl:
            f = pl[0]
        else:
            # reached a revision without parents: no previous build position
            f = None
            break

    # f, if present, is the position where the last build stopped at, and
    # should be the "master" last time. check to see if we can continue
    # building the linelog incrementally. (we cannot if diverged)
    if masterfctx is not None:
        self._checklastmasterhead(f)

    if self.ui.debugflag:
        self.ui.debug('fastannotate: %s: %d new changesets in the main '
                      'branch\n' % (self.path, len(newmainbranch)))

    # prepare annotateresult so we can update linelog incrementally
    self.linelog.annotate(self.linelog.maxrev)

    # 3rd DFS does the actual annotate
    visit = initvisit[:]
    progress = 0
    while visit:
        # peek only: f stays on the stack until all its parents are in hist
        f = visit[-1]
        if f in hist or f in self.revmap:
            visit.pop()
            continue

        ready = True
        pl = pcache[f]
        for p in pl:
            if p not in hist:
                ready = False
                visit.append(p)
        if not ready:
            continue

        visit.pop()
        blocks = None # mdiff blocks, used for appending linelog
        ismainbranch = (f in newmainbranch)
        # curr is the same as the traditional annotate algorithm,
        # if we only care about linear history (do not follow merge),
        # then curr is not actually used.
        assert f not in hist
        # NOTE(review): _decorate and _pair are defined outside this method;
        # curr is indexed below as (annotated lines, text) - confirm there.
        curr = _decorate(f)
        for i, p in enumerate(pl):
            bs = list(mdiff.allblocks(hist[p][1], curr[1]))
            # only the diff against the first parent is written to linelog
            if i == 0 and ismainbranch:
                blocks = bs
            curr = _pair(hist[p], curr, bs)
            # drop hist[p] once its last consumer has used it
            if needed[p] == 1:
                del hist[p]
                del needed[p]
            else:
                needed[p] -= 1

        hist[f] = curr
        del pcache[f]

        if ismainbranch: # need to write to linelog
            if not self.ui.quiet:
                progress += 1
                self.ui.progress(_('building cache'), progress,
                                 total=len(newmainbranch))
            bannotated = None
            if len(pl) == 2 and self.opts.followmerge: # merge
                bannotated = curr[0]
            if blocks is None: # no parents, add an empty one
                blocks = list(mdiff.allblocks('', curr[1]))
            self._appendrev(f, blocks, bannotated)

    if progress: # clean progress bar
        self.ui.write()

    # hist entries mix linelog revisions (int) and fctxs; convert both to
    # nodes for the caller
    result = [
        ((self.revmap.rev2hsh(f) if isinstance(f, int) else f.node()), l)
        for f, l in hist[revfctx][0]]
    return self._refineannotateresult(result, revfctx, showpath, showlines)
def canannotatedirectly(self, rev):
    """(rev) -> bool, fctx or node.

    return (True, f) if we can annotate without updating the linelog, pass
    f to annotatedirectly. in the fastest case f is a 20-byte node and no
    filectx is constructed at all.

    return (False, f) if we need extra calculation. f is the fctx resolved
    from rev.

    rev is anything scmutil.revsingle accepts. values without a length
    (e.g. an int or None) and 40-char names that are not valid hex are
    never treated as nodes; they are resolved through revsingle instead of
    raising here.
    """
    try:
        size = len(rev)
    except TypeError: # e.g. int or None: cannot be a node
        size = None

    hsh = None
    if size == 20:
        hsh = rev
    elif size == 40:
        try:
            hsh = node.bin(rev)
        except (TypeError, ValueError): # a 40-char name that is not hex
            hsh = None

    if hsh is not None and hsh in self.revmap:
        return True, hsh

    # slow path: resolve the revision and look the resulting fctx up
    f = _getbase(scmutil.revsingle(self.repo, rev)[self.path])
    return f in self.revmap, f
def annotatedirectly(self, f, showpath, showlines):
    """annotate a revision that is already recorded in linelog.

    f is either a 20-char str (node) or a fctx. accepting a bare node is a
    perf shortcut: in the best case neither the filelog is read nor any
    filecontext is constructed.

    the return value has the same shape as the one of annotate.
    """
    hsh = f if isinstance(f, str) else f.node()
    revmap = self.revmap
    llrev = revmap.hsh2rev(hsh)
    # the caller promised f is known, and on the main branch
    assert llrev
    assert (revmap.rev2flag(llrev) & revmapmod.sidebranchflag) == 0
    self.linelog.annotate(llrev)
    # translate linelog revisions back to nodes
    result = []
    for r, l in self.linelog.annotateresult:
        result.append((revmap.rev2hsh(r), l))
    return self._refineannotateresult(result, f, showpath, showlines)
def _refineannotateresult(self, result, f, showpath, showlines):
"""add the missing path or line contents, they can be expensive.
f could be either node or fctx.
"""
if showpath:
result = self._addpathtoresult(result)
if showlines:
if isinstance(f, str): # f: node or fctx
llrev = self.revmap.hsh2rev(f)
fctx = self.repo[f][self.revmap.rev2path(llrev)]
else:
fctx = f
lines = mdiff.splitnewlines(fctx.data())
assert len(lines) == len(result)
result = (result, lines)
return result
def _appendrev(self, fctx, blocks, bannotated=None):
    """append fctx to this context's linelog and revmap.

    thin wrapper: forwards to the static _doappendrev, supplying
    self.linelog and self.revmap.
    """
    self._doappendrev(self.linelog, self.revmap, fctx, blocks, bannotated)
@staticmethod
def _doappendrev(linelog, revmap, fctx, blocks, bannotated=None):
"""append a revision to linelog and revmap"""
def getllrev(f):
"""(fctx) -> int"""
# f should not be a linelog revision
assert not isinstance(f, int)
# f is a fctx, allocate linelog rev on demand
hsh = f.node()
rev = revmap.hsh2rev(hsh)
if rev is None:
rev = revmap.append(hsh, sidebranch=True, path=f.path())
return rev
# append sidebranch revisions to revmap
siderevs = []
siderevmap = {} # node: int
if bannotated is not None:
for (a1, a2, b1, b2), op in blocks:
if op != '=':
# f could be either linelong rev, or fctx.
siderevs += [f for f, l in bannotated[b1:b2]
if not isinstance(f, int)]
siderevs = set(siderevs)
if fctx in siderevs: # mainnode must be appended seperately
siderevs.remove(fctx)
for f in siderevs:
siderevmap[f] = getllrev(f)
# the changeset in the main branch, could be a merge
llrev = revmap.append(fctx.node(), path=fctx.path())
siderevmap[fctx] = llrev
for (a1, a2, b1, b2), op in reversed(blocks):
if op == '=':
continue
if bannotated is None:
linelog.replacelines(llrev, a1, a2, b1, b2)
else:
blines = [((r if isinstance(r, int) else siderevmap[r]), l)
for r, l in bannotated[b1:b2]]
linelog.replacelines_vec(llrev, a1, a2, blines)
def _addpathtoresult(self, annotateresult, revmap=None):
"""(revmap, [(node, linenum)]) -> [(node, linenum, path)]"""
if revmap is None:
revmap = self.revmap
nodes = set([n for n, l in annotateresult])
paths = dict((n, revmap.rev2path(revmap.hsh2rev(n))) for n in nodes)
return [(n, l, paths[n]) for n, l in annotateresult]
def _checklastmasterhead(self, fctx):
"""check if fctx is the master's head last time, raise if not"""
if fctx is None:
llrev = 0
else:
llrev = self.revmap.hsh2rev(fctx.node())
assert llrev
if self.linelog.maxrev != llrev:
raise faerror.CannotReuseError()
@util.propertycache
def _parentfunc(self):
    """-> (fctx) -> [fctx]

    build the function used to look up parents, honoring the followrename
    and followmerge options. the options are read once, when the property
    is computed.
    """
    followrename = self.opts.followrename
    firstparentonly = not self.opts.followmerge

    def parents(f):
        pl = _parents(f, follow=followrename)
        # without followmerge, only the first parent is kept
        return pl[:1] if firstparentonly else pl

    return parents
def _unlinkpaths(paths):
"""silent, best-effort unlink"""
for path in paths: