sapling/hgext3rd/linkrevcache.py
# Copyright 2016-present Facebook. All Rights Reserved.
#
# linkrevcache: a simple caching layer to speed up _adjustlinkrev
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
"""a simple caching layer to speed up _adjustlinkrev
The linkrevcache extension could use a pre-built database to speed up some
_adjustlinkrev operations. The database is stored in the directory
'.hg/cache/linkrevdb'.
To use the extension, you need to prebuild the database using the
`debugbuildlinkrevcache` command, and then keep the extension enabled.
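
For example, a minimal setup might look like this (the extension path below is
illustrative; point it at wherever linkrevcache.py actually lives)::

    [extensions]
    linkrevcache = ~/hgext3rd/linkrevcache.py

followed by running `hg debugbuildlinkrevcache` once to build the initial
database.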

To update the database, run `debugbuildlinkrevcache` again. It will find new
revisions and fill the database incrementally.

If the building process is slow, try setting `checkancestor` to False.

The database is not updated on demand because of I/O and locking concerns.
That may be addressed if we get some (partially) "append-only" map-like data
structure.

The linkrev caching database generally speeds up `log` (following a file) and
`annotate` operations.

.. note::

    The database format is not guaranteed to be portable. Copying it from one
    machine to another may make it unreadable.

Config examples::

    [linkrevcache]
    # Whether to test ancestors or not. (default: True)
    # - When set to False, the build process will be faster, but the database
    #   will contain some unnecessary entries (mode-only changes and merges
    #   where the file node is reused).
    # - When set to True, the database won't contain unnecessary entries.
    checkancestor = False

    # Whether to read the filelog or not. (default: True)
    # - When set to False, the build process will be faster, but the database
    #   will probably be much larger.
    # - When set to True, the filelog will be read and existing linkrevs won't
    #   be stored in the database.
    readfilelog = False
"""
import os
import shutil
import sys
from mercurial import (
    context,
    extensions,
    filelog,
    node,
    registrar,
    util,
)
from mercurial.i18n import _
testedwith = 'ships-with-fb-hgext'
cmdtable = {}
command = registrar.command(cmdtable)
_chosendbm = None
def _choosedbm():
    """return (name, module)"""
    global _chosendbm
    if not _chosendbm:
        if sys.version_info >= (3, 0):
            candidates = [('gdbm', 'dbm.gnu'), ('ndbm', 'dbm.ndbm'),
                          ('dumb', 'dbm.dumb')]
        else:
            candidates = [('gdbm', 'gdbm'), ('bsd', 'dbhash'),
                          ('ndbm', 'dbm'), ('dumb', 'dumbdbm')]
        for name, modname in candidates:
            try:
                mod = __import__(modname)
                mod.open  # sanity check with demandimport enabled
                _chosendbm = (name, mod)
                break
            except ImportError:
                pass
    return _chosendbm
# dbm is a bytes -> bytes map, so we need to convert integers to bytes.
# The conversion functions below are optimized for space usage.
# We do not use struct.(un)pack because we may have values larger than 4 bytes
# (revlog defines the revision number to be 6 bytes) and 8 bytes is wasteful.
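# Illustrative example (not part of the original code) of the encoding below:
# _int2str(300) encodes to the two bytes '\x01\x2c', and _intlist2str([1, 300])
# encodes to '\x01\x01\x02\x01\x2c' - each integer is stored with a one-byte
# length prefix, so _str2intlist('\x01\x01\x02\x01\x2c') == [1, 300].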
def _strinc(s):
    """return the "next" string. useful as an incremental "ID"."""
    if not s:
        # avoid '\0' so '\0' could be used as a separator
        return '\x01'
    n = ord(s[-1])
    if n == 255:
        return _strinc(s[:-1]) + '\x01'
    else:
        return s[:-1] + chr(n + 1)

def _str2int(s):
    # this is faster than "bytearray().extend(map(ord, s))"
    x = 0
    for ch in s:
        x <<= 8
        x += ord(ch)
    return x

def _int2str(x):
    s = ''
    while x:
        s = chr(x & 255) + s
        x >>= 8
    return s

def _intlist2str(intlist):
    result = ''
    for n in intlist:
        s = _int2str(n)
        l = len(s)
        # do not accept huge integers
        assert l < 256
        result += chr(l) + s
    return result

def _str2intlist(s):
    result = []
    i = 0
    end = len(s)
    while i < end:
        l = ord(s[i])
        i += 1
        result.append(_str2int(s[i:i + l]))
        i += l
    return result
class linkrevdbreadonly(object):
    _openflag = 'r'

    # numbers are useful in the atomic replace case: they can be sorted
    # and replaced in a safer order. however, atomic caller should always
    # use repo lock so the order only protects things when the repo lock
    # does not work.
    _metadbname = '0meta'
    _pathdbname = '1path'
    _nodedbname = '2node'
    _linkrevdbname = '3linkrev'

    def __init__(self, dirname):
        dbmname, self._dbm = _choosedbm()
        # use different file names for different dbm engine, to make the repo
        # rsync-friendly across different platforms.
        self._path = os.path.join(dirname, dbmname)
        self._dbs = {}

    def getlinkrevs(self, path, fnode):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        try:
            pathid = pathdb[path]
            nodeid = nodedb[fnode]
            v = lrevdb[pathid + '\0' + nodeid]
            return _str2intlist(v)
        except KeyError:
            return []

    def getlastrev(self):
        return _str2int(self._getmeta('lastrev'))

    def close(self):
        # the check is necessary if __init__ fails - the caller may call
        # "close" in a "finally" block and it probably does not want close() to
        # raise an exception there.
        if util.safehasattr(self, '_dbs'):
            for db in self._dbs.itervalues():
                db.close()
            self._dbs.clear()

    def _getmeta(self, name):
        try:
            return self._getdb(self._metadbname)[name]
        except KeyError:
            return ''

    def _getdb(self, name):
        if name not in self._dbs:
            self._dbs[name] = self._dbm.open(self._path + name, self._openflag)
        return self._dbs[name]
class linkrevdbreadwrite(linkrevdbreadonly):
    _openflag = 'c'

    def __init__(self, dirname):
        util.makedirs(dirname)
        super(linkrevdbreadwrite, self).__init__(dirname)

    def appendlinkrev(self, path, fnode, linkrev):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        metadb = self._getdb(self._metadbname)

        try:
            pathid = pathdb[path]
        except KeyError:
            pathid = _strinc(self._getmeta('pathid'))
            pathdb[path] = pathid
            metadb['pathid'] = pathid

        try:
            nodeid = nodedb[fnode]
        except KeyError:
            nodeid = _strinc(self._getmeta('nodeid'))
            nodedb[fnode] = nodeid
            metadb['nodeid'] = nodeid

        k = pathid + '\0' + nodeid
        try:
            v = _str2intlist(lrevdb[k])
        except KeyError:
            v = []
        if linkrev in v:
            return
        v.append(linkrev)
        lrevdb[k] = _intlist2str(v)

    def setlastrev(self, rev):
        self._getdb(self._metadbname)['lastrev'] = _int2str(rev)
class linkrevdbwritewithtemprename(linkrevdbreadwrite):
    # Some dbms (ex. gdbm) disallow a writer and a reader to co-exist. This
    # class basically works around that so a writer can still write to the
    # (copied) database while there is a reader.
    #
    # Unlike "atomictemp", this applies to a directory. A directory cannot
    # work like "atomictemp" unless a symlink is used. Symlinks are not
    # portable, so we don't use them. Therefore this is not atomic (while
    # probably good enough because we write files in a reasonable order - in
    # the worst case, we just drop those cache files).
    #
    # Ideally, we could have other dbms which allow a reader and a writer to
    # co-exist, and this class would become unnecessary.

    def __init__(self, dirname):
        self._origpath = dirname
        head, tail = os.path.split(dirname)
        tempdir = '%s-%s' % (dirname, os.getpid())
        self._tempdir = tempdir
        try:
            shutil.copytree(dirname, tempdir)
            super(linkrevdbwritewithtemprename, self).__init__(tempdir)
        except Exception:
            shutil.rmtree(tempdir)
            raise

    def close(self):
        super(linkrevdbwritewithtemprename, self).close()
        if util.safehasattr(self, '_tempdir'):
            for name in sorted(os.listdir(self._tempdir)):
                oldpath = os.path.join(self._tempdir, name)
                newpath = os.path.join(self._origpath, name)
                os.rename(oldpath, newpath)
            os.rmdir(self._tempdir)
def linkrevdb(dirname, write=False, copyonwrite=False):
    # As commented in "linkrevdbwritewithtemprename" above, these flags
    # (write, copyonwrite) are mainly designed to work around gdbm's locking
    # issues. If we had a dbm that used a less aggressive lock, we could get
    # rid of these workarounds.
    if not write:
        return linkrevdbreadonly(dirname)
    else:
        if copyonwrite:
            return linkrevdbwritewithtemprename(dirname)
        else:
            return linkrevdbreadwrite(dirname)
_linkrevdbpath = 'cache/linkrevdb'
def reposetup(ui, repo):
    if repo.local():
        # if the repo is single headed, adjustlinkrev can just return linkrev
        repo._singleheaded = (len(repo.unfiltered().changelog.headrevs()) == 1)
        dbpath = repo.vfs.join(_linkrevdbpath)
        setattr(repo, '_linkrevcache', linkrevdb(dbpath, write=False))
@command('debugbuildlinkrevcache',
         [('e', 'end', '', _('end revision')),
          ('', 'copy', False, _('copy the database files to modify them '
                                'lock-free (EXPERIMENTAL)'))])
def debugbuildlinkrevcache(ui, repo, *pats, **opts):
    """build the linkrev database from filelogs"""
    db = linkrevdb(repo.vfs.join(_linkrevdbpath), write=True,
                   copyonwrite=opts.get('copy'))
    end = int(opts.get('end') or (len(repo) - 1))
    try:
        _buildlinkrevcache(ui, repo, db, end)
    finally:
        db.close()
def _buildlinkrevcache(ui, repo, db, end):
    checkancestor = ui.configbool('linkrevcache', 'checkancestor', True)
    readfilelog = ui.configbool('linkrevcache', 'readfilelog', True)

    repo = repo.unfiltered()
    cl = repo.changelog
    idx = cl.index
    ml = repo.manifestlog

    filelogcache = {}

    def _getfilelog(path):
        if path not in filelogcache:
            filelogcache[path] = filelog.filelog(repo.svfs, path)
        return filelogcache[path]

    start = db.getlastrev() + 1

    # the number of ancestor tests above which the slow (Python) stateful
    # (cached ancestors) algorithm becomes faster than the fast (C) stateless
    # (walk through the changelog index every time) algorithm.
    ancestorcountthreshold = 10

    for rev in xrange(start, end + 1):
        ui.progress(_('building'), rev, total=end, unit=_('changesets'))
        clr = cl.changelogrevision(rev)
        md = ml[clr.manifest].readfast()

        if checkancestor:
            if len(clr.files) >= ancestorcountthreshold:
                # we may need to frequently test ancestors against rev;
                # in this case, pre-calculating rev's ancestors helps.
                ancestors = cl.ancestors([rev])

                def isancestor(x):
                    return x in ancestors
            else:
                # the C index ancestor testing is faster than Python's
                # lazyancestors.
                def isancestor(x):
                    return x in idx.commonancestorsheads(x, rev)

        for path in clr.files:
            if path not in md:
                continue
            fnode = md[path]

            if readfilelog:
                fl = _getfilelog(path)
                frev = fl.rev(fnode)
                lrev = fl.linkrev(frev)
                if lrev == rev:
                    continue
            else:
                lrev = None

            if checkancestor:
                linkrevs = set(db.getlinkrevs(path, fnode))
                if lrev is not None:
                    linkrevs.add(lrev)
                if rev in linkrevs:
                    continue
                if any(isancestor(l) for l in linkrevs):
                    continue

            # found a new linkrev!
            if ui.debugflag:
                ui.debug('%s@%s: new linkrev %s\n'
                         % (path, node.hex(fnode), rev))
            db.appendlinkrev(path, fnode, rev)
        db.setlastrev(rev)

    ui.write()  # clear progress bar
@command('debugverifylinkrevcache', [])
def debugverifylinkrevcache(ui, repo, *pats, **opts):
    """read the linkrevs from the database and verify if they are correct"""
    # restore the original _adjustlinkrev implementation
    c = context.basefilectx
    extensions.unwrapfunction(c, '_adjustlinkrev', _adjustlinkrev)

    paths = {}  # {id: name}
    nodes = {}  # {id: name}

    repo = repo.unfiltered()
    idx = repo.unfiltered().changelog.index

    db = repo._linkrevcache
    paths = dict(db._getdb(db._pathdbname))
    nodes = dict(db._getdb(db._nodedbname))
    pathsrev = dict((v, k) for k, v in paths.iteritems())
    nodesrev = dict((v, k) for k, v in nodes.iteritems())
    lrevs = dict(db._getdb(db._linkrevdbname))

    readfilelog = ui.configbool('linkrevcache', 'readfilelog', True)

    total = len(lrevs)
    for i, (k, v) in enumerate(lrevs.iteritems()):
        ui.progress(_('verifying'), i, total=total)
        pathid, nodeid = k.split('\0')
        path = pathsrev[pathid]
        fnode = nodesrev[nodeid]
        linkrevs = _str2intlist(v)
        linkrevs.sort()

        for linkrev in linkrevs:
            fctx = repo[linkrev][path]
            introrev = fctx.introrev()
            fctx.linkrev()
            if readfilelog:
                flinkrev = fctx.linkrev()
            else:
                flinkrev = None
            if introrev == linkrev:
                continue
            if (introrev in idx.commonancestorsheads(introrev, linkrev) and
                    (introrev in linkrevs or introrev == flinkrev)):
                adjective = _('unnecessary')
            else:
                adjective = _('incorrect')
            ui.warn(_('%s linkrev %s for %s @ %s (expected: %s)\n')
                    % (adjective, linkrev, path, node.hex(fnode),
                       introrev))

    ui.write(_('%d entries verified\n') % total)
def _adjustlinkrev(orig, self, *args, **kwds):
    lkr = self.linkrev()
    repo = self._repo

    # for a repo with only a single head, linkrev is accurate
    if getattr(repo, '_singleheaded', False):
        return lkr

    # args can be "path, flog, fnode, srcrev", or "srcrev" - see e81d72b4b0ae
    srcrev = args[-1]

    cache = getattr(self._repo, '_linkrevcache', None)
    if cache is not None and srcrev is not None:
        index = repo.unfiltered().changelog.index
        try:
            linkrevs = set(cache.getlinkrevs(self._path, self._filenode))
        except Exception:
            # the database may be locked - cannot be used correctly
            linkrevs = set()
        finally:
            # do not keep the database open so others can write to it.
            # note: this is bad for perf, but it's here to work around the
            # gdbm locking pattern: reader and writer cannot co-exist. if we
            # have a dbm engine that locks differently, we don't need this.
            cache.close()
        linkrevs.add(lkr)
        for rev in sorted(linkrevs):  # sorted filters out unnecessary linkrevs
            if rev in index.commonancestorsheads(rev, srcrev):
                return rev

    # fall back to the possibly slow implementation
    return orig(self, *args, **kwds)
def uisetup(ui):
    c = context.basefilectx
    extensions.wrapfunction(c, '_adjustlinkrev', _adjustlinkrev)