# Copyright 2016-present Facebook. All Rights Reserved.
#
# linkrevcache: a simple caching layer to speed up _adjustlinkrev
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

"""a simple caching layer to speed up _adjustlinkrev

The linkrevcache extension uses a pre-built database to speed up some
_adjustlinkrev operations. The database is stored in the directory
'.hg/cache/linkrevdb'.

To use the extension, prebuild the database with the `debugbuildlinkrevcache`
command, then keep the extension enabled.

To update the database, run `debugbuildlinkrevcache` again. It finds new
revisions and fills the database incrementally.

If the building process is slow, try setting `checkancestor` to False.

The database is not updated on demand, for I/O and locking reasons. That may
be addressed if we get some (partially) "append-only" map-like data
structure.

The linkrev caching database generally speeds up the log (following a
file) and annotate operations.

.. note::

    The database format is not guaranteed to be portable. Copying it from one
    machine to another may make it unreadable.

Config examples::

    [linkrevcache]
    # Whether to test ancestors or not. (default: True)
    # - When set to False, the build process is faster, but the database
    #   will contain some unnecessary entries (mode-only changes and merges
    #   where the file node is reused).
    # - When set to True, the database won't contain unnecessary entries.
    checkancestor = False

    # Whether to read the filelog or not. (default: True)
    # - When set to False, the build process is faster, but the database
    #   will probably be much larger.
    # - When set to True, the filelog is read and existing linkrevs are not
    #   stored in the database.
    readfilelog = False
"""

import os
import shutil
import sys

from mercurial import (
    context,
    extensions,
    filelog,
    node,
    registrar,
    util,
)
from mercurial.i18n import _

testedwith = 'ships-with-fb-hgext'

cmdtable = {}
command = registrar.command(cmdtable)

_chosendbm = None

def _choosedbm():
    """return (name, module)"""
    global _chosendbm
    if not _chosendbm:
        if sys.version_info >= (3, 0):
            candidates = [('gdbm', 'dbm.gnu'), ('ndbm', 'dbm.ndbm'),
                          ('dumb', 'dbm.dumb')]
        else:
            candidates = [('gdbm', 'gdbm'), ('bsd', 'dbhash'),
                          ('ndbm', 'dbm'), ('dumb', 'dumbdbm')]
        for name, modname in candidates:
            try:
                mod = __import__(modname)
                mod.open  # sanity check with demandimport enabled
                _chosendbm = (name, mod)
                break
            except ImportError:
                pass
    return _chosendbm
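
# _choosedbm() returns a (name, module) pair - e.g. ('gdbm', <module 'gdbm'>)
# on a Python 2 install with gdbm available. The short name is embedded in
# the database file names (see linkrevdbreadonly.__init__), so databases
# created by different dbm engines do not clash.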

# dbm is a bytes -> bytes map, so we need to convert integers to bytes.
# the conversion functions are optimized for space usage.
# not using struct.(un)pack is because we may have things > 4 bytes (revlog
# defines the revision number to be 6 bytes) and 8-byte is wasteful.

def _strinc(s):
    """return the "next" string. useful as an incremental "ID"."""
    if not s:
        # avoid '\0' so '\0' could be used as a separator
        return '\x01'
    n = ord(s[-1])
    if n == 255:
        return _strinc(s[:-1]) + '\x01'
    else:
        return s[:-1] + chr(n + 1)

def _str2int(s):
    # this is faster than "bytearray().extend(map(ord, s))"
    x = 0
    for ch in s:
        x <<= 8
        x += ord(ch)
    return x

def _int2str(x):
    s = ''
    while x:
        s = chr(x & 255) + s
        x >>= 8
    return s

def _intlist2str(intlist):
    result = ''
    for n in intlist:
        s = _int2str(n)
        l = len(s)
        # do not accept huge integers
        assert l < 256
        result += chr(l) + s
    return result

def _str2intlist(s):
    result = []
    i = 0
    end = len(s)
    while i < end:
        l = ord(s[i])
        i += 1
        result.append(_str2int(s[i:i + l]))
        i += l
    return result
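
# A quick round-trip illustration of the helpers above (values worked out by
# hand from the definitions, shown for reference):
#
#   _int2str(1000) == '\x03\xe8'          _str2int('\x03\xe8') == 1000
#   _intlist2str([1000, 2]) == '\x02\x03\xe8\x01\x02'
#   _str2intlist('\x02\x03\xe8\x01\x02') == [1000, 2]
#   _strinc('') == '\x01',  _strinc('\x01') == '\x02',
#   _strinc('\xff') == '\x01\x01'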

class linkrevdbreadonly(object):
    _openflag = 'r'

    # the numbers are useful in the atomic replace case: they can be sorted
    # and replaced in a safer order. however, an atomic caller should always
    # take the repo lock, so the order only protects things when the repo
    # lock does not work.
    _metadbname = '0meta'
    _pathdbname = '1path'
    _nodedbname = '2node'
    _linkrevdbname = '3linkrev'

    def __init__(self, dirname):
        dbmname, self._dbm = _choosedbm()
        # use different file names for different dbm engines, to make the
        # repo rsync-friendly across different platforms.
        self._path = os.path.join(dirname, dbmname)
        self._dbs = {}

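    # With the gdbm engine, for example, the four databases are opened at
    # paths like '.hg/cache/linkrevdb/gdbm0meta' through
    # '.hg/cache/linkrevdb/gdbm3linkrev' (self._path plus the suffixes above;
    # some engines may append their own file extensions).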
    def getlinkrevs(self, path, fnode):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        try:
            pathid = pathdb[path]
            nodeid = nodedb[fnode]
            v = lrevdb[pathid + '\0' + nodeid]
            return _str2intlist(v)
        except KeyError:
            return []

    def getlastrev(self):
        return _str2int(self._getmeta('lastrev'))

    def close(self):
        # the check is necessary if __init__ fails - the caller may call
        # "close" in a "finally" block and it probably does not want close()
        # to raise an exception there.
        if util.safehasattr(self, '_dbs'):
            for db in self._dbs.itervalues():
                db.close()
            self._dbs.clear()

    def _getmeta(self, name):
        try:
            return self._getdb(self._metadbname)[name]
        except KeyError:
            return ''

    def _getdb(self, name):
        if name not in self._dbs:
            self._dbs[name] = self._dbm.open(self._path + name, self._openflag)
        return self._dbs[name]

class linkrevdbreadwrite(linkrevdbreadonly):
    _openflag = 'c'

    def __init__(self, dirname):
        util.makedirs(dirname)
        super(linkrevdbreadwrite, self).__init__(dirname)

    def appendlinkrev(self, path, fnode, linkrev):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        metadb = self._getdb(self._metadbname)
        try:
            pathid = pathdb[path]
        except KeyError:
            pathid = _strinc(self._getmeta('pathid'))
            pathdb[path] = pathid
            metadb['pathid'] = pathid
        try:
            nodeid = nodedb[fnode]
        except KeyError:
            nodeid = _strinc(self._getmeta('nodeid'))
            nodedb[fnode] = nodeid
            metadb['nodeid'] = nodeid
        k = pathid + '\0' + nodeid
        try:
            v = _str2intlist(lrevdb[k])
        except KeyError:
            v = []
        if linkrev in v:
            return
        v.append(linkrev)
        lrevdb[k] = _intlist2str(v)
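
    # Id allocation, illustrated: the first path ever seen gets pathid
    # '\x01', the next '\x02', ..., and '\x01\x01' follows '\xff' (see
    # _strinc). Ids never contain a '\0' byte, so the "pathid + '\0' +
    # nodeid" key above is unambiguous.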

    def setlastrev(self, rev):
        self._getdb(self._metadbname)['lastrev'] = _int2str(rev)

class linkrevdbwritewithtemprename(linkrevdbreadwrite):
    # Some dbms (ex. gdbm) disallow a writer and a reader to co-exist. This
    # class basically works around that so a writer can still write to the
    # (copied) database while there is a reader.
    # Unlike "atomictemp", this applies to a directory. A directory cannot
    # work like "atomictemp" unless symlinks are used. Symlinks are not
    # portable, so we don't use them. Therefore this is not atomic (while
    # probably good enough because we write files in a reasonable order - in
    # the worst case, we just drop those cache files).
    # Ideally, we could have another dbm which supports a reader and a writer
    # co-existing, and this would become unnecessary.
    def __init__(self, dirname):
        self._origpath = dirname
        tempdir = '%s-%s' % (dirname, os.getpid())
        self._tempdir = tempdir
        try:
            shutil.copytree(dirname, tempdir)
            super(linkrevdbwritewithtemprename, self).__init__(tempdir)
        except Exception:
            shutil.rmtree(tempdir)
            raise

    def close(self):
        super(linkrevdbwritewithtemprename, self).close()
        if util.safehasattr(self, '_tempdir'):
            for name in sorted(os.listdir(self._tempdir)):
                oldpath = os.path.join(self._tempdir, name)
                newpath = os.path.join(self._origpath, name)
                os.rename(oldpath, newpath)
            os.rmdir(self._tempdir)

def linkrevdb(dirname, write=False, copyonwrite=False):
    # As commented in "linkrevdbwritewithtemprename" above, these flags
    # (write, copyonwrite) mainly exist to work around gdbm's locking
    # issues. If we had a dbm that used a less aggressive lock, we could get
    # rid of these workarounds.
    if not write:
        return linkrevdbreadonly(dirname)
    else:
        if copyonwrite:
            return linkrevdbwritewithtemprename(dirname)
        else:
            return linkrevdbreadwrite(dirname)

_linkrevdbpath = 'cache/linkrevdb'

def reposetup(ui, repo):
    if repo.local():
        # if the repo is single-headed, adjustlinkrev can just return linkrev
        repo._singleheaded = (len(repo.unfiltered().changelog.headrevs()) == 1)

        dbpath = repo.vfs.join(_linkrevdbpath)
        setattr(repo, '_linkrevcache', linkrevdb(dbpath, write=False))

@command('debugbuildlinkrevcache',
         [('e', 'end', '', _('end revision')),
          ('', 'copy', False, _('copy the database files to modify them '
                                'lock-free (EXPERIMENTAL)'))])
def debugbuildlinkrevcache(ui, repo, *pats, **opts):
    """build the linkrev database from filelogs"""
    db = linkrevdb(repo.vfs.join(_linkrevdbpath), write=True,
                   copyonwrite=opts.get('copy'))
    end = int(opts.get('end') or (len(repo) - 1))
    try:
        _buildlinkrevcache(ui, repo, db, end)
    finally:
        db.close()

def _buildlinkrevcache(ui, repo, db, end):
    checkancestor = ui.configbool('linkrevcache', 'checkancestor', True)
    readfilelog = ui.configbool('linkrevcache', 'readfilelog', True)

    repo = repo.unfiltered()
    cl = repo.changelog
    idx = cl.index
    ml = repo.manifestlog

    filelogcache = {}

    def _getfilelog(path):
        if path not in filelogcache:
            filelogcache[path] = filelog.filelog(repo.svfs, path)
        return filelogcache[path]

    start = db.getlastrev() + 1

    # the number of ancestor tests above which the slow (Python) stateful
    # (cached ancestors) algorithm is faster than the fast (C) stateless
    # (walk through the changelog index every time) algorithm.
    ancestorcountthreshold = 10

    for rev in xrange(start, end + 1):
        ui.progress(_('building'), rev, total=end, unit=_('changesets'))
        clr = cl.changelogrevision(rev)
        md = ml[clr.manifest].readfast()

        if checkancestor:
            if len(clr.files) >= ancestorcountthreshold:
                # we may need to frequently test ancestors against rev;
                # in this case, pre-calculating rev's ancestors helps.
                ancestors = cl.ancestors([rev])

                def isancestor(x):
                    return x in ancestors
            else:
                # the C index ancestor testing is faster than Python's
                # lazyancestors.
                def isancestor(x):
                    return x in idx.commonancestorsheads(x, rev)

        for path in clr.files:
            if path not in md:
                continue

            fnode = md[path]

            if readfilelog:
                fl = _getfilelog(path)
                frev = fl.rev(fnode)
                lrev = fl.linkrev(frev)
                if lrev == rev:
                    continue
            else:
                lrev = None

            if checkancestor:
                linkrevs = set(db.getlinkrevs(path, fnode))
                if lrev is not None:
                    linkrevs.add(lrev)
                if rev in linkrevs:
                    continue
                if any(isancestor(l) for l in linkrevs):
                    continue

            # found a new linkrev!
            if ui.debugflag:
                ui.debug('%s@%s: new linkrev %s\n'
                         % (path, node.hex(fnode), rev))

            db.appendlinkrev(path, fnode, rev)

        db.setlastrev(rev)

    ui.progress(_('building'), None)  # clear the progress bar
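
# The resulting database maps a (path, filenode) pair to the linkrevs found
# beyond the single one recorded in the filelog (with readfilelog=False, all
# discovered linkrevs are stored). An illustrative read, from a hypothetical
# debug session:
#
#   >>> repo._linkrevcache.getlinkrevs('some/file.py', fnode)
#   [12345, 67890]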

@command('debugverifylinkrevcache', [])
def debugverifylinkrevcache(ui, repo, *pats, **opts):
    """read the linkrevs from the database and verify that they are correct"""
    # restore the original _adjustlinkrev implementation
    c = context.basefilectx
    extensions.unwrapfunction(c, '_adjustlinkrev', _adjustlinkrev)

    repo = repo.unfiltered()
    idx = repo.changelog.index

    db = repo._linkrevcache
    paths = dict(db._getdb(db._pathdbname))  # {path: pathid}
    nodes = dict(db._getdb(db._nodedbname))  # {fnode: nodeid}
    pathsrev = dict((v, k) for k, v in paths.iteritems())  # {pathid: path}
    nodesrev = dict((v, k) for k, v in nodes.iteritems())  # {nodeid: fnode}
    lrevs = dict(db._getdb(db._linkrevdbname))

    readfilelog = ui.configbool('linkrevcache', 'readfilelog', True)

    total = len(lrevs)
    for i, (k, v) in enumerate(lrevs.iteritems()):
        ui.progress(_('verifying'), i, total=total)
        pathid, nodeid = k.split('\0')
        path = pathsrev[pathid]
        fnode = nodesrev[nodeid]
        linkrevs = _str2intlist(v)
        linkrevs.sort()

        for linkrev in linkrevs:
            fctx = repo[linkrev][path]
            introrev = fctx.introrev()
            if readfilelog:
                flinkrev = fctx.linkrev()
            else:
                flinkrev = None
            if introrev == linkrev:
                continue
            if (introrev in idx.commonancestorsheads(introrev, linkrev) and
                (introrev in linkrevs or introrev == flinkrev)):
                adjective = _('unnecessary')
            else:
                adjective = _('incorrect')
            ui.warn(_('%s linkrev %s for %s @ %s (expected: %s)\n')
                    % (adjective, linkrev, path, node.hex(fnode),
                       introrev))

    ui.write(_('%d entries verified\n') % total)
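
# Reading the verify output: an "unnecessary" entry is harmless - the
# introducing revision is an ancestor of the cached linkrev and is itself
# discoverable (cached or recorded in the filelog), so _adjustlinkrev would
# pick it first anyway; an "incorrect" entry would make the cache return a
# wrong revision. An illustrative warning line (values are hypothetical):
#
#   unnecessary linkrev 123 for some/file.py @ <hex filenode> (expected: 100)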

def _adjustlinkrev(orig, self, *args, **kwds):
    lkr = self.linkrev()
    repo = self._repo

    # for a repo with only a single head, linkrev is accurate
    if getattr(repo, '_singleheaded', False):
        return lkr

    # args can be "path, flog, fnode, srcrev", or "srcrev" - see e81d72b4b0ae
    srcrev = args[-1]
    cache = getattr(self._repo, '_linkrevcache', None)
    if cache is not None and srcrev is not None:
        index = repo.unfiltered().changelog.index
        try:
            linkrevs = set(cache.getlinkrevs(self._path, self._filenode))
        except Exception:
            # the database may be locked - cannot be used correctly
            linkrevs = set()
        finally:
            # do not keep the database open so others can write to it.
            # note: this is bad for perf, but it's here to work around the
            # gdbm locking pattern: reader and writer cannot co-exist. if we
            # had a dbm engine that locked differently, we wouldn't need this.
            cache.close()
        linkrevs.add(lkr)
        for rev in sorted(linkrevs):  # sorted, so the earliest match wins
            if rev in index.commonancestorsheads(rev, srcrev):
                return rev

    # fall back to the possibly slow implementation
    return orig(self, *args, **kwds)

def uisetup(ui):
    c = context.basefilectx
    extensions.wrapfunction(c, '_adjustlinkrev', _adjustlinkrev)