mirror of
https://github.com/facebook/sapling.git
synced 2024-10-16 11:52:02 +03:00
463cc8f299
Summary: Update most locations in the hg extensions to use `repo.localvfs` instead of `repo.vfs`. Reviewed By: quark-zju Differential Revision: D9699153 fbshipit-source-id: 48d5f9678caa4961063db30477d6fbe0d6f34347
506 lines
16 KiB
Python
506 lines
16 KiB
Python
# Copyright 2016-present Facebook. All Rights Reserved.
|
|
#
|
|
# linkrevcache: a simple caching layer to speed up _adjustlinkrev
|
|
#
|
|
# This software may be used and distributed according to the terms of the
|
|
# GNU General Public License version 2 or any later version.
|
|
|
|
"""a simple caching layer to speed up _adjustlinkrev
|
|
|
|
The linkrevcache extension could use a pre-built database to speed up some
|
|
_adjustlinkrev operations. The database is stored in the directory
|
|
'.hg/cache/linkrevdb'.
|
|
|
|
To use the extension, you need to prebuild the database using the
|
|
`debugbuildlinkrevcache` command, and then keep the extension enabled.
|
|
|
|
To update the database, run `debugbuildlinkrevcache` again. It would find new
|
|
revisions and fill the database incrementally.
|
|
|
|
If the building process is slow, try setting `checkancestor` to False.
|
|
|
|
The database won't be updated on demand for I/O and locking concerns. It may be
|
|
addressed if we could have some (partially) "append-only" map-like data
|
|
structure.
|
|
|
|
The linkrev caching database would generally speed up the log (following a
|
|
file) and annotate operations.
|
|
|
|
.. note::
|
|
|
|
The database format is not guaranteed portable. Copying it from a machine
|
|
to another may make it unreadable.
|
|
|
|
Config examples::
|
|
|
|
[linkrevcache]
|
|
# Whether to test ancestors or not. (default: True)
|
|
# - When set to False, the build process will be faster, while the database
|
|
# will contain some unnecessary entries (mode-only changes and merges
|
|
# where the file node is reused).
|
|
# - When set to True, the database won't contain unnecessary entries.
|
|
checkancestor = False
|
|
|
|
# Whether to read filelog or not. (default: True)
|
|
# - When set to False, the build process will be faster, while the database
|
|
# will be probably much larger.
|
|
# - When set to True, filelog will be read and existing linkrevs won't be
|
|
# stored in the database.
|
|
readfilelog = False
|
|
"""
|
|
|
|
import os
|
|
import shutil
|
|
import sys
|
|
|
|
from mercurial import context, extensions, filelog, node, progress, registrar, util
|
|
from mercurial.i18n import _
|
|
|
|
|
|
# NOTE(review): presumably the Mercurial extension compatibility marker -
# confirm against the extension loader.
testedwith = "ships-with-fb-hgext"

# command table populated through the @command decorator created from it
cmdtable = {}
command = registrar.command(cmdtable)

# (name, module) pair memoized by _choosedbm(); None until a dbm is chosen
_chosendbm = None
|
|
|
|
# Python 3 has no xrange; alias it to range so the rest of the module can
# keep using the lazy Python 2 spelling.
try:
    xrange
except NameError:
    xrange = range
|
|
|
|
|
|
def _choosedbm():
    """return (name, module)

    The first importable dbm implementation from a fixed preference list is
    picked and memoized in the module-level ``_chosendbm``; later calls
    return the memoized pair. If no candidate can be imported, the memoized
    value is returned unchanged (i.e. still unset).
    """
    global _chosendbm
    if not _chosendbm:
        if sys.version_info >= (3, 0):
            candidates = [
                ("gdbm", "dbm.gnu"),
                ("ndbm", "dbm.ndbm"),
                ("dumb", "dbm.dumb"),
            ]
        else:
            candidates = [
                ("gdbm", "gdbm"),
                ("bsd", "dbhash"),
                ("ndbm", "dbm"),
                ("dumb", "dumbdbm"),
            ]
        for name, modname in candidates:
            try:
                # a plain __import__("dbm.gnu") returns the top-level "dbm"
                # package; a non-empty fromlist makes it return the leaf
                # module that was actually asked for.
                mod = __import__(modname, fromlist=["open"])
                mod.open  # sanity check with demandimport enabled
            except ImportError:
                continue
            _chosendbm = (name, mod)
            break
    return _chosendbm
|
|
|
|
|
|
# dbm is a bytes -> bytes map, so we need to convert integers to bytes.
# the conversion functions are optimized for space usage.
# struct.(un)pack is not used because we may have things > 4 bytes (revlog
# defines the revision number to be 6 bytes) and 8 bytes would be wasteful.
|
|
|
|
|
|
def _strinc(s):
    """return the "next" string. useful as an incremental "ID"."""
    # strip the trailing "\xff" characters: each of them rolls over to
    # "\x01" ('\0' is avoided so '\0' could be used as a separator) and the
    # carry moves on to the character before them.
    rolled = 0
    while s and ord(s[-1]) == 255:
        s = s[:-1]
        rolled += 1
    if s:
        head = s[:-1] + chr(ord(s[-1]) + 1)
    else:
        # nothing left to carry into: start a new leading character
        head = "\x01"
    return head + "\x01" * rolled
|
|
|
|
|
|
def _str2int(s):
    """decode a big-endian string of characters into an integer

    Each character contributes its ordinal as one base-256 digit. The empty
    string decodes to 0.
    """
    # this is faster than "bytearray().extend(map(ord, s))"
    total = 0
    for char in s:
        total = total * 256 + ord(char)
    return total
|
|
|
|
|
|
def _int2str(x):
    """encode a non-negative integer as a minimal big-endian string

    Each character holds one base-256 digit; 0 encodes to the empty string.

    Raises ValueError for negative input: shifting a negative number right
    never reaches 0, so the encoding loop would not terminate.
    """
    if x < 0:
        raise ValueError("cannot encode negative integer: %r" % (x,))
    digits = []
    while x:
        # least significant digit first; reversed when joining
        digits.append(chr(x & 255))
        x >>= 8
    return "".join(reversed(digits))
|
|
|
|
|
|
def _intlist2str(intlist):
    """encode a list of integers as length-prefixed chunks

    Every integer is written as one length character followed by its
    _int2str encoding.
    """
    pieces = []
    for value in intlist:
        encoded = _int2str(value)
        size = len(encoded)
        # do not accept huge integers
        assert size < 256
        pieces.append(chr(size) + encoded)
    return "".join(pieces)
|
|
|
|
|
|
def _str2intlist(s):
    """decode a string of length-prefixed chunks into a list of integers

    Each chunk is one length character followed by that many characters,
    which are decoded with _str2int.
    """
    values = []
    pos, total = 0, len(s)
    while pos < total:
        size = ord(s[pos])
        start = pos + 1
        pos = start + size
        values.append(_str2int(s[start:pos]))
    return values
|
|
|
|
|
|
class linkrevdbreadonly(object):
    """read-only access to the linkrev database

    The database consists of several dbm files sharing one path prefix. Each
    of them is opened on first use with ``_openflag`` and kept in
    ``self._dbs`` until close() is called.
    """

    _openflag = "r"

    # numbers are useful in the atomic replace case: they can be sorted
    # and replaced in a safer order. however, atomic caller should always
    # use repo lock so the order only protects things when the repo lock
    # does not work.
    _metadbname = "0meta"
    _pathdbname = "1path"
    _nodedbname = "2node"
    _linkrevdbname = "3linkrev"

    def __init__(self, dirname):
        dbmname, self._dbm = _choosedbm()
        # use different file names for different dbm engine, to make the repo
        # rsync-friendly across different platforms.
        self._path = os.path.join(dirname, dbmname)
        self._dbs = {}

    def getlinkrevs(self, path, fnode):
        """return the list of linkrevs stored for (path, fnode)

        An empty list is returned when the path, the file node, or their
        combination is not in the database.
        """
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        try:
            pathid = pathdb[path]
            nodeid = nodedb[fnode]
            v = lrevdb[pathid + "\0" + nodeid]
            return _str2intlist(v)
        except KeyError:
            return []

    def getlastrev(self):
        """return the "lastrev" metadata as an integer (0 if not recorded)"""
        return _str2int(self._getmeta("lastrev"))

    def close(self):
        """close every opened dbm file and forget about them"""
        # the check is necessary if __init__ fails - the caller may call
        # "close" in a "finally" block and it probably does not want close() to
        # raise an exception there.
        dbs = getattr(self, "_dbs", None)
        if dbs is None:
            return
        # values() exists on both Python 2 and 3 (itervalues() does not)
        for db in list(dbs.values()):
            db.close()
        dbs.clear()

    def _getmeta(self, name):
        """return the metadata value stored under name, or "" if missing"""
        try:
            return self._getdb(self._metadbname)[name]
        except KeyError:
            return ""

    def _getdb(self, name):
        """return the dbm object for name, opening it on first use"""
        if name not in self._dbs:
            self._dbs[name] = self._dbm.open(self._path + name, self._openflag)
        return self._dbs[name]
|
|
|
|
|
|
class linkrevdbreadwrite(linkrevdbreadonly):
    """linkrev database opened with the "c" flag so entries can be added"""

    _openflag = "c"

    def __init__(self, dirname):
        util.makedirs(dirname)
        super(linkrevdbreadwrite, self).__init__(dirname)

    def appendlinkrev(self, path, fnode, linkrev):
        """record linkrev for (path, fnode) unless it is already stored

        Paths and file nodes are mapped to short incremental ids on first
        sight; the linkrev list is keyed by "<pathid>\\0<nodeid>".
        """
        pathdb, nodedb, lrevdb, metadb = [
            self._getdb(name)
            for name in (
                self._pathdbname,
                self._nodedbname,
                self._linkrevdbname,
                self._metadbname,
            )
        ]
        pathid = self._lookuporassign(pathdb, metadb, path, "pathid")
        nodeid = self._lookuporassign(nodedb, metadb, fnode, "nodeid")
        key = pathid + "\0" + nodeid
        try:
            known = _str2intlist(lrevdb[key])
        except KeyError:
            known = []
        if linkrev not in known:
            known.append(linkrev)
            lrevdb[key] = _intlist2str(known)

    def _lookuporassign(self, db, metadb, key, counter):
        """return db[key], assigning the next id from metadata if missing

        counter is the metadata name holding the most recently assigned id;
        it is advanced whenever a new id is handed out.
        """
        try:
            return db[key]
        except KeyError:
            newid = _strinc(self._getmeta(counter))
            db[key] = newid
            metadb[counter] = newid
            return newid

    def setlastrev(self, rev):
        """store rev as the "lastrev" metadata"""
        encoded = _int2str(rev)
        self._getdb(self._metadbname)["lastrev"] = encoded
|
|
|
|
|
|
class linkrevdbwritewithtemprename(linkrevdbreadwrite):
    """writable linkrev database that works on a per-process copy

    __init__ copies the database directory to "<dirname>-<pid>" and writes go
    to that copy; close() renames the copied files back over the originals.
    """

    # Some dbm (ex. gdbm) disallows writer and reader to co-exist. This is
    # basically to workaround that so a writer can still write to the (copied)
    # database when there is a reader.
    # Unlike "atomictemp", this applies to a directory. A directory cannot
    # work like "atomictemp" unless symlink is used. Symlink is not portable so
    # we don't use them. Therefore this is not atomic (while probably good
    # enough because we write files in a reasonable order - in the worst case,
    # we just drop those cache files).
    # Ideally, we can have other dbms which support reader and writer to
    # co-exist, and this will become unnecessary.
    def __init__(self, dirname):
        self._origpath = dirname
        tempdir = "%s-%s" % (dirname, os.getpid())
        self._tempdir = tempdir
        try:
            shutil.copytree(dirname, tempdir)
            super(linkrevdbwritewithtemprename, self).__init__(tempdir)
        except Exception:
            # best-effort cleanup: the temporary directory may not exist yet
            # (e.g. copytree failed early), and a failure to remove it must
            # not replace the exception that is being re-raised.
            shutil.rmtree(tempdir, ignore_errors=True)
            raise

    def close(self):
        """close the dbm files and move them back to the original directory"""
        super(linkrevdbwritewithtemprename, self).close()
        if util.safehasattr(self, "_tempdir"):
            # sorted: the numeric file name prefixes define the replace order
            for name in sorted(os.listdir(self._tempdir)):
                oldpath = os.path.join(self._tempdir, name)
                newpath = os.path.join(self._origpath, name)
                os.rename(oldpath, newpath)
            os.rmdir(self._tempdir)
|
|
|
|
|
|
def linkrevdb(dirname, write=False, copyonwrite=False):
    """return a linkrev database object for dirname

    Without write, a read-only database is returned. With write, a writable
    one is returned; copyonwrite additionally selects the variant that works
    on a temporary copy of the directory.
    """
    # As commented in the "linkrevdbwritewithtemprename" above, these flags
    # (write, copyonwrite) are mainly designed to workaround gdbm's locking
    # issues. If we have a dbm that uses a less aggressive lock, we could get
    # rid of these workarounds.
    if write and copyonwrite:
        dbclass = linkrevdbwritewithtemprename
    elif write:
        dbclass = linkrevdbreadwrite
    else:
        dbclass = linkrevdbreadonly
    return dbclass(dirname)
|
|
|
|
|
|
# location of the database directory; joined onto repo.localvfs by the
# callers in this file
_linkrevdbpath = "cache/linkrevdb"
|
|
|
|
|
|
def reposetup(ui, repo):
    """attach linkrevcache state to local repositories

    Sets ``repo._singleheaded`` and ``repo._linkrevcache`` (a read-only
    database object). Repositories for which ``repo.local()`` is false are
    left untouched.
    """
    if not repo.local():
        return

    # if the repo is single headed, adjustlinkrev can just return linkrev
    headrevs = repo.unfiltered().changelog.headrevs()
    repo._singleheaded = len(headrevs) == 1

    dbpath = repo.localvfs.join(_linkrevdbpath)
    repo._linkrevcache = linkrevdb(dbpath, write=False)
|
|
|
|
|
|
@command(
    "debugbuildlinkrevcache",
    [
        ("e", "end", "", _("end revision")),
        (
            "",
            "copy",
            False,
            _("copy the database files to modify them " "lock-free (EXPERIMENTAL)"),
        ),
    ],
)
def debugbuildlinkrevcache(ui, repo, *pats, **opts):
    """build the linkrev database from filelogs"""
    db = linkrevdb(
        repo.localvfs.join(_linkrevdbpath),
        write=True,
        # the flag is registered as "copy" in the option table above, so
        # that is the key it arrives under in opts.
        copyonwrite=opts.get("copy"),
    )
    # default to the last revision of the repo when --end is not given
    end = int(opts.get("end") or (len(repo) - 1))
    try:
        _buildlinkrevcache(ui, repo, db, end)
    finally:
        # always release the dbm files, even if the build is interrupted
        db.close()
|
|
|
|
|
|
def _buildlinkrevcache(ui, repo, db, end):
    """fill db with linkrevs for revisions db.getlastrev() + 1 .. end

    For every file touched by a revision, the revision is recorded as a
    linkrev of the file node unless the filelog already says so
    (``linkrevcache.readfilelog``) or a known linkrev is an ancestor of the
    revision (``linkrevcache.checkancestor``). The last processed revision is
    stored after each revision so that a later run can continue from there.
    """
    checkancestor = ui.configbool("linkrevcache", "checkancestor", True)
    readfilelog = ui.configbool("linkrevcache", "readfilelog", True)

    repo = repo.unfiltered()
    cl = repo.changelog
    idx = cl.index
    ml = repo.manifestlog

    openedfilelogs = {}

    def _getfilelog(path):
        # open each filelog at most once
        if path not in openedfilelogs:
            openedfilelogs[path] = filelog.filelog(repo.svfs, path)
        return openedfilelogs[path]

    firstrev = db.getlastrev() + 1

    # the number of ancestor tests when the slow (Python) stateful (cache
    # ancestors) algorithm is faster than the fast (C) stateless (walk through
    # the changelog index every time) algorithm.
    ancestorcountthreshold = 10

    with progress.bar(ui, _("building"), _("changesets"), end) as prog:
        for rev in xrange(firstrev, end + 1):
            prog.value = rev
            clr = cl.changelogrevision(rev)
            md = ml[clr.manifest].readfast()

            if checkancestor:
                if len(clr.files) >= ancestorcountthreshold:
                    # we may need to frequently test ancestors against rev,
                    # in this case, pre-calculating rev's ancestors helps.
                    ancestors = cl.ancestors([rev])

                    def isancestor(x):
                        return x in ancestors

                else:
                    # the C index ancestor testing is faster than Python's
                    # lazyancestors.
                    def isancestor(x):
                        return x in idx.commonancestorsheads(x, rev)

            for path in clr.files:
                if path not in md:
                    continue

                fnode = md[path]

                lrev = None
                if readfilelog:
                    fl = _getfilelog(path)
                    lrev = fl.linkrev(fl.rev(fnode))
                    if lrev == rev:
                        # the filelog already points at this revision
                        continue

                if checkancestor:
                    known = set(db.getlinkrevs(path, fnode))
                    if lrev is not None:
                        known.add(lrev)
                    if rev in known or any(isancestor(l) for l in known):
                        continue

                # found a new linkrev!
                if ui.debugflag:
                    ui.debug("%s@%s: new linkrev %s\n" % (path, node.hex(fnode), rev))

                db.appendlinkrev(path, fnode, rev)

            db.setlastrev(rev)
|
|
|
|
|
|
@command("debugverifylinkrevcache", [])
def debugverifylinkrevcache(ui, repo, *pats, **opts):
    """read the linkrevs from the database and verify if they are correct"""
    # restore to the original _adjustlinkrev implementation
    c = context.basefilectx
    extensions.unwrapfunction(c, "_adjustlinkrev", _adjustlinkrev)

    repo = repo.unfiltered()
    idx = repo.unfiltered().changelog.index

    db = repo._linkrevcache
    # load the whole database into memory and invert the id maps so that the
    # ids in the linkrev keys can be translated back to paths and file nodes.
    # items() is used because iteritems() only exists on Python 2.
    paths = dict(db._getdb(db._pathdbname))
    nodes = dict(db._getdb(db._nodedbname))
    pathsrev = {v: k for k, v in paths.items()}
    nodesrev = {v: k for k, v in nodes.items()}
    lrevs = dict(db._getdb(db._linkrevdbname))

    readfilelog = ui.configbool("linkrevcache", "readfilelog", True)

    total = len(lrevs)
    with progress.bar(ui, _("verifying"), total=total) as prog:
        for i, (k, v) in enumerate(lrevs.items()):
            prog.value = i
            pathid, nodeid = k.split("\0")
            path = pathsrev[pathid]
            fnode = nodesrev[nodeid]
            linkrevs = _str2intlist(v)
            linkrevs.sort()

            for linkrev in linkrevs:
                fctx = repo[linkrev][path]
                introrev = fctx.introrev()
                # NOTE(review): the result is discarded here; kept from the
                # original in case the call has a side effect - confirm.
                fctx.linkrev()
                if readfilelog:
                    flinkrev = fctx.linkrev()
                else:
                    flinkrev = None
                if introrev == linkrev:
                    continue
                # an entry is merely unnecessary when the expected revision
                # is an ancestor of it and is already known (from the
                # database or the filelog); otherwise it is wrong.
                if introrev in idx.commonancestorsheads(introrev, linkrev) and (
                    introrev in linkrevs or introrev == flinkrev
                ):
                    adjective = _("unnecessary")
                else:
                    adjective = _("incorrect")
                ui.warn(
                    _("%s linkrev %s for %s @ %s (expected: %s)\n")
                    % (adjective, linkrev, path, node.hex(fnode), introrev)
                )

    ui.write(_("%d entries verified\n") % total)
|
|
|
|
|
|
def _adjustlinkrev(orig, self, *args, **kwds):
    """wrapper of _adjustlinkrev that consults the linkrev database first

    Returns the file's own linkrev for single-headed repos. Otherwise the
    smallest candidate (cached linkrevs plus the file's own linkrev) that is
    an ancestor of srcrev is returned. If there is none, or no cache or
    srcrev is available, the wrapped implementation ``orig`` is called.
    """
    lkr = self.linkrev()
    repo = self._repo

    # for a repo with only a single head, linkrev is accurate
    if getattr(repo, "_singleheaded", False):
        return lkr

    # argv can be "path, flog, fnode, srcrev", or "srcrev" - see e81d72b4b0ae
    # srcrev may also be passed by keyword, in which case the last positional
    # argument (if any) is something else.
    if "srcrev" in kwds:
        srcrev = kwds["srcrev"]
    elif args:
        srcrev = args[-1]
    else:
        srcrev = None
    cache = getattr(repo, "_linkrevcache", None)
    if cache is not None and srcrev is not None:
        index = repo.unfiltered().changelog.index
        try:
            linkrevs = set(cache.getlinkrevs(self._path, self._filenode))
        except Exception:
            # the database may be locked - cannot be used correctly
            linkrevs = set()
        finally:
            # do not keep the database open so others can write to it
            # note: this is bad for perf. but it's here to workaround the gdbm
            # locking pattern: reader and writer cannot co-exist. if we have
            # a dbm engine that locks differently, we don't need this.
            cache.close()
        linkrevs.add(lkr)
        for rev in sorted(linkrevs):  # sorted filters out unnecessary linkrevs
            if rev in index.commonancestorsheads(rev, srcrev):
                return rev

    # fallback to the possibly slow implementation
    return orig(self, *args, **kwds)
|
|
|
|
|
|
def uisetup(ui):
    """wrap basefilectx._adjustlinkrev with the cache-aware version"""
    extensions.wrapfunction(context.basefilectx, "_adjustlinkrev", _adjustlinkrev)
|