sapling/edenscm/hgext/linkrevcache.py
Durham Goode 6210607c36 manifest: remove readfast, replace readdelta
Summary:
Previously readfast was an optimization that sometimes returned a delta
against p1, but other times returned the full manifest. This was weird and
caused certain algorithms to sometimes be O(changes) but other times O(repo).
In a tree world we no longer need this optimization, so let's drop it.

readdelta is similar in that it would read the difference between a manifest and
it's delta base. This has no relationship to the delta between a manifest and
it's parent, so it's weird and we should get rid of it.

There is a legitimate use case for wanting to know what entries are new in a
manifest, like when deciding what to send over the wire. So let's add a new
readnew() function that is explicitly for reading what entries were introduced
by this tree. The implementation is basically the same as readdelta, but
deltaing against p1 instead of the deltabase.

Reviewed By: markbt

Differential Revision: D15344434

fbshipit-source-id: dc8dca326f66b2fc55cc76f93c7ce48aa7efedf3
2019-05-24 10:00:49 -07:00

536 lines
17 KiB
Python

# Copyright 2016-present Facebook. All Rights Reserved.
#
# linkrevcache: a simple caching layer to speed up _adjustlinkrev
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
"""a simple caching layer to speed up _adjustlinkrev
The linkrevcache extension could use a pre-built database to speed up some
_adjustlinkrev operations. The database is stored in the directory
'.hg/cache/linkrevdb'.
To use the extension, you need to prebuild the database using the
`debugbuildlinkrevcache` command, and then keep the extension enabled.
To update the database, run `debugbuildlinkrevcache` again. It would find new
revisions and fill the database incrementally.
If the building process is slow, try setting `checkancestor` to False.
The database won't be updated on demand for I/O and locking concerns. It may be
addressed if we could have some (partially) "append-only" map-like data
structure.
The linkrev caching database would generally speed up the log (following a
file) and annotate operations.
.. note::
The database format is not guaranteed portable. Copying it from a machine
to another may make it unreadable.
Config examples::
[linkrevcache]
# Whether to test ancestors or not. (default: True)
# - When set to False, the build process will be faster, while the database
# will contain some unnecessary entries (mode-only changes and merges
# where the file node is reused).
# - When set to True, the database won't contain unnecessary entries.
checkancestor = False
# Whether to read filelog or not. (default: True)
# - When set to False, the build process will be faster, while the database
# will be probably much larger.
# - When set to True, filelog will be read and existing linkrevs won't be
# stored in the database.
readfilelog = False
# Upper bound fo memory usage for debugbuildlinkrevcache (default: 2441406)
# - debugbuildlinkrevcache will try to reduce memory to sastify the limit
# - has no effect if readfilelog is False
# - has no effect for non-Linux platforms
# - it is a best effort and the program might fail to sastify the limit
maxpagesize = 2441406
"""
import os
import shutil
import sys
from edenscm.mercurial import (
context,
extensions,
filelog,
node,
progress,
registrar,
util,
)
from edenscm.mercurial.i18n import _
testedwith = "ships-with-fb-hgext"
cmdtable = {}
command = registrar.command(cmdtable)
_chosendbm = None
try:
xrange(0)
except NameError:
xrange = range
def _choosedbm():
"""return (name, module)"""
global _chosendbm
if not _chosendbm:
if sys.version_info >= (3, 0):
candidates = [
("gdbm", "dbm.gnu"),
("ndbm", "dbm.ndbm"),
("dumb", "dbm.dumb"),
]
else:
candidates = [
("gdbm", "gdbm"),
("bsd", "dbhash"),
("ndbm", "dbm"),
("dumb", "dumbdbm"),
]
for name, modname in candidates:
try:
mod = __import__(modname)
mod.open # sanity check with demandimport enabled
_chosendbm = (name, __import__(modname))
break
except ImportError:
pass
return _chosendbm
# dbm is a bytes -> bytes map, so we need to convert integers to bytes.
# the conversion functions are optimized for space usage.
# not using struct.(un)pack is because we may have things > 4 bytes (revlog
# defines the revision number to be 6 bytes) and 8-byte is wasteful.
def _strinc(s):
"""return the "next" string. useful as an incremental "ID"."""
if not s:
# avoid '\0' so '\0' could be used as a separator
return "\x01"
n = ord(s[-1])
if n == 255:
return _strinc(s[:-1]) + "\x01"
else:
return s[:-1] + chr(n + 1)
def _str2int(s):
# this is faster than "bytearray().extend(map(ord, s))"
x = 0
for ch in s:
x <<= 8
x += ord(ch)
return x
def _int2str(x):
s = ""
while x:
s = chr(x & 255) + s
x >>= 8
return s
def _intlist2str(intlist):
result = ""
for n in intlist:
s = _int2str(n)
l = len(s)
# do not accept huge integers
assert l < 256
result += chr(l) + s
return result
def _str2intlist(s):
result = []
i = 0
end = len(s)
while i < end:
l = ord(s[i])
i += 1
result.append(_str2int(s[i : i + l]))
i += l
return result
class linkrevdbreadonly(object):
_openflag = "r"
# numbers are useful in the atomic replace case: they can be sorted
# and replaced in a safer order. however, atomic caller should always
# use repo lock so the order only protects things when the repo lock
# does not work.
_metadbname = "0meta"
_pathdbname = "1path"
_nodedbname = "2node"
_linkrevdbname = "3linkrev"
def __init__(self, dirname):
dbmname, self._dbm = _choosedbm()
# use different file names for different dbm engine, to make the repo
# rsync-friendly across different platforms.
self._path = os.path.join(dirname, dbmname)
self._dbs = {}
def getlinkrevs(self, path, fnode):
pathdb = self._getdb(self._pathdbname)
nodedb = self._getdb(self._nodedbname)
lrevdb = self._getdb(self._linkrevdbname)
try:
pathid = pathdb[path]
nodeid = nodedb[fnode]
v = lrevdb[pathid + "\0" + nodeid]
return _str2intlist(v)
except KeyError:
return []
def getlastrev(self):
return _str2int(self._getmeta("lastrev"))
def close(self):
# the check is necessary if __init__ fails - the caller may call
# "close" in a "finally" block and it probably does not want close() to
# raise an exception there.
if util.safehasattr(self, "_dbs"):
for db in self._dbs.itervalues():
db.close()
self._dbs.clear()
def _getmeta(self, name):
try:
return self._getdb(self._metadbname)[name]
except KeyError:
return ""
def _getdb(self, name):
if name not in self._dbs:
self._dbs[name] = self._dbm.open(self._path + name, self._openflag)
return self._dbs[name]
class linkrevdbreadwrite(linkrevdbreadonly):
_openflag = "c"
def __init__(self, dirname):
util.makedirs(dirname)
super(linkrevdbreadwrite, self).__init__(dirname)
def appendlinkrev(self, path, fnode, linkrev):
pathdb = self._getdb(self._pathdbname)
nodedb = self._getdb(self._nodedbname)
lrevdb = self._getdb(self._linkrevdbname)
metadb = self._getdb(self._metadbname)
try:
pathid = pathdb[path]
except KeyError:
pathid = _strinc(self._getmeta("pathid"))
pathdb[path] = pathid
metadb["pathid"] = pathid
try:
nodeid = nodedb[fnode]
except KeyError:
nodeid = _strinc(self._getmeta("nodeid"))
nodedb[fnode] = nodeid
metadb["nodeid"] = nodeid
k = pathid + "\0" + nodeid
try:
v = _str2intlist(lrevdb[k])
except KeyError:
v = []
if linkrev in v:
return
v.append(linkrev)
lrevdb[k] = _intlist2str(v)
def setlastrev(self, rev):
self._getdb(self._metadbname)["lastrev"] = _int2str(rev)
class linkrevdbwritewithtemprename(linkrevdbreadwrite):
# Some dbm (ex. gdbm) disallows writer and reader to co-exist. This is
# basically to workaround that so a writer can still write to the (copied)
# database when there is a reader.
# Unlike "atomictemp", this applies to a directory. A directory cannot
# work like "atomictemp" unless symlink is used. Symlink is not portable so
# we don't use them. Therefore this is not atomic (while probably good
# enough because we write files in a reasonable order - in the worst case,
# we just drop those cache files).
# Ideally, we can have other dbms which support reader and writer to
# co-exist, and this will become unnecessary.
def __init__(self, dirname):
self._origpath = dirname
head, tail = os.path.split(dirname)
tempdir = "%s-%s" % (dirname, os.getpid())
self._tempdir = tempdir
try:
shutil.copytree(dirname, tempdir)
super(linkrevdbwritewithtemprename, self).__init__(tempdir)
except Exception:
shutil.rmtree(tempdir)
raise
def close(self):
super(linkrevdbwritewithtemprename, self).close()
if util.safehasattr(self, "_tempdir"):
for name in sorted(os.listdir(self._tempdir)):
oldpath = os.path.join(self._tempdir, name)
newpath = os.path.join(self._origpath, name)
os.rename(oldpath, newpath)
os.rmdir(self._tempdir)
def linkrevdb(dirname, write=False, copyonwrite=False):
# As commented in the "linkrevdbwritewithtemprename" above, these flags
# (write, copyonwrite) are mainly designed to workaround gdbm's locking
# issues. If we have a dbm that uses a less aggressive lock, we could get
# rid of these workarounds.
if not write:
return linkrevdbreadonly(dirname)
else:
if copyonwrite:
return linkrevdbwritewithtemprename(dirname)
else:
return linkrevdbreadwrite(dirname)
_linkrevdbpath = "cache/linkrevdb"
def reposetup(ui, repo):
if repo.local():
# if the repo is single headed, adjustlinkrev can just return linkrev
repo._singleheaded = len(repo.unfiltered().changelog.headrevs()) == 1
dbpath = repo.localvfs.join(_linkrevdbpath)
setattr(repo, "_linkrevcache", linkrevdb(dbpath, write=False))
@command(
"debugbuildlinkrevcache",
[
("e", "end", "", _("end revision")),
(
"",
"copy",
False,
_("copy the database files to modify them " "lock-free (EXPERIMENTAL)"),
),
],
)
def debugbuildlinkrevcache(ui, repo, *pats, **opts):
"""build the linkrev database from filelogs"""
db = linkrevdb(
repo.localvfs.join(_linkrevdbpath),
write=True,
copyonwrite=opts.get("atomic_temp"),
)
end = int(opts.get("end") or (len(repo) - 1))
try:
_buildlinkrevcache(ui, repo, db, end)
finally:
db.close()
def _getrsspagecount():
"""Get RSS memory usage in pages. Only works on Linux"""
try:
# The second column is VmRSS. See "man procfs".
return sum(map(int, open("/proc/self/statm").read().split()[1]))
except Exception:
return 0
def _buildlinkrevcache(ui, repo, db, end):
checkancestor = ui.configbool("linkrevcache", "checkancestor", True)
readfilelog = ui.configbool("linkrevcache", "readfilelog", True)
# 2441406: 10G by default (assuming page size = 4K).
maxpagesize = ui.configint("linkrevcache", "maxpagesize") or 2441406
repo = repo.unfiltered()
cl = repo.changelog
idx = cl.index
ml = repo.manifestlog
filelogcache = {}
def _getfilelog(path):
if path not in filelogcache:
# Make memory usage bounded
if len(filelogcache) % 1000 == 0:
if _getrsspagecount() > maxpagesize:
filelogcache.clear()
filelogcache[path] = filelog.filelog(repo.svfs, path)
return filelogcache[path]
start = db.getlastrev() + 1
# the number of ancestor tests when the slow (Python) stateful (cache
# ancestors) algorithm is faster than the fast (C) stateless (walk through
# the changelog index every time) algorithm.
ancestorcountthreshold = 10
with progress.bar(ui, _("building"), _("changesets"), end) as prog:
for rev in xrange(start, end + 1):
prog.value = rev
clr = cl.changelogrevision(rev)
md = ml[clr.manifest].read()
if checkancestor:
if len(clr.files) >= ancestorcountthreshold:
# we may need to frequently test ancestors against rev,
# in this case, pre-calculating rev's ancestors helps.
ancestors = cl.ancestors([rev])
def isancestor(x):
return x in ancestors
else:
# the C index ancestor testing is faster than Python's
# lazyancestors.
def isancestor(x):
return x in idx.commonancestorsheads(x, rev)
for path in clr.files:
if path not in md:
continue
fnode = md[path]
if readfilelog:
fl = _getfilelog(path)
frev = fl.rev(fnode)
lrev = fl.linkrev(frev)
if lrev == rev:
continue
else:
lrev = None
if checkancestor:
linkrevs = set(db.getlinkrevs(path, fnode))
if lrev is not None:
linkrevs.add(lrev)
if rev in linkrevs:
continue
if any(isancestor(l) for l in linkrevs):
continue
# found a new linkrev!
if ui.debugflag:
ui.debug("%s@%s: new linkrev %s\n" % (path, node.hex(fnode), rev))
db.appendlinkrev(path, fnode, rev)
db.setlastrev(rev)
@command("debugverifylinkrevcache", [])
def debugverifylinkrevcache(ui, repo, *pats, **opts):
"""read the linkrevs from the database and verify if they are correct"""
# restore to the original _adjustlinkrev implementation
c = context.basefilectx
extensions.unwrapfunction(c, "_adjustlinkrev", _adjustlinkrev)
paths = {} # {id: name}
nodes = {} # {id: name}
repo = repo.unfiltered()
idx = repo.unfiltered().changelog.index
db = repo._linkrevcache
paths = dict(db._getdb(db._pathdbname))
nodes = dict(db._getdb(db._nodedbname))
pathsrev = dict((v, k) for k, v in paths.iteritems())
nodesrev = dict((v, k) for k, v in nodes.iteritems())
lrevs = dict(db._getdb(db._linkrevdbname))
readfilelog = ui.configbool("linkrevcache", "readfilelog", True)
total = len(lrevs)
with progress.bar(ui, _("verifying"), total=total) as prog:
for i, (k, v) in enumerate(lrevs.iteritems()):
prog.value = i
pathid, nodeid = k.split("\0")
path = pathsrev[pathid]
fnode = nodesrev[nodeid]
linkrevs = _str2intlist(v)
linkrevs.sort()
for linkrev in linkrevs:
fctx = repo[linkrev][path]
introrev = fctx.introrev()
fctx.linkrev()
if readfilelog:
flinkrev = fctx.linkrev()
else:
flinkrev = None
if introrev == linkrev:
continue
if introrev in idx.commonancestorsheads(introrev, linkrev) and (
introrev in linkrevs or introrev == flinkrev
):
adjective = _("unnecessary")
else:
adjective = _("incorrect")
ui.warn(
_("%s linkrev %s for %s @ %s (expected: %s)\n")
% (adjective, linkrev, path, node.hex(fnode), introrev)
)
ui.write(_("%d entries verified\n") % total)
def _adjustlinkrev(orig, self, *args, **kwds):
lkr = self.linkrev()
repo = self._repo
# for a repo with only a single head, linkrev is accurate
if getattr(repo, "_singleheaded", False):
return lkr
# argv can be "path, flog, fnode, srcrev", or "srcrev" - see e81d72b4b0ae
srcrev = args[-1]
cache = getattr(self._repo, "_linkrevcache", None)
if cache is not None and srcrev is not None:
index = repo.unfiltered().changelog.index
try:
linkrevs = set(cache.getlinkrevs(self._path, self._filenode))
except Exception:
# the database may be locked - cannot be used correctly
linkrevs = set()
finally:
# do not keep the database open so others can write to it
# note: this is bad for perf. but it's here to workaround the gdbm
# locking pattern: reader and writer cannot co-exist. if we have
# a dbm engine that locks differently, we don't need this.
cache.close()
linkrevs.add(lkr)
for rev in sorted(linkrevs): # sorted filters out unnecessary linkrevs
if rev in index.commonancestorsheads(rev, srcrev):
return rev
# fallback to the possibly slow implementation
return orig(self, *args, **kwds)
def uisetup(ui):
c = context.basefilectx
extensions.wrapfunction(c, "_adjustlinkrev", _adjustlinkrev)