sapling/eden/hg-server/edenscm/hgext/linkrevcache.py
Durham Goode 98d9269874 server: copy hg to a new hg-server directory
Summary:
Create a fork of the Mercurial code that we can use to build server
rpms. The hg servers will continue to exist for a few more months while we move
the darkstorm and ediscovery use cases off them. In the meantime, we want to
start making breaking changes to the client, so let's create a stable copy of
the hg code to produce rpms for the hg servers.

The fork is based off c7770c78d, the latest hg release.

This copies the files as is, then adds some minor tweaks to get it to build:
- Disables some lint checks that appear to be bypassed by path
- sed replace eden/scm with eden/hg-server
- Removed a dependency on scm/telemetry from the edenfs-client tests since
  scm/telemetry pulls in the original eden/scm/lib/configparser, which
  conflicts with the hg-server config parser.

allow-large-files

Reviewed By: quark-zju

Differential Revision: D27632557

fbshipit-source-id: b2f442f4ec000ea08e4d62de068750832198e1f4
2021-04-09 10:09:06 -07:00


# Copyright (c) Facebook, Inc. and its affiliates.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2.

# linkrevcache: a simple caching layer to speed up _adjustlinkrev

"""a simple caching layer to speed up _adjustlinkrev

The linkrevcache extension could use a pre-built database to speed up some
_adjustlinkrev operations. The database is stored in the directory
'.hg/cache/linkrevdb'.

To use the extension, you need to prebuild the database using the
`debugbuildlinkrevcache` command, and then keep the extension enabled.

To update the database, run `debugbuildlinkrevcache` again. It will find new
revisions and fill the database incrementally. If the building process is
slow, try setting `checkancestor` to False.

The database won't be updated on demand because of I/O and locking concerns.
That may be addressed if we could have some (partially) "append-only" map-like
data structure.

The linkrev caching database generally speeds up log (when following a file)
and annotate operations.

.. note::

    The database format is not guaranteed to be portable. Copying it from one
    machine to another may make it unreadable.

Config examples::

    [linkrevcache]
    # Whether to test ancestors or not. (default: True)
    # - When set to False, the build process will be faster, while the
    #   database will contain some unnecessary entries (mode-only changes and
    #   merges where the file node is reused).
    # - When set to True, the database won't contain unnecessary entries.
    checkancestor = False

    # Whether to read the filelog or not. (default: True)
    # - When set to False, the build process will be faster, while the
    #   database will probably be much larger.
    # - When set to True, the filelog will be read and existing linkrevs
    #   won't be stored in the database.
    readfilelog = False

    # Upper bound for memory usage of debugbuildlinkrevcache, in pages
    # (default: 2441406)
    # - debugbuildlinkrevcache will try to reduce memory usage to satisfy
    #   the limit
    # - has no effect if readfilelog is False
    # - has no effect on non-Linux platforms
    # - it is a best effort and the program might fail to satisfy the limit
    maxpagesize = 2441406
"""

import os
import shutil
import sys

from edenscm.mercurial import (
    context,
    extensions,
    filelog,
    node,
    progress,
    pycompat,
    registrar,
    util,
)
from edenscm.mercurial.i18n import _
from edenscm.mercurial.pycompat import range


testedwith = "ships-with-fb-hgext"

cmdtable = {}
command = registrar.command(cmdtable)

_chosendbm = None


def _choosedbm():
    """return (name, module)"""
    global _chosendbm
    if not _chosendbm:
        if sys.version_info >= (3, 0):
            candidates = [
                ("gdbm", "dbm.gnu"),
                ("ndbm", "dbm.ndbm"),
                ("dumb", "dbm.dumb"),
            ]
        else:
            candidates = [
                ("gdbm", "gdbm"),
                ("bsd", "dbhash"),
                ("ndbm", "dbm"),
                ("dumb", "dumbdbm"),
            ]
        for name, modname in candidates:
            try:
                mod = __import__(modname)
                mod.open  # sanity check with demandimport enabled
                _chosendbm = (name, __import__(modname))
                break
            except ImportError:
                pass
    return _chosendbm
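
# For illustration only: _choosedbm() returns a (name, module) pair, for
# example ("gdbm", <module>) when a gdbm-backed dbm module is importable.
# The database classes below only rely on module.open(path, flag).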

# dbm is a bytes -> bytes map, so we need to convert integers to bytes.
# The conversion functions are optimized for space usage.
# We do not use struct.(un)pack because we may have values longer than 4 bytes
# (revlog defines the revision number to be 6 bytes) and 8 bytes per value
# would be wasteful.
def _strinc(s):
    """return the "next" string. useful as an incremental "ID"."""
    if not s:
        # avoid '\0' so '\0' could be used as a separator
        return "\x01"
    n = ord(s[-1])
    if n == 255:
        return _strinc(s[:-1]) + "\x01"
    else:
        return s[:-1] + chr(n + 1)


def _str2int(s):
    # this is faster than "bytearray().extend(map(ord, s))"
    x = 0
    for ch in s:
        x <<= 8
        x += ord(ch)
    return x


def _int2str(x):
    s = ""
    while x:
        s = chr(x & 255) + s
        x >>= 8
    return s


def _intlist2str(intlist):
    result = ""
    for n in intlist:
        s = _int2str(n)
        l = len(s)
        # do not accept huge integers
        assert l < 256
        result += chr(l) + s
    return result


def _str2intlist(s):
    result = []
    i = 0
    end = len(s)
    while i < end:
        l = ord(s[i])
        i += 1
        result.append(_str2int(s[i : i + l]))
        i += l
    return result
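
# Round-trip sketch for the codec helpers above (values worked out by hand,
# shown here only as documentation):
#
#   _int2str(258) == "\x01\x02"          _str2int("\x01\x02") == 258
#   _intlist2str([1, 256]) == "\x01\x01\x02\x01\x00"
#   _str2intlist("\x01\x01\x02\x01\x00") == [1, 256]
#   _strinc("") == "\x01"    _strinc("a") == "b"    _strinc("\xff") == "\x01\x01"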

class linkrevdbreadonly(object):
    _openflag = "r"

    # numbers are useful in the atomic replace case: they can be sorted
    # and replaced in a safer order. however, atomic caller should always
    # use repo lock so the order only protects things when the repo lock
    # does not work.
    _metadbname = "0meta"
    _pathdbname = "1path"
    _nodedbname = "2node"
    _linkrevdbname = "3linkrev"

    def __init__(self, dirname):
        dbmname, self._dbm = _choosedbm()
        # use different file names for different dbm engines, to make the repo
        # rsync-friendly across different platforms.
        self._path = os.path.join(dirname, dbmname)
        self._dbs = {}

    def getlinkrevs(self, path, fnode):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        try:
            pathid = pathdb[path]
            nodeid = nodedb[fnode]
            v = lrevdb[pathid + "\0" + nodeid]
            return _str2intlist(v)
        except KeyError:
            return []

    def getlastrev(self):
        return _str2int(self._getmeta("lastrev"))

    def close(self):
        # the check is necessary if __init__ fails - the caller may call
        # "close" in a "finally" block and it probably does not want close() to
        # raise an exception there.
        if util.safehasattr(self, "_dbs"):
            for db in pycompat.itervalues(self._dbs):
                db.close()
            self._dbs.clear()

    def _getmeta(self, name):
        try:
            return self._getdb(self._metadbname)[name]
        except KeyError:
            return ""

    def _getdb(self, name):
        if name not in self._dbs:
            self._dbs[name] = self._dbm.open(self._path + name, self._openflag)
        return self._dbs[name]
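
# On-disk layout implied by the class above (the concrete file name is an
# example and depends on the chosen dbm engine): each table is a separate dbm
# file named "<engine><table>", e.g. ".hg/cache/linkrevdb/gdbm1path".
# "1path" maps path -> compact path id, "2node" maps file node -> compact node
# id, and "3linkrev" maps "pathid\0nodeid" -> an encoded list of linkrevs.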

class linkrevdbreadwrite(linkrevdbreadonly):
    _openflag = "c"

    def __init__(self, dirname):
        util.makedirs(dirname)
        super(linkrevdbreadwrite, self).__init__(dirname)

    def appendlinkrev(self, path, fnode, linkrev):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        metadb = self._getdb(self._metadbname)
        try:
            pathid = pathdb[path]
        except KeyError:
            pathid = _strinc(self._getmeta("pathid"))
            pathdb[path] = pathid
            metadb["pathid"] = pathid
        try:
            nodeid = nodedb[fnode]
        except KeyError:
            nodeid = _strinc(self._getmeta("nodeid"))
            nodedb[fnode] = nodeid
            metadb["nodeid"] = nodeid
        k = pathid + "\0" + nodeid
        try:
            v = _str2intlist(lrevdb[k])
        except KeyError:
            v = []
        if linkrev in v:
            return
        v.append(linkrev)
        lrevdb[k] = _intlist2str(v)

    def setlastrev(self, rev):
        self._getdb(self._metadbname)["lastrev"] = _int2str(rev)


class linkrevdbwritewithtemprename(linkrevdbreadwrite):
    # Some dbms (ex. gdbm) disallow a writer and a reader to co-exist. This
    # class is basically a workaround so that a writer can still write to a
    # (copied) database while there is a reader.
    #
    # Unlike "atomictemp", this applies to a directory. A directory cannot
    # work like "atomictemp" unless a symlink is used. Symlinks are not
    # portable, so we don't use them. Therefore this is not atomic (while
    # probably good enough because we write files in a reasonable order - in
    # the worst case, we just drop those cache files).
    #
    # Ideally, we would have another dbm that allows a reader and a writer to
    # co-exist, and this workaround would become unnecessary.

    def __init__(self, dirname):
        self._origpath = dirname
        head, tail = os.path.split(dirname)
        tempdir = "%s-%s" % (dirname, os.getpid())
        self._tempdir = tempdir
        try:
            shutil.copytree(dirname, tempdir)
            super(linkrevdbwritewithtemprename, self).__init__(tempdir)
        except Exception:
            shutil.rmtree(tempdir)
            raise

    def close(self):
        super(linkrevdbwritewithtemprename, self).close()
        if util.safehasattr(self, "_tempdir"):
            for name in sorted(os.listdir(self._tempdir)):
                oldpath = os.path.join(self._tempdir, name)
                newpath = os.path.join(self._origpath, name)
                os.rename(oldpath, newpath)
            os.rmdir(self._tempdir)


def linkrevdb(dirname, write=False, copyonwrite=False):
    # As commented in "linkrevdbwritewithtemprename" above, these flags
    # (write, copyonwrite) are mainly designed to work around gdbm's locking
    # issues. If we had a dbm that used a less aggressive lock, we could get
    # rid of these workarounds.
    if not write:
        return linkrevdbreadonly(dirname)
    else:
        if copyonwrite:
            return linkrevdbwritewithtemprename(dirname)
        else:
            return linkrevdbreadwrite(dirname)
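
# Minimal write/read sketch for the factory above (hypothetical path, file
# node and revision; the real callers are debugbuildlinkrevcache and
# _adjustlinkrev below):
#
#   db = linkrevdb("/repo/.hg/cache/linkrevdb", write=True)
#   try:
#       db.appendlinkrev("foo/bar.c", fnode, 12345)
#       db.setlastrev(12345)
#   finally:
#       db.close()
#   linkrevdb("/repo/.hg/cache/linkrevdb").getlinkrevs("foo/bar.c", fnode)
#   # -> [12345]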

_linkrevdbpath = "cache/linkrevdb"


def reposetup(ui, repo):
    if repo.local():
        dbpath = repo.localvfs.join(_linkrevdbpath)
        setattr(repo, "_linkrevcache", linkrevdb(dbpath, write=False))


@command(
    "debugbuildlinkrevcache",
    [
        ("e", "end", "", _("end revision")),
        (
            "",
            "copy",
            False,
            _("copy the database files to modify them lock-free (EXPERIMENTAL)"),
        ),
    ],
)
def debugbuildlinkrevcache(ui, repo, *pats, **opts):
    """build the linkrev database from filelogs"""
    db = linkrevdb(
        repo.localvfs.join(_linkrevdbpath),
        write=True,
        # the --copy flag selects the copy-on-write (temp rename) database
        copyonwrite=opts.get("copy"),
    )
    end = int(opts.get("end") or (len(repo) - 1))
    try:
        _buildlinkrevcache(ui, repo, db, end)
    finally:
        db.close()

def _getrsspagecount():
    """Get RSS memory usage in pages. Only works on Linux"""
    try:
        # The second column is VmRSS (in pages). See "man procfs".
        with open("/proc/self/statm") as f:
            return int(f.read().split()[1])
    except Exception:
        return 0
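
# Worked example for the memory bound used below: with the common 4 KiB page
# size, the default maxpagesize of 2441406 pages corresponds to
# 2441406 * 4096 bytes, i.e. roughly 10 GB of resident memory, before the
# filelog cache is dropped.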

def _buildlinkrevcache(ui, repo, db, end):
    checkancestor = ui.configbool("linkrevcache", "checkancestor", True)
    readfilelog = ui.configbool("linkrevcache", "readfilelog", True)
    # 2441406: 10G by default (assuming page size = 4K).
    maxpagesize = ui.configint("linkrevcache", "maxpagesize") or 2441406

    cl = repo.changelog
    idx = cl.index
    ml = repo.manifestlog

    filelogcache = {}

    def _getfilelog(path):
        if path not in filelogcache:
            # Make memory usage bounded
            if len(filelogcache) % 1000 == 0:
                if _getrsspagecount() > maxpagesize:
                    filelogcache.clear()
            filelogcache[path] = filelog.filelog(repo.svfs, path)
        return filelogcache[path]

    start = db.getlastrev() + 1

    # the number of ancestor tests at which the slow (Python) stateful (cached
    # ancestors) algorithm is faster than the fast (C) stateless (walk through
    # the changelog index every time) algorithm.
    ancestorcountthreshold = 10

    with progress.bar(ui, _("building"), _("changesets"), end) as prog:
        for rev in range(start, end + 1):
            prog.value = rev
            clr = cl.changelogrevision(rev)
            md = ml[clr.manifest].read()

            if checkancestor:
                if len(clr.files) >= ancestorcountthreshold:
                    # we may need to frequently test ancestors against rev;
                    # in this case, pre-calculating rev's ancestors helps.
                    ancestors = cl.ancestors([rev])

                    def isancestor(x):
                        return x in ancestors

                else:
                    # the C index ancestor testing is faster than Python's
                    # lazyancestors.
                    def isancestor(x):
                        return x in idx.commonancestorsheads(x, rev)

            for path in clr.files:
                if path not in md:
                    continue
                fnode = md[path]

                if readfilelog:
                    fl = _getfilelog(path)
                    frev = fl.rev(fnode)
                    lrev = fl.linkrev(frev)
                    if lrev == rev:
                        continue
                else:
                    lrev = None

                if checkancestor:
                    linkrevs = set(db.getlinkrevs(path, fnode))
                    if lrev is not None:
                        linkrevs.add(lrev)
                    if rev in linkrevs:
                        continue
                    if any(isancestor(l) for l in linkrevs):
                        continue

                # found a new linkrev!
                if ui.debugflag:
                    ui.debug("%s@%s: new linkrev %s\n" % (path, node.hex(fnode), rev))
                db.appendlinkrev(path, fnode, rev)

            db.setlastrev(rev)

@command("debugverifylinkrevcache", [])
def debugverifylinkrevcache(ui, repo, *pats, **opts):
    """read the linkrevs from the database and verify if they are correct"""
    # restore the original _adjustlinkrev implementation
    c = context.basefilectx
    extensions.unwrapfunction(c, "_adjustlinkrev", _adjustlinkrev)

    idx = repo.changelog.index

    db = repo._linkrevcache
    paths = dict(db._getdb(db._pathdbname))  # {path: path id}
    nodes = dict(db._getdb(db._nodedbname))  # {file node: node id}
    pathsrev = dict((v, pycompat.decodeutf8(k)) for k, v in pycompat.iteritems(paths))
    nodesrev = dict((v, k) for k, v in pycompat.iteritems(nodes))
    lrevs = dict(db._getdb(db._linkrevdbname))

    readfilelog = ui.configbool("linkrevcache", "readfilelog", True)

    total = len(lrevs)
    with progress.bar(ui, _("verifying"), total=total) as prog:
        for i, (k, v) in enumerate(pycompat.iteritems(lrevs)):
            prog.value = i
            pathid, nodeid = k.split(b"\0")
            path = pathsrev[pathid]
            fnode = nodesrev[nodeid]
            linkrevs = _str2intlist(pycompat.decodeutf8(v))
            linkrevs.sort()

            for linkrev in linkrevs:
                fctx = repo[linkrev][path]
                introrev = fctx.introrev()
                if readfilelog:
                    flinkrev = fctx.linkrev()
                else:
                    flinkrev = None
                if introrev == linkrev:
                    continue
                if introrev in idx.commonancestorsheads(introrev, linkrev) and (
                    introrev in linkrevs or introrev == flinkrev
                ):
                    adjective = _("unnecessary")
                else:
                    adjective = _("incorrect")
                ui.warn(
                    _("%s linkrev %s for %s @ %s (expected: %s)\n")
                    % (adjective, linkrev, path, node.hex(fnode), introrev)
                )

    ui.write(_("%d entries verified\n") % total)

def _adjustlinkrev(orig, self, *args, **kwds):
    """consult the linkrev database before falling back to the slow path"""
    lkr = self.linkrev()
    repo = self._repo
    # args can be "path, flog, fnode, srcrev", or "srcrev" - see e81d72b4b0ae
    srcrev = args[-1]

    cache = getattr(self._repo, "_linkrevcache", None)
    if cache is not None and srcrev is not None:
        index = repo.changelog.index
        try:
            linkrevs = set(cache.getlinkrevs(self._path, self._filenode))
        except Exception:
            # the database may be locked - it cannot be used correctly
            linkrevs = set()
        finally:
            # do not keep the database open so others can write to it.
            # note: this is bad for perf, but it is here to work around the
            # gdbm locking pattern: a reader and a writer cannot co-exist. if
            # we had a dbm engine that locked differently, we would not need
            # this.
            cache.close()
        linkrevs.add(lkr)
        for rev in sorted(linkrevs):  # sorted filters out unnecessary linkrevs
            if rev in index.commonancestorsheads(rev, srcrev):
                return rev

    # fall back to the possibly slow implementation
    return orig(self, *args, **kwds)
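
# Behavior sketch for the wrapper above (hypothetical numbers): if the cache
# holds linkrevs {5, 9} for a file node, the filelog linkrev is 9, and srcrev
# is 12 with 5 an ancestor of 12, the wrapper returns 5 without walking the
# changelog; otherwise it falls through to the original _adjustlinkrev.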

def uisetup(ui):
    c = context.basefilectx
    extensions.wrapfunction(c, "_adjustlinkrev", _adjustlinkrev)