# Copyright 2016-present Facebook. All Rights Reserved.
#
# linkrevcache: a simple caching layer to speed up _adjustlinkrev
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

"""a simple caching layer to speed up _adjustlinkrev

The linkrevcache extension uses a pre-built database to speed up some
_adjustlinkrev operations. The database is stored in the directory
'.hg/cache/linkrevdb'.

To use the extension, you need to prebuild the database using the
`debugbuildlinkrevcache` command, and then keep the extension enabled.

To update the database, run `debugbuildlinkrevcache` again. It will find new
revisions and fill the database incrementally. If the building process is
slow, try setting `checkancestor` to False.

The database is not updated automatically on new commits, due to I/O and
locking concerns. This may be addressed in the future if a (partially)
"append-only" map-like data structure becomes available.

The linkrev caching database generally speeds up `log` (when following a
file) and `annotate` operations.

.. note::

    The database format is not guaranteed portable. Copying it from one
    machine to another may make it unreadable.

Config examples::

    [linkrevcache]
    # Whether to test ancestors or not. (default: True)
    # - When set to False, the build process will be faster, while the
    #   database will contain some unnecessary entries (mode-only changes
    #   and merges where the file node is reused).
    # - When set to True, the database won't contain unnecessary entries.
    checkancestor = False

    # Whether to read filelog or not. (default: True)
    # - When set to False, the build process will be faster, while the
    #   database will probably be much larger.
    # - When set to True, filelog will be read and existing linkrevs won't
    #   be stored in the database.
    readfilelog = False
"""

import os
import shutil
import sys

from edenscm.mercurial import (
    context,
    extensions,
    filelog,
    node,
    progress,
    registrar,
    util,
)
from edenscm.mercurial.i18n import _


testedwith = "ships-with-fb-hgext"

cmdtable = {}
command = registrar.command(cmdtable)

_chosendbm = None

try:
    xrange(0)
except NameError:
    xrange = range


def _choosedbm():
    """return (name, module)"""
    global _chosendbm
    if not _chosendbm:
        if sys.version_info >= (3, 0):
            candidates = [
                ("gdbm", "dbm.gnu"),
                ("ndbm", "dbm.ndbm"),
                ("dumb", "dbm.dumb"),
            ]
        else:
            candidates = [
                ("gdbm", "gdbm"),
                ("bsd", "dbhash"),
                ("ndbm", "dbm"),
                ("dumb", "dumbdbm"),
            ]
        for name, modname in candidates:
            try:
                mod = __import__(modname)
                mod.open  # sanity check with demandimport enabled
                _chosendbm = (name, __import__(modname))
                break
            except ImportError:
                pass
    return _chosendbm


# dbm is a bytes -> bytes map, so we need to convert integers to bytes.
# the conversion functions are optimized for space usage.
# not using struct.(un)pack is because we may have things > 4 bytes (revlog
# defines the revision number to be 6 bytes) and 8-byte is wasteful.
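# For illustration only (values follow from the helpers below; nothing here
# is executed at runtime): integers are encoded big-endian with no leading
# zero bytes, and integer lists prefix each encoded integer with its byte
# length:
#
#   _int2str(300)                        == "\x01\x2c"
#   _intlist2str([1, 300])               == "\x01\x01\x02\x01\x2c"
#   _str2intlist(_intlist2str([1, 300])) == [1, 300]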
def _strinc(s):
    """return the "next" string. useful as an incremental "ID"."""
    if not s:
        # avoid '\0' so '\0' could be used as a separator
        return "\x01"
    n = ord(s[-1])
    if n == 255:
        return _strinc(s[:-1]) + "\x01"
    else:
        return s[:-1] + chr(n + 1)


def _str2int(s):
    # this is faster than "bytearray().extend(map(ord, s))"
    x = 0
    for ch in s:
        x <<= 8
        x += ord(ch)
    return x


def _int2str(x):
    s = ""
    while x:
        s = chr(x & 255) + s
        x >>= 8
    return s


def _intlist2str(intlist):
    result = ""
    for n in intlist:
        s = _int2str(n)
        l = len(s)
        # do not accept huge integers
        assert l < 256
        result += chr(l) + s
    return result


def _str2intlist(s):
    result = []
    i = 0
    end = len(s)
    while i < end:
        l = ord(s[i])
        i += 1
        result.append(_str2int(s[i : i + l]))
        i += l
    return result


class linkrevdbreadonly(object):
    _openflag = "r"

    # numbers are useful in the atomic replace case: they can be sorted
    # and replaced in a safer order. however, atomic caller should always
    # use repo lock so the order only protects things when the repo lock
    # does not work.
    _metadbname = "0meta"
    _pathdbname = "1path"
    _nodedbname = "2node"
    _linkrevdbname = "3linkrev"

    def __init__(self, dirname):
        dbmname, self._dbm = _choosedbm()
        # use different file names for different dbm engine, to make the repo
        # rsync-friendly across different platforms.
        self._path = os.path.join(dirname, dbmname)
        self._dbs = {}

    def getlinkrevs(self, path, fnode):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        try:
            pathid = pathdb[path]
            nodeid = nodedb[fnode]
            v = lrevdb[pathid + "\0" + nodeid]
            return _str2intlist(v)
        except KeyError:
            return []

    def getlastrev(self):
        return _str2int(self._getmeta("lastrev"))

    def close(self):
        # the check is necessary if __init__ fails - the caller may call
        # "close" in a "finally" block and it probably does not want close()
        # to raise an exception there.
        if util.safehasattr(self, "_dbs"):
            for db in self._dbs.itervalues():
                db.close()
            self._dbs.clear()

    def _getmeta(self, name):
        try:
            return self._getdb(self._metadbname)[name]
        except KeyError:
            return ""

    def _getdb(self, name):
        if name not in self._dbs:
            self._dbs[name] = self._dbm.open(self._path + name, self._openflag)
        return self._dbs[name]


class linkrevdbreadwrite(linkrevdbreadonly):
    _openflag = "c"

    def __init__(self, dirname):
        util.makedirs(dirname)
        super(linkrevdbreadwrite, self).__init__(dirname)

    def appendlinkrev(self, path, fnode, linkrev):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        metadb = self._getdb(self._metadbname)

        try:
            pathid = pathdb[path]
        except KeyError:
            pathid = _strinc(self._getmeta("pathid"))
            pathdb[path] = pathid
            metadb["pathid"] = pathid
        try:
            nodeid = nodedb[fnode]
        except KeyError:
            nodeid = _strinc(self._getmeta("nodeid"))
            nodedb[fnode] = nodeid
            metadb["nodeid"] = nodeid

        k = pathid + "\0" + nodeid
        try:
            v = _str2intlist(lrevdb[k])
        except KeyError:
            v = []
        if linkrev in v:
            return
        v.append(linkrev)
        lrevdb[k] = _intlist2str(v)

    def setlastrev(self, rev):
        self._getdb(self._metadbname)["lastrev"] = _int2str(rev)
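# A minimal usage sketch of the classes above (hypothetical "fnode", "rev"
# and file path; the extension itself only opens these databases through the
# linkrevdb() factory defined further below, not directly):
#
#     db = linkrevdbreadwrite(repo.localvfs.join("cache/linkrevdb"))
#     try:
#         db.appendlinkrev("path/to/file", fnode, rev)
#         db.setlastrev(rev)
#     finally:
#         db.close()
#
#     db = linkrevdbreadonly(repo.localvfs.join("cache/linkrevdb"))
#     linkrevs = db.getlinkrevs("path/to/file", fnode)  # -> [rev, ...]
#     db.close()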
class linkrevdbwritewithtemprename(linkrevdbreadwrite):
    # Some dbm (ex. gdbm) disallows writer and reader to co-exist. This is
    # basically to workaround that so a writer can still write to the
    # (copied) database when there is a reader.
    #
    # Unlike "atomictemp", this applies to a directory. A directory cannot
    # work like "atomictemp" unless symlink is used. Symlink is not portable
    # so we don't use them. Therefore this is not atomic (while probably good
    # enough because we write files in a reasonable order - in the worst
    # case, we just drop those cache files).
    #
    # Ideally, we can have other dbms which support reader and writer to
    # co-exist, and this will become unnecessary.

    def __init__(self, dirname):
        self._origpath = dirname
        head, tail = os.path.split(dirname)
        tempdir = "%s-%s" % (dirname, os.getpid())
        self._tempdir = tempdir
        try:
            shutil.copytree(dirname, tempdir)
            super(linkrevdbwritewithtemprename, self).__init__(tempdir)
        except Exception:
            shutil.rmtree(tempdir)
            raise

    def close(self):
        super(linkrevdbwritewithtemprename, self).close()
        if util.safehasattr(self, "_tempdir"):
            for name in sorted(os.listdir(self._tempdir)):
                oldpath = os.path.join(self._tempdir, name)
                newpath = os.path.join(self._origpath, name)
                os.rename(oldpath, newpath)
            os.rmdir(self._tempdir)


def linkrevdb(dirname, write=False, copyonwrite=False):
    # As commented in the "linkrevdbwritewithtemprename" above, these flags
    # (write, copyonwrite) are mainly designed to workaround gdbm's locking
    # issues. If we have a dbm that uses a less aggressive lock, we could get
    # rid of these workarounds.
    if not write:
        return linkrevdbreadonly(dirname)
    else:
        if copyonwrite:
            return linkrevdbwritewithtemprename(dirname)
        else:
            return linkrevdbreadwrite(dirname)


_linkrevdbpath = "cache/linkrevdb"


def reposetup(ui, repo):
    if repo.local():
        # if the repo is single headed, adjustlinkrev can just return linkrev
        repo._singleheaded = len(repo.unfiltered().changelog.headrevs()) == 1
        dbpath = repo.localvfs.join(_linkrevdbpath)
        setattr(repo, "_linkrevcache", linkrevdb(dbpath, write=False))


@command(
    "debugbuildlinkrevcache",
    [
        ("e", "end", "", _("end revision")),
        (
            "",
            "copy",
            False,
            _("copy the database files to modify them lock-free (EXPERIMENTAL)"),
        ),
    ],
)
def debugbuildlinkrevcache(ui, repo, *pats, **opts):
    """build the linkrev database from filelogs"""
    db = linkrevdb(
        repo.localvfs.join(_linkrevdbpath),
        write=True,
        copyonwrite=opts.get("copy"),
    )
    end = int(opts.get("end") or (len(repo) - 1))
    try:
        _buildlinkrevcache(ui, repo, db, end)
    finally:
        db.close()
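# Build strategy (a summary of _buildlinkrevcache below): walk changesets
# from the last built rev up to "end"; for every file a changeset touches,
# look up its file node in the manifest. If the filelog already records this
# changeset as the linkrev (readfilelog=True), or one of the already-known
# linkrevs is an ancestor of this changeset (checkancestor=True), nothing new
# is learned; otherwise record this changeset as an additional linkrev for
# (path, fnode).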
def _buildlinkrevcache(ui, repo, db, end):
    checkancestor = ui.configbool("linkrevcache", "checkancestor", True)
    readfilelog = ui.configbool("linkrevcache", "readfilelog", True)

    repo = repo.unfiltered()
    cl = repo.changelog
    idx = cl.index
    ml = repo.manifestlog

    filelogcache = {}

    def _getfilelog(path):
        if path not in filelogcache:
            filelogcache[path] = filelog.filelog(repo.svfs, path)
        return filelogcache[path]

    start = db.getlastrev() + 1

    # the number of ancestor tests when the slow (Python) stateful (cache
    # ancestors) algorithm is faster than the fast (C) stateless (walk
    # through the changelog index every time) algorithm.
    ancestorcountthreshold = 10

    with progress.bar(ui, _("building"), _("changesets"), end) as prog:
        for rev in xrange(start, end + 1):
            prog.value = rev

            clr = cl.changelogrevision(rev)
            md = ml[clr.manifest].readfast()

            if checkancestor:
                if len(clr.files) >= ancestorcountthreshold:
                    # we may need to frequently test ancestors against rev,
                    # in this case, pre-calculating rev's ancestors helps.
                    ancestors = cl.ancestors([rev])

                    def isancestor(x):
                        return x in ancestors

                else:
                    # the C index ancestor testing is faster than Python's
                    # lazyancestors.
                    def isancestor(x):
                        return x in idx.commonancestorsheads(x, rev)

            for path in clr.files:
                if path not in md:
                    continue
                fnode = md[path]
                if readfilelog:
                    fl = _getfilelog(path)
                    frev = fl.rev(fnode)
                    lrev = fl.linkrev(frev)
                    if lrev == rev:
                        continue
                else:
                    lrev = None

                if checkancestor:
                    linkrevs = set(db.getlinkrevs(path, fnode))
                    if lrev is not None:
                        linkrevs.add(lrev)
                    if rev in linkrevs:
                        continue
                    if any(isancestor(l) for l in linkrevs):
                        continue

                # found a new linkrev!
                if ui.debugflag:
                    ui.debug(
                        "%s@%s: new linkrev %s\n" % (path, node.hex(fnode), rev)
                    )

                db.appendlinkrev(path, fnode, rev)

            db.setlastrev(rev)


@command("debugverifylinkrevcache", [])
def debugverifylinkrevcache(ui, repo, *pats, **opts):
    """read the linkrevs from the database and verify if they are correct"""
    # restore to the original _adjustlinkrev implementation
    c = context.basefilectx
    extensions.unwrapfunction(c, "_adjustlinkrev", _adjustlinkrev)

    paths = {}  # {id: name}
    nodes = {}  # {id: name}

    repo = repo.unfiltered()
    idx = repo.unfiltered().changelog.index
    db = repo._linkrevcache
    paths = dict(db._getdb(db._pathdbname))
    nodes = dict(db._getdb(db._nodedbname))
    pathsrev = dict((v, k) for k, v in paths.iteritems())
    nodesrev = dict((v, k) for k, v in nodes.iteritems())
    lrevs = dict(db._getdb(db._linkrevdbname))

    readfilelog = ui.configbool("linkrevcache", "readfilelog", True)

    total = len(lrevs)
    with progress.bar(ui, _("verifying"), total=total) as prog:
        for i, (k, v) in enumerate(lrevs.iteritems()):
            prog.value = i
            pathid, nodeid = k.split("\0")
            path = pathsrev[pathid]
            fnode = nodesrev[nodeid]
            linkrevs = _str2intlist(v)
            linkrevs.sort()

            for linkrev in linkrevs:
                fctx = repo[linkrev][path]
                introrev = fctx.introrev()
                fctx.linkrev()
                if readfilelog:
                    flinkrev = fctx.linkrev()
                else:
                    flinkrev = None
                if introrev == linkrev:
                    continue
                if introrev in idx.commonancestorsheads(introrev, linkrev) and (
                    introrev in linkrevs or introrev == flinkrev
                ):
                    adjective = _("unnecessary")
                else:
                    adjective = _("incorrect")
                ui.warn(
                    _("%s linkrev %s for %s @ %s (expected: %s)\n")
                    % (adjective, linkrev, path, node.hex(fnode), introrev)
                )

    ui.write(_("%d entries verified\n") % total)


def _adjustlinkrev(orig, self, *args, **kwds):
    lkr = self.linkrev()
    repo = self._repo

    # for a repo with only a single head, linkrev is accurate
    if getattr(repo, "_singleheaded", False):
        return lkr

    # args can be "path, flog, fnode, srcrev", or "srcrev" - see e81d72b4b0ae
    srcrev = args[-1]

    cache = getattr(self._repo, "_linkrevcache", None)
    if cache is not None and srcrev is not None:
        index = repo.unfiltered().changelog.index
        try:
            linkrevs = set(cache.getlinkrevs(self._path, self._filenode))
        except Exception:
            # the database may be locked - cannot be used correctly
            linkrevs = set()
        finally:
            # do not keep the database open so others can write to it.
            # note: this is bad for perf. but it's here to workaround the
            # gdbm locking pattern: reader and writer cannot co-exist. if we
            # have a dbm engine that locks differently, we don't need this.
            cache.close()
        linkrevs.add(lkr)
        for rev in sorted(linkrevs):
            # sorted filters out unnecessary linkrevs
            if rev in index.commonancestorsheads(rev, srcrev):
                return rev

    # fallback to the possibly slow implementation
    return orig(self, *args, **kwds)


def uisetup(ui):
    c = context.basefilectx
    extensions.wrapfunction(c, "_adjustlinkrev", _adjustlinkrev)