# Copyright 2016-present Facebook. All Rights Reserved.
#
# linkrevcache: a simple caching layer to speed up _adjustlinkrev
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
"""a simple caching layer to speed up _adjustlinkrev

The linkrevcache extension uses a pre-built database to speed up some
_adjustlinkrev operations. The database is stored in the directory
'.hg/cache/linkrevdb'.

To use the extension, you need to prebuild the database using the
`debugbuildlinkrevcache` command, and then keep the extension enabled.

To update the database, run `debugbuildlinkrevcache` again. It will find new
revisions and fill the database incrementally. If the building process is
slow, try setting `checkancestor` to False.

The database is not updated on demand, due to I/O and locking concerns. This
may be addressed in the future if some (partially) "append-only" map-like
data structure becomes available.

The linkrev cache generally speeds up `log` (when following a file) and
`annotate` operations.

.. note::

    The database format is not guaranteed to be portable. Copying it from one
    machine to another may make it unreadable.

Config examples::

    [linkrevcache]
    # Whether to test ancestors or not. (default: True)
    # - When set to False, the build process will be faster, while the
    #   database will contain some unnecessary entries (mode-only changes and
    #   merges where the file node is reused).
    # - When set to True, the database won't contain unnecessary entries.
    checkancestor = False

    # Whether to read the filelog or not. (default: True)
    # - When set to False, the build process will be faster, while the
    #   database will probably be much larger.
    # - When set to True, the filelog will be read and existing linkrevs
    #   won't be stored in the database.
    readfilelog = False
"""

import os
import shutil
import sys

from mercurial import (
    context,
    extensions,
    filelog,
    node,
    registrar,
    util,
)
from mercurial.i18n import _

testedwith = 'ships-with-fb-hgext'

cmdtable = {}
command = registrar.command(cmdtable)

_chosendbm = None

def _choosedbm():
    """return (name, module)"""
    global _chosendbm
    if not _chosendbm:
        if sys.version_info >= (3, 0):
            candidates = [('gdbm', 'dbm.gnu'), ('ndbm', 'dbm.ndbm'),
                          ('dumb', 'dbm.dumb')]
        else:
            candidates = [('gdbm', 'gdbm'), ('bsd', 'dbhash'),
                          ('ndbm', 'dbm'), ('dumb', 'dumbdbm')]
        for name, modname in candidates:
            try:
                mod = __import__(modname)
                mod.open  # sanity check with demandimport enabled
                _chosendbm = (name, mod)
                break
            except ImportError:
                pass
    return _chosendbm

# dbm is a bytes -> bytes map, so we need to convert integers to bytes.
# the conversion functions below are optimized for space usage.
# struct.(un)pack is not used because we may have things > 4 bytes (revlog
# defines the revision number to be 6 bytes) and 8 bytes would be wasteful.

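# For illustration (a sketch derived from the helpers below; the values are
# arbitrary examples): _int2str encodes an integer big-endian with no leading
# zero bytes, and _intlist2str prefixes each encoded integer with its byte
# length:
#
#     _int2str(1) == '\x01'
#     _int2str(300) == '\x01\x2c'
#     _intlist2str([1, 300]) == '\x01\x01\x02\x01\x2c'
#     _str2intlist('\x01\x01\x02\x01\x2c') == [1, 300]
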
def _strinc(s):
    """return the "next" string. useful as an incremental "ID"."""
    if not s:
        # avoid '\0' so '\0' could be used as a separator
        return '\x01'
    n = ord(s[-1])
    if n == 255:
        return _strinc(s[:-1]) + '\x01'
    else:
        return s[:-1] + chr(n + 1)

def _str2int(s):
    # this is faster than "bytearray().extend(map(ord, s))"
    x = 0
    for ch in s:
        x <<= 8
        x += ord(ch)
    return x

def _int2str(x):
    s = ''
    while x:
        s = chr(x & 255) + s
        x >>= 8
    return s

def _intlist2str(intlist):
    result = ''
    for n in intlist:
        s = _int2str(n)
        l = len(s)
        # do not accept huge integers
        assert l < 256
        result += chr(l) + s
    return result

def _str2intlist(s):
    result = []
    i = 0
    end = len(s)
    while i < end:
        l = ord(s[i])
        i += 1
        result.append(_str2int(s[i:i + l]))
        i += l
    return result

class linkrevdbreadonly(object):
    _openflag = 'r'

    # numbers are useful in the atomic replace case: they can be sorted
    # and replaced in a safer order. however, atomic caller should always
    # use repo lock so the order only protects things when the repo lock
    # does not work.
    _metadbname = '0meta'
    _pathdbname = '1path'
    _nodedbname = '2node'
    _linkrevdbname = '3linkrev'

    def __init__(self, dirname):
        dbmname, self._dbm = _choosedbm()
        # use different file names for different dbm engine, to make the repo
        # rsync-friendly across different platforms.
        self._path = os.path.join(dirname, dbmname)
        self._dbs = {}

    def getlinkrevs(self, path, fnode):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        try:
            pathid = pathdb[path]
            nodeid = nodedb[fnode]
            v = lrevdb[pathid + '\0' + nodeid]
            return _str2intlist(v)
        except KeyError:
            return []

    def getlastrev(self):
        return _str2int(self._getmeta('lastrev'))

    def close(self):
        # the check is necessary if __init__ fails - the caller may call
        # "close" in a "finally" block and it probably does not want close()
        # to raise an exception there.
        if util.safehasattr(self, '_dbs'):
            for db in self._dbs.itervalues():
                db.close()
            self._dbs.clear()

    def _getmeta(self, name):
        try:
            return self._getdb(self._metadbname)[name]
        except KeyError:
            return ''

    def _getdb(self, name):
        if name not in self._dbs:
            self._dbs[name] = self._dbm.open(self._path + name,
                                             self._openflag)
        return self._dbs[name]

class linkrevdbreadwrite(linkrevdbreadonly):
    _openflag = 'c'

    def __init__(self, dirname):
        util.makedirs(dirname)
        super(linkrevdbreadwrite, self).__init__(dirname)

    def appendlinkrev(self, path, fnode, linkrev):
        pathdb = self._getdb(self._pathdbname)
        nodedb = self._getdb(self._nodedbname)
        lrevdb = self._getdb(self._linkrevdbname)
        metadb = self._getdb(self._metadbname)

        try:
            pathid = pathdb[path]
        except KeyError:
            pathid = _strinc(self._getmeta('pathid'))
            pathdb[path] = pathid
            metadb['pathid'] = pathid
        try:
            nodeid = nodedb[fnode]
        except KeyError:
            nodeid = _strinc(self._getmeta('nodeid'))
            nodedb[fnode] = nodeid
            metadb['nodeid'] = nodeid

        k = pathid + '\0' + nodeid
        try:
            v = _str2intlist(lrevdb[k])
        except KeyError:
            v = []
        if linkrev in v:
            return
        v.append(linkrev)
        lrevdb[k] = _intlist2str(v)

    def setlastrev(self, rev):
        self._getdb(self._metadbname)['lastrev'] = _int2str(rev)

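# A minimal usage sketch for the database classes above ('somepath' and
# 'somefnode' are hypothetical placeholders; linkrevdb() is the factory
# function defined below):
#
#     db = linkrevdb(dirname, write=True)
#     try:
#         db.appendlinkrev('somepath', somefnode, 12345)
#         db.setlastrev(12345)
#     finally:
#         db.close()
#
#     db = linkrevdb(dirname)  # read-only
#     try:
#         linkrevs = db.getlinkrevs('somepath', somefnode)  # -> [12345]
#     finally:
#         db.close()
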
class linkrevdbwritewithtemprename(linkrevdbreadwrite):
    # Some dbm implementations (ex. gdbm) disallow a writer and a reader to
    # co-exist. This is basically a workaround so a writer can still write to
    # the (copied) database when there is a reader.
    #
    # Unlike "atomictemp", this applies to a directory. A directory cannot
    # work like "atomictemp" unless symlinks are used. Symlinks are not
    # portable, so we don't use them. Therefore this is not atomic (but
    # probably good enough because we write files in a reasonable order - in
    # the worst case, we just drop those cache files).
    #
    # Ideally, we could have other dbms which allow a reader and a writer to
    # co-exist, and this class would become unnecessary.

    def __init__(self, dirname):
        self._origpath = dirname
        tempdir = '%s-%s' % (dirname, os.getpid())
        self._tempdir = tempdir
        try:
            shutil.copytree(dirname, tempdir)
            super(linkrevdbwritewithtemprename, self).__init__(tempdir)
        except Exception:
            shutil.rmtree(tempdir)
            raise

    def close(self):
        super(linkrevdbwritewithtemprename, self).close()
        if util.safehasattr(self, '_tempdir'):
            for name in sorted(os.listdir(self._tempdir)):
                oldpath = os.path.join(self._tempdir, name)
                newpath = os.path.join(self._origpath, name)
                os.rename(oldpath, newpath)
            os.rmdir(self._tempdir)

def linkrevdb(dirname, write=False, copyonwrite=False):
    # As commented in "linkrevdbwritewithtemprename" above, these flags
    # (write, copyonwrite) are mainly designed to work around gdbm's locking
    # issues. If we have a dbm that uses a less aggressive lock, we could get
    # rid of these workarounds.
    if not write:
        return linkrevdbreadonly(dirname)
    else:
        if copyonwrite:
            return linkrevdbwritewithtemprename(dirname)
        else:
            return linkrevdbreadwrite(dirname)

_linkrevdbpath = 'cache/linkrevdb'

def reposetup(ui, repo):
    if repo.local():
        # if the repo is single-headed, adjustlinkrev can just return linkrev
        repo._singleheaded = (len(repo.unfiltered().changelog.headrevs()) == 1)
        dbpath = repo.vfs.join(_linkrevdbpath)
        setattr(repo, '_linkrevcache', linkrevdb(dbpath, write=False))

@command('debugbuildlinkrevcache',
         [('e', 'end', '', _('end revision')),
          ('', 'copy', False, _('copy the database files to modify them '
                                'lock-free (EXPERIMENTAL)'))])
def debugbuildlinkrevcache(ui, repo, *pats, **opts):
    """build the linkrev database from filelogs"""
    db = linkrevdb(repo.vfs.join(_linkrevdbpath), write=True,
                   copyonwrite=opts.get('copy'))
    end = int(opts.get('end') or (len(repo) - 1))
    try:
        _buildlinkrevcache(ui, repo, db, end)
    finally:
        db.close()

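# An example invocation of the command above (a sketch; the revision number
# is arbitrary):
#
#     $ hg debugbuildlinkrevcache --end 100000
#
# Builds are incremental: _buildlinkrevcache below resumes from
# db.getlastrev() + 1, so re-running the command only processes new
# revisions.
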
def _buildlinkrevcache(ui, repo, db, end):
    checkancestor = ui.configbool('linkrevcache', 'checkancestor', True)
    readfilelog = ui.configbool('linkrevcache', 'readfilelog', True)

    repo = repo.unfiltered()
    cl = repo.changelog
    idx = cl.index
    ml = repo.manifestlog
    filelogcache = {}

    def _getfilelog(path):
        if path not in filelogcache:
            filelogcache[path] = filelog.filelog(repo.svfs, path)
        return filelogcache[path]

    start = db.getlastrev() + 1

    # the number of ancestor tests at which the slow (Python) stateful
    # (cached ancestors) algorithm becomes faster than the fast (C) stateless
    # (walk through the changelog index every time) algorithm.
    ancestorcountthreshold = 10

    for rev in xrange(start, end + 1):
        ui.progress(_('building'), rev, total=end, unit=_('changesets'))

        clr = cl.changelogrevision(rev)
        md = ml[clr.manifest].readfast()

        if checkancestor:
            if len(clr.files) >= ancestorcountthreshold:
                # we may need to frequently test ancestors against rev. in
                # this case, pre-calculating rev's ancestors helps.
                ancestors = cl.ancestors([rev])

                def isancestor(x):
                    return x in ancestors
            else:
                # the C index ancestor testing is faster than Python's
                # lazyancestors.
                def isancestor(x):
                    return x in idx.commonancestorsheads(x, rev)

        for path in clr.files:
            if path not in md:
                continue
            fnode = md[path]

            if readfilelog:
                fl = _getfilelog(path)
                frev = fl.rev(fnode)
                lrev = fl.linkrev(frev)
                if lrev == rev:
                    continue
            else:
                lrev = None

            if checkancestor:
                linkrevs = set(db.getlinkrevs(path, fnode))
                if lrev is not None:
                    linkrevs.add(lrev)
                if rev in linkrevs:
                    continue
                if any(isancestor(l) for l in linkrevs):
                    continue

            # found a new linkrev!
            if ui.debugflag:
                ui.debug('%s@%s: new linkrev %s\n'
                         % (path, node.hex(fnode), rev))
            db.appendlinkrev(path, fnode, rev)

        db.setlastrev(rev)

    ui.write()  # clear the progress bar

@command('debugverifylinkrevcache', [])
def debugverifylinkrevcache(ui, repo, *pats, **opts):
    """read the linkrevs from the database and verify if they are correct"""
    # restore the original _adjustlinkrev implementation
    c = context.basefilectx
    extensions.unwrapfunction(c, '_adjustlinkrev', _adjustlinkrev)

    repo = repo.unfiltered()
    idx = repo.changelog.index
    db = repo._linkrevcache
    paths = dict(db._getdb(db._pathdbname))  # {name: id}
    nodes = dict(db._getdb(db._nodedbname))  # {name: id}
    pathsrev = dict((v, k) for k, v in paths.iteritems())  # {id: name}
    nodesrev = dict((v, k) for k, v in nodes.iteritems())  # {id: name}
    lrevs = dict(db._getdb(db._linkrevdbname))

    readfilelog = ui.configbool('linkrevcache', 'readfilelog', True)

    total = len(lrevs)
    for i, (k, v) in enumerate(lrevs.iteritems()):
        ui.progress(_('verifying'), i, total=total)

        pathid, nodeid = k.split('\0')
        path = pathsrev[pathid]
        fnode = nodesrev[nodeid]
        linkrevs = _str2intlist(v)
        linkrevs.sort()

        for linkrev in linkrevs:
            fctx = repo[linkrev][path]
            introrev = fctx.introrev()
            if readfilelog:
                flinkrev = fctx.linkrev()
            else:
                flinkrev = None
            if introrev == linkrev:
                continue
            if (introrev in idx.commonancestorsheads(introrev, linkrev)
                and (introrev in linkrevs or introrev == flinkrev)):
                adjective = _('unnecessary')
            else:
                adjective = _('incorrect')
            ui.warn(_('%s linkrev %s for %s @ %s (expected: %s)\n')
                    % (adjective, linkrev, path, node.hex(fnode), introrev))

    ui.write(_('%d entries verified\n') % total)

def _adjustlinkrev(orig, self, *args, **kwds):
    lkr = self.linkrev()
    repo = self._repo

    # for a repo with only a single head, linkrev is accurate
    if getattr(repo, '_singleheaded', False):
        return lkr

    # args can be "path, flog, fnode, srcrev", or "srcrev" - see e81d72b4b0ae
    srcrev = args[-1]

    cache = getattr(repo, '_linkrevcache', None)
    if cache is not None and srcrev is not None:
        index = repo.unfiltered().changelog.index
        try:
            linkrevs = set(cache.getlinkrevs(self._path, self._filenode))
        except Exception:
            # the database may be locked - it cannot be used right now
            linkrevs = set()
        finally:
            # do not keep the database open so others can write to it.
            # note: this is bad for perf, but it's here to work around the
            # gdbm locking pattern: reader and writer cannot co-exist. if we
            # have a dbm engine that locks differently, we don't need this.
            cache.close()
        linkrevs.add(lkr)
        # sorted filters out unnecessary linkrevs
        for rev in sorted(linkrevs):
            if rev in index.commonancestorsheads(rev, srcrev):
                return rev

    # fall back to the possibly slow implementation
    return orig(self, *args, **kwds)

def uisetup(ui):
    c = context.basefilectx
    extensions.wrapfunction(c, '_adjustlinkrev', _adjustlinkrev)
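
# A minimal setup sketch (the extension path below is hypothetical):
#
#     [extensions]
#     linkrevcache = /path/to/linkrevcache.py
#
# With the extension enabled and the database built once with
# `hg debugbuildlinkrevcache`, the wrapped _adjustlinkrev above consults the
# cache first and only falls back to the original (slow) implementation when
# no suitable linkrev is found there.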