sapling/fastmanifest.py
Tony Tung 703e9954c1 [fastmanifest] remove duplicated iteritems
Summary: Also moved `iterentries` alongside other `iter...` methods.

Test Plan: pass unit tests

Reviewers: #mercurial, lcharignon

Reviewed By: lcharignon

Subscribers: mitrandir, mjpieters

Differential Revision: https://phabricator.intern.facebook.com/D3294761

Signature: t1:3294761:1463438929:822453d8f99e85858cc8bdbac1188e7614d9abb3
2016-05-16 16:11:54 -07:00

638 lines
21 KiB
Python

# fastmanifest.py
#
# Copyright 2016 Facebook, Inc.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
"""
This extension adds fastmanifest, a treemanifest disk cache for speeding up
manifest comparison. It also contains utilities to investigate manifest access
patterns.
Configuration options:
[fastmanifest]
logfile = "" # Filename; if not empty, every manifest access is logged to it
Description:
`manifestaccesslogger` logs manifest accesses to the logfile specified with
the option fastmanifest.logfile
`fastmanifesttocache` is a revset of relevant manifests to cache
`hybridmanifest` is a proxy class for flat and cached manifest that loads
manifest from cache or from disk.
It chooses what kind of manifest is relevant to create based on the operation,
ideally the fastest.
TODO instantiate fastmanifest when they are more suitable
`manifestcache` is the class handling the interface with the cache, it supports
caching flat and fast manifest and retrieving them.
TODO logic for loading fastmanifest
TODO logic for saving fastmanifest
TODO garbage collection
`manifestfactory` is a class whose methods wrap the manifest-creating methods
of manifest.manifest. It intercepts the calls to build hybridmanifest instead
of regular manifests. We use a class for that to allow sharing the ui object
that is not normally accessible to manifests.
`debugcachemanifest` is a command calling `_cachemanifest`, a function to add
manifests to the cache and manipulate what is cached. It allows caching fast
and flat manifest, asynchronously and synchronously.
TODO handle asynchronous save
TODO size limit handling
"""
import array
import os
from mercurial import cmdutil
from mercurial import extensions
from mercurial import manifest
from mercurial import mdiff
from mercurial import revset
from mercurial import revlog
from mercurial import scmutil
from mercurial import util
import fastmanifest_wrapper
# Name of the directory holding the on-disk manifest cache; fastmanifestcache
# joins it to the base path of the opener it is given.
CACHE_SUBDIR = "manifestcache"
# Command table populated through the `command` decorator built from it.
cmdtable = {}
command = cmdutil.command(cmdtable)
class manifestaccesslogger(object):
    """Append the result of every wrapped manifest.rev call to a log file.

    Used to observe manifest access patterns and confirm our assumptions.
    """

    def __init__(self, logfile):
        # Path of the file that receives one line per logged access.
        self._logfile = logfile

    def _record(self, value):
        """Append `value` on its own line to the log file, best effort."""
        try:
            with open(self._logfile, "a") as sink:
                sink.write("%s\n" % value)
        except EnvironmentError:
            # Logging must never break the wrapped call: an unwritable log
            # file is deliberately ignored.
            pass

    def revwrap(self, orig, *args, **kwargs):
        """Wrap manifest.rev: call `orig`, log its result and return it."""
        result = orig(*args, **kwargs)
        self._record(result)
        return result
def fastmanifesttocache(repo, subset, x):
    """Revset of the interesting revisions to cache.

    Evaluates the revset expression ``not public() + bookmark()`` against
    `repo`. The `subset` and `x` arguments are accepted but not used.
    """
    specs = ["not public() + bookmark()"]
    return scmutil.revrange(repo, specs)
class hybridmanifest(object):
    """
    Hybrid manifest that behaves like a lazy manifest.

    Initialized with at least one of the three:
    - flat      an existing flat manifest
    - fast      an existing fast manifest
    - loadflat  a function to load a flat manifest from disk

    Every manifest operation is forwarded to whichever underlying manifest
    `_manifest` selects (the cached fast manifest when one is available, the
    flat manifest otherwise).
    """

    def __init__(self, ui, opener,
                 flat=None, fast=None, loadflat=None, node=None):
        self.__flatmanifest = flat
        self.__cachedmanifest = fast
        self.loadflat = loadflat

        # At least one source must be supplied, otherwise there is nothing to
        # proxy to and the first access would fail much later.
        assert (self.__flatmanifest is not None or
                self.__cachedmanifest is not None or
                self.loadflat is not None)

        self.ui = ui
        self.opener = opener
        self.node = node
        # Hex form of the node, used as the key into the fastmanifest cache.
        self.cachekey = revlog.hex(self.node) if self.node is not None else None
        self.fastcache = fastmanifestcache.getinstance(opener, self.ui)
        self.debugfastmanifest = (self.ui.configbool("fastmanifest",
                                                     "debugfastmanifest")
                                  if self.ui is not None
                                  else False)
        # Tri-state: True/False once known, None while no lookup was made.
        self.incache = True if self.__cachedmanifest is not None else None

    def _flatmanifest(self):
        """Return the flat manifest, loading or building it on first use."""
        if self.__flatmanifest is None:
            if self.loadflat is not None:
                # Load the manifest and cache it.
                self.__flatmanifest = self.loadflat()

                if isinstance(self.__flatmanifest, hybridmanifest):
                    # See comment in extsetup to see why we have to do that
                    self.__flatmanifest = self.__flatmanifest._flatmanifest()
            elif self.__cachedmanifest is not None:
                # build a flat manifest from the text of the fastmanifest.
                self.__flatmanifest = manifest.manifestdict(
                    self.__cachedmanifest.text())

            assert isinstance(self.__flatmanifest, manifest.manifestdict)
        return self.__flatmanifest

    def _cachedmanifest(self):
        """Return the cached fast manifest, or None when there is none.

        The cache is consulted at most once; the outcome is remembered in
        `self.incache`.
        """
        if self.incache is None:
            # Cache lookup
            if (self.cachekey is not None and
                    self.fastcache.contains(self.cachekey)):
                self.__cachedmanifest = self.fastcache.get(self.cachekey)

            self.incache = self.__cachedmanifest is not None

            self.ui.debug("cache %s for fastmanifest %s\n"
                          % ("hit" if self.incache else "miss", self.cachekey))

        return self.__cachedmanifest

    def _incache(self):
        """Tell whether a cached manifest is (believed to be) available.

        Unlike `_cachedmanifest` this does not load the cache entry.
        """
        if self.incache:
            return True
        elif self.cachekey:
            return self.fastcache.contains(self.cachekey)
        return False

    def _manifest(self, operation):
        """Return the manifest most suited for `operation` (flat or cached)."""
        # TODO return fastmanifest when suitable
        if self.debugfastmanifest:
            # Debug mode: always go through a fast manifest, converting the
            # flat one if needed. Compare against None so that an empty fast
            # manifest (len() == 0, hence falsy) is not rebuilt every time.
            if self.__cachedmanifest is not None:
                return self.__cachedmanifest

            flatmanifest = self._flatmanifest().text()
            fm = fastmanifest_wrapper.fastManifest(flatmanifest)
            self.__cachedmanifest = fastmanifestdict(fm)
            return self.__cachedmanifest

        c = self._cachedmanifest()
        if c is not None:
            return c

        r = self._flatmanifest()
        return r

    # Proxy all the manifest methods to the selected manifest, except magic
    # methods.
    def __getattr__(self, name):
        return getattr(self._manifest(name), name)

    # Magic methods are looked up on the type, not the instance, so
    # __getattr__ does not see them: each one is proxied explicitly through
    # _manifest.
    def __iter__(self):
        return self._manifest('__iter__').__iter__()

    def __contains__(self, key):
        return self._manifest('__contains__').__contains__(key)

    def __getitem__(self, key):
        return self._manifest('__getitem__').__getitem__(key)

    def __setitem__(self, key, val):
        return self._manifest('__setitem__').__setitem__(key, val)

    def __delitem__(self, key):
        return self._manifest('__delitem__').__delitem__(key)

    def __len__(self):
        return self._manifest('__len__').__len__()

    def copy(self):
        """Return a copy wrapped in a hybridmanifest, keeping the node."""
        copy = self._manifest('copy').copy()
        if isinstance(copy, hybridmanifest):
            return copy
        elif isinstance(copy, fastmanifestdict):
            return hybridmanifest(self.ui, self.opener, fast=copy,
                                  node=self.node)
        elif isinstance(copy, manifest.manifestdict):
            return hybridmanifest(self.ui, self.opener, flat=copy,
                                  node=self.node)
        else:
            raise ValueError("unknown manifest type {0}".format(type(copy)))

    def matches(self, *args, **kwargs):
        """Return the filtered manifest wrapped in a hybridmanifest.

        The result is created without a node.
        """
        matches = self._manifest('matches').matches(*args, **kwargs)
        if isinstance(matches, hybridmanifest):
            return matches
        elif isinstance(matches, fastmanifestdict):
            return hybridmanifest(self.ui, self.opener, fast=matches)
        elif isinstance(matches, manifest.manifestdict):
            return hybridmanifest(self.ui, self.opener, flat=matches)
        else:
            raise ValueError("unknown manifest type {0}".format(type(matches)))

    def diff(self, m2, *args, **kwargs):
        """Diff against `m2`, using two manifests of the same kind.

        Fast manifests are used when both sides are cached; otherwise both
        sides fall back to their flat manifest.
        """
        self.ui.debug("performing diff\n")
        # Find _m1 and _m2 of the same type, to provide the fastest computation
        _m1, _m2 = None, None

        if isinstance(m2, hybridmanifest):
            self.ui.debug("diff: other side is hybrid manifest\n")
            # CACHE HIT
            if self._incache() and m2._incache():
                _m1, _m2 = self._cachedmanifest(), m2._cachedmanifest()
                # _m1 or _m2 can be None if _incache was True if the cache
                # got garbage collected in the meantime or entry is corrupted.
                # Test for None explicitly: an empty manifest is falsy but is
                # a perfectly valid cache hit.
                if _m1 is None or _m2 is None:
                    self.ui.debug("diff: unable to load one or "
                                  "more manifests\n")
                    _m1, _m2 = self._flatmanifest(), m2._flatmanifest()
            # CACHE MISS
            else:
                self.ui.debug("diff: cache miss\n")
                _m1, _m2 = self._flatmanifest(), m2._flatmanifest()
        else:
            # This happens when diffing against a new manifest (like rev -1)
            self.ui.debug("diff: other side not hybrid manifest\n")
            _m1, _m2 = self._flatmanifest(), m2

        assert type(_m1) == type(_m2)
        return _m1.diff(_m2, *args, **kwargs)
class fastmanifestcache(object):
    """Two-level (in-memory, then on-disk) cache of fast manifests.

    Use `getinstance` rather than the constructor: a single instance is
    shared, created from the first (opener, ui) pair it is asked for.
    """
    _instance = None

    @classmethod
    def getinstance(cls, opener, ui):
        """Return the shared cache instance, creating it on first call."""
        if cls._instance is None:
            cls._instance = fastmanifestcache(opener, ui)
        return cls._instance

    def __init__(self, opener, ui):
        import errno

        self.opener = opener
        self.ui = ui
        # Maps inmemorycachekey(key) -> already loaded fastmanifestdict.
        self.inmemorycache = {}
        base = opener.join(None)
        self.cachepath = os.path.join(base, CACHE_SUBDIR)
        # Create the directory unconditionally and tolerate "already exists"
        # instead of checking first: another process may create it between
        # the check and the creation.
        try:
            os.makedirs(self.cachepath)
        except OSError as ex:
            if ex.errno != errno.EEXIST:
                raise

    def keyprefix(self):
        """Prefix distinguishing fast manifest entries in the cache."""
        return "fast"

    def load(self, fpath):
        """Load a fast manifest from `fpath`; None if it cannot be read."""
        try:
            fm = fastmanifest_wrapper.fastManifest.load(fpath)
        except EnvironmentError:
            return None
        else:
            return fastmanifestdict(fm)

    def dump(self, fpath, manifest):
        """Convert `manifest` to a fast manifest and save it to `fpath`."""
        # TODO: is this already a hybridmanifest/fastmanifest? if so, we may be
        # able to skip a frivolous conversion step.
        fm = fastmanifest_wrapper.fastManifest(manifest.text())
        fm.save(fpath)

    def inmemorycachekey(self, key):
        """Key under which `key` is stored in the in-memory cache."""
        return (self.keyprefix(), key)

    def filecachepath(self, key):
        """Path of the on-disk cache file for `key`."""
        return os.path.join(self.cachepath, self.keyprefix() + key)

    def get(self, key):
        """Return the cached manifest for `key`, or None when unavailable."""
        # In memory cache lookup. Compare against None: an empty manifest is
        # falsy but is still a valid entry.
        ident = self.inmemorycachekey(key)
        r = self.inmemorycache.get(ident, None)
        if r is not None:
            return r

        # On disk cache lookup
        realfpath = self.filecachepath(key)
        r = self.load(realfpath)

        # In memory cache update
        if r is not None:
            self.inmemorycache[ident] = r
        return r

    def contains(self, key):
        """Tell whether `key` is cached, in memory or on disk."""
        if self.inmemorycachekey(key) in self.inmemorycache:
            return True
        return os.path.exists(self.filecachepath(key))

    def put(self, key, manifest):
        """Store `manifest` on disk under `key` unless already cached."""
        if self.contains(key):
            self.ui.debug("skipped %s, already cached\n" % key)
        else:
            self.ui.debug("caching revision %s\n" % key)
            realfpath = self.filecachepath(key)
            # Write to a temporary file first and rename it into place so a
            # partially written entry never sits at the final path.
            tmpfpath = util.mktempcopy(realfpath, True)
            try:
                self.dump(tmpfpath, manifest)
                util.rename(tmpfpath, realfpath)
            finally:
                # After a successful rename the temporary file is gone and
                # the unlink fails; that is expected, hence ignored.
                try:
                    os.unlink(tmpfpath)
                except OSError:
                    pass

    def prune(self, limit):
        """Placeholder for cache eviction; currently does nothing."""
        # TODO logic to prune old entries
        pass
class manifestfactory(object):
    """Wrappers that make manifest.manifest hand out hybridmanifest objects.

    The methods are meant to be installed with extensions.wrapfunction; the
    class exists so the wrappers can share the ui object.
    """

    def __init__(self, ui):
        self.ui = ui

    def newmanifest(self, orig, *args, **kwargs):
        """Wrap a manifest constructor that has no node information.

        The wrapped call is deferred: it only runs when the returned
        hybridmanifest needs its flat manifest.
        """
        def loadflat():
            return orig(*args, **kwargs)

        # args[0] is the object the wrapped method was called on.
        opener = args[0].opener
        return hybridmanifest(self.ui, opener, loadflat=loadflat)

    def read(self, orig, *args, **kwargs):
        """Wrap a manifest reader whose first argument after self is a node.

        Same deferral as `newmanifest`, but the node (args[1]) is recorded on
        the returned hybridmanifest.
        """
        def loadflat():
            return orig(*args, **kwargs)

        opener = args[0].opener
        node = args[1]
        return hybridmanifest(self.ui, opener, loadflat=loadflat, node=node)
def _cachemanifest(ui, repo, revs, sync, limit):
    """Add the manifest of every revision in `revs` to the cache.

    `sync` is only reported in the debug output for now. When `limit` is
    truthy the cache is pruned to it once all revisions are stored.
    """
    ui.debug("caching rev: %s, synchronous(%s)\n" % (revs, sync))
    cache = fastmanifestcache.getinstance(repo.store.opener, ui)

    for rev in revs:
        mf = repo[rev].manifest()
        # Entries are keyed by the hex form of the manifest node.
        cache.put(revlog.hex(mf.node), mf)

    if not limit:
        return
    cache.prune(limit)
# The default of --limit is the integer 0 rather than False: the type of the
# default decides how the option is parsed, and a boolean default would turn
# --limit into a flag that cannot receive its BYTES value.
@command('^debugcachemanifest', [
    ('r', 'rev', [], 'cache the manifest for revs', 'REV'),
    ('a', 'all', False, 'cache all relevant revisions', ''),
    ('l', 'limit', 0, 'limit size of total rev in bytes', 'BYTES'),
    ('s', 'synchronous', False, 'wait for completion to return', '')],
    'hg debugcachemanifest')
def debugcachemanifest(ui, repo, *pats, **opts):
    """Cache the manifests of the selected revisions.

    With --all, the revisions come from the fastmanifesttocache() revset;
    otherwise from --rev. With neither, no revision is cached. A non-zero
    --limit is forwarded so the cache can be pruned afterwards.
    """
    sync = opts["synchronous"]
    limit = opts["limit"]
    if opts["all"]:
        revs = scmutil.revrange(repo, ["fastmanifesttocache()"])
    elif opts["rev"]:
        revs = scmutil.revrange(repo, opts["rev"])
    else:
        revs = []
    _cachemanifest(ui, repo, revs, sync, limit)
def extsetup(ui):
    """Install the manifest wrappers and register the revset symbol."""
    logpath = ui.config("fastmanifest", "logfile", "")
    wrappers = manifestfactory(ui)
    mfclass = manifest.manifest

    # Access logging is opt-in: it is only wired when a logfile is configured.
    if logpath:
        accesslog = manifestaccesslogger(logpath)
        extensions.wrapfunction(mfclass, 'rev', accesslog.revwrap)

    # Every function that creates a manifestdict gets wrapped. This is needed
    # because manifest creation can go through 7 different codepaths and the
    # node information, only known at the top level, has to be retained:
    #
    # read -> _newmanifest ---------------------------> manifestdict
    #
    # readshallowfast -> readshallow -----------------> manifestdict
    #    \                    \------> _newmanifest --> manifestdict
    #    --> readshallowdelta ------------------------> manifestdict
    #         \->readdelta    -------> _newmanifest --> manifestdict
    #         \->slowreaddelta  -->    _newmanifest --> manifestdict
    #
    # othermethods -----------------------------------> manifestdict
    #
    # As a consequence, on some codepaths a hybridmanifest wraps another
    # hybridmanifest; hybridmanifest._flatmanifest resolves that to the actual
    # flat manifest when asked.
    #
    # The nesting is at most 2 levels deep since only the two top level
    # functions and _newmanifest (wrapped only for the case of -1) are
    # wrapped.
    extensions.wrapfunction(mfclass, '_newmanifest', wrappers.newmanifest)
    extensions.wrapfunction(mfclass, 'read', wrappers.read)
    try:
        extensions.wrapfunction(mfclass, 'readshallowfast', wrappers.read)
    except AttributeError:
        # Older versions of hg do not define readshallowfast
        pass

    revset.symbols['fastmanifesttocache'] = fastmanifesttocache
    revset.safesymbols.add('fastmanifesttocache')
class fastmanifestdict(object):
    """manifestdict-like adapter around a native fast manifest.

    `fm` is the wrapped fast manifest object. Its entries are accessed as
    (node, flag) pairs keyed by file path.
    """

    def __init__(self, fm):
        self._fm = fm

    def __getitem__(self, key):
        """Return the node of `key`."""
        return self._fm[key][0]

    def find(self, key):
        """Return the (node, flag) pair of `key`."""
        return self._fm[key]

    def __len__(self):
        return len(self._fm)

    def __setitem__(self, key, node):
        # Keep the existing flag of the entry, or '' for a new entry.
        self._fm[key] = node, self.flags(key, '')

    def __contains__(self, key):
        return key in self._fm

    def __delitem__(self, key):
        del self._fm[key]

    def __iter__(self):
        return self._fm.__iter__()

    def iterkeys(self):
        return self._fm.iterkeys()

    def iterentries(self):
        return self._fm.iterentries()

    def iteritems(self):
        """Iterate over the first two fields of every entry."""
        return (x[:2] for x in self._fm.iterentries())

    def keys(self):
        return list(self.iterkeys())

    def filesnotin(self, m2):
        '''Set of files in this manifest that are not in the other'''
        diff = self.diff(m2)
        # A None node on the other side means the file is absent from m2.
        files = set(filepath
                    for filepath, hashflags in diff.iteritems()
                    if hashflags[1][0] is None)
        return files

    @util.propertycache
    def _dirs(self):
        return util.dirs(self)

    def dirs(self):
        return self._dirs

    def hasdir(self, dir):
        return dir in self._dirs

    def _filesfastpath(self, match):
        '''Checks whether we can correctly and quickly iterate over matcher
        files instead of over manifest files.'''
        files = match.files()
        return (len(files) < 100 and (match.isexact() or
            (match.prefix() and all(fn in self for fn in files))))

    def walk(self, match):
        '''Generates matching file names.

        Equivalent to manifest.matches(match).iterkeys(), but without creating
        an entirely new manifest.

        It also reports nonexistent files by marking them bad with match.bad().
        '''
        if match.always():
            for f in iter(self):
                yield f
            return

        fset = set(match.files())

        # avoid the entire walk if we're only looking for specific files
        if self._filesfastpath(match):
            for fn in sorted(fset):
                yield fn
            return

        for fn in self:
            if fn in fset:
                # specified pattern is the exact name
                fset.remove(fn)
            if match(fn):
                yield fn

        # for dirstate.walk, files=['.'] means "walk the whole tree".
        # follow that here, too
        fset.discard('.')

        # Whatever is left was asked for explicitly but is neither a file nor
        # a directory of this manifest.
        for fn in sorted(fset):
            if not self.hasdir(fn):
                match.bad(fn, None)

    def matches(self, match):
        '''generate a new manifest filtered by the match argument'''
        if match.always():
            return self.copy()

        if self._filesfastpath(match):
            # Few explicit files: copy just those entries into a fresh
            # manifest instead of filtering every entry.
            # NOTE(review): assumes fastManifest built from an empty text is
            # an empty manifest -- confirm against fastmanifest_wrapper.
            subset = fastmanifest_wrapper.fastManifest("")
            lm = self._fm
            for fn in match.files():
                if fn in lm:
                    subset[fn] = lm[fn]
            return fastmanifestdict(subset)

        return fastmanifestdict(self._fm.filtercopy(match))

    def diff(self, m2, clean=False):
        '''Finds changes between the current manifest and m2.

        Args:
          m2: the manifest to which this manifest should be compared.
          clean: if true, include files unchanged between these manifests
                 with a None value in the returned dictionary.

        The result is returned as a dict with filename as key and
        values of the form ((n1,fl1),(n2,fl2)), where n1/n2 is the
        nodeid in the current/other manifest and fl1/fl2 is the flag
        in the current/other manifest. Where the file does not exist,
        the nodeid will be None and the flags will be the empty
        string.
        '''
        # m2 must be a fastmanifestdict as well: the native diff is run on
        # the two wrapped manifests.
        return self._fm.diff(m2._fm, clean)

    def setflag(self, key, flag):
        """Set the flag of `key`, keeping its node."""
        self._fm[key] = self[key], flag

    def get(self, key, default=None):
        """Return the node of `key`, or `default` when it is missing."""
        try:
            return self._fm[key][0]
        except KeyError:
            return default

    def flags(self, key, default=''):
        """Return the flag of `key`, or `default` when it is missing."""
        try:
            return self._fm[key][1]
        except KeyError:
            return default

    def copy(self):
        c = fastmanifestdict(self._fm.copy())
        return c

    def text(self, usemanifestv2=False):
        """Return the manifest text, in v2 format when `usemanifestv2`."""
        if usemanifestv2:
            # _textv2 lives in mercurial's manifest module, like the
            # _msearch and _addlistdelta helpers used by fastdelta.
            return manifest._textv2(self._fm.iterentries())
        else:
            # use (probably) native version for v1
            return self._fm.text()

    def fastdelta(self, base, changes):
        """Given a base manifest text as an array.array and a list of changes
        relative to that text, compute a delta that can be used by revlog.

        `changes` is an iterable of (filename, todelete) pairs. Returns the
        pair (arraytext, deltatext). Raises AssertionError when asked to
        delete a file that is not in `base`.
        """
        delta = []
        dstart = None
        dend = None
        dline = [""]
        start = 0
        # zero copy representation of base as a buffer
        addbuf = util.buffer(base)

        changes = list(changes)
        if len(changes) < 1000:
            # start with a readonly loop that finds the offset of
            # each line and creates the deltas
            for f, todelete in changes:
                # start/end delimit the line of f in base; they are equal
                # (the insertion point) when f is not present
                start, end = manifest._msearch(addbuf, f, start)
                if not todelete:
                    h, fl = self._fm[f]
                    l = "%s\0%s%s\n" % (f, revlog.hex(h), fl)
                else:
                    if start == end:
                        # item we want to delete was not found, error out
                        from mercurial.i18n import _
                        raise AssertionError(
                            _("failed to remove %s from manifest") % f)
                    l = ""
                if dstart is not None and dstart <= start and dend >= start:
                    # This change touches the pending delta chunk: extend it.
                    if dend < end:
                        dend = end
                    if l:
                        dline.append(l)
                else:
                    # Flush the pending chunk, if any, and start a new one.
                    if dstart is not None:
                        delta.append([dstart, dend, "".join(dline)])
                    dstart = start
                    dend = end
                    dline = [l]

            if dstart is not None:
                delta.append([dstart, dend, "".join(dline)])
            # apply the delta to the base, and get a delta for addrevision
            deltatext, arraytext = manifest._addlistdelta(base, delta)
        else:
            # For large changes, it's much cheaper to just build the text and
            # diff it.
            arraytext = array.array('c', self.text())
            deltatext = mdiff.textdiff(base, arraytext)

        return arraytext, deltatext