mirror of
https://github.com/facebook/sapling.git
synced 2024-10-11 01:07:15 +03:00
[manifestdiskcache] RFC: a disk cache for manifests
Summary: # Intercept manifest.revision(..). The cache is checked, and if it's not there, resume the normal path. Once the normal path is complete, write to the cache. # Intercept manifest._addrevision(..). This may be used in bulk operations, such as pull. Since we don't want to flood the cache with a lot of entries we may not care about, we record all the nodes that are added. We add an atexit hook to then record the last N nodes to the cache. # Writes to the cache are done to a temp file, then atomically renamed into place. # On reads, we run the checkhash function. This costs us ~100ms per manifest in fbsource, but ensures a corrupt cache doesn't break us. Love to debate this matter. # On each batch of writes, we spawn a background copy of ourselves to prune the cache. We use the mtime of a marker file to determine the last time the prune happened. We calculate the odds that we should be doing the prune using a couple constants and the time since the last prune. If another prune happened recently, the dominant factor is a small probability that we run the prune, regardless of the interval. If no prune has happened recently, the dominant factor is the seconds-since-prune configuration variable. Some performance numbers (all averaged across 10 runs) * diff between two revs: 2.29s without caching; 1.75s with caching * rebasing 4 diffs: 12.1s without caching; 10.7s with caching Test Plan: passed the rudimentary correctness unit tests. with fbsource, ran a small handful of commands without anybody tripping and falling. Reviewers: rmcelroy, ericsumner, mpm, pyd, durham Reviewed By: durham Subscribers: akushner, mitrandir, cdelahousse Differential Revision: https://phabricator.fb.com/D2564490 Signature: t1:2564490:1449075868:70974c62e6bff6521b6f500b5bff3a260ddd6c6d
This commit is contained in:
parent
b5178471a0
commit
1a1da38248
322
manifestdiskcache.py
Normal file
322
manifestdiskcache.py
Normal file
@ -0,0 +1,322 @@
|
||||
# manifestdiskcache.py - manifest disk cache for mercurial
|
||||
#
|
||||
# Copyright 2012 Facebook
|
||||
#
|
||||
# This software may be used and distributed according to the terms of the
|
||||
# GNU General Public License version 2 or any later version.
|
||||
|
||||
'''Cache manifests on disk to speed up access.
|
||||
|
||||
This extension intercepts reads and writes of manifests to cache them on disk.
|
||||
|
||||
On writes, we spawn a second process (to avoid penalizing interactive use) to
|
||||
check if we should prune the cache. The pruning is guided by several
|
||||
configuration variables:
|
||||
|
||||
manifestdiskcache.pinned-revsets: revsets to pin in the cache. NOTE: This is
|
||||
not implemented yet.
|
||||
|
||||
manifestdiskcache.cache-size: the upper limit for the size of the cache.
|
||||
|
||||
manifestdiskcache.runs-between-prunes: the approximate number of writes that
|
||||
will elapse before we prune.
|
||||
|
||||
manifestdiskcache.seconds-between-prunes: the number of seconds since the last
|
||||
prune that can elapse before we prune.
|
||||
|
||||
Because this is a cache, exceptions are generally suppressed. If the
|
||||
configuration variable manifestdiskcache.logging is set to True, exceptions will
|
||||
be written to standard error, but will still be suppressed.
|
||||
err
|
||||
|
||||
'''
|
||||
|
||||
from mercurial import changegroup, cmdutil, error, extensions, localrepo
|
||||
from mercurial import manifest, revlog, util
|
||||
from mercurial.node import bin, hex
|
||||
from mercurial.i18n import _
|
||||
|
||||
import collections
|
||||
import os
|
||||
import random
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
|
||||
# name of the cache directory inside the store; directory manifests use the
# same name underneath "meta/<dir>".
CACHE_SUBDIR = 'manifestdiskcache'
# hgrc section that holds this extension's settings. the same string is used
# as the key in the store opener's options that switches the cache on.
CONFIG_KEY = 'manifestdiskcache'
# cache entries are named after the hex form of the manifest node. the prune
# command treats any longer file name as a leftover temp file.
HEX_SHA_SIZE_BYTES = 40

# NOTE(review): presumably the Mercurial convention for declaring which hg
# versions the extension was tested with -- confirm.
testedwith = 'internal'
|
||||
|
||||
def replaceclass(container, classname):
    '''Replace a class with another in a module, and interpose it into
    the hierarchies of all loaded subclasses. This function is
    intended for use as a decorator.

      import mymodule
      @replaceclass(mymodule, 'myclass')
      class mysubclass(mymodule.myclass):
          def foo(self):
              f = super(mysubclass, self).foo()
              return f + ' bar'

    The replacement takes the exact position the old class had in each
    subclass's __bases__, so the relative order of the other bases (and
    therefore method resolution order) is preserved.

    Existing instances of the class being replaced will not have their
    __class__ modified, so call this function before creating any
    objects of the target type.
    '''
    def wrap(cls):
        oldcls = getattr(container, classname)
        for subcls in oldcls.__subclasses__():
            if subcls is cls:
                # the replacement itself must keep deriving from oldcls.
                continue

            newbases = []
            for base in subcls.__bases__:
                if base is oldcls:
                    # substitute in place rather than appending at the end,
                    # which would reorder the bases of a multi-base subclass.
                    base = cls
                if base not in newbases:
                    # a subclass that already listed cls must not end up
                    # with it twice.
                    newbases.append(base)
            subcls.__bases__ = tuple(newbases)

        setattr(container, classname, cls)
        return cls
    return wrap
|
||||
|
||||
# whether suppressed cache exceptions are reported on standard error. it is
# defined at import time so that code reading it before extsetup() has run
# sees a value instead of raising NameError; extsetup() overwrites it from
# the configuration.
logging = False

def extsetup(ui):
    '''Load manifestdiskcache.logging (default False) into the module-level
    ``logging`` flag.'''
    global logging
    logging = ui.configbool(CONFIG_KEY, 'logging', False)
|
||||
|
||||
cmdtable = {}
command = cmdutil.command(cmdtable)

def _pruneodds(delta, runsbetween, secondsbetween):
    '''Return the odds that a prune should run now.

    delta is the number of seconds since the marker file was last touched.
    The result starts at 1/runsbetween and grows linearly with delta,
    reaching 1.0 once secondsbetween seconds have elapsed.
    '''
    if secondsbetween <= 0:
        # a zero interval is accepted by the option validation; treat it as
        # "always due" instead of dividing by zero.
        return 1.0

    intercept = 1.0 / runsbetween
    # a marker timestamp in the future would otherwise drag the odds below
    # the configured floor.
    delta = max(delta, 0)
    return intercept + (((1 - intercept) * delta) / secondsbetween)

def _unlinkquietly(path):
    '''Remove path directly, ignoring failure to do so.

    os.unlink is used because opener.unlink will try to case-escape.
    '''
    try:
        os.unlink(path)
    except OSError:
        pass

@command(
    'prunemanifestdiskcache', [],
    _('hg prunemanifestdiskcache'))
def prunemanifestdiskcache(ui, repo):
    '''Trim the manifest disk cache down to manifestdiskcache.cache-size.

    Whether any pruning happens on a given invocation is probabilistic; see
    _pruneodds. When it does happen, leftover temp files are deleted and the
    least recently accessed entries are removed until the remaining ones fit
    within the size limit.

    Raises util.Abort if runs-between-prunes < 1 or
    seconds-between-prunes < 0.
    '''
    # retrieve the options.
    cachesizelimit = ui.configbytes(CONFIG_KEY, 'cache-size', '5g')
    runsbetween = ui.configint(CONFIG_KEY, 'runs-between-prunes', 100)
    secondsbetween = ui.configint(CONFIG_KEY, 'seconds-between-prunes', 86400)

    # validate the arguments
    if runsbetween < 1:
        raise util.Abort("runs-between-prunes should be >= 1")
    if secondsbetween < 0:
        raise util.Abort("seconds-between-prunes should be >= 0")

    store = repo.store
    opener = store.opener
    base = store.opener.join(None)

    # decide whether we run. the mtime of the marker file records when the
    # last prune happened.
    markerpath = os.path.join(base, CACHE_SUBDIR, '.marker')
    try:
        markerstat = os.stat(markerpath)
    except OSError:
        # no marker yet: create it and prune unconditionally.
        try:
            with open(markerpath, 'w'):
                pass
        except (IOError, OSError):
            # the marker cannot be created (for instance, the cache
            # directory does not exist yet), so there is nothing we can
            # usefully prune.
            ui.note("unable to create the prune marker; not pruning.\n")
            return
    else:
        delta = time.time() - markerstat.st_mtime
        odds = _pruneodds(delta, runsbetween, secondsbetween)

        if odds < random.random():
            # no pruning.
            ui.note("no pruning needed at this time.\n")
            return

    # update the file timestamp.
    os.utime(markerpath, None)

    # enumerate all the existing cache entries, most recently accessed first.
    entries = []
    for dirpath, dirs, files in opener.walk(CACHE_SUBDIR):
        for fname in files:
            # don't remove the marker.
            if fname == '.marker':
                continue

            path = os.path.join(base, dirpath, fname)
            if len(fname) > HEX_SHA_SIZE_BYTES:
                # this is probably a temp file. trash it.
                _unlinkquietly(path)
                continue

            try:
                entrystat = os.stat(path)
            except OSError:
                # file presumably does not exist.
                continue

            entries.append(
                (entrystat.st_atime, entrystat.st_size, path))

    entries.sort(reverse=True)
    ui.debug("pid: {0}\ncache entries: {1}\n".format(
        os.getpid(),
        "\n".join(["{0}".format(entry)
                   for entry in entries])))

    # TODO: honor manifestdiskcache.pinned-revsets by dropping the entries it
    # wants us to keep from the list. the option is not consulted yet.

    # accumulate up to cachesize, then remove the remainder.
    accumsize = 0
    for atime, size, path in entries:
        accumsize += size

        if accumsize > cachesizelimit:
            _unlinkquietly(path)
|
||||
|
||||
@replaceclass(changegroup, 'cg1unpacker')
class cg1unpackerwithdc(changegroup.cg1unpacker):
    '''cg1unpacker that flags the manifest as being in a batch operation
    for the duration of apply().'''

    def apply(self, repo, *args, **kwargs):
        '''Run the original apply() between markbatchoperationstart() and
        markbatchoperationend() on repo.manifest, and return its result.'''
        # manifests added while the flag is set are not written to the cache.
        repo.manifest.markbatchoperationstart()
        try:
            outcome = super(cg1unpackerwithdc, self).apply(
                repo, *args, **kwargs)
        finally:
            # always clear the flag, even if apply() raised.
            repo.manifest.markbatchoperationend()
        return outcome
|
||||
|
||||
@replaceclass(manifest, 'manifest')
class manifestwithdc(manifest.manifest):
    '''manifest that keeps full manifest texts in an on-disk cache.

    revision() is served from the cache when an entry exists and passes
    _checkhash(); otherwise the normal path is taken and the result is
    written to the cache. _addrevision() writes newly added manifests to the
    cache unless a batch operation is in progress. Every cache write is
    followed by spawning the prune command in the background.

    Because this is a cache, failures of the cache itself never fail the
    operation; they are reported on standard error when the module-level
    ``logging`` flag is set.
    '''

    def __init__(self, opener, dir='', dirlogcache=None):
        super(manifestwithdc, self).__init__(opener, dir, dirlogcache)

        # the cache is only active when the opener's options carry the
        # CONFIG_KEY flag.
        self.manifestdiskcacheenabled = False
        opts = getattr(opener, 'options', None)
        if opts is not None:
            self.manifestdiskcacheenabled = opts.get(
                CONFIG_KEY, False)

        if self.manifestdiskcacheenabled:
            # this logic is copied from the constructor of manifest.__init__
            if self._dir:
                self.diskcachedir = "meta/" + self._dir + CACHE_SUBDIR
            else:
                self.diskcachedir = CACHE_SUBDIR

        self.inbatchoperation = False

    def markbatchoperationstart(self):
        '''Stop _addrevision() from writing to the cache.'''
        self.inbatchoperation = True

    def markbatchoperationend(self):
        '''Let _addrevision() write to the cache again.'''
        self.inbatchoperation = False

    def revision(self, nodeorrev):
        '''Return the manifest text for nodeorrev, using the disk cache when
        it holds a valid entry.'''
        global logging

        if not self.manifestdiskcacheenabled:
            return super(manifestwithdc, self).revision(nodeorrev)

        # stays None if we fail before working out the cache key, in which
        # case the result is not written back either.
        hexnode = None

        try:
            if isinstance(nodeorrev, int):
                rev = nodeorrev
                node = self.node(nodeorrev)
            else:
                rev = self.rev(nodeorrev)
                node = nodeorrev

            hexnode = hex(node)

            cached = self._readfrommanifestcache(hexnode)
            if cached:
                # verify that the output passes _checkhash(..). an empty
                # file is never trusted; it is treated as a cache miss.
                return self._checkhash(cached, node, rev)
        except Exception:
            # it's a cache. suppress the exception, fall back to the normal
            # path, and report if logging is enabled. KeyboardInterrupt and
            # SystemExit are deliberately not intercepted.
            if logging:
                sys.stderr.write("Encountered exception in extension "
                                 "manifestdiskcache: {0}\n".format(
                                     traceback.format_exc()))

        result = super(manifestwithdc, self).revision(nodeorrev)

        # empty texts are not cached, since an empty entry would be treated
        # as a miss on the next read anyway.
        if hexnode is not None and result:
            self._writetomanifestcache(hexnode, result, logging)
            self._prune_cache()

        return result

    def _readfrommanifestcache(self, hexnode):
        '''Return the raw contents of the cache entry for hexnode, or None
        if the entry cannot be opened or read.'''
        subpath = os.path.join(self.diskcachedir,
                               hexnode[0:2], hexnode[2:4], hexnode)
        try:
            with self.opener(subpath, "rb") as fh:
                return fh.read()
        except IOError:
            # a missing entry is the expected case, so no need to sound the
            # alarms.
            return None

    def _addrevision(self, node, text, *args, **kwargs):
        '''Add the revision through the normal path, then write it to the
        cache unless the cache is disabled or a batch operation is active.'''
        global logging

        node = super(manifestwithdc, self)._addrevision(
            node, text, *args, **kwargs)

        if self.manifestdiskcacheenabled and not self.inbatchoperation:
            data = str(text)
            if data:
                self._writetomanifestcache(hex(node), data, logging)
                self._prune_cache()

        return node

    def _writetomanifestcache(self, hexnode, text, loggingenabled):
        '''Write text as the cache entry for hexnode.

        The data goes to a temp file that is renamed into place only after a
        successful write. Failures are suppressed, and reported on standard
        error if loggingenabled is true.
        '''
        try:
            base = self.opener.join(None)
            dirsubpath = os.path.join(self.diskcachedir,
                                      hexnode[0:2],
                                      hexnode[2:4])
            entrysubpath = os.path.join(dirsubpath, hexnode)

            try:
                os.makedirs(os.path.join(base, dirsubpath))
            except OSError:
                # typically the directory already exists. anything else will
                # surface when the temp file is created.
                pass

            # manifest texts are binary data, so write them in binary mode.
            fh = util.atomictempfile(
                os.path.join(base, entrysubpath),
                mode="w+b")
            try:
                fh.write(text)
            except BaseException:
                # close() would rename the partially written temp file into
                # place, so throw it away instead.
                fh.discard()
                raise
            else:
                fh.close()
        except Exception:
            # it's a cache. suppress the exception and report if logging is
            # enabled.
            if loggingenabled:
                sys.stderr.write("Encountered exception in extension "
                                 "manifestdiskcache: {0}\n".format(
                                     traceback.format_exc()))

    def _prune_cache(self):
        '''Spawn "hg prunemanifestdiskcache" in the background.'''
        global logging

        # spawn a subprocess (but don't wait for it) to prune the cache. we
        # never reap the child, so it can linger as a zombie until we (the
        # main process) exit. if this becomes an issue, we can have the
        # spawned subprocess execute the double-fork daemonization.
        #
        # NOTE(review): the command is started without -R or cwd, so it
        # presumably locates the repository from the current working
        # directory -- confirm this is right when hg is run with -R.
        cmd = util.hgcmd()[:]
        cmd.append("prunemanifestdiskcache")
        try:
            subprocess.Popen(cmd, close_fds=True)
        except Exception:
            # failing to start the prune must not fail the operation that
            # triggered it.
            if logging:
                sys.stderr.write("Encountered exception in extension "
                                 "manifestdiskcache: {0}\n".format(
                                     traceback.format_exc()))
|
||||
|
||||
@replaceclass(localrepo, 'localrepository')
class repowithmdc(localrepo.localrepository):
    '''localrepository that switches the manifest disk cache on through the
    store opener's options.'''

    def _applyopenerreqs(self):
        '''Apply the stock opener requirements, then set the CONFIG_KEY flag
        in the store opener's options.'''
        super(repowithmdc, self)._applyopenerreqs()
        storeoptions = self.svfs.options
        storeoptions[CONFIG_KEY] = True
|
114
tests/test-manifestdiskcache.t
Normal file
114
tests/test-manifestdiskcache.t
Normal file
@ -0,0 +1,114 @@
|
||||
Setup
|
||||
|
||||
$ extpath=$(dirname $TESTDIR)
|
||||
$ cp $extpath/manifestdiskcache.py $TESTTMP # use $TESTTMP substitution in message
|
||||
|
||||
Test functionality is present
|
||||
|
||||
$ mkdir create_on_commit
|
||||
$ cd create_on_commit
|
||||
$ hg init
|
||||
$ cat >> .hg/hgrc << EOF
|
||||
> [extensions]
|
||||
> manifestdiskcache=$TESTTMP/manifestdiskcache.py
|
||||
> [manifestdiskcache]
|
||||
> logging=True
|
||||
> EOF
|
||||
$ echo "abcabc" > abcabc
|
||||
$ hg add abcabc
|
||||
$ hg commit -m "testing 123"
|
||||
$ ls -1 .hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
.hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
$ echo "defdef" > defdef
|
||||
$ hg add defdef
|
||||
$ hg commit -m "testing 456"
|
||||
$ ls -1 .hg/store/manifestdiskcache/8a/85/8a854c1c1a950742983621c0632c0828e0fd8e12
|
||||
.hg/store/manifestdiskcache/8a/85/8a854c1c1a950742983621c0632c0828e0fd8e12
|
||||
$ hg diff -r 0 --nodates
|
||||
diff -r 53f12ffb3d86 defdef
|
||||
--- /dev/null
|
||||
+++ b/defdef
|
||||
@@ -0,0 +1,1 @@
|
||||
+defdef
|
||||
$ cd ..
|
||||
|
||||
Test that we prune the cache.
|
||||
|
||||
$ mkdir cache_prune
|
||||
$ cd cache_prune
|
||||
$ hg init
|
||||
$ cat >> .hg/hgrc << EOF
|
||||
> [extensions]
|
||||
> manifestdiskcache=$TESTTMP/manifestdiskcache.py
|
||||
> [manifestdiskcache]
|
||||
> logging=True
|
||||
> EOF
|
||||
$ echo "abcabc" > abcabc
|
||||
$ hg add abcabc
|
||||
$ hg commit -m "testing 123"
|
||||
$ ls -1 .hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
.hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
$ echo "defdef" > defdef
|
||||
$ hg add defdef
|
||||
$ hg commit -m "testing 456"
|
||||
$ ls -1 .hg/store/manifestdiskcache/8a/85/8a854c1c1a950742983621c0632c0828e0fd8e12
|
||||
.hg/store/manifestdiskcache/8a/85/8a854c1c1a950742983621c0632c0828e0fd8e12
|
||||
$ echo "ghighi" > ghighi
|
||||
$ hg add ghighi
|
||||
$ hg commit -m "testing 789"
|
||||
# the first two commits won't be accessed in subsequent commands, and as
|
||||
# such, should be pruned. the third commit will still be accessed when
|
||||
# creating the fourth commit. we wait 2 seconds because that's resolution
|
||||
# of atime on windows.
|
||||
$ sleep 2
|
||||
$ cat >> .hg/hgrc << EOF
|
||||
> [manifestdiskcache]
|
||||
> cache-size=431
|
||||
> runs-between-prunes=1
|
||||
> EOF
|
||||
$ echo "jkljkl" > jkljkl
|
||||
$ hg add jkljkl
|
||||
$ hg commit -m "testing 0ab"
|
||||
# ensure the prune command completes before we read out the disk.
|
||||
$ sleep 1
|
||||
$ ls -1 .hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
ls: .hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33: No such file or directory
|
||||
[1]
|
||||
$ ls -1 .hg/store/manifestdiskcache/8a/85/8a854c1c1a950742983621c0632c0828e0fd8e12
|
||||
ls: .hg/store/manifestdiskcache/8a/85/8a854c1c1a950742983621c0632c0828e0fd8e12: No such file or directory
|
||||
[1]
|
||||
$ ls -1 .hg/store/manifestdiskcache/fd/cf/fdcfc1aafe7a6dfe64bbe8358eefd5bd22ca9fb6
|
||||
.hg/store/manifestdiskcache/fd/cf/fdcfc1aafe7a6dfe64bbe8358eefd5bd22ca9fb6
|
||||
$ ls -1 .hg/store/manifestdiskcache/76/03/76035e7b5645d9b4ed6a3b904b23cd7592fdd01a
|
||||
.hg/store/manifestdiskcache/76/03/76035e7b5645d9b4ed6a3b904b23cd7592fdd01a
|
||||
$ cd ..
|
||||
|
||||
Test that a corrupt cache does not interfere with correctness.
|
||||
|
||||
$ mkdir corrupt_cache
|
||||
$ cd corrupt_cache
|
||||
$ hg init
|
||||
$ cat >> .hg/hgrc << EOF
|
||||
> [extensions]
|
||||
> manifestdiskcache=$TESTTMP/manifestdiskcache.py
|
||||
> EOF
|
||||
$ echo "abcabc" > abcabc
|
||||
$ hg add abcabc
|
||||
$ hg commit -m "testing 123"
|
||||
$ ls -1 .hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
.hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
$ echo "defdef" > defdef
|
||||
$ hg add defdef
|
||||
$ hg commit -m "testing 456"
|
||||
$ ls -1 .hg/store/manifestdiskcache/8a/85/8a854c1c1a950742983621c0632c0828e0fd8e12
|
||||
.hg/store/manifestdiskcache/8a/85/8a854c1c1a950742983621c0632c0828e0fd8e12
|
||||
$ echo "garbage" > .hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
$ echo "garbage" > .hg/store/manifestdiskcache/ce/e3/cee32e58a3ba8300f0a7f0d4d9a014c98cc2fc33
|
||||
$ hg diff -r 0 --nodates
|
||||
diff -r 53f12ffb3d86 defdef
|
||||
--- /dev/null
|
||||
+++ b/defdef
|
||||
@@ -0,0 +1,1 @@
|
||||
+defdef
|
||||
$ cd ..
|
||||
|
Loading…
Reference in New Issue
Block a user