sapling/infinitepush/backupcommands.py
Stanislau Hlebik 0e50fa2aa0 infinitepush: wrap deltaparent function
Summary:
On a big repo with many old heads, the bundle that is sent to the server can
become big (in some cases even 500 Mb). That looks like a waste of bundlestore
space and will probably make backup and restore slower. Most of the space is
taken by manifest deltas, because bundling prefers to diff each manifest
against the previous commit in the bundle. There are two possible approaches to reduce the size:
1) Send many small bundles (for example, one bundle per head)
2) Wrap the deltaparent function and diff against the actual parent instead of the previous commit in the bundle.

I chose the second approach for the following reasons:
1) It's easier to implement (main reason)
2) Many bundles probably means slower restore because there will be many requests to the bundlestore instead of just one

With this diff bundle size was reduced from 500 Mb to 8 Mb.

It can potentially increase CPU usage. I'm not sure how bad it is and will investigate further.

Test Plan: Run `test-infinitepush-*`

Reviewers: rmcelroy, mitrandir, durham

Reviewed By: durham

Subscribers: simpkins, mjpieters, #sourcecontrol

Differential Revision: https://phabricator.intern.facebook.com/D4384053

Tasks: 12479677

Signature: t1:4384053:1483664446:62ec30fad433e8d279758926199a8330cb73ed2b
2017-01-09 01:42:02 -08:00

369 lines
13 KiB
Python

# Copyright 2017 Facebook, Inc.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import absolute_import
import hashlib
import os
import re
import socket
from .bundleparts import (
getscratchbookmarkspart,
getscratchbranchpart,
)
from mercurial import (
bundle2,
changegroup,
cmdutil,
commands,
discovery,
encoding,
error,
hg,
util,
)
from collections import namedtuple
from hgext3rd.extutil import runshellcommand
from mercurial.extensions import wrapfunction, unwrapfunction
from mercurial.node import bin, hex, nullrev
from mercurial.i18n import _
# Command table of this module; filled in through the `command` decorator.
cmdtable = {}
# Decorator used below to register `pushbackup` and `pullbackup`.
command = cmdutil.command(cmdtable)
# Result of parsing a remote backup bookmark name (see _parsebackupbookmark):
# the host and repo root the backup came from, and the local bookmark name
# (None when the remote name represents a head rather than a bookmark).
backupbookmarktuple = namedtuple('backupbookmarktuple',
['hostname', 'reporoot', 'localbookmark'])
@command('pushbackup',
         [('', 'background', None, 'run backup in background')])
def backup(ui, repo, dest=None, **opts):
    """
    Pushes commits, bookmarks and heads to infinitepush.
    New non-extinct commits are saved since the last `hg pushbackup`
    or since 0 revision if this backup is the first.
    Local bookmarks are saved remotely as:
    infinitepush/backups/USERNAME/HOST/REPOROOT/bookmarks/LOCAL_BOOKMARK
    Local heads are saved remotely as:
    infinitepush/backups/USERNAME/HOST/REPOROOT/heads/HEAD_HASH
    """
    if opts.get('background'):
        # Re-run the same command detached from this process. The pieces are
        # joined into a single command line that relies on shell redirection
        # (presumably runshellcommand hands it to a shell), so every
        # user-supplied piece is quoted: a destination or log path with
        # spaces or shell metacharacters must not change the command.
        background_cmd = ['hg', 'pushbackup']
        if dest:
            background_cmd.append(util.shellquote(dest))
        logfile = ui.config('infinitepush', 'pushbackuplog')
        if logfile:
            background_cmd.extend(('>>', util.shellquote(logfile), '2>&1'))
        runshellcommand(' '.join(background_cmd), os.environ)
        return 0

    backuptip, bookmarkshash = _readbackupstatefile(ui, repo)
    bookmarkstobackup = _getbookmarkstobackup(ui, repo)

    # To avoid race conditions save current tip of the repo and backup
    # everything up to this revision.
    currenttiprev = len(repo) - 1
    other = _getremote(repo, ui, dest, **opts)
    # Note: _getrevstobackup may drop entries from bookmarkstobackup, so the
    # hash has to be computed after it.
    outgoing = _getrevstobackup(repo, other, backuptip,
                                currenttiprev, bookmarkstobackup)
    currentbookmarkshash = _getbookmarkshash(bookmarkstobackup)

    # Wrap deltaparent function to make sure that bundle takes less space
    # See _deltaparent comments for details
    wrapfunction(changegroup.cg2packer, 'deltaparent', _deltaparent)
    try:
        bundler = _createbundler(ui, repo, other)
        # Set as soon as at least one part was added to the bundle.
        needsbackup = False
        if outgoing and outgoing.missing:
            needsbackup = True
            bundler.addpart(getscratchbranchpart(repo, other, outgoing,
                                                 confignonforwardmove=False,
                                                 ui=ui, bookmark=None,
                                                 create=False))
        if currentbookmarkshash != bookmarkshash:
            needsbackup = True
            bundler.addpart(getscratchbookmarkspart(other, bookmarkstobackup))

        if needsbackup:
            _sendbundle(bundler, other)
            # Only remember the new state once the bundle was sent.
            _writebackupstatefile(repo.svfs, currenttiprev,
                                  currentbookmarkshash)
        else:
            ui.status(_('nothing to backup\n'))
    finally:
        # Always restore the original deltaparent, even if sending failed.
        unwrapfunction(changegroup.cg2packer, 'deltaparent', _deltaparent)
    return 0
@command('pullbackup', [
    ('', 'reporoot', '', 'root of the repo to restore'),
    ('', 'hostname', '', 'hostname of the repo to restore')])
def restore(ui, repo, dest=None, **opts):
    """
    Pulls commits from infinitepush that were previously saved with
    `hg pushbackup`.
    If user has only one backup for the `dest` repo then it will be restored.
    But user may have backed up many local repos that points to `dest` repo.
    These local repos may reside on different hosts or in different
    repo roots. It makes restore ambiguous; `--reporoot` and `--hostname`
    options are used to disambiguate.
    """
    other = _getremote(repo, ui, dest, **opts)
    sourcereporoot = opts.get('reporoot')
    sourcehostname = opts.get('hostname')

    # Fetch every backup bookmark of the current user, whatever host or repo
    # root it was created from.
    pattern = _getcommonuserprefix(ui) + '/*'
    fetchedbookmarks = other.listkeyspatterns('bookmarks', patterns=[pattern])

    reporoots = set()
    hostnames = set()
    nodestopull = set()
    localbookmarks = {}
    # items() rather than iteritems() to make it work in python2 and python3
    for book, node in fetchedbookmarks.items():
        parsed = _parsebackupbookmark(ui, book)
        if not parsed:
            # ui.warn does not terminate the line itself, so the message
            # needs its own newline.
            ui.warn(_('wrong format of backup bookmark: %s\n') % book)
            continue
        # Skip backups that do not match the requested repo root / hostname.
        if sourcereporoot and sourcereporoot != parsed.reporoot:
            continue
        if sourcehostname and sourcehostname != parsed.hostname:
            continue
        nodestopull.add(node)
        # Heads have no local bookmark; they only contribute a node to pull.
        if parsed.localbookmark:
            localbookmarks[parsed.localbookmark] = node
        reporoots.add(parsed.reporoot)
        hostnames.add(parsed.hostname)

    if len(reporoots) > 1:
        raise error.Abort(
            _('ambiguous repo root to restore: %s') % sorted(reporoots),
            hint=_('set --reporoot to disambiguate'))
    if len(hostnames) > 1:
        raise error.Abort(
            _('ambiguous hostname to restore: %s') % sorted(hostnames),
            hint=_('set --hostname to disambiguate'))

    # Reuse the regular pull command with its default options.
    pullcmd, pullopts = _getcommandandoptions('^pull')
    pullopts['rev'] = list(nodestopull)
    result = pullcmd(ui, repo, **pullopts)

    # Recreate the backed up bookmarks locally once the commits are here.
    with repo.wlock():
        with repo.lock():
            with repo.transaction('bookmark') as tr:
                for scratchbook, hexnode in localbookmarks.items():
                    repo._bookmarks[scratchbook] = bin(hexnode)
                    repo._bookmarks.recordchange(tr)
    return result
# Name of the file that keeps the state of the last backup. It is read and
# written by _readbackupstatefile/_writebackupstatefile and contains
# "<last backed up tip rev> <hash of backed up bookmarks>".
_backupedstatefile = 'infinitepushlastbackupedstate'
# Common helper functions
def _getcommonuserprefix(ui):
    '''Returns the prefix shared by all backup bookmarks of the current user:
    "infinitepush/backups/<short user name>"
    '''
    shortname = ui.shortuser(ui.username())
    return 'infinitepush/backups/' + shortname
def _getcommonprefix(ui, repo):
    '''Returns the prefix shared by all backup bookmarks of this repo:
    "<user prefix>/<hostname>/<repo root>", without a trailing slash.
    '''
    root = repo.origroot
    # The repo root is appended as a path, so make sure it starts with '/'
    if not root.startswith('/'):
        root = '/' + root
    userhost = '/'.join((_getcommonuserprefix(ui), socket.gethostname()))
    prefix = userhost + root
    # Drop a single trailing slash, if any
    return prefix[:-1] if prefix.endswith('/') else prefix
def _getbackupbookmarkprefix(ui, repo):
    '''Returns the prefix of remote names used to back up local bookmarks'''
    repoprefix = _getcommonprefix(ui, repo)
    return repoprefix + '/bookmarks'
def _escapebookmark(bookmark):
    '''
    Doubles every occurrence of "bookmarks" in `bookmark`, i.e. replaces it
    with "bookmarksbookmarks". This makes parsing of the remote bookmark
    name unambiguous. The name is converted with encoding.fromlocal first.
    '''
    plain, doubled = 'bookmarks', 'bookmarksbookmarks'
    converted = encoding.fromlocal(bookmark)
    return converted.replace(plain, doubled)
def _unescapebookmark(bookmark):
    '''
    Reverts _escapebookmark: converts the name with encoding.tolocal and
    replaces every "bookmarksbookmarks" with "bookmarks".
    '''
    doubled, plain = 'bookmarksbookmarks', 'bookmarks'
    converted = encoding.tolocal(bookmark)
    return converted.replace(doubled, plain)
def _getbackupbookmarkname(ui, bookmark, repo):
    '''Returns the remote name under which local `bookmark` is backed up'''
    escaped = _escapebookmark(bookmark)
    prefix = _getbackupbookmarkprefix(ui, repo)
    return prefix + '/' + escaped
def _getbackupheadprefix(ui, repo):
    '''Returns the prefix of remote names used to back up local heads'''
    repoprefix = _getcommonprefix(ui, repo)
    return repoprefix + '/heads'
def _getbackupheadname(ui, hexhead, repo):
    '''Returns the remote name under which head `hexhead` is backed up'''
    prefix = _getbackupheadprefix(ui, repo)
    return prefix + '/' + hexhead
def _getremote(repo, ui, dest, **opts):
    '''Returns a peer for `dest`.

    The path is looked up with 'default-push' and 'default' as fallbacks;
    the push location of the path is preferred over its plain location.
    Aborts if no path could be found.
    '''
    path = ui.paths.getpath(dest, default=('default-push', 'default'))
    if path:
        return hg.peer(repo, opts, path.pushloc or path.loc)
    raise error.Abort(_('default repository not configured!'),
                      hint=_("see 'hg help config.paths'"))
def _getcommandandoptions(command):
    '''Looks up `command` in the table of core commands.

    Returns a tuple of the command function and a dict that maps every
    option name of the command to the default value from its declaration.
    '''
    cmdfunc = commands.table[command][0]
    defaults = {}
    for declaration in commands.table[command][1]:
        # declaration[1] is the option name, declaration[2] its default
        name, default = declaration[1:3]
        defaults[name] = default
    return cmdfunc, defaults
# Backup helper functions
def _deltaparent(orig, self, revlog, rev, p1, p2, prev):
    '''Replacement for the packer's deltaparent that is installed during
    backup. It picks p1 rather than prev as delta base to use less space.
    `orig`, `p2` and `prev` are not used.
    '''
    storedbase = revlog.deltaparent(rev)
    # send full snapshot only if revlog configured to do so
    fullsnapshot = storedbase == nullrev and not revlog.storedeltachains
    return nullrev if fullsnapshot else p1
def _getdefaultbookmarkstobackup(ui, repo):
    '''Returns the entries that are part of every backup: the "<prefix>/*"
    patterns for backup heads and backup bookmarks, both mapped to an
    empty string.
    '''
    headspattern = _getbackupheadprefix(ui, repo) + '/*'
    bookmarkspattern = _getbackupbookmarkprefix(ui, repo) + '/*'
    return {headspattern: '', bookmarkspattern: ''}
def _getbookmarkstobackup(ui, repo):
    '''Returns a dict of remote backup name -> hex node that contains
    the default entries, all local bookmarks and all non-public heads.
    '''
    tobackup = _getdefaultbookmarkstobackup(ui, repo)

    # Local bookmarks
    for localname, node in repo._bookmarks.iteritems():
        remotename = _getbackupbookmarkname(ui, localname, repo)
        tobackup[remotename] = hex(node)

    # Non-public heads; each one is stored under its own hash
    for rev in repo.revs('head() & not public()'):
        headhex = repo[rev].hex()
        remotename = _getbackupheadname(ui, headhex, repo)
        tobackup[remotename] = headhex

    return tobackup
def _getbookmarkshash(bookmarkstobackup):
    '''Returns the sha1 hex digest of all (name, node) pairs of
    `bookmarkstobackup`, taken in sorted order so that the result does not
    depend on dict ordering. Names and nodes are fed to the hash one after
    another without separators.
    '''
    currentbookmarkshash = hashlib.sha1()
    # items() rather than iteritems() to make it work in python2 and python3
    for book, node in sorted(bookmarkstobackup.items()):
        currentbookmarkshash.update(book)
        currentbookmarkshash.update(node)
    return currentbookmarkshash.hexdigest()
def _createbundler(ui, repo, other):
    '''Returns a new bundle2 bundler for `other` that already contains
    the 'replycaps' part.
    '''
    remotecaps = bundle2.bundle2caps(other)
    bundler = bundle2.bundle20(ui, remotecaps)
    # Disallow pushback because we want to avoid taking repo locks.
    # And we don't need pushback anyway
    localcaps = bundle2.getrepocaps(repo, allowpushback=False)
    bundler.newpart('replycaps', data=bundle2.encodecaps(localcaps))
    return bundler
def _sendbundle(bundler, other):
    '''Sends the content of `bundler` to `other` via unbundle.

    A BundleValueError raised while sending is turned into an Abort.
    '''
    payload = util.chunkbuffer(bundler.getchunks())
    try:
        other.unbundle(payload, ['force'], other.url())
    except error.BundleValueError as unsupported:
        raise error.Abort(_('missing support for %s') % unsupported)
def findcommonoutgoing(repo, other, heads):
    '''Runs discovery against `other` limited to `heads` (revision numbers).

    Returns None if `heads` is empty.
    '''
    if not heads:
        return None
    headnodes = map(repo.changelog.node, heads)
    return discovery.findcommonoutgoing(repo, other, onlyheads=headnodes)
def _getrevstobackup(repo, other, backuptip, currenttiprev, bookmarkstobackup):
    '''Finds out what has to be sent to `other`.

    Draft heads with revision number >= `backuptip` are examined (none if
    `backuptip` is bigger than `currenttiprev`). Commits with files that
    can't be looked up locally are excluded together with their descendants;
    entries of `bookmarkstobackup` that point to excluded commits are removed
    from the dict in place.

    Returns the result of findcommonoutgoing, which is None if there are no
    heads to examine.
    '''
    # Use unfiltered repo because backuptip may now point to filtered commit
    repo = repo.unfiltered()
    revs = []
    if backuptip <= currenttiprev:
        revset = 'head() & draft() & %d:' % backuptip
        revs = list(repo.revs(revset))

    outgoing = findcommonoutgoing(repo, other, revs)

    rootstofilter = []
    if outgoing:
        # In rare cases it's possible to have node without filelogs only
        # locally. It is possible if remotefilelog is enabled and if node was
        # stripped server-side. In this case we want to filter these
        # nodes and all their descendants out
        for node in outgoing.missing:
            changectx = repo[node]
            for path in changectx.files():
                try:
                    changectx.filectx(path)
                except error.ManifestLookupError:
                    rootstofilter.append(changectx.rev())
                    # One broken file is enough to filter the commit out;
                    # no need to check the rest or to record it twice.
                    break

    if rootstofilter:
        revstofilter = list(repo.revs('%ld::', rootstofilter))
        revs = set(revs) - set(revstofilter)
        # Rerun discovery without the filtered out heads
        outgoing = findcommonoutgoing(repo, other, revs)
        filteredhexnodes = set([repo[filteredrev].hex()
                                for filteredrev in revstofilter])
        # Use list(...) to make it work in python2 and python3
        for book, hexnode in list(bookmarkstobackup.items()):
            if hexnode in filteredhexnodes:
                del bookmarkstobackup[book]

    return outgoing
def _readbackupstatefile(ui, repo):
    '''Reads the state of the last backup.

    Returns a tuple (backuptip, bookmarkshash): the first revision that may
    still need a backup (stored tip + 1) and the hash of the bookmarks that
    were backed up. Values that are missing or malformed are replaced with
    defaults: 0 and the hash of the default bookmarks to backup.
    '''
    fields = repo.svfs.tryread(_backupedstatefile).split(' ')

    backuptip = 0
    # hash of the default bookmarks to backup. This is to prevent backuping of
    # empty repo
    bookmarkshash = _getbookmarkshash(_getdefaultbookmarkstobackup(ui, repo))

    if len(fields) != 2:
        return backuptip, bookmarkshash

    storedtip, storedhash = fields
    try:
        backuptip = int(storedtip) + 1
    except ValueError:
        # keep the default tip if the stored one is not a number
        pass
    # a sha1 hex digest is always 40 characters long
    if len(storedhash) == 40:
        bookmarkshash = storedhash
    return backuptip, bookmarkshash
def _writebackupstatefile(vfs, backuptip, bookmarkshash):
    '''Stores "<backuptip> <bookmarkshash>" in the backup state file.
    The file is opened through `vfs` with atomictemp=True.
    '''
    with vfs(_backupedstatefile, mode="w", atomictemp=True) as statefile:
        state = ' '.join((str(backuptip), bookmarkshash))
        statefile.write(state)
# Restore helper functions
def _parsebackupbookmark(ui, backupbookmark):
    '''Parses backup bookmark and returns info about it

    Backup bookmark may represent either a local bookmark or a head.
    Returns None if backup bookmark has wrong format, otherwise returns
    a backupbookmarktuple.
    First entry is a hostname where this bookmark came from.
    Second entry is a root of the repo where this bookmark came from.
    Third entry in a tuple is local bookmark if backup bookmark
    represents a local bookmark and None otherwise.
    '''
    # Raw string: '\w' must reach the regex engine as is and is not a valid
    # string escape sequence.
    commonre = r'^{0}/([-\w.]+)(/.*)'.format(
        re.escape(_getcommonuserprefix(ui)))
    bookmarkre = commonre + '/bookmarks/(.*)$'
    headsre = commonre + '/heads/[a-f0-9]{40}$'

    # Bookmarks are tried first, heads only if that fails.
    match = re.search(bookmarkre, backupbookmark)
    if match:
        return backupbookmarktuple(
            hostname=match.group(1),
            reporoot=match.group(2),
            localbookmark=_unescapebookmark(match.group(3)))

    match = re.search(headsre, backupbookmark)
    if match:
        # It's a local head not a local bookmark.
        # That's why localbookmark is None
        return backupbookmarktuple(hostname=match.group(1),
                                   reporoot=match.group(2),
                                   localbookmark=None)
    return None