snapshot: bundle metadata and the related files

Summary:
Instead of using the LFS remote storage, we chose to send the snapshot metadata via bundles.
Snapshot metadata consists of: the actual metadata blob + several other blobs (untracked files etc).
If we have several snapshot revisions in a single bundle, the blobs could repeat.
Then we should store each blob as a separate entry in a binary stream, keeping its id and contents.

Here we introduce a new bundle part type `"b2x:snapshotmetadata"`.
```
1 byte of version info
[ # a list of binary entries, each corresponds to a separate file
  # (either a metadata file itself or a related -- externally stored -- file)
  <oid><length><data>
  :oid: is a 64char string with the hash of the file
  :length: is an unsigned int with length of the data
  :data: is binary data of length <length>, the actual file contents
]
```

So far there is an ongoing discussion on the exact format of serialization.
Actual state is at [the quip doc](https://fb.quip.com/R5OVAzabX8oo).

Reviewed By: markbt

Differential Revision: D17184222

fbshipit-source-id: 90f833ec71556e90d513e3be3f3efa7f870b037d
This commit is contained in:
Aleksei Kulikov 2019-09-06 08:15:36 -07:00 committed by Facebook Github Bot
parent bcd08fcda3
commit 22dce8230d
6 changed files with 298 additions and 1 deletions

View File

@ -144,6 +144,19 @@ def getscratchbranchparts(
except KeyError:
pass
try:
snapshot = extensions.find("snapshot")
except KeyError:
pass
else:
data = snapshot.bundleparts.getmetadatafromrevs(repo, outgoing.missing)
if data:
parts.append(
bundle2.bundlepart(
snapshot.bundleparts.snapshotmetadataparttype, data=data
)
)
return parts

View File

@ -465,6 +465,12 @@ def processparts(orig, repo, op, unbundler):
except KeyError:
pass
try:
snapshot = extensions.find("snapshot")
partforwardingwhitelist.append(snapshot.bundleparts.snapshotmetadataparttype)
except KeyError:
pass
bundler = bundle2.bundle20(repo.ui)
compress = repo.ui.config("infinitepush", "bundlecompression", "UN")
bundler.setcompression(compress)

View File

@ -20,12 +20,16 @@ Configs::
[ui]
# Allow to run `hg checkout` for snapshot revisions
allow-checkout-snapshot = False
[snapshot]
# Sync snapshot metadata via bundle2
enable-sync-bundle = False
"""
from edenscm.mercurial import error, extensions, hg, registrar
from edenscm.mercurial.i18n import _
from . import blobstore, cmds as snapshotcommands, metadata
from . import blobstore, bundleparts, cmds as snapshotcommands, metadata
cmdtable = snapshotcommands.cmdtable
@ -33,6 +37,11 @@ cmdtable = snapshotcommands.cmdtable
configtable = {}
configitem = registrar.configitem(configtable)
configitem("ui", "allow-checkout-snapshot", default=False)
configitem("snapshot", "enable-sync-bundle", default=False)
def uisetup(ui):
    """Extension ui setup: delegate to bundleparts so the snapshot
    metadata bundle2 capability gets registered (when enabled)."""
    bundleparts.uisetup(ui)
def reposetup(ui, repo):

View File

@ -0,0 +1,114 @@
# Copyright 2019 Facebook, Inc.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
import struct
from edenscm.mercurial import bundle2, error
from edenscm.mercurial.i18n import _
from . import metadata
# Bundle2 part type carrying snapshot metadata blobs; the payload layout is
# described in binaryencode/binarydecode below.
snapshotmetadataparttype = "b2x:snapshotmetadata"
def uisetup(ui):
    """Advertise the snapshot metadata part as a bundle2 capability,
    but only when the snapshot.enable-sync-bundle config is set."""
    if ui.configbool("snapshot", "enable-sync-bundle"):
        bundle2.capabilities[snapshotmetadataparttype] = ()
def getmetadatafromrevs(repo, revs):
    """Collect snapshot metadata ids from the given revs and serialize them.

    Looks up the "snapshotmetadataid" extra on each rev (using the
    unfiltered repo, since snapshot commits may be hidden) and returns
    the binary-encoded metadata stream, or None when no rev carries
    snapshot metadata.
    """
    unfi = repo.unfiltered()
    ids = set()
    for rev in revs:
        # TODO(alexeyqu): move this check into a function
        if rev not in unfi:
            raise error.Abort(_("%s not found in repo\n") % rev)
        snapshotmetadataid = unfi[rev].extra().get("snapshotmetadataid", None)
        if snapshotmetadataid:
            ids.add(snapshotmetadataid)
    return binaryencode(repo, ids) if ids else None
@bundle2.parthandler(snapshotmetadataparttype)
def handlemetadata(op, inpart):
    """unpack metadata for snapshots

    Bundle2 part handler: decodes the incoming part stream and stores
    each contained blob in the receiving repo's snapshot store.
    """
    binarydecode(op.repo, inpart)
# Single big-endian byte holding the serialization format version.
_versionentry = struct.Struct(">B")
# Per-blob entry header: 64-char oid (sha256 hex) + unsigned int data length.
_binaryentry = struct.Struct(">64sI")
def binaryencode(repo, metadataids):
    """encode snapshot metadata into a binary stream

    the binary format is:
    <version-byte>[<chunk-id><chunk-length><chunk-content>]+
    :version-byte: is a version byte
    :chunk-id: is a string of 64 chars -- sha256 of the chunk
    :chunk-length: is an unsigned int
    :chunk-content: is the metadata contents (of length <chunk-length>)
    """
    # Version byte comes first.
    chunks = [_versionentry.pack(metadata.snapshotmetadata.VERSION)]
    auxfileids = set()
    # Metadata blobs first, collecting the aux file ids they reference.
    for metadataid in metadataids:
        snapmetadata = metadata.snapshotmetadata.getfromlocalstorage(repo, metadataid)
        auxfileids.update(snapmetadata.getauxfileids())
        content = snapmetadata.serialize()
        chunks.append(_binaryentry.pack(metadataid, len(content)))
        chunks.append(content)
    # Then the externally stored files mentioned in the metadata.
    for auxfileid in auxfileids:
        content = repo.svfs.snapshotstore.read(auxfileid)
        chunks.append(_binaryentry.pack(auxfileid, len(content)))
        chunks.append(content)
    return "".join(chunks)
def binarydecode(repo, stream):
    """decode a binary stream into individual blobs and store them

    Returns a list of file ids.

    the binary format is:
    <version-byte>[<chunk-id><chunk-length><chunk-content>]+
    :version-byte: is a version byte
    :chunk-id: is a string of 64 chars -- sha256 of the chunk
    :chunk-length: is an unsigned int
    :chunk-content: is the metadata contents (of length <chunk-length>)

    Raises error.Abort on a version mismatch or a truncated stream.
    """
    # check the version info
    version = _versionentry.unpack(stream.read(_versionentry.size))[0]
    if version != metadata.snapshotmetadata.VERSION:
        raise error.Abort(_("invalid version number %d") % version)
    entrysize = _binaryentry.size
    fileids = []
    while True:
        entry = stream.read(entrysize)
        if not entry:
            # clean end of stream
            break
        if len(entry) < entrysize:
            # entry header truncated mid-read
            raise error.Abort(_("bad snapshot metadata stream"))
        oid, length = _binaryentry.unpack(entry)
        data = stream.read(length)
        # Fix: previously a short read only aborted when some data came
        # back (`if data:`); a stream truncated exactly after an entry
        # header returned "" and silently stored an EMPTY blob under oid.
        # Any short read of the chunk body is corruption, so always abort.
        if len(data) < length:
            raise error.Abort(_("bad snapshot metadata stream"))
        repo.svfs.snapshotstore.write(oid, data)
        fileids.append(oid)
    return fileids

View File

@ -115,6 +115,12 @@ class snapshotmetadata(object):
except ValueError:
raise error.Abort(_("invalid metadata json: %s\n") % json_string)
def getauxfileids(self):
    """Return the set of oids of all externally stored files
    (untracked/unknown and local-vfs files) referenced by this metadata."""
    return {f.oid for f in self.unknown} | {f.oid for f in self.localvfsfiles}
@classmethod
def createfromworkingcopy(cls, repo, status=None, include_untracked=True):
metadata = cls(repo)

View File

@ -0,0 +1,149 @@
# Initial setup
$ setconfig extensions.lfs=
$ setconfig extensions.rebase=
$ setconfig extensions.snapshot=
$ setconfig extensions.treemanifest=!
$ setconfig visibility.enabled=true
$ . "$TESTDIR/library.sh"
$ . "$TESTDIR/infinitepush/library.sh"
$ setupcommon
$ setconfig infinitepushbackup.logdir="$TESTTMP/logs" infinitepushbackup.hostname=testhost
$ setconfig snapshot.enable-sync-bundle=true
# Setup server
$ hg init server
$ cd server
$ setupserver
$ cd ..
# Setup clients
$ hg clone -q ssh://user@dummy/server client
$ hg clone -q ssh://user@dummy/server restored
$ cd client
$ hg debugvisibility start
# Add a file to the store
$ echo "foo" > foofile
$ mkdir bar
$ echo "bar" > bar/file
$ hg add foofile bar/file
$ hg commit -m "add some files"
$ hg push
pushing to ssh://user@dummy/server
searching for changes
remote: adding changesets
remote: adding manifests
remote: adding file changes
remote: added 1 changesets with 2 changes to 2 files
# Call this state a base revision
$ BASEREV="$(hg id -i)"
$ echo "$BASEREV"
3490593cf53c
# Snapshot backup test plan:
# 1) Create a snapshot, back it up + restore on another client
# Setup the environment
$ echo "a" > mergefile
$ hg add mergefile
$ hg commit -m "merge #1"
$ MERGEREV="$(hg id -i)"
$ hg checkout "$BASEREV"
0 files updated, 0 files merged, 1 files removed, 0 files unresolved
$ echo "b" > mergefile
$ hg add mergefile
$ hg commit -m "merge #2"
$ hg merge "$MERGEREV"
merging mergefile
warning: 1 conflicts while merging mergefile! (edit, then use 'hg resolve --mark')
0 files updated, 0 files merged, 0 files removed, 1 files unresolved
use 'hg resolve' to retry unresolved file merges or 'hg update -C .' to abandon
[1]
$ hg rm bar/file
$ rm foofile
$ echo "another" > bazfile
$ hg add bazfile
$ echo "fizz" > untrackedfile
$ BEFORESTATUS="$(hg status --verbose)"
$ echo "$BEFORESTATUS"
M mergefile
A bazfile
R bar/file
! foofile
? mergefile.orig
? untrackedfile
# The repository is in an unfinished *merge* state.
# Unresolved merge conflicts:
#
# mergefile
#
# To mark files as resolved: hg resolve --mark FILE
# To continue: hg commit
# To abort: hg update --clean . (warning: this will discard uncommitted changes)
$ BEFOREDIFF="$(hg diff)"
$ echo "$BEFOREDIFF"
diff -r 6eb2552aed20 bar/file
--- a/bar/file Thu Jan 01 00:00:00 1970 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-bar
diff -r 6eb2552aed20 bazfile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bazfile Thu Jan 01 00:00:00 1970 +0000
@@ -0,0 +1,1 @@
+another
diff -r 6eb2552aed20 mergefile
--- a/mergefile Thu Jan 01 00:00:00 1970 +0000
+++ b/mergefile Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +1,5 @@
+<<<<<<< working copy: 6eb2552aed20 - test: merge #2
b
+=======
+a
+>>>>>>> merge rev: f473d4d5a1c0 - test: merge #1
# Make a snapshot
$ OID="$(hg snapshot create | cut -f2 -d' ')"
$ echo "$OID"
aaa7692160b6c5c0e4c13787d9343cf89fc2311a
# Back it up
$ hg cloud backup --hidden -r "$OID"
backing up stack rooted at f473d4d5a1c0
remote: pushing 3 commits:
remote: f473d4d5a1c0 merge #1
remote: 6eb2552aed20 merge #2
remote: aaa7692160b6 snapshot
backing up stack rooted at 6eb2552aed20
remote: pushing 3 commits:
remote: f473d4d5a1c0 merge #1
remote: 6eb2552aed20 merge #2
remote: aaa7692160b6 snapshot
commitcloud: backed up 3 commits
# Restore it on another client
$ cd ../restored
$ hg pull -r "$OID"
pulling from ssh://user@dummy/server
adding changesets
adding manifests
adding file changes
added 1 changesets with 2 changes to 2 files
adding changesets
adding manifests
adding file changes
added 3 changesets with 4 changes to 2 files
new changesets 3490593cf53c:aaa7692160b6
$ hg snapshot checkout "$OID"
will checkout on aaa7692160b6c5c0e4c13787d9343cf89fc2311a
3 files updated, 0 files merged, 0 files removed, 0 files unresolved
checkout complete
# hg status/diff are unchanged
$ test "$BEFORESTATUS" = "$(hg status --verbose)"
$ test "$BEFOREDIFF" = "$(hg diff)"