Merge pull request #740 from enkore/feature-diff

borg diff: find different files between archives
This commit is contained in:
TW 2016-03-17 00:23:23 +01:00
commit f0cb6379b7
4 changed files with 266 additions and 13 deletions

View File

@ -7,6 +7,7 @@ Borg Contributors ("The Borg Collective")
- Yuri D'Elia
- Michael Hanselmann <public@hansmi.ch>
- Teemu Toivanen <public@profnetti.fi>
- Marian Beermann <public@enkore.de>

Borg is a fork of Attic.

View File

@ -145,6 +145,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
self.numeric_owner = numeric_owner self.numeric_owner = numeric_owner
if start is None: if start is None:
start = datetime.utcnow() start = datetime.utcnow()
self.chunker_params = chunker_params
self.start = start self.start = start
if end is None: if end is None:
end = datetime.utcnow() end = datetime.utcnow()
@ -261,6 +262,7 @@ def save(self, name=None, timestamp=None):
'username': getuser(), 'username': getuser(),
'time': start.isoformat(), 'time': start.isoformat(),
'time_end': end.isoformat(), 'time_end': end.isoformat(),
'chunker_params': self.chunker_params,
}) })
data = msgpack.packb(metadata, unicode_errors='surrogateescape') data = msgpack.packb(metadata, unicode_errors='surrogateescape')
self.id = self.key.id_hash(data) self.id = self.key.id_hash(data)

View File

@ -1,9 +1,10 @@
from binascii import hexlify, unhexlify from binascii import hexlify, unhexlify
from datetime import datetime from datetime import datetime
from hashlib import sha256 from itertools import zip_longest
from operator import attrgetter from operator import attrgetter
import argparse import argparse
import functools import functools
import hashlib
import inspect import inspect
import io import io
import os import os
@ -81,6 +82,45 @@ def print_file_status(self, status, path):
if self.output_list and (self.output_filter is None or status in self.output_filter): if self.output_list and (self.output_filter is None or status in self.output_filter):
logger.info("%1s %s", status, remove_surrogates(path)) logger.info("%1s %s", status, remove_surrogates(path))
@staticmethod
def compare_chunk_contents(chunks1, chunks2):
"""Compare two chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`)"""
end = object()
alen = ai = 0
blen = bi = 0
while True:
if not alen - ai:
a = next(chunks1, end)
if a is end:
return not blen - bi and next(chunks2, end) is end
a = memoryview(a)
alen = len(a)
ai = 0
if not blen - bi:
b = next(chunks2, end)
if b is end:
return not alen - ai and next(chunks1, end) is end
b = memoryview(b)
blen = len(b)
bi = 0
slicelen = min(alen - ai, blen - bi)
if a[ai:ai + slicelen] != b[bi:bi + slicelen]:
return False
ai += slicelen
bi += slicelen
@staticmethod
def build_matcher(excludes, paths):
    """Build a PatternMatcher from exclude patterns and include paths.

    Returns the matcher plus the list of include patterns, so callers can
    warn about include patterns that never matched anything.
    """
    matcher = PatternMatcher()
    if excludes:
        matcher.add(excludes, False)
    if paths:
        include_patterns = [parse_pattern(path, PathPrefixPattern) for path in paths]
    else:
        include_patterns = []
    matcher.add(include_patterns, True)
    # With no explicit includes, everything not excluded counts as a match.
    matcher.fallback = not include_patterns
    return matcher, include_patterns
def do_serve(self, args): def do_serve(self, args):
"""Start in server mode. This command is usually not used manually. """Start in server mode. This command is usually not used manually.
""" """
@ -305,17 +345,7 @@ def do_extract(self, args):
archive = Archive(repository, key, manifest, args.location.archive, archive = Archive(repository, key, manifest, args.location.archive,
numeric_owner=args.numeric_owner) numeric_owner=args.numeric_owner)
matcher = PatternMatcher() matcher, include_patterns = self.build_matcher(args.excludes, args.paths)
if args.excludes:
matcher.add(args.excludes, False)
include_patterns = []
if args.paths:
include_patterns.extend(parse_pattern(i, PathPrefixPattern) for i in args.paths)
matcher.add(include_patterns, True)
matcher.fallback = not include_patterns
output_list = args.output_list output_list = args.output_list
dry_run = args.dry_run dry_run = args.dry_run
@ -354,6 +384,123 @@ def do_extract(self, args):
self.print_warning("Include pattern '%s' never matched.", pattern) self.print_warning("Include pattern '%s' never matched.", pattern)
return self.exit_code return self.exit_code
def do_diff(self, args):
    """Diff contents of two archives"""
    def format_bytes(count):
        # A deleted file has no size; print a marker instead of a number.
        if count is None:
            return "<deleted>"
        return format_file_size(count)

    def fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2):
        # Fetch and compare actual chunk data. Needed when the archives may
        # have been created with different chunker params, in which case
        # differing chunk id lists do not imply differing contents.
        chunks1 = archive1.pipeline.fetch_many(chunk_ids1)
        chunks2 = archive2.pipeline.fetch_many(chunk_ids2)
        return self.compare_chunk_contents(chunks1, chunks2)

    def get_owner(item):
        # Return the owner either as numeric ids or as names, per --numeric-owner.
        if args.numeric_owner:
            return item[b'uid'], item[b'gid']
        else:
            return item[b'user'], item[b'group']

    def compare_items(path, item1, item2, deleted=False):
        """
        Compare two items with identical paths.

        :param deleted: Whether one of the items has been deleted
        """
        if not deleted:
            if item1[b'mode'] != item2[b'mode']:
                print(remove_surrogates(path), 'different mode')
                print('\t', args.location.archive, stat.filemode(item1[b'mode']))
                print('\t', args.archive2, stat.filemode(item2[b'mode']))
            user1, group1 = get_owner(item1)
            user2, group2 = get_owner(item2)
            if user1 != user2 or group1 != group2:
                print(remove_surrogates(path), 'different owner')
                print('\t', args.location.archive, 'user=%s, group=%s' % (user1, group1))
                print('\t', args.archive2, 'user=%s, group=%s' % (user2, group2))
            # Only regular files have content to compare below.
            if not stat.S_ISREG(item1[b'mode']):
                return
            if b'chunks' not in item1 or b'chunks' not in item2:
                # At least one of the items is a link
                if item1.get(b'source') != item2.get(b'source'):
                    print(remove_surrogates(path), 'different link')
                    print('\t', args.location.archive, item1.get(b'source', '<regular file>'))
                    print('\t', args.archive2, item2.get(b'source', '<regular file>'))
                return
        if deleted or not can_compare_chunk_ids or item1[b'chunks'] != item2[b'chunks']:
            # Contents are (potentially) different
            chunk_ids1 = [c[0] for c in item1[b'chunks']]
            chunk_ids2 = [c[0] for c in item2[b'chunks']]
            chunk_id_set1 = set(chunk_ids1)
            chunk_id_set2 = set(chunk_ids2)
            total1 = None if item1.get(b'deleted') else sum(c[1] for c in item1[b'chunks'])
            total2 = None if item2.get(b'deleted') else sum(c[1] for c in item2[b'chunks'])
            if (not can_compare_chunk_ids and total1 == total2 and not deleted and
                    fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2)):
                # Same total size and identical chunk data -> not different.
                return
            # Sum the sizes (c[1]) of the chunks unique to each side.
            # (Fix: the previous code iterated the set *difference of chunk
            # ids* and indexed into the id bytes, summing meaningless byte
            # values instead of chunk sizes.)
            added = sum(c[1] for c in item2[b'chunks'] if c[0] not in chunk_id_set1)
            removed = sum(c[1] for c in item1[b'chunks'] if c[0] not in chunk_id_set2)
            print(remove_surrogates(path), 'different contents')
            print('\t +%s, -%s, %s, %s' % (format_bytes(added), format_bytes(removed),
                                           format_bytes(total1), format_bytes(total2)))

    def compare_archives(archive1, archive2, matcher):
        # Items that (so far) have no same-path partner in the other archive.
        orphans_archive1 = {}
        orphans_archive2 = {}
        # Both archives list items sorted by path, so walk them in lockstep.
        for item1, item2 in zip_longest(
                archive1.iter_items(lambda item: matcher.match(item[b'path'])),
                archive2.iter_items(lambda item: matcher.match(item[b'path'])),
        ):
            if item1 and item2 and item1[b'path'] == item2[b'path']:
                compare_items(item1[b'path'], item1, item2)
                continue
            if item1:
                matching_orphan = orphans_archive2.pop(item1[b'path'], None)
                if matching_orphan:
                    compare_items(item1[b'path'], item1, matching_orphan)
                else:
                    orphans_archive1[item1[b'path']] = item1
            if item2:
                matching_orphan = orphans_archive1.pop(item2[b'path'], None)
                if matching_orphan:
                    compare_items(item2[b'path'], matching_orphan, item2)
                else:
                    orphans_archive2[item2[b'path']] = item2
        # At this point orphans_* contain items that had no matching partner in the other archive
        for added in orphans_archive2.values():
            compare_items(added[b'path'], {
                b'deleted': True,
                b'chunks': [],
            }, added, deleted=True)
        for deleted in orphans_archive1.values():
            compare_items(deleted[b'path'], deleted, {
                b'deleted': True,
                b'chunks': [],
            }, deleted=True)

    repository = self.open_repository(args)
    manifest, key = Manifest.load(repository)
    archive1 = Archive(repository, key, manifest, args.location.archive)
    archive2 = Archive(repository, key, manifest, args.archive2)

    # The mismatched defaults (False vs. True) make this comparison fail when
    # either archive lacks recorded chunker params, forcing the slow path
    # unless the user asserts --same-chunker-params.
    can_compare_chunk_ids = archive1.metadata.get(b'chunker_params', False) == archive2.metadata.get(
        b'chunker_params', True) or args.same_chunker_params
    if not can_compare_chunk_ids:
        self.print_warning('--chunker-params might be different between archives, diff will be slow.\n'
                           'If you know for certain that they are the same, pass --same-chunker-params '
                           'to override this check.')

    matcher, include_patterns = self.build_matcher(args.excludes, args.paths)

    compare_archives(archive1, archive2, matcher)

    for pattern in include_patterns:
        if pattern.match_count == 0:
            self.print_warning("Include pattern '%s' never matched.", pattern)
    return self.exit_code
def do_rename(self, args): def do_rename(self, args):
"""Rename an existing archive""" """Rename an existing archive"""
repository = self.open_repository(args, exclusive=True) repository = self.open_repository(args, exclusive=True)
@ -650,7 +797,7 @@ def do_debug_put_obj(self, args):
for path in args.paths: for path in args.paths:
with open(path, "rb") as f: with open(path, "rb") as f:
data = f.read() data = f.read()
h = sha256(data) # XXX hardcoded h = hashlib.sha256(data) # XXX hardcoded
repository.put(h.digest(), data) repository.put(h.digest(), data)
print("object %s put." % h.hexdigest()) print("object %s put." % h.hexdigest())
repository.commit() repository.commit()
@ -1095,6 +1242,41 @@ def build_parser(self, args=None, prog=None):
subparser.add_argument('paths', metavar='PATH', nargs='*', type=str, subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
help='paths to extract; patterns are supported') help='paths to extract; patterns are supported')
diff_epilog = textwrap.dedent("""
This command finds differences in files (contents, user, group, mode) between archives.
Both archives need to be in the same repository, and a repository location may only
be specified for ARCHIVE1.
See the output of the "borg help patterns" command for more help on exclude patterns.
""")
subparser = subparsers.add_parser('diff', parents=[common_parser],
description=self.do_diff.__doc__,
epilog=diff_epilog,
formatter_class=argparse.RawDescriptionHelpFormatter,
help='find differences in archive contents')
subparser.set_defaults(func=self.do_diff)
subparser.add_argument('-e', '--exclude', dest='excludes',
type=parse_pattern, action='append',
metavar="PATTERN", help='exclude paths matching PATTERN')
subparser.add_argument('--exclude-from', dest='exclude_files',
type=argparse.FileType('r'), action='append',
metavar='EXCLUDEFILE', help='read exclude patterns from EXCLUDEFILE, one per line')
subparser.add_argument('--numeric-owner', dest='numeric_owner',
action='store_true', default=False,
help='only consider numeric user and group identifiers')
subparser.add_argument('--same-chunker-params', dest='same_chunker_params',
action='store_true', default=False,
help='Override check of chunker parameters.')
subparser.add_argument('location', metavar='ARCHIVE1',
type=location_validator(archive=True),
help='archive')
subparser.add_argument('archive2', metavar='ARCHIVE2',
type=archivename_validator(),
help='archive to compare with ARCHIVE1 (no repository location)')
subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
help='paths to compare; patterns are supported')
rename_epilog = textwrap.dedent(""" rename_epilog = textwrap.dedent("""
This command renames an archive in the repository. This command renames an archive in the repository.
""") """)

View File

@ -1143,6 +1143,43 @@ def test_debug_put_get_delete_obj(self):
pass pass
class DiffArchiverTestCase(ArchiverTestCaseBase):
    # Borrow the fixture helpers from ArchiverTestCase instead of duplicating them.
    create_test_files = ArchiverTestCase.create_test_files
    create_regular_file = ArchiverTestCase.create_regular_file

    def test_basic_functionality(self):
        """End-to-end check of 'borg diff': archive a tree, mutate it (mode,
        link->file, dir->file, contents), archive again twice (once with
        different chunker params), and verify the reported differences."""
        self.create_test_files()
        self.cmd('init', self.repository_location)
        os.chmod('input/dir2', stat.S_IFDIR | 0o755)
        self.create_regular_file('file3', size=1024)
        self.cmd('create', self.repository_location + '::test0', 'input')
        # replace 'hardlink' with a file
        os.unlink('input/hardlink')
        self.create_regular_file('hardlink', size=1024 * 80)
        # replace directory with a file
        os.unlink('input/dir2/file2')
        os.rmdir('input/dir2')
        self.create_regular_file('dir2', size=1024 * 80)
        os.chmod('input/dir2', stat.S_IFREG | 0o755)
        # same size as before, different content -> only detectable via chunks
        self.create_regular_file('file3', size=1024, contents=b'0')
        self.cmd('create', self.repository_location + '::test1a', 'input')
        # test1b has identical input but different chunker params, exercising
        # the chunk-content comparison fallback in do_diff.
        self.cmd('create', '--chunker-params', '16,18,17,4095', self.repository_location + '::test1b', 'input')

        def do_asserts(output, archive):
            # Expected difference reports, independent of chunker params.
            assert 'input/file3 different contents' in output
            assert 'input/hardlink different mode' in output
            assert ('input/hardlink different link\n'
                    '    test0 input/file1\n'
                    '    test%s <regular file>' % archive) in output
            assert ('input/dir2 different mode\n'
                    '    test0 drwxr-xr-x\n'
                    '    test%s -rwxr-xr-x\n' % archive) in output
            assert 'input/dir2/file2 different contents' in output

        do_asserts(self.cmd('diff', self.repository_location + '::test0', 'test1a'), '1a')
        # We expect exit_code=1 due to the chunker params warning
        do_asserts(self.cmd('diff', self.repository_location + '::test0', 'test1b', exit_code=1), '1b')
def test_get_args(): def test_get_args():
archiver = Archiver() archiver = Archiver()
# everything normal: # everything normal:
@ -1162,3 +1199,34 @@ def test_get_args():
args = archiver.get_args(['borg', 'serve', '--restrict-to-path=/p1', '--restrict-to-path=/p2', ], args = archiver.get_args(['borg', 'serve', '--restrict-to-path=/p1', '--restrict-to-path=/p2', ],
'borg init /') 'borg init /')
assert args.func == archiver.do_serve assert args.func == archiver.do_serve
def test_compare_chunk_contents():
    """Archiver.compare_chunk_contents must depend only on the concatenated
    byte content, not on chunk boundaries, and must be symmetric."""
    def contents_equal(left, right):
        # The result must not depend on argument order; check both directions.
        forward = Archiver.compare_chunk_contents(iter(left), iter(right))
        backward = Archiver.compare_chunk_contents(iter(right), iter(left))
        assert forward == backward
        return forward

    # identical content, different chunk boundaries
    assert contents_equal([b'1234', b'567A', b'bC'],
                          [b'1', b'23', b'4567A', b'b', b'C'])
    # one iterator exhausted before the other
    assert not contents_equal([b'12345'], [b'1234', b'56'])
    # content mismatch
    assert not contents_equal([b'1234', b'65'], [b'1234', b'56'])
    # first is the prefix of second
    assert not contents_equal([b'1234', b'56'], [b'1234', b'565'])