Merge pull request #8436 from ThomasWaldmann/analyze-cmd

analyze: changed chunks per directory
This commit is contained in:
TW 2024-10-02 17:22:08 +02:00 committed by GitHub
commit 8cd951f324
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 356 additions and 0 deletions

91
docs/man/borg-analyze.1 Normal file
View File

@ -0,0 +1,91 @@
.\" Man page generated from reStructuredText.
.
.
.nr rst2man-indent-level 0
.
.de1 rstReportMargin
\\$1 \\n[an-margin]
level \\n[rst2man-indent-level]
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
-
\\n[rst2man-indent0]
\\n[rst2man-indent1]
\\n[rst2man-indent2]
..
.de1 INDENT
.\" .rstReportMargin pre:
. RS \\$1
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
. nr rst2man-indent-level +1
.\" .rstReportMargin post:
..
.de UNINDENT
. RE
.\" indent \\n[an-margin]
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
.nr rst2man-indent-level -1
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
..
.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool"
.SH NAME
borg-analyze \- Analyze archives
.SH SYNOPSIS
.sp
borg [common options] analyze [options]
.SH DESCRIPTION
.sp
Analyze archives to find \(dqhot spots\(dq.
.sp
Borg analyze relies on the usual archive matching options to select the
archives that should be considered for analysis (e.g. \fB\-a series_name\fP).
Then it iterates over all matching archives, over all contained files and
collects information about chunks stored in all directories it encountered.
.sp
It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed
size in the repository easily available) and adds up added/removed chunks\(aq
sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq.
.sp
You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe
some of these are temporary or cache directories you did forget to exclude.
.sp
To not have these unwanted directories in your backups, you could carefully
exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP
to re\-create existing archives without these.
.SH OPTIONS
.sp
See \fIborg\-common(1)\fP for common options of Borg commands.
.SS Archive filters
.INDENT 0.0
.TP
.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN
only consider archives matching all patterns. see \(dqborg help match\-archives\(dq.
.TP
.BI \-\-sort\-by \ KEYS
Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
.TP
.BI \-\-first \ N
consider first N archives after other filters were applied
.TP
.BI \-\-last \ N
consider last N archives after other filters were applied
.TP
.BI \-\-oldest \ TIMESPAN
consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
.TP
.BI \-\-newest \ TIMESPAN
consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m.
.TP
.BI \-\-older \ TIMESPAN
consider archives older than (now \- TIMESPAN), e.g. 7d or 12m.
.TP
.BI \-\-newer \ TIMESPAN
consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m.
.UNINDENT
.SH SEE ALSO
.sp
\fIborg\-common(1)\fP
.SH AUTHOR
The Borg Collective
.\" Generated by docutils manpage writer.
.

View File

@ -57,6 +57,7 @@ Usage
usage/delete
usage/prune
usage/info
usage/analyze
usage/mount
usage/recreate
usage/tar

1
docs/usage/analyze.rst Normal file
View File

@ -0,0 +1 @@
.. include:: analyze.rst.inc

View File

@ -0,0 +1,84 @@
.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!
.. _borg_analyze:
borg analyze
------------
.. code-block:: none
borg [common options] analyze [options]
.. only:: html
.. class:: borg-options-table
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| .. class:: borg-common-opt-ref |
| |
| :ref:`common_options` |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| **Archive filters** Archive filters can be applied to repository targets. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--first N`` | consider first N archives after other filters were applied |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--last N`` | consider last N archives after other filters were applied |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. |
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
.. raw:: html
<script type='text/javascript'>
$(document).ready(function () {
$('.borg-options-table colgroup').remove();
})
</script>
.. only:: latex
:ref:`common_options`
|
Archive filters
-a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives".
--sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
--first N consider first N archives after other filters were applied
--last N consider last N archives after other filters were applied
--oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
--newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m.
--older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m.
--newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m.
Description
~~~~~~~~~~~
Analyze archives to find "hot spots".
Borg analyze relies on the usual archive matching options to select the
archives that should be considered for analysis (e.g. ``-a series_name``).
Then it iterates over all matching archives, over all contained files and
collects information about chunks stored in all directories it encountered.
It considers chunk IDs and their plaintext sizes (we don't have the compressed
size in the repository easily available) and adds up added/removed chunks'
sizes per direct parent directory and outputs a list of "directory: size".
You can use that list to find directories with a lot of "activity" - maybe
some of these are temporary or cache directories you did forget to exclude.
To not have these unwanted directories in your backups, you could carefully
exclude these in ``borg create`` (for future backups) or use ``borg recreate``
to re-create existing archives without these.

View File

@ -64,6 +64,7 @@ def get_func(args):
raise Exception("expected func attributes not found")
from .analyze_cmd import AnalyzeMixIn
from .benchmark_cmd import BenchmarkMixIn
from .check_cmd import CheckMixIn
from .compact_cmd import CompactMixIn
@ -94,6 +95,7 @@ def get_func(args):
class Archiver(
AnalyzeMixIn,
BenchmarkMixIn,
CheckMixIn,
CompactMixIn,
@ -332,6 +334,7 @@ def build_parser(self):
subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")
self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
self.build_parser_check(subparsers, common_parser, mid_common_parser)
self.build_parser_compact(subparsers, common_parser, mid_common_parser)

View File

@ -0,0 +1,135 @@
import argparse
from collections import defaultdict
import os
from ._common import with_repository, define_archive_filters_group
from ..archive import Archive
from ..constants import * # NOQA
from ..helpers import bin_to_hex, Error
from ..helpers import ProgressIndicatorPercent
from ..manifest import Manifest
from ..remote import RemoteRepository
from ..repository import Repository
from ..logger import create_logger
logger = create_logger()
class ArchiveAnalyzer:
    """Compute per-directory chunk "churn" (added/removed chunk sizes) across a series of archives."""

    def __init__(self, args, repository, manifest):
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        # directory path -> accumulated plaintext size of chunks added or removed there
        # (NOT a count of chunks — the values of the per-archive chunk dicts are sizes)
        self.difference_by_path = defaultdict(int)

    def analyze(self):
        """Run the full analysis over the selected archives and print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria."""
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            # changes are computed pairwise between consecutive archives, so one archive is not enough
            raise Error("Need at least 2 archives to analyze.")
        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        i = 0
        info = archive_infos[i]
        pi.show(i)
        logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
        base = self.analyze_archive(info.id)
        for i, info in enumerate(archive_infos[1:]):
            pi.show(i + 1)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 2}/{num_archives})")
            new = self.analyze_archive(info.id)
            self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """compute the set of chunks for each directory in this archive"""
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(dict)  # directory path -> {chunk id: plaintext size}
        for item in archive.iter_items():
            if "chunks" in item:
                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
                directory_path = os.path.dirname(item.path)
                chunks_by_path[directory_path].update(item_chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """for each directory path, sum up the changed (removed or added) chunks' sizes between base and new."""

        def analyze_path_change(path):
            # base/new are defaultdicts (see analyze_archive), so a path missing
            # on one side simply yields an empty chunk dict.
            base_chunks = base[path]
            new_chunks = new[path]
            # add up added chunks' sizes
            for chunk_id in new_chunks.keys() - base_chunks.keys():
                self.difference_by_path[path] += new_chunks[chunk_id]
            # add up removed chunks' sizes
            for chunk_id in base_chunks.keys() - new_chunks.keys():
                self.difference_by_path[path] += base_chunks[chunk_id]

        # fix: the helper previously accumulated into the enclosing loop variable
        # `directory_path` instead of its own `path` parameter — it only worked by
        # accident because every call site passed that same variable. Iterating the
        # key union also avoids re-visiting paths and mutating `new` via defaultdict
        # lookups; the accumulated sums are identical.
        for path in base.keys() | new.keys():
            analyze_path_change(path)

    def report(self):
        """Print all directories sorted by descending churn as "directory: size" lines."""
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        for directory_path in sorted(self.difference_by_path, key=lambda p: self.difference_by_path[p], reverse=True):
            difference = self.difference_by_path[directory_path]
            print(f"{directory_path}: {difference}")
class AnalyzeMixIn:
    """Archiver mixin providing the ``borg analyze`` command and its parser."""

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        analyzer = ArchiveAnalyzer(args, repository, manifest)
        analyzer.analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        # register the "analyze" subcommand on the top-level parser
        from ._common import process_epilog

        epilog = process_epilog(
            """
        Analyze archives to find "hot spots".

        Borg analyze relies on the usual archive matching options to select the
        archives that should be considered for analysis (e.g. ``-a series_name``).
        Then it iterates over all matching archives, over all contained files and
        collects information about chunks stored in all directories it encountered.

        It considers chunk IDs and their plaintext sizes (we don't have the compressed
        size in the repository easily available) and adds up added/removed chunks'
        sizes per direct parent directory and outputs a list of "directory: size".

        You can use that list to find directories with a lot of "activity" - maybe
        some of these are temporary or cache directories you did forget to exclude.

        To not have these unwanted directories in your backups, you could carefully
        exclude these in ``borg create`` (for future backups) or use ``borg recreate``
        to re-create existing archives without these.
        """
        )
        analyze_parser = subparsers.add_parser(
            "analyze",
            parents=[common_parser],
            add_help=False,
            description=self.do_analyze.__doc__,
            epilog=epilog,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help="analyze archives",
        )
        analyze_parser.set_defaults(func=self.do_analyze)
        # -a/--match-archives, --sort-by, --first/--last, --oldest/--newest, --older/--newer
        define_archive_filters_group(analyze_parser)

View File

@ -0,0 +1,41 @@
import pathlib
from ...constants import * # NOQA
from . import cmd, generate_archiver_tests, RK_ENCRYPTION
# pytest hook: parametrize all tests in this module over the local archiver setups.
# A proper def instead of a lambda assignment (PEP 8 / E731), so the hook has a
# real name in tracebacks; behavior is unchanged.
def pytest_generate_tests(metafunc):
    return generate_archiver_tests(metafunc, kinds="local")  # NOQA
def test_analyze(archivers, request):
    """End-to-end check of ``borg analyze``: churn sums grow as files are added/removed."""
    archiver = request.getfixturevalue(archivers)

    def make_archive():
        cmd(archiver, "create", "archive", archiver.input_path)

    def run_analyze():
        return cmd(archiver, "analyze", "-a", "archive")

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    src = pathlib.Path(archiver.input_path)

    # 1st archive: just the baseline, nothing to compare yet
    (src / "file1").write_text("1")
    make_archive()

    # 2nd archive: one new file -> one chunk added under the input directory
    (src / "file2").write_text("22")
    make_archive()
    assert "/input: 2" in run_analyze()

    # 3rd archive: another new file -> two chunks added in total (2nd + 3rd)
    (src / "file3").write_text("333")
    make_archive()
    assert "/input: 5" in run_analyze()

    # 4th archive: removing a file also counts as churn (2 added + 1 removed)
    (src / "file2").unlink()
    make_archive()
    assert "/input: 7" in run_analyze()