Coverage for bzfs_main/bzfs.py: 99%
4671 statements
« prev ^ index » next coverage.py v7.9.1, created at 2025-06-20 13:09 +0000
« prev ^ index » next coverage.py v7.9.1, created at 2025-06-20 13:09 +0000
1#
2# Copyright 2024 Wolfgang Hoschek AT mac DOT com
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
16# Inline script metadata conforming to https://packaging.python.org/specifications/inline-script-metadata
17# /// script
18# requires-python = ">=3.8"
19# dependencies = []
20# ///
22"""
23* Overview of the bzfs.py codebase:
24* The codebase starts with docs, definition of input data and associated argument parsing into a "Params" class.
25* All CLI option/parameter values are reachable from the "Params" class.
26* Control flow starts in main(), far below ..., which kicks off a "Job".
27* A Job runs one or more "tasks" via run_tasks(), each task replicating a separate dataset tree.
28* The core replication algorithm is in run_task() and especially in replicate_datasets() and replicate_dataset().
29* The filter algorithms that apply include/exclude policies are in filter_datasets() and filter_snapshots().
30* The --create-src-snapshots-* and --delete-* and --compare-* and --monitor-* algorithms also start in run_task().
31* Consider using an IDE/editor that can open multiple windows for the same (long) file, such as PyCharm or Sublime Text, etc.
32* The main retry logic is in run_with_retries() and clear_resumable_recv_state_if_necessary().
33* Progress reporting for use during `zfs send/recv` data transfers is in class ProgressReporter.
34* Executing a CLI commmand on a local or remote host is in run_ssh_command().
35* Network connection management is in refresh_ssh_connection_if_necessary() and class ConnectionPool.
36* Caching functionality can be found by searching for this regex: .*cach.*
37* The parallel processing engine is in itr_ssh_cmd_parallel() and process_datasets_in_parallel_and_fault_tolerant().
38* README.md is mostly auto-generated from the ArgumentParser help texts as the source of "truth", via update_readme.sh.
39Simply run that script whenever you change or add ArgumentParser help text.
40"""
42import argparse
43import ast
44import bisect
45import calendar
46import collections
47import concurrent
48import contextlib
49import copy
50import dataclasses
51import errno
52import fcntl
53import glob
54import hashlib
55import heapq
56import itertools
57import logging
58import logging.config
59import logging.handlers
60import math
61import operator
62import os
63import platform
64import pwd
65import random
66import re
67import selectors
68import shlex
69import shutil
70import signal
71import socket
72import stat
73import subprocess
74import sys
75import tempfile
76import threading
77import time
78import types
79from collections import defaultdict, deque, Counter
80from concurrent.futures import ThreadPoolExecutor, Future, FIRST_COMPLETED
81from dataclasses import dataclass, field
82from datetime import datetime, timedelta, timezone, tzinfo
83from logging import Logger
84from os import stat as os_stat, utime as os_utime
85from os.path import exists as os_path_exists, join as os_path_join
86from pathlib import Path
87from subprocess import CalledProcessError, DEVNULL, PIPE
88from typing import (
89 Any,
90 Callable,
91 DefaultDict,
92 Deque,
93 Dict,
94 Final,
95 FrozenSet,
96 Generator,
97 Generic,
98 ItemsView,
99 Iterable,
100 Iterator,
101 List,
102 Literal,
103 NamedTuple,
104 NoReturn,
105 Optional,
106 Protocol,
107 Sequence,
108 Set,
109 TextIO,
110 Tuple,
111 Type,
112 TypeVar,
113 Union,
114 cast,
115)
117from bzfs_main.utils import cut
119# constants:
120__version__ = "1.12.0-dev"
121prog_name = "bzfs"
122prog_author = "Wolfgang Hoschek"
123die_status = 3
124critical_status = 2
125warning_status = 1
126still_running_status = 4
127min_python_version = (3, 8)
128if sys.version_info < min_python_version:
129 print(f"ERROR: {prog_name} requires Python version >= {'.'.join(map(str, min_python_version))}!")
130 sys.exit(die_status)
131exclude_dataset_regexes_default = r"(.*/)?[Tt][Ee]?[Mm][Pp][-_]?[0-9]*" # skip tmp datasets by default
132create_src_snapshots_prefix_dflt = prog_name + "_"
133create_src_snapshots_suffix_dflt = "_adhoc"
134time_threshold_secs = 1.1 # 1 second ZFS creation time resolution + NTP clock skew is typically < 10ms
135disable_prg = "-"
136env_var_prefix = prog_name + "_"
137pv_file_thread_separator = "_"
138dummy_dataset = "dummy"
139zfs_version_is_at_least_2_1_0 = "zfs>=2.1.0"
140zfs_version_is_at_least_2_2_0 = "zfs>=2.2.0"
141zfs_recv_groups = {"zfs_recv_o": "-o", "zfs_recv_x": "-x", "zfs_set": ""}
142snapshot_regex_filter_names = frozenset({"include_snapshot_regex", "exclude_snapshot_regex"})
143snapshot_regex_filter_name = "snapshot_regex"
144snapshot_filters_var = "snapshot_filters_var"
145cmp_choices_items = ("src", "dst", "all")
146inject_dst_pipe_fail_kbytes = 400
147unixtime_infinity_secs = 2**64 # billions of years in the future and to be extra safe, larger than the largest ZFS GUID
148year_with_four_digits_regex = re.compile(r"[1-9][0-9][0-9][0-9]") # regex for empty target shall not match non-empty target
149log_stderr = (logging.INFO + logging.WARN) // 2 # custom log level is halfway in between
150log_stdout = (log_stderr + logging.INFO) // 2 # custom log level is halfway in between
151log_debug = logging.DEBUG
152log_trace = logging.DEBUG // 2 # custom log level is halfway in between
153SNAPSHOTS_CHANGED = "snapshots_changed" # See https://openzfs.github.io/openzfs-docs/man/7/zfsprops.7.html#snapshots_changed
154BARRIER_CHAR = "~"
155SHARED = "shared"
156DEDICATED = "dedicated"
157DONT_SKIP_DATASET = ""
158SHELL_CHARS = '"' + "'`~!@#$%^&*()+={}[]|;<>?,\\"
161def argument_parser() -> argparse.ArgumentParser:
162 create_src_snapshots_plan_example1 = str({"test": {"": {"adhoc": 1}}}).replace(" ", "")
163 create_src_snapshots_plan_example2 = str({"prod": {"us-west-1": {"hourly": 36, "daily": 31}}}).replace(" ", "")
164 delete_dst_snapshots_except_plan_example1 = str(
165 {
166 "prod": {
167 "onsite": {
168 "secondly": 40,
169 "minutely": 40,
170 "hourly": 36,
171 "daily": 31,
172 "weekly": 12,
173 "monthly": 18,
174 "yearly": 5,
175 }
176 }
177 }
178 ).replace(" ", "")
180 # fmt: off
181 parser = argparse.ArgumentParser(
182 prog=prog_name,
183 allow_abbrev=False,
184 formatter_class=argparse.RawTextHelpFormatter,
185 description=f"""
186*{prog_name} is a backup command line tool that reliably replicates ZFS snapshots from a (local or remote)
187source ZFS dataset (ZFS filesystem or ZFS volume) and its descendant datasets to a (local or remote)
188destination ZFS dataset to make the destination dataset a recursively synchronized copy of the source dataset,
189using zfs send/receive/rollback/destroy and ssh tunnel as directed. For example, {prog_name} can be used to
190incrementally replicate all ZFS snapshots since the most recent common snapshot from source to destination,
191in order to help protect against data loss or ransomware.*
193When run for the first time, {prog_name} replicates the dataset and all its snapshots from the source to the
194destination. On subsequent runs, {prog_name} transfers only the data that has changed since the previous run,
195i.e. it incrementally replicates to the destination all intermediate snapshots that have been created on
196the source since the last run. Source ZFS snapshots older than the most recent common snapshot found on the
197destination are auto-skipped.
199Unless {prog_name} is explicitly told to create snapshots on the source, it treats the source as read-only,
200thus the source remains unmodified. With the --dryrun flag, {prog_name} also treats the destination as read-only.
201In normal operation, {prog_name} treats the destination as append-only. Optional CLI flags are available to
202delete destination snapshots and destination datasets as directed, for example to make the destination
203identical to the source if the two have somehow diverged in unforeseen ways. This easily enables
204(re)synchronizing the backup from the production state, as well as restoring the production state from
205backup.
207In the spirit of rsync, {prog_name} supports a variety of powerful include/exclude filters that can be combined to
208select which datasets, snapshots and properties to create, replicate, delete or compare.
210Typically, a `cron` job on the source host runs `{prog_name}` periodically to create new snapshots and prune outdated
211snapshots on the source, whereas another `cron` job on the destination host runs `{prog_name}` periodically to prune
212outdated destination snapshots. Yet another `cron` job runs `{prog_name}` periodically to replicate the recently created
213snapshots from the source to the destination. The frequency of these periodic activities is typically every N milliseconds,
214every second, minute, hour, day, week, month and/or year (or multiples thereof).
216All {prog_name} functions including snapshot creation, replication, deletion, monitoring, comparison, etc. happily work
217with any snapshots in any format, even created or managed by third party ZFS snapshot management tools, including manual
218zfs snapshot/destroy. All functions can also be used independently. That is, if you wish you can use {prog_name} just
219for creating snapshots, or just for replicating, or just for deleting/pruning, or just for monitoring, or just for
220comparing snapshot lists.
222The source 'pushes to' the destination whereas the destination 'pulls from' the source. {prog_name} is installed
223and executed on the 'initiator' host which can be either the host that contains the source dataset (push mode),
224or the destination dataset (pull mode), or both datasets (local mode, no network required, no ssh required),
225or any third-party (even non-ZFS OSX) host as long as that host is able to SSH (via standard 'ssh' OpenSSH CLI) into
226both the source and destination host (pull-push mode). In pull-push mode the source 'zfs send's the data stream
227to the initiator which immediately pipes the stream (without storing anything locally) to the destination
228host that 'zfs receive's it. Pull-push mode means that {prog_name} need not be installed or executed on either
229source or destination host. Only the underlying 'zfs' CLI must be installed on both source and destination host.
230{prog_name} can run as root or non-root user, in the latter case via a) sudo or b) when granted corresponding
231ZFS permissions by administrators via 'zfs allow' delegation mechanism.
233{prog_name} is written in Python and continously runs a wide set of unit tests and integration tests to ensure
234coverage and compatibility with old and new versions of ZFS on Linux, FreeBSD and Solaris, on all Python
235versions >= 3.8 (including latest stable which is currently python-3.13).
237{prog_name} is a stand-alone program with zero required dependencies, akin to a
238stand-alone shell script or binary executable. It is designed to be able to run in restricted barebones server
239environments. No external Python packages are required; indeed no Python package management at all is required.
240You can just symlink the program wherever you like, for example into /usr/local/bin or similar, and simply run it like
241any stand-alone shell script or binary executable.
243{prog_name} automatically replicates the snapshots of multiple datasets in parallel for best performance.
244Similarly, it quickly deletes (or monitors or compares) snapshots of multiple datasets in parallel. Atomic snapshots can be
245created as frequently as every N milliseconds.
247Optionally, {prog_name} applies bandwidth rate-limiting and progress monitoring (via 'pv' CLI) during 'zfs
248send/receive' data transfers. When run across the network, {prog_name} also transparently inserts lightweight
249data compression (via 'zstd -1' CLI) and efficient data buffering (via 'mbuffer' CLI) into the pipeline
250between network endpoints during 'zfs send/receive' network transfers. If one of these utilities is not
251installed this is auto-detected, and the operation continues reliably without the corresponding auxiliary
252feature.
254# Periodic Jobs with bzfs_jobrunner
256The software also ships with the [bzfs_jobrunner](README_bzfs_jobrunner.md) companion program, which is a convenience
257wrapper around `{prog_name}` that simplifies efficient periodic ZFS snapshot creation, replication, pruning, and monitoring,
258across N source hosts and M destination hosts, using a single shared [jobconfig](bzfs_tests/bzfs_job_example.py) script.
259For example, this simplifies the deployment of an efficient geo-replicated backup service where each of the M destination
260hosts is located in a separate geographic region and pulls replicas from (the same set of) N source hosts. It also
261simplifies low latency replication from a primary to a secondary or to M read replicas, or backup to removable drives, etc.
263# Quickstart
265* Create adhoc atomic snapshots without a schedule:
267```$ {prog_name} tank1/foo/bar dummy --recursive --skip-replication --create-src-snapshots
268--create-src-snapshots-plan "{create_src_snapshots_plan_example1}"```
270```$ zfs list -t snapshot tank1/foo/bar
272tank1/foo/bar@test_2024-11-06_08:30:05_adhoc```
274* Create periodic atomic snapshots on a schedule, every hour and every day, by launching this from a periodic `cron` job:
276```$ {prog_name} tank1/foo/bar dummy --recursive --skip-replication --create-src-snapshots
277--create-src-snapshots-plan "{create_src_snapshots_plan_example2}"```
279```$ zfs list -t snapshot tank1/foo/bar
281tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
283tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly```
285Note: A periodic snapshot is created if it is due per the schedule indicated by its suffix (e.g. `_daily` or `_hourly`
286or `_minutely` or `_2secondly` or `_100millisecondly`), or if the --create-src-snapshots-even-if-not-due flag is specified,
287or if the most recent scheduled snapshot is somehow missing. In the latter case {prog_name} immediately creates a snapshot
288(named with the current time, not backdated to the missed time), and then resumes the original schedule. If the suffix is
289`_adhoc` or not a known period then a snapshot is considered non-periodic and is thus created immediately regardless of the
290creation time of any existing snapshot.
292* Replication example in local mode (no network, no ssh), to replicate ZFS dataset tank1/foo/bar to tank2/boo/bar:
294```$ {prog_name} tank1/foo/bar tank2/boo/bar```
296```$ zfs list -t snapshot tank1/foo/bar
298tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
300tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly```
302```$ zfs list -t snapshot tank2/boo/bar
304tank2/boo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
306tank2/boo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly```
308* Same example in pull mode:
310```$ {prog_name} root@host1.example.com:tank1/foo/bar tank2/boo/bar```
312* Same example in push mode:
314```$ {prog_name} tank1/foo/bar root@host2.example.com:tank2/boo/bar```
316* Same example in pull-push mode:
318```$ {prog_name} root@host1:tank1/foo/bar root@host2:tank2/boo/bar```
320* Example in local mode (no network, no ssh) to recursively replicate ZFS dataset tank1/foo/bar and its descendant
321datasets to tank2/boo/bar:
323```$ {prog_name} tank1/foo/bar tank2/boo/bar --recursive```
325```$ zfs list -t snapshot -r tank1/foo/bar
327tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
329tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly
331tank1/foo/bar/baz@prod_us-west-1_2024-11-06_08:40:00_daily
333tank1/foo/bar/baz@prod_us-west-1_2024-11-06_08:40:00_hourly```
335```$ zfs list -t snapshot -r tank2/boo/bar
337tank2/boo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
339tank2/boo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly
341tank2/boo/bar/baz@prod_us-west-1_2024-11-06_08:40:00_daily
343tank2/boo/bar/baz@prod_us-west-1_2024-11-06_08:40:00_hourly```
345* Example that makes destination identical to source even if the two have drastically diverged:
347```$ {prog_name} tank1/foo/bar tank2/boo/bar --recursive --force --delete-dst-datasets --delete-dst-snapshots```
349* Replicate all daily snapshots created during the last 7 days, and at the same time ensure that the latest 7 daily
350snapshots (per dataset) are replicated regardless of creation time:
352```$ {prog_name} tank1/foo/bar tank2/boo/bar --recursive --include-snapshot-regex '.*_daily'
353--include-snapshot-times-and-ranks '7 days ago..anytime' 'latest 7'```
355Note: The example above compares the specified times against the standard ZFS 'creation' time property of the snapshots
356(which is a UTC Unix time in integer seconds), rather than against a timestamp that may be part of the snapshot name.
358* Delete all daily snapshots older than 7 days, but ensure that the latest 7 daily snapshots (per dataset) are retained
359regardless of creation time:
361```$ {prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
362--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 7'
363--include-snapshot-times-and-ranks 'anytime..7 days ago'```
365Note: This also prints how many GB of disk space in total would be freed if the command were to be run for real without
366the --dryrun flag.
368* Delete all daily snapshots older than 7 days, but ensure that the latest 7 daily snapshots (per dataset) are retained
369regardless of creation time. Additionally, only delete a snapshot if no corresponding snapshot or bookmark exists in
370the source dataset (same as above except replace the 'dummy' source with 'tank1/foo/bar'):
372```$ {prog_name} tank1/foo/bar tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
373--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 7'
374--include-snapshot-times-and-ranks '7 days ago..anytime'```
376* Delete all daily snapshots older than 7 days, but ensure that the latest 7 daily snapshots (per dataset) are retained
377regardless of creation time. Additionally, only delete a snapshot if no corresponding snapshot exists in the source
378dataset (same as above except append 'no-crosscheck'):
380```$ {prog_name} tank1/foo/bar tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
381--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 7'
382--include-snapshot-times-and-ranks 'anytime..7 days ago' --delete-dst-snapshots-no-crosscheck```
384* Delete all daily bookmarks older than 90 days, but retain the latest 200 daily bookmarks (per dataset) regardless
385of creation time:
387```$ {prog_name} {dummy_dataset} tank1/foo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots=bookmarks
388--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 200'
389--include-snapshot-times-and-ranks 'anytime..90 days ago'```
391* Delete all tmp datasets within tank2/boo/bar:
393```$ {prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-datasets
394--include-dataset-regex '(.*/)?tmp.*' --exclude-dataset-regex '!.*'```
396* Retain all secondly snapshots that were created less than 40 seconds ago, and ensure that the latest 40
397secondly snapshots (per dataset) are retained regardless of creation time. Same for 40 minutely snapshots, 36 hourly
398snapshots, 31 daily snapshots, 12 weekly snapshots, 18 monthly snapshots, and 5 yearly snapshots:
400```$ {prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
401--delete-dst-snapshots-except
402--include-snapshot-regex '.*_secondly' --include-snapshot-times-and-ranks '40 seconds ago..anytime' 'latest 40'
403--new-snapshot-filter-group
404--include-snapshot-regex '.*_minutely' --include-snapshot-times-and-ranks '40 minutes ago..anytime' 'latest 40'
405--new-snapshot-filter-group
406--include-snapshot-regex '.*_hourly' --include-snapshot-times-and-ranks '36 hours ago..anytime' 'latest 36'
407--new-snapshot-filter-group
408--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks '31 days ago..anytime' 'latest 31'
409--new-snapshot-filter-group
410--include-snapshot-regex '.*_weekly' --include-snapshot-times-and-ranks '12 weeks ago..anytime' 'latest 12'
411--new-snapshot-filter-group
412--include-snapshot-regex '.*_monthly' --include-snapshot-times-and-ranks '18 months ago..anytime' 'latest 18'
413--new-snapshot-filter-group
414--include-snapshot-regex '.*_yearly' --include-snapshot-times-and-ranks '5 years ago..anytime' 'latest 5'```
416For convenience, the lengthy command line above can be expressed in a more concise way, like so:
418```$ {prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
419--delete-dst-snapshots-except-plan "{delete_dst_snapshots_except_plan_example1}"```
421* Compare source and destination dataset trees recursively, for example to check if all recently taken snapshots have
422been successfully replicated by a periodic job. List snapshots only contained in src (tagged with 'src'),
423only contained in dst (tagged with 'dst'), and contained in both src and dst (tagged with 'all'), restricted to hourly
424and daily snapshots taken within the last 7 days, excluding the last 4 hours (to allow for some slack/stragglers),
425excluding temporary datasets:
427```$ {prog_name} tank1/foo/bar tank2/boo/bar --skip-replication --compare-snapshot-lists=src+dst+all --recursive
428--include-snapshot-regex '.*_(hourly|daily)' --include-snapshot-times-and-ranks '7 days ago..4 hours ago'
429--exclude-dataset-regex '(.*/)?tmp.*'```
431If the resulting TSV output file contains zero lines starting with the prefix 'src' and zero lines starting with the
432prefix 'dst' then no source snapshots are missing on the destination, and no destination snapshots are missing
433on the source, indicating that the periodic replication and pruning jobs perform as expected. The TSV output is sorted
434by dataset, and by ZFS creation time within each dataset - the first and last line prefixed with 'all' contains the
435metadata of the oldest and latest common snapshot, respectively. The --compare-snapshot-lists option also directly
436logs various summary stats, such as the metadata of the latest common snapshot, latest snapshots and oldest snapshots,
437as well as the time diff between the latest common snapshot and latest snapshot only in src (and only in dst), as well
438as how many src snapshots and how many GB of data are missing on dst, etc.
440* Example with further options:
442```$ {prog_name} tank1/foo/bar root@host2.example.com:tank2/boo/bar --recursive
443--exclude-snapshot-regex '.*_(secondly|minutely)' --exclude-snapshot-regex 'test_.*'
444--include-snapshot-times-and-ranks '7 days ago..anytime' 'latest 7' --exclude-dataset /tank1/foo/bar/temporary
445--exclude-dataset /tank1/foo/bar/baz/trash --exclude-dataset-regex '(.*/)?private'
446--exclude-dataset-regex '(.*/)?[Tt][Ee]?[Mm][Pp][-_]?[0-9]*' --ssh-dst-private-key /root/.ssh/id_rsa```
447""")
449 parser.add_argument(
450 "root_dataset_pairs", nargs="+", action=DatasetPairsAction, metavar="SRC_DATASET DST_DATASET",
451 help="SRC_DATASET: "
452 "Source ZFS dataset (and its descendants) that will be replicated. Can be a ZFS filesystem or ZFS volume. "
453 "Format is [[user@]host:]dataset. The host name can also be an IPv4 address (or an IPv6 address where "
454 "each ':' colon character must be replaced with a '|' pipe character for disambiguation). If the "
455 "host name is '-', the dataset will be on the local host, and the corresponding SSH leg will be omitted. "
456 "The same is true if the host is omitted and the dataset does not contain a ':' colon at the same time. "
457 "Local dataset examples: `tank1/foo/bar`, `tank1`, `-:tank1/foo/bar:baz:boo` "
458 "Remote dataset examples: `host:tank1/foo/bar`, `host.example.com:tank1/foo/bar`, "
459 "`root@host:tank`, `root@host.example.com:tank1/foo/bar`, `user@127.0.0.1:tank1/foo/bar:baz:boo`, "
460 "`user@||1:tank1/foo/bar:baz:boo`. "
461 "The first component of the ZFS dataset name is the ZFS pool name, here `tank1`. "
462 "If the option starts with a `+` prefix then dataset names are read from the UTF-8 text file given "
463 "after the `+` prefix, with each line in the file containing a SRC_DATASET and a DST_DATASET, "
464 "separated by a tab character. Example: `+root_dataset_names.txt`, `+/path/to/root_dataset_names.txt`\n\n"
465 "DST_DATASET: "
466 "Destination ZFS dataset for replication and deletion. Has same naming format as SRC_DATASET. During "
467 "replication, destination datasets that do not yet exist are created as necessary, along with their "
468 "parent and ancestors.\n\n"
469 f"*Performance Note:* {prog_name} automatically replicates multiple datasets in parallel. It replicates "
470 "snapshots in parallel across datasets and serially within a dataset. All child datasets of a dataset "
471 "may be processed in parallel. For consistency, processing of a dataset only starts after processing of "
472 "all its ancestor datasets has completed. Further, when a thread is ready to start processing another "
473 "dataset, it chooses the next dataset wrt. lexicographical sort order from the datasets that are "
474 "currently available for start of processing. Initially, only the roots of the selected dataset subtrees "
475 "are available for start of processing. The degree of parallelism is configurable with the --threads "
476 "option (see below).\n\n")
477 parser.add_argument(
478 "--recursive", "-r", action="store_true",
479 help="During snapshot creation, replication, deletion and comparison, also consider descendant datasets, i.e. "
480 "datasets within the dataset tree, including children, and children of children, etc.\n\n")
481 parser.add_argument(
482 "--include-dataset", action=FileOrLiteralAction, nargs="+", default=[], metavar="DATASET",
483 help="During snapshot creation, replication, deletion and comparison, select any ZFS dataset (and its descendants) "
484 "that is contained within SRC_DATASET (DST_DATASET in case of deletion) if its dataset name is one of the "
485 "given include dataset names but none of the exclude dataset names. If a dataset is excluded its descendants "
486 "are automatically excluded too, and this decision is never reconsidered even for the descendants because "
487 "exclude takes precedence over include.\n\n"
488 "A dataset name is absolute if the specified dataset is prefixed by `/`, e.g. `/tank/baz/tmp`. "
489 "Otherwise the dataset name is relative wrt. source and destination, e.g. `baz/tmp` if the source "
490 "is `tank`.\n\n"
491 "This option is automatically translated to an --include-dataset-regex (see below) and can be "
492 "specified multiple times.\n\n"
493 "If the option starts with a `+` prefix then dataset names are read from the newline-separated "
494 "UTF-8 text file given after the `+` prefix, one dataset per line inside of the text file.\n\n"
495 "Examples: `/tank/baz/tmp` (absolute), `baz/tmp` (relative), "
496 "`+dataset_names.txt`, `+/path/to/dataset_names.txt`\n\n")
497 parser.add_argument(
498 "--exclude-dataset", action=FileOrLiteralAction, nargs="+", default=[], metavar="DATASET",
499 help="Same syntax as --include-dataset (see above) except that the option is automatically translated to an "
500 "--exclude-dataset-regex (see below).\n\n")
501 parser.add_argument(
502 "--include-dataset-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
503 help="During snapshot creation, replication (and deletion) and comparison, select any ZFS dataset (and its "
504 "descendants) that is contained within SRC_DATASET (DST_DATASET in case of deletion) if its relative dataset "
505 "path (e.g. `baz/tmp`) wrt. SRC_DATASET (DST_DATASET in case of deletion) matches at least one of the given "
506 "include regular expressions but none of the exclude regular expressions. "
507 "If a dataset is excluded its descendants are automatically excluded too, and this decision is never "
508 "reconsidered even for the descendants because exclude takes precedence over include.\n\n"
509 "This option can be specified multiple times. "
510 "A leading `!` character indicates logical negation, i.e. the regex matches if the regex with the "
511 "leading `!` character removed does not match.\n\n"
512 "If the option starts with a `+` prefix then regex names are read from the newline-separated "
513 "UTF-8 text file given after the `+` prefix, one regex per line inside of the text file.\n\n"
514 "Default: `.*` (include all datasets).\n\n"
515 "Examples: `baz/tmp`, `(.*/)?doc[^/]*/(private|confidential).*`, `!public`, "
516 "`+dataset_regexes.txt`, `+/path/to/dataset_regexes.txt`\n\n")
517 parser.add_argument(
518 "--exclude-dataset-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
519 help="Same syntax as --include-dataset-regex (see above) except that the default is "
520 f"`{exclude_dataset_regexes_default}` (exclude tmp datasets). Example: `!.*` (exclude no dataset)\n\n")
521 parser.add_argument(
522 "--exclude-dataset-property", default=None, action=NonEmptyStringAction, metavar="STRING",
523 help="The name of a ZFS dataset user property (optional). If this option is specified, the effective value "
524 "(potentially inherited) of that user property is read via 'zfs list' for each selected source dataset "
525 "to determine whether the dataset will be included or excluded, as follows:\n\n"
526 "a) Value is 'true' or '-' or empty string or the property is missing: Include the dataset.\n\n"
527 "b) Value is 'false': Exclude the dataset and its descendants.\n\n"
528 "c) Value is a comma-separated list of host names (no spaces, for example: "
529 "'store001,store002'): Include the dataset if the host name of "
530 f"the host executing {prog_name} is contained in the list, otherwise exclude the dataset and its "
531 "descendants.\n\n"
532 "If a dataset is excluded its descendants are automatically excluded too, and the property values of the "
533 "descendants are ignored because exclude takes precedence over include.\n\n"
534 "Examples: 'syncoid:sync', 'com.example.eng.project.x:backup'\n\n"
535 "*Note:* The use of --exclude-dataset-property is discouraged for most use cases. It is more flexible, "
536 "more powerful, *and* more efficient to instead use a combination of --include/exclude-dataset-regex "
537 "and/or --include/exclude-dataset to achieve the same or better outcome.\n\n")
538 parser.add_argument(
539 "--include-snapshot-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
540 help="During replication, deletion and comparison, select any source ZFS snapshot that has a name (i.e. the part "
541 "after the '@') that matches at least one of the given include regular expressions but none of the "
542 "exclude regular expressions. If a snapshot is excluded this decision is never reconsidered because "
543 "exclude takes precedence over include.\n\n"
544 "This option can be specified multiple times. "
545 "A leading `!` character indicates logical negation, i.e. the regex matches if the regex with the "
546 "leading `!` character removed does not match.\n\n"
547 "Default: `.*` (include all snapshots). "
548 "Examples: `test_.*`, `!prod_.*`, `.*_(hourly|frequent)`, `!.*_(weekly|daily)`\n\n"
549 "*Note:* All --include/exclude-snapshot-* CLI option groups are combined into a mini filter pipeline. "
550 "A filter pipeline is executed in the order given on the command line, left to right. For example if "
551 "--include-snapshot-times-and-ranks (see below) is specified on the command line before "
552 "--include/exclude-snapshot-regex, then --include-snapshot-times-and-ranks will be applied before "
553 "--include/exclude-snapshot-regex. The pipeline results would not always be the same if the order were "
554 "reversed. Order matters.\n\n"
555 "*Note:* During replication, bookmarks are always retained aka selected in order to help find common "
556 "snapshots between source and destination.\n\n")
557 parser.add_argument(
558 "--exclude-snapshot-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
559 help="Same syntax as --include-snapshot-regex (see above) except that the default is to exclude no "
560 "snapshots.\n\n")
561 parser.add_argument(
562 "--include-snapshot-times-and-ranks", action=TimeRangeAndRankRangeAction, nargs="+", default=[],
563 metavar=("TIMERANGE", "RANKRANGE"),
564 help="This option takes as input parameters a time range filter and an optional rank range filter. It "
565 "separately computes the results for each filter and selects the UNION of both results. "
566 "To instead use a pure rank range filter (no UNION), or a pure time range filter (no UNION), simply "
567 "use 'notime' aka '0..0' to indicate an empty time range, or omit the rank range, respectively. "
568 "This option can be specified multiple times.\n\n"
569 "<b>*Replication Example (UNION):* </b>\n\n"
570 "Specify to replicate all daily snapshots created during the last 7 days, "
571 "and at the same time ensure that the latest 7 daily snapshots (per dataset) are replicated regardless "
572 "of creation time, like so: "
573 "`--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks '7 days ago..anytime' 'latest 7'`\n\n"
574 "<b>*Deletion Example (no UNION):* </b>\n\n"
575 "Specify to delete all daily snapshots older than 7 days, but ensure that the "
576 "latest 7 daily snapshots (per dataset) are retained regardless of creation time, like so: "
577 "`--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 7' "
578 "--include-snapshot-times-and-ranks 'anytime..7 days ago'`"
579 "\n\n"
580 "This helps to safely cope with irregular scenarios where no snapshots were created or received within "
581 "the last 7 days, or where more than 7 daily snapshots were created within the last 7 days. It can also "
582 "help to avoid accidental pruning of the last snapshot that source and destination have in common.\n\n"
583 ""
584 "<b>*TIMERANGE:* </b>\n\n"
585 "The ZFS 'creation' time of a snapshot (and bookmark) must fall into this time range in order for the "
586 "snapshot to be included. The time range consists of a 'start' time, followed by a '..' separator, "
587 "followed by an 'end' time. For example '2024-01-01..2024-04-01', or 'anytime..anytime' aka `*..*` aka all "
588 "times, or 'notime' aka '0..0' aka empty time range. Only snapshots (and bookmarks) in the half-open time "
589 "range [start, end) are included; other snapshots (and bookmarks) are excluded. If a snapshot is excluded "
590 "this decision is never reconsidered because exclude takes precedence over include. Each of the two specified "
591 "times can take any of the following forms:\n\n"
592 "* a) `anytime` aka `*` wildcard; represents negative or positive infinity.\n\n"
593 "* b) a non-negative integer representing a UTC Unix time in seconds. Example: 1728109805\n\n"
594 "* c) an ISO 8601 datetime string with or without timezone. Examples: '2024-10-05', "
595 "'2024-10-05T14:48:55', '2024-10-05T14:48:55+02', '2024-10-05T14:48:55-04:30'. If the datetime string "
596 "does not contain time zone info then it is assumed to be in the local time zone. Timezone string support "
597 "requires Python >= 3.11.\n\n"
598 "* d) a duration that indicates how long ago from the current time, using the following syntax: "
599 "a non-negative integer, followed by an optional space, followed by a duration unit that is "
600 "*one* of 'seconds', 'secs', 'minutes', 'mins', 'hours', 'days', 'weeks', 'months', 'years', "
601 "followed by an optional space, followed by the word 'ago'. "
602 "Examples: '0secs ago', '40 mins ago', '36hours ago', '90days ago', '12weeksago'.\n\n"
603 "* Note: This option compares the specified time against the standard ZFS 'creation' time property of the "
604 "snapshot (which is a UTC Unix time in integer seconds), rather than against a timestamp that may be "
605 "part of the snapshot name. You can list the ZFS creation time of snapshots and bookmarks as follows: "
606 "`zfs list -t snapshot,bookmark -o name,creation -s creation -d 1 $SRC_DATASET` (optionally add "
607 "the -p flag to display UTC Unix time in integer seconds).\n\n"
608 "*Note:* During replication, bookmarks are always retained aka selected in order to help find common "
609 "snapshots between source and destination.\n\n"
610 ""
611 "<b>*RANKRANGE:* </b>\n\n"
612 "Specifies to include the N (or N%%) oldest snapshots or latest snapshots, and exclude all other "
613 "snapshots (default: include no snapshots). Snapshots are sorted by creation time (actually, by the "
614 "'createtxg' ZFS property, which serves the same purpose but is more precise). The rank position of a "
615 "snapshot is the zero-based integer position of the snapshot within that sorted list. A rank consists of the "
616 "optional words 'all except' (followed by an optional space), followed by the word 'oldest' or 'latest', "
617 "followed by a non-negative integer, followed by an optional '%%' percent sign. A rank range consists of a "
618 "lower rank, followed by a '..' separator, followed by a higher rank. "
619 "If the optional lower rank is missing it is assumed to be 0. Examples:\n\n"
620 "* 'oldest 10%%' aka 'oldest 0..oldest 10%%' (include the oldest 10%% of all snapshots)\n\n"
621 "* 'latest 10%%' aka 'latest 0..latest 10%%' (include the latest 10%% of all snapshots)\n\n"
622 "* 'all except latest 10%%' aka 'oldest 90%%' aka 'oldest 0..oldest 90%%' (include all snapshots except the "
623 "latest 10%% of all snapshots)\n\n"
624 "* 'oldest 90' aka 'oldest 0..oldest 90' (include the oldest 90 snapshots)\n\n"
625 "* 'latest 90' aka 'latest 0..latest 90' (include the latest 90 snapshots)\n\n"
626 "* 'all except oldest 90' aka 'oldest 90..oldest 100%%' (include all snapshots except the oldest 90 snapshots)"
627 "\n\n"
628 "* 'all except latest 90' aka 'latest 90..latest 100%%' (include all snapshots except the latest 90 snapshots)"
629 "\n\n"
630 "* 'latest 1' aka 'latest 0..latest 1' (include the latest snapshot)\n\n"
631 "* 'all except latest 1' aka 'latest 1..latest 100%%' (include all snapshots except the latest snapshot)\n\n"
632 "* 'oldest 2' aka 'oldest 0..oldest 2' (include the oldest 2 snapshots)\n\n"
633 "* 'all except oldest 2' aka 'oldest 2..oldest 100%%' (include all snapshots except the oldest 2 snapshots)\n\n"
634 "* 'oldest 100%%' aka 'oldest 0..oldest 100%%' (include all snapshots)\n\n"
635 "* 'oldest 0%%' aka 'oldest 0..oldest 0%%' (include no snapshots)\n\n"
636 "* 'oldest 0' aka 'oldest 0..oldest 0' (include no snapshots)\n\n"
637 "*Note:* If multiple RANKRANGEs are specified within a single --include-snapshot-times-and-ranks option, each "
638 "subsequent rank range operates on the output of the preceding rank rage.\n\n"
639 "*Note:* Percentage calculations are not based on the number of snapshots "
640 "contained in the dataset on disk, but rather based on the number of snapshots arriving at the filter. "
641 "For example, if only two daily snapshots arrive at the filter because a prior filter excludes hourly "
642 "snapshots, then 'latest 10' will only include these two daily snapshots, and 'latest 50%%' will only "
643 "include one of these two daily snapshots.\n\n"
644 "*Note:* During replication, bookmarks are always retained aka selected in order to help find common "
645 "snapshots between source and destination. Bookmarks do not count towards N or N%% wrt. rank.\n\n"
646 "*Note:* If a snapshot is excluded this decision is never reconsidered because exclude takes precedence "
647 "over include.\n\n")
649 src_snapshot_plan_example = {
650 "prod": {
651 "onsite": {"secondly": 40, "minutely": 40, "hourly": 36, "daily": 31, "weekly": 12, "monthly": 18, "yearly": 5},
652 "us-west-1": {"secondly": 0, "minutely": 0, "hourly": 36, "daily": 31, "weekly": 12, "monthly": 18, "yearly": 5},
653 "eu-west-1": {"secondly": 0, "minutely": 0, "hourly": 36, "daily": 31, "weekly": 12, "monthly": 18, "yearly": 5},
654 },
655 "test": {
656 "offsite": {"12hourly": 42, "weekly": 12},
657 "onsite": {"100millisecondly": 42},
658 },
659 }
660 parser.add_argument(
661 "--include-snapshot-plan", action=IncludeSnapshotPlanAction, default=None, metavar="DICT_STRING",
662 help="Replication periods to be used if replicating snapshots within the selected destination datasets. "
663 "Has the same format as --create-src-snapshots-plan and --delete-dst-snapshots-except-plan (see below). "
664 "Snapshots that do not match a period will not be replicated. To avoid unexpected surprises, make sure to "
665 "carefully specify ALL snapshot names and periods that shall be replicated, in combination with --dryrun.\n\n"
666 f"Example: `{format_dict(src_snapshot_plan_example)}`. This example will, for the organization 'prod' and the "
667 "intended logical target 'onsite', replicate secondly snapshots that were created less than 40 seconds ago, "
668 "yet replicate the latest 40 secondly snapshots regardless of creation time. Analog for the latest 40 minutely "
669 "snapshots, latest 36 hourly snapshots, etc. "
670 "Note: A zero within a period (e.g. 'hourly': 0) indicates that no snapshots shall be replicated for the given "
671 "period.\n\n"
672 "Note: --include-snapshot-plan is a convenience option that auto-generates a series of the following other "
673 "options: --new-snapshot-filter-group, --include-snapshot-regex, --include-snapshot-times-and-ranks\n\n")
674 parser.add_argument(
675 "--new-snapshot-filter-group", action=NewSnapshotFilterGroupAction, nargs=0,
676 help="Starts a new snapshot filter group containing separate --{include|exclude}-snapshot-* filter options. The "
677 "program separately computes the results for each filter group and selects the UNION of all results. "
678 "This option can be specified multiple times and serves as a separator between groups. Example:\n\n"
679 "Delete all minutely snapshots older than 40 minutes, but ensure that the latest 40 minutely snapshots (per "
680 "dataset) are retained regardless of creation time. Additionally, delete all hourly snapshots older than 36 "
681 "hours, but ensure that the latest 36 hourly snapshots (per dataset) are retained regardless of creation time. "
682 "Additionally, delete all daily snapshots older than 31 days, but ensure that the latest 31 daily snapshots "
683 "(per dataset) are retained regardless of creation time: "
684 f"`{prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots "
685 "--include-snapshot-regex '.*_minutely' --include-snapshot-times-and-ranks notime 'all except latest 40' "
686 "--include-snapshot-times-and-ranks 'anytime..40 minutes ago' "
687 "--new-snapshot-filter-group "
688 "--include-snapshot-regex '.*_hourly' --include-snapshot-times-and-ranks notime 'all except latest 36' "
689 "--include-snapshot-times-and-ranks 'anytime..36 hours ago' "
690 "--new-snapshot-filter-group "
691 "--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 31' "
692 "--include-snapshot-times-and-ranks 'anytime..31 days ago'`\n\n")
693 parser.add_argument(
694 "--create-src-snapshots", action="store_true",
695 help="Do nothing if the --create-src-snapshots flag is missing. Otherwise, before the replication step (see below), "
696 "atomically create new snapshots of the source datasets selected via --{include|exclude}-dataset* policy. "
697 "The names of the snapshots can be configured via --create-src-snapshots-* suboptions (see below). "
698 "To create snapshots only, without any other processing such as replication, etc, consider using this flag "
699 "together with the --skip-replication flag.\n\n"
700 "A periodic snapshot is created if it is due per the schedule indicated by --create-src-snapshots-plan "
701 "(for example '_daily' or '_hourly' or _'10minutely' or '_2secondly' or '_100millisecondly'), or if the "
702 "--create-src-snapshots-even-if-not-due flag is specified, or if the most recent scheduled snapshot "
703 f"is somehow missing. In the latter case {prog_name} immediately creates a snapshot (tagged with the current "
704 "time, not backdated to the missed time), and then resumes the original schedule.\n\n"
705 "If the snapshot suffix is '_adhoc' or not a known period then a snapshot is considered "
706 "non-periodic and is thus created immediately regardless of the creation time of any existing snapshot.\n\n"
707 "The implementation attempts to fit as many datasets as possible into a single (atomic) 'zfs snapshot' command "
708 "line, using lexicographical sort order, and using 'zfs snapshot -r' to the extent that this is compatible "
709 "with the actual results of the schedule and the actual results of the --{include|exclude}-dataset* pruning "
710 "policy. The snapshots of all datasets that fit "
711 "within the same single 'zfs snapshot' CLI invocation will be taken within the same ZFS transaction group, and "
712 "correspondingly have identical 'createtxg' ZFS property (but not necessarily identical 'creation' ZFS time "
713 "property as ZFS actually provides no such guarantee), and thus be consistent. Dataset names that can't fit "
714 "into a single command line are spread over multiple command line invocations, respecting the limits that the "
715 "operating system places on the maximum length of a single command line, per `getconf ARG_MAX`.\n\n"
716 f"Note: All {prog_name} functions including snapshot creation, replication, deletion, monitoring, comparison, "
717 "etc. happily work with any snapshots in any format, even created or managed by third party ZFS snapshot "
718 "management tools, including manual zfs snapshot/destroy.\n\n")
719 parser.add_argument(
720 "--create-src-snapshots-plan", default=None, type=str, metavar="DICT_STRING",
721 help="Creation periods that specify a schedule for when new snapshots shall be created on src within the selected "
722 "datasets. Has the same format as --delete-dst-snapshots-except-plan.\n\n"
723 f"Example: `{format_dict(src_snapshot_plan_example)}`. This example will, for the organization 'prod' and "
724 "the intended logical target 'onsite', create 'secondly' snapshots every second, 'minutely' snapshots every "
725 "minute, hourly snapshots every hour, and so on. "
726 "It will also create snapshots for the targets 'us-west-1' and 'eu-west-1' within the 'prod' organization. "
727 "In addition, it will create snapshots every 12 hours and every week for the 'test' organization, "
728 "and name them as being intended for the 'offsite' replication target. Analog for snapshots that are taken "
729 "every 100 milliseconds within the 'test' organization.\n\n"
730 "The example creates ZFS snapshots with names like "
731 "`prod_onsite_<timestamp>_secondly`, `prod_onsite_<timestamp>_minutely`, "
732 "`prod_us-west-1_<timestamp>_hourly`, `prod_us-west-1_<timestamp>_daily`, "
733 "`prod_eu-west-1_<timestamp>_hourly`, `prod_eu-west-1_<timestamp>_daily`, "
734 "`test_offsite_<timestamp>_12hourly`, `test_offsite_<timestamp>_weekly`, and so on.\n\n"
735 "Note: A period name that is missing indicates that no snapshots shall be created for the given period.\n\n"
736 "The period name can contain an optional positive integer immediately preceding the time period unit, for "
737 "example `_2secondly` or `_10minutely` or `_100millisecondly` to indicate that snapshots are taken every 2 "
738 "seconds, or every 10 minutes, or every 100 milliseconds, respectively.\n\n")
740 def argparser_escape(text: str) -> str:
741 return text.replace('%', '%%')
743 parser.add_argument(
744 "--create-src-snapshots-timeformat", default="%Y-%m-%d_%H:%M:%S", metavar="STRFTIME_SPEC",
745 help="Default is `%(default)s`. For the strftime format, see "
746 "https://docs.python.org/3.11/library/datetime.html#strftime-strptime-behavior. "
747 f"Examples: `{argparser_escape('%Y-%m-%d_%H:%M:%S.%f')}` (adds microsecond resolution), "
748 f"`{argparser_escape('%Y-%m-%d_%H:%M:%S%z')}` (adds timezone offset), "
749 f"`{argparser_escape('%Y-%m-%dT%H-%M-%S')}` (no colons).\n\n"
750 "The name of the snapshot created on the src is `$org_$target_strftime(--create-src-snapshots-time*)_$period`. "
751 "Example: `tank/foo@prod_us-west-1_2024-09-03_12:26:15_daily`\n\n")
752 parser.add_argument(
753 "--create-src-snapshots-timezone", default="", type=str, metavar="TZ_SPEC",
754 help=f"Default is the local timezone of the system running {prog_name}. When creating a new snapshot on the source, "
755 "fetch the current time in the specified timezone, and feed that time, and the value of "
756 "--create-src-snapshots-timeformat, into the standard strftime() function to generate the timestamp portion "
757 "of the snapshot name. The TZ_SPEC input parameter is of the form 'UTC' or '+HHMM' or '-HHMM' for fixed UTC "
758 "offsets, or an IANA TZ identifier for auto-adjustment to daylight savings time, or the empty string to use "
759 "the local timezone, for example '', 'UTC', '+0000', '+0530', '-0400', 'America/Los_Angeles', 'Europe/Vienna'. "
760 "For a list of valid IANA TZ identifiers see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List"
761 "\n\nTo change the timezone not only for snapshot name creation, but in all respects for the entire program, "
762 "use the standard 'TZ' Unix environment variable, like so: `export TZ=UTC`.\n\n")
763 parser.add_argument(
764 "--create-src-snapshots-even-if-not-due", action="store_true",
765 help="Take snapshots immediately regardless of the creation time of any existing snapshot, even if snapshots "
766 "are periodic and not actually due per the schedule.\n\n")
767 parser.add_argument(
768 "--create-src-snapshots-enable-snapshots-changed-cache", action="store_true",
769 help=argparse.SUPPRESS) # deprecated; was replaced by --cache-snapshots
770 parser.add_argument(
771 "--zfs-send-program-opts", type=str, default="--props --raw --compressed", metavar="STRING",
772 help="Parameters to fine-tune 'zfs send' behaviour (optional); will be passed into 'zfs send' CLI. "
773 "The value is split on runs of one or more whitespace characters. "
774 "Default is '%(default)s'. To run `zfs send` without options, specify the empty "
775 "string: `--zfs-send-program-opts=''`. "
776 "See https://openzfs.github.io/openzfs-docs/man/master/8/zfs-send.8.html "
777 "and https://github.com/openzfs/zfs/issues/13024\n\n")
778 parser.add_argument(
779 "--zfs-recv-program-opts", type=str, default="-u", metavar="STRING",
780 help="Parameters to fine-tune 'zfs receive' behaviour (optional); will be passed into 'zfs receive' CLI. "
781 "The value is split on runs of one or more whitespace characters. "
782 "Default is '%(default)s'. To run `zfs receive` without options, specify the empty "
783 "string: `--zfs-recv-program-opts=''`. "
784 "Example: '-u -o canmount=noauto -o readonly=on -x keylocation -x keyformat -x encryption'. "
785 "See https://openzfs.github.io/openzfs-docs/man/master/8/zfs-receive.8.html "
786 "and https://openzfs.github.io/openzfs-docs/man/master/7/zfsprops.7.html\n\n")
787 parser.add_argument(
788 "--zfs-recv-program-opt", action="append", default=[], metavar="STRING",
789 help="Parameter to fine-tune 'zfs receive' behaviour (optional); will be passed into 'zfs receive' CLI. "
790 "The value can contain spaces and is not split. This option can be specified multiple times. Example: `"
791 "--zfs-recv-program-opt=-o "
792 "--zfs-recv-program-opt='org.zfsbootmenu:commandline=ro debug zswap.enabled=1'`\n\n")
793 parser.add_argument(
794 "--force-rollback-to-latest-snapshot", action="store_true",
795 help="Before replication, rollback the destination dataset to its most recent destination snapshot (if there "
796 "is one), via 'zfs rollback', just in case the destination dataset was modified since its most recent "
797 "snapshot. This is much less invasive than the other --force* options (see below).\n\n")
798 parser.add_argument(
799 "--force-rollback-to-latest-common-snapshot", action="store_true",
800 help="Before replication, delete destination ZFS snapshots that are more recent than the most recent common "
801 "snapshot selected on the source ('conflicting snapshots'), via 'zfs rollback'. Do no rollback if no common "
802 "snapshot is selected.\n\n")
803 parser.add_argument(
804 "--force", action="store_true",
805 help="Same as --force-rollback-to-latest-common-snapshot (see above), except that additionally, if no common "
806 "snapshot is selected, then delete all destination snapshots before starting replication, and proceed "
807 "without aborting. Without the --force* flags, the destination dataset is treated as append-only, hence "
808 "no destination snapshot that already exists is deleted, and instead the operation is aborted with an "
809 "error when encountering a conflicting snapshot.\n\n"
810 "Analogy: --force-rollback-to-latest-snapshot is a tiny hammer, whereas "
811 "--force-rollback-to-latest-common-snapshot is a medium sized hammer, --force is a large hammer, and "
812 "--force-destroy-dependents is a very large hammer. "
813 "Consider using the smallest hammer that can fix the problem. No hammer is ever used by default.\n\n")
814 parser.add_argument(
815 "--force-destroy-dependents", action="store_true",
816 help="On destination, --force and --force-rollback-to-latest-common-snapshot and --delete-* will add the "
817 "'-R' flag to their use of 'zfs rollback' and 'zfs destroy', causing them to delete dependents such as "
818 "clones and bookmarks. This can be very destructive and is rarely advisable.\n\n")
819 parser.add_argument(
820 "--force-hard", action="store_true", # deprecated; was renamed to --force-destroy-dependents
821 help=argparse.SUPPRESS)
822 parser.add_argument(
823 "--force-unmount", action="store_true",
824 help="On destination, --force and --force-rollback-to-latest-common-snapshot will add the '-f' flag to their "
825 "use of 'zfs rollback' and 'zfs destroy'.\n\n")
826 parser.add_argument(
827 "--force-once", "--f1", action="store_true",
828 help="Use the --force option or --force-rollback-to-latest-common-snapshot option at most once to resolve a "
829 "conflict, then abort with an error on any subsequent conflict. This helps to interactively resolve "
830 "conflicts, one conflict at a time.\n\n")
831 parser.add_argument(
832 "--skip-parent", action="store_true",
833 help="During replication and deletion, skip processing of the SRC_DATASET and DST_DATASET and only process "
834 "their descendant datasets, i.e. children, and children of children, etc (with --recursive). No dataset "
835 "is processed unless --recursive is also specified. "
836 f"Analogy: `{prog_name} --recursive --skip-parent src dst` is akin to Unix `cp -r src/* dst/` whereas "
837 f" `{prog_name} --recursive --skip-parent --skip-replication --delete-dst-datasets dummy dst` is akin to "
838 "Unix `rm -r dst/*`\n\n")
839 parser.add_argument(
840 "--skip-missing-snapshots", choices=["fail", "dataset", "continue"], default="dataset", nargs="?",
841 help="During replication, handle source datasets that select no snapshots (and no relevant bookmarks) "
842 "as follows:\n\n"
843 "a) 'fail': Abort with an error.\n\n"
844 "b) 'dataset' (default): Skip the source dataset with a warning. Skip descendant datasets if "
845 "--recursive and destination dataset does not exist. Otherwise skip to the next dataset.\n\n"
846 "c) 'continue': Skip nothing. If destination snapshots exist, delete them (with --force) or abort "
847 "with an error (without --force). If there is no such abort, continue processing with the next dataset. "
848 "Eventually create empty destination dataset and ancestors if they do not yet exist and source dataset "
849 "has at least one descendant that selects at least one snapshot.\n\n")
850 parser.add_argument(
851 "--retries", type=int, min=0, default=2, action=CheckRange, metavar="INT",
852 help="The maximum number of times a retryable replication or deletion step shall be retried if it fails, for "
853 "example because of network hiccups (default: %(default)s, min: %(min)s). "
854 "Also consider this option if a periodic pruning script may simultaneously delete a dataset or "
855 f"snapshot or bookmark while {prog_name} is running and attempting to access it.\n\n")
856 parser.add_argument(
857 "--retry-min-sleep-secs", type=float, min=0, default=0.125, action=CheckRange, metavar="FLOAT",
858 help="The minimum duration to sleep between retries (default: %(default)s).\n\n")
859 parser.add_argument(
860 "--retry-max-sleep-secs", type=float, min=0, default=5 * 60, action=CheckRange, metavar="FLOAT",
861 help="The maximum duration to sleep between retries initially starts with --retry-min-sleep-secs (see above), "
862 "and doubles on each retry, up to the final maximum of --retry-max-sleep-secs "
863 "(default: %(default)s). On each retry a random sleep time in the "
864 "[--retry-min-sleep-secs, current max] range is picked. The timer resets after each operation.\n\n")
865 parser.add_argument(
866 "--retry-max-elapsed-secs", type=float, min=0, default=60 * 60, action=CheckRange, metavar="FLOAT",
867 help="A single operation (e.g. 'zfs send/receive' of the current dataset, or deletion of a list of snapshots "
868 "within the current dataset) will not be retried (or not retried anymore) once this much time has elapsed "
869 "since the initial start of the operation, including retries (default: %(default)s). "
870 "The timer resets after each operation completes or retries exhaust, such that subsequently failing "
871 "operations can again be retried.\n\n")
872 parser.add_argument(
873 "--skip-on-error", choices=["fail", "tree", "dataset"], default="dataset",
874 help="During replication and deletion, if an error is not retryable, or --retries has been exhausted, "
875 "or --skip-missing-snapshots raises an error, proceed as follows:\n\n"
876 "a) 'fail': Abort the program with an error. This mode is ideal for testing, clear "
877 "error reporting, and situations where consistency trumps availability.\n\n"
878 "b) 'tree': Log the error, skip the dataset tree rooted at the dataset for which the error "
879 "occurred, and continue processing the next (sibling) dataset tree. "
880 "Example: Assume datasets tank/user1/foo and tank/user2/bar and an error occurs while processing "
881 "tank/user1. In this case processing skips tank/user1/foo and proceeds with tank/user2.\n\n"
882 "c) 'dataset' (default): Same as 'tree' except if the destination dataset already exists, skip to "
883 "the next dataset instead.\n\n"
884 "Example: Assume datasets tank/user1/foo and tank/user2/bar and an error occurs while "
885 "processing tank/user1. In this case processing skips tank/user1 and proceeds with tank/user1/foo "
886 "if the destination already contains tank/user1. Otherwise processing continues with tank/user2. "
887 "This mode is for production use cases that require timely forward progress even in the presence of "
888 "partial failures. For example, assume the job is to backup the home directories or virtual machines "
889 "of thousands of users across an organization. Even if replication of some of the datasets for some "
890 "users fails due too conflicts, busy datasets, etc, the replication job will continue for the "
891 "remaining datasets and the remaining users.\n\n")
892 parser.add_argument(
893 "--skip-replication", action="store_true",
894 help="Skip replication step (see above) and proceed to the optional --delete-dst-datasets step "
895 "immediately (see below).\n\n")
896 parser.add_argument(
897 "--delete-dst-datasets", action="store_true",
898 help="Do nothing if the --delete-dst-datasets option is missing. Otherwise, after successful replication "
899 "step, if any, delete existing destination datasets that are selected via --{include|exclude}-dataset* "
900 "policy yet do not exist within SRC_DATASET (which can be an empty dataset, such as the hardcoded virtual "
901 f"dataset named '{dummy_dataset}'!). Do not recurse without --recursive. With --recursive, never delete "
902 "non-selected dataset subtrees or their ancestors.\n\n"
903 "For example, if the destination contains datasets h1,h2,h3,d1 whereas source only contains h3, "
904 "and the include/exclude policy selects h1,h2,h3,d1, then delete datasets h1,h2,d1 on "
905 "the destination to make it 'the same'. On the other hand, if the include/exclude policy "
906 "only selects h1,h2,h3 then only delete datasets h1,h2 on the destination to make it 'the same'.\n\n"
907 "Example to delete all tmp datasets within tank2/boo/bar: "
908 f"`{prog_name} {dummy_dataset} tank2/boo/bar --dryrun --skip-replication --recursive "
909 "--delete-dst-datasets --include-dataset-regex '(.*/)?tmp.*' --exclude-dataset-regex '!.*'`\n\n")
910 parser.add_argument(
911 "--delete-dst-snapshots", choices=["snapshots", "bookmarks"], default=None, const="snapshots", nargs="?",
912 help="Do nothing if the --delete-dst-snapshots option is missing. Otherwise, after successful "
913 "replication, and successful --delete-dst-datasets step, if any, delete existing destination snapshots "
914 "whose GUID does not exist within the source dataset (which can be an empty dummy dataset!) if the "
915 "destination snapshots are selected by the --include/exclude-snapshot-* policy, and the destination "
916 "dataset is selected via --{include|exclude}-dataset* policy. Does not recurse without --recursive.\n\n"
917 "For example, if the destination dataset contains snapshots h1,h2,h3,d1 (h=hourly, d=daily) whereas "
918 "the source dataset only contains snapshot h3, and the include/exclude policy selects "
919 "h1,h2,h3,d1, then delete snapshots h1,h2,d1 on the destination dataset to make it 'the same'. "
920 "On the other hand, if the include/exclude policy only selects snapshots h1,h2,h3 then only "
921 "delete snapshots h1,h2 on the destination dataset to make it 'the same'.\n\n"
922 "*Note:* To delete snapshots regardless, consider using --delete-dst-snapshots in combination with a "
923 f"source that is an empty dataset, such as the hardcoded virtual dataset named '{dummy_dataset}', like so:"
924 f" `{prog_name} {dummy_dataset} tank2/boo/bar --dryrun --skip-replication --delete-dst-snapshots "
925 "--include-snapshot-regex '.*_daily' --recursive`\n\n"
926 "*Note:* Use --delete-dst-snapshots=bookmarks to delete bookmarks instead of snapshots, in which "
927 "case no snapshots are selected and the --{include|exclude}-snapshot-* filter options treat bookmarks as "
928 "snapshots wrt. selecting.\n\n"
929 "*Performance Note:* --delete-dst-snapshots operates on multiple datasets in parallel (and serially "
930 f"within a dataset), using the same dataset order as {prog_name} replication. "
931 "The degree of parallelism is configurable with the --threads option (see below).\n\n")
932 parser.add_argument(
933 "--delete-dst-snapshots-no-crosscheck", action="store_true",
934 help="This flag indicates that --delete-dst-snapshots=snapshots shall check the source dataset only for "
935 "a snapshot with the same GUID, and ignore whether a bookmark with the same GUID is present in the "
936 "source dataset. Similarly, it also indicates that --delete-dst-snapshots=bookmarks shall check the "
937 "source dataset only for a bookmark with the same GUID, and ignore whether a snapshot with the same GUID "
938 "is present in the source dataset.\n\n")
939 parser.add_argument(
940 "--delete-dst-snapshots-except", action="store_true",
941 help="This flag indicates that the --include/exclude-snapshot-* options shall have inverted semantics for the "
942 "--delete-dst-snapshots option, thus deleting all snapshots except for the selected snapshots (within the "
943 "specified datasets), instead of deleting all selected snapshots (within the specified datasets). In other "
944 "words, this flag enables to specify which snapshots to retain instead of which snapshots to delete.\n\n"
945 "*Synchronization vs. Backup*: When a real (non-dummy) source dataset is specified in combination with "
946 "--delete-dst-snapshots-except, then any destination snapshot retained by the rules above is actually only "
947 "retained if it also exists in the source dataset - __all other destination snapshots are deleted__. This is "
948 "great for synchronization use cases but should __NEVER BE USED FOR LONG-TERM ARCHIVAL__. Long-term archival "
949 "use cases should instead specify the `dummy` source dataset as they require an independent retention policy "
950 "that is not tied to the current contents of the source dataset.\n\n")
951 parser.add_argument(
952 "--delete-dst-snapshots-except-plan", action=DeleteDstSnapshotsExceptPlanAction, default=None, metavar="DICT_STRING",
953 help="Retention periods to be used if pruning snapshots or bookmarks within the selected destination datasets via "
954 "--delete-dst-snapshots. Has the same format as --create-src-snapshots-plan. "
955 "Snapshots (--delete-dst-snapshots=snapshots) or bookmarks (with --delete-dst-snapshots=bookmarks) that "
956 "do not match a period will be deleted. To avoid unexpected surprises, make sure to carefully specify ALL "
957 "snapshot names and periods that shall be retained, in combination with --dryrun.\n\n"
958 f"Example: `{format_dict(src_snapshot_plan_example)}`. This example will, for the organization 'prod' and "
959 "the intended logical target 'onsite', retain secondly snapshots that were created less than 40 seconds ago, "
960 "yet retain the latest 40 secondly snapshots regardless of creation time. Analog for the latest 40 minutely "
961 "snapshots, latest 36 hourly snapshots, etc. "
962 "It will also retain snapshots for the targets 'us-west-1' and 'eu-west-1' within the 'prod' organization. "
963 "In addition, within the 'test' organization, it will retain snapshots that are created every 12 hours and "
964 "every week as specified, and name them as being intended for the 'offsite' replication target. Analog for "
965 "snapshots that are taken every 100 milliseconds within the 'test' organization. "
966 "All other snapshots within the selected datasets will be deleted - you've been warned!\n\n"
967 "The example scans the selected ZFS datasets for snapshots with names like "
968 "`prod_onsite_<timestamp>_secondly`, `prod_onsite_<timestamp>_minutely`, "
969 "`prod_us-west-1_<timestamp>_hourly`, `prod_us-west-1_<timestamp>_daily`, "
970 "`prod_eu-west-1_<timestamp>_hourly`, `prod_eu-west-1_<timestamp>_daily`, "
971 "`test_offsite_<timestamp>_12hourly`, `test_offsite_<timestamp>_weekly`, and so on, and deletes all snapshots "
972 "that do not match a retention rule.\n\n"
973 "Note: A zero within a period (e.g. 'hourly': 0) indicates that no snapshots shall be retained for the given "
974 "period.\n\n"
975 "Note: --delete-dst-snapshots-except-plan is a convenience option that auto-generates a series of the "
976 "following other options: --delete-dst-snapshots-except, "
977 "--new-snapshot-filter-group, --include-snapshot-regex, --include-snapshot-times-and-ranks\n\n")
978 parser.add_argument(
979 "--delete-empty-dst-datasets", choices=["snapshots", "snapshots+bookmarks"], default=None,
980 const="snapshots+bookmarks", nargs="?",
981 help="Do nothing if the --delete-empty-dst-datasets option is missing or --recursive is missing. Otherwise, "
982 "after successful replication "
983 "step and successful --delete-dst-datasets and successful --delete-dst-snapshots steps, if any, "
984 "delete any selected destination dataset that has no snapshot and no bookmark if all descendants of "
985 "that destination dataset are also selected and do not have a snapshot or bookmark either "
986 "(again, only if the existing destination dataset is selected via --{include|exclude}-dataset* policy). "
987 "Never delete non-selected dataset subtrees or their ancestors.\n\n"
988 "For example, if the destination contains datasets h1,d1, and the include/exclude policy "
989 "selects h1,d1, then check if h1,d1 can be deleted. "
990 "On the other hand, if the include/exclude policy only selects h1 then only check if h1 can be deleted.\n\n"
991 "*Note:* Use --delete-empty-dst-datasets=snapshots to delete snapshot-less datasets even if they still "
992 "contain bookmarks.\n\n")
993 monitor_snapshot_plan_example = {
994 "prod": {
995 "onsite": {
996 "100millisecondly": {"latest": {"warning": "300 milliseconds", "critical": "2 seconds"}},
997 "secondly": {"latest": {"warning": "2 seconds", "critical": "14 seconds"}},
998 "minutely": {"latest": {"warning": "30 seconds", "critical": "300 seconds"}},
999 "hourly": {"latest": {"warning": "30 minutes", "critical": "300 minutes"}},
1000 "daily": {"latest": {"warning": "4 hours", "critical": "8 hours"}},
1001 "weekly": {"latest": {"warning": "2 days", "critical": "8 days"}},
1002 "monthly": {"latest": {"warning": "2 days", "critical": "8 days"}},
1003 "yearly": {"latest": {"warning": "5 days", "critical": "14 days"}},
1004 "10minutely": {"latest": {"warning": "0 minutes", "critical": "0 minutes"}},
1005 },
1006 "": {
1007 "daily": {"latest": {"warning": "4 hours", "critical": "8 hours"}},
1008 },
1009 },
1010 }
1011 parser.add_argument(
1012 "--monitor-snapshots", default="{}", type=str, metavar="DICT_STRING",
1013 help="Do nothing if the --monitor-snapshots flag is missing. Otherwise, after all other steps, "
1014 "alert the user if the ZFS 'creation' time property of the latest snapshot for any specified snapshot name "
1015 "pattern within the selected datasets is too old wrt. the specified age limit. The purpose is to check if "
1016 "snapshots are successfully taken on schedule, successfully replicated on schedule, and successfully pruned on "
1017 "schedule. Process exit code is 0, 1, 2 on OK, WARNING, CRITICAL, respectively. "
1018 f"Example DICT_STRING: `{format_dict(monitor_snapshot_plan_example)}`. "
1019 "This example alerts the user if the latest src or dst snapshot named `prod_onsite_<timestamp>_hourly` is more "
1020 "than 30 minutes late (i.e. more than 30+60=90 minutes old) [warning] or more than 300 minutes late (i.e. more "
1021 "than 300+60=360 minutes old) [critical]. "
1022 "Analog for the latest snapshot named `prod_<timestamp>_daily`, and so on.\n\n"
1023 "Note: A duration that is missing or zero (e.g. '0 minutes') indicates that no snapshots shall be checked for "
1024 "the given snapshot name pattern.\n\n")
1025 parser.add_argument(
1026 "--monitor-snapshots-dont-warn", action="store_true",
1027 help="Log a message for monitoring warnings but nonetheless exit with zero exit code.\n\n")
1028 parser.add_argument(
1029 "--monitor-snapshots-dont-crit", action="store_true",
1030 help="Log a message for monitoring criticals but nonetheless exit with zero exit code.\n\n")
1031 parser.add_argument(
1032 "--monitor-snapshots-no-latest-check", action="store_true",
1033 # help="Disable monitoring check of latest snapshot.\n\n")
1034 help=argparse.SUPPRESS)
1035 parser.add_argument(
1036 "--monitor-snapshots-no-oldest-check", action="store_true",
1037 # help="Disable monitoring check of oldest snapshot.\n\n")
1038 help=argparse.SUPPRESS)
1039 cmp_choices_dflt = "+".join(cmp_choices_items)
1040 cmp_choices: List[str] = []
1041 for i in range(0, len(cmp_choices_items)):
1042 cmp_choices += map(lambda item: "+".join(item), itertools.combinations(cmp_choices_items, i + 1))
1043 parser.add_argument(
1044 "--compare-snapshot-lists", choices=cmp_choices, default="", const=cmp_choices_dflt, nargs="?",
1045 help="Do nothing if the --compare-snapshot-lists option is missing. Otherwise, after successful replication "
1046 "step and successful --delete-dst-datasets, --delete-dst-snapshots steps and --delete-empty-dst-datasets "
1047 "steps, if any, proceed as follows:\n\n"
1048 "Compare source and destination dataset trees recursively wrt. snapshots, for example to check if all "
1049 "recently taken snapshots have been successfully replicated by a periodic job.\n\n"
1050 "Example: List snapshots only contained in source (tagged with 'src'), only contained in destination "
1051 "(tagged with 'dst'), and contained in both source and destination (tagged with 'all'), restricted to "
1052 "hourly and daily snapshots taken within the last 7 days, excluding the last 4 hours (to allow for some "
1053 "slack/stragglers), excluding temporary datasets: "
1054 f"`{prog_name} tank1/foo/bar tank2/boo/bar --skip-replication "
1055 "--compare-snapshot-lists=src+dst+all --recursive --include-snapshot-regex '.*_(hourly|daily)' "
1056 "--include-snapshot-times-and-ranks '7 days ago..4 hours ago' --exclude-dataset-regex 'tmp.*'`\n\n"
1057 "This outputs a TSV file containing the following columns:\n\n"
1058 "`location creation_iso createtxg rel_name guid root_dataset rel_dataset name creation written`\n\n"
1059 "Example output row:\n\n"
1060 "`src 2024-11-06_08:30:05 17435050 /foo@test_2024-11-06_08:30:05_daily 2406491805272097867 tank1/src "
1061 "/foo tank1/src/foo@test_2024-10-06_08:30:04_daily 1730878205 24576`\n\n"
1062 "If the TSV output file contains zero lines starting with the prefix 'src' and zero lines starting with "
1063 "the prefix 'dst' then no source snapshots are missing on the destination, and no destination "
1064 "snapshots are missing on the source, indicating that the periodic replication and pruning jobs perform "
1065 "as expected. The TSV output is sorted by rel_dataset, and by ZFS creation time within each rel_dataset "
1066 "- the first and last line prefixed with 'all' contains the metadata of the oldest and latest common "
1067 "snapshot, respectively. Third party tools can use this info for post-processing, for example using "
1068 "custom scripts using 'csplit' or duckdb analytics queries.\n\n"
1069 "The --compare-snapshot-lists option also directly logs various summary stats, such as the metadata of "
1070 "the latest common snapshot, latest snapshots and oldest snapshots, as well as the time diff between the "
1071 "latest common snapshot and latest snapshot only in src (and only in dst), as well as how many src "
1072 "snapshots and how many GB of data are missing on dst, etc.\n\n"
1073 "*Note*: Consider omitting the 'all' flag to reduce noise and instead focus on missing snapshots only, "
1074 "like so: --compare-snapshot-lists=src+dst \n\n"
1075 "*Note*: The source can also be an empty dataset, such as the hardcoded virtual dataset named "
1076 f"'{dummy_dataset}'.\n\n"
1077 "*Note*: --compare-snapshot-lists is typically *much* faster than standard 'zfs list -t snapshot' CLI "
1078 "usage because the former issues requests with a higher degree of parallelism than the latter. The "
1079 "degree is configurable with the --threads option (see below).\n\n")
1080 parser.add_argument(
1081 "--cache-snapshots", choices=["true", "false"], default="false", const="true", nargs="?",
1082 help="Default is '%(default)s'. If 'true', maintain a persistent local cache of recent snapshot creation times, "
1083 "recent successful replication times, and recent monitoring times, and compare them to a quick "
1084 "'zfs list -t filesystem,volume -p -o snapshots_changed' to help determine if a new snapshot shall be created "
1085 "on the src, and if there are any changes that need to be replicated or monitored. Enabling the cache "
1086 "improves performance if --create-src-snapshots and/or replication and/or --monitor-snapshots is invoked "
1087 "frequently (e.g. every minute via cron) over a large number of datasets, with each dataset containing a large "
1088 "number of snapshots, yet it is seldom for a new src snapshot to actually be created, or there are seldom any "
1089 "changes to replicate or monitor (e.g. a snapshot is only created every day and/or deleted every day).\n\n"
1090 "*Note:* This flag only has an effect on OpenZFS >= 2.2.\n\n"
1091 "*Note:* This flag is only relevant for snapshot creation on the src if --create-src-snapshots-even-if-not-due "
1092 "is not specified.\n\n")
1093 parser.add_argument(
1094 "--dryrun", "-n", choices=["recv", "send"], default=None, const="send", nargs="?",
1095 help="Do a dry run (aka 'no-op') to print what operations would happen if the command were to be executed "
1096 "for real (optional). This option treats both the ZFS source and destination as read-only. "
1097 "Accepts an optional argument for fine tuning that is handled as follows:\n\n"
1098 "a) 'recv': Send snapshot data via 'zfs send' to the destination host and receive it there via "
1099 "'zfs receive -n', which discards the received data there.\n\n"
1100 "b) 'send': Do not execute 'zfs send' and do not execute 'zfs receive'. This is a less 'realistic' form "
1101 "of dry run, but much faster, especially for large snapshots and slow networks/disks, as no snapshot is "
1102 "actually transferred between source and destination. This is the default when specifying --dryrun.\n\n"
1103 "Examples: --dryrun, --dryrun=send, --dryrun=recv\n\n")
1104 parser.add_argument(
1105 "--verbose", "-v", action="count", default=0,
1106 help="Print verbose information. This option can be specified multiple times to increase the level of "
1107 "verbosity. To print what ZFS/SSH operation exactly is happening (or would happen), add the `-v -v` "
1108 "flag, maybe along with --dryrun. All ZFS and SSH commands (even with --dryrun) are logged such that "
1109 "they can be inspected, copy-and-pasted into a terminal shell and run manually to help anticipate or "
1110 "diagnose issues. ERROR, WARN, INFO, DEBUG, TRACE output lines are identified by [E], [W], [I], [D], [T] "
1111 "prefixes, respectively.\n\n")
1112 parser.add_argument(
1113 "--quiet", "-q", action="store_true",
1114 help="Suppress non-error, info, debug, and trace output.\n\n")
1115 parser.add_argument(
1116 "--no-privilege-elevation", "-p", action="store_true",
1117 help="Do not attempt to run state changing ZFS operations 'zfs create/rollback/destroy/send/receive/snapshot' as "
1118 "root (via 'sudo -u root' elevation granted by administrators appending the following to /etc/sudoers: "
1119 "`<NON_ROOT_USER_NAME> ALL=NOPASSWD:/path/to/zfs`\n\n"
1120 "Instead, the --no-privilege-elevation flag is for non-root users that have been granted corresponding "
1121 "ZFS permissions by administrators via 'zfs allow' delegation mechanism, like so: "
1122 "sudo zfs allow -u $SRC_NON_ROOT_USER_NAME snapshot,destroy,send,bookmark,hold $SRC_DATASET; "
1123 "sudo zfs allow -u $DST_NON_ROOT_USER_NAME mount,create,receive,rollback,destroy,canmount,mountpoint,"
1124 "readonly,compression,encryption,keylocation,recordsize $DST_DATASET_OR_POOL.\n\n"
1125 "For extra security $SRC_NON_ROOT_USER_NAME should be different than $DST_NON_ROOT_USER_NAME, i.e. the "
1126 "sending Unix user on the source and the receiving Unix user at the destination should be separate Unix "
1127 "user accounts with separate private keys even if both accounts reside on the same machine, per the "
1128 "principle of least privilege. Further, if you do not plan to use the --force* flags and "
1129 "--delete-* CLI options then ZFS permissions 'rollback,destroy' can "
1130 "be omitted. If you do not plan to customize the respective ZFS dataset property then ZFS permissions "
1131 "'canmount,mountpoint,readonly,compression,encryption,keylocation,recordsize' can be omitted, arriving "
1132 "at the absolutely minimal set of required destination permissions: "
1133 "`mount,create,receive`.\n\n"
1134 "Also see https://openzfs.github.io/openzfs-docs/man/master/8/zfs-allow.8.html#EXAMPLES and "
1135 "https://tinyurl.com/9h97kh8n and "
1136 "https://youtu.be/o_jr13Z9f1k?si=7shzmIQJpzNJV6cq\n\n")
1137 parser.add_argument(
1138 "--no-stream", action="store_true",
1139 help="During replication, only replicate the most recent selected source snapshot of a dataset (using -i "
1140 "incrementals instead of -I incrementals), hence skip all intermediate source snapshots that may exist "
1141 "between that and the most recent common snapshot. If there is no common snapshot also skip all other "
1142 "source snapshots for the dataset, except for the most recent selected source snapshot. This option helps "
1143 "the destination to 'catch up' with the source ASAP, consuming a minimum of disk space, at the expense "
1144 "of reducing reliable options for rolling back to intermediate snapshots in the future.\n\n")
1145 parser.add_argument(
1146 "--no-resume-recv", action="store_true",
1147 help="Replication of snapshots via 'zfs send/receive' can be interrupted by intermittent network hiccups, "
1148 "reboots, hardware issues, etc. Interrupted 'zfs send/receive' operations are retried if the --retries "
1149 f"and --retry-* options enable it (see above). In normal operation {prog_name} automatically retries "
1150 "such that only the portion of the snapshot is transmitted that has not yet been fully received on the "
1151 "destination. For example, this helps to progressively transfer a large individual snapshot over a "
1152 "wireless network in a timely manner despite frequent intermittent network hiccups. This optimization is "
1153 "called 'resume receive' and uses the 'zfs receive -s' and 'zfs send -t' feature.\n\n"
1154 "The --no-resume-recv option disables this optimization such that a retry now retransmits the entire "
1155 "snapshot from scratch, which could slow down or even prohibit progress in case of frequent network "
1156 f"hiccups. {prog_name} automatically falls back to using the --no-resume-recv option if it is "
1157 "auto-detected that the ZFS pool does not reliably support the 'resume receive' optimization.\n\n"
1158 "*Note:* Snapshots that have already been fully transferred as part of the current 'zfs send/receive' "
1159 "operation need not be retransmitted regardless of the --no-resume-recv flag. For example, assume "
1160 "a single 'zfs send/receive' operation is transferring incremental snapshots 1 through 10 via "
1161 "'zfs send -I', but the operation fails while transferring snapshot 10, then snapshots 1 through 9 "
1162 "need not be retransmitted regardless of the --no-resume-recv flag, as these snapshots have already "
1163 "been successfully received at the destination either way.\n\n")
1164 parser.add_argument(
1165 "--create-bookmarks", choices=["all", "many", "none"], default="many",
1166 help=f"For increased safety, {prog_name} replication behaves as follows wrt. ZFS bookmark creation, if it is "
1167 "autodetected that the source ZFS pool support bookmarks:\n\n"
1168 "* `many` (default): Whenever it has successfully completed replication of the most recent source snapshot, "
1169 f"{prog_name} creates a ZFS bookmark of that snapshot, and attaches it to the source dataset. In addition, "
1170 f"whenever it has successfully completed a 'zfs send' operation, {prog_name} creates a ZFS bookmark of each "
1171 f"hourly, daily, weekly, monthly and yearly source snapshot that was sent during that 'zfs send' operation, "
1172 "and attaches it to the source dataset.\n\n"
1173 "* `all`: Whenever it has successfully completed a 'zfs send' operation, "
1174 f"{prog_name} creates a ZFS bookmark of each source snapshot that was sent during that 'zfs send' operation, "
1175 "and attaches it to the source dataset. This increases safety at the expense of some performance.\n\n"
1176 "* `none`: No bookmark is created.\n\n"
1177 "Bookmarks exist so an incremental stream can continue to be sent from the source dataset without having "
1178 "to keep the already replicated snapshot around on the source dataset until the next upcoming snapshot "
1179 "has been successfully replicated. This way you can send the snapshot from the source dataset to another "
1180 "host, then bookmark the snapshot on the source dataset, then delete the snapshot from the source "
1181 "dataset to save disk space, and then still incrementally send the next upcoming snapshot from the "
1182 "source dataset to the other host by referring to the bookmark.\n\n"
1183 "The --create-bookmarks=none option disables this safety feature but is discouraged, because bookmarks "
1184 "are tiny and relatively cheap and help to ensure that ZFS replication can continue even if source and "
1185 "destination dataset somehow have no common snapshot anymore. "
1186 "For example, if a pruning script has accidentally deleted too many (or even all) snapshots on the "
1187 "source dataset in an effort to reclaim disk space, replication can still proceed because it can use "
1188 "the info in the bookmark (the bookmark must still exist in the source dataset) instead of the info in "
1189 "the metadata of the (now missing) source snapshot.\n\n"
1190 "A ZFS bookmark is a tiny bit of metadata extracted from a ZFS snapshot by the 'zfs bookmark' CLI, and "
1191 "attached to a dataset, much like a ZFS snapshot. Note that a ZFS bookmark does not contain user data; "
1192 "instead a ZFS bookmark is essentially a tiny pointer in the form of the GUID of the snapshot and 64-bit "
1193 "transaction group number of the snapshot and creation time of the snapshot, which is sufficient to tell "
1194 "the destination ZFS pool how to find the destination snapshot corresponding to the source bookmark "
1195 "and (potentially already deleted) source snapshot. A bookmark can be fed into 'zfs send' as the "
1196 "source of an incremental send. Note that while a bookmark allows for its snapshot "
1197 "to be deleted on the source after successful replication, it still requires that its snapshot is not "
1198 "somehow deleted prematurely on the destination dataset, so be mindful of that. "
1199 f"By convention, a bookmark created by {prog_name} has the same name as its corresponding "
1200 "snapshot, the only difference being the leading '#' separator instead of the leading '@' separator. "
1201 "Also see https://www.youtube.com/watch?v=LaNgoAZeTww&t=316s.\n\n"
1202 "You can list bookmarks, like so: "
1203 "`zfs list -t bookmark -o name,guid,createtxg,creation -d 1 $SRC_DATASET`, and you can (and should) "
1204 "periodically prune obsolete bookmarks just like snapshots, like so: "
1205 "`zfs destroy $SRC_DATASET#$BOOKMARK`. Typically, bookmarks should be pruned less aggressively "
1206 "than snapshots, and destination snapshots should be pruned less aggressively than source snapshots. "
1207 "As an example starting point, here is a command that deletes all bookmarks older than "
1208 "90 days, but retains the latest 200 bookmarks (per dataset) regardless of creation time: "
1209 f"`{prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication "
1210 "--delete-dst-snapshots=bookmarks --include-snapshot-times-and-ranks notime 'all except latest 200' "
1211 "--include-snapshot-times-and-ranks 'anytime..90 days ago'`\n\n")
1212 parser.add_argument(
1213 "--no-create-bookmark", action="store_true",
1214 help=argparse.SUPPRESS) # deprecated; was replaced by --create-bookmarks=none
1215 parser.add_argument(
1216 "--no-use-bookmark", action="store_true",
1217 help=f"For increased safety, in normal replication operation {prog_name} replication also looks for bookmarks "
1218 "(in addition to snapshots) on the source dataset in order to find the most recent common snapshot wrt. the "
1219 "destination dataset, if it is auto-detected that the source ZFS pool support bookmarks. "
1220 "The --no-use-bookmark option disables this safety feature but is discouraged, because bookmarks help "
1221 "to ensure that ZFS replication can continue even if source and destination dataset somehow have no "
1222 "common snapshot anymore.\n\n"
1223 f"Note that it does not matter whether a bookmark was created by {prog_name} or a third party script, "
1224 "as only the GUID of the bookmark and the GUID of the snapshot is considered for comparison, and ZFS "
1225 "guarantees that any bookmark of a given snapshot automatically has the same GUID, transaction group "
1226 "number and creation time as the snapshot. Also note that you can create, delete and prune bookmarks "
1227 f"any way you like, as {prog_name} (without --no-use-bookmark) will happily work with whatever "
1228 "bookmarks currently exist, if any.\n\n")
1230 ssh_cipher_default = "^aes256-gcm@openssh.com" if platform.system() != "SunOS" else ""
1231 # for speed with confidentiality and integrity
1232 # measure cipher perf like so: count=5000; for i in $(seq 1 3); do echo "iteration $i:"; for cipher in $(ssh -Q cipher); do dd if=/dev/zero bs=1M count=$count 2> /dev/null | ssh -c $cipher -p 40999 127.0.0.1 "(time -p cat) > /dev/null" 2>&1 | grep real | awk -v count=$count -v cipher=$cipher '{print cipher ": " count / $2 " MB/s"}'; done; done # noqa: E501
1233 # see https://gbe0.com/posts/linux/server/benchmark-ssh-ciphers/
1234 # and https://crypto.stackexchange.com/questions/43287/what-are-the-differences-between-these-aes-ciphers
1235 parser.add_argument(
1236 "--ssh-cipher", type=str, default=ssh_cipher_default, metavar="STRING",
1237 help="SSH cipher specification for encrypting the session (optional); will be passed into ssh -c CLI. "
1238 "--ssh-cipher is a comma-separated list of ciphers listed in order of preference. See the 'Ciphers' "
1239 "keyword in ssh_config(5) for more information: "
1240 "https://manpages.ubuntu.com/manpages/man5/sshd_config.5.html. Default: `%(default)s`\n\n")
1242 ssh_private_key_file_default = ".ssh/id_rsa"
1243 locations = ["src", "dst"]
1244 for loc in locations:
1245 parser.add_argument(
1246 f"--ssh-{loc}-private-key", action="append", default=[], metavar="FILE",
1247 help=f"Path to SSH private key file on local host to connect to {loc} (optional); will be passed into "
1248 "ssh -i CLI. This option can be specified multiple times. "
1249 f"default: $HOME/{ssh_private_key_file_default}\n\n")
1250 for loc in locations:
1251 parser.add_argument(
1252 f"--ssh-{loc}-user", type=str, metavar="STRING",
1253 help=f"Remote SSH username on {loc} host to connect to (optional). Overrides username given in "
1254 f"{loc.upper()}_DATASET.\n\n")
1255 for loc in locations:
1256 parser.add_argument(
1257 f"--ssh-{loc}-host", type=str, metavar="STRING",
1258 help=f"Remote SSH hostname of {loc} host to connect to (optional). Can also be an IPv4 or IPv6 address. "
1259 f"Overrides hostname given in {loc.upper()}_DATASET.\n\n")
1260 for loc in locations:
1261 parser.add_argument(
1262 f"--ssh-{loc}-port", type=int, metavar="INT",
1263 help=f"Remote SSH port on {loc} host to connect to (optional).\n\n")
1264 for loc in locations:
1265 parser.add_argument(
1266 f"--ssh-{loc}-extra-opts", type=str, default="", metavar="STRING",
1267 help=f"Additional options to be passed to ssh CLI when connecting to {loc} host (optional). "
1268 "The value is split on runs of one or more whitespace characters. "
1269 f"Example: `--ssh-{loc}-extra-opts='-v -v'` to debug ssh config issues.\n\n")
1270 parser.add_argument(
1271 f"--ssh-{loc}-extra-opt", action="append", default=[], metavar="STRING",
1272 help=f"Additional option to be passed to ssh CLI when connecting to {loc} host (optional). The value "
1273 "can contain spaces and is not split. This option can be specified multiple times. "
1274 f"Example: `--ssh-{loc}-extra-opt='-oProxyCommand=nc %%h %%p'` to disable the TCP_NODELAY "
1275 "socket option for OpenSSH.\n\n")
1276 for loc in locations:
1277 parser.add_argument(
1278 f"--ssh-{loc}-config-file", type=str, metavar="FILE",
1279 help=f"Path to SSH ssh_config(5) file to connect to {loc} (optional); will be passed into ssh -F CLI.\n\n")
1280 parser.add_argument(
1281 "--timeout", default=None, metavar="DURATION",
1282 # help="Exit the program (or current task with non-zero --daemon-lifetime) with an error after this much time has "
1283 # "elapsed. Default is to never timeout. Examples: '600 seconds', '90 minutes', '10years'\n\n")
1284 help=argparse.SUPPRESS)
1285 threads_default = 100 # percent
1286 parser.add_argument(
1287 "--threads", min=1, default=(threads_default, True), action=CheckPercentRange, metavar="INT[%]",
1288 help="The maximum number of threads to use for parallel operations; can be given as a positive integer, "
1289 f"optionally followed by the %% percent character (min: %(min)s, default: {threads_default}%%). Percentages "
1290 "are relative to the number of CPU cores on the machine. Example: 200%% uses twice as many threads as "
1291 "there are cores on the machine; 75%% uses num_threads = num_cores * 0.75. Currently this option only "
1292 "applies to dataset and snapshot replication, --create-src-snapshots, --delete-dst-snapshots, "
1293 "--delete-empty-dst-datasets, --monitor-snapshots and --compare-snapshot-lists. The ideal value for this "
1294 "parameter depends on the use case and its performance requirements, as well as the number of available CPU "
1295 "cores and the parallelism offered by SSDs vs. HDDs, ZFS topology and configuration, as well as the network "
1296 "bandwidth and other workloads simultaneously running on the system. The current default is geared towards a "
1297 "high degreee of parallelism, and as such may perform poorly on HDDs. Examples: 1, 4, 75%%, 150%%\n\n")
1298 parser.add_argument(
1299 "--max-concurrent-ssh-sessions-per-tcp-connection", type=int, min=1, default=8, action=CheckRange, metavar="INT",
1300 help=f"For best throughput, {prog_name} uses multiple SSH TCP connections in parallel, as indicated by "
1301 "--threads (see above). For best startup latency, each such parallel TCP connection can carry a "
1302 "maximum of S concurrent SSH sessions, where "
1303 "S=--max-concurrent-ssh-sessions-per-tcp-connection (default: %(default)s, min: %(min)s). "
1304 "Concurrent SSH sessions are mostly used for metadata operations such as listing ZFS datasets and their "
1305 "snapshots. This client-side max sessions parameter must not be higher than the server-side "
1306 "sshd_config(5) MaxSessions parameter (which defaults to 10, see "
1307 "https://manpages.ubuntu.com/manpages/man5/sshd_config.5.html).\n\n"
1308 f"*Note:* For better throughput, {prog_name} uses one dedicated TCP connection per ZFS "
1309 "send/receive operation such that the dedicated connection is never used by any other "
1310 "concurrent SSH session, effectively ignoring the value of the "
1311 "--max-concurrent-ssh-sessions-per-tcp-connection parameter in the ZFS send/receive case.\n\n")
1312 parser.add_argument(
1313 "--bwlimit", default=None, action=NonEmptyStringAction, metavar="STRING",
1314 help="Sets 'pv' bandwidth rate limit for zfs send/receive data transfer (optional). Example: `100m` to cap "
1315 "throughput at 100 MB/sec. Default is unlimited. Also see "
1316 "https://manpages.ubuntu.com/manpages/man1/pv.1.html\n\n")
1317 parser.add_argument(
1318 "--daemon-lifetime", default="0 seconds", metavar="DURATION",
1319 # help="Exit the daemon after this much time has elapsed. Default is '0 seconds', i.e. no daemon mode. "
1320 # "Examples: '600 seconds', '86400 seconds', '1000years'\n\n")
1321 help=argparse.SUPPRESS)
1322 parser.add_argument(
1323 "--daemon-frequency", default="minutely", metavar="STRING",
1324 # help="Run a daemon iteration every N time units. Default is '%(default)s'. "
1325 # "Examples: '100 millisecondly', '10secondly, 'minutely' to request the daemon to run every 100 milliseconds, "
1326 # "or every 10 seconds, or every minute, respectively. Only has an effect if --daemon-lifetime is nonzero.\n\n")
1327 help=argparse.SUPPRESS)
1328 parser.add_argument(
1329 "--daemon-remote-conf-cache-ttl", default="300 seconds", metavar="DURATION",
1330 # help="The Time-To-Live for the remote host configuration cache, which stores available programs and "
1331 # f"ZFS features. After this duration, {prog_name} will re-detect the remote environment. Set to '0 seconds' "
1332 # "to re-detect on every daemon iteration. Default: %(default)s.\n\n")
1333 help=argparse.SUPPRESS)
1334 parser.add_argument(
1335 "--no-estimate-send-size", action="store_true",
1336 # help="Skip 'zfs send -n -v'. This may improve performance if replicating small snapshots at high frequency.\n\n")
1337 help=argparse.SUPPRESS)
1339 def hlp(program: str) -> str:
1340 return f"The name or path to the '{program}' executable (optional). Default is '{program}'. "
1342 msg = f"Use '{disable_prg}' to disable the use of this program.\n\n"
1343 parser.add_argument(
1344 "--compression-program", default="zstd", action=NonEmptyStringAction, metavar="STRING",
1345 help=hlp("zstd") + "Examples: 'lz4', 'pzstd', 'pigz', 'gzip', '/opt/bin/zstd'. " + msg.rstrip() + " The use is "
1346 "auto-disabled if data is transferred locally instead of via the network. This "
1347 "option is about transparent compression-on-the-wire, not about compression-at-rest.\n\n")
1348 parser.add_argument(
1349 "--compression-program-opts", default="-1", metavar="STRING",
1350 help="The options to be passed to the compression program on the compression step (optional). "
1351 "Default is '%(default)s' (fastest).\n\n")
1352 parser.add_argument(
1353 "--mbuffer-program", default="mbuffer", action=NonEmptyStringAction, metavar="STRING",
1354 help=hlp("mbuffer") + msg.rstrip() + " The use is auto-disabled if data is transferred locally "
1355 "instead of via the network. This tool is used to smooth out the rate "
1356 "of data flow and prevent bottlenecks caused by network latency or "
1357 "speed fluctuation.\n\n")
1358 parser.add_argument(
1359 "--mbuffer-program-opts", default="-q -m 128M", metavar="STRING",
1360 help="Options to be passed to 'mbuffer' program (optional). Default: '%(default)s'.\n\n")
1361 parser.add_argument(
1362 "--ps-program", default="ps", action=NonEmptyStringAction, metavar="STRING",
1363 help=hlp("ps") + msg)
1364 parser.add_argument(
1365 "--pv-program", default="pv", action=NonEmptyStringAction, metavar="STRING",
1366 help=hlp("pv") + msg.rstrip() + " This is used for bandwidth rate-limiting and progress monitoring.\n\n")
1367 parser.add_argument(
1368 "--pv-program-opts", metavar="STRING",
1369 default="--progress --timer --eta --fineta --rate --average-rate --bytes --interval=1 --width=120 --buffer-size=2M",
1370 help="The options to be passed to the 'pv' program (optional). Default: '%(default)s'.\n\n")
1371 parser.add_argument(
1372 "--shell-program", default="sh", action=NonEmptyStringAction, metavar="STRING",
1373 help=hlp("sh") + msg)
1374 parser.add_argument(
1375 "--ssh-program", default="ssh", action=NonEmptyStringAction, metavar="STRING",
1376 help=hlp("ssh") + "Examples: 'hpnssh' or 'ssh' or '/opt/bin/ssh' or wrapper scripts around 'ssh'. " + msg)
1377 parser.add_argument(
1378 "--sudo-program", default="sudo", action=NonEmptyStringAction, metavar="STRING",
1379 help=hlp("sudo") + msg)
1380 parser.add_argument(
1381 "--zfs-program", default="zfs", action=NonEmptyStringAction, metavar="STRING",
1382 help=hlp("zfs") + "\n\n")
1383 parser.add_argument(
1384 "--zpool-program", default="zpool", action=NonEmptyStringAction, metavar="STRING",
1385 help=hlp("zpool") + msg)
1386 parser.add_argument(
1387 "--log-dir", type=str, metavar="DIR",
1388 help=f"Path to the log output directory on local host (optional). Default: $HOME/{prog_name}-logs. The logger "
1389 "that is used by default writes log files there, in addition to the console. The basename of --log-dir must "
1390 f"start with the prefix '{prog_name}-logs' as this helps prevent accidents. The current.dir symlink "
1391 "always points to the subdirectory containing the most recent log file. The current.log symlink "
1392 "always points to the most recent log file. The current.pv symlink always points to the most recent "
1393 "data transfer monitoring log. Run `tail --follow=name --max-unchanged-stats=1` on both symlinks to "
1394 "follow what's currently going on. Parallel replication generates a separate .pv file per thread. To "
1395 "monitor these, run something like "
1396 "`while true; do clear; for f in $(realpath $HOME/bzfs-logs/current/current.pv)*; "
1397 "do tac -s $(printf '\\r') $f | tr '\\r' '\\n' | grep -m1 -v '^$'; done; sleep 1; done`\n\n")
1398 h_fix = ("The path name of the log file on local host is "
1399 "`${--log-dir}/${--log-file-prefix}<timestamp>${--log-file-infix}${--log-file-suffix}-<random>.log`. "
1400 "Example: `--log-file-prefix=zrun_us-west-1_ --log-file-suffix=_daily` will generate log "
1401 "file names such as `zrun_us-west-1_2024-09-03_12:26:15_daily-bl4i1fth.log`\n\n")
1402 parser.add_argument(
1403 "--log-file-prefix", default="zrun_", action=SafeFileNameAction, metavar="STRING",
1404 help="Default is %(default)s. " + h_fix)
1405 parser.add_argument(
1406 "--log-file-infix", default="", action=SafeFileNameAction, metavar="STRING",
1407 help="Default is the empty string. " + h_fix)
1408 parser.add_argument(
1409 "--log-file-suffix", default="", action=SafeFileNameAction, metavar="STRING",
1410 help="Default is the empty string. " + h_fix)
1411 parser.add_argument(
1412 "--log-subdir", choices=["daily", "hourly", "minutely"], default="daily",
1413 help="Make a new subdirectory in --log-dir every day, hour or minute; write log files there. "
1414 "Default is '%(default)s'.")
1415 parser.add_argument(
1416 "--log-syslog-address", default=None, action=NonEmptyStringAction, metavar="STRING",
1417 help="Host:port of the syslog machine to send messages to (e.g. 'foo.example.com:514' or '127.0.0.1:514'), or "
1418 "the file system path to the syslog socket file on localhost (e.g. '/dev/log'). The default is no "
1419 "address, i.e. do not log anything to syslog by default. See "
1420 "https://docs.python.org/3/library/logging.handlers.html#sysloghandler\n\n")
1421 parser.add_argument(
1422 "--log-syslog-socktype", choices=["UDP", "TCP"], default="UDP",
1423 help="The socket type to use to connect if no local socket file system path is used. Default is '%(default)s'.\n\n")
1424 parser.add_argument(
1425 "--log-syslog-facility", type=int, min=0, max=7, default=1, action=CheckRange, metavar="INT",
1426 help="The local facility aka category that identifies msg sources in syslog "
1427 "(default: %(default)s, min=%(min)s, max=%(max)s).\n\n")
1428 parser.add_argument(
1429 "--log-syslog-prefix", default=prog_name, action=NonEmptyStringAction, metavar="STRING",
1430 help=f"The name to prepend to each message that is sent to syslog; identifies {prog_name} messages as opposed "
1431 "to messages from other sources. Default is '%(default)s'.\n\n")
1432 parser.add_argument(
1433 "--log-syslog-level", choices=["CRITICAL", "ERROR", "WARN", "INFO", "DEBUG", "TRACE"],
1434 default="ERROR",
1435 help="Only send messages with equal or higher priority than this log level to syslog. Default is '%(default)s'.\n\n")
1436 parser.add_argument(
1437 "--log-config-file", default=None, action=NonEmptyStringAction, metavar="STRING",
1438 help="The contents of a JSON file that defines a custom python logging configuration to be used (optional). "
1439 "If the option starts with a `+` prefix then the contents are read from the UTF-8 JSON file given "
1440 "after the `+` prefix. Examples: +log_config.json, +/path/to/log_config.json. "
1441 "Here is an example config file that demonstrates usage: "
1442 "https://github.com/whoschek/bzfs/blob/main/bzfs_tests/log_config.json\n\n"
1443 "For more examples see "
1444 "https://stackoverflow.com/questions/7507825/where-is-a-complete-example-of-logging-config-dictconfig "
1445 "and for details see "
1446 "https://docs.python.org/3/library/logging.config.html#configuration-dictionary-schema\n\n"
1447 "*Note:* Lines starting with a # character are ignored as comments within the JSON. Also, if a line ends "
1448 "with a # character the portion between that # character and the preceding # character on the same line "
1449 "is ignored as a comment.\n\n")
1450 parser.add_argument(
1451 "--log-config-var", action=LogConfigVariablesAction, nargs="+", default=[], metavar="NAME:VALUE",
1452 help="User defined variables in the form of zero or more NAME:VALUE pairs (optional). "
1453 "These variables can be used within the JSON passed with --log-config-file (see above) via "
1454 "`${name[:default]}` references, which are substituted (aka interpolated) as follows:\n\n"
1455 "If the variable contains a non-empty CLI value then that value is used. Else if a default value for the "
1456 "variable exists in the JSON file that default value is used. Else the program aborts with an error. "
1457 "Example: In the JSON variable `${syslog_address:/dev/log}`, the variable name is 'syslog_address' "
1458 "and the default value is '/dev/log'. The default value is the portion after the optional : colon "
1459 "within the variable declaration. The default value is used if the CLI user does not specify a non-empty "
1460 "value via --log-config-var, for example via "
1461 "--log-config-var syslog_address:/path/to/socket_file or via "
1462 "--log-config-var syslog_address:[host,port].\n\n"
1463 f"{prog_name} automatically supplies the following convenience variables: "
1464 "`${bzfs.log_level}`, `${bzfs.log_dir}`, `${bzfs.log_file}`, `${bzfs.sub.logger}`, "
1465 "`${bzfs.get_default_log_formatter}`, `${bzfs.timestamp}`. "
1466 "For a complete list see the source code of get_dict_config_logger().\n\n")
1467 parser.add_argument(
1468 "--include-envvar-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
1469 help="On program startup, unset all Unix environment variables for which the full environment variable "
1470 "name matches at least one of the excludes but none of the includes. If an environment variable is "
1471 "included this decision is never reconsidered because include takes precedence over exclude. "
1472 "The purpose is to tighten security and help guard against accidental inheritance or malicious "
1473 "injection of environment variable values that may have unintended effects.\n\n"
1474 "This option can be specified multiple times. "
1475 "A leading `!` character indicates logical negation, i.e. the regex matches if the regex with the "
1476 "leading `!` character removed does not match. "
1477 "The default is to include no environment variables, i.e. to make no exceptions to "
1478 "--exclude-envvar-regex. "
1479 "Example that retains at least these two env vars: "
1480 "`--include-envvar-regex PATH "
1481 f"--include-envvar-regex {env_var_prefix}min_pipe_transfer_size`. "
1482 "Example that retains all environment variables without tightened security: `'.*'`\n\n")
1483 parser.add_argument(
1484 "--exclude-envvar-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
1485 help="Same syntax as --include-envvar-regex (see above) except that the default is to exclude no "
1486 f"environment variables. Example: `{env_var_prefix}.*`\n\n")
1488 for period, label in {"yearly": "years", "monthly": "months", "weekly": "weeks", "daily": "days", "hourly": "hours",
1489 "minutely": "minutes", "secondly": "seconds", "millisecondly": "milliseconds"}.items():
1490 anchor_group = parser.add_argument_group(
1491 f"{period.title()} period anchors", "Use these options to customize when snapshots that happen "
1492 f"every N {label} are scheduled to be created on the source by the --create-src-snapshots option.")
1493 for f in [f for f in dataclasses.fields(PeriodAnchors) if f.name.startswith(period + "_")]:
1494 _min = f.metadata.get("min")
1495 _max = f.metadata.get("max")
1496 anchor_group.add_argument(
1497 "--" + f.name, type=int, min=_min, max=_max, default=f.default, action=CheckRange, metavar="INT",
1498 help=f"{f.metadata.get('help')} ({_min} ≤ x ≤ {_max}, default: %(default)s).\n\n")
1500 for option_name, flag in zfs_recv_groups.items():
1501 grup = option_name.replace("_", "-") # one of zfs_recv_o, zfs_recv_x
1502 flag = "'" + flag + "'" # one of -o or -x
1504 def h(text: str) -> str:
1505 return argparse.SUPPRESS if option_name == "zfs_set" else text # noqa: B023
1507 argument_group = parser.add_argument_group(
1508 grup + " (Experimental)",
1509 description=h(f"The following group of parameters specifies additional zfs receive {flag} options that "
1510 "can be used to configure the copying of ZFS dataset properties from the source dataset to "
1511 "its corresponding destination dataset. The 'zfs-recv-o' group of parameters is applied "
1512 "before the 'zfs-recv-x' group."))
1513 target_choices_items = ["full", "incremental"]
1514 target_choices_default = "+".join(target_choices_items)
1515 target_choices = target_choices_items + [target_choices_default]
1516 qq = "'"
1517 argument_group.add_argument(
1518 f"--{grup}-targets", choices=target_choices, default=target_choices_default,
1519 help=h(f"The zfs send phase or phases during which the extra {flag} options are passed to 'zfs receive'. "
1520 "This can be one of the following choices: "
1521 f"{', '.join([f'{qq}{x}{qq}' for x in target_choices])}. "
1522 "Default is '%(default)s'. "
1523 "A 'full' send is sometimes also known as an 'initial' send.\n\n"))
1524 msg = "Thus, -x opts do not benefit from source != 'local' (which is the default already)." \
1525 if flag == "'-x'" else ""
1526 argument_group.add_argument(
1527 f"--{grup}-sources", action=NonEmptyStringAction, default="local", metavar="STRING",
1528 help=h("The ZFS sources to provide to the 'zfs get -s' CLI in order to fetch the ZFS dataset properties "
1529 f"that will be fed into the --{grup}-include/exclude-regex filter (see below). The sources are in "
1530 "the form of a comma-separated list (no spaces) containing one or more of the following choices: "
1531 "'local', 'default', 'inherited', 'temporary', 'received', 'none', with the default being '%(default)s'. "
1532 f"Uses 'zfs get -p -s ${grup}-sources all $SRC_DATASET' to fetch the "
1533 "properties to copy - https://openzfs.github.io/openzfs-docs/man/master/8/zfs-get.8.html. P.S: Note "
1534 "that the existing 'zfs send --props' option does not filter and that --props only reads properties "
1535 f"from the 'local' ZFS property source (https://github.com/openzfs/zfs/issues/13024). {msg}\n\n"))
1536 argument_group.add_argument(
1537 f"--{grup}-include-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
1538 help=h(f"Take the output properties of --{grup}-sources (see above) and filter them such that we only "
1539 "retain the properties whose name matches at least one of the --include regexes but none of the "
1540 "--exclude regexes. If a property is excluded this decision is never reconsidered because exclude "
1541 f"takes precedence over include. Append each retained property to the list of {flag} options in "
1542 "--zfs-recv-program-opt(s), unless another '-o' or '-x' option with the same name already exists "
1543 "therein. In other words, --zfs-recv-program-opt(s) takes precedence.\n\n"
1544 f"The --{grup}-include-regex option can be specified multiple times. "
1545 "A leading `!` character indicates logical negation, i.e. the regex matches if the regex with the "
1546 "leading `!` character removed does not match. "
1547 "If the option starts with a `+` prefix then regexes are read from the newline-separated "
1548 "UTF-8 text file given after the `+` prefix, one regex per line inside of the text file.\n\n"
1549 f"The default is to include no properties, thus by default no extra {flag} option is appended. "
1550 f"Example: `--{grup}-include-regex recordsize volblocksize`. "
1551 "More examples: `.*` (include all properties), `foo bar myapp:.*` (include three regexes) "
1552 f"`+{grup}_regexes.txt`, `+/path/to/{grup}_regexes.txt`\n\n"))
1553 argument_group.add_argument(
1554 f"--{grup}-exclude-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
1555 help=h(f"Same syntax as --{grup}-include-regex (see above), and the default is to exclude no properties. "
1556 f"Example: --{grup}-exclude-regex encryptionroot keystatus origin volblocksize volsize\n\n"))
1557 parser.add_argument(
1558 "--version", action="version", version=f"{prog_name}-{__version__}, by {prog_author}",
1559 help="Display version information and exit.\n\n")
1560 parser.add_argument(
1561 "--help, -h", action="help",
1562 help="Show this help message and exit.\n\n")
1563 return parser
1564 # fmt: on
1567#############################################################################
1568class LogParams:
1569 def __init__(self, args: argparse.Namespace) -> None:
1570 """Option values for logging; reads from ArgumentParser via args."""
1571 # immutable variables:
1572 if args.quiet:
1573 self.log_level = "ERROR"
1574 elif args.verbose >= 2:
1575 self.log_level = "TRACE"
1576 elif args.verbose >= 1:
1577 self.log_level = "DEBUG"
1578 else:
1579 self.log_level = "INFO"
1580 self.log_config_file = args.log_config_file
1581 self.log_config_vars = dict(var.split(":", 1) for var in args.log_config_var)
1582 timestamp = datetime.now().isoformat(sep="_", timespec="seconds") # 2024-09-03_12:26:15
1583 self.timestamp: str = timestamp
1584 self.home_dir: str = get_home_directory()
1585 default_dir_name = prog_name + "-logs"
1586 log_parent_dir: str = args.log_dir if args.log_dir else os.path.join(self.home_dir, default_dir_name)
1587 if not os.path.basename(log_parent_dir).startswith(default_dir_name):
1588 msg = f"Basename of --log-dir must start with prefix '{default_dir_name}', but got: {log_parent_dir}"
1589 get_simple_logger(prog_name).error("%s", msg)
1590 die(msg)
1591 self.last_modified_cache_dir = os.path.join(log_parent_dir, ".cache", "last_modified")
1592 sep = "_" if args.log_subdir == "daily" else ":"
1593 subdir = timestamp[0 : timestamp.rindex(sep) if args.log_subdir == "minutely" else timestamp.index(sep)]
1594 self.log_dir: str = os.path.join(log_parent_dir, subdir) # 2024-09-03 (d), 2024-09-03_12 (h), 2024-09-03_12:26 (m)
1595 os.makedirs(log_parent_dir, mode=stat.S_IRWXU, exist_ok=True) # aka chmod u=rwx,go=
1596 os.makedirs(self.log_dir, mode=stat.S_IRWXU, exist_ok=True)
1597 self.log_file_prefix = args.log_file_prefix
1598 self.log_file_infix = args.log_file_infix
1599 self.log_file_suffix = args.log_file_suffix
1600 fd, self.log_file = tempfile.mkstemp(
1601 suffix=".log",
1602 prefix=f"{self.log_file_prefix}{self.timestamp}{self.log_file_infix}{self.log_file_suffix}-",
1603 dir=self.log_dir,
1604 )
1605 os.close(fd)
1606 self.pv_log_file = self.log_file[0 : -len(".log")] + ".pv"
1608 # Create/update "current" symlink to current_dir, which is a subdir containing further symlinks to log files.
1609 # For parallel usage, ensures there is no time window when the symlinks are inconsistent or do not exist.
1610 current = "current"
1611 dot_current_dir = os.path.join(log_parent_dir, f".{current}")
1612 current_dir = os.path.join(dot_current_dir, os.path.basename(self.log_file)[0 : -len(".log")])
1613 os.makedirs(current_dir, exist_ok=True)
1614 create_symlink(self.log_file, current_dir, f"{current}.log")
1615 create_symlink(self.pv_log_file, current_dir, f"{current}.pv")
1616 create_symlink(self.log_dir, current_dir, f"{current}.dir")
1617 dst_file = os.path.join(current_dir, current)
1618 os.symlink(os.path.relpath(current_dir, start=log_parent_dir), dst_file)
1619 os.replace(dst_file, os.path.join(log_parent_dir, current)) # atomic rename
1620 delete_stale_files(dot_current_dir, prefix="", millis=10, dirs=True, exclude=os.path.basename(current_dir))
1621 self.params: Optional[Params] = None
1623 def __repr__(self) -> str:
1624 return str(self.__dict__)
1627#############################################################################
1628RegexList = List[Tuple[re.Pattern, bool]] # Type alias
1629UnixTimeRange = Optional[Tuple[Union[timedelta, int], Union[timedelta, int]]] # Type alias
1630RankRange = Tuple[Tuple[str, int, bool], Tuple[str, int, bool]] # Type alias
1631Tree = Dict[str, Dict[str, Any]] # Type alias
1634#############################################################################
1635class Params:
1636 def __init__(
1637 self,
1638 args: argparse.Namespace,
1639 sys_argv: Optional[List[str]] = None,
1640 log_params: Optional[LogParams] = None,
1641 log: Optional[Logger] = None,
1642 inject_params: Optional[Dict[str, bool]] = None,
1643 ) -> None:
1644 """Option values for all aspects; reads from ArgumentParser via args."""
1645 # immutable variables:
1646 assert args is not None
1647 self.args: argparse.Namespace = args
1648 self.sys_argv: List[str] = sys_argv if sys_argv is not None else []
1649 assert isinstance(self.sys_argv, list)
1650 self.log_params: LogParams = cast(LogParams, log_params)
1651 self.log: Logger = cast(Logger, log)
1652 self.inject_params: Dict[str, bool] = inject_params if inject_params is not None else {} # for testing only
1653 self.one_or_more_whitespace_regex: re.Pattern = re.compile(r"\s+")
1654 self.two_or_more_spaces_regex: re.Pattern = re.compile(r" +")
1655 self.unset_matching_env_vars(args)
1656 self.program_validator = ProgramValidator()
1657 self.xperiods = SnapshotPeriods()
1659 assert len(args.root_dataset_pairs) > 0
1660 self.root_dataset_pairs: List[Tuple[str, str]] = args.root_dataset_pairs
1661 self.recursive: bool = args.recursive
1662 self.recursive_flag: str = "-r" if args.recursive else ""
1664 self.dry_run: bool = args.dryrun is not None
1665 self.dry_run_recv: str = "-n" if self.dry_run else ""
1666 self.dry_run_destroy: str = self.dry_run_recv
1667 self.dry_run_no_send: bool = args.dryrun == "send"
1668 self.verbose_zfs: bool = args.verbose >= 2
1669 self.verbose_destroy: str = "" if args.quiet else "-v"
1670 self.quiet: bool = args.quiet
1672 self.zfs_send_program_opts: List[str] = self.fix_send_opts(self.split_args(args.zfs_send_program_opts))
1673 zfs_recv_program_opts: List[str] = self.split_args(args.zfs_recv_program_opts)
1674 for extra_opt in args.zfs_recv_program_opt:
1675 zfs_recv_program_opts.append(self.validate_arg_str(extra_opt, allow_all=True))
1676 self.zfs_recv_program_opts: List[str] = self.fix_recv_opts(zfs_recv_program_opts)
1677 if self.verbose_zfs:
1678 append_if_absent(self.zfs_send_program_opts, "-v")
1679 append_if_absent(self.zfs_recv_program_opts, "-v")
1680 self.zfs_full_recv_opts: List[str] = self.zfs_recv_program_opts.copy()
1681 cpconfigs = [CopyPropertiesConfig(group, flag, args, self) for group, flag in zfs_recv_groups.items()]
1682 self.zfs_recv_o_config, self.zfs_recv_x_config, self.zfs_set_config = cpconfigs
1684 self.force_rollback_to_latest_snapshot: bool = args.force_rollback_to_latest_snapshot
1685 self.force_rollback_to_latest_common_snapshot = SynchronizedBool(args.force_rollback_to_latest_common_snapshot)
1686 self.force: SynchronizedBool = SynchronizedBool(args.force)
1687 self.force_once: bool = args.force_once
1688 self.force_unmount: str = "-f" if args.force_unmount else ""
1689 force_hard: str = "-R" if args.force_destroy_dependents else ""
1690 self.force_hard: str = "-R" if args.force_hard else force_hard # --force-hard is deprecated
1692 self.skip_parent: bool = args.skip_parent
1693 self.skip_missing_snapshots: str = args.skip_missing_snapshots
1694 self.skip_on_error: str = args.skip_on_error
1695 self.retry_policy: RetryPolicy = RetryPolicy(args, self)
1696 self.skip_replication: bool = args.skip_replication
1697 self.delete_dst_snapshots: bool = args.delete_dst_snapshots is not None
1698 self.delete_dst_bookmarks: bool = args.delete_dst_snapshots == "bookmarks"
1699 self.delete_dst_snapshots_no_crosscheck: bool = args.delete_dst_snapshots_no_crosscheck
1700 self.delete_dst_snapshots_except: bool = args.delete_dst_snapshots_except
1701 self.delete_dst_datasets: bool = args.delete_dst_datasets
1702 self.delete_empty_dst_datasets: bool = args.delete_empty_dst_datasets is not None
1703 self.delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots: bool = (
1704 args.delete_empty_dst_datasets == "snapshots+bookmarks"
1705 )
1706 self.compare_snapshot_lists: str = args.compare_snapshot_lists
1707 self.daemon_lifetime_nanos: int = 1_000_000 * parse_duration_to_milliseconds(args.daemon_lifetime)
1708 self.daemon_frequency: str = args.daemon_frequency
1709 self.enable_privilege_elevation: bool = not args.no_privilege_elevation
1710 self.no_stream: bool = args.no_stream
1711 self.resume_recv: bool = not args.no_resume_recv
1712 self.create_bookmarks: str = "none" if args.no_create_bookmark else args.create_bookmarks # no_create_bookmark depr
1713 self.use_bookmark: bool = not args.no_use_bookmark
1715 self.src: Remote = Remote("src", args, self) # src dataset, host and ssh options
1716 self.dst: Remote = Remote("dst", args, self) # dst dataset, host and ssh options
1717 self.create_src_snapshots_config: CreateSrcSnapshotConfig = CreateSrcSnapshotConfig(args, self)
1718 self.monitor_snapshots_config: MonitorSnapshotsConfig = MonitorSnapshotsConfig(args, self)
1719 self.is_caching_snapshots: bool = args.cache_snapshots == "true"
1721 self.compression_program: str = self.program_name(args.compression_program, allow_compression=True)
1722 self.compression_program_opts: List[str] = self.split_args(args.compression_program_opts)
1723 self.getconf_program: str = self.program_name("getconf") # print number of CPUs on POSIX except Solaris
1724 self.psrinfo_program: str = self.program_name("psrinfo") # print number of CPUs on Solaris
1725 self.mbuffer_program: str = self.program_name(args.mbuffer_program)
1726 self.mbuffer_program_opts: List[str] = self.split_args(args.mbuffer_program_opts)
1727 self.ps_program: str = self.program_name(args.ps_program)
1728 self.pv_program: str = self.program_name(args.pv_program)
1729 self.pv_program_opts: List[str] = self.split_args(args.pv_program_opts)
1730 self.isatty: bool = getenv_bool("isatty", True)
1731 if args.bwlimit:
1732 self.pv_program_opts += [f"--rate-limit={self.validate_arg_str(args.bwlimit)}"]
1733 self.shell_program_local: str = "sh"
1734 self.shell_program: str = self.program_name(args.shell_program, allow_shell=True)
1735 self.ssh_program: str = self.program_name(args.ssh_program, allow_ssh=True)
1736 self.sudo_program: str = self.program_name(args.sudo_program, allow_sudo=True)
1737 self.uname_program: str = self.program_name("uname")
1738 self.zfs_program: str = self.program_name(args.zfs_program, allow_zfs=True)
1739 self.zpool_program: str = self.program_name(args.zpool_program, allow_zpool=True)
1741 # no point creating complex shell pipeline commands for tiny data transfers:
1742 self.min_pipe_transfer_size: int = getenv_int("min_pipe_transfer_size", 1024 * 1024)
1743 self.max_datasets_per_batch_on_list_snaps: int = getenv_int("max_datasets_per_batch_on_list_snaps", 1024)
1744 self.max_datasets_per_minibatch_on_list_snaps: int = getenv_int("max_datasets_per_minibatch_on_list_snaps", -1)
1745 self.max_snapshots_per_minibatch_on_delete_snaps = getenv_int("max_snapshots_per_minibatch_on_delete_snaps", 2**29)
1746 self.dedicated_tcp_connection_per_zfs_send: bool = getenv_bool("dedicated_tcp_connection_per_zfs_send", True)
1747 self.threads: Tuple[int, bool] = (1, False) if self.force_once else args.threads
1748 timeout_nanos = None if args.timeout is None else 1_000_000 * parse_duration_to_milliseconds(args.timeout)
1749 self.timeout_nanos: Optional[int] = timeout_nanos
1750 self.no_estimate_send_size: bool = args.no_estimate_send_size
1751 self.remote_conf_cache_ttl_nanos: int = 1_000_000 * parse_duration_to_milliseconds(args.daemon_remote_conf_cache_ttl)
1752 self.terminal_columns: int = (
1753 getenv_int("terminal_columns", shutil.get_terminal_size(fallback=(120, 24)).columns)
1754 if self.isatty and self.pv_program != disable_prg and not self.quiet
1755 else 0
1756 )
1758 self.os_cpu_count: Optional[int] = os.cpu_count()
1759 self.os_geteuid: int = os.geteuid()
1760 self.prog_version: str = __version__
1761 self.python_version: str = sys.version
1762 self.platform_version: str = platform.version()
1763 self.platform_platform: str = platform.platform()
1765 # mutable variables:
1766 snapshot_filters = args.snapshot_filters_var if hasattr(args, snapshot_filters_var) else [[]]
1767 self.snapshot_filters: List[List[SnapshotFilter]] = [optimize_snapshot_filters(f) for f in snapshot_filters]
1768 self.exclude_dataset_property: Optional[str] = args.exclude_dataset_property
1769 self.exclude_dataset_regexes: RegexList = [] # deferred to validate_task() phase
1770 self.include_dataset_regexes: RegexList = [] # deferred to validate_task() phase
1771 self.tmp_exclude_dataset_regexes: RegexList = [] # deferred to validate_task() phase
1772 self.tmp_include_dataset_regexes: RegexList = [] # deferred to validate_task() phase
1773 self.abs_exclude_datasets: List[str] = [] # deferred to validate_task() phase
1774 self.abs_include_datasets: List[str] = [] # deferred to validate_task() phase
1776 self.curr_zfs_send_program_opts: List[str] = []
1777 self.zfs_recv_ox_names: Set[str] = set()
1778 self.available_programs: Dict[str, Dict[str, str]] = {}
1779 self.zpool_features: Dict[str, Dict[str, str]] = {}
1780 self.connection_pools: Dict[str, "ConnectionPools"] = {}
1782 def split_args(self, text: str, *items: Union[str, Iterable[str]], allow_all: bool = False) -> List[str]:
1783 """Splits option string on runs of one or more whitespace into an option list."""
1784 text = text.strip()
1785 opts = self.one_or_more_whitespace_regex.split(text) if text else []
1786 xappend(opts, items)
1787 if not allow_all:
1788 self.validate_quoting(opts)
1789 return opts
1791 def validate_arg(self, opt: str, allow_spaces: bool = False, allow_all: bool = False) -> Optional[str]:
1792 """allow_all permits all characters, including whitespace and quotes. See squote() and dquote()."""
1793 if allow_all or opt is None:
1794 return opt
1795 if any(char.isspace() and (char != " " or not allow_spaces) for char in opt):
1796 die(f"Option must not contain a whitespace character{' other than space' if allow_spaces else ''}: {opt}")
1797 self.validate_quoting([opt])
1798 return opt
1800 def validate_arg_str(self, opt: str, allow_spaces: bool = False, allow_all: bool = False) -> str:
1801 if opt is None:
1802 die("Option must not be missing")
1803 self.validate_arg(opt, allow_spaces=allow_spaces, allow_all=allow_all)
1804 return opt
1806 @staticmethod
1807 def validate_quoting(opts: List[str]) -> None:
1808 for opt in opts:
1809 if "'" in opt or '"' in opt or "$" in opt or "`" in opt:
1810 die(f"Option must not contain a single quote or double quote or dollar or backtick character: {opt}")
1812 @staticmethod
1813 def fix_recv_opts(opts: List[str]) -> List[str]:
1814 return fix_send_recv_opts(
1815 opts, exclude_long_opts={"--dryrun"}, exclude_short_opts="n", include_arg_opts={"-o", "-x"}
1816 )
1818 @staticmethod
1819 def fix_send_opts(opts: List[str]) -> List[str]:
1820 return fix_send_recv_opts(
1821 opts,
1822 exclude_long_opts={"--dryrun"},
1823 exclude_short_opts="den",
1824 include_arg_opts={"-X", "--exclude", "--redact"},
1825 exclude_arg_opts=frozenset({"-i", "-I"}),
1826 )
1828 def program_name(
1829 self,
1830 program: str,
1831 allow_shell: bool = False,
1832 allow_sudo: bool = False,
1833 allow_ssh: bool = False,
1834 allow_zfs: bool = False,
1835 allow_zpool: bool = False,
1836 allow_compression: bool = False,
1837 ) -> str:
1838 """For testing: helps simulate errors caused by external programs."""
1839 self.validate_arg_str(program)
1840 self.program_validator.validate_program(
1841 program,
1842 allow_shell=allow_shell,
1843 allow_sudo=allow_sudo,
1844 allow_ssh=allow_ssh,
1845 allow_zfs=allow_zfs,
1846 allow_zpool=allow_zpool,
1847 allow_compression=allow_compression,
1848 )
1849 if self.inject_params.get("inject_unavailable_" + program, False):
1850 return program + "-xxx" # substitute a program that cannot be found on the PATH
1851 if self.inject_params.get("inject_failing_" + program, False):
1852 return "false" # substitute a program that will error out with non-zero return code
1853 return program
1855 def unset_matching_env_vars(self, args: argparse.Namespace) -> None:
1856 exclude_envvar_regexes = compile_regexes(args.exclude_envvar_regex)
1857 include_envvar_regexes = compile_regexes(args.include_envvar_regex)
1858 for envvar_name in list(os.environ.keys()):
1859 if is_included(envvar_name, exclude_envvar_regexes, include_envvar_regexes):
1860 os.environ.pop(envvar_name, None)
1861 self.log.debug("Unsetting b/c envvar regex: %s", envvar_name)
1863 def lock_file_name(self) -> str:
1864 """Makes it such that a job that runs periodically declines to start if the same previous periodic
1865 job is still running without completion yet."""
1866 # fmt: off
1867 key = (tuple(self.root_dataset_pairs), self.args.recursive, self.args.exclude_dataset_property,
1868 tuple(self.args.include_dataset), tuple(self.args.exclude_dataset),
1869 tuple(self.args.include_dataset_regex), tuple(self.args.exclude_dataset_regex),
1870 tuple(tuple(f) for f in self.snapshot_filters), self.args.skip_replication, self.args.create_src_snapshots,
1871 self.args.create_src_snapshots_plan, self.args.create_src_snapshots_timeformat,
1872 self.create_src_snapshots_config.anchors,
1873 self.args.delete_dst_datasets, self.args.delete_dst_snapshots, self.args.delete_dst_snapshots_except,
1874 self.args.delete_empty_dst_datasets,
1875 self.args.compare_snapshot_lists, self.args.monitor_snapshots,
1876 self.args.log_file_infix,
1877 self.src.basis_ssh_host, self.dst.basis_ssh_host,
1878 self.src.basis_ssh_user, self.dst.basis_ssh_user)
1879 # fmt: on
1880 hash_code = hashlib.sha256(str(key).encode("utf-8")).hexdigest()
1881 return os.path.join(tempfile.gettempdir(), f"{prog_name}-lockfile-{hash_code}.lock")
1883 def dry(self, msg: str) -> str:
1884 return "Dry " + msg if self.dry_run else msg
1887#############################################################################
1888class Remote:
1889 def __init__(self, loc: str, args: argparse.Namespace, p: Params) -> None:
1890 """Option values for either location=='src' or location=='dst'; reads from ArgumentParser via args."""
1891 # immutable variables:
1892 assert loc == "src" or loc == "dst"
1893 self.location: str = loc
1894 self.params = p
1895 self.basis_ssh_user: str = getattr(args, f"ssh_{loc}_user")
1896 self.basis_ssh_host: str = getattr(args, f"ssh_{loc}_host")
1897 self.ssh_port: int = getattr(args, f"ssh_{loc}_port")
1898 self.ssh_config_file: Optional[str] = p.validate_arg(getattr(args, f"ssh_{loc}_config_file"))
1899 self.ssh_cipher: Optional[str] = p.validate_arg(args.ssh_cipher)
1900 self.ssh_private_key_files: List[str] = [p.validate_arg_str(key) for key in getattr(args, f"ssh_{loc}_private_key")]
1901 # disable interactive password prompts and X11 forwarding and pseudo-terminal allocation:
1902 self.ssh_extra_opts: List[str] = ["-oBatchMode=yes", "-oServerAliveInterval=0", "-x", "-T"]
1903 self.ssh_extra_opts += p.split_args(getattr(args, f"ssh_{loc}_extra_opts"))
1904 for extra_opt in getattr(args, f"ssh_{loc}_extra_opt"):
1905 self.ssh_extra_opts.append(p.validate_arg_str(extra_opt, allow_spaces=True))
1906 self.max_concurrent_ssh_sessions_per_tcp_connection: int = args.max_concurrent_ssh_sessions_per_tcp_connection
1907 self.reuse_ssh_connection: bool = getenv_bool("reuse_ssh_connection", True)
1908 if self.reuse_ssh_connection:
1909 self.ssh_socket_dir: str = os.path.join(get_home_directory(), ".ssh", "bzfs")
1910 os.makedirs(os.path.dirname(self.ssh_socket_dir), exist_ok=True)
1911 os.makedirs(self.ssh_socket_dir, mode=stat.S_IRWXU, exist_ok=True) # aka chmod u=rwx,go=
1912 self.socket_prefix = "s"
1913 delete_stale_files(self.ssh_socket_dir, self.socket_prefix, ssh=True)
1914 self.sanitize1_regex = re.compile(r"[\s\\/@$]") # replace whitespace, /, $, \, @ with a ~ tilde char
1915 self.sanitize2_regex = re.compile(rf"[^a-zA-Z0-9{re.escape('~.:_-')}]") # Remove chars not in the allowed set
1917 # mutable variables:
1918 self.root_dataset: str = "" # deferred until run_main()
1919 self.basis_root_dataset: str = "" # deferred until run_main()
1920 self.pool: str = ""
1921 self.sudo: str = ""
1922 self.use_zfs_delegation: bool = False
1923 self.ssh_user: str = ""
1924 self.ssh_host: str = ""
1925 self.ssh_user_host: str = ""
1926 self.is_nonlocal: bool = False
1928 def local_ssh_command(self) -> List[str]:
1929 """Returns the ssh CLI command to run locally in order to talk to the remote host. This excludes the (trailing)
1930 command to run on the remote host, which will be appended later."""
1931 if self.ssh_user_host == "":
1932 return [] # dataset is on local host - don't use ssh
1934 # dataset is on remote host
1935 p = self.params
1936 if p.ssh_program == disable_prg:
1937 die("Cannot talk to remote host because ssh CLI is disabled.")
1938 ssh_cmd = [p.ssh_program] + self.ssh_extra_opts
1939 if self.ssh_config_file:
1940 ssh_cmd += ["-F", self.ssh_config_file]
1941 for ssh_private_key_file in self.ssh_private_key_files:
1942 ssh_cmd += ["-i", ssh_private_key_file]
1943 if self.ssh_cipher:
1944 ssh_cmd += ["-c", self.ssh_cipher]
1945 if self.ssh_port:
1946 ssh_cmd += ["-p", str(self.ssh_port)]
1947 if self.reuse_ssh_connection:
1948 # Performance: reuse ssh connection for low latency startup of frequent ssh invocations via the 'ssh -S' and
1949 # 'ssh -S -M -oControlPersist=60s' options. See https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Multiplexing
1950 # Generate unique private Unix domain socket file name in user's home dir and pass it to 'ssh -S /path/to/socket'
1951 def sanitize(name: str) -> str:
1952 name = self.sanitize1_regex.sub("~", name) # replace whitespace, /, $, \, @ with a ~ tilde char
1953 name = self.sanitize2_regex.sub("", name) # Remove chars not in the allowed set
1954 return name
1956 unique = f"{os.getpid()}@{time.time_ns()}@{random.SystemRandom().randint(0, 999_999_999_999)}"
1957 socket_name = f"{self.socket_prefix}{unique}@{sanitize(self.ssh_host)[:45]}@{sanitize(self.ssh_user)}"
1958 socket_file = os.path.join(self.ssh_socket_dir, socket_name)[: max(100, len(self.ssh_socket_dir) + 10)]
1959 ssh_cmd += ["-S", socket_file]
1960 ssh_cmd += [self.ssh_user_host]
1961 return ssh_cmd
1963 def cache_key(self) -> Tuple:
1964 # fmt: off
1965 return (self.location, self.pool, self.ssh_user_host, self.ssh_port, self.ssh_config_file, self.ssh_cipher,
1966 tuple(self.ssh_private_key_files), tuple(self.ssh_extra_opts))
1967 # fmt: on
1969 def __repr__(self) -> str:
1970 return str(self.__dict__)
1973#############################################################################
1974class CopyPropertiesConfig:
1975 def __init__(self, group: str, flag: str, args: argparse.Namespace, p: Params) -> None:
1976 """Option values for --zfs-recv-o* and --zfs-recv-x* option groups; reads from ArgumentParser via args."""
1977 # immutable variables:
1978 grup = group
1979 self.group: str = group
1980 self.flag: str = flag # one of -o or -x
1981 sources: str = p.validate_arg_str(getattr(args, f"{grup}_sources"))
1982 self.sources: str = ",".join(sorted([s.strip() for s in sources.strip().split(",")])) # canonicalize
1983 self.targets: str = p.validate_arg_str(getattr(args, f"{grup}_targets"))
1984 self.include_regexes: RegexList = compile_regexes(getattr(args, f"{grup}_include_regex"))
1985 self.exclude_regexes: RegexList = compile_regexes(getattr(args, f"{grup}_exclude_regex"))
1987 def __repr__(self) -> str:
1988 return str(self.__dict__)
1991#############################################################################
1992class RetryPolicy:
1993 def __init__(self, args: argparse.Namespace, p: Params) -> None:
1994 """Option values for retries; reads from ArgumentParser via args."""
1995 # immutable variables:
1996 self.retries: int = args.retries
1997 self.min_sleep_secs: float = args.retry_min_sleep_secs
1998 self.max_sleep_secs: float = args.retry_max_sleep_secs
1999 self.max_elapsed_secs: float = args.retry_max_elapsed_secs
2000 self.min_sleep_nanos: int = int(self.min_sleep_secs * 1_000_000_000)
2001 self.max_sleep_nanos: int = int(self.max_sleep_secs * 1_000_000_000)
2002 self.max_elapsed_nanos: int = int(self.max_elapsed_secs * 1_000_000_000)
2003 self.min_sleep_nanos = max(1, self.min_sleep_nanos)
2004 self.max_sleep_nanos = max(self.min_sleep_nanos, self.max_sleep_nanos)
2006 def __repr__(self) -> str:
2007 return (
2008 f"retries: {self.retries}, min_sleep_secs: {self.min_sleep_secs}, "
2009 f"max_sleep_secs: {self.max_sleep_secs}, max_elapsed_secs: {self.max_elapsed_secs}"
2010 )
2013#############################################################################
2014@dataclass(frozen=True)
2015class Retry:
2016 count: int
2019#############################################################################
2020class SnapshotLabel(NamedTuple):
2021 """Contains the individual parts that are concatenated into a ZFS snapshot name."""
2023 prefix: str # bzfs_
2024 infix: str # us-west-1_
2025 timestamp: str # 2024-11-06_08:30:05
2026 suffix: str # _hourly
2028 def __str__(self) -> str: # bzfs_us-west-1_2024-11-06_08:30:05_hourly
2029 return f"{self.prefix}{self.infix}{self.timestamp}{self.suffix}"
2031 def validate_label(self, input_text: str) -> None:
2032 name = str(self)
2033 validate_dataset_name(name, input_text)
2034 if "/" in name:
2035 die(f"Invalid ZFS snapshot name: '{name}' for: '{input_text}*'")
2036 for key, value in {"prefix": self.prefix, "infix": self.infix, "suffix": self.suffix}.items():
2037 if key == "prefix":
2038 if not value.endswith("_"):
2039 die(f"Invalid {input_text}{key}: Must end with an underscore character: '{value}'")
2040 if value.count("_") > 1:
2041 die(f"Invalid {input_text}{key}: Must not contain multiple underscore characters: '{value}'")
2042 elif key == "infix":
2043 if value:
2044 if not value.endswith("_"):
2045 die(f"Invalid {input_text}{key}: Must end with an underscore character: '{value}'")
2046 if value.count("_") > 1:
2047 die(f"Invalid {input_text}{key}: Must not contain multiple underscore characters: '{value}'")
2048 elif value:
2049 if not value.startswith("_"):
2050 die(f"Invalid {input_text}{key}: Must start with an underscore character: '{value}'")
2051 if value.count("_") > 1:
2052 die(f"Invalid {input_text}{key}: Must not contain multiple underscore characters: '{value}'")
2055#############################################################################
2056class SnapshotPeriods: # thread-safe
2057 def __init__(self) -> None:
2058 # immutable variables:
2059 self.suffix_milliseconds: Final = {
2060 "yearly": 365 * 86400 * 1000,
2061 "monthly": round(30.5 * 86400 * 1000),
2062 "weekly": 7 * 86400 * 1000,
2063 "daily": 86400 * 1000,
2064 "hourly": 60 * 60 * 1000,
2065 "minutely": 60 * 1000,
2066 "secondly": 1000,
2067 "millisecondly": 1,
2068 }
2069 self.period_labels: Final = {
2070 "yearly": "years",
2071 "monthly": "months",
2072 "weekly": "weeks",
2073 "daily": "days",
2074 "hourly": "hours",
2075 "minutely": "minutes",
2076 "secondly": "seconds",
2077 "millisecondly": "milliseconds",
2078 }
2079 self._suffix_regex0: Final = re.compile(rf"([1-9][0-9]*)?({'|'.join(self.suffix_milliseconds.keys())})")
2080 self._suffix_regex1: Final = re.compile("_" + self._suffix_regex0.pattern)
2082 def suffix_to_duration0(self, suffix: str) -> Tuple[int, str]:
2083 return self._suffix_to_duration(suffix, self._suffix_regex0)
2085 def suffix_to_duration1(self, suffix: str) -> Tuple[int, str]:
2086 return self._suffix_to_duration(suffix, self._suffix_regex1)
2088 @staticmethod
2089 def _suffix_to_duration(suffix: str, regex: re.Pattern) -> Tuple[int, str]:
2090 """Ex: Converts "2 hourly" to (2, "hourly") and "hourly" to (1, "hourly"), i.e. perform some action every N hours."""
2091 if match := regex.fullmatch(suffix):
2092 duration_amount = int(match.group(1)) if match.group(1) else 1
2093 assert duration_amount > 0
2094 duration_unit = match.group(2)
2095 return duration_amount, duration_unit
2096 else:
2097 return 0, ""
2099 def label_milliseconds(self, snapshot: str) -> int:
2100 i = snapshot.rfind("_")
2101 snapshot = "" if i < 0 else snapshot[i + 1 :]
2102 duration_amount, duration_unit = self._suffix_to_duration(snapshot, self._suffix_regex0)
2103 return duration_amount * self.suffix_milliseconds.get(duration_unit, 0)
2106#############################################################################
2107class CreateSrcSnapshotConfig:
2108 def __init__(self, args: argparse.Namespace, p: Params) -> None:
2109 """Option values for --create-src-snapshots*; reads from ArgumentParser via args."""
2110 # immutable variables:
2111 self.skip_create_src_snapshots: bool = not args.create_src_snapshots
2112 self.create_src_snapshots_even_if_not_due: bool = args.create_src_snapshots_even_if_not_due
2113 tz_spec: Optional[str] = args.create_src_snapshots_timezone if args.create_src_snapshots_timezone else None
2114 self.tz: Optional[tzinfo] = get_timezone(tz_spec)
2115 self.current_datetime: datetime = current_datetime(tz_spec)
2116 self.timeformat: str = args.create_src_snapshots_timeformat
2117 self.anchors: PeriodAnchors = PeriodAnchors.parse(args)
2119 # Compute the schedule for upcoming periodic time events (suffix_durations). This event schedule is also used in
2120 # daemon mode via sleep_until_next_daemon_iteration()
2121 suffixes: List[str] = []
2122 labels = []
2123 create_src_snapshots_plan = args.create_src_snapshots_plan or str({"bzfs": {"onsite": {"adhoc": 1}}})
2124 for org, target_periods in ast.literal_eval(create_src_snapshots_plan).items():
2125 for target, periods in target_periods.items():
2126 for period_unit, period_amount in periods.items(): # e.g. period_unit can be "10minutely" or "minutely"
2127 if not isinstance(period_amount, int) or period_amount < 0:
2128 die(f"--create-src-snapshots-plan: Period amount must be a non-negative integer: {period_amount}")
2129 if period_amount > 0:
2130 suffix = nsuffix(period_unit)
2131 suffixes.append(suffix)
2132 labels.append(SnapshotLabel(prefix=nprefix(org), infix=ninfix(target), timestamp="", suffix=suffix))
2133 xperiods: SnapshotPeriods = p.xperiods
2134 if self.skip_create_src_snapshots:
2135 duration_amount, duration_unit = p.xperiods.suffix_to_duration0(p.daemon_frequency)
2136 if duration_amount <= 0 or not duration_unit:
2137 die(f"Invalid --daemon-frequency: {p.daemon_frequency}")
2138 suffixes = [nsuffix(p.daemon_frequency)]
2139 labels = []
2140 suffix_durations = {suffix: xperiods.suffix_to_duration1(suffix) for suffix in suffixes}
2142 def suffix_key(suffix: str) -> Tuple[int, str]:
2143 duration_amount, duration_unit = suffix_durations[suffix]
2144 duration_milliseconds = duration_amount * xperiods.suffix_milliseconds.get(duration_unit, 0)
2145 if suffix.endswith("hourly") or suffix.endswith("minutely") or suffix.endswith("secondly"):
2146 if duration_milliseconds != 0 and 86400 * 1000 % duration_milliseconds != 0:
2147 die(
2148 "Invalid --create-src-snapshots-plan: Period duration should be a divisor of 86400 seconds "
2149 f"without remainder so that snapshots will be created at the same time of day every day: {suffix}"
2150 )
2151 if suffix.endswith("monthly"):
2152 if duration_amount != 0 and 12 % duration_amount != 0:
2153 die(
2154 "Invalid --create-src-snapshots-plan: Period duration should be a divisor of 12 months "
2155 f"without remainder so that snapshots will be created at the same time every year: {suffix}"
2156 )
2157 return duration_milliseconds, suffix
2159 suffixes = sorted(suffixes, key=suffix_key, reverse=True) # take snapshots for dailies before hourlies, and so on
2160 self.suffix_durations: Dict[str, Tuple[int, str]] = {suffix: suffix_durations[suffix] for suffix in suffixes} # sort
2161 suffix_indexes = {suffix: k for k, suffix in enumerate(suffixes)}
2162 labels.sort(key=lambda label: (suffix_indexes[label.suffix], label)) # take snapshots for dailies before hourlies
2163 self._snapshot_labels: List[SnapshotLabel] = labels
2164 for label in self.snapshot_labels():
2165 label.validate_label("--create-src-snapshots-plan ")
2167 def snapshot_labels(self) -> List[SnapshotLabel]:
2168 """Returns the snapshot name patterns for which snapshots shall be created."""
2169 timeformat = self.timeformat
2170 is_millis = timeformat.endswith("%F") # non-standard hack to append milliseconds
2171 if is_millis:
2172 timeformat = timeformat[0:-1] + "f" # replace %F with %f (append microseconds)
2173 timestamp: str = self.current_datetime.strftime(timeformat)
2174 if is_millis:
2175 timestamp = timestamp[0 : -len("000")] # replace microseconds with milliseconds
2176 timestamp = timestamp.replace("+", "z") # zfs CLI does not accept the '+' character in snapshot names
2177 return [SnapshotLabel(label.prefix, label.infix, timestamp, label.suffix) for label in self._snapshot_labels]
2179 def __repr__(self) -> str:
2180 return str(self.__dict__)
2183#############################################################################
2184@dataclass(frozen=True)
2185class AlertConfig:
2186 kind: Literal["Latest", "Oldest"]
2187 warning_millis: int
2188 critical_millis: int
2191#############################################################################
2192@dataclass(frozen=True)
2193class MonitorSnapshotAlert:
2194 label: SnapshotLabel
2195 latest: Optional[AlertConfig]
2196 oldest: Optional[AlertConfig]
2199#############################################################################
2200class MonitorSnapshotsConfig:
2201 def __init__(self, args: argparse.Namespace, p: Params) -> None:
2202 """Option values for --monitor-snapshots*; reads from ArgumentParser via args."""
2203 # immutable variables:
2204 self.monitor_snapshots: Dict = ast.literal_eval(args.monitor_snapshots)
2205 self.dont_warn: bool = args.monitor_snapshots_dont_warn
2206 self.dont_crit: bool = args.monitor_snapshots_dont_crit
2207 self.no_latest_check: bool = args.monitor_snapshots_no_latest_check
2208 self.no_oldest_check: bool = args.monitor_snapshots_no_oldest_check
2209 alerts = []
2210 xperiods: SnapshotPeriods = p.xperiods
2211 for org, target_periods in self.monitor_snapshots.items():
2212 prefix = nprefix(org)
2213 for target, periods in target_periods.items():
2214 for period_unit, alert_dicts in periods.items(): # e.g. period_unit can be "10minutely" or "minutely"
2215 label = SnapshotLabel(prefix=prefix, infix=ninfix(target), timestamp="", suffix=nsuffix(period_unit))
2216 alert_latest, alert_oldest = None, None
2217 for alert_type, alert_dict in alert_dicts.items():
2218 m = "--monitor-snapshots: "
2219 if alert_type not in ["latest", "oldest"]:
2220 die(f"{m}'{alert_type}' must be 'latest' or 'oldest' within {args.monitor_snapshots}")
2221 warning_millis: int = 0
2222 critical_millis: int = 0
2223 cycles: int = 1
2224 for kind, value in alert_dict.items():
2225 context = args.monitor_snapshots
2226 if kind == "warning":
2227 warning_millis = max(0, parse_duration_to_milliseconds(str(value), context=context))
2228 elif kind == "critical":
2229 critical_millis = max(0, parse_duration_to_milliseconds(str(value), context=context))
2230 elif kind == "cycles":
2231 cycles = max(0, int(value))
2232 else:
2233 die(f"{m}'{kind}' must be 'warning', 'critical' or 'cycles' within {context}")
2234 if warning_millis > 0 or critical_millis > 0:
2235 duration_amount, duration_unit = xperiods.suffix_to_duration1(label.suffix)
2236 duration_milliseconds = duration_amount * xperiods.suffix_milliseconds.get(duration_unit, 0)
2237 warning_millis += 0 if warning_millis <= 0 else cycles * duration_milliseconds
2238 critical_millis += 0 if critical_millis <= 0 else cycles * duration_milliseconds
2239 warning_millis = unixtime_infinity_secs if warning_millis <= 0 else warning_millis
2240 critical_millis = unixtime_infinity_secs if critical_millis <= 0 else critical_millis
2241 capitalized_alert_type = cast(Literal["Latest", "Oldest"], sys.intern(alert_type.capitalize()))
2242 alert_config = AlertConfig(capitalized_alert_type, warning_millis, critical_millis)
2243 if alert_type == "latest":
2244 if not self.no_latest_check:
2245 alert_latest = alert_config
2246 else:
2247 assert alert_type == "oldest"
2248 if not self.no_oldest_check:
2249 alert_oldest = alert_config
2250 if alert_latest is not None or alert_oldest is not None:
2251 alerts.append(MonitorSnapshotAlert(label, alert_latest, alert_oldest))
2253 def alert_sort_key(alert: MonitorSnapshotAlert) -> Tuple[int, SnapshotLabel]:
2254 duration_amount, duration_unit = xperiods.suffix_to_duration1(alert.label.suffix)
2255 duration_milliseconds = duration_amount * xperiods.suffix_milliseconds.get(duration_unit, 0)
2256 return duration_milliseconds, alert.label
2258 alerts.sort(key=alert_sort_key, reverse=True) # check snapshots for dailies before hourlies, and so on
2259 self.alerts: List[MonitorSnapshotAlert] = alerts
2260 self.enable_monitor_snapshots: bool = len(alerts) > 0
2262 def __repr__(self) -> str:
2263 return str(self.__dict__)
2266#############################################################################
2267@dataclass(frozen=True)
2268class RemoteConfCacheItem:
2269 connection_pools: "ConnectionPools"
2270 available_programs: Dict[str, str]
2271 zpool_features: Dict[str, str]
2272 timestamp_nanos: int = field(default_factory=time.monotonic_ns)
2275#############################################################################
2276def main() -> None:
2277 """API for command line clients."""
2278 try:
2279 run_main(argument_parser().parse_args(), sys.argv)
2280 except subprocess.CalledProcessError as e:
2281 sys.exit(e.returncode)
2284def run_main(args: argparse.Namespace, sys_argv: Optional[List[str]] = None, log: Optional[Logger] = None) -> None:
2285 """API for Python clients; visible for testing; may become a public API eventually."""
2286 Job().run_main(args, sys_argv, log)
2289#############################################################################
2290class Job:
2291 def __init__(self) -> None:
2292 self.params: Params
2293 self.all_dst_dataset_exists: Dict[str, Dict[str, bool]] = defaultdict(lambda: defaultdict(bool))
2294 self.dst_dataset_exists: SynchronizedDict[str, bool] = SynchronizedDict({})
2295 self.src_properties: Dict[str, Dict[str, Union[str, int]]] = {}
2296 self.dst_properties: Dict[str, Dict[str, Union[str, int]]] = {}
2297 self.all_exceptions: List[str] = []
2298 self.all_exceptions_count = 0
2299 self.max_exceptions_to_summarize = 10000
2300 self.first_exception: Optional[BaseException] = None
2301 self.remote_conf_cache: Dict[Tuple, RemoteConfCacheItem] = {}
2302 self.dedicated_tcp_connection_per_zfs_send: bool = True
2303 self.max_datasets_per_minibatch_on_list_snaps: Dict[str, int] = {}
2304 self.max_workers: Dict[str, int] = {}
2305 self.re_suffix = r"(?:/.*)?" # also match descendants of a matching dataset
2306 self.stats_lock = threading.Lock()
2307 self.num_cache_hits: int = 0
2308 self.num_cache_misses: int = 0
2309 self.num_snapshots_found: int = 0
2310 self.num_snapshots_replicated: int = 0
2311 self.control_persist_secs: int = 90
2312 self.control_persist_margin_secs: int = 2
2313 self.progress_reporter: ProgressReporter = cast(ProgressReporter, None)
2314 self.is_first_replication_task: SynchronizedBool = SynchronizedBool(True)
2315 self.replication_start_time_nanos: int = time.monotonic_ns()
2316 self.timeout_nanos: Optional[int] = None
2318 self.is_test_mode: bool = False # for testing only
2319 self.creation_prefix = "" # for testing only
2320 self.isatty: Optional[bool] = None # for testing only
2321 self.use_select: bool = False # for testing only
2322 self.progress_update_intervals: Optional[Tuple[float, float]] = None # for testing only
2323 self.error_injection_triggers: Dict[str, Counter] = {} # for testing only
2324 self.delete_injection_triggers: Dict[str, Counter] = {} # for testing only
2325 self.param_injection_triggers: Dict[str, Dict[str, bool]] = {} # for testing only
2326 self.inject_params: Dict[str, bool] = {} # for testing only
2327 self.injection_lock = threading.Lock() # for testing only
2328 self.max_command_line_bytes: Optional[int] = None # for testing only
2330 def shutdown(self) -> None:
2331 """Exit any multiplexed ssh sessions that may be leftover."""
2332 cache_items = self.remote_conf_cache.values()
2333 for i, cache_item in enumerate(cache_items):
2334 cache_item.connection_pools.shutdown(f"{i + 1}/{len(cache_items)}")
2336 def terminate(self, old_term_handler: Any, except_current_process: bool = False) -> None:
2337 def post_shutdown() -> None:
2338 signal.signal(signal.SIGTERM, old_term_handler) # restore original signal handler
2339 terminate_process_subtree(except_current_process=except_current_process)
2341 with xfinally(post_shutdown):
2342 self.shutdown()
2344 def run_main(self, args: argparse.Namespace, sys_argv: Optional[List[str]] = None, log: Optional[Logger] = None) -> None:
2345 assert isinstance(self.error_injection_triggers, dict)
2346 assert isinstance(self.delete_injection_triggers, dict)
2347 assert isinstance(self.inject_params, dict)
2348 log_params = LogParams(args)
2349 with xfinally(reset_logger): # runs reset_logger() on exit, without masking exception raised in body of `with` block
2350 log = get_logger(log_params, args, log)
2351 log.info("%s", f"Log file is: {log_params.log_file}")
2352 aux_args: List[str] = []
2353 if getattr(args, "include_snapshot_plan", None):
2354 aux_args += args.include_snapshot_plan
2355 if getattr(args, "delete_dst_snapshots_except_plan", None):
2356 aux_args += args.delete_dst_snapshots_except_plan
2357 if len(aux_args) > 0:
2358 log.info("Auxiliary CLI arguments: %s", " ".join(aux_args))
2359 args = argument_parser().parse_args(xappend(aux_args, "--", args.root_dataset_pairs), namespace=args)
2360 log.info("CLI arguments: %s %s", " ".join(sys_argv or []), f"[euid: {os.geteuid()}]")
2361 log.log(log_trace, "Parsed CLI arguments: %s", args)
2362 try:
2363 self.params = p = Params(args, sys_argv, log_params, log, self.inject_params)
2364 except SystemExit as e:
2365 log.error("%s", e)
2366 raise
2367 log_params.params = p
2368 with open(log_params.log_file, "a", encoding="utf-8") as log_file_fd:
2369 with contextlib.redirect_stderr(cast(TextIO, Tee(log_file_fd, sys.stderr))): # send stderr to logfile+stderr
2370 lock_file = p.lock_file_name()
2371 with open(lock_file, "w") as lock_fd:
2372 try:
2373 # Acquire an exclusive lock; will raise an error if lock is already held by another process.
2374 # The (advisory) lock is auto-released when the process terminates or the fd is closed.
2375 fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) # LOCK_NB ... non-blocking
2376 except BlockingIOError:
2377 msg = "Exiting as same previous periodic job is still running without completion yet per "
2378 msg += lock_file
2379 log.error("%s", msg)
2380 die(msg, still_running_status)
2381 with xfinally(lambda: Path(lock_file).unlink(missing_ok=True)): # avoid accumulating stale lockfiles
2382 # On CTRL-C and SIGTERM, send signal also to descendant processes to also terminate descendants
2383 old_term_handler = signal.getsignal(signal.SIGTERM)
2384 signal.signal(signal.SIGTERM, lambda sig, f: self.terminate(old_term_handler))
2385 old_int_handler = signal.signal(signal.SIGINT, lambda sig, f: self.terminate(old_term_handler))
2386 try:
2387 self.run_tasks()
2388 except BaseException as e:
2389 self.terminate(old_term_handler, except_current_process=True)
2390 raise e
2391 finally:
2392 signal.signal(signal.SIGTERM, old_term_handler) # restore original signal handler
2393 signal.signal(signal.SIGINT, old_int_handler) # restore original signal handler
2394 for _ in range(2 if self.max_command_line_bytes else 1):
2395 self.shutdown()
2397 def run_tasks(self) -> None:
2398 def log_error_on_exit(error: Any, status_code: Any) -> None:
2399 log.error("%s%s", f"Exiting {prog_name} with status code {status_code}. Cause: ", error)
2401 p, log = self.params, self.params.log
2402 try:
2403 self.all_exceptions = []
2404 self.all_exceptions_count = 0
2405 self.first_exception = None
2406 self.remote_conf_cache = {}
2407 self.isatty = self.isatty if self.isatty is not None else p.isatty
2408 self.validate_once()
2409 self.replication_start_time_nanos = time.monotonic_ns()
2410 self.progress_reporter = ProgressReporter(p, self.use_select, self.progress_update_intervals)
2411 with xfinally(lambda: self.progress_reporter.stop()):
2412 daemon_stoptime_nanos = time.monotonic_ns() + p.daemon_lifetime_nanos
2413 while True: # loop for daemon mode
2414 self.timeout_nanos = None if p.timeout_nanos is None else time.monotonic_ns() + p.timeout_nanos
2415 self.all_dst_dataset_exists.clear()
2416 self.progress_reporter.reset()
2417 src, dst = p.src, p.dst
2418 for src_root_dataset, dst_root_dataset in p.root_dataset_pairs:
2419 src.root_dataset = src.basis_root_dataset = src_root_dataset
2420 dst.root_dataset = dst.basis_root_dataset = dst_root_dataset
2421 p.curr_zfs_send_program_opts = p.zfs_send_program_opts.copy()
2422 if p.daemon_lifetime_nanos > 0:
2423 self.timeout_nanos = None if p.timeout_nanos is None else time.monotonic_ns() + p.timeout_nanos
2424 recsep = " " if p.recursive_flag else ""
2425 task_description = f"{src.basis_root_dataset} {p.recursive_flag}{recsep}--> {dst.basis_root_dataset}"
2426 if len(p.root_dataset_pairs) > 1:
2427 log.info("Starting task: %s", task_description + " ...")
2428 try:
2429 try:
2430 self.maybe_inject_error(cmd=[], error_trigger="retryable_run_tasks")
2431 self.timeout()
2432 self.validate_task()
2433 self.run_task()
2434 except RetryableError as retryable_error:
2435 assert retryable_error.__cause__ is not None
2436 raise retryable_error.__cause__ from None
2437 except (CalledProcessError, subprocess.TimeoutExpired, SystemExit, UnicodeDecodeError) as e:
2438 if p.skip_on_error == "fail" or (
2439 isinstance(e, subprocess.TimeoutExpired) and p.daemon_lifetime_nanos == 0
2440 ):
2441 raise
2442 log.error("%s", e)
2443 self.append_exception(e, "task", task_description)
2444 if not self.sleep_until_next_daemon_iteration(daemon_stoptime_nanos):
2445 break
2446 if not p.skip_replication:
2447 self.print_replication_stats(self.replication_start_time_nanos)
2448 error_count = self.all_exceptions_count
2449 if error_count > 0 and p.daemon_lifetime_nanos == 0:
2450 msgs = "\n".join([f"{i + 1}/{error_count}: {e}" for i, e in enumerate(self.all_exceptions)])
2451 log.error("%s", f"Tolerated {error_count} errors. Error Summary: \n{msgs}")
2452 assert self.first_exception is not None
2453 raise self.first_exception
2454 except subprocess.CalledProcessError as e:
2455 log_error_on_exit(e, e.returncode)
2456 raise
2457 except SystemExit as e:
2458 log_error_on_exit(e, e.code)
2459 raise
2460 except (subprocess.TimeoutExpired, UnicodeDecodeError) as e:
2461 log_error_on_exit(e, die_status)
2462 raise SystemExit(die_status) from e
2463 except re.error as e:
2464 log_error_on_exit(f"{e} within regex {e.pattern!r}", die_status)
2465 raise SystemExit(die_status) from e
2466 finally:
2467 log.info("%s", f"Log file was: {p.log_params.log_file}")
2469 log.info("Success. Goodbye!")
2470 print("", end="", file=sys.stderr)
2471 sys.stderr.flush()
2473 def append_exception(self, e: BaseException, task_name: str, task_description: str) -> None:
2474 self.first_exception = self.first_exception or e
2475 if len(self.all_exceptions) < self.max_exceptions_to_summarize: # cap max memory consumption
2476 self.all_exceptions.append(str(e))
2477 self.all_exceptions_count += 1
2478 self.params.log.error(f"#{self.all_exceptions_count}: Done with %s: %s", task_name, task_description)
2480 def sleep_until_next_daemon_iteration(self, daemon_stoptime_nanos: int) -> bool:
2481 sleep_nanos = daemon_stoptime_nanos - time.monotonic_ns()
2482 if sleep_nanos <= 0:
2483 return False
2484 self.progress_reporter.pause()
2485 p, log = self.params, self.params.log
2486 config = p.create_src_snapshots_config
2487 curr_datetime = config.current_datetime + timedelta(microseconds=1)
2488 next_snapshotting_event_dt = min(
2489 (
2490 round_datetime_up_to_duration_multiple(curr_datetime, duration_amount, duration_unit, config.anchors)
2491 for duration_amount, duration_unit in config.suffix_durations.values()
2492 ),
2493 default=curr_datetime + timedelta(days=10 * 365), # infinity
2494 )
2495 offset: timedelta = next_snapshotting_event_dt - datetime.now(config.tz)
2496 offset_nanos = (offset.days * 86400 + offset.seconds) * 1_000_000_000 + offset.microseconds * 1_000
2497 sleep_nanos = min(sleep_nanos, max(0, offset_nanos))
2498 log.info("Daemon sleeping for: %s%s", human_readable_duration(sleep_nanos), f" ... [Log {p.log_params.log_file}]")
2499 time.sleep(sleep_nanos / 1_000_000_000)
2500 config.current_datetime = datetime.now(config.tz)
2501 return daemon_stoptime_nanos - time.monotonic_ns() > 0
2503 def print_replication_stats(self, start_time_nanos: int) -> None:
2504 p, log = self.params, self.params.log
2505 elapsed_nanos = time.monotonic_ns() - start_time_nanos
2506 msg = p.dry(f"Replicated {self.num_snapshots_replicated} snapshots in {human_readable_duration(elapsed_nanos)}.")
2507 if self.is_program_available("pv", "local"):
2508 sent_bytes = count_num_bytes_transferred_by_zfs_send(p.log_params.pv_log_file)
2509 sent_bytes_per_sec = round(1_000_000_000 * sent_bytes / elapsed_nanos)
2510 msg += f" zfs sent {human_readable_bytes(sent_bytes)} [{human_readable_bytes(sent_bytes_per_sec)}/s]."
2511 log.info("%s", msg.ljust(p.terminal_columns - len("2024-01-01 23:58:45 [I] ")))
2513 def validate_once(self) -> None:
2514 p = self.params
2515 p.zfs_recv_ox_names = self.recv_option_property_names(p.zfs_recv_program_opts)
2516 for snapshot_filter in p.snapshot_filters:
2517 for _filter in snapshot_filter:
2518 if _filter.name == snapshot_regex_filter_name:
2519 exclude_snapshot_regexes = compile_regexes(_filter.options[0])
2520 include_snapshot_regexes = compile_regexes(_filter.options[1] or [".*"])
2521 _filter.options = (exclude_snapshot_regexes, include_snapshot_regexes)
2523 exclude_regexes = [exclude_dataset_regexes_default]
2524 if len(p.args.exclude_dataset_regex) > 0: # some patterns don't exclude anything
2525 exclude_regexes = [regex for regex in p.args.exclude_dataset_regex if regex != "" and regex != "!.*"]
2526 include_regexes = p.args.include_dataset_regex
2528 # relative datasets need not be compiled more than once as they don't change between tasks
2529 def separate_abs_vs_rel_datasets(datasets: List[str]) -> Tuple[List[str], List[str]]:
2530 abs_datasets: List[str] = []
2531 rel_datasets: List[str] = []
2532 for dataset in datasets:
2533 (abs_datasets if dataset.startswith("/") else rel_datasets).append(dataset)
2534 return abs_datasets, rel_datasets
2536 p.abs_exclude_datasets, rel_exclude_datasets = separate_abs_vs_rel_datasets(p.args.exclude_dataset)
2537 p.abs_include_datasets, rel_include_datasets = separate_abs_vs_rel_datasets(p.args.include_dataset)
2538 p.tmp_exclude_dataset_regexes, p.tmp_include_dataset_regexes = (
2539 compile_regexes(exclude_regexes + self.dataset_regexes(rel_exclude_datasets), suffix=self.re_suffix),
2540 compile_regexes(include_regexes + self.dataset_regexes(rel_include_datasets), suffix=self.re_suffix),
2541 )
2543 if p.pv_program != disable_prg:
2544 pv_program_opts_set = set(p.pv_program_opts)
2545 if pv_program_opts_set.isdisjoint({"--bytes", "-b", "--bits", "-8"}):
2546 die("--pv-program-opts must contain one of --bytes or --bits for progress metrics to function.")
2547 if self.isatty and not p.quiet:
2548 for opts in [["--eta", "-e"], ["--fineta", "-I"], ["--average-rate", "-a"]]:
2549 if pv_program_opts_set.isdisjoint(opts):
2550 die(f"--pv-program-opts must contain one of {', '.join(opts)} for progress report line to function.")
2552 src, dst = p.src, p.dst
2553 for remote in [src, dst]:
2554 r, loc = remote, remote.location
2555 validate_user_name(r.basis_ssh_user, f"--ssh-{loc}-user")
2556 validate_host_name(r.basis_ssh_host, f"--ssh-{loc}-host")
2557 validate_port(r.ssh_port, f"--ssh-{loc}-port ")
2559 def validate_task(self) -> None:
2560 p, log = self.params, self.params.log
2561 src, dst = p.src, p.dst
2562 for remote in [src, dst]:
2563 r = remote
2564 r.ssh_user, r.ssh_host, r.ssh_user_host, r.pool, r.root_dataset = parse_dataset_locator(
2565 r.basis_root_dataset, user=r.basis_ssh_user, host=r.basis_ssh_host, port=r.ssh_port
2566 )
2567 r.sudo, r.use_zfs_delegation = self.sudo_cmd(r.ssh_user_host, r.ssh_user)
2568 local_addrs = ("",) if self.is_test_mode else ("", "127.0.0.1", "::1") # ::1 is IPv6 version of loopback address
2569 remote.is_nonlocal = r.ssh_host not in local_addrs
2570 self.dst_dataset_exists = SynchronizedDict(self.all_dst_dataset_exists[dst.ssh_user_host])
2572 if src.ssh_host == dst.ssh_host:
2573 msg = f"src: {src.basis_root_dataset}, dst: {dst.basis_root_dataset}"
2574 if src.root_dataset == dst.root_dataset:
2575 die(f"Source and destination dataset must not be the same! {msg}")
2576 if p.recursive and (
2577 is_descendant(src.root_dataset, of_root_dataset=dst.root_dataset)
2578 or is_descendant(dst.root_dataset, of_root_dataset=src.root_dataset)
2579 ):
2580 die(f"Source and destination dataset trees must not overlap! {msg}")
2582 suffix = self.re_suffix # also match descendants of a matching dataset
2583 p.exclude_dataset_regexes, p.include_dataset_regexes = (
2584 p.tmp_exclude_dataset_regexes + compile_regexes(self.dataset_regexes(p.abs_exclude_datasets), suffix=suffix),
2585 p.tmp_include_dataset_regexes + compile_regexes(self.dataset_regexes(p.abs_include_datasets), suffix=suffix),
2586 )
2587 if len(p.include_dataset_regexes) == 0:
2588 p.include_dataset_regexes = [(re.compile(".*"), False)]
2590 self.detect_available_programs()
2592 zfs_send_program_opts = p.curr_zfs_send_program_opts
2593 if self.is_zpool_feature_enabled_or_active(dst, "feature@large_blocks"):
2594 append_if_absent(zfs_send_program_opts, "--large-block") # solaris-11.4 does not have this feature
2595 if self.is_solaris_zfs(dst):
2596 p.dry_run_destroy = "" # solaris-11.4 knows no 'zfs destroy -n' flag
2597 p.verbose_destroy = "" # solaris-11.4 knows no 'zfs destroy -v' flag
2598 if self.is_solaris_zfs(src): # solaris-11.4 only knows -w compress
2599 zfs_send_program_opts = ["-p" if opt == "--props" else opt for opt in zfs_send_program_opts]
2600 zfs_send_program_opts = fix_solaris_raw_mode(zfs_send_program_opts)
2601 p.curr_zfs_send_program_opts = zfs_send_program_opts
2603 self.max_workers = {}
2604 self.max_datasets_per_minibatch_on_list_snaps = {}
2605 for r in [src, dst]:
2606 cpus = int(p.available_programs[r.location].get("getconf_cpu_count", 8))
2607 threads, is_percent = p.threads
2608 cpus = max(1, round(cpus * threads / 100.0) if is_percent else round(threads))
2609 self.max_workers[r.location] = cpus
2610 bs = max(1, p.max_datasets_per_batch_on_list_snaps) # 1024 by default
2611 max_datasets_per_minibatch = p.max_datasets_per_minibatch_on_list_snaps
2612 if max_datasets_per_minibatch <= 0:
2613 max_datasets_per_minibatch = max(1, bs // cpus)
2614 max_datasets_per_minibatch = min(bs, max_datasets_per_minibatch)
2615 self.max_datasets_per_minibatch_on_list_snaps[r.location] = max_datasets_per_minibatch
2616 log.log(
2617 log_trace,
2618 "%s",
2619 f"max_datasets_per_batch_on_list_snaps: {p.max_datasets_per_batch_on_list_snaps}, "
2620 f"max_datasets_per_minibatch_on_list_snaps: {max_datasets_per_minibatch}, "
2621 f"max_workers: {self.max_workers[r.location]}, "
2622 f"location: {r.location}",
2623 )
2624 log.log(log_trace, "Validated Param values: %s", pretty_print_formatter(self.params))
2626 def sudo_cmd(self, ssh_user_host: str, ssh_user: str) -> Tuple[str, bool]:
2627 p = self.params
2628 assert isinstance(ssh_user_host, str)
2629 assert isinstance(ssh_user, str)
2630 assert isinstance(p.sudo_program, str)
2631 assert isinstance(p.enable_privilege_elevation, bool)
2633 is_root = True
2634 if ssh_user_host != "":
2635 if ssh_user == "":
2636 if os.geteuid() != 0:
2637 is_root = False
2638 elif ssh_user != "root":
2639 is_root = False
2640 elif os.geteuid() != 0:
2641 is_root = False
2643 if is_root:
2644 sudo = "" # using sudo in an attempt to make ZFS operations work even if we are not root user?
2645 use_zfs_delegation = False # or instead using 'zfs allow' delegation?
2646 return sudo, use_zfs_delegation
2647 elif p.enable_privilege_elevation:
2648 if p.sudo_program == disable_prg:
2649 die(f"sudo CLI is not available on host: {ssh_user_host or 'localhost'}")
2650 # The '-n' option makes 'sudo' safer and more fail-fast. It avoids having sudo prompt the user for input of any
2651 # kind. If a password is required for the sudo command to run, sudo will display an error message and exit.
2652 return p.sudo_program + " -n", False
2653 else:
2654 return "", True
2656 def run_task(self) -> None:
2657 def filter_src_datasets() -> List[str]: # apply --{include|exclude}-dataset policy
2658 return self.filter_datasets(src, basis_src_datasets) if src_datasets is None else src_datasets
2660 p, log = self.params, self.params.log
2661 src, dst = p.src, p.dst
2662 max_workers = min(self.max_workers[src.location], self.max_workers[dst.location])
2663 recursive_sep = " " if p.recursive_flag else ""
2664 task_description = f"{src.basis_root_dataset} {p.recursive_flag}{recursive_sep}--> {dst.basis_root_dataset} ..."
2665 failed = False
2666 src_datasets = None
2667 basis_src_datasets = []
2668 self.src_properties = {}
2669 self.dst_properties = {}
2670 if not self.is_dummy(src): # find src dataset or all datasets in src dataset tree (with --recursive)
2671 is_caching = self.is_caching_snapshots(src)
2672 props = "volblocksize,recordsize,name"
2673 props = "snapshots_changed," + props if is_caching else props
2674 cmd = p.split_args(
2675 f"{p.zfs_program} list -t filesystem,volume -s name -Hp -o {props} {p.recursive_flag}", src.root_dataset
2676 )
2677 for line in (self.try_ssh_command(src, log_debug, cmd=cmd) or "").splitlines():
2678 cols = line.split("\t")
2679 snapshots_changed, volblocksize, recordsize, src_dataset = cols if is_caching else ["-"] + cols
2680 self.src_properties[src_dataset] = {
2681 "recordsize": int(recordsize) if recordsize != "-" else -int(volblocksize),
2682 SNAPSHOTS_CHANGED: int(snapshots_changed) if snapshots_changed and snapshots_changed != "-" else 0,
2683 }
2684 basis_src_datasets.append(src_dataset)
2685 assert not self.is_test_mode or basis_src_datasets == sorted(basis_src_datasets), "List is not sorted"
2687 # Optionally, atomically create a new snapshot of the src datasets selected by --{include|exclude}-dataset* policy.
2688 # The implementation attempts to fit as many datasets as possible into a single (atomic) 'zfs snapshot' command line,
2689 # using lexicographical sort order, and using 'zfs snapshot -r' to the extent that this is compatible with the
2690 # --{include|exclude}-dataset* pruning policy. The snapshots of all datasets that fit within the same single
2691 # 'zfs snapshot' CLI invocation will be taken within the same ZFS transaction group, and correspondingly have
2692 # identical 'createtxg' ZFS property (but not necessarily identical 'creation' ZFS time property as ZFS actually
2693 # provides no such guarantee), and thus be consistent. Dataset names that can't fit into a single command line are
2694 # spread over multiple command line invocations, respecting the limits that the operating system places on the
2695 # maximum length of a single command line, per `getconf ARG_MAX`.
2696 if not p.create_src_snapshots_config.skip_create_src_snapshots:
2697 log.info(p.dry("--create-src-snapshots: %s"), f"{src.basis_root_dataset} {p.recursive_flag}{recursive_sep}...")
2698 if len(basis_src_datasets) == 0:
2699 die(f"Source dataset does not exist: {src.basis_root_dataset}")
2700 src_datasets = filter_src_datasets() # apply include/exclude policy
2701 datasets_to_snapshot: Dict[SnapshotLabel, List[str]] = self.find_datasets_to_snapshot(src_datasets)
2702 datasets_to_snapshot = {label: datasets for label, datasets in datasets_to_snapshot.items() if len(datasets) > 0}
2703 basis_datasets_to_snapshot = datasets_to_snapshot.copy() # shallow copy
2704 commands = {}
2705 for label, datasets in datasets_to_snapshot.items():
2706 cmd = p.split_args(f"{src.sudo} {p.zfs_program} snapshot")
2707 if p.recursive:
2708 # Run 'zfs snapshot -r' on the roots of subtrees if possible, else fallback to non-recursive CLI flavor
2709 root_datasets = self.root_datasets_if_recursive_zfs_snapshot_is_possible(datasets, basis_src_datasets)
2710 if root_datasets is not None:
2711 cmd.append("-r") # recursive; takes a snapshot of all datasets in the subtree(s)
2712 datasets_to_snapshot[label] = root_datasets
2713 commands[label] = cmd
2714 creation_msg = f"Creating {sum(len(datasets) for datasets in basis_datasets_to_snapshot.values())} snapshots"
2715 log.info(p.dry("--create-src-snapshots: %s"), f"{creation_msg} within {len(src_datasets)} datasets ...")
2716 # create snapshots in large (parallel) batches, without using a command line that's too big for the OS to handle
2717 self.run_ssh_cmd_parallel(
2718 src,
2719 [(commands[lbl], [f"{ds}@{lbl}" for ds in datasets]) for lbl, datasets in datasets_to_snapshot.items()],
2720 fn=lambda cmd, batch: self.run_ssh_command(src, is_dry=p.dry_run, print_stdout=True, cmd=cmd + batch),
2721 max_batch_items=1 if self.is_solaris_zfs(src) else 2**29, # solaris CLI doesn't accept multiple datasets
2722 )
2723 # perf: copy lastmodified time of source dataset into local cache to reduce future 'zfs list -t snapshot' calls
2724 self.update_last_modified_cache(basis_datasets_to_snapshot)
2726 # Optionally, replicate src.root_dataset (optionally including its descendants) to dst.root_dataset
2727 if not p.skip_replication:
2728 if len(basis_src_datasets) == 0:
2729 die(f"Replication: Source dataset does not exist: {src.basis_root_dataset}")
2730 if self.is_dummy(dst):
2731 die("Replication: Destination may be a dummy dataset only if exclusively creating snapshots on the source!")
2732 src_datasets = filter_src_datasets() # apply include/exclude policy
2733 failed = self.replicate_datasets(src_datasets, task_description, max_workers)
2735 if failed or not (
2736 p.delete_dst_datasets
2737 or p.delete_dst_snapshots
2738 or p.delete_empty_dst_datasets
2739 or p.compare_snapshot_lists
2740 or p.monitor_snapshots_config.enable_monitor_snapshots
2741 ):
2742 return
2743 log.info("Listing dst datasets: %s", task_description)
2744 if self.is_dummy(dst):
2745 die("Destination may be a dummy dataset only if exclusively creating snapshots on the source!")
2746 is_caching = self.is_caching_snapshots(dst) and p.monitor_snapshots_config.enable_monitor_snapshots
2747 props = "name"
2748 props = "snapshots_changed," + props if is_caching else props
2749 cmd = p.split_args(
2750 f"{p.zfs_program} list -t filesystem,volume -s name -Hp -o {props}", p.recursive_flag, dst.root_dataset
2751 )
2752 basis_dst_datasets = []
2753 basis_dst_datasets_str = self.try_ssh_command(dst, log_trace, cmd=cmd)
2754 if basis_dst_datasets_str is None:
2755 log.warning("Destination dataset does not exist: %s", dst.root_dataset)
2756 else:
2757 for line in basis_dst_datasets_str.splitlines():
2758 cols = line.split("\t")
2759 snapshots_changed, dst_dataset = cols if is_caching else ["-"] + cols
2760 self.dst_properties[dst_dataset] = {
2761 SNAPSHOTS_CHANGED: int(snapshots_changed) if snapshots_changed and snapshots_changed != "-" else 0,
2762 }
2763 basis_dst_datasets.append(dst_dataset)
2765 assert not self.is_test_mode or basis_dst_datasets == sorted(basis_dst_datasets), "List is not sorted"
2766 dst_datasets = self.filter_datasets(dst, basis_dst_datasets) # apply include/exclude policy
2768 # Optionally, delete existing destination datasets that do not exist within the source dataset if they are
2769 # included via --{include|exclude}-dataset* policy. Do not recurse without --recursive. With --recursive,
2770 # never delete non-selected dataset subtrees or their ancestors.
2771 if p.delete_dst_datasets and not failed:
2772 log.info(p.dry("--delete-dst-datasets: %s"), task_description)
2773 children = defaultdict(set)
2774 for dst_dataset in basis_dst_datasets: # Compute the direct children of each NON-FILTERED dataset
2775 parent = os.path.dirname(dst_dataset)
2776 children[parent].add(dst_dataset)
2777 to_delete: Set[str] = set()
2778 for dst_dataset in reversed(dst_datasets):
2779 if children[dst_dataset].issubset(to_delete):
2780 to_delete.add(dst_dataset) # all children are deletable, thus the dataset itself is deletable too
2781 to_delete = to_delete.difference(
2782 {replace_prefix(src_dataset, src.root_dataset, dst.root_dataset) for src_dataset in basis_src_datasets}
2783 )
2784 self.delete_datasets(dst, to_delete)
2785 dst_datasets = sorted(set(dst_datasets).difference(to_delete))
2786 basis_dst_datasets = sorted(set(basis_dst_datasets).difference(to_delete))
2788 # Optionally, delete existing destination snapshots that do not exist within the source dataset if they
2789 # are included by the --{include|exclude}-snapshot-* policy, and the destination dataset is included
2790 # via --{include|exclude}-dataset* policy.
2791 if p.delete_dst_snapshots and not failed:
2792 log.info(p.dry("--delete-dst-snapshots: %s"), task_description + f" [{len(dst_datasets)} datasets]")
2793 kind = "bookmark" if p.delete_dst_bookmarks else "snapshot"
2794 filter_needs_creation_time = has_timerange_filter(p.snapshot_filters)
2795 props = self.creation_prefix + "creation,guid,name" if filter_needs_creation_time else "guid,name"
2796 basis_src_datasets_set = set(basis_src_datasets)
2797 num_snapshots_found, num_snapshots_deleted = 0, 0
2799 def delete_destination_snapshots(dst_dataset: str, tid: str, retry: Retry) -> bool: # thread-safe
2800 src_dataset = replace_prefix(dst_dataset, old_prefix=dst.root_dataset, new_prefix=src.root_dataset)
2801 if src_dataset in basis_src_datasets_set and (self.are_bookmarks_enabled(src) or not p.delete_dst_bookmarks):
2802 src_kind = kind
2803 if not p.delete_dst_snapshots_no_crosscheck:
2804 src_kind = "snapshot,bookmark" if self.are_bookmarks_enabled(src) else "snapshot"
2805 src_cmd = p.split_args(f"{p.zfs_program} list -t {src_kind} -d 1 -s name -Hp -o guid", src_dataset)
2806 else:
2807 src_cmd = None
2808 dst_cmd = p.split_args(f"{p.zfs_program} list -t {kind} -d 1 -s createtxg -Hp -o {props}", dst_dataset)
2809 self.maybe_inject_delete(dst, dataset=dst_dataset, delete_trigger="zfs_list_delete_dst_snapshots")
2810 src_snaps_with_guids, dst_snaps_with_guids = self.run_in_parallel( # list src+dst snapshots in parallel
2811 lambda: set(self.run_ssh_command(src, log_trace, cmd=src_cmd).splitlines() if src_cmd else []),
2812 lambda: self.try_ssh_command(dst, log_trace, cmd=dst_cmd),
2813 )
2814 if dst_snaps_with_guids is None:
2815 log.warning("Third party deleted destination: %s", dst_dataset)
2816 return False
2817 dst_snaps_with_guids = dst_snaps_with_guids.splitlines()
2818 num_dst_snaps_with_guids = len(dst_snaps_with_guids)
2819 basis_dst_snaps_with_guids = dst_snaps_with_guids.copy()
2820 if p.delete_dst_bookmarks:
2821 replace_in_lines(dst_snaps_with_guids, old="#", new="@", count=1) # treat bookmarks as snapshots
2822 # The check against the source dataset happens *after* filtering the dst snapshots with filter_snapshots().
2823 # `p.delete_dst_snapshots_except` means the user wants to specify snapshots to *retain* aka *keep*
2824 all_except = p.delete_dst_snapshots_except
2825 if p.delete_dst_snapshots_except and not self.is_dummy(src):
2826 # However, as here we are in "except" mode AND the source is NOT a dummy, we first filter to get what
2827 # the policy says to *keep* (so all_except=False for the filter_snapshots() call), then from that "keep"
2828 # list, we later further refine by checking what's on the source dataset.
2829 all_except = False
2830 dst_snaps_with_guids = self.filter_snapshots(dst_snaps_with_guids, all_except=all_except)
2831 if p.delete_dst_bookmarks:
2832 replace_in_lines(dst_snaps_with_guids, old="@", new="#", count=1) # restore pre-filtering bookmark state
2833 if filter_needs_creation_time:
2834 dst_snaps_with_guids = cut(field=2, lines=dst_snaps_with_guids)
2835 basis_dst_snaps_with_guids = cut(field=2, lines=basis_dst_snaps_with_guids)
2836 if p.delete_dst_snapshots_except and not self.is_dummy(src): # Non-dummy Source + "Except" (Keep) Mode
2837 # Retain dst snapshots that match snapshot filter policy AND are on src dataset, aka
2838 # Delete dst snapshots except snapshots that match snapshot filter policy AND are on src dataset.
2839 # Concretely, `dst_snaps_with_guids` contains GUIDs of DST snapshots that the filter policy says to KEEP.
2840 # We only actually keep them if they are ALSO on the SRC.
2841 # So, snapshots to DELETE (`dst_tags_to_delete`) are ALL snapshots on DST (`basis_dst_snaps_with_guids`)
2842 # EXCEPT those whose GUIDs are in `dst_snaps_with_guids` AND ALSO in `src_snaps_with_guids`.
2843 except_dst_guids = set(cut(field=1, lines=dst_snaps_with_guids)).intersection(src_snaps_with_guids)
2844 dst_tags_to_delete = filter_lines_except(basis_dst_snaps_with_guids, except_dst_guids)
2845 else: # Standard Delete Mode OR Dummy Source + "Except" (Keep) Mode
2846 # In standard delete mode:
2847 # `dst_snaps_with_guids` contains GUIDs of policy-selected snapshots on DST.
2848 # We delete those that are NOT on SRC.
2849 # `dst_tags_to_delete` = `dst_snaps_with_guids` - `src_snaps_with_guids`.
2850 # In dummy source + "except" (keep) mode:
2851 # `all_except` was True.
2852 # `dst_snaps_with_guids` contains snaps NOT matching the "keep" policy -- these are the ones to delete.
2853 # `src_snaps_with_guids` is empty.
2854 # `dst_tags_to_delete` = `dst_snaps_with_guids` - {} = `dst_snaps_with_guids`.
2855 dst_guids_to_delete = set(cut(field=1, lines=dst_snaps_with_guids)).difference(src_snaps_with_guids)
2856 dst_tags_to_delete = filter_lines(dst_snaps_with_guids, dst_guids_to_delete)
2857 separator = "#" if p.delete_dst_bookmarks else "@"
2858 dst_tags_to_delete = cut(field=2, separator=separator, lines=dst_tags_to_delete)
2859 if p.delete_dst_bookmarks:
2860 self.delete_bookmarks(dst, dst_dataset, snapshot_tags=dst_tags_to_delete)
2861 else:
2862 self.delete_snapshots(dst, dst_dataset, snapshot_tags=dst_tags_to_delete)
2863 with self.stats_lock:
2864 nonlocal num_snapshots_found
2865 num_snapshots_found += num_dst_snaps_with_guids
2866 nonlocal num_snapshots_deleted
2867 num_snapshots_deleted += len(dst_tags_to_delete)
2868 if len(dst_tags_to_delete) > 0 and not p.delete_dst_bookmarks:
2869 self.dst_properties[dst_dataset][SNAPSHOTS_CHANGED] = 0 # invalidate cache
2870 return True
2872 # Run delete_destination_snapshots(dataset) for each dataset, while handling errors, retries + parallel exec
2873 if self.are_bookmarks_enabled(dst) or not p.delete_dst_bookmarks:
2874 start_time_nanos = time.monotonic_ns()
2875 failed = self.process_datasets_in_parallel_and_fault_tolerant(
2876 dst_datasets,
2877 process_dataset=delete_destination_snapshots, # lambda
2878 skip_tree_on_error=lambda dataset: False,
2879 max_workers=max_workers,
2880 enable_barriers=False,
2881 task_name="--delete-dst-snapshots",
2882 )
2883 elapsed_nanos = time.monotonic_ns() - start_time_nanos
2884 log.info(
2885 p.dry("--delete-dst-snapshots: %s"),
2886 task_description + f" [Deleted {num_snapshots_deleted} out of {num_snapshots_found} {kind}s "
2887 f"within {len(dst_datasets)} datasets; took {human_readable_duration(elapsed_nanos)}]",
2888 )
2890 # Optionally, delete any existing destination dataset that has no snapshot and no bookmark if all descendants
2891 # of that dataset do not have a snapshot or bookmark either. To do so, we walk the dataset list (conceptually,
2892 # a tree) depth-first (i.e. sorted descending). If a dst dataset has zero snapshots and zero bookmarks and all
2893 # its children are already marked as orphans, then it is itself an orphan, and we mark it as such. Walking in
2894 # a reverse sorted way means that we efficiently check for zero snapshots/bookmarks not just over the direct
2895 # children but the entire tree. Finally, delete all orphan datasets in an efficient batched way.
2896 if p.delete_empty_dst_datasets and p.recursive and not failed:
2897 log.info(p.dry("--delete-empty-dst-datasets: %s"), task_description)
2898 delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots = (
2899 p.delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots and self.are_bookmarks_enabled(dst)
2900 )
2902 # Compute the direct children of each NON-FILTERED dataset. Thus, no non-selected dataset and no ancestor of a
2903 # non-selected dataset will ever be added to the "orphan" set. In other words, this treats non-selected dataset
2904 # subtrees as if they all had snapshots, so non-selected dataset subtrees and their ancestors are guaranteed
2905 # to not get deleted.
2906 children = defaultdict(set)
2907 for dst_dataset in basis_dst_datasets:
2908 parent = os.path.dirname(dst_dataset)
2909 children[parent].add(dst_dataset)
2911 # Find and mark orphan datasets, finally delete them in an efficient way. Using two filter runs instead of one
2912 # filter run is an optimization. The first run only computes candidate orphans, without incurring I/O, to reduce
2913 # the list of datasets for which we list snapshots via 'zfs list -t snapshot ...' from dst_datasets to a subset
2914 # of dst_datasets, which in turn reduces I/O and improves perf. Essentially, this eliminates the I/O to list
2915 # snapshots for ancestors of excluded datasets. The second run computes the real orphans.
2916 btype = "bookmark,snapshot" if delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots else "snapshot"
2917 dst_datasets_having_snapshots: Set[str] = set()
2918 for run in range(0, 2):
2919 orphans: Set[str] = set()
2920 for dst_dataset in reversed(dst_datasets):
2921 if children[dst_dataset].issubset(orphans):
2922 # all children turned out to be orphans, thus the dataset itself could be an orphan
2923 if dst_dataset not in dst_datasets_having_snapshots: # always True during first filter run
2924 orphans.add(dst_dataset)
2925 if run == 0:
2926 # find datasets with >= 1 snapshot; update dst_datasets_having_snapshots for real use in the 2nd run
2927 cmd = p.split_args(f"{p.zfs_program} list -t {btype} -d 1 -S name -Hp -o name")
2928 for datasets_having_snapshots in self.zfs_list_snapshots_in_parallel(
2929 dst, cmd, sorted(orphans), ordered=False
2930 ):
2931 if delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots:
2932 replace_in_lines(datasets_having_snapshots, old="#", new="@", count=1) # treat bookmarks as snap
2933 datasets_having_snapshots = set(cut(field=1, separator="@", lines=datasets_having_snapshots))
2934 dst_datasets_having_snapshots.update(datasets_having_snapshots) # union
2935 else:
2936 self.delete_datasets(dst, orphans)
2937 dst_datasets = sorted(set(dst_datasets).difference(orphans))
2939 # Optionally, compare source and destination dataset trees recursively wrt. snapshots, for example to check if all
2940 # recently taken snapshots have been successfully replicated by a periodic job.
2941 if p.compare_snapshot_lists and not failed:
2942 log.info("--compare-snapshot-lists: %s", task_description)
2943 if len(basis_src_datasets) == 0 and not self.is_dummy(src):
2944 die(f"Source dataset does not exist: {src.basis_root_dataset}")
2945 src_datasets = filter_src_datasets() # apply include/exclude policy
2946 self.run_compare_snapshot_lists(src_datasets, dst_datasets)
2948 # Optionally, alert the user if the ZFS 'creation' time property of the latest or oldest snapshot for any specified
2949 # snapshot name pattern within the selected datasets is too old wrt. the specified age limit. The purpose is to
2950 # check if snapshots are successfully taken on schedule, successfully replicated on schedule, and successfully
2951 # pruned on schedule. Process exit code is 0, 1, 2 on OK, WARNING, CRITICAL, respectively.
2952 if p.monitor_snapshots_config.enable_monitor_snapshots and not failed:
2953 log.info("--monitor-snapshots: %s", task_description)
2954 src_datasets = filter_src_datasets() # apply include/exclude policy
2955 num_cache_hits = self.num_cache_hits
2956 num_cache_misses = self.num_cache_misses
2957 start_time_nanos = time.monotonic_ns()
2958 self.run_in_parallel(
2959 lambda: self.monitor_snapshots(dst, dst_datasets), lambda: self.monitor_snapshots(src, src_datasets)
2960 )
2961 elapsed = human_readable_duration(time.monotonic_ns() - start_time_nanos)
2962 if num_cache_hits != self.num_cache_hits or num_cache_misses != self.num_cache_misses:
2963 total = self.num_cache_hits + self.num_cache_misses
2964 msg = f", cache hits: {percent(self.num_cache_hits, total)}, misses: {percent(self.num_cache_misses, total)}"
2965 else:
2966 msg = ""
2967 log.info(
2968 "--monitor-snapshots done: %s",
2969 f"{task_description} [{len(src_datasets) + len(dst_datasets)} datasets; took {elapsed}{msg}]",
2970 )
2972 def monitor_snapshots(self, remote: Remote, sorted_datasets: List[str]) -> None:
2973 p, log = self.params, self.params.log
2974 alerts: List[MonitorSnapshotAlert] = p.monitor_snapshots_config.alerts
2975 labels: List[SnapshotLabel] = [alert.label for alert in alerts]
2976 current_unixtime_millis: float = p.create_src_snapshots_config.current_datetime.timestamp() * 1000
2977 is_debug: bool = log.isEnabledFor(log_debug)
2978 if self.is_caching_snapshots(remote):
2979 props = self.dst_properties if remote is p.dst else self.src_properties
2980 snapshots_changed_dict: Dict[str, int] = {
2981 dataset: int(vals[SNAPSHOTS_CHANGED]) for dataset, vals in props.items()
2982 }
2983 hash_code: str = hashlib.sha256(str(tuple(alerts)).encode("utf-8")).hexdigest()
2984 is_caching = False
2986 def monitor_last_modified_cache_file(r: Remote, dataset: str, label: SnapshotLabel, alert_cfg: AlertConfig) -> str:
2987 cache_label = SnapshotLabel(os_path_join("===", alert_cfg.kind, str(label), hash_code), "", "", "")
2988 return self.last_modified_cache_file(r, dataset, cache_label)
2990 def alert_msg(
2991 kind: str, dataset: str, snapshot: str, label: SnapshotLabel, snapshot_age_millis: float, delta_millis: int
2992 ) -> str:
2993 assert kind == "Latest" or kind == "Oldest"
2994 lbl = f"{label.prefix}{label.infix}<timestamp>{label.suffix}"
2995 if snapshot_age_millis >= current_unixtime_millis:
2996 return f"No snapshot exists for {dataset}@{lbl}"
2997 msg = f"{kind} snapshot for {dataset}@{lbl} is {human_readable_duration(snapshot_age_millis, unit='ms')} old"
2998 s = f" ({snapshot})" if snapshot else ""
2999 if delta_millis == -1:
3000 return f"{msg}{s}"
3001 return f"{msg} but should be at most {human_readable_duration(delta_millis, unit='ms')} old{s}"
3003 def check_alert(
3004 label: SnapshotLabel,
3005 alert_cfg: Optional[AlertConfig],
3006 creation_unixtime_secs: int,
3007 dataset: str,
3008 snapshot: str,
3009 ) -> None:
3010 if alert_cfg is None:
3011 return
3012 if is_caching and not p.dry_run: # update cache with latest state from 'zfs list -t snapshot'
3013 snapshots_changed = snapshots_changed_dict.get(dataset, 0)
3014 cache_file = monitor_last_modified_cache_file(remote, dataset, label, alert_cfg)
3015 set_last_modification_time_safe(cache_file, unixtime_in_secs=(creation_unixtime_secs, snapshots_changed))
3016 warning_millis = alert_cfg.warning_millis
3017 critical_millis = alert_cfg.critical_millis
3018 alert_kind = alert_cfg.kind
3019 snapshot_age_millis = current_unixtime_millis - creation_unixtime_secs * 1000
3020 m = "--monitor_snapshots: "
3021 if snapshot_age_millis > critical_millis:
3022 msg = m + alert_msg(alert_kind, dataset, snapshot, label, snapshot_age_millis, critical_millis)
3023 log.critical("%s", msg)
3024 if not p.monitor_snapshots_config.dont_crit:
3025 die(msg, exit_code=critical_status)
3026 elif snapshot_age_millis > warning_millis:
3027 msg = m + alert_msg(alert_kind, dataset, snapshot, label, snapshot_age_millis, warning_millis)
3028 log.warning("%s", msg)
3029 if not p.monitor_snapshots_config.dont_warn:
3030 die(msg, exit_code=warning_status)
3031 elif is_debug:
3032 msg = m + "OK. " + alert_msg(alert_kind, dataset, snapshot, label, snapshot_age_millis, delta_millis=-1)
3033 log.debug("%s", msg)
3035 def alert_latest_snapshot(i: int, creation_unixtime_secs: int, dataset: str, snapshot: str) -> None:
3036 alert: MonitorSnapshotAlert = alerts[i]
3037 check_alert(alert.label, alert.latest, creation_unixtime_secs, dataset, snapshot)
3039 def alert_oldest_snapshot(i: int, creation_unixtime_secs: int, dataset: str, snapshot: str) -> None:
3040 alert: MonitorSnapshotAlert = alerts[i]
3041 check_alert(alert.label, alert.oldest, creation_unixtime_secs, dataset, snapshot)
3043 def find_stale_datasets_and_check_alerts() -> List[str]:
3044 """If the cache is enabled, check which datasets have changed to determine which datasets can be skipped cheaply,
3045 i.e. without incurring 'zfs list -t snapshots'. This is done by comparing the "snapshots_changed" ZFS dataset
3046 property with the local cache - https://openzfs.github.io/openzfs-docs/man/7/zfsprops.7.html#snapshots_changed"""
3047 stale_datasets = []
3048 time_threshold = time.time() - time_threshold_secs
3049 for dataset in sorted_datasets:
3050 is_stale_dataset = False
3051 snapshots_changed: int = snapshots_changed_dict.get(dataset, 0)
3052 for alert in alerts:
3053 for cfg in (alert.latest, alert.oldest):
3054 if cfg is None:
3055 continue
3056 if (
3057 snapshots_changed != 0
3058 and snapshots_changed < time_threshold
3059 and (
3060 cached_unix_times := self.cache_get_snapshots_changed2(
3061 monitor_last_modified_cache_file(remote, dataset, alert.label, cfg)
3062 )
3063 )
3064 and snapshots_changed == cached_unix_times[1] # cached snapshots_changed aka last modified time
3065 and snapshots_changed >= cached_unix_times[0] # creation time of minmax snapshot aka access time
3066 ): # cached state is still valid; emit an alert if the latest/oldest snapshot is too old
3067 lbl = alert.label
3068 check_alert(lbl, cfg, creation_unixtime_secs=cached_unix_times[0], dataset=dataset, snapshot="")
3069 else: # cached state is nomore valid; fallback to 'zfs list -t snapshot'
3070 is_stale_dataset = True
3071 if is_stale_dataset:
3072 stale_datasets.append(dataset)
3073 return stale_datasets
3075 # satisfy request from local cache as much as possible
3076 if self.is_caching_snapshots(remote):
3077 stale_datasets = find_stale_datasets_and_check_alerts()
3078 with self.stats_lock:
3079 self.num_cache_misses += len(stale_datasets)
3080 self.num_cache_hits += len(sorted_datasets) - len(stale_datasets)
3081 else:
3082 stale_datasets = sorted_datasets
3084 # fallback to 'zfs list -t snapshot' for any remaining datasets, as these couldn't be satisfied from local cache
3085 is_caching = self.is_caching_snapshots(remote)
3086 datasets_without_snapshots = self.handle_minmax_snapshots(
3087 remote, stale_datasets, labels, fn_latest=alert_latest_snapshot, fn_oldest=alert_oldest_snapshot
3088 )
3089 for dataset in datasets_without_snapshots:
3090 for i in range(len(alerts)):
3091 alert_latest_snapshot(i, creation_unixtime_secs=0, dataset=dataset, snapshot="")
3092 alert_oldest_snapshot(i, creation_unixtime_secs=0, dataset=dataset, snapshot="")
3094 def replicate_datasets(self, src_datasets: List[str], task_description: str, max_workers: int) -> bool:
3095 assert not self.is_test_mode or src_datasets == sorted(src_datasets), "List is not sorted"
3096 p, log = self.params, self.params.log
3097 src, dst = p.src, p.dst
3098 self.num_snapshots_found = 0
3099 self.num_snapshots_replicated = 0
3100 # perf/latency: no need to set up a dedicated TCP connection if no parallel replication is possible
3101 self.dedicated_tcp_connection_per_zfs_send = (
3102 p.dedicated_tcp_connection_per_zfs_send
3103 and max_workers > 1
3104 and has_siblings(src_datasets) # siblings can be replicated in parallel
3105 )
3106 log.info("Starting replication task: %s", task_description + f" [{len(src_datasets)} datasets]")
3107 start_time_nanos = time.monotonic_ns()
3109 def src2dst(src_dataset: str) -> str:
3110 return replace_prefix(src_dataset, old_prefix=src.root_dataset, new_prefix=dst.root_dataset)
3112 def dst2src(dst_dataset: str) -> str:
3113 return replace_prefix(dst_dataset, old_prefix=dst.root_dataset, new_prefix=src.root_dataset)
3115 def find_stale_datasets() -> Tuple[List[str], Dict[str, str]]:
3116 """If the cache is enabled on replication, check which src datasets or dst datasets have changed to determine
3117 which datasets can be skipped cheaply, i.e. without incurring 'zfs list -t snapshots'. This is done by comparing
3118 the "snapshots_changed" ZFS dataset property with the local cache.
3119 See https://openzfs.github.io/openzfs-docs/man/7/zfsprops.7.html#snapshots_changed"""
3120 # First, check which src datasets have changed since the last replication to that destination
3121 cache_files = {}
3122 stale_src_datasets1 = []
3123 maybe_stale_dst_datasets = []
3124 userhost_dir = p.dst.ssh_user_host # cache is only valid for identical destination username+host
3125 userhost_dir = userhost_dir if userhost_dir else "-"
3126 hash_key = tuple(tuple(f) for f in p.snapshot_filters) # cache is only valid for same --include/excl-snapshot*
3127 hash_code = hashlib.sha256(str(hash_key).encode("utf-8")).hexdigest()
3128 for src_dataset in src_datasets:
3129 dst_dataset = src2dst(src_dataset) # cache is only valid for identical destination dataset
3130 cache_label = SnapshotLabel(os.path.join("==", userhost_dir, dst_dataset, hash_code), "", "", "")
3131 cache_file = self.last_modified_cache_file(src, src_dataset, cache_label)
3132 cache_files[src_dataset] = cache_file
3133 snapshots_changed: int = int(self.src_properties[src_dataset][SNAPSHOTS_CHANGED]) # get prop "for free"
3134 if (
3135 snapshots_changed != 0
3136 and time.time() > snapshots_changed + time_threshold_secs
3137 and snapshots_changed == self.cache_get_snapshots_changed(cache_file)
3138 ):
3139 maybe_stale_dst_datasets.append(dst_dataset)
3140 else:
3141 stale_src_datasets1.append(src_dataset)
3143 # For each src dataset that hasn't changed, check if the corresponding dst dataset has changed
3144 stale_src_datasets2 = []
3145 dst_snapshots_changed_dict = self.zfs_get_snapshots_changed(dst, maybe_stale_dst_datasets)
3146 for dst_dataset in maybe_stale_dst_datasets:
3147 snapshots_changed = dst_snapshots_changed_dict.get(dst_dataset, 0)
3148 cache_file = self.last_modified_cache_file(dst, dst_dataset)
3149 if (
3150 snapshots_changed != 0
3151 and time.time() > snapshots_changed + time_threshold_secs
3152 and snapshots_changed == self.cache_get_snapshots_changed(cache_file)
3153 ):
3154 log.info("Already up-to-date [cached]: %s", dst_dataset)
3155 else:
3156 stale_src_datasets2.append(dst2src(dst_dataset))
3157 assert not self.is_test_mode or stale_src_datasets1 == sorted(stale_src_datasets1), "List is not sorted"
3158 assert not self.is_test_mode or stale_src_datasets2 == sorted(stale_src_datasets2), "List is not sorted"
3159 stale_src_datasets = list(heapq.merge(stale_src_datasets1, stale_src_datasets2)) # merge two sorted lists
3160 assert not self.is_test_mode or not has_duplicates(stale_src_datasets), "List contains duplicates"
3161 return stale_src_datasets, cache_files
3163 if self.is_caching_snapshots(src):
3164 stale_src_datasets, cache_files = find_stale_datasets()
3165 num_cache_misses = len(stale_src_datasets)
3166 num_cache_hits = len(src_datasets) - len(stale_src_datasets)
3167 self.num_cache_misses += num_cache_misses
3168 self.num_cache_hits += num_cache_hits
3169 total = self.num_cache_hits + self.num_cache_misses
3170 cmsg = f", cache hits: {percent(self.num_cache_hits, total)}, misses: {percent(self.num_cache_misses, total)}"
3171 else:
3172 stale_src_datasets = src_datasets
3173 cache_files = {}
3174 cmsg = ""
3176 # Run replicate_dataset(dataset) for each dataset, while taking care of errors, retries + parallel execution
3177 failed = self.process_datasets_in_parallel_and_fault_tolerant(
3178 stale_src_datasets,
3179 process_dataset=self.replicate_dataset, # lambda
3180 skip_tree_on_error=lambda dataset: not self.dst_dataset_exists[src2dst(dataset)],
3181 max_workers=max_workers,
3182 enable_barriers=False,
3183 task_name="Replication",
3184 )
3186 if self.is_caching_snapshots(src) and not failed:
3187 # refresh "snapshots_changed" ZFS dataset property from dst
3188 stale_dst_datasets = [src2dst(src_dataset) for src_dataset in stale_src_datasets]
3189 dst_snapshots_changed_dict = self.zfs_get_snapshots_changed(dst, stale_dst_datasets)
3190 for dst_dataset in stale_dst_datasets: # update local cache
3191 dst_snapshots_changed = dst_snapshots_changed_dict.get(dst_dataset, 0)
3192 dst_cache_file = self.last_modified_cache_file(dst, dst_dataset)
3193 src_dataset = dst2src(dst_dataset)
3194 src_snapshots_changed: int = int(self.src_properties[src_dataset][SNAPSHOTS_CHANGED])
3195 if not p.dry_run:
3196 set_last_modification_time_safe(cache_files[src_dataset], unixtime_in_secs=src_snapshots_changed)
3197 set_last_modification_time_safe(dst_cache_file, unixtime_in_secs=dst_snapshots_changed)
3199 elapsed_nanos = time.monotonic_ns() - start_time_nanos
3200 log.info(
3201 p.dry("Replication done: %s"),
3202 f"{task_description} [Replicated {self.num_snapshots_replicated} out of {self.num_snapshots_found} snapshots"
3203 f" within {len(src_datasets)} datasets; took {human_readable_duration(elapsed_nanos)}{cmsg}]",
3204 )
3205 return failed
3207 def replicate_dataset(self, src_dataset: str, tid: str, retry: Retry) -> bool:
3208 """Replicates src_dataset (without handling descendants) to dst_dataset (thread-safe)."""
3210 p, log = self.params, self.params.log
3211 src, dst = p.src, p.dst
3212 retry_count = retry.count
3213 dst_dataset = replace_prefix(src_dataset, old_prefix=src.root_dataset, new_prefix=dst.root_dataset)
3214 log.debug(p.dry(f"{tid} Replicating: %s"), f"{src_dataset} --> {dst_dataset} ...")
3216 # list GUID and name for dst snapshots, sorted ascending by createtxg (more precise than creation time)
3217 dst_cmd = p.split_args(f"{p.zfs_program} list -t snapshot -d 1 -s createtxg -Hp -o guid,name", dst_dataset)
3219 # list GUID and name for src snapshots + bookmarks, primarily sort ascending by transaction group (which is more
3220 # precise than creation time), secondarily sort such that snapshots appear after bookmarks for the same GUID.
3221 # Note: A snapshot and its ZFS bookmarks always have the same GUID, creation time and transaction group. A snapshot
3222 # changes its transaction group but retains its creation time and GUID on 'zfs receive' on another pool, i.e.
3223 # comparing createtxg is only meaningful within a single pool, not across pools from src to dst. Comparing creation
3224 # time remains meaningful across pools from src to dst. Creation time is a UTC Unix time in integer seconds.
3225 # Note that 'zfs create', 'zfs snapshot' and 'zfs bookmark' CLIs enforce that snapshot names must not contain a '#'
3226 # char, bookmark names must not contain a '@' char, and dataset names must not contain a '#' or '@' char.
3227 # GUID and creation time also do not contain a '#' or '@' char.
3228 filter_needs_creation_time = has_timerange_filter(p.snapshot_filters)
3229 types = "snapshot,bookmark" if p.use_bookmark and self.are_bookmarks_enabled(src) else "snapshot"
3230 props = self.creation_prefix + "creation,guid,name" if filter_needs_creation_time else "guid,name"
3231 src_cmd = p.split_args(f"{p.zfs_program} list -t {types} -s createtxg -s type -d 1 -Hp -o {props}", src_dataset)
3232 self.maybe_inject_delete(src, dataset=src_dataset, delete_trigger="zfs_list_snapshot_src")
3233 src_snapshots_and_bookmarks, dst_snapshots_with_guids = self.run_in_parallel( # list src+dst snapshots in parallel
3234 lambda: self.try_ssh_command(src, log_trace, cmd=src_cmd),
3235 lambda: self.try_ssh_command(dst, log_trace, cmd=dst_cmd, error_trigger="zfs_list_snapshot_dst"),
3236 )
3237 self.dst_dataset_exists[dst_dataset] = dst_snapshots_with_guids is not None
3238 dst_snapshots_with_guids = dst_snapshots_with_guids.splitlines() if dst_snapshots_with_guids is not None else []
3239 if src_snapshots_and_bookmarks is None:
3240 log.warning("Third party deleted source: %s", src_dataset)
3241 return False # src dataset has been deleted by some third party while we're running - nothing to do anymore
3242 src_snapshots_with_guids: List[str] = src_snapshots_and_bookmarks.splitlines()
3243 src_snapshots_and_bookmarks = None
3244 if len(dst_snapshots_with_guids) == 0 and "bookmark" in types:
3245 # src bookmarks serve no purpose if the destination dataset has no snapshot; ignore them
3246 src_snapshots_with_guids = [snapshot for snapshot in src_snapshots_with_guids if "@" in snapshot]
3247 num_src_snapshots_found = sum(1 for snapshot in src_snapshots_with_guids if "@" in snapshot)
3248 with self.stats_lock:
3249 self.num_snapshots_found += num_src_snapshots_found
3250 # apply include/exclude regexes to ignore irrelevant src snapshots
3251 basis_src_snapshots_with_guids = src_snapshots_with_guids
3252 src_snapshots_with_guids = self.filter_snapshots(src_snapshots_with_guids)
3253 if filter_needs_creation_time:
3254 src_snapshots_with_guids = cut(field=2, lines=src_snapshots_with_guids)
3255 basis_src_snapshots_with_guids = cut(field=2, lines=basis_src_snapshots_with_guids)
3257 # find oldest and latest "true" snapshot, as well as GUIDs of all snapshots and bookmarks.
3258 # a snapshot is "true" if it is not a bookmark.
3259 oldest_src_snapshot = ""
3260 latest_src_snapshot = ""
3261 included_src_guids: Set[str] = set()
3262 for line in src_snapshots_with_guids:
3263 guid, snapshot = line.split("\t", 1)
3264 included_src_guids.add(guid)
3265 if "@" in snapshot:
3266 latest_src_snapshot = snapshot
3267 if not oldest_src_snapshot:
3268 oldest_src_snapshot = snapshot
3269 if len(src_snapshots_with_guids) == 0:
3270 if p.skip_missing_snapshots == "fail":
3271 die(f"Source dataset includes no snapshot: {src_dataset}. Consider using --skip-missing-snapshots=dataset")
3272 elif p.skip_missing_snapshots == "dataset":
3273 log.warning("Skipping source dataset because it includes no snapshot: %s", src_dataset)
3274 if p.recursive and not self.dst_dataset_exists[dst_dataset]:
3275 log.warning("Also skipping descendant datasets as dst dataset does not exist for %s", src_dataset)
3276 return self.dst_dataset_exists[dst_dataset]
3278 log.debug("latest_src_snapshot: %s", latest_src_snapshot)
3279 latest_dst_snapshot = ""
3280 latest_dst_guid = ""
3281 latest_common_src_snapshot = ""
3282 props_cache: Dict[Tuple[str, str, str], Dict[str, Optional[str]]] = {}
3283 done_checking = False
3285 if self.dst_dataset_exists[dst_dataset]:
3286 if len(dst_snapshots_with_guids) > 0:
3287 latest_dst_guid, latest_dst_snapshot = dst_snapshots_with_guids[-1].split("\t", 1)
3288 if p.force_rollback_to_latest_snapshot:
3289 log.info(p.dry(f"{tid} Rolling back destination to most recent snapshot: %s"), latest_dst_snapshot)
3290 # rollback just in case the dst dataset was modified since its most recent snapshot
3291 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset)
3292 cmd = p.split_args(f"{dst.sudo} {p.zfs_program} rollback", latest_dst_snapshot)
3293 self.try_ssh_command(dst, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=cmd, exists=False)
3294 elif latest_src_snapshot == "":
3295 log.info(f"{tid} Already-up-to-date: %s", dst_dataset)
3296 return True
3298 # find most recent snapshot (or bookmark) that src and dst have in common - we'll start to replicate
3299 # from there up to the most recent src snapshot. any two snapshots are "common" iff their ZFS GUIDs (i.e.
3300 # contents) are equal. See https://github.com/openzfs/zfs/commit/305bc4b370b20de81eaf10a1cf724374258b74d1
3301 def latest_common_snapshot(
3302 snapshots_with_guids: List[str], intersect_guids: Set[str]
3303 ) -> Tuple[Optional[str], str]:
3304 """Returns a true snapshot instead of its bookmark with the same GUID, per the sort order previously
3305 used for 'zfs list -s ...'"""
3306 for _line in reversed(snapshots_with_guids):
3307 _guid, _snapshot = _line.split("\t", 1)
3308 if _guid in intersect_guids:
3309 return _guid, _snapshot # can be a snapshot or bookmark
3310 return None, ""
3312 latest_common_guid, latest_common_src_snapshot = latest_common_snapshot(
3313 src_snapshots_with_guids, set(cut(field=1, lines=dst_snapshots_with_guids))
3314 )
3315 log.debug("latest_common_src_snapshot: %s", latest_common_src_snapshot) # is a snapshot or bookmark
3316 log.log(log_trace, "latest_dst_snapshot: %s", latest_dst_snapshot)
3318 if latest_common_src_snapshot and latest_common_guid != latest_dst_guid:
3319 # found latest common snapshot but dst has an even newer snapshot. rollback dst to that common snapshot.
3320 assert latest_common_guid
3321 _, latest_common_dst_snapshot = latest_common_snapshot(dst_snapshots_with_guids, {latest_common_guid})
3322 if not (p.force_rollback_to_latest_common_snapshot or p.force):
3323 die(
3324 f"Conflict: Most recent destination snapshot {latest_dst_snapshot} is more recent than "
3325 f"most recent common snapshot {latest_common_dst_snapshot}. Rollback destination first, "
3326 "for example via --force-rollback-to-latest-common-snapshot (or --force) option."
3327 )
3328 if p.force_once:
3329 p.force.value = False
3330 p.force_rollback_to_latest_common_snapshot.value = False
3331 log.info(
3332 p.dry(f"{tid} Rolling back destination to most recent common snapshot: %s"), latest_common_dst_snapshot
3333 )
3334 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset)
3335 cmd = p.split_args(
3336 f"{dst.sudo} {p.zfs_program} rollback -r {p.force_unmount} {p.force_hard}", latest_common_dst_snapshot
3337 )
3338 try:
3339 self.run_ssh_command(dst, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=cmd)
3340 except (subprocess.CalledProcessError, UnicodeDecodeError) as e:
3341 stderr = stderr_to_str(e.stderr) if hasattr(e, "stderr") else ""
3342 no_sleep = self.clear_resumable_recv_state_if_necessary(dst_dataset, stderr)
3343 # op isn't idempotent so retries regather current state from the start of replicate_dataset()
3344 raise RetryableError("Subprocess failed", no_sleep=no_sleep) from e
3346 if latest_src_snapshot and latest_src_snapshot == latest_common_src_snapshot:
3347 log.info(f"{tid} Already up-to-date: %s", dst_dataset)
3348 return True
3350 # endif self.dst_dataset_exists[dst_dataset]
3351 log.debug("latest_common_src_snapshot: %s", latest_common_src_snapshot) # is a snapshot or bookmark
3352 log.log(log_trace, "latest_dst_snapshot: %s", latest_dst_snapshot)
3353 dry_run_no_send = False
3354 right_just = 7
3356 def format_size(num_bytes: int) -> str:
3357 return human_readable_bytes(num_bytes, separator="").rjust(right_just)
3359 if not latest_common_src_snapshot:
3360 # no common snapshot was found. delete all dst snapshots, if any
3361 if latest_dst_snapshot:
3362 if not p.force:
3363 die(
3364 f"Conflict: No common snapshot found between {src_dataset} and {dst_dataset} even though "
3365 "destination has at least one snapshot. Aborting. Consider using --force option to first "
3366 "delete all existing destination snapshots in order to be able to proceed with replication."
3367 )
3368 if p.force_once:
3369 p.force.value = False
3370 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset)
3371 self.delete_snapshots(dst, dst_dataset, snapshot_tags=cut(2, separator="@", lines=dst_snapshots_with_guids))
3372 if p.dry_run:
3373 # As we're in --dryrun (--force) mode this conflict resolution step (see above) wasn't really executed:
3374 # "no common snapshot was found. delete all dst snapshots". In turn, this would cause the subsequent
3375 # 'zfs receive -n' to fail with "cannot receive new filesystem stream: destination has snapshots; must
3376 # destroy them to overwrite it". So we skip the zfs send/receive step and keep on trucking.
3377 dry_run_no_send = True
3379 # to start with, fully replicate oldest snapshot, which in turn creates a common snapshot
3380 if p.no_stream:
3381 oldest_src_snapshot = latest_src_snapshot
3382 if oldest_src_snapshot:
3383 if not self.dst_dataset_exists[dst_dataset]:
3384 # on destination, create parent filesystem and ancestors if they do not yet exist
3385 dst_dataset_parent = os.path.dirname(dst_dataset)
3386 if not self.dst_dataset_exists[dst_dataset_parent]:
3387 if p.dry_run:
3388 dry_run_no_send = True
3389 if dst_dataset_parent != "": 3389 ↛ 3392line 3389 didn't jump to line 3392 because the condition on line 3389 was always true
3390 self.create_zfs_filesystem(dst_dataset_parent)
3392 recv_resume_token, send_resume_opts, recv_resume_opts = self._recv_resume_token(dst_dataset, retry_count)
3393 curr_size = self.estimate_send_size(src, dst_dataset, recv_resume_token, oldest_src_snapshot)
3394 humansize = format_size(curr_size)
3395 if recv_resume_token:
3396 send_opts = send_resume_opts # e.g. ["-t", "1-c740b4779-..."]
3397 else:
3398 send_opts = p.curr_zfs_send_program_opts + [oldest_src_snapshot]
3399 send_cmd = p.split_args(f"{src.sudo} {p.zfs_program} send", send_opts)
3400 recv_opts = p.zfs_full_recv_opts.copy() + recv_resume_opts
3401 recv_opts, set_opts = self.add_recv_property_options(True, recv_opts, src_dataset, props_cache)
3402 recv_cmd = p.split_args(
3403 f"{dst.sudo} {p.zfs_program} receive -F", p.dry_run_recv, recv_opts, dst_dataset, allow_all=True
3404 )
3405 log.info(p.dry(f"{tid} Full send: %s"), f"{oldest_src_snapshot} --> {dst_dataset} ({humansize.strip()}) ...")
3406 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset)
3407 dry_run_no_send = dry_run_no_send or p.dry_run_no_send
3408 self.maybe_inject_params(error_trigger="full_zfs_send_params")
3409 humansize = humansize.rjust(right_just * 3 + 2)
3410 self.run_zfs_send_receive(
3411 src_dataset, dst_dataset, send_cmd, recv_cmd, curr_size, humansize, dry_run_no_send, "full_zfs_send"
3412 )
3413 latest_common_src_snapshot = oldest_src_snapshot # we have now created a common snapshot
3414 if not dry_run_no_send and not p.dry_run:
3415 self.dst_dataset_exists[dst_dataset] = True
3416 with self.stats_lock:
3417 self.num_snapshots_replicated += 1
3418 self.create_zfs_bookmarks(src, src_dataset, [oldest_src_snapshot])
3419 self.zfs_set(set_opts, dst, dst_dataset)
3420 dry_run_no_send = dry_run_no_send or p.dry_run
3421 retry_count = 0
3423 # endif not latest_common_src_snapshot
3424 # finally, incrementally replicate all snapshots from most recent common snapshot until most recent src snapshot
3425 if latest_common_src_snapshot:
3427 def replication_candidates() -> Tuple[List[str], List[str]]:
3428 assert len(basis_src_snapshots_with_guids) > 0
3429 result_snapshots = []
3430 result_guids = []
3431 last_appended_guid = ""
3432 snapshot_itr = reversed(basis_src_snapshots_with_guids)
3433 while True:
3434 guid, snapshot = snapshot_itr.__next__().split("\t", 1)
3435 if "@" in snapshot:
3436 result_snapshots.append(snapshot)
3437 result_guids.append(guid)
3438 last_appended_guid = guid
3439 if snapshot == latest_common_src_snapshot: # latest_common_src_snapshot is a snapshot or bookmark
3440 if guid != last_appended_guid and "@" not in snapshot:
3441 # only appends the src bookmark if it has no snapshot. If the bookmark has a snapshot then that
3442 # snapshot has already been appended, per the sort order previously used for 'zfs list -s ...'
3443 result_snapshots.append(snapshot)
3444 result_guids.append(guid)
3445 break
3446 result_snapshots.reverse()
3447 result_guids.reverse()
3448 assert len(result_snapshots) > 0
3449 assert len(result_snapshots) == len(result_guids)
3450 return result_guids, result_snapshots
3452 # collect the most recent common snapshot (which may be a bookmark) followed by all src snapshots
3453 # (that are not a bookmark) that are more recent than that.
3454 cand_guids, cand_snapshots = replication_candidates()
3455 if len(cand_snapshots) == 1:
3456 # latest_src_snapshot is a (true) snapshot that is equal to latest_common_src_snapshot or LESS recent
3457 # than latest_common_src_snapshot. The latter case can happen if latest_common_src_snapshot is a
3458 # bookmark whose snapshot has been deleted on src.
3459 return True # nothing more tbd
3461 recv_resume_token, send_resume_opts, recv_resume_opts = self._recv_resume_token(dst_dataset, retry_count)
3462 recv_opts = p.zfs_recv_program_opts.copy() + recv_resume_opts
3463 recv_opts, set_opts = self.add_recv_property_options(False, recv_opts, src_dataset, props_cache)
3464 if p.no_stream:
3465 # skip intermediate snapshots
3466 steps_todo = [("-i", latest_common_src_snapshot, latest_src_snapshot, [latest_src_snapshot])]
3467 else:
3468 # include intermediate src snapshots that pass --{include,exclude}-snapshot-* policy, using
3469 # a series of -i/-I send/receive steps that skip excluded src snapshots.
3470 steps_todo = self.incremental_send_steps_wrapper(
3471 cand_snapshots, cand_guids, included_src_guids, recv_resume_token is not None
3472 )
3473 log.log(log_trace, "steps_todo: %s", list_formatter(steps_todo, "; "))
3474 estimate_send_sizes = [
3475 self.estimate_send_size(
3476 src, dst_dataset, recv_resume_token if i == 0 else None, incr_flag, from_snap, to_snap
3477 )
3478 for i, (incr_flag, from_snap, to_snap, to_snapshots) in enumerate(steps_todo)
3479 ]
3480 total_size = sum(estimate_send_sizes)
3481 total_num = sum(len(to_snapshots) for incr_flag, from_snap, to_snap, to_snapshots in steps_todo)
3482 done_size = 0
3483 done_num = 0
3484 for i, (incr_flag, from_snap, to_snap, to_snapshots) in enumerate(steps_todo):
3485 assert len(to_snapshots) >= 1
3486 curr_num_snapshots = len(to_snapshots)
3487 curr_size = estimate_send_sizes[i]
3488 humansize = format_size(total_size) + "/" + format_size(done_size) + "/" + format_size(curr_size)
3489 human_num = f"{total_num}/{done_num}/{curr_num_snapshots} snapshots"
3490 if recv_resume_token:
3491 send_opts = send_resume_opts # e.g. ["-t", "1-c740b4779-..."]
3492 else:
3493 send_opts = p.curr_zfs_send_program_opts + [incr_flag, from_snap, to_snap]
3494 send_cmd = p.split_args(f"{src.sudo} {p.zfs_program} send", send_opts)
3495 recv_cmd = p.split_args(
3496 f"{dst.sudo} {p.zfs_program} receive", p.dry_run_recv, recv_opts, dst_dataset, allow_all=True
3497 )
3498 dense_size = p.two_or_more_spaces_regex.sub("", humansize.strip())
3499 log.info(
3500 p.dry(f"{tid} Incremental send {incr_flag}: %s"),
3501 f"{from_snap} .. {to_snap[to_snap.index('@'):]} --> {dst_dataset} ({dense_size}) ({human_num}) ...",
3502 )
3503 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset, busy_if_send=False)
3504 if p.dry_run and not self.dst_dataset_exists[dst_dataset]:
3505 dry_run_no_send = True
3506 dry_run_no_send = dry_run_no_send or p.dry_run_no_send
3507 self.maybe_inject_params(error_trigger="incr_zfs_send_params")
3508 self.run_zfs_send_receive(
3509 src_dataset, dst_dataset, send_cmd, recv_cmd, curr_size, humansize, dry_run_no_send, "incr_zfs_send"
3510 )
3511 done_size += curr_size
3512 done_num += curr_num_snapshots
3513 recv_resume_token = None
3514 with self.stats_lock:
3515 self.num_snapshots_replicated += curr_num_snapshots
3516 if p.create_bookmarks == "all":
3517 self.create_zfs_bookmarks(src, src_dataset, to_snapshots)
3518 elif p.create_bookmarks == "many":
3519 to_snapshots = [snap for snap in to_snapshots if p.xperiods.label_milliseconds(snap) >= 60 * 60 * 1000]
3520 if i == len(steps_todo) - 1 and (len(to_snapshots) == 0 or to_snapshots[-1] != to_snap):
3521 to_snapshots.append(to_snap)
3522 self.create_zfs_bookmarks(src, src_dataset, to_snapshots)
3523 self.zfs_set(set_opts, dst, dst_dataset)
3524 return True
3526 def prepare_zfs_send_receive(
3527 self, src_dataset: str, send_cmd: List[str], recv_cmd: List[str], size_estimate_bytes: int, size_estimate_human: str
3528 ) -> Tuple[str, str, str]:
3529 p = self.params
3530 send_cmd_str = shlex.join(send_cmd)
3531 recv_cmd_str = shlex.join(recv_cmd)
3533 if self.is_program_available("zstd", "src") and self.is_program_available("zstd", "dst"):
3534 _compress_cmd = self.compress_cmd("src", size_estimate_bytes)
3535 _decompress_cmd = self.decompress_cmd("dst", size_estimate_bytes)
3536 else: # no compression is used if source and destination do not both support compression
3537 _compress_cmd, _decompress_cmd = "cat", "cat"
3539 recordsize = abs(int(self.src_properties[src_dataset]["recordsize"]))
3540 src_buffer = self.mbuffer_cmd("src", size_estimate_bytes, recordsize)
3541 dst_buffer = self.mbuffer_cmd("dst", size_estimate_bytes, recordsize)
3542 local_buffer = self.mbuffer_cmd("local", size_estimate_bytes, recordsize)
3544 pv_src_cmd = ""
3545 pv_dst_cmd = ""
3546 pv_loc_cmd = ""
3547 if p.src.ssh_user_host == "":
3548 pv_src_cmd = self.pv_cmd("local", size_estimate_bytes, size_estimate_human)
3549 elif p.dst.ssh_user_host == "":
3550 pv_dst_cmd = self.pv_cmd("local", size_estimate_bytes, size_estimate_human)
3551 elif _compress_cmd == "cat":
3552 pv_loc_cmd = self.pv_cmd("local", size_estimate_bytes, size_estimate_human) # compression disabled
3553 else:
3554 # pull-push mode with compression enabled: reporting "percent complete" isn't straightforward because
3555 # localhost observes the compressed data instead of the uncompressed data, so we disable the progress bar.
3556 pv_loc_cmd = self.pv_cmd("local", size_estimate_bytes, size_estimate_human, disable_progress_bar=True)
3558 # assemble pipeline running on source leg
3559 src_pipe = ""
3560 if self.inject_params.get("inject_src_pipe_fail", False):
3561 # for testing; initially forward some bytes and then fail
3562 src_pipe = f"{src_pipe} | dd bs=64 count=1 2>/dev/null && false"
3563 if self.inject_params.get("inject_src_pipe_garble", False):
3564 src_pipe = f"{src_pipe} | base64" # for testing; forward garbled bytes
3565 if pv_src_cmd != "" and pv_src_cmd != "cat":
3566 src_pipe = f"{src_pipe} | {pv_src_cmd}"
3567 if _compress_cmd != "cat":
3568 src_pipe = f"{src_pipe} | {_compress_cmd}"
3569 if src_buffer != "cat":
3570 src_pipe = f"{src_pipe} | {src_buffer}"
3571 if src_pipe.startswith(" |"):
3572 src_pipe = src_pipe[2:] # strip leading ' |' part
3573 if self.inject_params.get("inject_src_send_error", False):
3574 send_cmd_str = f"{send_cmd_str} --injectedGarbageParameter" # for testing; induce CLI parse error
3575 if src_pipe != "":
3576 src_pipe = f"{send_cmd_str} | {src_pipe}"
3577 if p.src.ssh_user_host != "":
3578 src_pipe = p.shell_program + " -c " + self.dquote(src_pipe)
3579 else:
3580 src_pipe = send_cmd_str
3582 # assemble pipeline running on middle leg between source and destination. only enabled for pull-push mode
3583 local_pipe = ""
3584 if local_buffer != "cat":
3585 local_pipe = f"{local_buffer}"
3586 if pv_loc_cmd != "" and pv_loc_cmd != "cat":
3587 local_pipe = f"{local_pipe} | {pv_loc_cmd}"
3588 if local_buffer != "cat":
3589 local_pipe = f"{local_pipe} | {local_buffer}"
3590 if local_pipe.startswith(" |"):
3591 local_pipe = local_pipe[2:] # strip leading ' |' part
3592 if local_pipe != "":
3593 local_pipe = f"| {local_pipe}"
3595 # assemble pipeline running on destination leg
3596 dst_pipe = ""
3597 if dst_buffer != "cat":
3598 dst_pipe = f"{dst_buffer}"
3599 if _decompress_cmd != "cat":
3600 dst_pipe = f"{dst_pipe} | {_decompress_cmd}"
3601 if pv_dst_cmd != "" and pv_dst_cmd != "cat":
3602 dst_pipe = f"{dst_pipe} | {pv_dst_cmd}"
3603 if self.inject_params.get("inject_dst_pipe_fail", False):
3604 # interrupt zfs receive for testing retry/resume; initially forward some bytes and then stop forwarding
3605 dst_pipe = f"{dst_pipe} | dd bs=1024 count={inject_dst_pipe_fail_kbytes} 2>/dev/null"
3606 if self.inject_params.get("inject_dst_pipe_garble", False):
3607 dst_pipe = f"{dst_pipe} | base64" # for testing; forward garbled bytes
3608 if dst_pipe.startswith(" |"):
3609 dst_pipe = dst_pipe[2:] # strip leading ' |' part
3610 if self.inject_params.get("inject_dst_receive_error", False):
3611 recv_cmd_str = f"{recv_cmd_str} --injectedGarbageParameter" # for testing; induce CLI parse error
3612 if dst_pipe != "":
3613 dst_pipe = f"{dst_pipe} | {recv_cmd_str}"
3614 if p.dst.ssh_user_host != "":
3615 dst_pipe = p.shell_program + " -c " + self.dquote(dst_pipe)
3616 else:
3617 dst_pipe = recv_cmd_str
3619 # If there's no support for shell pipelines, we can't do compression, mbuffering, monitoring and rate-limiting,
3620 # so we fall back to simple zfs send/receive.
3621 if not self.is_program_available("sh", "src"):
3622 src_pipe = send_cmd_str
3623 if not self.is_program_available("sh", "dst"):
3624 dst_pipe = recv_cmd_str
3625 if not self.is_program_available("sh", "local"):
3626 local_pipe = ""
3628 src_pipe = self.squote(p.src, src_pipe)
3629 dst_pipe = self.squote(p.dst, dst_pipe)
3630 return src_pipe, local_pipe, dst_pipe
3632 def run_zfs_send_receive(
3633 self,
3634 src_dataset: str,
3635 dst_dataset: str,
3636 send_cmd: List[str],
3637 recv_cmd: List[str],
3638 size_estimate_bytes: int,
3639 size_estimate_human: str,
3640 dry_run_no_send: bool,
3641 error_trigger: Optional[str] = None,
3642 ) -> None:
3643 p, log = self.params, self.params.log
3644 src_pipe, local_pipe, dst_pipe = self.prepare_zfs_send_receive(
3645 src_dataset, send_cmd, recv_cmd, size_estimate_bytes, size_estimate_human
3646 )
3647 conn_pool_name = DEDICATED if self.dedicated_tcp_connection_per_zfs_send else SHARED
3648 src_conn_pool: ConnectionPool = p.connection_pools["src"].pool(conn_pool_name)
3649 src_conn: Connection = src_conn_pool.get_connection()
3650 dst_conn_pool: ConnectionPool = p.connection_pools["dst"].pool(conn_pool_name)
3651 dst_conn: Connection = dst_conn_pool.get_connection()
3652 try:
3653 self.refresh_ssh_connection_if_necessary(p.src, src_conn)
3654 self.refresh_ssh_connection_if_necessary(p.dst, dst_conn)
3655 src_ssh_cmd = " ".join(src_conn.ssh_cmd_quoted)
3656 dst_ssh_cmd = " ".join(dst_conn.ssh_cmd_quoted)
3657 cmd = [p.shell_program_local, "-c", f"{src_ssh_cmd} {src_pipe} {local_pipe} | {dst_ssh_cmd} {dst_pipe}"]
3658 msg = "Would execute: %s" if dry_run_no_send else "Executing: %s"
3659 log.debug(msg, cmd[2].lstrip())
3660 if not dry_run_no_send:
3661 try:
3662 self.maybe_inject_error(cmd=cmd, error_trigger=error_trigger)
3663 process = subprocess_run(
3664 cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE, text=True, timeout=self.timeout(), check=True
3665 )
3666 except (subprocess.CalledProcessError, UnicodeDecodeError) as e:
3667 no_sleep = False
3668 if not isinstance(e, UnicodeDecodeError):
3669 xprint(log, stderr_to_str(e.stdout), file=sys.stdout)
3670 log.warning("%s", stderr_to_str(e.stderr).rstrip())
3671 if isinstance(e, subprocess.CalledProcessError):
3672 no_sleep = self.clear_resumable_recv_state_if_necessary(dst_dataset, e.stderr)
3673 # op isn't idempotent so retries regather current state from the start of replicate_dataset()
3674 raise RetryableError("Subprocess failed", no_sleep=no_sleep) from e
3675 else:
3676 xprint(log, process.stdout, file=sys.stdout)
3677 xprint(log, process.stderr, file=sys.stderr)
3678 finally:
3679 dst_conn_pool.return_connection(dst_conn)
3680 src_conn_pool.return_connection(src_conn)
3682 def clear_resumable_recv_state_if_necessary(self, dst_dataset: str, stderr: str) -> bool:
3683 def clear_resumable_recv_state() -> bool:
3684 log.warning(p.dry("Aborting an interrupted zfs receive -s, deleting partially received state: %s"), dst_dataset)
3685 cmd = p.split_args(f"{p.dst.sudo} {p.zfs_program} receive -A", dst_dataset)
3686 self.try_ssh_command(p.dst, log_trace, is_dry=p.dry_run, print_stdout=True, cmd=cmd)
3687 log.log(log_trace, p.dry("Done Aborting an interrupted zfs receive -s: %s"), dst_dataset)
3688 return True
3690 p, log = self.params, self.params.log
3691 # "cannot resume send: 'wb_src/tmp/src@s1' is no longer the same snapshot used in the initial send"
3692 # "cannot resume send: 'wb_src/tmp/src@s1' used in the initial send no longer exists"
3693 # "cannot resume send: incremental source 0xa000000000000000 no longer exists"
3694 if "cannot resume send" in stderr and (
3695 "is no longer the same snapshot used in the initial send" in stderr
3696 or "used in the initial send no longer exists" in stderr
3697 or re.match(r".*incremental source [0-9a-fx]+ no longer exists", stderr)
3698 ):
3699 return clear_resumable_recv_state()
3701 # "cannot receive resume stream: incompatible embedded data stream feature with encrypted receive."
3702 # see https://github.com/openzfs/zfs/issues/12480
3703 # 'cannot receive new filesystem stream: destination xx contains partially-complete state from "zfs receive -s"'
3704 # this indicates that --no-resume-recv detects that dst contains a previously interrupted recv -s
3705 elif "cannot receive" in stderr and (
3706 "cannot receive resume stream: incompatible embedded data stream feature with encrypted receive" in stderr
3707 or 'contains partially-complete state from "zfs receive -s"' in stderr
3708 ):
3709 return clear_resumable_recv_state()
3711 elif ( # this signals normal behavior on interrupt of 'zfs receive -s' if running without --no-resume-recv
3712 "cannot receive new filesystem stream: checksum mismatch or incomplete stream" in stderr
3713 and "Partially received snapshot is saved" in stderr
3714 ):
3715 return True
3717 # "cannot destroy 'wb_dest/tmp/dst@s1': snapshot has dependent clones ... use '-R' to destroy the following
3718 # datasets: wb_dest/tmp/dst/%recv" # see https://github.com/openzfs/zfs/issues/10439#issuecomment-642774560
3719 # This msg indicates a failed 'zfs destroy' via --delete-dst-snapshots. This "clone" is caused by a previously
3720 # interrupted 'zfs receive -s'. The fix used here is to delete the partially received state of said
3721 # 'zfs receive -s' via 'zfs receive -A', followed by an automatic retry, which will now succeed to delete the
3722 # snapshot without user intervention.
3723 elif (
3724 "cannot destroy" in stderr
3725 and "snapshot has dependent clone" in stderr
3726 and "use '-R' to destroy the following dataset" in stderr
3727 and f"\n{dst_dataset}/%recv\n" in stderr
3728 ):
3729 return clear_resumable_recv_state()
3731 # Same cause as above, except that this error can occur during 'zfs rollback'
3732 # Also see https://github.com/openzfs/zfs/blob/master/cmd/zfs/zfs_main.c
3733 elif (
3734 "cannot rollback to" in stderr
3735 and "clones of previous snapshots exist" in stderr
3736 and "use '-R' to force deletion of the following clones and dependents" in stderr
3737 and f"\n{dst_dataset}/%recv\n" in stderr
3738 ):
3739 return clear_resumable_recv_state()
3741 return False
3743 def _recv_resume_token(self, dst_dataset: str, retry_count: int) -> Tuple[Optional[str], List[str], List[str]]:
3744 """Gets recv_resume_token ZFS property from dst_dataset and returns corresponding opts to use for send+recv."""
3745 p, log = self.params, self.params.log
3746 if not p.resume_recv:
3747 return None, [], []
3748 warning = None
3749 if not self.is_zpool_feature_enabled_or_active(p.dst, "feature@extensible_dataset"):
3750 warning = "not available on destination dataset"
3751 elif not self.is_program_available(zfs_version_is_at_least_2_1_0, "dst"):
3752 warning = "unreliable as zfs version is too old" # e.g. zfs-0.8.3 "internal error: Unknown error 1040"
3753 if warning:
3754 log.warning(f"ZFS receive resume feature is {warning}. Falling back to --no-resume-recv: %s", dst_dataset)
3755 return None, [], []
3756 recv_resume_token = None
3757 send_resume_opts = []
3758 if self.dst_dataset_exists[dst_dataset]:
3759 cmd = p.split_args(f"{p.zfs_program} get -Hp -o value -s none receive_resume_token", dst_dataset)
3760 recv_resume_token = self.run_ssh_command(p.dst, log_trace, cmd=cmd).rstrip()
3761 if recv_resume_token == "-" or not recv_resume_token:
3762 recv_resume_token = None
3763 else:
3764 send_resume_opts += ["-n"] if p.dry_run else []
3765 send_resume_opts += ["-v"] if p.verbose_zfs else []
3766 send_resume_opts += ["-t", recv_resume_token]
3767 recv_resume_opts = ["-s"]
3768 return recv_resume_token, send_resume_opts, recv_resume_opts
3770 def mbuffer_cmd(self, loc: str, size_estimate_bytes: int, recordsize: int) -> str:
3771 """If mbuffer command is on the PATH, uses it in the ssh network pipe between 'zfs send' and 'zfs receive' to
3772 smooth out the rate of data flow and prevent bottlenecks caused by network latency or speed fluctuation."""
3773 p = self.params
3774 if (
3775 size_estimate_bytes >= p.min_pipe_transfer_size
3776 and (
3777 (loc == "src" and (p.src.is_nonlocal or p.dst.is_nonlocal))
3778 or (loc == "dst" and (p.src.is_nonlocal or p.dst.is_nonlocal))
3779 or (loc == "local" and p.src.is_nonlocal and p.dst.is_nonlocal)
3780 )
3781 and self.is_program_available("mbuffer", loc)
3782 ):
3783 recordsize = max(recordsize, 128 * 1024 if self.is_solaris_zfs_location(loc) else 2 * 1024 * 1024)
3784 return shlex.join([p.mbuffer_program, "-s", str(recordsize)] + p.mbuffer_program_opts)
3785 else:
3786 return "cat"
3788 def compress_cmd(self, loc: str, size_estimate_bytes: int) -> str:
3789 """If zstd command is on the PATH, uses it in the ssh network pipe between 'zfs send' and 'zfs receive' to
3790 reduce network bottlenecks by sending compressed data."""
3791 p = self.params
3792 if (
3793 size_estimate_bytes >= p.min_pipe_transfer_size
3794 and (p.src.is_nonlocal or p.dst.is_nonlocal)
3795 and self.is_program_available("zstd", loc)
3796 ):
3797 return shlex.join([p.compression_program] + p.compression_program_opts)
3798 else:
3799 return "cat"
3801 def decompress_cmd(self, loc: str, size_estimate_bytes: int) -> str:
3802 p = self.params
3803 if (
3804 size_estimate_bytes >= p.min_pipe_transfer_size
3805 and (p.src.is_nonlocal or p.dst.is_nonlocal)
3806 and self.is_program_available("zstd", loc)
3807 ):
3808 return shlex.join([p.compression_program, "-dc"])
3809 else:
3810 return "cat"
3812 worker_thread_number_regex: re.Pattern = re.compile(r"ThreadPoolExecutor-\d+_(\d+)")
3814 def pv_cmd(
3815 self, loc: str, size_estimate_bytes: int, size_estimate_human: str, disable_progress_bar: bool = False
3816 ) -> str:
3817 """If pv command is on the PATH, monitors the progress of data transfer from 'zfs send' to 'zfs receive'.
3818 Progress can be viewed via "tail -f $pv_log_file" aka tail -f ~/bzfs-logs/current.pv or similar."""
3819 p = self.params
3820 if self.is_program_available("pv", loc):
3821 size = f"--size={size_estimate_bytes}"
3822 if disable_progress_bar or size_estimate_bytes == 0:
3823 size = ""
3824 pv_log_file = p.log_params.pv_log_file
3825 thread_name = threading.current_thread().name
3826 if match := Job.worker_thread_number_regex.fullmatch(thread_name):
3827 worker = int(match.group(1))
3828 if worker > 0:
3829 pv_log_file += pv_file_thread_separator + f"{worker:04}"
3830 if self.is_first_replication_task.get_and_set(False):
3831 if self.isatty and not p.quiet:
3832 self.progress_reporter.start()
3833 self.replication_start_time_nanos = time.monotonic_ns()
3834 if self.isatty and not p.quiet:
3835 self.progress_reporter.enqueue_pv_log_file(pv_log_file)
3836 pv_program_opts = [p.pv_program] + p.pv_program_opts
3837 if self.progress_update_intervals is not None: # for testing
3838 pv_program_opts += [f"--interval={self.progress_update_intervals[0]}"]
3839 pv_program_opts += ["--force", f"--name={size_estimate_human}"] + ([size] if size else [])
3840 return f"LC_ALL=C {shlex.join(pv_program_opts)} 2>> {shlex.quote(pv_log_file)}"
3841 else:
3842 return "cat"
3844 def run_ssh_command(
3845 self,
3846 remote: Remote,
3847 level: int = -1,
3848 is_dry: bool = False,
3849 check: bool = True,
3850 print_stdout: bool = False,
3851 print_stderr: bool = True,
3852 cmd: Optional[List[str]] = None,
3853 ) -> str:
3854 """Runs the given cmd via ssh on the given remote, and returns stdout. The full command is the concatenation
3855 of both the command to run on the localhost in order to talk to the remote host ($remote.local_ssh_command())
3856 and the command to run on the given remote host ($cmd)."""
3857 level = level if level >= 0 else logging.INFO
3858 assert cmd is not None and isinstance(cmd, list) and len(cmd) > 0
3859 p, log = self.params, self.params.log
3860 quoted_cmd = [shlex.quote(arg) for arg in cmd]
3861 conn_pool: ConnectionPool = p.connection_pools[remote.location].pool(SHARED)
3862 conn: Connection = conn_pool.get_connection()
3863 try:
3864 ssh_cmd: List[str] = conn.ssh_cmd
3865 if remote.ssh_user_host != "":
3866 self.refresh_ssh_connection_if_necessary(remote, conn)
3867 cmd = quoted_cmd
3868 msg = "Would execute: %s" if is_dry else "Executing: %s"
3869 log.log(level, msg, list_formatter(conn.ssh_cmd_quoted + quoted_cmd, lstrip=True))
3870 if is_dry:
3871 return ""
3872 try:
3873 process = subprocess_run(
3874 ssh_cmd + cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE, text=True, timeout=self.timeout(), check=check
3875 )
3876 except (subprocess.CalledProcessError, subprocess.TimeoutExpired, UnicodeDecodeError) as e:
3877 if not isinstance(e, UnicodeDecodeError): 3877 ↛ 3880line 3877 didn't jump to line 3880 because the condition on line 3877 was always true
3878 xprint(log, stderr_to_str(e.stdout), run=print_stdout, end="")
3879 xprint(log, stderr_to_str(e.stderr), run=print_stderr, end="")
3880 raise
3881 else:
3882 xprint(log, process.stdout, run=print_stdout, end="")
3883 xprint(log, process.stderr, run=print_stderr, end="")
3884 return process.stdout # type: ignore[no-any-return] # need to ignore on python <= 3.8
3885 finally:
3886 conn_pool.return_connection(conn)
3888 def try_ssh_command(
3889 self,
3890 remote: Remote,
3891 level: int,
3892 is_dry: bool = False,
3893 print_stdout: bool = False,
3894 cmd: Optional[List[str]] = None,
3895 exists: bool = True,
3896 error_trigger: Optional[str] = None,
3897 ) -> Optional[str]:
3898 """Convenience method that helps retry/react to a dataset or pool that potentially doesn't exist anymore."""
3899 assert cmd is not None and isinstance(cmd, list) and len(cmd) > 0
3900 log = self.params.log
3901 try:
3902 self.maybe_inject_error(cmd=cmd, error_trigger=error_trigger)
3903 return self.run_ssh_command(remote, level=level, is_dry=is_dry, print_stdout=print_stdout, cmd=cmd)
3904 except (subprocess.CalledProcessError, UnicodeDecodeError) as e:
3905 if not isinstance(e, UnicodeDecodeError): 3905 ↛ 3915line 3905 didn't jump to line 3915 because the condition on line 3905 was always true
3906 stderr = stderr_to_str(e.stderr)
3907 if exists and (
3908 ": dataset does not exist" in stderr
3909 or ": filesystem does not exist" in stderr # solaris 11.4.0
3910 or ": does not exist" in stderr # solaris 11.4.0 'zfs send' with missing snapshot
3911 or ": no such pool" in stderr
3912 ):
3913 return None
3914 log.warning("%s", stderr.rstrip())
3915 raise RetryableError("Subprocess failed") from e
3917 def refresh_ssh_connection_if_necessary(self, remote: Remote, conn: "Connection") -> None:
3918 p, log = self.params, self.params.log
3919 if remote.ssh_user_host == "":
3920 return # we're in local mode; no ssh required
3921 if not self.is_program_available("ssh", "local"):
3922 die(f"{p.ssh_program} CLI is not available to talk to remote host. Install {p.ssh_program} first!")
3923 if not remote.reuse_ssh_connection:
3924 return
3925 # Performance: reuse ssh connection for low latency startup of frequent ssh invocations via the 'ssh -S' and
3926 # 'ssh -S -M -oControlPersist=60s' options. See https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Multiplexing
3927 control_persist_limit_nanos = (self.control_persist_secs - self.control_persist_margin_secs) * 1_000_000_000
3928 with conn.lock:
3929 if time.monotonic_ns() - conn.last_refresh_time < control_persist_limit_nanos:
3930 return # ssh master is alive, reuse its TCP connection (this is the common case & the ultra-fast path)
3931 ssh_cmd = conn.ssh_cmd
3932 ssh_socket_cmd = ssh_cmd[0:-1] # omit trailing ssh_user_host
3933 ssh_socket_cmd += ["-O", "check", remote.ssh_user_host]
3934 # extend lifetime of ssh master by $control_persist_secs via 'ssh -O check' if master is still running.
3935 # 'ssh -S /path/to/socket -O check' doesn't talk over the network, hence is still a low latency fast path.
3936 t = self.timeout()
3937 if subprocess_run(ssh_socket_cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE, text=True, timeout=t).returncode == 0:
3938 log.log(log_trace, "ssh connection is alive: %s", list_formatter(ssh_socket_cmd))
3939 else: # ssh master is not alive; start a new master:
3940 log.log(log_trace, "ssh connection is not yet alive: %s", list_formatter(ssh_socket_cmd))
3941 ssh_socket_cmd = ssh_cmd[0:-1] # omit trailing ssh_user_host
3942 ssh_socket_cmd += ["-M", f"-oControlPersist={self.control_persist_secs}s", remote.ssh_user_host, "exit"]
3943 log.log(log_trace, "Executing: %s", list_formatter(ssh_socket_cmd))
3944 process = subprocess_run(ssh_socket_cmd, stdin=DEVNULL, stderr=PIPE, text=True, timeout=self.timeout())
3945 if process.returncode != 0:
3946 log.error("%s", process.stderr.rstrip())
3947 die(
3948 f"Cannot ssh into remote host via '{' '.join(ssh_socket_cmd)}'. Fix ssh configuration "
3949 f"first, considering diagnostic log file output from running {prog_name} with: "
3950 "-v -v --ssh-src-extra-opts='-v -v' --ssh-dst-extra-opts='-v -v'"
3951 )
3952 conn.last_refresh_time = time.monotonic_ns()
3954 def timeout(self) -> Optional[float]:
3955 """Raises TimeoutExpired if necessary, else returns the number of seconds left until timeout is to occur."""
3956 timeout_nanos = self.timeout_nanos
3957 if timeout_nanos is None:
3958 return None # never raise a timeout
3959 delta_nanos = timeout_nanos - time.monotonic_ns()
3960 if delta_nanos <= 0:
3961 assert self.params.timeout_nanos is not None
3962 raise subprocess.TimeoutExpired(prog_name + "_timeout", timeout=self.params.timeout_nanos / 1_000_000_000)
3963 return delta_nanos / 1_000_000_000 # seconds
3965 def maybe_inject_error(self, cmd: List[str], error_trigger: Optional[str] = None) -> None:
3966 """For testing only; for unit tests to simulate errors during replication and test correct handling of them."""
3967 if error_trigger:
3968 counter = self.error_injection_triggers.get("before")
3969 if counter and self.decrement_injection_counter(counter, error_trigger):
3970 try:
3971 raise CalledProcessError(returncode=1, cmd=" ".join(cmd), stderr=error_trigger + ":dataset is busy")
3972 except subprocess.CalledProcessError as e:
3973 if error_trigger.startswith("retryable_"):
3974 raise RetryableError("Subprocess failed") from e
3975 else:
3976 raise
3978 def maybe_inject_delete(self, remote: Remote, dataset: str, delete_trigger: str) -> None:
3979 """For testing only; for unit tests to delete datasets during replication and test correct handling of that."""
3980 assert delete_trigger
3981 counter = self.delete_injection_triggers.get("before")
3982 if counter and self.decrement_injection_counter(counter, delete_trigger):
3983 p = self.params
3984 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} destroy -r", p.force_unmount, p.force_hard, dataset or "")
3985 self.run_ssh_command(remote, log_debug, print_stdout=True, cmd=cmd)
3987 def maybe_inject_params(self, error_trigger: str) -> None:
3988 """For testing only; for unit tests to simulate errors during replication and test correct handling of them."""
3989 assert error_trigger
3990 counter = self.error_injection_triggers.get("before")
3991 if counter and self.decrement_injection_counter(counter, error_trigger):
3992 self.inject_params = self.param_injection_triggers[error_trigger]
3993 elif error_trigger in self.param_injection_triggers:
3994 self.inject_params = {}
3996 def decrement_injection_counter(self, counter: Counter, trigger: str) -> bool:
3997 """For testing only."""
3998 with self.injection_lock:
3999 if counter[trigger] <= 0:
4000 return False
4001 counter[trigger] -= 1
4002 return True
4004 @staticmethod
4005 def squote(remote: Remote, arg: str) -> str:
4006 return arg if remote.ssh_user_host == "" else shlex.quote(arg)
4008 @staticmethod
4009 def dquote(arg: str) -> str:
4010 """shell-escapes double quotes and dollar and backticks, then surrounds with double quotes."""
4011 return '"' + arg.replace('"', '\\"').replace("$", "\\$").replace("`", "\\`") + '"'
4013 def filter_datasets(self, remote: Remote, sorted_datasets: List[str]) -> List[str]:
4014 """Returns all datasets (and their descendants) that match at least one of the include regexes but none of the
4015 exclude regexes. Assumes the list of input datasets is sorted. The list of output datasets will be sorted too."""
4016 p, log = self.params, self.params.log
4017 results = []
4018 for i, dataset in enumerate(sorted_datasets):
4019 if i == 0 and p.skip_parent:
4020 continue
4021 rel_dataset = relativize_dataset(dataset, remote.root_dataset)
4022 if rel_dataset.startswith("/"):
4023 rel_dataset = rel_dataset[1:] # strip leading '/' char if any
4024 if is_included(rel_dataset, p.include_dataset_regexes, p.exclude_dataset_regexes):
4025 results.append(dataset)
4026 log.debug("Including b/c dataset regex: %s", dataset)
4027 else:
4028 log.debug("Excluding b/c dataset regex: %s", dataset)
4029 if p.exclude_dataset_property:
4030 results = self.filter_datasets_by_exclude_property(remote, results)
4031 is_debug = p.log.isEnabledFor(log_debug)
4032 for dataset in results:
4033 if is_debug:
4034 log.debug("Finally included %s dataset: %s", remote.location, dataset)
4035 if self.is_test_mode:
4036 # Asserts the following: If a dataset is excluded its descendants are automatically excluded too, and this
4037 # decision is never reconsidered even for the descendants because exclude takes precedence over include.
4038 resultset = set(results)
4039 root_datasets = [dataset for dataset in results if os.path.dirname(dataset) not in resultset] # have no parent
4040 for root in root_datasets: # each root is not a descendant of another dataset
4041 assert not any(is_descendant(root, of_root_dataset=dataset) for dataset in results if dataset != root)
4042 for dataset in results: # each dataset belongs to a subtree rooted at one of the roots
4043 assert any(is_descendant(dataset, of_root_dataset=root) for root in root_datasets)
4044 return results
4046 def filter_datasets_by_exclude_property(self, remote: Remote, sorted_datasets: List[str]) -> List[str]:
4047 """Excludes datasets that are marked with a ZFS user property value that, in effect, says 'skip me'."""
4048 p, log = self.params, self.params.log
4049 results = []
4050 localhostname = None
4051 skip_dataset = DONT_SKIP_DATASET
4052 for dataset in sorted_datasets:
4053 if is_descendant(dataset, of_root_dataset=skip_dataset):
4054 # skip_dataset shall be ignored or has been deleted by some third party while we're running
4055 continue # nothing to do anymore for this dataset subtree (note that datasets is sorted)
4056 skip_dataset = DONT_SKIP_DATASET
4057 # TODO perf: on zfs >= 2.3 use json via zfs list -j to safely merge all zfs list's into one 'zfs list' call
4058 cmd = p.split_args(f"{p.zfs_program} list -t filesystem,volume -Hp -o {p.exclude_dataset_property}", dataset)
4059 self.maybe_inject_delete(remote, dataset=dataset, delete_trigger="zfs_list_exclude_property")
4060 property_value = self.try_ssh_command(remote, log_trace, cmd=cmd)
4061 if property_value is None:
4062 log.warning(f"Third party deleted {remote.location}: %s", dataset)
4063 skip_dataset = dataset
4064 else:
4065 reason = ""
4066 property_value = property_value.strip()
4067 if not property_value or property_value == "-" or property_value.lower() == "true":
4068 sync = True
4069 elif property_value.lower() == "false":
4070 sync = False
4071 else:
4072 localhostname = localhostname or socket.gethostname()
4073 sync = any(localhostname == hostname.strip() for hostname in property_value.split(","))
4074 reason = f", localhostname: {localhostname}, hostnames: {property_value}"
4076 if sync:
4077 results.append(dataset)
4078 log.debug("Including b/c dataset prop: %s%s", dataset, reason)
4079 else:
4080 skip_dataset = dataset
4081 log.debug("Excluding b/c dataset prop: %s%s", dataset, reason)
4082 return results
4084 def filter_snapshots(self, basis_snapshots: List[str], all_except: bool = False) -> List[str]:
4085 """Returns all snapshots that pass all include/exclude policies.
4086 `all_except=False` returns snapshots *matching* the filters,
4087 for example those that should be deleted if we are in "delete selected" mode.
4088 `all_except=True` returns snapshots *not* matching the filters,
4089 for example those that should be deleted if we are in "retain selected" mode."""
4091 def resolve_timerange(timerange: UnixTimeRange) -> UnixTimeRange:
4092 assert timerange is not None
4093 lo, hi = timerange
4094 if isinstance(lo, timedelta):
4095 lo = math.ceil(current_unixtime_in_secs - lo.total_seconds())
4096 if isinstance(hi, timedelta):
4097 hi = math.ceil(current_unixtime_in_secs - hi.total_seconds())
4098 assert isinstance(lo, int)
4099 assert isinstance(hi, int)
4100 return (lo, hi) if lo <= hi else (hi, lo)
4102 p, log = self.params, self.params.log
4103 current_unixtime_in_secs: float = p.create_src_snapshots_config.current_datetime.timestamp()
4104 resultset = set()
4105 for snapshot_filter in p.snapshot_filters:
4106 snapshots = basis_snapshots
4107 for _filter in snapshot_filter:
4108 name = _filter.name
4109 if name == snapshot_regex_filter_name:
4110 snapshots = self.filter_snapshots_by_regex(snapshots, regexes=_filter.options)
4111 elif name == "include_snapshot_times":
4112 timerange = resolve_timerange(_filter.timerange) if _filter.timerange is not None else _filter.timerange
4113 snapshots = self.filter_snapshots_by_creation_time(snapshots, include_snapshot_times=timerange)
4114 else:
4115 assert name == "include_snapshot_times_and_ranks"
4116 timerange = resolve_timerange(_filter.timerange) if _filter.timerange is not None else _filter.timerange
4117 snapshots = self.filter_snapshots_by_creation_time_and_rank(
4118 snapshots, include_snapshot_times=timerange, include_snapshot_ranks=_filter.options
4119 )
4120 resultset.update(snapshots) # union
4121 snapshots = [line for line in basis_snapshots if "#" in line or ((line in resultset) != all_except)]
4122 is_debug = log.isEnabledFor(log_debug)
4123 for snapshot in snapshots:
4124 if is_debug:
4125 log.debug("Finally included snapshot: %s", snapshot[snapshot.rindex("\t") + 1 :])
4126 return snapshots
4128 def filter_snapshots_by_regex(self, snapshots: List[str], regexes: Tuple[RegexList, RegexList]) -> List[str]:
4129 """Returns all snapshots that match at least one of the include regexes but none of the exclude regexes."""
4130 exclude_snapshot_regexes, include_snapshot_regexes = regexes
4131 log = self.params.log
4132 is_debug = log.isEnabledFor(log_debug)
4133 results = []
4134 for snapshot in snapshots:
4135 i = snapshot.find("@") # snapshot separator
4136 if i < 0:
4137 continue # retain bookmarks to help find common snapshots, apply filter only to snapshots
4138 elif is_included(snapshot[i + 1 :], include_snapshot_regexes, exclude_snapshot_regexes):
4139 results.append(snapshot)
4140 if is_debug:
4141 log.debug("Including b/c snapshot regex: %s", snapshot[snapshot.rindex("\t") + 1 :])
4142 else:
4143 if is_debug:
4144 log.debug("Excluding b/c snapshot regex: %s", snapshot[snapshot.rindex("\t") + 1 :])
4145 return results
4147 def filter_snapshots_by_creation_time(self, snapshots: List[str], include_snapshot_times: UnixTimeRange) -> List[str]:
4148 log = self.params.log
4149 is_debug = log.isEnabledFor(log_debug)
4150 lo_snaptime, hi_snaptime = include_snapshot_times or (0, unixtime_infinity_secs)
4151 assert isinstance(lo_snaptime, int)
4152 assert isinstance(hi_snaptime, int)
4153 results = []
4154 for snapshot in snapshots:
4155 if "@" not in snapshot:
4156 continue # retain bookmarks to help find common snapshots, apply filter only to snapshots
4157 elif lo_snaptime <= int(snapshot[0 : snapshot.index("\t")]) < hi_snaptime:
4158 results.append(snapshot)
4159 if is_debug:
4160 log.debug("Including b/c creation time: %s", snapshot[snapshot.rindex("\t") + 1 :])
4161 else:
4162 if is_debug:
4163 log.debug("Excluding b/c creation time: %s", snapshot[snapshot.rindex("\t") + 1 :])
4164 return results
4166 def filter_snapshots_by_creation_time_and_rank(
4167 self, snapshots: List[str], include_snapshot_times: UnixTimeRange, include_snapshot_ranks: List[RankRange]
4168 ) -> List[str]:
4170 def get_idx(rank: Tuple[str, int, bool], n: int) -> int:
4171 kind, num, is_percent = rank
4172 m = round(n * num / 100) if is_percent else min(n, num)
4173 assert kind == "latest" or kind == "oldest"
4174 return m if kind == "oldest" else n - m
4176 assert isinstance(include_snapshot_ranks, list)
4177 assert len(include_snapshot_ranks) > 0
4178 log = self.params.log
4179 is_debug = log.isEnabledFor(log_debug)
4180 lo_time, hi_time = include_snapshot_times or (0, unixtime_infinity_secs)
4181 assert isinstance(lo_time, int)
4182 assert isinstance(hi_time, int)
4183 n = sum(1 for snapshot in snapshots if "@" in snapshot)
4184 for rank_range in include_snapshot_ranks:
4185 lo_rank, hi_rank = rank_range
4186 lo = get_idx(lo_rank, n)
4187 hi = get_idx(hi_rank, n)
4188 lo, hi = (lo, hi) if lo <= hi else (hi, lo)
4189 i = 0
4190 results = []
4191 for snapshot in snapshots:
4192 if "@" not in snapshot:
4193 continue # retain bookmarks to help find common snapshots, apply filter only to snapshots
4194 else:
4195 msg = None
4196 if lo <= i < hi:
4197 msg = "Including b/c snapshot rank: %s"
4198 elif lo_time <= int(snapshot[0 : snapshot.index("\t")]) < hi_time:
4199 msg = "Including b/c creation time: %s"
4200 if msg:
4201 results.append(snapshot)
4202 else:
4203 msg = "Excluding b/c snapshot rank: %s"
4204 if is_debug:
4205 log.debug(msg, snapshot[snapshot.rindex("\t") + 1 :])
4206 i += 1
4207 snapshots = results
4208 n = hi - lo
4209 return snapshots
4211 def filter_properties(
4212 self, props: Dict[str, Optional[str]], include_regexes: RegexList, exclude_regexes: RegexList
4213 ) -> Dict[str, Optional[str]]:
4214 """Returns ZFS props whose name matches at least one of the include regexes but none of the exclude regexes."""
4215 log = self.params.log
4216 is_debug = log.isEnabledFor(log_debug)
4217 results: Dict[str, Optional[str]] = {}
4218 for propname, propvalue in props.items():
4219 if is_included(propname, include_regexes, exclude_regexes):
4220 results[propname] = propvalue
4221 if is_debug:
4222 log.debug("Including b/c property regex: %s", propname)
4223 else:
4224 if is_debug:
4225 log.debug("Excluding b/c property regex: %s", propname)
4226 return results
4228 def delete_snapshots(self, remote: Remote, dataset: str, snapshot_tags: List[str]) -> None:
4229 if len(snapshot_tags) == 0:
4230 return
4231 p, log = self.params, self.params.log
4232 log.info(p.dry(f"Deleting {len(snapshot_tags)} snapshots within %s: %s"), dataset, snapshot_tags)
4233 # delete snapshots in batches without creating a command line that's too big for the OS to handle
4234 self.run_ssh_cmd_batched(
4235 remote,
4236 self.delete_snapshot_cmd(remote, dataset + "@"),
4237 snapshot_tags,
4238 lambda batch: self.delete_snapshot(remote, dataset, dataset + "@" + ",".join(batch)),
4239 max_batch_items=1 if self.is_solaris_zfs(remote) else self.params.max_snapshots_per_minibatch_on_delete_snaps,
4240 sep=",",
4241 )
4243 def delete_snapshot(self, r: Remote, dataset: str, snapshots_to_delete: str) -> None:
4244 p = self.params
4245 cmd = self.delete_snapshot_cmd(r, snapshots_to_delete)
4246 is_dry = p.dry_run and self.is_solaris_zfs(r) # solaris-11.4 knows no 'zfs destroy -n' flag
4247 try:
4248 self.maybe_inject_error(cmd=cmd, error_trigger="zfs_delete_snapshot")
4249 self.run_ssh_command(r, log_debug, is_dry=is_dry, print_stdout=True, cmd=cmd)
4250 except (subprocess.CalledProcessError, UnicodeDecodeError) as e:
4251 stderr = stderr_to_str(e.stderr) if hasattr(e, "stderr") else ""
4252 no_sleep = self.clear_resumable_recv_state_if_necessary(dataset, stderr)
4253 # op isn't idempotent so retries regather current state from the start
4254 raise RetryableError("Subprocess failed", no_sleep=no_sleep) from e
4256 def delete_snapshot_cmd(self, r: Remote, snapshots_to_delete: str) -> List[str]:
4257 p = self.params
4258 return p.split_args(
4259 f"{r.sudo} {p.zfs_program} destroy", p.force_hard, p.verbose_destroy, p.dry_run_destroy, snapshots_to_delete
4260 )
4262 def delete_bookmarks(self, remote: Remote, dataset: str, snapshot_tags: List[str]) -> None:
4263 if len(snapshot_tags) == 0:
4264 return
4265 # Unfortunately ZFS has no syntax yet to delete multiple bookmarks in a single CLI invocation
4266 p, log = self.params, self.params.log
4267 log.info(
4268 p.dry(f"Deleting {len(snapshot_tags)} bookmarks within %s: %s"), dataset, dataset + "#" + ",".join(snapshot_tags)
4269 )
4270 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} destroy")
4271 self.run_ssh_cmd_parallel(
4272 remote,
4273 [(cmd, [f"{dataset}#{snapshot_tag}" for snapshot_tag in snapshot_tags])],
4274 lambda _cmd, batch: self.try_ssh_command(
4275 remote, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=_cmd + batch, exists=False
4276 ),
4277 max_batch_items=1,
4278 )
4280 def delete_datasets(self, remote: Remote, datasets: Iterable[str]) -> None:
4281 """Deletes the given datasets via zfs destroy -r on the given remote."""
4282 # Impl is batch optimized to minimize CLI + network roundtrips: only need to run zfs destroy if previously
4283 # destroyed dataset (within sorted datasets) is not a prefix (aka ancestor) of current dataset
4284 p, log = self.params, self.params.log
4285 last_deleted_dataset = DONT_SKIP_DATASET
4286 for dataset in sorted(datasets):
4287 if is_descendant(dataset, of_root_dataset=last_deleted_dataset):
4288 continue
4289 log.info(p.dry("Deleting dataset tree: %s"), f"{dataset} ...")
4290 cmd = p.split_args(
4291 f"{remote.sudo} {p.zfs_program} destroy -r {p.force_unmount} {p.force_hard} {p.verbose_destroy}",
4292 p.dry_run_destroy,
4293 dataset,
4294 )
4295 is_dry = p.dry_run and self.is_solaris_zfs(remote) # solaris-11.4 knows no 'zfs destroy -n' flag
4296 self.run_ssh_command(remote, log_debug, is_dry=is_dry, print_stdout=True, cmd=cmd)
4297 last_deleted_dataset = dataset
4299 def create_zfs_filesystem(self, filesystem: str) -> None:
4300 # zfs create -p -u $filesystem
4301 # To ensure the filesystems that we create do not get mounted, we apply a separate 'zfs create -p -u'
4302 # invocation for each non-existing ancestor. This is because a single 'zfs create -p -u' applies the '-u'
4303 # part only to the immediate filesystem, rather than to the not-yet existing ancestors.
4304 p = self.params
4305 parent = ""
4306 no_mount = "-u" if self.is_program_available(zfs_version_is_at_least_2_1_0, "dst") else ""
4307 for component in filesystem.split("/"):
4308 parent += component
4309 if not self.dst_dataset_exists[parent]:
4310 cmd = p.split_args(f"{p.dst.sudo} {p.zfs_program} create -p", no_mount, parent)
4311 try:
4312 self.run_ssh_command(p.dst, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=cmd)
4313 except subprocess.CalledProcessError as e:
4314 # ignore harmless error caused by 'zfs create' without the -u flag, or by dataset already existing
4315 if ( 4315 ↛ 4321line 4315 didn't jump to line 4321 because the condition on line 4315 was never true
4316 "filesystem successfully created, but it may only be mounted by root" not in e.stderr
4317 and "filesystem successfully created, but not mounted" not in e.stderr # SolarisZFS
4318 and "dataset already exists" not in e.stderr
4319 and "filesystem already exists" not in e.stderr # SolarisZFS?
4320 ):
4321 raise
4322 if not p.dry_run:
4323 self.dst_dataset_exists[parent] = True
4324 parent += "/"
4326 def create_zfs_bookmarks(self, remote: Remote, dataset: str, snapshots: List[str]) -> None:
4327 """Creates bookmarks for the given snapshots, using the 'zfs bookmark' CLI."""
4328 # Unfortunately ZFS has no syntax yet to create multiple bookmarks in a single CLI invocation
4329 p = self.params
4331 def create_zfs_bookmark(cmd: List[str]) -> None:
4332 snapshot = cmd[-1]
4333 assert "@" in snapshot
4334 bookmark_cmd = cmd + [replace_prefix(snapshot, old_prefix=f"{dataset}@", new_prefix=f"{dataset}#")]
4335 try:
4336 self.run_ssh_command(remote, log_debug, is_dry=p.dry_run, print_stderr=False, cmd=bookmark_cmd)
4337 except subprocess.CalledProcessError as e:
4338 # ignore harmless zfs error caused by bookmark with the same name already existing
4339 if ": bookmark exists" not in e.stderr: 4339 ↛ 4340line 4339 didn't jump to line 4340 because the condition on line 4339 was never true
4340 print(e.stderr, file=sys.stderr, end="")
4341 raise
4343 if p.create_bookmarks != "none" and self.are_bookmarks_enabled(remote):
4344 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} bookmark")
4345 self.run_ssh_cmd_parallel(
4346 remote, [(cmd, snapshots)], lambda _cmd, batch: create_zfs_bookmark(_cmd + batch), max_batch_items=1
4347 )
4349 def estimate_send_size(self, remote: Remote, dst_dataset: str, recv_resume_token: Optional[str], *items: str) -> int:
4350 """Estimates num bytes to transfer via 'zfs send'."""
4351 p = self.params
4352 if p.no_estimate_send_size or self.is_solaris_zfs(remote):
4353 return 0 # solaris-11.4 does not have a --parsable equivalent
4354 zfs_send_program_opts = ["--parsable" if opt == "-P" else opt for opt in p.curr_zfs_send_program_opts]
4355 zfs_send_program_opts = append_if_absent(zfs_send_program_opts, "-v", "-n", "--parsable")
4356 if recv_resume_token:
4357 zfs_send_program_opts = ["-Pnv", "-t", recv_resume_token]
4358 items = ()
4359 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} send", zfs_send_program_opts, items)
4360 try:
4361 lines = self.try_ssh_command(remote, log_trace, cmd=cmd)
4362 except RetryableError as retryable_error:
4363 assert retryable_error.__cause__ is not None
4364 if recv_resume_token:
4365 e = retryable_error.__cause__
4366 stderr = stderr_to_str(e.stderr) if hasattr(e, "stderr") else ""
4367 retryable_error.no_sleep = self.clear_resumable_recv_state_if_necessary(dst_dataset, stderr)
4368 # op isn't idempotent so retries regather current state from the start of replicate_dataset()
4369 raise retryable_error
4370 if lines is None: 4370 ↛ 4371line 4370 didn't jump to line 4371 because the condition on line 4370 was never true
4371 return 0 # src dataset or snapshot has been deleted by third party
4372 size = lines.splitlines()[-1]
4373 assert size.startswith("size")
4374 return int(size[size.index("\t") + 1 :])
4376 def dataset_regexes(self, datasets: List[str]) -> List[str]:
4377 src, dst = self.params.src, self.params.dst
4378 results = []
4379 for dataset in datasets:
4380 if dataset.startswith("/"):
4381 # it's an absolute dataset - convert it to a relative dataset
4382 dataset = dataset[1:]
4383 if is_descendant(dataset, of_root_dataset=src.root_dataset):
4384 dataset = relativize_dataset(dataset, src.root_dataset)
4385 elif is_descendant(dataset, of_root_dataset=dst.root_dataset):
4386 dataset = relativize_dataset(dataset, dst.root_dataset)
4387 else:
4388 continue # ignore datasets that make no difference
4389 if dataset.startswith("/"):
4390 dataset = dataset[1:]
4391 if dataset.endswith("/"):
4392 dataset = dataset[0:-1]
4393 if dataset:
4394 regex = re.escape(dataset)
4395 else:
4396 regex = ".*"
4397 results.append(regex)
4398 return results
4400 TRetryRun = TypeVar("TRetryRun")
4402 def run_with_retries(self, policy: RetryPolicy, fn: Callable[..., TRetryRun], *args: Any, **kwargs: Any) -> TRetryRun:
4403 """Runs the given function with the given arguments, and retries on failure as indicated by policy."""
4404 log = self.params.log
4405 max_sleep_mark = policy.min_sleep_nanos
4406 retry_count = 0
4407 sysrandom = None
4408 start_time_nanos = time.monotonic_ns()
4409 while True:
4410 try:
4411 return fn(*args, **kwargs, retry=Retry(retry_count)) # Call the target function with provided args
4412 except RetryableError as retryable_error:
4413 elapsed_nanos = time.monotonic_ns() - start_time_nanos
4414 if retry_count < policy.retries and elapsed_nanos < policy.max_elapsed_nanos:
4415 retry_count += 1
4416 if retryable_error.no_sleep and retry_count <= 1:
4417 log.info(f"Retrying [{retry_count}/{policy.retries}] immediately ...")
4418 else: # jitter: pick a random sleep duration within the range [min_sleep_nanos, max_sleep_mark] as delay
4419 sysrandom = random.SystemRandom() if sysrandom is None else sysrandom
4420 sleep_nanos = sysrandom.randint(policy.min_sleep_nanos, max_sleep_mark)
4421 log.info(f"Retrying [{retry_count}/{policy.retries}] in {human_readable_duration(sleep_nanos)} ...")
4422 time.sleep(sleep_nanos / 1_000_000_000)
4423 max_sleep_mark = min(policy.max_sleep_nanos, 2 * max_sleep_mark) # exponential backoff with cap
4424 else:
4425 if policy.retries > 0:
4426 log.warning(
4427 f"Giving up because the last [{retry_count}/{policy.retries}] retries across "
4428 f"[{elapsed_nanos // 1_000_000_000}/{policy.max_elapsed_nanos // 1_000_000_000}] "
4429 "seconds for the current request failed!"
4430 )
4431 assert retryable_error.__cause__ is not None
4432 raise retryable_error.__cause__ from None
4434 def incremental_send_steps_wrapper(
4435 self, src_snapshots: List[str], src_guids: List[str], included_guids: Set[str], is_resume: bool
4436 ) -> List[Tuple[str, str, str, List[str]]]:
4437 force_convert_I_to_i = self.params.src.use_zfs_delegation and not getenv_bool("no_force_convert_I_to_i", True)
4438 # force_convert_I_to_i == True implies that:
4439 # If using 'zfs allow' delegation mechanism, force convert 'zfs send -I' to a series of
4440 # 'zfs send -i' as a workaround for zfs issue https://github.com/openzfs/zfs/issues/16394
4441 return self.incremental_send_steps(src_snapshots, src_guids, included_guids, is_resume, force_convert_I_to_i)
4443 @staticmethod
4444 def incremental_send_steps(
4445 src_snapshots: List[str], src_guids: List[str], included_guids: Set[str], is_resume: bool, force_convert_I_to_i: bool
4446 ) -> List[Tuple[str, str, str, List[str]]]:
4447 """Computes steps to incrementally replicate the given src snapshots with the given src_guids such that we
4448 include intermediate src snapshots that pass the policy specified by --{include,exclude}-snapshot-*
4449 (represented here by included_guids), using an optimal series of -i/-I send/receive steps that skip
4450 excluded src snapshots. The steps are optimal in the sense that no solution with fewer steps exists. A step
4451 corresponds to a single ZFS send/receive operation. Fewer steps translate to better performance, especially
4452 when sending many small snapshots. For example, 1 step that sends 100 small snapshots in a single operation is
4453 much faster than 100 steps that each send only 1 such snapshot per ZFS send/receive operation.
4454 Example: skip hourly snapshots and only include daily shapshots for replication
4455 Example: [d1, h1, d2, d3, d4] (d is daily, h is hourly) --> [d1, d2, d3, d4] via
4456 -i d1:d2 (i.e. exclude h1; '-i' and ':' indicate 'skip intermediate snapshots')
4457 -I d2-d4 (i.e. also include d3; '-I' and '-' indicate 'include intermediate snapshots')
4458 * The force_convert_I_to_i param is necessary as a work-around for https://github.com/openzfs/zfs/issues/16394
4459 * The 'zfs send' CLI with a bookmark as starting snapshot does not (yet) support including intermediate
4460 src_snapshots via -I flag per https://github.com/openzfs/zfs/issues/12415. Thus, if the replication source
4461 is a bookmark we convert a -I step to a -i step followed by zero or more -i/-I steps.
4462 * The is_resume param is necessary as 'zfs send -t' does not support sending more than a single snapshot
4463 on resuming a previously interrupted 'zfs receive -s'. Thus, here too, we convert a -I step to a -i step
4464 followed by zero or more -i/-I steps."""
4466 def append_run(i: int, label: str) -> int:
4467 # step = ("-I", src_snapshots[start], src_snapshots[i], i - start)
4468 # print(f"{label} {self.send_step_to_str(step)}")
4469 is_not_resume = len(steps) > 0 or not is_resume
4470 if i - start > 1 and (not force_convert_I_to_i) and "@" in src_snapshots[start] and is_not_resume:
4471 steps.append(("-I", src_snapshots[start], src_snapshots[i], src_snapshots[start + 1 : i + 1]))
4472 elif "@" in src_snapshots[start] and is_not_resume:
4473 for j in range(start, i): # convert -I step to -i steps
4474 steps.append(("-i", src_snapshots[j], src_snapshots[j + 1], src_snapshots[j + 1 : j + 2]))
4475 else: # it's a bookmark src or zfs send -t; convert -I step to -i step followed by zero or more -i/-I steps
4476 steps.append(("-i", src_snapshots[start], src_snapshots[start + 1], src_snapshots[start + 1 : start + 2]))
4477 i = start + 1
4478 return i - 1
4480 assert len(src_guids) == len(src_snapshots)
4481 assert len(included_guids) >= 0
4482 steps = []
4483 guids = src_guids
4484 n = len(guids)
4485 i = 0
4486 while i < n and guids[i] not in included_guids: # skip hourlies
4487 i += 1
4489 while i < n:
4490 assert guids[i] in included_guids # it's a daily
4491 start = i
4492 i += 1
4493 while i < n and guids[i] in included_guids: # skip dailies
4494 i += 1
4495 if i < n:
4496 if i - start == 1:
4497 # it's a single daily (that was already replicated) followed by an hourly
4498 i += 1
4499 while i < n and guids[i] not in included_guids: # skip hourlies
4500 i += 1
4501 if i < n:
4502 assert start != i
4503 step = ("-i", src_snapshots[start], src_snapshots[i], src_snapshots[i : i + 1])
4504 # print(f"r1 {self.send_step_to_str(step)}")
4505 steps.append(step)
4506 i -= 1
4507 else: # it's a run of more than one daily
4508 i -= 1
4509 assert start != i
4510 i = append_run(i, "r2")
4511 else: # finish up run of trailing dailies
4512 i -= 1
4513 if start != i:
4514 i = append_run(i, "r3")
4515 i += 1
4516 return steps
4518 @staticmethod
4519 def send_step_to_str(step: Tuple[str, str, str]) -> str:
4520 # return str(step[1]) + ('-' if step[0] == '-I' else ':') + str(step[2])
4521 return str(step)
4523 def zfs_set(self, properties: List[str], remote: Remote, dataset: str) -> None:
4524 """Applies the given property key=value pairs via 'zfs set' CLI to the given dataset on the given remote."""
4525 p = self.params
4526 if len(properties) == 0:
4527 return
4528 # set properties in batches without creating a command line that's too big for the OS to handle
4529 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} set")
4530 self.run_ssh_cmd_batched(
4531 remote,
4532 cmd,
4533 properties,
4534 lambda batch: self.run_ssh_command(
4535 remote, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=cmd + batch + [dataset]
4536 ),
4537 max_batch_items=1 if self.is_solaris_zfs(remote) else 2**29, # solaris-11.4 CLI doesn't accept multiple props
4538 )
4540 def zfs_get(
4541 self,
4542 remote: Remote,
4543 dataset: str,
4544 sources: str,
4545 output_columns: str,
4546 propnames: str,
4547 splitlines: bool,
4548 props_cache: Dict[Tuple[str, str, str], Dict[str, Optional[str]]],
4549 ) -> Dict[str, Optional[str]]:
4550 """Returns the results of 'zfs get' CLI on the given dataset on the given remote."""
4551 if not propnames:
4552 return {}
4553 p = self.params
4554 cache_key = (sources, output_columns, propnames)
4555 props = props_cache.get(cache_key)
4556 if props is None:
4557 cmd = p.split_args(f"{p.zfs_program} get -Hp -o {output_columns} -s {sources} {propnames}", dataset)
4558 lines = self.run_ssh_command(remote, log_trace, cmd=cmd)
4559 is_name_value_pair = "," in output_columns
4560 props = {}
4561 # if not splitlines: omit single trailing newline that was appended by 'zfs get' CLI
4562 for line in lines.splitlines() if splitlines else [lines[0:-1]]:
4563 if is_name_value_pair:
4564 propname, propvalue = line.split("\t", 1)
4565 props[propname] = propvalue
4566 else:
4567 props[line] = None
4568 props_cache[cache_key] = props
4569 return props
4571 def add_recv_property_options(
4572 self,
4573 full_send: bool,
4574 recv_opts: List[str],
4575 dataset: str,
4576 cache: Dict[Tuple[str, str, str], Dict[str, Optional[str]]],
4577 ) -> Tuple[List[str], List[str]]:
4578 """Reads the ZFS properties of the given src dataset. Appends zfs recv -o and -x values to recv_opts according to CLI
4579 params, and returns properties to explicitly set on the dst dataset after 'zfs receive' completes successfully."""
4580 p = self.params
4581 set_opts = []
4582 ox_names = p.zfs_recv_ox_names.copy()
4583 for config in [p.zfs_recv_o_config, p.zfs_recv_x_config, p.zfs_set_config]:
4584 if len(config.include_regexes) == 0:
4585 continue # this is the default - it's an instant noop
4586 if (full_send and "full" in config.targets) or (not full_send and "incremental" in config.targets):
4587 # 'zfs get' uses newline as record separator and tab as separator between output columns. A ZFS user property
4588 # may contain newline and tab characters (indeed anything). Together, this means that there is no reliable
4589 # way to determine where a record ends and the next record starts when listing multiple arbitrary records in
4590 # a single 'zfs get' call. Therefore, here we use a separate 'zfs get' call for each ZFS user property.
4591 # TODO: perf: on zfs >= 2.3 use json via zfs get -j to safely merge all zfs gets into one 'zfs get' call
4592 try:
4593 props_any = self.zfs_get(p.src, dataset, config.sources, "property", "all", True, cache)
4594 props_filtered = self.filter_properties(props_any, config.include_regexes, config.exclude_regexes)
4595 user_propnames = [name for name in props_filtered.keys() if ":" in name]
4596 sys_propnames = ",".join([name for name in props_filtered.keys() if ":" not in name])
4597 props = self.zfs_get(p.src, dataset, config.sources, "property,value", sys_propnames, True, cache)
4598 for propnames in user_propnames:
4599 props.update(self.zfs_get(p.src, dataset, config.sources, "property,value", propnames, False, cache))
4600 except (subprocess.CalledProcessError, UnicodeDecodeError) as e:
4601 raise RetryableError("Subprocess failed") from e
4602 for propname in sorted(props.keys()):
4603 if config is p.zfs_recv_o_config:
4604 if propname not in ox_names:
4605 recv_opts.append("-o")
4606 recv_opts.append(f"{propname}={props[propname]}")
4607 ox_names.add(propname)
4608 elif config is p.zfs_recv_x_config:
4609 if propname not in ox_names:
4610 recv_opts.append("-x")
4611 recv_opts.append(propname)
4612 ox_names.add(propname)
4613 else:
4614 set_opts.append(f"{propname}={props[propname]}")
4615 return recv_opts, set_opts
4617 @staticmethod
4618 def recv_option_property_names(recv_opts: List[str]) -> Set[str]:
4619 """Extracts -o and -x property names that are already specified on the command line. This can be used to check
4620 for dupes because 'zfs receive' does not accept multiple -o or -x options with the same property name."""
4621 propnames = set()
4622 i = 0
4623 n = len(recv_opts)
4624 while i < n:
4625 stripped = recv_opts[i].strip()
4626 if stripped in ("-o", "-x"):
4627 i += 1
4628 if i == n or recv_opts[i].strip() in ("-o", "-x"):
4629 die(f"Missing value for {stripped} option in --zfs-recv-program-opt(s): {' '.join(recv_opts)}")
4630 propnames.add(recv_opts[i] if stripped == "-x" else recv_opts[i].split("=", 1)[0])
4631 i += 1
4632 return propnames
4634 def root_datasets_if_recursive_zfs_snapshot_is_possible(
4635 self, datasets: List[str], basis_datasets: List[str]
4636 ) -> Optional[List[str]]:
4637 """Returns the root datasets within the (filtered) `datasets` list if no incompatible pruning is detected. A dataset
4638 within `datasets` is considered a root dataset if it has no parent, i.e. it is not a descendant of any dataset in
4639 `datasets`. Returns `None` if any (unfiltered) dataset in `basis_dataset` that is a descendant of at least one of
4640 the root datasets is missing in `datasets`, indicating that --include/exclude-dataset* or the snapshot schedule
4641 have pruned a dataset in a way that is incompatible with 'zfs snapshot -r' CLI semantics, thus requiring a switch
4642 to the non-recursive 'zfs snapshot snapshot1 .. snapshot N' CLI flavor.
4643 Assumes that set(datasets).issubset(set(basis_datasets)). Also assumes that datasets and basis_datasets are both
4644 sorted (and thus the output root_datasets is sorted too), which is why this algorithm is efficient - O(N) time
4645 complexity. The impl is akin to the merge algorithm of a merge sort, adapted to our specific use case.
4646 See root_datasets_if_recursive_zfs_snapshot_is_possible_slow_but_correct() in the unit test suite for an alternative
4647 impl that's easier to grok."""
4648 datasets_set: Set[str] = set(datasets)
4649 root_datasets: List[str] = self.find_root_datasets(datasets)
4650 len_root_datasets = len(root_datasets)
4651 len_basis_datasets = len(basis_datasets)
4652 i, j = 0, 0
4653 while i < len_root_datasets and j < len_basis_datasets: # walk and "merge" both sorted lists, in sync
4654 if basis_datasets[j] < root_datasets[i]: # irrelevant subtree?
4655 j += 1 # move to the next basis_src_dataset
4656 elif is_descendant(basis_datasets[j], of_root_dataset=root_datasets[i]): # relevant subtree?
4657 if basis_datasets[j] not in datasets_set: # was dataset chopped off by schedule or --incl/exclude-dataset*?
4658 return None # detected filter pruning that is incompatible with 'zfs snapshot -r'
4659 j += 1 # move to the next basis_src_dataset
4660 else:
4661 i += 1 # move to next root dataset; no need to check root_datasets that are nomore (or not yet) reachable
4662 return root_datasets
4664 @staticmethod
4665 def find_root_datasets(sorted_datasets: List[str]) -> List[str]:
4666 """Returns the roots of the subtrees in the (sorted) input datasets. The output root dataset list is sorted, too.
4667 A dataset is a root dataset if it has no parent, i.e. it is not a descendant of any dataset in the input datasets."""
4668 root_datasets = []
4669 skip_dataset = DONT_SKIP_DATASET
4670 for dataset in sorted_datasets:
4671 if is_descendant(dataset, of_root_dataset=skip_dataset):
4672 continue
4673 skip_dataset = dataset
4674 root_datasets.append(dataset)
4675 return root_datasets
4677 def find_datasets_to_snapshot(self, sorted_datasets: List[str]) -> Dict[SnapshotLabel, List[str]]:
4678 """Given a (sorted) list of source datasets, returns a dict where the key is a snapshot name (aka SnapshotLabel, e.g.
4679 bzfs_2024-11-06_08:30:05_hourly) and the value is the (sorted) (sub)list of datasets for which a snapshot needs to
4680 be created with that name, because these datasets are due per the schedule, either because the 'creation' time of
4681 their most recent snapshot with that name pattern is now too old, or such a snapshot does not even exist.
4682 The baseline implementation uses the 'zfs list -t snapshot' CLI to find the most recent snapshots, which is simple
4683 but doesn't scale well with the number of snapshots, at least if the goal is to take snapshots every second.
4684 An alternative, much more scalable, implementation queries the standard ZFS "snapshots_changed" dataset property
4685 (requires zfs >= 2.2.0), in combination with a local cache that stores this property, as well as the creation time
4686 of the most recent snapshot, for each SnapshotLabel and each dataset."""
4687 p, log = self.params, self.params.log
4688 src, config = p.src, p.create_src_snapshots_config
4689 datasets_to_snapshot: Dict[SnapshotLabel, List[str]] = defaultdict(list)
4690 is_caching = False
4691 msgs = []
4693 def create_snapshot_if_latest_is_too_old(
4694 datasets_to_snapshot: Dict[SnapshotLabel, List[str]], dataset: str, label: SnapshotLabel, creation_unixtime: int
4695 ) -> None:
4696 """Schedules creation of a snapshot for the given label if the label's existing latest snapshot is too old."""
4697 creation_dt = datetime.fromtimestamp(creation_unixtime, tz=config.tz)
4698 log.log(log_trace, "Latest snapshot creation: %s for %s", creation_dt, label)
4699 duration_amount, duration_unit = config.suffix_durations[label.suffix]
4700 next_event_dt = round_datetime_up_to_duration_multiple(
4701 creation_dt + timedelta(microseconds=1), duration_amount, duration_unit, config.anchors
4702 )
4703 msg = ""
4704 if config.current_datetime >= next_event_dt:
4705 datasets_to_snapshot[label].append(dataset) # mark it as scheduled for snapshot creation
4706 msg = " has passed"
4707 msgs.append((next_event_dt, dataset, label, msg))
4708 if is_caching and not p.dry_run: # update cache with latest state from 'zfs list -t snapshot'
4709 cache_file = self.last_modified_cache_file(src, dataset, label)
4710 set_last_modification_time_safe(cache_file, unixtime_in_secs=creation_unixtime, if_more_recent=True)
4712 labels = []
4713 config_labels: List[SnapshotLabel] = config.snapshot_labels()
4714 for label in config_labels:
4715 _duration_amount, _duration_unit = config.suffix_durations[label.suffix]
4716 if _duration_amount == 0 or config.create_src_snapshots_even_if_not_due:
4717 datasets_to_snapshot[label] = sorted_datasets # take snapshot regardless of creation time of existing snaps
4718 else:
4719 labels.append(label)
4720 if len(labels) == 0:
4721 return datasets_to_snapshot # nothing more TBD
4723 # satisfy request from local cache as much as possible
4724 cached_datasets_to_snapshot: Dict[SnapshotLabel, List[str]] = defaultdict(list)
4725 if self.is_caching_snapshots(src):
4726 sorted_datasets_todo = []
4727 for dataset in sorted_datasets:
4728 cached_snapshots_changed: int = self.cache_get_snapshots_changed(self.last_modified_cache_file(src, dataset))
4729 if cached_snapshots_changed == 0:
4730 sorted_datasets_todo.append(dataset) # request cannot be answered from cache
4731 continue
4732 if cached_snapshots_changed != self.src_properties[dataset][SNAPSHOTS_CHANGED]: # get that prop "for free"
4733 self.invalidate_last_modified_cache_dataset(dataset)
4734 sorted_datasets_todo.append(dataset) # request cannot be answered from cache
4735 continue
4736 creation_unixtimes = []
4737 for label in labels:
4738 creation_unixtime = self.cache_get_snapshots_changed(self.last_modified_cache_file(src, dataset, label))
4739 if creation_unixtime == 0:
4740 sorted_datasets_todo.append(dataset) # request cannot be answered from cache
4741 break
4742 creation_unixtimes.append(creation_unixtime)
4743 if len(creation_unixtimes) == len(labels):
4744 for j, label in enumerate(labels):
4745 create_snapshot_if_latest_is_too_old(
4746 cached_datasets_to_snapshot, dataset, label, creation_unixtimes[j]
4747 )
4748 sorted_datasets = sorted_datasets_todo
4750 def create_snapshot_fn(i: int, creation_unixtime_secs: int, dataset: str, snapshot: str) -> None:
4751 create_snapshot_if_latest_is_too_old(datasets_to_snapshot, dataset, labels[i], creation_unixtime_secs)
4753 def on_finish_dataset(dataset: str) -> None:
4754 if is_caching and not p.dry_run:
4755 set_last_modification_time_safe(
4756 self.last_modified_cache_file(src, dataset),
4757 unixtime_in_secs=int(self.src_properties[dataset][SNAPSHOTS_CHANGED]),
4758 if_more_recent=True,
4759 )
4761 # fallback to 'zfs list -t snapshot' for any remaining datasets, as these couldn't be satisfied from local cache
4762 is_caching = self.is_caching_snapshots(src)
4763 datasets_without_snapshots = self.handle_minmax_snapshots(
4764 src, sorted_datasets, labels, fn_latest=create_snapshot_fn, fn_on_finish_dataset=on_finish_dataset
4765 )
4766 for lbl in labels: # merge (sorted) results from local cache + 'zfs list -t snapshot' into (sorted) combined result
4767 datasets_to_snapshot[lbl].sort()
4768 if datasets_without_snapshots or (lbl in cached_datasets_to_snapshot): # +take snaps for snapshot-less datasets
4769 datasets_to_snapshot[lbl] = list( # inputs to merge() are sorted, and outputs are sorted too
4770 heapq.merge(datasets_to_snapshot[lbl], cached_datasets_to_snapshot[lbl], datasets_without_snapshots)
4771 )
4772 msgs.sort()
4773 prefx = "Next scheduled snapshot time: "
4774 text = "\n".join(f"{prefx}{next_event_dt} for {dataset}@{label}{msg}" for next_event_dt, dataset, label, msg in msgs)
4775 if len(text) > 0:
4776 log.info("Next scheduled snapshot times ...\n%s", text)
4777 # sort to ensure that we take snapshots for dailies before hourlies, and so on
4778 label_indexes = {label: k for k, label in enumerate(config_labels)}
4779 datasets_to_snapshot = dict(sorted(datasets_to_snapshot.items(), key=lambda kv: label_indexes[kv[0]]))
4780 return datasets_to_snapshot
4782 def handle_minmax_snapshots(
4783 self,
4784 remote: Remote,
4785 sorted_datasets: List[str],
4786 labels: List[SnapshotLabel],
4787 fn_latest: Callable[[int, int, str, str], None], # callback function for latest snapshot
4788 fn_oldest: Optional[Callable[[int, int, str, str], None]] = None, # callback function for oldest snapshot
4789 fn_on_finish_dataset: Callable[[str], None] = lambda dataset: None,
4790 ) -> List[str]: # thread-safe
4791 """For each dataset in `sorted_datasets`, for each label in `labels`, finds the latest and oldest snapshot, and runs
4792 the callback functions on them. Ignores the timestamp of the input labels and the timestamp of the snapshot names."""
4793 p = self.params
4794 cmd = p.split_args(f"{p.zfs_program} list -t snapshot -d 1 -Hp -o createtxg,creation,name") # sort dataset,createtxg
4795 datasets_with_snapshots: Set[str] = set()
4796 for lines in self.zfs_list_snapshots_in_parallel(remote, cmd, sorted_datasets, ordered=False):
4797 # streaming group by dataset name (consumes constant memory only)
4798 for dataset, group in itertools.groupby(lines, key=lambda line: line[line.rindex("\t") + 1 : line.index("@")]):
4799 snapshots = sorted( # fetch all snapshots of current dataset and sort by createtxg,creation,name
4800 (int(createtxg), int(creation_unixtime_secs), name[name.index("@") + 1 :])
4801 for createtxg, creation_unixtime_secs, name in (line.split("\t", 2) for line in group)
4802 )
4803 assert len(snapshots) > 0
4804 datasets_with_snapshots.add(dataset)
4805 snapshot_names = [snapshot[-1] for snapshot in snapshots]
4806 year_with_4_digits_regex = year_with_four_digits_regex
4807 fns = ((fn_latest, True),) if fn_oldest is None else ((fn_latest, True), (fn_oldest, False))
4808 for i, label in enumerate(labels):
4809 infix = label.infix
4810 start = label.prefix + infix
4811 end = label.suffix
4812 startlen = len(start)
4813 endlen = len(end)
4814 minlen = startlen + endlen if infix else 4 + startlen + endlen # year_with_four_digits_regex
4815 year_slice = slice(startlen, startlen + 4) # [startlen:startlen+4] # year_with_four_digits_regex
4816 for fn, is_reverse in fns:
4817 creation_unixtime_secs: int = 0 # find creation time of latest or oldest snapshot matching the label
4818 minmax_snapshot = ""
4819 for j, s in enumerate(reversed(snapshot_names) if is_reverse else snapshot_names):
4820 if (
4821 s.endswith(end)
4822 and s.startswith(start)
4823 and len(s) >= minlen
4824 and (infix or year_with_4_digits_regex.fullmatch(s[year_slice]))
4825 ):
4826 k = len(snapshots) - j - 1 if is_reverse else j
4827 creation_unixtime_secs = snapshots[k][1]
4828 minmax_snapshot = s
4829 break
4830 fn(i, creation_unixtime_secs, dataset, minmax_snapshot)
4831 fn_on_finish_dataset(dataset)
4833 datasets_without_snapshots = [dataset for dataset in sorted_datasets if dataset not in datasets_with_snapshots]
4834 return datasets_without_snapshots
4836 def cache_get_snapshots_changed(self, path: str) -> int:
4837 return self.cache_get_snapshots_changed2(path)[1]
4839 @staticmethod
4840 def cache_get_snapshots_changed2(path: str) -> Tuple[int, int]:
4841 """Like zfs_get_snapshots_changed() but reads from local cache."""
4842 try: # perf: inode metadata reads and writes are fast - ballpark O(200k) ops/sec.
4843 s = os_stat(path)
4844 return round(s.st_atime), round(s.st_mtime)
4845 except FileNotFoundError:
4846 return 0, 0 # harmless
4848 def last_modified_cache_file(self, remote: Remote, dataset: str, label: Optional[SnapshotLabel] = None) -> str:
4849 cache_file = "=" if label is None else f"{label.prefix}{label.infix}{label.suffix}"
4850 userhost_dir = remote.ssh_user_host if remote.ssh_user_host else "-"
4851 return os_path_join(self.params.log_params.last_modified_cache_dir, userhost_dir, dataset, cache_file)
4853 def invalidate_last_modified_cache_dataset(self, dataset: str) -> None:
4854 """Resets the last_modified timestamp of all cache files of the given dataset to zero."""
4855 p = self.params
4856 cache_file = self.last_modified_cache_file(p.src, dataset)
4857 if not p.dry_run:
4858 try:
4859 zero_times = (0, 0)
4860 for entry in os.scandir(os.path.dirname(cache_file)):
4861 os_utime(entry.path, times=zero_times)
4862 os_utime(cache_file, times=zero_times)
4863 except FileNotFoundError:
4864 pass # harmless
4866 def update_last_modified_cache(self, datasets_to_snapshot: Dict[SnapshotLabel, List[str]]) -> None:
4867 """perf: copy lastmodified time of source dataset into local cache to reduce future 'zfs list -t snapshot' calls."""
4868 p = self.params
4869 src = p.src
4870 if not self.is_caching_snapshots(src):
4871 return
4872 src_datasets_set: Set[str] = set()
4873 dataset_labels: Dict[str, List[SnapshotLabel]] = defaultdict(list)
4874 for label, datasets in datasets_to_snapshot.items():
4875 src_datasets_set.update(datasets) # union
4876 for dataset in datasets:
4877 dataset_labels[dataset].append(label)
4879 sorted_datasets = sorted(src_datasets_set)
4880 snapshots_changed_dict = self.zfs_get_snapshots_changed(src, sorted_datasets)
4881 for src_dataset in sorted_datasets:
4882 snapshots_changed = snapshots_changed_dict.get(src_dataset, 0)
4883 self.src_properties[src_dataset][SNAPSHOTS_CHANGED] = snapshots_changed
4884 if snapshots_changed == 0:
4885 self.invalidate_last_modified_cache_dataset(src_dataset)
4886 else:
4887 cache_file = self.last_modified_cache_file(src, src_dataset)
4888 cache_dir = os.path.dirname(cache_file)
4889 if not p.dry_run:
4890 try:
4891 os.makedirs(cache_dir, exist_ok=True)
4892 set_last_modification_time(cache_file, unixtime_in_secs=snapshots_changed, if_more_recent=True)
4893 for label in dataset_labels[src_dataset]:
4894 cache_file = self.last_modified_cache_file(src, src_dataset, label)
4895 set_last_modification_time(cache_file, unixtime_in_secs=snapshots_changed, if_more_recent=True)
4896 except FileNotFoundError:
4897 pass # harmless
4899 def zfs_get_snapshots_changed(self, remote: Remote, datasets: List[str]) -> Dict[str, int]:
4900 """Returns the ZFS dataset property "snapshots_changed", which is a UTC Unix time in integer seconds.
4901 See https://openzfs.github.io/openzfs-docs/man/7/zfsprops.7.html#snapshots_changed"""
4903 def try_zfs_list_command(_cmd: List[str], batch: List[str]) -> List[str]:
4904 try:
4905 return self.run_ssh_command(remote, print_stderr=False, cmd=_cmd + batch).splitlines()
4906 except CalledProcessError as e:
4907 return stderr_to_str(e.stdout).splitlines()
4908 except UnicodeDecodeError:
4909 return []
4911 p = self.params
4912 cmd = p.split_args(f"{p.zfs_program} list -t filesystem,volume -s name -Hp -o snapshots_changed,name")
4913 results = {}
4914 for lines in self.itr_ssh_cmd_parallel(
4915 remote, [(cmd, datasets)], lambda _cmd, batch: try_zfs_list_command(_cmd, batch), ordered=False
4916 ):
4917 for line in lines:
4918 if "\t" not in line:
4919 break # partial output from failing 'zfs list' command
4920 snapshots_changed, dataset = line.split("\t", 1)
4921 if not dataset:
4922 break # partial output from failing 'zfs list' command
4923 if snapshots_changed == "-" or not snapshots_changed:
4924 snapshots_changed = "0"
4925 results[dataset] = int(snapshots_changed)
4926 return results
4928 @dataclass(order=True, frozen=True)
4929 class ComparableSnapshot:
4930 key: Tuple[str, str] # rel_dataset, guid
4931 cols: List[str] = field(compare=False)
4933 def run_compare_snapshot_lists(self, src_datasets: List[str], dst_datasets: List[str]) -> None:
4934 """Compares source and destination dataset trees recursively wrt. snapshots, for example to check if all recently
4935 taken snapshots have been successfully replicated by a periodic job. Lists snapshots only contained in source
4936 (tagged with 'src'), only contained in destination (tagged with 'dst'), and contained in both source and destination
4937 (tagged with 'all'), in the form of a TSV file, along with other snapshot metadata. Implemented with a time and
4938 space efficient streaming algorithm; easily scales to millions of datasets and any number of snapshots.
4939 Assumes that both src_datasets and dst_datasets are sorted."""
4940 p, log = self.params, self.params.log
4941 src, dst = p.src, p.dst
4942 task = src.root_dataset + " vs. " + dst.root_dataset
4943 tsv_dir = p.log_params.log_file[0 : -len(".log")] + ".cmp"
4944 os.makedirs(tsv_dir, exist_ok=True)
4945 tsv_file = os.path.join(tsv_dir, (src.root_dataset + "%" + dst.root_dataset).replace("/", "~") + ".tsv")
4946 tmp_tsv_file = tsv_file + ".tmp"
4947 compare_snapshot_lists = set(p.compare_snapshot_lists.split("+"))
4948 is_src_dst_all = all(choice in compare_snapshot_lists for choice in cmp_choices_items)
4949 all_src_dst = [loc for loc in ("all", "src", "dst") if loc in compare_snapshot_lists]
4950 is_first_row = True
4951 now = None
4953 def zfs_list_snapshot_iterator(r: Remote, sorted_datasets: List[str]) -> Generator[str, None, None]:
4954 """Lists snapshots sorted by dataset name. All snapshots of a given dataset will be adjacent."""
4955 assert not self.is_test_mode or sorted_datasets == sorted(sorted_datasets), "List is not sorted"
4956 written_zfs_prop = "written" # https://openzfs.github.io/openzfs-docs/man/master/7/zfsprops.7.html#written
4957 if self.is_solaris_zfs(r): # solaris-11.4 zfs does not know the "written" ZFS snapshot property
4958 written_zfs_prop = "type" # for simplicity, fill in the non-integer dummy constant type="snapshot"
4959 props = self.creation_prefix + f"creation,guid,createtxg,{written_zfs_prop},name"
4960 types = "snapshot"
4961 if p.use_bookmark and r.location == "src" and self.are_bookmarks_enabled(r):
4962 types = "snapshot,bookmark" # output list ordering: intentionally makes bookmarks appear *after* snapshots
4963 cmd = p.split_args(f"{p.zfs_program} list -t {types} -d 1 -Hp -o {props}") # sorted by dataset, createtxg
4964 for lines in self.zfs_list_snapshots_in_parallel(r, cmd, sorted_datasets):
4965 yield from lines
4967 def snapshot_iterator(
4968 root_dataset: str, sorted_itr: Generator[str, None, None]
4969 ) -> Generator[Job.ComparableSnapshot, None, None]:
4970 """Splits/groups snapshot stream into distinct datasets, sorts by GUID within a dataset such that any two
4971 snapshots with the same GUID will lie adjacent to each other during the upcoming phase that merges
4972 src snapshots and dst snapshots."""
4973 # streaming group by dataset name (consumes constant memory only)
4974 for dataset, group in itertools.groupby(
4975 sorted_itr, key=lambda line: line[line.rindex("\t") + 1 : line.replace("#", "@").index("@")]
4976 ):
4977 snapshots = list(group) # fetch all snapshots of current dataset, e.g. dataset=tank1/src/foo
4978 snapshots = self.filter_snapshots(snapshots) # apply include/exclude policy
4979 snapshots.sort(key=lambda line: line.split("\t", 2)[1]) # stable sort by GUID (2nd remains createtxg)
4980 rel_dataset = relativize_dataset(dataset, root_dataset) # rel_dataset=/foo, root_dataset=tank1/src
4981 last_guid = ""
4982 for line in snapshots:
4983 cols = line.split("\t")
4984 creation, guid, createtxg, written, snapshot_name = cols
4985 if guid == last_guid:
4986 assert "#" in snapshot_name
4987 continue # ignore bookmarks whose snapshot still exists. also ignore dupes of bookmarks
4988 last_guid = guid
4989 if written == "snapshot":
4990 written = "-" # sanitize solaris-11.4 work-around (solaris-11.4 also has no bookmark feature)
4991 cols = [creation, guid, createtxg, written, snapshot_name]
4992 key = (rel_dataset, guid) # ensures src snapshots and dst snapshots with the same GUID will be adjacent
4993 yield Job.ComparableSnapshot(key, cols)
4995 def print_dataset(rel_dataset: str, entries: Iterable[Tuple[str, Job.ComparableSnapshot]]) -> None:
4996 entries = sorted( # fetch all snapshots of current dataset and sort em by creation, createtxg, snapshot_tag
4997 entries,
4998 key=lambda entry: (
4999 int((cols := entry[1].cols)[0]),
5000 int(cols[2]),
5001 (snapshot_name := cols[-1])[snapshot_name.replace("#", "@").index("@") + 1 :],
5002 ),
5003 )
5005 @dataclass
5006 class SnapshotStats:
5007 snapshot_count: int = field(default=0)
5008 sum_written: int = field(default=0)
5009 snapshot_count_since: int = field(default=0)
5010 sum_written_since: int = field(default=0)
5011 latest_snapshot_idx: Optional[int] = field(default=None)
5012 latest_snapshot_row_str: Optional[str] = field(default=None)
5013 latest_snapshot_creation: Optional[str] = field(default=None)
5014 oldest_snapshot_row_str: Optional[str] = field(default=None)
5015 oldest_snapshot_creation: Optional[str] = field(default=None)
5017 # print metadata of snapshots of current dataset to TSV file; custom stats can later be computed from there
5018 stats: DefaultDict[str, SnapshotStats] = defaultdict(SnapshotStats)
5019 header = "location creation_iso createtxg rel_name guid root_dataset rel_dataset name creation written"
5020 nonlocal is_first_row
5021 if is_first_row:
5022 fd.write(header.replace(" ", "\t") + "\n")
5023 is_first_row = False
5024 for i, entry in enumerate(entries):
5025 loc = location = entry[0]
5026 creation, guid, createtxg, written, name = entry[1].cols
5027 root_dataset = dst.root_dataset if location == cmp_choices_items[1] else src.root_dataset
5028 rel_name = relativize_dataset(name, root_dataset)
5029 creation_iso = isotime_from_unixtime(int(creation))
5030 row = loc, creation_iso, createtxg, rel_name, guid, root_dataset, rel_dataset, name, creation, written
5031 # Example: src 2024-11-06_08:30:05 17435050 /foo@test_2024-11-06_08:30:05_daily 2406491805272097867 tank1/src /foo tank1/src/foo@test_2024-10-06_08:30:04_daily 1730878205 24576 # noqa: E501
5032 row_str = "\t".join(row)
5033 if not p.dry_run:
5034 fd.write(row_str + "\n")
5035 s = stats[location]
5036 s.snapshot_count += 1
5037 s.sum_written += int(written) if written != "-" else 0
5038 s.latest_snapshot_idx = i
5039 s.latest_snapshot_row_str = row_str
5040 s.latest_snapshot_creation = creation
5041 if not s.oldest_snapshot_row_str:
5042 s.oldest_snapshot_row_str = row_str
5043 s.oldest_snapshot_creation = creation
5045 # for convenience, directly log basic summary stats of current dataset
5046 k = stats["all"].latest_snapshot_idx # defaults to None
5047 k = k if k is not None else -1
5048 for entry in entries[k + 1 :]: # aggregate basic stats since latest common snapshot
5049 location = entry[0]
5050 creation, guid, createtxg, written, name = entry[1].cols
5051 s = stats[location]
5052 s.snapshot_count_since += 1
5053 s.sum_written_since += int(written) if written != "-" else 0
5054 prefix = f"Comparing {rel_dataset}~"
5055 msgs = []
5056 msgs.append(f"{prefix} of {task}")
5057 msgs.append(
5058 f"{prefix} Q: No src snapshots are missing on dst, and no dst snapshots are missing on src, "
5059 "and there is a common snapshot? A: "
5060 + (
5061 "n/a"
5062 if not is_src_dst_all
5063 else str(
5064 stats["src"].snapshot_count == 0
5065 and stats["dst"].snapshot_count == 0
5066 and stats["all"].snapshot_count > 0
5067 )
5068 )
5069 )
5070 nonlocal now
5071 now = now or round(time.time()) # uses the same timestamp across the entire dataset tree
5072 latcom = "latest common snapshot"
5073 for loc in all_src_dst:
5074 s = stats[loc]
5075 msgs.append(f"{prefix} Latest snapshot only in {loc}: {s.latest_snapshot_row_str or 'n/a'}")
5076 msgs.append(f"{prefix} Oldest snapshot only in {loc}: {s.oldest_snapshot_row_str or 'n/a'}")
5077 msgs.append(f"{prefix} Snapshots only in {loc}: {s.snapshot_count}")
5078 msgs.append(f"{prefix} Snapshot data written only in {loc}: {human_readable_bytes(s.sum_written)}")
5079 if loc != "all":
5080 na = None if k >= 0 else "n/a"
5081 msgs.append(f"{prefix} Snapshots only in {loc} since {latcom}: {na or s.snapshot_count_since}")
5082 msgs.append(
5083 f"{prefix} Snapshot data written only in {loc} since {latcom}: "
5084 f"{na or human_readable_bytes(s.sum_written_since)}"
5085 )
5086 all_creation = stats["all"].latest_snapshot_creation
5087 latest = ("latest", s.latest_snapshot_creation)
5088 oldest = ("oldest", s.oldest_snapshot_creation)
5089 for label, s_creation in latest, oldest:
5090 if loc != "all":
5091 hd = "n/a"
5092 if s_creation and k >= 0:
5093 assert all_creation is not None
5094 hd = human_readable_duration(int(all_creation) - int(s_creation), unit="s")
5095 msgs.append(f"{prefix} Time diff between {latcom} and {label} snapshot only in {loc}: {hd}")
5096 for label, s_creation in latest, oldest:
5097 hd = "n/a" if not s_creation else human_readable_duration(now - int(s_creation), unit="s")
5098 msgs.append(f"{prefix} Time diff between now and {label} snapshot only in {loc}: {hd}")
5099 log.info("%s", "\n".join(msgs))
5101 # setup streaming pipeline
5102 src_snap_itr = snapshot_iterator(src.root_dataset, zfs_list_snapshot_iterator(src, src_datasets))
5103 dst_snap_itr = snapshot_iterator(dst.root_dataset, zfs_list_snapshot_iterator(dst, dst_datasets))
5104 merge_itr = self.merge_sorted_iterators(cmp_choices_items, p.compare_snapshot_lists, src_snap_itr, dst_snap_itr)
5106 rel_datasets: Dict[str, Set[str]] = defaultdict(set)
5107 for datasets, remote in (src_datasets, src), (dst_datasets, dst):
5108 for dataset in datasets: # rel_dataset=/foo, root_dataset=tank1/src
5109 rel_datasets[remote.location].add(relativize_dataset(dataset, remote.root_dataset))
5110 rel_src_or_dst: List[str] = sorted(rel_datasets["src"].union(rel_datasets["dst"]))
5112 log.debug("%s", f"Temporary TSV output file comparing {task} is: {tmp_tsv_file}")
5113 with open(tmp_tsv_file, "w", encoding="utf-8") as fd:
5114 # streaming group by rel_dataset (consumes constant memory only); entry is a Tuple[str, ComparableSnapshot]
5115 group = itertools.groupby(merge_itr, key=lambda entry: entry[1].key[0])
5116 self.print_datasets(group, lambda rel_ds, entries: print_dataset(rel_ds, entries), rel_src_or_dst)
5117 os.rename(tmp_tsv_file, tsv_file)
5118 log.info("%s", f"Final TSV output file comparing {task} is: {tsv_file}")
5120 tsv_file = tsv_file[0 : tsv_file.rindex(".")] + ".rel_datasets_tsv"
5121 tmp_tsv_file = tsv_file + ".tmp"
5122 with open(tmp_tsv_file, "w", encoding="utf-8") as fd:
5123 header = "location rel_dataset src_dataset dst_dataset"
5124 fd.write(header.replace(" ", "\t") + "\n")
5125 src_only: Set[str] = rel_datasets["src"].difference(rel_datasets["dst"])
5126 dst_only: Set[str] = rel_datasets["dst"].difference(rel_datasets["src"])
5127 for rel_dataset in rel_src_or_dst:
5128 loc = "src" if rel_dataset in src_only else "dst" if rel_dataset in dst_only else "all"
5129 src_dataset = src.root_dataset + rel_dataset if rel_dataset not in dst_only else ""
5130 dst_dataset = dst.root_dataset + rel_dataset if rel_dataset not in src_only else ""
5131 row = loc, rel_dataset, src_dataset, dst_dataset # Example: all /foo/bar tank1/src/foo/bar tank2/dst/foo/bar
5132 if not p.dry_run:
5133 fd.write("\t".join(row) + "\n")
5134 os.rename(tmp_tsv_file, tsv_file)
5136 @staticmethod
5137 def print_datasets(group: itertools.groupby, fn: Callable[[str, Iterable], None], rel_datasets: Iterable[str]) -> None:
5138 rel_datasets = sorted(rel_datasets)
5139 n = len(rel_datasets)
5140 i = 0
5141 for rel_dataset, entries in group:
5142 while i < n and rel_datasets[i] < rel_dataset:
5143 fn(rel_datasets[i], []) # Also print summary stats for datasets whose snapshot stream is empty
5144 i += 1
5145 assert i >= n or rel_datasets[i] == rel_dataset
5146 i += 1
5147 fn(rel_dataset, entries)
5148 while i < n:
5149 fn(rel_datasets[i], []) # Also print summary stats for datasets whose snapshot stream is empty
5150 i += 1
5152 def merge_sorted_iterators(
5153 self,
5154 choices: Sequence[str], # ["src", "dst", "all"]
5155 choice: str, # Example: "src+dst+all"
5156 src_itr: Iterator,
5157 dst_itr: Iterator,
5158 ) -> Generator[Tuple[Any, ...], None, None]:
5159 """This is the typical merge algorithm of a merge sort, slightly adapted to our specific use case."""
5160 assert len(choices) == 3
5161 assert choice
5162 flags = 0
5163 for i, item in enumerate(choices):
5164 if item in choice:
5165 flags |= 1 << i
5166 src_next, dst_next = self.run_in_parallel(lambda: next(src_itr, None), lambda: next(dst_itr, None))
5167 while not (src_next is None and dst_next is None):
5168 if src_next == dst_next:
5169 n = 2
5170 if (flags & (1 << n)) != 0:
5171 yield choices[n], src_next, dst_next
5172 src_next = next(src_itr, None)
5173 dst_next = next(dst_itr, None)
5174 elif src_next is None or (dst_next is not None and dst_next < src_next):
5175 n = 1
5176 if (flags & (1 << n)) != 0:
5177 yield choices[n], dst_next
5178 dst_next = next(dst_itr, None)
5179 else:
5180 n = 0
5181 if (flags & (1 << n)) != 0:
5182 if isinstance(src_next, Job.ComparableSnapshot):
5183 name = src_next.cols[-1]
5184 if "@" in name:
5185 yield choices[n], src_next # include snapshot
5186 else: # ignore src bookmarks for which no snapshot exists in dst; those aren't useful
5187 assert "#" in name
5188 else:
5189 yield choices[n], src_next
5190 src_next = next(src_itr, None)
5192 @staticmethod
5193 def build_dataset_tree(sorted_datasets: List[str]) -> Tree:
5194 """Takes as input a sorted list of datasets and returns a sorted directory tree containing the same dataset names,
5195 in the form of nested dicts."""
5196 tree: Tree = {}
5197 for dataset in sorted_datasets:
5198 current = tree
5199 for component in dataset.split("/"):
5200 child = current.get(component)
5201 if child is None:
5202 child = {}
5203 current[component] = child
5204 current = child
5205 return tree
5207 def process_datasets_in_parallel_and_fault_tolerant(
5208 self,
5209 datasets: List[str],
5210 process_dataset: Callable[[str, str, Retry], bool], # lambda, must be thread-safe
5211 skip_tree_on_error: Callable[[str], bool],
5212 max_workers: int = os.cpu_count() or 1,
5213 interval_nanos: Callable[[str], int] = lambda dataset: 0,
5214 task_name: str = "Task",
5215 enable_barriers: Optional[bool] = None, # for testing only; None means 'auto-detect'
5216 ) -> bool:
5217 """Runs process_dataset(dataset) for each dataset in datasets, while taking care of error handling and retries and
5218 parallel execution. Assumes that the input dataset list is sorted and does not contain duplicates. All children of a
5219 dataset may be processed in parallel. For consistency (even during parallel dataset replication/deletion), processing
5220 of a dataset only starts after processing of all its ancestor datasets has completed. Further, when a thread is ready
5221 to start processing another dataset, it chooses the "smallest" dataset wrt. lexicographical sort order from the
5222 datasets that are currently available for start of processing. Initially, only the roots of the selected dataset
5223 subtrees are available for start of processing."""
5224 assert not self.is_test_mode or datasets == sorted(datasets), "List is not sorted"
5225 assert not self.is_test_mode or not has_duplicates(datasets), "List contains duplicates"
5226 assert callable(process_dataset)
5227 assert callable(skip_tree_on_error)
5228 assert max_workers > 0
5229 assert callable(interval_nanos)
5230 assert "%" not in task_name
5231 has_barrier = any(BARRIER_CHAR in dataset.split("/") for dataset in datasets)
5232 assert (enable_barriers is not False) or not has_barrier
5233 barriers_enabled: bool = bool(has_barrier or enable_barriers)
5234 p, log = self.params, self.params.log
5236 def _process_dataset(dataset: str, tid: str) -> bool:
5237 start_time_nanos = time.monotonic_ns()
5238 try:
5239 return self.run_with_retries(p.retry_policy, process_dataset, dataset, tid)
5240 finally:
5241 elapsed_nanos = time.monotonic_ns() - start_time_nanos
5242 log.debug(p.dry(f"{tid} {task_name} done: %s took %s"), dataset, human_readable_duration(elapsed_nanos))
5244 class TreeNodeMutableAttributes:
5245 __slots__ = ("pending", "barrier") # uses more compact memory layout than __dict__
5247 def __init__(self) -> None:
5248 self.pending: int = 0 # number of children added to priority queue that haven't completed their work yet
5249 self.barrier: Optional[TreeNode] = None # zero or one barrier TreeNode waiting for this node to complete
5251 class TreeNode(NamedTuple):
5252 # TreeNodes are ordered by dataset name within a priority queue via __lt__ comparisons.
5253 dataset: str # Each dataset name is unique, thus attributes other than `dataset` are never used for comparisons
5254 children: Tree # dataset "directory" tree consists of nested dicts; aka Dict[str, Dict]
5255 parent: Any # aka TreeNode
5256 mut: TreeNodeMutableAttributes
5258 def __repr__(self) -> str:
5259 dataset, pending, barrier, nchildren = self.dataset, self.mut.pending, self.mut.barrier, len(self.children)
5260 return str({"dataset": dataset, "pending": pending, "barrier": barrier is not None, "nchildren": nchildren})
5262 def make_tree_node(dataset: str, children: Tree, parent: Optional[TreeNode] = None) -> TreeNode:
5263 return TreeNode(dataset, children, parent, TreeNodeMutableAttributes())
5265 def build_dataset_tree_and_find_roots() -> List[TreeNode]:
5266 """For consistency, processing of a dataset only starts after processing of its ancestors has completed."""
5267 tree: Tree = self.build_dataset_tree(datasets) # tree consists of nested dictionaries
5268 skip_dataset = DONT_SKIP_DATASET
5269 roots = []
5270 for dataset in datasets:
5271 if is_descendant(dataset, of_root_dataset=skip_dataset):
5272 continue
5273 skip_dataset = dataset
5274 children = tree
5275 for component in dataset.split("/"):
5276 children = children[component]
5277 roots.append(make_tree_node(dataset, children))
5278 return roots
5280 assert (not self.is_test_mode) or str(make_tree_node("foo", {}))
5281 immutable_empty_barrier: TreeNode = make_tree_node("immutable_empty_barrier", {})
5282 priority_queue: List[TreeNode] = build_dataset_tree_and_find_roots()
5283 heapq.heapify(priority_queue) # same order as sorted()
5284 len_datasets: int = len(datasets)
5285 datasets_set: Set[str] = set(datasets)
5286 with ThreadPoolExecutor(max_workers=max_workers) as executor:
5287 todo_futures: Set[Future] = set()
5288 submitted: int = 0
5289 next_update_nanos: int = time.monotonic_ns()
5290 fw_timeout: Optional[float] = None
5292 def submit_datasets() -> bool:
5293 nonlocal fw_timeout
5294 fw_timeout = None # indicates to use blocking flavor of concurrent.futures.wait()
5295 while len(priority_queue) > 0 and len(todo_futures) < max_workers:
5296 # pick "smallest" dataset (wrt. sort order) available for start of processing; submit to thread pool
5297 nonlocal next_update_nanos
5298 sleep_nanos = next_update_nanos - time.monotonic_ns()
5299 if sleep_nanos > 0:
5300 time.sleep(sleep_nanos / 1_000_000_000) # seconds
5301 if sleep_nanos > 0 and len(todo_futures) > 0:
5302 fw_timeout = 0 # indicates to use non-blocking flavor of concurrent.futures.wait()
5303 # It's possible an even "smaller" dataset (wrt. sort order) has become available while we slept.
5304 # If so it's preferable to submit to the thread pool the smaller one first.
5305 break # break out of loop to check if that's the case via non-blocking concurrent.futures.wait()
5306 node: TreeNode = heapq.heappop(priority_queue)
5307 next_update_nanos += max(0, interval_nanos(node.dataset))
5308 nonlocal submitted
5309 submitted += 1
5310 future = executor.submit(_process_dataset, node.dataset, tid=f"{submitted}/{len_datasets}")
5311 future.node = node # type: ignore[attr-defined]
5312 todo_futures.add(future)
5313 return len(todo_futures) > 0
5315 # coordination loop; runs in the (single) main thread; submits tasks to worker threads and handles their results
5316 failed = False
5317 while submit_datasets():
5318 done_futures, todo_futures = concurrent.futures.wait(todo_futures, fw_timeout, return_when=FIRST_COMPLETED)
5319 for done_future in done_futures:
5320 dataset = done_future.node.dataset # type: ignore[attr-defined]
5321 try:
5322 no_skip: bool = done_future.result() # does not block as processing has already completed
5323 except (CalledProcessError, subprocess.TimeoutExpired, SystemExit, UnicodeDecodeError) as e:
5324 failed = True
5325 if p.skip_on_error == "fail":
5326 [todo_future.cancel() for todo_future in todo_futures]
5327 terminate_process_subtree(except_current_process=True)
5328 raise e
5329 no_skip = not (p.skip_on_error == "tree" or skip_tree_on_error(dataset))
5330 log.error("%s", e)
5331 self.append_exception(e, task_name, dataset)
5333 if not barriers_enabled:
5334 # This simple algorithm is sufficient for almost all use cases:
5335 def simple_enqueue_children(node: TreeNode) -> None:
5336 for child, grandchildren in node.children.items(): # as processing of parent has now completed
5337 child_node = make_tree_node(f"{node.dataset}/{child}", grandchildren)
5338 if child_node.dataset in datasets_set:
5339 heapq.heappush(priority_queue, child_node) # make it available for start of processing
5340 else: # it's an intermediate node that has no job attached; pass the enqueue operation
5341 simple_enqueue_children(child_node) # ... recursively down the tree
5343 if no_skip:
5344 simple_enqueue_children(done_future.node) # type: ignore[attr-defined]
5345 else:
5346 # The (more complex) algorithm below is for more general job scheduling, as in bzfs_jobrunner.
5347 # Here, a "dataset" string is treated as an identifier for any kind of job rather than a reference
5348 # to a concrete ZFS object. Example "dataset" job string: "src_host1/createsnapshot/push/prune".
5349 # Jobs can depend on another job via a parent/child relationship formed by '/' directory separators
5350 # within the dataset string, and multiple "datasets" form a job dependency tree by way of common
5351 # dataset directory prefixes. Jobs that do not depend on each other can be executed in parallel, and
5352 # jobs can be told to first wait for other jobs to complete successfully. The algorithm is based on
5353 # a barrier primitive and is typically disabled; it is only required for rare jobrunner configs. For
5354 # example, a job scheduler can specify that all parallel push replications jobs to multiple
5355 # destinations must succeed before the jobs of the pruning phase can start. More generally, with
5356 # this algo, a job scheduler can specify that all jobs within a given job subtree (containing any
5357 # nested combination of sequential and/or parallel jobs) must successfully complete before a certain
5358 # other job within the job tree is started. This is specified via the barrier directory named "~".
5359 # Example: "src_host1/createsnapshot/~/prune".
5360 # Note that "~" is unambiguous as it is not a valid ZFS dataset name component per the naming rules
5361 # enforced by the 'zfs create', 'zfs snapshot' and 'zfs bookmark' CLIs.
5362 def enqueue_children(node: TreeNode) -> int:
5363 """Returns number of jobs that were added to priority_queue for immediate start of processing."""
5364 n = 0
5365 children = node.children
5366 for child, grandchildren in children.items():
5367 child_node = make_tree_node(f"{node.dataset}/{child}", grandchildren, parent=node)
5368 if child != BARRIER_CHAR:
5369 if child_node.dataset in datasets_set:
5370 # it's not a barrier; make job available for immediate start of processing
5371 heapq.heappush(priority_queue, child_node)
5372 k = 1
5373 else: # it's an intermediate node that has no job attached; pass the enqueue operation
5374 k = enqueue_children(child_node) # ... recursively down the tree
5375 elif len(children) == 1: # if the only child is a barrier then pass the enqueue operation
5376 k = enqueue_children(child_node) # ... recursively down the tree
5377 else: # park the barrier node within the (still closed) barrier for the time being
5378 assert node.mut.barrier is None
5379 node.mut.barrier = child_node
5380 k = 0
5381 node.mut.pending += min(1, k)
5382 n += k
5383 assert n >= 0
5384 return n
5386 def on_job_completion_with_barriers(node: TreeNode, no_skip: bool) -> None:
5387 if no_skip:
5388 enqueue_children(node) # make child datasets available for start of processing
5389 else: # job completed without success
5390 tmp = node # ... thus, opening the barrier shall always do nothing in node and its ancestors
5391 while tmp is not None:
5392 tmp.mut.barrier = immutable_empty_barrier
5393 tmp = tmp.parent
5394 assert node.mut.pending >= 0
5395 while node.mut.pending == 0: # have all jobs in subtree of current node completed?
5396 if no_skip: # ... if so open the barrier, if it exists, and enqueue jobs waiting on it
5397 if not (node.mut.barrier is None or node.mut.barrier is immutable_empty_barrier):
5398 node.mut.pending += min(1, enqueue_children(node.mut.barrier))
5399 node.mut.barrier = immutable_empty_barrier
5400 if node.mut.pending > 0: # did opening of barrier cause jobs to be enqueued in subtree?
5401 break # ... if so we aren't quite done yet with this subtree
5402 if node.parent is None:
5403 break # we've reached the root node
5404 node = node.parent # recurse up the tree to propagate completion upward
5405 node.mut.pending -= 1 # mark subtree as completed
5406 assert node.mut.pending >= 0
5408 assert barriers_enabled
5409 on_job_completion_with_barriers(done_future.node, no_skip) # type: ignore[attr-defined]
5410 # endwhile submit_datasets()
5411 assert len(priority_queue) == 0
5412 assert len(todo_futures) == 0
5413 return failed
5415 def is_program_available(self, program: str, location: str) -> bool:
5416 return program in self.params.available_programs.get(location, {})
5418 def detect_available_programs(self) -> None:
5419 p = params = self.params
5420 log = p.log
5421 available_programs = params.available_programs
5422 if "local" not in available_programs:
5423 cmd = [p.shell_program_local, "-c", self.find_available_programs()]
5424 available_programs["local"] = {
5425 prog: ""
5426 for prog in subprocess.run(cmd, stdin=DEVNULL, stdout=PIPE, stderr=sys.stderr, text=True).stdout.splitlines()
5427 }
5428 cmd = [p.shell_program_local, "-c", "exit"]
5429 if subprocess.run(cmd, stdin=DEVNULL, stdout=PIPE, stderr=sys.stderr, text=True).returncode != 0: 5429 ↛ 5430line 5429 didn't jump to line 5430 because the condition on line 5429 was never true
5430 self.disable_program("sh", ["local"])
5432 for r in [p.dst, p.src]:
5433 loc = r.location
5434 remote_conf_cache_key = r.cache_key()
5435 cache_item: Optional[RemoteConfCacheItem] = self.remote_conf_cache.get(remote_conf_cache_key)
5436 if cache_item is not None:
5437 # startup perf: cache avoids ssh connect setup and feature detection roundtrips on revisits to same site
5438 p.connection_pools[loc] = cache_item.connection_pools
5439 if time.monotonic_ns() - cache_item.timestamp_nanos < p.remote_conf_cache_ttl_nanos: 5439 ↛ 5447line 5439 didn't jump to line 5447 because the condition on line 5439 was always true
5440 available_programs[loc] = cache_item.available_programs
5441 p.zpool_features[loc] = cache_item.zpool_features
5442 continue # cache hit, skip remote detection
5443 else:
5444 p.connection_pools[loc] = ConnectionPools(
5445 r, {SHARED: r.max_concurrent_ssh_sessions_per_tcp_connection, DEDICATED: 1}
5446 )
5447 self.detect_zpool_features(r)
5448 self.detect_available_programs_remote(r, available_programs, r.ssh_user_host)
5449 self.remote_conf_cache[remote_conf_cache_key] = RemoteConfCacheItem(
5450 p.connection_pools[loc], available_programs[loc], p.zpool_features[loc]
5451 )
5452 if r.use_zfs_delegation and p.zpool_features[loc].get("delegation") == "off":
5453 die(
5454 f"Permission denied as ZFS delegation is disabled for {r.location} "
5455 f"dataset: {r.basis_root_dataset}. Manually enable it via 'sudo zpool set delegation=on {r.pool}'"
5456 )
5458 locations = ["src", "dst", "local"]
5459 if params.compression_program == disable_prg:
5460 self.disable_program("zstd", locations)
5461 if params.mbuffer_program == disable_prg:
5462 self.disable_program("mbuffer", locations)
5463 if params.ps_program == disable_prg:
5464 self.disable_program("ps", locations)
5465 if params.pv_program == disable_prg:
5466 self.disable_program("pv", locations)
5467 if params.shell_program == disable_prg:
5468 self.disable_program("sh", locations)
5469 if params.sudo_program == disable_prg:
5470 self.disable_program("sudo", locations)
5471 if params.zpool_program == disable_prg:
5472 self.disable_program("zpool", locations)
5474 for key, programs in available_programs.items():
5475 for program in list(programs.keys()):
5476 if program.startswith("uname-"):
5477 # uname-Linux foo 5.15.0-69-generic #76-Ubuntu SMP Fri Mar 17 17:19:29 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux # noqa: E501
5478 # uname-FreeBSD freebsd 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64
5479 # uname-SunOS solaris 5.11 11.4.42.111.0 i86pc i386 i86pc # https://blogs.oracle.com/solaris/post/building-open-source-software-on-oracle-solaris-114-cbe-release # noqa: E501
5480 # uname-SunOS solaris 5.11 11.4.0.15.0 i86pc i386 i86pc
5481 # uname-Darwin foo 23.6.0 Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:04 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6020 arm64 # noqa: E501
5482 programs.pop(program)
5483 uname = program[len("uname-") :]
5484 programs["uname"] = uname
5485 log.log(log_trace, f"available_programs[{key}][uname]: %s", uname)
5486 programs["os"] = uname.split(" ")[0] # Linux|FreeBSD|SunOS|Darwin
5487 log.log(log_trace, f"available_programs[{key}][os]: %s", programs["os"])
5488 elif program.startswith("default_shell-"):
5489 programs.pop(program)
5490 default_shell = program[len("default_shell-") :]
5491 programs["default_shell"] = default_shell
5492 log.log(log_trace, f"available_programs[{key}][default_shell]: %s", default_shell)
5493 validate_default_shell(default_shell, r)
5494 elif program.startswith("getconf_cpu_count-"):
5495 programs.pop(program)
5496 getconf_cpu_count = program[len("getconf_cpu_count-") :]
5497 programs["getconf_cpu_count"] = getconf_cpu_count
5498 log.log(log_trace, f"available_programs[{key}][getconf_cpu_count]: %s", getconf_cpu_count)
5500 for key, programs in available_programs.items():
5501 log.debug(f"available_programs[{key}]: %s", list_formatter(programs, separator=", "))
5503 for r in [p.dst, p.src]:
5504 if r.sudo and not self.is_program_available("sudo", r.location):
5505 die(f"{p.sudo_program} CLI is not available on {r.location} host: {r.ssh_user_host or 'localhost'}")
5507 def disable_program(self, program: str, locations: List[str]) -> None:
5508 for location in locations:
5509 self.params.available_programs[location].pop(program, None)
5511 def find_available_programs(self) -> str:
5512 """POSIX shell script that checks for the existence of various programs. It uses `if` statements instead of `&&` plus
5513 `printf` instead of `echo` to ensure maximum compatibility across shells."""
5514 p = self.params
5515 cmds = []
5516 cmds.append("printf 'default_shell-%s\n' \"$SHELL\"")
5517 cmds.append("if command -v echo > /dev/null; then printf 'echo\n'; fi")
5518 cmds.append(f"if command -v {p.zpool_program} > /dev/null; then printf 'zpool\n'; fi")
5519 cmds.append(f"if command -v {p.ssh_program} > /dev/null; then printf 'ssh\n'; fi")
5520 cmds.append(f"if command -v {p.shell_program} > /dev/null; then printf 'sh\n'; fi")
5521 cmds.append(f"if command -v {p.sudo_program} > /dev/null; then printf 'sudo\n'; fi")
5522 cmds.append(f"if command -v {p.compression_program} > /dev/null; then printf 'zstd\n'; fi")
5523 cmds.append(f"if command -v {p.mbuffer_program} > /dev/null; then printf 'mbuffer\n'; fi")
5524 cmds.append(f"if command -v {p.pv_program} > /dev/null; then printf 'pv\n'; fi")
5525 cmds.append(f"if command -v {p.ps_program} > /dev/null; then printf 'ps\n'; fi")
5526 cmds.append(
5527 f"if command -v {p.psrinfo_program} > /dev/null; then "
5528 f"printf 'getconf_cpu_count-'; {p.psrinfo_program} -p; "
5529 f"elif command -v {p.getconf_program} > /dev/null; then "
5530 f"printf 'getconf_cpu_count-'; {p.getconf_program} _NPROCESSORS_ONLN; "
5531 "fi"
5532 )
5533 cmds.append(f"if command -v {p.uname_program} > /dev/null; then printf 'uname-'; {p.uname_program} -a || true; fi")
5534 return "; ".join(cmds)
5536 def detect_available_programs_remote(self, remote: Remote, available_programs: Dict, ssh_user_host: str) -> None:
5537 p, log = self.params, self.params.log
5538 location = remote.location
5539 available_programs_minimum = {"zpool": None, "sudo": None}
5540 available_programs[location] = {}
5541 lines = None
5542 try:
5543 # on Linux, 'zfs --version' returns with zero status and prints the correct info
5544 # on FreeBSD, 'zfs --version' always prints the same (correct) info as Linux, but nonetheless sometimes
5545 # returns with non-zero status (sometimes = if the zfs kernel module is not loaded)
5546 # on Solaris, 'zfs --version' returns with non-zero status without printing useful info as the --version
5547 # option is not known there
5548 lines = self.run_ssh_command(remote, log_trace, print_stderr=False, cmd=[p.zfs_program, "--version"])
5549 assert lines
5550 except (FileNotFoundError, PermissionError): # location is local and program file was not found
5551 die(f"{p.zfs_program} CLI is not available on {location} host: {ssh_user_host or 'localhost'}")
5552 except subprocess.CalledProcessError as e:
5553 if "unrecognized command '--version'" in e.stderr and "run: zfs help" in e.stderr:
5554 available_programs[location]["zfs"] = "notOpenZFS" # solaris-11.4 zfs does not know --version flag
5555 elif not e.stdout.startswith("zfs"): 5555 ↛ 5558line 5555 didn't jump to line 5558 because the condition on line 5555 was always true
5556 die(f"{p.zfs_program} CLI is not available on {location} host: {ssh_user_host or 'localhost'}")
5557 else:
5558 lines = e.stdout # FreeBSD if the zfs kernel module is not loaded
5559 assert lines
5560 if lines:
5561 line = lines.splitlines()[0]
5562 assert line.startswith("zfs")
5563 # Example: zfs-2.1.5~rc5-ubuntu3 -> 2.1.5, zfswin-2.2.3rc5 -> 2.2.3
5564 version = line.split("-")[1].strip()
5565 match = re.fullmatch(r"(\d+\.\d+\.\d+).*", version)
5566 assert match, "Unparsable zfs version string: " + version
5567 version = match.group(1)
5568 available_programs[location]["zfs"] = version
5569 if is_version_at_least(version, "2.1.0"):
5570 available_programs[location][zfs_version_is_at_least_2_1_0] = True
5571 if is_version_at_least(version, "2.2.0"):
5572 available_programs[location][zfs_version_is_at_least_2_2_0] = True
5573 log.log(log_trace, f"available_programs[{location}][zfs]: %s", available_programs[location]["zfs"])
5575 if p.shell_program != disable_prg:
5576 try:
5577 cmd = [p.shell_program, "-c", self.find_available_programs()]
5578 available_programs[location].update(
5579 dict.fromkeys(self.run_ssh_command(remote, log_trace, cmd=cmd).splitlines())
5580 )
5581 return
5582 except (FileNotFoundError, PermissionError) as e: # location is local and shell program file was not found
5583 if e.filename != p.shell_program: 5583 ↛ 5584line 5583 didn't jump to line 5584 because the condition on line 5583 was never true
5584 raise
5585 except subprocess.CalledProcessError:
5586 pass
5587 log.warning("%s", f"Failed to find {p.shell_program} on {location}. Continuing with minimal assumptions...")
5588 available_programs[location].update(available_programs_minimum)
5590 def is_solaris_zfs(self, remote: Remote) -> bool:
5591 return self.is_solaris_zfs_location(remote.location)
5593 def is_solaris_zfs_location(self, location: str) -> bool:
5594 if location == "local":
5595 return platform.system() == "SunOS"
5596 return self.params.available_programs[location].get("zfs") == "notOpenZFS"
5598 @staticmethod
5599 def is_dummy(r: Remote) -> bool:
5600 return r.root_dataset == dummy_dataset
5602 def detect_zpool_features(self, remote: Remote) -> None:
5603 p = params = self.params
5604 r, loc, log = remote, remote.location, p.log
5605 lines = []
5606 features = {}
5607 params.zpool_features.pop(loc, None)
5608 if self.is_dummy(r):
5609 params.zpool_features[loc] = {}
5610 return
5611 if params.zpool_program != disable_prg:
5612 cmd = params.split_args(f"{params.zpool_program} get -Hp -o property,value all", r.pool)
5613 try:
5614 lines = self.run_ssh_command(remote, log_trace, check=False, cmd=cmd).splitlines()
5615 except (FileNotFoundError, PermissionError) as e:
5616 if e.filename != params.zpool_program: 5616 ↛ 5617line 5616 didn't jump to line 5617 because the condition on line 5616 was never true
5617 raise
5618 log.warning(
5619 "%s", f"Failed to detect zpool features on {loc}: {r.pool}. Continuing with minimal assumptions ..."
5620 )
5621 else:
5622 props = {line.split("\t", 1)[0]: line.split("\t", 1)[1] for line in lines}
5623 features = {k: v for k, v in props.items() if k.startswith("feature@") or k == "delegation"}
5624 if len(lines) == 0:
5625 cmd = p.split_args(f"{p.zfs_program} list -t filesystem -Hp -o name -s name", r.pool)
5626 if self.try_ssh_command(remote, log_trace, cmd=cmd) is None:
5627 die(f"Pool does not exist for {loc} dataset: {r.basis_root_dataset}. Manually create the pool first!")
5628 params.zpool_features[loc] = features
5630 def is_zpool_feature_enabled_or_active(self, remote: Remote, feature: str) -> bool:
5631 return self.params.zpool_features[remote.location].get(feature) in ("active", "enabled")
5633 def are_bookmarks_enabled(self, remote: Remote) -> bool:
5634 return self.is_zpool_feature_enabled_or_active(
5635 remote, "feature@bookmark_v2"
5636 ) and self.is_zpool_feature_enabled_or_active(remote, "feature@bookmark_written")
5638 def is_caching_snapshots(self, remote: Remote) -> bool:
5639 return (
5640 self.params.is_caching_snapshots
5641 and self.is_program_available(zfs_version_is_at_least_2_2_0, remote.location)
5642 and self.is_zpool_feature_enabled_or_active(remote, "feature@extensible_dataset")
5643 )
5645 def check_zfs_dataset_busy(self, remote: Remote, dataset: str, busy_if_send: bool = True) -> bool:
5646 """Decline to start a state changing ZFS operation that is, although harmless, likely to collide with other
5647 currently running processes. Instead, retry the operation later, after some delay. For example, decline to
5648 start a 'zfs receive' into a destination dataset if another process is already running another 'zfs receive'
5649 into the same destination dataset, as ZFS would reject any such attempt. However, it's actually fine to run an
5650 incremental 'zfs receive' into a dataset in parallel with a 'zfs send' out of the very same dataset. This also
5651 helps daisy chain use cases where A replicates to B, and B replicates to C.
5653 check_zfs_dataset_busy() offers no guarantees, it merely proactively avoids likely collisions. In other words,
5654 even if the process check below passes there is no guarantee that the destination dataset won't be busy by the
5655 time we actually execute the 'zfs send' operation. In such an event ZFS will reject the operation, we'll detect
5656 that, and we'll simply retry, after some delay. check_zfs_dataset_busy() can be disabled via --ps-program=-.
5658 TLDR: As is common for long-running operations in distributed systems, we use coordination-free optimistic
5659 concurrency control where the parties simply retry on collision detection (rather than coordinate concurrency
5660 via a remote lock server)."""
5661 p, log = self.params, self.params.log
5662 if not self.is_program_available("ps", remote.location):
5663 return True
5664 cmd = p.split_args(f"{p.ps_program} -Ao args")
5665 procs = (self.try_ssh_command(remote, log_trace, cmd=cmd) or "").splitlines()
5666 if self.inject_params.get("is_zfs_dataset_busy", False):
5667 procs += ["sudo -n zfs receive -u -o foo:bar=/baz " + dataset] # for unit testing only
5668 if not self.is_zfs_dataset_busy(procs, dataset, busy_if_send=busy_if_send):
5669 return True
5670 op = "zfs {receive" + ("|send" if busy_if_send else "") + "} operation"
5671 try:
5672 die(f"Cannot continue now: Destination is already busy with {op} from another process: {dataset}")
5673 except SystemExit as e:
5674 log.warning("%s", e)
5675 raise RetryableError("dst currently busy with zfs mutation op") from e
5677 zfs_dataset_busy_prefix = r"(([^ ]*?/)?(sudo|doas)( +-n)? +)?([^ ]*?/)?zfs (receive|recv"
5678 zfs_dataset_busy_if_mods = re.compile((zfs_dataset_busy_prefix + ") .*").replace("(", "(?:"))
5679 zfs_dataset_busy_if_send = re.compile((zfs_dataset_busy_prefix + "|send) .*").replace("(", "(?:"))
5681 @staticmethod
5682 def is_zfs_dataset_busy(procs: List[str], dataset: str, busy_if_send: bool) -> bool:
5683 regex = Job.zfs_dataset_busy_if_send if busy_if_send else Job.zfs_dataset_busy_if_mods
5684 suffix = " " + dataset
5685 infix = " " + dataset + "@"
5686 return any((proc.endswith(suffix) or infix in proc) and regex.fullmatch(proc) for proc in procs)
5688 def run_ssh_cmd_batched(
5689 self,
5690 r: Remote,
5691 cmd: List[str],
5692 cmd_args: List[str],
5693 fn: Callable[[List[str]], Any],
5694 max_batch_items: int = 2**29,
5695 sep: str = " ",
5696 ) -> None:
5697 drain(self.itr_ssh_cmd_batched(r, cmd, cmd_args, fn, max_batch_items=max_batch_items, sep=sep))
5699 def itr_ssh_cmd_batched(
5700 self,
5701 r: Remote,
5702 cmd: List[str],
5703 cmd_args: List[str],
5704 fn: Callable[[List[str]], Any],
5705 max_batch_items: int = 2**29,
5706 sep: str = " ",
5707 ) -> Generator[Any, None, None]:
5708 """Runs fn(cmd_args) in batches w/ cmd, without creating a command line that's too big for the OS to handle."""
5709 max_bytes = min(self.get_max_command_line_bytes("local"), self.get_max_command_line_bytes(r.location))
5710 assert isinstance(sep, str)
5711 # Max size of a single argument is 128KB on Linux - https://lists.gnu.org/archive/html/bug-bash/2020-09/msg00095.html
5712 max_bytes = max_bytes if sep == " " else min(max_bytes, 131071) # e.g. 'zfs destroy foo@s1,s2,...,sN'
5713 fsenc = sys.getfilesystemencoding()
5714 seplen = len(sep.encode(fsenc))
5715 conn_pool: ConnectionPool = self.params.connection_pools[r.location].pool(SHARED)
5716 conn: Connection = conn_pool.get_connection()
5717 cmd = conn.ssh_cmd + cmd
5718 conn_pool.return_connection(conn)
5719 header_bytes: int = len(" ".join(cmd).encode(fsenc))
5720 batch: List[str] = []
5721 total_bytes: int = header_bytes
5722 max_items = max_batch_items
5724 def flush() -> Any:
5725 if len(batch) > 0:
5726 return fn(batch)
5727 return None
5729 for cmd_arg in cmd_args:
5730 curr_bytes = seplen + len(cmd_arg.encode(fsenc))
5731 if total_bytes + curr_bytes > max_bytes or max_items <= 0:
5732 results = flush()
5733 if results is not None:
5734 yield results
5735 batch, total_bytes, max_items = [], header_bytes, max_batch_items
5736 batch.append(cmd_arg)
5737 total_bytes += curr_bytes
5738 max_items -= 1
5739 results = flush()
5740 if results is not None:
5741 yield results
5743 def run_ssh_cmd_parallel(
5744 self,
5745 r: Remote,
5746 cmd_args_list: List[Tuple[List[str], List[str]]],
5747 fn: Callable[[List[str], List[str]], Any],
5748 max_batch_items: int = 2**29,
5749 ) -> None:
5750 drain(self.itr_ssh_cmd_parallel(r, cmd_args_list, fn=fn, max_batch_items=max_batch_items, ordered=False))
5752 def itr_ssh_cmd_parallel(
5753 self,
5754 r: Remote,
5755 cmd_args_list: List[Tuple[List[str], List[str]]],
5756 fn: Callable[[List[str], List[str]], Any],
5757 max_batch_items: int = 2**29,
5758 ordered: bool = True,
5759 ) -> Generator[Any, None, Any]:
5760 """Returns output datasets in the same order as the input datasets (not in random order) if ordered == True."""
5761 max_workers = self.max_workers[r.location]
5762 with ThreadPoolExecutor(max_workers=max_workers) as executor:
5763 iterators = [
5764 self.itr_ssh_cmd_batched(
5765 r, cmd, cmd_args, lambda batch, cmd=cmd: executor.submit(fn, cmd, batch), max_batch_items=max_batch_items # type: ignore[misc]
5766 )
5767 for cmd, cmd_args in cmd_args_list
5768 ]
5769 iterator = itertools.chain(*iterators)
5770 iterators.clear() # help gc
5771 # Materialize the next N futures into a buffer, causing submission + parallel execution of their CLI calls
5772 fifo_buffer: Deque[Future] = deque(itertools.islice(iterator, max_workers))
5773 next_future: Optional[Future]
5775 if ordered:
5776 while fifo_buffer: # submit the next CLI call whenever the current CLI call returns
5777 curr_future: Future = fifo_buffer.popleft()
5778 next_future = next(iterator, None) # causes the next CLI call to be submitted
5779 if next_future is not None:
5780 fifo_buffer.append(next_future)
5781 yield curr_future.result() # blocks until CLI returns
5782 else:
5783 todo_futures: Set[Future] = set(fifo_buffer)
5784 fifo_buffer.clear() # help gc
5785 while todo_futures:
5786 done_futures, todo_futures = concurrent.futures.wait(todo_futures, return_when=FIRST_COMPLETED) # blocks
5787 for done_future in done_futures: # submit the next CLI call whenever a CLI call returns
5788 next_future = next(iterator, None) # causes the next CLI call to be submitted
5789 if next_future is not None:
5790 todo_futures.add(next_future)
5791 yield done_future.result() # does not block as processing has already completed
5792 assert next(iterator, None) is None
5794 def zfs_list_snapshots_in_parallel(
5795 self, r: Remote, cmd: List[str], datasets: List[str], ordered: bool = True
5796 ) -> Generator[Any, None, Any]:
5797 """Runs 'zfs list -t snapshot' on multiple datasets at the same time."""
5798 max_workers = self.max_workers[r.location]
5799 return self.itr_ssh_cmd_parallel(
5800 r,
5801 [(cmd, datasets)],
5802 fn=lambda cmd, batch: (self.try_ssh_command(r, log_trace, cmd=cmd + batch) or "").splitlines(),
5803 max_batch_items=min(
5804 self.max_datasets_per_minibatch_on_list_snaps[r.location],
5805 max(
5806 len(datasets) // (max_workers if r.ssh_user_host else max_workers * 8),
5807 max_workers if r.ssh_user_host else 1,
5808 ),
5809 ),
5810 ordered=ordered,
5811 )
5813 @staticmethod
5814 def run_in_parallel(fn1: Callable[[], Any], fn2: Callable[[], Any]) -> Tuple[Any, Any]:
5815 """perf: Runs both I/O functions in parallel/concurrently."""
5816 with ThreadPoolExecutor(max_workers=1) as executor:
5817 future: Future = executor.submit(fn2) # async fn2
5818 result1 = fn1() # blocks until fn1 call returns
5819 result2 = future.result() # blocks until fn2 call returns
5820 return result1, result2
5822 def get_max_command_line_bytes(self, location: str, os_name: Optional[str] = None) -> int:
5823 """Remote flavor of os.sysconf("SC_ARG_MAX") - size(os.environb) - safety margin"""
5824 os_name = os_name if os_name else self.params.available_programs[location].get("os")
5825 if os_name == "Linux":
5826 arg_max = 2 * 1024 * 1024
5827 elif os_name == "FreeBSD":
5828 arg_max = 256 * 1024
5829 elif os_name == "SunOS":
5830 arg_max = 1 * 1024 * 1024
5831 elif os_name == "Darwin":
5832 arg_max = 1 * 1024 * 1024
5833 elif os_name == "Windows":
5834 arg_max = 32 * 1024
5835 else:
5836 arg_max = 256 * 1024 # unknown
5838 environ_size = 4 * 1024 # typically is 1-4 KB
5839 safety_margin = (8 * 2 * 4 + 4) * 1024 if arg_max >= 1 * 1024 * 1024 else 8 * 1024
5840 max_bytes = max(4 * 1024, arg_max - environ_size - safety_margin)
5841 if self.max_command_line_bytes is not None:
5842 return self.max_command_line_bytes # for testing only
5843 else:
5844 return max_bytes
5847#############################################################################
5848@dataclass(order=True, repr=False)
5849class Connection:
5850 """Represents the ability to multiplex N=capacity concurrent SSH sessions over the same TCP connection."""
5852 free: int # sort order evens out the number of concurrent sessions among the TCP connections
5853 last_modified: int # LIFO: tiebreaker favors latest returned conn as that's most alive and hot
5855 def __init__(self, remote: Remote, max_concurrent_ssh_sessions_per_tcp_connection: int, cid: int) -> None:
5856 assert max_concurrent_ssh_sessions_per_tcp_connection > 0
5857 self.capacity: int = max_concurrent_ssh_sessions_per_tcp_connection
5858 self.free: int = max_concurrent_ssh_sessions_per_tcp_connection
5859 self.last_modified: int = 0
5860 self.cid: int = cid
5861 self.ssh_cmd: List[str] = remote.local_ssh_command()
5862 self.ssh_cmd_quoted: List[str] = [shlex.quote(item) for item in self.ssh_cmd]
5863 self.lock: threading.Lock = threading.Lock()
5864 self.last_refresh_time: int = 0
5866 def __repr__(self) -> str:
5867 return str({"free": self.free, "cid": self.cid})
5869 def increment_free(self, value: int) -> None:
5870 self.free += value
5871 assert self.free >= 0
5872 assert self.free <= self.capacity
5874 def is_full(self) -> bool:
5875 return self.free <= 0
5877 def update_last_modified(self, last_modified: int) -> None:
5878 self.last_modified = last_modified
5880 def shutdown(self, msg_prefix: str, p: Params) -> None:
5881 ssh_cmd = self.ssh_cmd
5882 if ssh_cmd:
5883 ssh_socket_cmd = ssh_cmd[0:-1] + ["-O", "exit", ssh_cmd[-1]]
5884 p.log.log(log_trace, f"Executing {msg_prefix}: %s", shlex.join(ssh_socket_cmd))
5885 process = subprocess.run(ssh_socket_cmd, stdin=DEVNULL, stderr=PIPE, text=True)
5886 if process.returncode != 0:
5887 p.log.log(log_trace, "%s", process.stderr.rstrip())
5890#############################################################################
5891class ConnectionPool:
5892 """Fetch a TCP connection for use in an SSH session, use it, finally return it back to the pool for future reuse."""
5894 def __init__(self, remote: Remote, max_concurrent_ssh_sessions_per_tcp_connection: int) -> None:
5895 assert max_concurrent_ssh_sessions_per_tcp_connection > 0
5896 self.remote: Remote = copy.copy(remote) # shallow copy for immutability (Remote is mutable)
5897 self.capacity: int = max_concurrent_ssh_sessions_per_tcp_connection
5898 self.priority_queue: SmallPriorityQueue = SmallPriorityQueue(reverse=True) # sorted by #free slots and last_modified
5899 self.last_modified: int = 0 # monotonically increasing sequence number
5900 self.cid: int = 0 # monotonically increasing connection number
5901 self._lock: threading.Lock = threading.Lock()
5903 def get_connection(self) -> Connection:
5904 """Any Connection object returned on get_connection() also remains intentionally contained in the priority queue,
5905 and that identical Connection object is later, on return_connection(), temporarily removed from the priority queue,
5906 updated with an incremented "free" slot count and then immediately reinserted into the priority queue. In effect,
5907 any Connection object remains intentionally contained in the priority queue at all times."""
5908 with self._lock:
5909 conn = self.priority_queue.pop() if len(self.priority_queue) > 0 else None
5910 if conn is None or conn.is_full():
5911 if conn is not None:
5912 self.priority_queue.push(conn)
5913 conn = Connection(self.remote, self.capacity, self.cid) # add a new connection
5914 self.last_modified += 1
5915 conn.update_last_modified(self.last_modified) # LIFO tiebreaker favors latest conn as that's most alive
5916 self.cid += 1
5917 conn.increment_free(-1)
5918 self.priority_queue.push(conn)
5919 return conn
5921 def return_connection(self, conn: Connection) -> None:
5922 assert conn is not None
5923 with self._lock:
5924 # update priority = remove conn from queue, increment priority, finally reinsert updated conn into queue
5925 if self.priority_queue.remove(conn): # conn is not contained only if ConnectionPool.shutdown() was called
5926 conn.increment_free(1)
5927 self.last_modified += 1
5928 conn.update_last_modified(self.last_modified) # LIFO tiebreaker favors latest conn as that's most alive
5929 self.priority_queue.push(conn)
5931 def shutdown(self, msg_prefix: str) -> None:
5932 with self._lock:
5933 if self.remote.reuse_ssh_connection:
5934 for conn in self.priority_queue:
5935 conn.shutdown(msg_prefix, self.remote.params)
5936 self.priority_queue.clear()
5938 def __repr__(self) -> str:
5939 with self._lock:
5940 queue = self.priority_queue
5941 return str({"capacity": self.capacity, "queue_len": len(queue), "cid": self.cid, "queue": queue})
5944#############################################################################
5945class ConnectionPools:
5946 """A bunch of named connection pools with various multiplexing capacities."""
5948 def __init__(self, remote: Remote, capacities: Dict[str, int]) -> None:
5949 self.pools = {name: ConnectionPool(remote, capacity) for name, capacity in capacities.items()}
5951 def __repr__(self) -> str:
5952 return str(self.pools)
5954 def pool(self, name: str) -> ConnectionPool:
5955 return self.pools[name]
5957 def shutdown(self, msg_prefix: str) -> None:
5958 for name, pool in self.pools.items():
5959 pool.shutdown(msg_prefix + "/" + name)
5962#############################################################################
5963class ProgressReporter:
5964 """Periodically prints progress updates to the same console status line, which is helpful if the program runs in an
5965 interactive Unix terminal session. Tails the 'pv' output log files that are being written to by (parallel) replication,
5966 and extracts aggregate progress and throughput metrics from them, such as MB, MB/s, ETA, etc. Periodically prints these
5967 metrics to the console status line (but not to the log file), and in doing so "visually overwrites" the previous status
5968 line, via appending a \r carriage return control char rather than a \n newline char. Does not print a status line if the
5969 Unix environment var 'bzfs_isatty' is set to 'false', in order not to confuse programs that scrape redirected stdout.
5970 Example console status line:
5971 2025-01-17 01:23:04 [I] zfs sent 41.7 GiB 0:00:46 [963 MiB/s] [907 MiB/s] [==========> ] 80% ETA 0:00:04 ETA 01:23:08"""
5973 def __init__(
5974 self, p: Params, use_select: bool, progress_update_intervals: Optional[Tuple[float, float]], fail: bool = False
5975 ) -> None:
5976 # immutable variables:
5977 self.params: Params = p
5978 self.use_select: bool = use_select
5979 self.progress_update_intervals = progress_update_intervals
5980 self.inject_error: bool = fail # for testing only
5982 # mutable variables:
5983 self.thread: Optional[threading.Thread] = None
5984 self.exception: Optional[BaseException] = None
5985 self.lock: threading.Lock = threading.Lock()
5986 self.sleeper: InterruptibleSleep = InterruptibleSleep(self.lock) # sleeper shares lock with reporter
5987 self.file_name_queue: Set[str] = set()
5988 self.file_name_set: Set[str] = set()
5989 self.is_resetting = True
5990 self.is_pausing = False
5992 def start(self) -> None:
5993 with self.lock:
5994 assert self.thread is None
5995 self.thread = threading.Thread(target=lambda: self._run(), name="progress_reporter", daemon=True)
5996 self.thread.start()
5998 def stop(self) -> None:
5999 """Blocks until reporter is stopped, then reraises any exception that may have happened during log processing."""
6000 self.sleeper.interrupt()
6001 t = self.thread
6002 if t is not None:
6003 t.join()
6004 e = self.exception
6005 if e is not None:
6006 raise e # reraise exception in current thread
6008 def pause(self) -> None:
6009 with self.lock:
6010 self.is_pausing = True
6012 def reset(self) -> None:
6013 with self.lock:
6014 self.is_resetting = True
6016 def enqueue_pv_log_file(self, pv_log_file: str) -> None:
6017 """Tells progress reporter thread to also monitor and tail the given pv log file."""
6018 with self.lock:
6019 if pv_log_file not in self.file_name_set:
6020 self.file_name_queue.add(pv_log_file)
6022 def _run(self) -> None:
6023 log = self.params.log
6024 try:
6025 fds: List[TextIO] = []
6026 try:
6027 selector = selectors.SelectSelector() if self.use_select else selectors.PollSelector()
6028 try:
6029 self._run_internal(fds, selector)
6030 finally:
6031 selector.close()
6032 finally:
6033 for fd in fds:
6034 fd.close()
6035 except BaseException as e:
6036 self.exception = e # will be reraised in stop()
6037 log.error("%s", "ProgressReporter:", exc_info=e)
6039 @dataclass
6040 class TransferStat:
6041 @dataclass(order=True)
6042 class ETA: # Estimated time of arrival
6043 timestamp_nanos: int # sorted by future time at which current zfs send/recv transfer is estimated to complete
6044 seq_nr: int # tiebreaker wrt. sort order
6045 line_tail: str = field(compare=False) # trailing pv log line part w/ progress bar, duration ETA, timestamp ETA
6047 bytes_in_flight: int
6048 eta: ETA
6050 def _run_internal(self, fds: List[TextIO], selector: selectors.BaseSelector) -> None:
6052 class Sample(NamedTuple):
6053 sent_bytes: int
6054 timestamp_nanos: int
6056 log = self.params.log
6057 update_interval_secs, sliding_window_secs = (
6058 self.progress_update_intervals if self.progress_update_intervals is not None else self.get_update_intervals()
6059 )
6060 update_interval_nanos: int = round(update_interval_secs * 1_000_000_000)
6061 sliding_window_nanos: int = round(sliding_window_secs * 1_000_000_000)
6062 sleep_nanos = round(update_interval_nanos / 2.5)
6063 etas: List = []
6064 while True:
6065 empty_file_name_queue: Set[str] = set()
6066 with self.lock:
6067 if self.sleeper.is_stopping:
6068 return
6069 # progress reporter thread picks up pv log files that so far aren't being tailed
6070 n = len(self.file_name_queue)
6071 m = len(self.file_name_set)
6072 self.file_name_set.update(self.file_name_queue) # union
6073 assert len(self.file_name_set) == n + m # aka assert (previous) file_name_set.isdisjoint(file_name_queue)
6074 local_file_name_queue = self.file_name_queue
6075 self.file_name_queue = empty_file_name_queue # exchange buffers
6076 is_pausing = self.is_pausing
6077 self.is_pausing = False
6078 is_resetting = self.is_resetting
6079 self.is_resetting = False
6080 if is_pausing:
6081 next_update_nanos = time.monotonic_ns() + 10 * 365 * 86400 * 1_000_000_000 # infinity
6082 if is_resetting:
6083 sent_bytes, last_status_len = 0, 0
6084 num_lines, num_readables = 0, 0
6085 start_time_nanos = time.monotonic_ns()
6086 next_update_nanos = start_time_nanos + update_interval_nanos
6087 latest_samples: Deque[Sample] = deque([Sample(0, start_time_nanos)]) # sliding window w/ recent measurements
6088 for pv_log_file in local_file_name_queue:
6089 try:
6090 Path(pv_log_file).touch()
6091 fd = open(pv_log_file, mode="r", newline="", encoding="utf-8")
6092 except FileNotFoundError: # a third party has somehow deleted the log file or directory
6093 with self.lock:
6094 self.file_name_set.discard(pv_log_file) # enable re-adding the file later via enqueue_pv_log_file()
6095 log.warning("ProgressReporter: pv log file disappeared before initial open, skipping: %s", pv_log_file)
6096 continue # skip to the next file in the queue
6097 fds.append(fd)
6098 eta = self.TransferStat.ETA(timestamp_nanos=0, seq_nr=-len(fds), line_tail="")
6099 selector.register(fd, selectors.EVENT_READ, data=(iter(fd), self.TransferStat(bytes_in_flight=0, eta=eta)))
6100 etas.append(eta)
6101 readables = selector.select(timeout=0) # 0 indicates "don't block"
6102 has_line = False
6103 curr_time_nanos = time.monotonic_ns()
6104 selector_key: selectors.SelectorKey
6105 for selector_key, _ in readables: # for each file that's ready for non-blocking read
6106 num_readables += 1
6107 iter_fd, s = selector_key.data
6108 for line in iter_fd: # aka iter(fd)
6109 sent_bytes += self.update_transfer_stat(line, s, curr_time_nanos)
6110 num_lines += 1
6111 has_line = True
6112 if curr_time_nanos >= next_update_nanos:
6113 elapsed_nanos = curr_time_nanos - start_time_nanos
6114 msg0, msg3 = self.format_sent_bytes(sent_bytes, elapsed_nanos) # throughput etc since replication start time
6115 msg1 = self.format_duration(elapsed_nanos) # duration since replication start time
6116 oldest: Sample = latest_samples[0] # throughput etc, over sliding window
6117 _, msg2 = self.format_sent_bytes(sent_bytes - oldest.sent_bytes, curr_time_nanos - oldest.timestamp_nanos)
6118 msg4 = max(etas).line_tail if len(etas) > 0 else "" # progress bar, ETAs
6119 timestamp = datetime.now().isoformat(sep=" ", timespec="seconds") # 2024-09-03 12:26:15
6120 status_line = f"{timestamp} [I] zfs sent {msg0} {msg1} {msg2} {msg3} {msg4}"
6121 status_line = status_line.ljust(last_status_len) # "overwrite" trailing chars of previous status with spaces
6123 # The Unix console skips back to the beginning of the console line when it sees this \r control char:
6124 sys.stdout.write(f"{status_line}\r")
6125 sys.stdout.flush()
6127 # log.log(log_trace, "\nnum_lines: %s, num_readables: %s", num_lines, num_readables)
6128 last_status_len = len(status_line.rstrip())
6129 next_update_nanos += update_interval_nanos
6130 latest_samples.append(Sample(sent_bytes, curr_time_nanos))
6131 if elapsed_nanos >= sliding_window_nanos:
6132 latest_samples.popleft() # slide the sliding window containing recent measurements
6133 elif not has_line:
6134 # Avoid burning CPU busily spinning on I/O readiness as fds are almost always ready for non-blocking read
6135 # even if no new pv log line has been written. Yet retain ability to wake up immediately on reporter.stop().
6136 self.sleeper.sleep(min(sleep_nanos, next_update_nanos - curr_time_nanos))
6137 if self.inject_error:
6138 raise ValueError("Injected ProgressReporter error") # for testing only
6140 def update_transfer_stat(self, line: str, s: TransferStat, curr_time_nanos: int) -> int:
6141 num_bytes, s.eta.timestamp_nanos, s.eta.line_tail = self.parse_pv_line(line, curr_time_nanos)
6142 bytes_in_flight = s.bytes_in_flight
6143 s.bytes_in_flight = num_bytes if line.endswith("\r") else 0 # intermediate vs. final status update of each transfer
6144 return num_bytes - bytes_in_flight
6146 no_rates_regex = re.compile(r".*/s\s*[)\]]?\s*") # matches until end of last pv rate, e.g. "834MiB/s]" or "834MiB/s)"
6147 # time remaining --eta "ETA 00:00:39" or "ETA 2+0:00:39" or "ETA 2:0:00:39", followed by trailing --fineta timestamp ETA
6148 time_remaining_eta_regex = re.compile(r".*?ETA\s*((\d+)[+:])?(\d\d?):(\d\d):(\d\d).*(ETA|FIN).*")
6150 @staticmethod
6151 def parse_pv_line(line: str, curr_time_nanos: int) -> Tuple[int, int, str]:
6152 assert isinstance(line, str)
6153 if ":" in line:
6154 line = line.split(":", 1)[1].strip()
6155 sent_bytes, line = pv_size_to_bytes(line)
6156 line = ProgressReporter.no_rates_regex.sub("", line.lstrip(), 1) # remove pv --timer, --rate, --average-rate
6157 if match := ProgressReporter.time_remaining_eta_regex.fullmatch(line): # extract pv --eta duration
6158 _, days, hours, minutes, secs, _ = match.groups()
6159 time_remaining_secs = (86400 * int(days) if days else 0) + int(hours) * 3600 + int(minutes) * 60 + int(secs)
6160 curr_time_nanos += time_remaining_secs * 1_000_000_000 # ETA timestamp = now + time remaining duration
6161 return sent_bytes, curr_time_nanos, line
6162 return 0, curr_time_nanos, ""
6164 @staticmethod
6165 def format_sent_bytes(num_bytes: int, duration_nanos: int) -> Tuple[str, str]:
6166 bytes_per_sec = round(1_000_000_000 * num_bytes / max(1, duration_nanos))
6167 return f"{human_readable_bytes(num_bytes, precision=2)}", f"[{human_readable_bytes(bytes_per_sec, precision=2)}/s]"
6169 @staticmethod
6170 def format_duration(duration_nanos: int) -> str:
6171 total_seconds = round(duration_nanos / 1_000_000_000)
6172 hours, remainder = divmod(total_seconds, 3600)
6173 minutes, seconds = divmod(remainder, 60)
6174 return f"{hours}:{minutes:02d}:{seconds:02d}"
6176 def get_update_intervals(self) -> Tuple[float, float]:
6177 parser = argparse.ArgumentParser(allow_abbrev=False)
6178 parser.add_argument("--interval", "-i", type=float, default=1)
6179 parser.add_argument("--average-rate-window", "-m", type=float, default=30)
6180 args, _ = parser.parse_known_args(args=self.params.pv_program_opts)
6181 interval = min(60 * 60, max(args.interval, 0.1))
6182 return interval, min(60 * 60, max(args.average_rate_window, interval))
6185#############################################################################
6186class InterruptibleSleep:
6187 """Provides a sleep(timeout) function that can be interrupted by another thread."""
6189 def __init__(self, lock: Optional[threading.Lock] = None) -> None:
6190 self.is_stopping: bool = False
6191 self._lock = lock if lock is not None else threading.Lock()
6192 self._condition = threading.Condition(self._lock)
6194 def sleep(self, duration_nanos: int) -> None:
6195 """Delays the current thread by the given number of nanoseconds."""
6196 end_time_nanos = time.monotonic_ns() + duration_nanos
6197 with self._lock:
6198 while not self.is_stopping:
6199 diff_nanos = end_time_nanos - time.monotonic_ns()
6200 if diff_nanos <= 0:
6201 return
6202 self._condition.wait(timeout=diff_nanos / 1_000_000_000) # release, then block until notified or timeout
6204 def interrupt(self) -> None:
6205 """Wakes up currently sleeping threads and makes any future sleep()s a noop."""
6206 with self._lock:
6207 if not self.is_stopping:
6208 self.is_stopping = True
6209 self._condition.notify_all()
6212#############################################################################
6213def fix_send_recv_opts(
6214 opts: List[str],
6215 exclude_long_opts: Set[str],
6216 exclude_short_opts: str,
6217 include_arg_opts: Set[str],
6218 exclude_arg_opts: FrozenSet[str] = frozenset(),
6219) -> List[str]:
6220 """These opts are instead managed via bzfs CLI args --dryrun, etc."""
6221 assert "-" not in exclude_short_opts
6222 results = []
6223 i = 0
6224 n = len(opts)
6225 while i < n:
6226 opt = opts[i]
6227 i += 1
6228 if opt in exclude_arg_opts: # example: {"-X", "--exclude"}
6229 i += 1
6230 continue
6231 elif opt in include_arg_opts: # example: {"-o", "-x"}
6232 results.append(opt)
6233 if i < n:
6234 results.append(opts[i])
6235 i += 1
6236 elif opt not in exclude_long_opts: # example: {"--dryrun", "--verbose"}
6237 if opt.startswith("-") and opt != "-" and not opt.startswith("--"):
6238 for char in exclude_short_opts: # example: "den"
6239 opt = opt.replace(char, "")
6240 if opt == "-":
6241 continue
6242 results.append(opt)
6243 return results
6246def fix_solaris_raw_mode(lst: List[str]) -> List[str]:
6247 lst = ["-w" if opt == "--raw" else opt for opt in lst]
6248 lst = ["compress" if opt == "--compressed" else opt for opt in lst]
6249 i = lst.index("-w") if "-w" in lst else -1
6250 if i >= 0:
6251 i += 1
6252 if i == len(lst) or (lst[i] != "none" and lst[i] != "compress" and lst[i] != "crypto"):
6253 lst.insert(i, "none")
6254 return lst
6257ssh_master_domain_socket_file_pid_regex = re.compile(r"^[0-9]+") # see socket_name in local_ssh_command()
6260def delete_stale_files(
6261 root_dir: str,
6262 prefix: str,
6263 millis: int = 60 * 60 * 1000,
6264 dirs: bool = False,
6265 exclude: Optional[str] = None,
6266 ssh: bool = False,
6267) -> None:
6268 """Cleans up obsolete files. For example caused by abnormal termination, OS crash."""
6269 seconds = millis / 1000
6270 now = time.time()
6271 for entry in os.scandir(root_dir):
6272 if entry.name == exclude or not entry.name.startswith(prefix):
6273 continue
6274 try:
6275 if ((dirs and entry.is_dir()) or (not dirs and not entry.is_dir())) and now - entry.stat().st_mtime >= seconds:
6276 if dirs:
6277 shutil.rmtree(entry.path, ignore_errors=True)
6278 elif not (ssh and stat.S_ISSOCK(entry.stat().st_mode)):
6279 os.remove(entry.path)
6280 elif match := ssh_master_domain_socket_file_pid_regex.match(entry.name[len(prefix) :]): 6280 ↛ 6271line 6280 didn't jump to line 6271 because the condition on line 6280 was always true
6281 pid = int(match.group(0))
6282 if pid_exists(pid) is False or now - entry.stat().st_mtime >= 31 * 24 * 60 * 60:
6283 os.remove(entry.path) # bzfs process is nomore alive hence its ssh master process isn't alive either
6284 except FileNotFoundError:
6285 pass # harmless
6288def die(msg: str, exit_code: int = die_status) -> NoReturn:
6289 ex = SystemExit(msg)
6290 ex.code = exit_code
6291 raise ex
6294def filter_lines(input_list: Iterable[str], input_set: Set[str]) -> List[str]:
6295 """For each line in input_list, includes the line if input_set contains the first column field of that line."""
6296 if len(input_set) == 0:
6297 return []
6298 return [line for line in input_list if line[0 : line.index("\t")] in input_set]
6301def filter_lines_except(input_list: List[str], input_set: Set[str]) -> List[str]:
6302 """For each line in input_list, includes the line if input_set does not contain the first column field of that line."""
6303 if len(input_set) == 0:
6304 return input_list
6305 return [line for line in input_list if line[0 : line.index("\t")] not in input_set]
6308def has_siblings(sorted_datasets: List[str]) -> bool:
6309 """Returns whether the (sorted) list of input datasets contains any siblings."""
6310 skip_dataset = DONT_SKIP_DATASET
6311 parents: Set[str] = set()
6312 for dataset in sorted_datasets:
6313 assert dataset
6314 parent = os.path.dirname(dataset)
6315 if parent in parents:
6316 return True # I have a sibling if my parent already has another child
6317 parents.add(parent)
6318 if is_descendant(dataset, of_root_dataset=skip_dataset):
6319 continue
6320 if skip_dataset != DONT_SKIP_DATASET:
6321 return True # I have a sibling if I am a root dataset and another root dataset already exists
6322 skip_dataset = dataset
6323 return False
6326def is_descendant(dataset: str, of_root_dataset: str) -> bool:
6327 return (dataset + "/").startswith(of_root_dataset + "/")
6330def relativize_dataset(dataset: str, root_dataset: str) -> str:
6331 """Converts an absolute dataset path to a relative dataset path wrt root_dataset
6332 Example: root_dataset=tank/foo, dataset=tank/foo/bar/baz --> relative_path=/bar/baz"""
6333 return dataset[len(root_dataset) :]
6336def replace_prefix(s: str, old_prefix: str, new_prefix: str) -> str:
6337 """In a string s, replaces a leading old_prefix string with new_prefix. Assumes the leading string is present."""
6338 assert s.startswith(old_prefix)
6339 return new_prefix + s[len(old_prefix) :]
6342def replace_in_lines(lines: List[str], old: str, new: str, count: int = -1) -> None:
6343 for i in range(len(lines)):
6344 lines[i] = lines[i].replace(old, new, count)
6347def has_duplicates(sorted_list: List) -> bool:
6348 """Returns True if any adjacent items within the given sorted sequence are equal."""
6349 return any(a == b for a, b in zip(sorted_list, sorted_list[1:]))
6352def is_included(name: str, include_regexes: RegexList, exclude_regexes: RegexList) -> bool:
6353 """Returns True if the name matches at least one of the include regexes but none of the exclude regexes; else False.
6354 A regex that starts with a `!` is a negation - the regex matches if the regex without the `!` prefix does not match."""
6355 for regex, is_negation in exclude_regexes:
6356 is_match = regex.fullmatch(name) if regex.pattern != ".*" else True
6357 if is_negation:
6358 is_match = not is_match
6359 if is_match:
6360 return False
6362 for regex, is_negation in include_regexes:
6363 is_match = regex.fullmatch(name) if regex.pattern != ".*" else True
6364 if is_negation:
6365 is_match = not is_match
6366 if is_match:
6367 return True
6369 return False
6372def compile_regexes(regexes: List[str], suffix: str = "") -> RegexList:
6373 assert isinstance(regexes, list)
6374 compiled_regexes = []
6375 for regex in regexes:
6376 if suffix: # disallow non-trailing end-of-str symbol in dataset regexes to ensure descendants will also match
6377 if regex.endswith("\\$"):
6378 pass # trailing literal $ is ok
6379 elif regex.endswith("$"):
6380 regex = regex[0:-1] # ok because all users of compile_regexes() call re.fullmatch()
6381 elif "$" in regex:
6382 raise re.error("Must not use non-trailing '$' character", regex)
6383 if is_negation := regex.startswith("!"):
6384 regex = regex[1:]
6385 regex = replace_capturing_groups_with_non_capturing_groups(regex)
6386 if regex != ".*" or not (suffix.startswith("(") and suffix.endswith(")?")):
6387 regex = f"{regex}{suffix}"
6388 compiled_regexes.append((re.compile(regex), is_negation))
6389 return compiled_regexes
6392def replace_capturing_groups_with_non_capturing_groups(regex: str) -> str:
6393 """Replaces regex capturing groups with non-capturing groups for better matching performance.
6394 Example: '(.*/)?tmp(foo|bar)(?!public)\\(' --> '(?:.*/)?tmp(?:foo|bar)(?!public)\\()'
6395 Aka replaces brace '(' followed by a char other than question mark '?', but not preceded by a backslash
6396 with the replacement string '(?:'
6397 Also see https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups"""
6398 # pattern = re.compile(r'(?<!\\)\((?!\?)')
6399 # return pattern.sub('(?:', regex)
6400 i = len(regex) - 2
6401 while i >= 0:
6402 i = regex.rfind("(", 0, i + 1)
6403 if i >= 0 and regex[i] == "(" and (regex[i + 1] != "?") and (i == 0 or regex[i - 1] != "\\"):
6404 regex = f"{regex[0:i]}(?:{regex[i + 1:]}"
6405 i -= 1
6406 return regex
6409def getenv_any(key: str, default: Optional[str] = None) -> Optional[str]:
6410 """All shell environment variable names used for configuration start with this prefix."""
6411 return os.getenv(env_var_prefix + key, default)
6414def getenv_int(key: str, default: int) -> int:
6415 return int(cast(str, getenv_any(key, str(default))))
6418def getenv_bool(key: str, default: bool = False) -> bool:
6419 return cast(str, getenv_any(key, str(default))).lower().strip().lower() == "true"
6422P = TypeVar("P")
6425def find_match(
6426 seq: Sequence[P],
6427 predicate: Callable[[P], bool],
6428 start: Optional[int] = None,
6429 end: Optional[int] = None,
6430 reverse: bool = False,
6431 raises: Union[bool, str, Callable[[], str]] = False, # raises: bool | str | Callable = False, # python >= 3.10
6432) -> int:
6433 """Returns the integer index within seq of the first item (or last item if reverse==True) that matches the given
6434 predicate condition. If no matching item is found returns -1 or ValueError, depending on the raises parameter,
6435 which is a bool indicating whether to raise an error, or a string containing the error message, but can also be a
6436 Callable/lambda in order to support efficient deferred generation of error messages.
6437 Analog to str.find(), including slicing semantics with parameters start and end.
6438 For example, seq can be a list, tuple or str.
6440 Example usage:
6441 lst = ["a", "b", "-c", "d"]
6442 i = find_match(lst, lambda arg: arg.startswith("-"), start=1, end=3, reverse=True)
6443 if i >= 0:
6444 ...
6445 i = find_match(lst, lambda arg: arg.startswith("-"), raises=f"Tag {tag} not found in {file}")
6446 i = find_match(lst, lambda arg: arg.startswith("-"), raises=lambda: f"Tag {tag} not found in {file}")
6447 """
6448 offset = 0 if start is None else start if start >= 0 else len(seq) + start
6449 if start is not None or end is not None:
6450 seq = seq[start:end]
6451 for i, item in enumerate(reversed(seq) if reverse else seq):
6452 if predicate(item):
6453 if reverse:
6454 return len(seq) - i - 1 + offset
6455 else:
6456 return i + offset
6457 if raises is False or raises is None:
6458 return -1
6459 if raises is True:
6460 raise ValueError("No matching item found in sequence")
6461 if callable(raises):
6462 raises = raises()
6463 raise ValueError(raises)
6466TAPPEND = TypeVar("TAPPEND")
6469def xappend(lst: List[TAPPEND], *items: Union[TAPPEND, Iterable[TAPPEND]]) -> List[TAPPEND]:
6470 """Appends each of the items to the given list if the item is "truthy", e.g. not None and not an empty string.
6471 If an item is an iterable does so recursively, flattening the output."""
6472 for item in items:
6473 if isinstance(item, str) or not isinstance(item, collections.abc.Iterable):
6474 if item:
6475 lst.append(cast(TAPPEND, item))
6476 else:
6477 xappend(lst, *item)
6478 return lst
6481def human_readable_bytes(num_bytes: float, separator: str = " ", precision: Optional[int] = None, long: bool = False) -> str:
6482 sign = "-" if num_bytes < 0 else ""
6483 s = abs(num_bytes)
6484 units = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", "RiB", "QiB")
6485 n = len(units) - 1
6486 i = 0
6487 long_form = f" ({num_bytes} bytes)" if long else ""
6488 while s >= 1024 and i < n:
6489 s /= 1024
6490 i += 1
6491 formatted_num = human_readable_float(s) if precision is None else f"{s:.{precision}f}"
6492 return f"{sign}{formatted_num}{separator}{units[i]}{long_form}"
6495def human_readable_duration(
6496 duration: float, unit: str = "ns", separator: str = "", precision: Optional[int] = None, long: bool = False
6497) -> str:
6498 sign = "-" if duration < 0 else ""
6499 t = abs(duration)
6500 units = ("ns", "μs", "ms", "s", "m", "h", "d")
6501 nanos = (1, 1_000, 1_000_000, 1_000_000_000, 60 * 1_000_000_000, 60 * 60 * 1_000_000_000, 60 * 60 * 24 * 1_000_000_000)
6502 i = units.index(unit)
6503 long_form = f" ({round(duration * nanos[i] / 1_000_000_000)} seconds)" if long else ""
6504 if t < 1 and t != 0:
6505 t *= nanos[i]
6506 i = 0
6507 while t >= 1000 and i < 3:
6508 t /= 1000
6509 i += 1
6510 if i >= 3:
6511 while t >= 60 and i < 5:
6512 t /= 60
6513 i += 1
6514 if i >= 5:
6515 while t >= 24 and i < len(units) - 1:
6516 t /= 24
6517 i += 1
6518 formatted_num = human_readable_float(t) if precision is None else f"{t:.{precision}f}"
6519 return f"{sign}{formatted_num}{separator}{units[i]}{long_form}"
6522def human_readable_float(number: float) -> str:
6523 """If the number has one digit before the decimal point (0 <= abs(number) < 10):
6524 Round and use two decimals after the decimal point (e.g., 3.14559 --> "3.15").
6526 If the number has two digits before the decimal point (10 <= abs(number) < 100):
6527 Round and use one decimal after the decimal point (e.g., 12.36 --> "12.4").
6529 If the number has three or more digits before the decimal point (abs(number) >= 100):
6530 Round and use zero decimals after the decimal point (e.g., 123.556 --> "124").
6532 Ensure no unnecessary trailing zeroes are retained: Example: 1.500 --> "1.5", 1.00 --> "1"
6533 """
6534 abs_number = abs(number)
6535 precision = 2 if abs_number < 10 else 1 if abs_number < 100 else 0
6536 if precision == 0:
6537 return str(round(number))
6538 result = f"{number:.{precision}f}"
6539 assert "." in result
6540 result = result.rstrip("0").rstrip(".") # Remove trailing zeros and trailing decimal point if empty
6541 return "0" if result == "-0" else result
6544def percent(number: int, total: int) -> str:
6545 return f"{number}={'NaN' if total == 0 else human_readable_float(100 * number / total)}%"
6548def parse_duration_to_milliseconds(duration: str, regex_suffix: str = "", context: str = "") -> int:
6549 unit_milliseconds = {
6550 "milliseconds": 1,
6551 "millis": 1,
6552 "seconds": 1000,
6553 "secs": 1000,
6554 "minutes": 60 * 1000,
6555 "mins": 60 * 1000,
6556 "hours": 60 * 60 * 1000,
6557 "days": 86400 * 1000,
6558 "weeks": 7 * 86400 * 1000,
6559 "months": round(30.5 * 86400 * 1000),
6560 "years": 365 * 86400 * 1000,
6561 }
6562 match = re.fullmatch(
6563 r"(\d+)\s*(milliseconds|millis|seconds|secs|minutes|mins|hours|days|weeks|months|years)" + regex_suffix, duration
6564 )
6565 if not match:
6566 if context:
6567 die(f"Invalid duration format: {duration} within {context}")
6568 else:
6569 raise ValueError(f"Invalid duration format: {duration}")
6570 assert match
6571 quantity = int(match.group(1))
6572 unit = match.group(2)
6573 return quantity * unit_milliseconds[unit]
6576def get_home_directory() -> str:
6577 """Reliably detects home dir without using HOME env var."""
6578 # thread-safe version of: os.environ.pop('HOME', None); os.path.expanduser('~')
6579 return pwd.getpwuid(os.getuid()).pw_dir
6582def create_symlink(src: str, dst_dir: str, dst: str) -> None:
6583 rel_path = os.path.relpath(src, start=dst_dir)
6584 os.symlink(rel_path, os.path.join(dst_dir, dst))
6587def is_version_at_least(version_str: str, min_version_str: str) -> bool:
6588 """Checks if the version string is at least the minimum version string."""
6589 return tuple(map(int, version_str.split("."))) >= tuple(map(int, min_version_str.split(".")))
6592def tail(file: str, n: int, errors: Optional[str] = None) -> Sequence[str]:
6593 if not os.path.isfile(file):
6594 return []
6595 with open(file, "r", encoding="utf-8", errors=errors) as fd:
6596 return deque(fd, maxlen=n)
6599def append_if_absent(lst: List[TAPPEND], *items: TAPPEND) -> List[TAPPEND]:
6600 for item in items:
6601 if item not in lst:
6602 lst.append(item)
6603 return lst
6606def stderr_to_str(stderr: Any) -> str:
6607 """Workaround for https://github.com/python/cpython/issues/87597"""
6608 return str(stderr) if not isinstance(stderr, bytes) else stderr.decode("utf-8")
6611def xprint(log: Logger, value: Any, run: bool = True, end: str = "\n", file: Optional[TextIO] = None) -> None:
6612 if run and value:
6613 value = value if end else str(value).rstrip()
6614 level = log_stdout if file is sys.stdout else log_stderr
6615 log.log(level, "%s", value)
6618def set_last_modification_time_safe(
6619 path: str,
6620 unixtime_in_secs: Union[int, Tuple[int, int]],
6621 if_more_recent: bool = False,
6622) -> None:
6623 try:
6624 os.makedirs(os.path.dirname(path), exist_ok=True)
6625 set_last_modification_time(path, unixtime_in_secs=unixtime_in_secs, if_more_recent=if_more_recent)
6626 except FileNotFoundError:
6627 pass # harmless
6630def set_last_modification_time(
6631 path: str,
6632 unixtime_in_secs: Union[int, Tuple[int, int]],
6633 if_more_recent: bool = False,
6634) -> None:
6635 """if_more_recent=True is a concurrency control mechanism that prevents us from overwriting a newer (monotonically
6636 increasing) snapshots_changed value (which is a UTC Unix time in integer seconds) that might have been written to the
6637 cache file by a different, more up-to-date bzfs process."""
6638 unixtime_in_secs = (unixtime_in_secs, unixtime_in_secs) if isinstance(unixtime_in_secs, int) else unixtime_in_secs
6639 if not os_path_exists(path):
6640 with open(path, "a"):
6641 pass
6642 elif if_more_recent and unixtime_in_secs[1] <= round(os_stat(path).st_mtime):
6643 return
6644 os_utime(path, times=unixtime_in_secs)
6647def drain(iterable: Iterable) -> None:
6648 """Consumes all items in the iterable, effectively draining it."""
6649 deque(iterable, maxlen=0)
6652def nprefix(s: str) -> str:
6653 return sys.intern(s + "_")
6656def ninfix(s: str) -> str:
6657 return sys.intern(s + "_") if s else ""
6660def nsuffix(s: str) -> str:
6661 return sys.intern("_" + s) if s else ""
6664def format_dict(dictionary: Dict[Any, Any]) -> str:
6665 return f'"{dictionary}"'
6668def unixtime_fromisoformat(datetime_str: str) -> int:
6669 """Converts an ISO 8601 datetime string into a UTC Unix time in integer seconds. If the datetime string does not
6670 contain time zone info then it is assumed to be in the local time zone."""
6671 return int(datetime.fromisoformat(datetime_str).timestamp())
6674def isotime_from_unixtime(unixtime_in_seconds: int) -> str:
6675 """Converts a UTC Unix time in integer seconds into an ISO 8601 datetime string in the local time zone.
6676 Example: 2024-09-03_12:26:15"""
6677 tz: tzinfo = timezone.utc # outputs time in UTC
6678 # tz = None # outputs time in local time zone
6679 dt = datetime.fromtimestamp(unixtime_in_seconds, tz=tz)
6680 return dt.isoformat(sep="_", timespec="seconds")
6683def current_datetime(
6684 tz_spec: Optional[str] = None,
6685 now_fn: Optional[Callable[[Optional[tzinfo]], datetime]] = None,
6686) -> datetime:
6687 """Returns a datetime that is the current time in the given timezone, or in the local timezone if tz_spec is absent."""
6688 if now_fn is None:
6689 now_fn = datetime.now
6690 return now_fn(get_timezone(tz_spec))
6693def get_timezone(tz_spec: Optional[str] = None) -> Optional[tzinfo]:
6694 """Returns the given timezone, or the local timezone if the timezone spec is absent. The optional timezone spec is of
6695 the form "UTC" or "+HH:MM" or "-HH:MM" for fixed UTC offsets."""
6696 if tz_spec is None:
6697 tz = None # i.e. local timezone
6698 elif tz_spec == "UTC":
6699 tz = timezone.utc
6700 else:
6701 if match := re.fullmatch(r"([+-])(\d\d):?(\d\d)", tz_spec):
6702 sign, hours, minutes = match.groups()
6703 offset = int(hours) * 60 + int(minutes)
6704 offset = -offset if sign == "-" else offset
6705 tz = timezone(timedelta(minutes=offset))
6706 elif "/" in tz_spec and sys.version_info >= (3, 9):
6707 from zoneinfo import ZoneInfo # requires python >= 3.9
6709 tz = ZoneInfo(tz_spec) # Standard IANA timezone. Example: "Europe/Vienna"
6710 else:
6711 raise ValueError(f"Invalid timezone specification: {tz_spec}")
6712 return tz
6715metadata_month = {"min": 1, "max": 12, "help": "The month within a year"}
6716metadata_weekday = {"min": 0, "max": 6, "help": "The weekday within a week: 0=Sunday, 1=Monday, ..., 6=Saturday"}
6717metadata_day = {"min": 1, "max": 31, "help": "The day within a month"}
6718metadata_hour = {"min": 0, "max": 23, "help": "The hour within a day"}
6719metadata_minute = {"min": 0, "max": 59, "help": "The minute within an hour"}
6720metadata_second = {"min": 0, "max": 59, "help": "The second within a minute"}
6721metadata_millisecond = {"min": 0, "max": 999, "help": "The millisecond within a second"}
6722metadata_microsecond = {"min": 0, "max": 999, "help": "The microsecond within a millisecond"}
6725@dataclass(frozen=True)
6726class PeriodAnchors:
6727 # The anchors for a given duration unit are computed as follows:
6728 # yearly: Anchor(dt) = latest T where T <= dt and T == Start of January 1 of dt + anchor.yearly_* vars
6729 yearly_year: int = field(default=1, metadata={"min": 1, "max": 9999, "help": "The anchor year for multi-year periods"})
6730 yearly_month: int = field(default=1, metadata=metadata_month) # 1 <= x <= 12
6731 yearly_monthday: int = field(default=1, metadata=metadata_day) # 1 <= x <= 31
6732 yearly_hour: int = field(default=0, metadata=metadata_hour) # 0 <= x <= 23
6733 yearly_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
6734 yearly_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
6736 # monthly: Anchor(dt) = latest T where T <= dt && T == Start of first day of month of dt + anchor.monthly_* vars
6737 monthly_month: int = field(default=1, metadata={"min": 1, "max": 12, "help": "The anchor month for multi-month periods"})
6738 monthly_monthday: int = field(default=1, metadata=metadata_day) # 1 <= x <= 31
6739 monthly_hour: int = field(default=0, metadata=metadata_hour) # 0 <= x <= 23
6740 monthly_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
6741 monthly_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
6743 # weekly: Anchor(dt) = latest T where T <= dt && T == Latest midnight from Sunday to Monday of dt + anchor.weekly_* vars
6744 weekly_weekday: int = field(default=0, metadata=metadata_weekday) # 0 <= x <= 7
6745 weekly_hour: int = field(default=0, metadata=metadata_hour) # 0 <= x <= 23
6746 weekly_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
6747 weekly_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
6749 # daily: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.daily_* vars
6750 daily_hour: int = field(default=0, metadata=metadata_hour) # 0 <= x <= 23
6751 daily_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
6752 daily_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
6754 # hourly: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.hourly_* vars
6755 hourly_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
6756 hourly_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
6758 # minutely: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.minutely_* vars
6759 minutely_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
6761 # secondly: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.secondly_* vars
6762 secondly_millisecond: int = field(default=0, metadata=metadata_millisecond) # 0 <= x <= 999
6764 # secondly: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.millisecondly_* vars
6765 millisecondly_microsecond: int = field(default=0, metadata=metadata_microsecond) # 0 <= x <= 999
6767 @staticmethod
6768 def parse(args: argparse.Namespace) -> "PeriodAnchors":
6769 kwargs = {f.name: getattr(args, f.name) for f in dataclasses.fields(PeriodAnchors)}
6770 return PeriodAnchors(**kwargs)
6773def round_datetime_up_to_duration_multiple(
6774 dt: datetime, duration_amount: int, duration_unit: str, anchors: PeriodAnchors
6775) -> datetime:
6776 """Given a timezone-aware datetime and a duration, returns a datetime (in the same timezone) that is greater than or
6777 equal to dt, and rounded up (ceiled) and snapped to an anchor plus a multiple of the duration. The snapping is done
6778 relative to the anchors object and the rules defined therein.
6779 Supported units: "millisecondly", "secondly", "minutely", "hourly", "daily", "weekly", "monthly", "yearly".
6780 If dt is already exactly on a boundary (i.e. exactly on a multiple), it is returned unchanged.
6781 Examples:
6782 Default hourly anchor is midnight
6783 14:00:00, 1 hours --> 14:00:00
6784 14:05:01, 1 hours --> 15:00:00
6785 15:05:01, 1 hours --> 16:00:00
6786 16:05:01, 1 hours --> 17:00:00
6787 23:55:01, 1 hours --> 00:00:00 on the next day
6788 14:05:01, 2 hours --> 16:00:00
6789 15:00:00, 2 hours --> 16:00:00
6790 15:05:01, 2 hours --> 16:00:00
6791 16:00:00, 2 hours --> 16:00:00
6792 16:05:01, 2 hours --> 18:00:00
6793 23:55:01, 2 hours --> 00:00:00 on the next day
6794 """
6796 def add_months(dt: datetime, months: int) -> datetime:
6797 total_month = dt.month - 1 + months
6798 new_year = dt.year + total_month // 12
6799 new_month = total_month % 12 + 1
6800 last_day = calendar.monthrange(new_year, new_month)[1] # last valid day of the current month
6801 return dt.replace(year=new_year, month=new_month, day=min(dt.day, last_day))
6803 def add_years(dt: datetime, years: int) -> datetime:
6804 new_year = dt.year + years
6805 last_day = calendar.monthrange(new_year, dt.month)[1] # last valid day of the current month
6806 return dt.replace(year=new_year, day=min(dt.day, last_day))
6808 if duration_amount == 0:
6809 return dt
6811 period = None
6812 if duration_unit == "millisecondly":
6813 anchor = dt.replace(hour=0, minute=0, second=0, microsecond=anchors.millisecondly_microsecond)
6814 anchor = anchor if anchor <= dt else anchor - timedelta(milliseconds=1)
6815 period = timedelta(milliseconds=duration_amount)
6817 elif duration_unit == "secondly":
6818 anchor = dt.replace(hour=0, minute=0, second=0, microsecond=anchors.secondly_millisecond * 1000)
6819 anchor = anchor if anchor <= dt else anchor - timedelta(seconds=1)
6820 period = timedelta(seconds=duration_amount)
6822 elif duration_unit == "minutely":
6823 anchor = dt.replace(second=anchors.minutely_second, microsecond=0)
6824 anchor = anchor if anchor <= dt else anchor - timedelta(minutes=1)
6825 period = timedelta(minutes=duration_amount)
6827 elif duration_unit == "hourly":
6828 daily_base = dt.replace(hour=0, minute=0, second=0, microsecond=0)
6829 anchor = daily_base + timedelta(minutes=anchors.hourly_minute, seconds=anchors.hourly_second)
6830 anchor = anchor if anchor <= dt else anchor - timedelta(days=1)
6831 period = timedelta(hours=duration_amount)
6833 elif duration_unit == "daily":
6834 daily_base = dt.replace(hour=0, minute=0, second=0, microsecond=0)
6835 anchor = daily_base + timedelta(hours=anchors.daily_hour, minutes=anchors.daily_minute, seconds=anchors.daily_second)
6836 anchor = anchor if anchor <= dt else anchor - timedelta(days=1)
6837 period = timedelta(days=duration_amount)
6839 elif duration_unit == "weekly":
6840 daily_base = dt.replace(hour=0, minute=0, second=0, microsecond=0)
6841 anchor = daily_base + timedelta(
6842 hours=anchors.weekly_hour, minutes=anchors.weekly_minute, seconds=anchors.weekly_second
6843 )
6844 # Convert cron weekday (0=Sunday, 1=Monday, ..., 6=Saturday) to Python's weekday (0=Monday, ..., 6=Sunday)
6845 target_py_weekday = (anchors.weekly_weekday - 1) % 7
6846 diff_days = (anchor.weekday() - target_py_weekday) % 7
6847 anchor = anchor - timedelta(days=diff_days)
6848 anchor = anchor if anchor <= dt else anchor - timedelta(weeks=1)
6849 period = timedelta(weeks=duration_amount)
6851 if period is not None: # "millisecondly", "secondly", "minutely", "hourly", "daily", "weekly"
6852 delta = dt - anchor
6853 period_micros = (period.days * 86400 + period.seconds) * 1_000_000 + period.microseconds
6854 delta_micros = (delta.days * 86400 + delta.seconds) * 1_000_000 + delta.microseconds
6855 remainder = delta_micros % period_micros
6856 if remainder == 0:
6857 return dt
6858 return dt + timedelta(microseconds=period_micros - remainder)
6860 elif duration_unit == "monthly":
6861 last_day = calendar.monthrange(dt.year, dt.month)[1] # last valid day of the current month
6862 anchor = dt.replace( # Compute the base anchor for the month ensuring the day is valid
6863 month=anchors.monthly_month,
6864 day=min(anchors.monthly_monthday, last_day),
6865 hour=anchors.monthly_hour,
6866 minute=anchors.monthly_minute,
6867 second=anchors.monthly_second,
6868 microsecond=0,
6869 )
6870 if anchor > dt:
6871 anchor = add_months(anchor, -duration_amount)
6872 diff_months = (dt.year - anchor.year) * 12 + (dt.month - anchor.month)
6873 anchor_boundary = add_months(anchor, duration_amount * (diff_months // duration_amount))
6874 if anchor_boundary < dt:
6875 anchor_boundary = add_months(anchor_boundary, duration_amount)
6876 return anchor_boundary
6878 elif duration_unit == "yearly":
6879 # Calculate the start of the cycle period that `dt` falls into.
6880 year_offset = (dt.year - anchors.yearly_year) % duration_amount
6881 period_start_year = dt.year - year_offset
6882 last_day = calendar.monthrange(period_start_year, anchors.yearly_month)[1] # last valid day of the month
6883 anchor = dt.replace(
6884 year=period_start_year,
6885 month=anchors.yearly_month,
6886 day=min(anchors.yearly_monthday, last_day),
6887 hour=anchors.yearly_hour,
6888 minute=anchors.yearly_minute,
6889 second=anchors.yearly_second,
6890 microsecond=0,
6891 )
6892 if anchor < dt:
6893 return add_years(anchor, duration_amount)
6894 return anchor
6896 else:
6897 raise ValueError(f"Unsupported duration unit: {duration_unit}")
6900def subprocess_run(*args: Any, **kwargs: Any) -> subprocess.CompletedProcess:
6901 """Drop-in replacement for subprocess.run() that mimics its behavior except it enhances cleanup on TimeoutExpired."""
6902 input_value = kwargs.pop("input", None)
6903 timeout = kwargs.pop("timeout", None)
6904 check = kwargs.pop("check", False)
6905 if input_value is not None:
6906 if kwargs.get("stdin") is not None:
6907 raise ValueError("input and stdin are mutually exclusive")
6908 kwargs["stdin"] = subprocess.PIPE
6910 with subprocess.Popen(*args, **kwargs) as proc:
6911 try:
6912 stdout, stderr = proc.communicate(input_value, timeout=timeout)
6913 except BaseException as e:
6914 try:
6915 if isinstance(e, subprocess.TimeoutExpired):
6916 terminate_process_subtree(root_pid=proc.pid) # send SIGTERM to child process and its descendants
6917 finally:
6918 proc.kill()
6919 raise e
6920 else:
6921 exitcode: Optional[int] = proc.poll()
6922 assert exitcode is not None
6923 if check and exitcode:
6924 raise subprocess.CalledProcessError(exitcode, proc.args, output=stdout, stderr=stderr)
6925 return subprocess.CompletedProcess(proc.args, exitcode, stdout, stderr)
6928def terminate_process_subtree(
6929 except_current_process: bool = False,
6930 root_pid: Optional[int] = None,
6931 sig: signal.Signals = signal.SIGTERM,
6932) -> None:
6933 """Sends signal also to descendant processes to also terminate processes started via subprocess.run()"""
6934 current_pid = os.getpid()
6935 root_pid = current_pid if root_pid is None else root_pid
6936 pids = get_descendant_processes(root_pid)
6937 if root_pid == current_pid:
6938 pids += [] if except_current_process else [current_pid]
6939 else:
6940 pids.insert(0, root_pid)
6941 for pid in pids:
6942 with contextlib.suppress(OSError):
6943 os.kill(pid, sig)
6946def get_descendant_processes(root_pid: int) -> List[int]:
6947 """Returns the list of all descendant process IDs for the given root PID, on Unix systems."""
6948 procs = defaultdict(list)
6949 cmd = ["ps", "-Ao", "pid,ppid"]
6950 lines = subprocess.run(cmd, stdin=DEVNULL, stdout=PIPE, text=True, check=True).stdout.splitlines()
6951 for line in lines[1:]: # all lines except the header line
6952 splits = line.split()
6953 assert len(splits) == 2
6954 pid = int(splits[0])
6955 ppid = int(splits[1])
6956 procs[ppid].append(pid)
6957 descendants: List[int] = []
6959 def recursive_append(ppid: int) -> None:
6960 for child_pid in procs[ppid]:
6961 descendants.append(child_pid)
6962 recursive_append(child_pid)
6964 recursive_append(root_pid)
6965 return descendants
6968def pid_exists(pid: int) -> Optional[bool]:
6969 if pid <= 0:
6970 return False
6971 try: # with signal=0, no signal is actually sent, but error checking is still performed
6972 os.kill(pid, 0) # ... which can be used to check for process existence on POSIX systems
6973 except OSError as err:
6974 if err.errno == errno.ESRCH: # No such process
6975 return False
6976 if err.errno == errno.EPERM: # Operation not permitted
6977 return True
6978 return None
6979 return True
6982arabic_decimal_separator = "\u066b" # "٫"
6983pv_size_to_bytes_regex = re.compile(rf"(\d+[.,{arabic_decimal_separator}]?\d*)\s*([KMGTPEZYRQ]?)(i?)([Bb])(.*)")
6986def pv_size_to_bytes(size: str) -> Tuple[int, str]: # example inputs: "800B", "4.12 KiB", "510 MiB", "510 MB", "4Gb", "2TiB"
6987 if match := pv_size_to_bytes_regex.fullmatch(size):
6988 number = float(match.group(1).replace(",", ".").replace(arabic_decimal_separator, "."))
6989 i = "KMGTPEZYRQ".index(match.group(2)) if match.group(2) else -1
6990 m = 1024 if match.group(3) == "i" else 1000
6991 b = 1 if match.group(4) == "B" else 8
6992 line_tail = match.group(5)
6993 if line_tail and line_tail.startswith("/s"):
6994 raise ValueError("Invalid pv_size: " + size) # stems from 'pv --rate' or 'pv --average-rate'
6995 size_in_bytes = round(number * (m ** (i + 1)) / b)
6996 return size_in_bytes, line_tail
6997 else:
6998 return 0, "" # skip partial or bad 'pv' log file line (pv process killed while writing?)
7001def count_num_bytes_transferred_by_zfs_send(basis_pv_log_file: str) -> int:
7002 """Scrapes the .pv log file(s) and sums up the 'pv --bytes' column."""
7004 def parse_pv_line(line: str) -> int:
7005 if ":" in line:
7006 col = line.split(":", 1)[1].strip()
7007 num_bytes, _ = pv_size_to_bytes(col)
7008 return num_bytes
7009 return 0
7011 total_bytes = 0
7012 files = [basis_pv_log_file] + glob.glob(basis_pv_log_file + pv_file_thread_separator + "[0-9]*")
7013 for file in files:
7014 if os.path.isfile(file):
7015 try:
7016 with open(file, mode="r", newline="", encoding="utf-8") as fd:
7017 line = None
7018 for line in fd:
7019 if line.endswith("\r"):
7020 continue # skip all but the most recent status update of each transfer
7021 total_bytes += parse_pv_line(line)
7022 line = None
7023 if line is not None:
7024 total_bytes += parse_pv_line(line) # consume last line of file w/ intermediate status update, if any
7025 except FileNotFoundError:
7026 pass # harmless
7027 return total_bytes
7030def parse_dataset_locator(
7031 input_text: str,
7032 validate: bool = True,
7033 user: Optional[str] = None,
7034 host: Optional[str] = None,
7035 port: Optional[int] = None,
7036) -> Tuple[str, str, str, str, str]:
7037 def convert_ipv6(hostname: str) -> str: # support IPv6 without getting confused by host:dataset colon separator ...
7038 return hostname.replace("|", ":") # ... and any colons that may be part of a (valid) ZFS dataset name
7040 user_undefined = user is None
7041 if user is None:
7042 user = ""
7043 host_undefined = host is None
7044 if host is None:
7045 host = ""
7046 host = convert_ipv6(host)
7047 user_host, dataset, pool = "", "", ""
7049 # Input format is [[user@]host:]dataset
7050 # 1234 5 6
7051 if match := re.fullmatch(r"(((([^@]*)@)?([^:]+)):)?(.*)", input_text, re.DOTALL): 7051 ↛ 7068line 7051 didn't jump to line 7068 because the condition on line 7051 was always true
7052 if user_undefined:
7053 user = match.group(4) or ""
7054 if host_undefined:
7055 host = match.group(5) or ""
7056 host = convert_ipv6(host)
7057 if host == "-":
7058 host = ""
7059 dataset = match.group(6) or ""
7060 i = dataset.find("/")
7061 pool = dataset[0:i] if i >= 0 else dataset
7063 if user and host:
7064 user_host = f"{user}@{host}"
7065 elif host:
7066 user_host = host
7068 if validate:
7069 validate_user_name(user, input_text)
7070 validate_host_name(host, input_text)
7071 if port is not None:
7072 validate_port(port, f"Invalid port number: '{port}' for: '{input_text}' - ")
7073 validate_dataset_name(dataset, input_text)
7075 return user, host, user_host, pool, dataset
7078def validate_dataset_name(dataset: str, input_text: str) -> None:
7079 """'zfs create' CLI does not accept dataset names that are empty or start or end in a slash, etc."""
7080 # Also see https://github.com/openzfs/zfs/issues/439#issuecomment-2784424
7081 # and https://github.com/openzfs/zfs/issues/8798
7082 # and (by now nomore accurate): https://docs.oracle.com/cd/E26505_01/html/E37384/gbcpt.html
7083 if (
7084 dataset in ("", ".", "..")
7085 or "//" in dataset
7086 or dataset.startswith("/")
7087 or dataset.endswith("/")
7088 or dataset.startswith("./")
7089 or dataset.startswith("../")
7090 or dataset.endswith("/.")
7091 or dataset.endswith("/..")
7092 or "/./" in dataset
7093 or "/../" in dataset
7094 or any(char in SHELL_CHARS or (char.isspace() and char != " ") for char in dataset)
7095 or not dataset[0].isalpha()
7096 ):
7097 die(f"Invalid ZFS dataset name: '{dataset}' for: '{input_text}'")
7100def validate_user_name(user: str, input_text: str) -> None:
7101 if user and (".." in user or any(c.isspace() or c == '"' or c == "'" or c in "/@`" for c in user)):
7102 die(f"Invalid user name: '{user}' for: '{input_text}'")
7105def validate_host_name(host: str, input_text: str, extra_invalid_chars: str = "") -> None:
7106 invalid_chars = SHELL_CHARS + "/" + extra_invalid_chars
7107 if host and (".." in host or any(c.isspace() or c in invalid_chars for c in host)):
7108 die(f"Invalid host name: '{host}' for: '{input_text}'")
7111def validate_port(port: Union[str, int], message: str) -> None:
7112 if isinstance(port, int):
7113 port = str(port)
7114 if port and not port.isdigit():
7115 die(message + f"must be empty or a positive integer: '{port}'")
7118def validate_default_shell(path_to_default_shell: str, r: Remote) -> None:
7119 if path_to_default_shell.endswith("/csh") or path_to_default_shell.endswith("/tcsh"):
7120 # On some old FreeBSD systems the default shell is still csh. Also see https://www.grymoire.com/unix/CshTop10.txt
7121 die(
7122 f"Cowardly refusing to proceed because {prog_name} is not compatible with csh-style quoting of special "
7123 f"characters. The safe workaround is to first manually set 'sh' instead of '{path_to_default_shell}' as "
7124 f"the default shell of the Unix user on {r.location} host: {r.ssh_user_host or 'localhost'}, like so: "
7125 "chsh -s /bin/sh YOURUSERNAME"
7126 )
7129def list_formatter(iterable: Iterable, separator: str = " ", lstrip: bool = False) -> Any:
7130 # For lazy/noop evaluation in disabled log levels
7131 class CustomListFormatter:
7132 def __str__(self) -> str:
7133 s = separator.join(map(str, iterable))
7134 return s.lstrip() if lstrip else s
7136 return CustomListFormatter()
7139def pretty_print_formatter(obj_to_format: Any) -> Any: # For lazy/noop evaluation in disabled log levels
7140 class PrettyPrintFormatter:
7141 def __str__(self) -> str:
7142 import pprint
7144 return pprint.pformat(vars(obj_to_format))
7146 return PrettyPrintFormatter()
7149def reset_logger() -> None:
7150 """Remove and close logging handlers (and close their files) and reset loggers to default state."""
7151 for log in [logging.getLogger(__name__), logging.getLogger(get_logger_subname())]:
7152 for handler in log.handlers.copy():
7153 log.removeHandler(handler)
7154 handler.flush()
7155 handler.close()
7156 for _filter in log.filters.copy():
7157 log.removeFilter(_filter)
7158 log.setLevel(logging.NOTSET)
7159 log.propagate = True
7162def get_logger_subname() -> str:
7163 return __name__ + ".sub" # the logger name for use by --log-config-file
7166def get_logger(log_params: LogParams, args: argparse.Namespace, log: Optional[Logger] = None) -> Logger:
7167 add_trace_loglevel()
7168 logging.addLevelName(log_stderr, "STDERR")
7169 logging.addLevelName(log_stdout, "STDOUT")
7171 if log is not None:
7172 assert isinstance(log, Logger)
7173 return log # use third party provided logger object
7174 elif args.log_config_file:
7175 clog = get_dict_config_logger(log_params, args) # use logger defined in config file, and afterwards ...
7176 # ... add our own handlers unless matching handlers are already present
7177 default_log = get_default_logger(log_params, args)
7178 return clog if args.log_config_file else default_log
7181def get_default_logger(log_params: LogParams, args: argparse.Namespace) -> Logger:
7182 sublog = logging.getLogger(get_logger_subname())
7183 log = logging.getLogger(__name__)
7184 log.setLevel(log_params.log_level)
7185 log.propagate = False # don't propagate log messages up to the root logger to avoid emitting duplicate messages
7187 if not any(isinstance(h, logging.StreamHandler) and h.stream in [sys.stdout, sys.stderr] for h in sublog.handlers):
7188 handler: logging.Handler = logging.StreamHandler(stream=sys.stdout)
7189 handler.setFormatter(get_default_log_formatter(log_params=log_params))
7190 handler.setLevel(log_params.log_level)
7191 log.addHandler(handler)
7193 abs_log_file = os.path.abspath(log_params.log_file)
7194 if not any(isinstance(h, logging.FileHandler) and h.baseFilename == abs_log_file for h in sublog.handlers):
7195 handler = logging.FileHandler(log_params.log_file, encoding="utf-8")
7196 handler.setFormatter(get_default_log_formatter())
7197 handler.setLevel(log_params.log_level)
7198 log.addHandler(handler)
7200 address = args.log_syslog_address
7201 if address: # optionally, also log to local or remote syslog
7202 address, socktype = get_syslog_address(address, args.log_syslog_socktype)
7203 log_syslog_prefix = str(args.log_syslog_prefix).strip().replace("%", "") # sanitize
7204 handler = logging.handlers.SysLogHandler(address=address, facility=args.log_syslog_facility, socktype=socktype)
7205 handler.setFormatter(get_default_log_formatter(prefix=log_syslog_prefix + " "))
7206 handler.setLevel(args.log_syslog_level)
7207 log.addHandler(handler)
7208 if handler.level < sublog.getEffectiveLevel():
7209 log_level_name = logging.getLevelName(sublog.getEffectiveLevel())
7210 log.warning(
7211 "%s",
7212 f"No messages with priority lower than {log_level_name} will be sent to syslog because syslog "
7213 f"log level {args.log_syslog_level} is lower than overall log level {log_level_name}.",
7214 )
7216 # perf: tell logging framework not to gather unnecessary expensive info for each log record
7217 logging.logProcesses = False
7218 logging.logThreads = False
7219 logging.logMultiprocessing = False
7220 return log
7223log_level_prefixes = {
7224 logging.CRITICAL: "[C] CRITICAL:",
7225 logging.ERROR: "[E] ERROR:",
7226 logging.WARNING: "[W]",
7227 logging.INFO: "[I]",
7228 logging.DEBUG: "[D]",
7229 log_trace: "[T]",
7230}
7233def get_default_log_formatter(prefix: str = "", log_params: Optional[LogParams] = None) -> logging.Formatter:
7234 _level_prefixes = log_level_prefixes
7235 _log_stderr = log_stderr
7236 _log_stdout = log_stdout
7237 terminal_cols = [0 if log_params is None else None] # 'None' indicates "configure value later"
7239 class DefaultLogFormatter(logging.Formatter):
7240 def format(self, record: logging.LogRecord) -> str:
7241 levelno = record.levelno
7242 if levelno != _log_stderr and levelno != _log_stdout: # emit stdout and stderr "as-is" (no formatting)
7243 timestamp = datetime.now().isoformat(sep=" ", timespec="seconds") # 2024-09-03 12:26:15
7244 ts_level = f"{timestamp} {_level_prefixes.get(levelno, '')} "
7245 msg = record.msg
7246 i = msg.find("%s")
7247 msg = ts_level + msg
7248 if i >= 1:
7249 i += len(ts_level)
7250 msg = msg[0:i].ljust(54) + msg[i:] # right-pad msg if record.msg contains "%s" unless at start
7251 if record.args:
7252 msg = msg % record.args
7253 msg = prefix + msg
7254 else:
7255 msg = prefix + super().format(record)
7257 cols = terminal_cols[0]
7258 if cols is None:
7259 cols = self.ljust_cols()
7260 msg = msg.ljust(cols) # w/ progress line, "overwrite" trailing chars of previous msg with spaces
7261 return msg
7263 @staticmethod
7264 def ljust_cols() -> int:
7265 # lock-free yet thread-safe late configuration-based init for prettier ProgressReporter output
7266 # log_params.params and available_programs are not fully initialized yet before detect_available_programs() ends
7267 cols = 0
7268 assert log_params is not None
7269 p = log_params.params
7270 if p is not None and "local" in p.available_programs:
7271 if "pv" in p.available_programs["local"]:
7272 cols = p.terminal_columns
7273 assert cols is not None
7274 terminal_cols[0] = cols # finally, resolve to use this specific value henceforth
7275 return cols
7277 return DefaultLogFormatter()
7280def get_simple_logger(program: str) -> Logger:
7281 class LevelFormatter(logging.Formatter):
7282 def format(self, record: logging.LogRecord) -> str:
7283 record.level_prefix = log_level_prefixes.get(record.levelno, "")
7284 record.program = program
7285 return super().format(record)
7287 add_trace_loglevel()
7288 log = logging.getLogger(program)
7289 log.setLevel(logging.INFO)
7290 log.propagate = False
7291 if not any(isinstance(h, logging.StreamHandler) for h in log.handlers):
7292 handler = logging.StreamHandler()
7293 handler.setFormatter(
7294 LevelFormatter(fmt="%(asctime)s %(level_prefix)s [%(program)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
7295 )
7296 log.addHandler(handler)
7297 return log
7300def add_trace_loglevel() -> None:
7301 logging.addLevelName(log_trace, "TRACE")
7304def get_syslog_address(
7305 address: str, log_syslog_socktype: str
7306) -> Tuple[Union[str, Tuple[str, int]], Optional[socket.SocketKind]]:
7307 address = address.strip()
7308 socktype: Optional[socket.SocketKind] = None
7309 if ":" in address:
7310 host, port_str = address.rsplit(":", 1)
7311 addr = (host.strip(), int(port_str.strip()))
7312 socktype = socket.SOCK_DGRAM if log_syslog_socktype == "UDP" else socket.SOCK_STREAM # for TCP
7313 return addr, socktype
7314 return address, socktype
7317def get_dict_config_logger(log_params: LogParams, args: argparse.Namespace) -> Logger:
7318 import json
7320 prefix = prog_name + "."
7321 log_config_vars = {
7322 prefix + "sub.logger": get_logger_subname(),
7323 prefix + "get_default_log_formatter": __name__ + ".get_default_log_formatter",
7324 prefix + "log_level": log_params.log_level,
7325 prefix + "log_dir": log_params.log_dir,
7326 prefix + "log_file": os.path.basename(log_params.log_file),
7327 prefix + "timestamp": log_params.timestamp,
7328 prefix + "dryrun": "dryrun" if args.dryrun else "",
7329 }
7330 log_config_vars.update(log_params.log_config_vars) # merge variables passed into CLI with convenience variables
7332 log_config_file_str = log_params.log_config_file
7333 if log_config_file_str.startswith("+"):
7334 with open(log_config_file_str[1:], "r", encoding="utf-8") as fd:
7335 log_config_file_str = fd.read()
7337 def remove_json_comments(config_str: str) -> str: # not standard but practical
7338 lines = []
7339 for line in config_str.splitlines():
7340 stripped = line.strip()
7341 if stripped.startswith("#"):
7342 line = "" # replace comment line with empty line to preserve line numbering
7343 elif stripped.endswith("#"):
7344 i = line.rfind("#", 0, line.rindex("#"))
7345 if i >= 0:
7346 line = line[0:i] # strip line-ending comment
7347 lines.append(line)
7348 return "\n".join(lines)
7350 def substitute_log_config_vars(config_str: str, log_config_variables: Dict[str, str]) -> str:
7351 """Substitute ${name[:default]} placeholders within JSON with values from log_config_variables"""
7353 def substitute_fn(match: re.Match) -> str:
7354 varname = match.group(1)
7355 error_msg = validate_log_config_variable_name(varname)
7356 if error_msg:
7357 raise ValueError(error_msg)
7358 replacement = log_config_variables.get(varname)
7359 if not replacement:
7360 default = match.group(3)
7361 if default is None:
7362 raise ValueError("Missing default value in JSON for empty log config variable: ${" + varname + "}")
7363 replacement = default
7364 replacement = json.dumps(replacement) # JSON escape special chars such as newlines, quotes, etc
7365 assert len(replacement) >= 2
7366 assert replacement.startswith('"')
7367 assert replacement.endswith('"')
7368 return replacement[1:-1] # strip surrounding quotes added by dumps()
7370 pattern = re.compile(r"\$\{([^}:]*?)(:([^}]*))?}") # Any char except } and :, followed by optional default part
7371 return pattern.sub(substitute_fn, config_str)
7373 log_config_file_str = remove_json_comments(log_config_file_str)
7374 if not log_config_file_str.strip().startswith("{"):
7375 log_config_file_str = "{\n" + log_config_file_str # lenient JSON parsing
7376 if not log_config_file_str.strip().endswith("}"):
7377 log_config_file_str = log_config_file_str + "\n}" # lenient JSON parsing
7378 log_config_file_str = substitute_log_config_vars(log_config_file_str, log_config_vars)
7379 if args is not None and args.verbose >= 2:
7380 print("[T] Substituted log_config_file_str:\n" + log_config_file_str, flush=True)
7381 log_config_dict = json.loads(log_config_file_str)
7382 logging.config.dictConfig(log_config_dict)
7383 return logging.getLogger(get_logger_subname())
7386def validate_log_config_variable(var: str) -> Optional[str]:
7387 if not var.strip():
7388 return "Invalid log config NAME:VALUE variable. Variable must not be empty: " + var
7389 if ":" not in var:
7390 return "Invalid log config NAME:VALUE variable. Variable is missing a colon character: " + var
7391 return validate_log_config_variable_name(var[0 : var.index(":")])
7394def validate_log_config_variable_name(name: str) -> Optional[str]:
7395 if not name:
7396 return "Invalid log config variable name. Name must not be empty: " + name
7397 bad_chars = "${} " + '"' + "'"
7398 if any(char in bad_chars for char in name):
7399 return f"Invalid log config variable name. Name must not contain forbidden {bad_chars} characters: " + name
7400 if any(char.isspace() for char in name):
7401 return "Invalid log config variable name. Name must not contain whitespace: " + name
7402 return None
7405#############################################################################
7406class RetryableError(Exception):
7407 """Indicates that the task that caused the underlying exception can be retried and might eventually succeed."""
7409 def __init__(self, message: str, no_sleep: bool = False) -> None:
7410 super().__init__(message)
7411 self.no_sleep: bool = no_sleep
7414#############################################################################
7415class Tee:
7416 def __init__(self, *files: TextIO) -> None:
7417 self.files = files
7419 def write(self, obj: str) -> None:
7420 for file in self.files:
7421 file.write(obj)
7422 file.flush() # Ensure each write is flushed immediately
7424 def flush(self) -> None:
7425 for file in self.files:
7426 file.flush()
7428 def fileno(self) -> int:
7429 return self.files[0].fileno()
7432#############################################################################
7433class NonEmptyStringAction(argparse.Action):
7434 def __call__(
7435 self,
7436 parser: argparse.ArgumentParser,
7437 namespace: argparse.Namespace,
7438 values: Any,
7439 option_string: Optional[str] = None,
7440 ) -> None:
7441 values = values.strip()
7442 if values == "":
7443 parser.error(f"{option_string}: Empty string is not valid")
7444 setattr(namespace, self.dest, values)
7447#############################################################################
7448class DatasetPairsAction(argparse.Action):
7449 def __call__(
7450 self,
7451 parser: argparse.ArgumentParser,
7452 namespace: argparse.Namespace,
7453 values: Any,
7454 option_string: Optional[str] = None,
7455 ) -> None:
7456 datasets = []
7457 for value in values:
7458 if not value.startswith("+"):
7459 datasets.append(value)
7460 else:
7461 try:
7462 with open(value[1:], "r", encoding="utf-8") as fd:
7463 for line in fd.read().splitlines():
7464 if line.startswith("#") or not line.strip():
7465 continue # skip comment lines and empty lines
7466 splits = line.split("\t", 1)
7467 if len(splits) <= 1:
7468 parser.error("Line must contain tab-separated SRC_DATASET and DST_DATASET: " + line)
7469 src_root_dataset, dst_root_dataset = splits
7470 if not src_root_dataset.strip() or not dst_root_dataset.strip():
7471 parser.error("SRC_DATASET and DST_DATASET must not be empty or whitespace-only:" + line)
7472 datasets.append(src_root_dataset)
7473 datasets.append(dst_root_dataset)
7474 except FileNotFoundError:
7475 parser.error(f"File not found: {value[1:]}")
7477 if len(datasets) % 2 != 0:
7478 parser.error(f"Each SRC_DATASET must have a corresponding DST_DATASET: {datasets}")
7479 root_dataset_pairs = [(datasets[i], datasets[i + 1]) for i in range(0, len(datasets), 2)]
7480 setattr(namespace, self.dest, root_dataset_pairs)
7483#############################################################################
7484class SafeFileNameAction(argparse.Action):
7485 def __call__(
7486 self,
7487 parser: argparse.ArgumentParser,
7488 namespace: argparse.Namespace,
7489 values: Any,
7490 option_string: Optional[str] = None,
7491 ) -> None:
7492 if ".." in values or "/" in values or "\\" in values:
7493 parser.error(f"Invalid file name '{values}': must not contain '..' or '/' or '\\'.")
7494 setattr(namespace, self.dest, values)
7497#############################################################################
7498class NewSnapshotFilterGroupAction(argparse.Action):
7499 def __call__(
7500 self,
7501 parser: argparse.ArgumentParser,
7502 args: argparse.Namespace,
7503 values: Any,
7504 option_string: Optional[str] = None,
7505 ) -> None:
7506 if not hasattr(args, snapshot_filters_var):
7507 args.snapshot_filters_var = [[]]
7508 elif len(args.snapshot_filters_var[-1]) > 0:
7509 args.snapshot_filters_var.append([])
7512#############################################################################
7513class FileOrLiteralAction(argparse.Action):
7514 def __call__(
7515 self,
7516 parser: argparse.ArgumentParser,
7517 namespace: argparse.Namespace,
7518 values: Any,
7519 option_string: Optional[str] = None,
7520 ) -> None:
7521 current_values = getattr(namespace, self.dest, None)
7522 if current_values is None:
7523 current_values = []
7524 extra_values = []
7525 for value in values:
7526 if not value.startswith("+"):
7527 extra_values.append(value)
7528 else:
7529 try:
7530 with open(value[1:], "r", encoding="utf-8") as fd:
7531 for line in fd.read().splitlines():
7532 if line.startswith("#") or not line.strip():
7533 continue # skip comment lines and empty lines
7534 extra_values.append(line)
7535 except FileNotFoundError:
7536 parser.error(f"File not found: {value[1:]}")
7537 current_values += extra_values
7538 setattr(namespace, self.dest, current_values)
7539 if self.dest in snapshot_regex_filter_names:
7540 add_snapshot_filter(namespace, SnapshotFilter(self.dest, None, extra_values))
7543#############################################################################
7544class IncludeSnapshotPlanAction(argparse.Action):
7545 def __call__(
7546 self,
7547 parser: argparse.ArgumentParser,
7548 namespace: argparse.Namespace,
7549 values: Any,
7550 option_string: Optional[str] = None,
7551 ) -> None:
7552 opts = getattr(namespace, self.dest, None)
7553 opts = [] if opts is None else opts
7554 # The bzfs_include_snapshot_plan_excludes_outdated_snapshots env var flag is a work-around for (rare) replication
7555 # situations where a common snapshot cannot otherwise be found because bookmarks are disabled and a common
7556 # snapshot is actually available but not included by the --include-snapshot-plan policy chosen by the user, and the
7557 # user cannot change the content of the --include-snapshot-plan for some reason. The flag makes replication work even
7558 # in this scenario, at the expense of including (and thus replicating) old snapshots that will immediately be deleted
7559 # on the destination by the next pruning action. In a proper production setup, it should never be necessary to set
7560 # the flag to 'False'.
7561 include_snapshot_times_and_ranks = getenv_bool("include_snapshot_plan_excludes_outdated_snapshots", True)
7562 if not self._add_opts(opts, include_snapshot_times_and_ranks, parser, values, option_string=option_string):
7563 opts += ["--new-snapshot-filter-group", "--include-snapshot-regex=!.*"]
7564 setattr(namespace, self.dest, opts)
7566 def _add_opts(
7567 self,
7568 opts: List[str],
7569 include_snapshot_times_and_ranks: bool,
7570 parser: argparse.ArgumentParser,
7571 values: str,
7572 option_string: Optional[str] = None,
7573 ) -> bool:
7574 """Generates extra options to be parsed later during second parse_args() pass, within run_main()"""
7575 xperiods = SnapshotPeriods()
7576 has_at_least_one_filter_clause = False
7577 for org, target_periods in ast.literal_eval(values).items():
7578 prefix = re.escape(nprefix(org))
7579 for target, periods in target_periods.items():
7580 infix = re.escape(ninfix(target)) if target else year_with_four_digits_regex.pattern # disambiguate
7581 for period_unit, period_amount in periods.items(): # e.g. period_unit can be "10minutely" or "minutely"
7582 if not isinstance(period_amount, int) or period_amount < 0:
7583 parser.error(f"{option_string}: Period amount must be a non-negative integer: {period_amount}")
7584 suffix = re.escape(nsuffix(period_unit))
7585 regex = f"{prefix}{infix}.*{suffix}"
7586 opts += ["--new-snapshot-filter-group", f"--include-snapshot-regex={regex}"]
7587 if include_snapshot_times_and_ranks:
7588 duration_amount, duration_unit = xperiods.suffix_to_duration0(period_unit) # --> 10, "minutely"
7589 duration_unit_label = xperiods.period_labels.get(duration_unit) # duration_unit_label = "minutes"
7590 opts += [
7591 "--include-snapshot-times-and-ranks",
7592 (
7593 "notime"
7594 if duration_unit_label is None or duration_amount * period_amount == 0
7595 else f"{duration_amount * period_amount}{duration_unit_label}ago..anytime"
7596 ),
7597 f"latest{period_amount}",
7598 ]
7599 has_at_least_one_filter_clause = True
7600 return has_at_least_one_filter_clause
7603#############################################################################
7604class DeleteDstSnapshotsExceptPlanAction(IncludeSnapshotPlanAction):
7605 def __call__(
7606 self,
7607 parser: argparse.ArgumentParser,
7608 namespace: argparse.Namespace,
7609 values: Any,
7610 option_string: Optional[str] = None,
7611 ) -> None:
7612 opts = getattr(namespace, self.dest, None)
7613 opts = [] if opts is None else opts
7614 opts += ["--delete-dst-snapshots-except"]
7615 if not self._add_opts(opts, True, parser, values, option_string=option_string):
7616 parser.error(
7617 f"{option_string}: Cowardly refusing to delete all snapshots on "
7618 f"--delete-dst-snapshots-except-plan='{values}' (which means 'retain no snapshots' aka "
7619 "'delete all snapshots'). Assuming this is an unintended pilot error rather than intended carnage. "
7620 "Aborting. If this is really what is intended, use `--delete-dst-snapshots --include-snapshot-regex=.*` "
7621 "instead to force the deletion."
7622 )
7623 setattr(namespace, self.dest, opts)
7626#############################################################################
7627class TimeRangeAndRankRangeAction(argparse.Action):
7628 def __call__(
7629 self,
7630 parser: argparse.ArgumentParser,
7631 namespace: argparse.Namespace,
7632 values: Any,
7633 option_string: Optional[str] = None,
7634 ) -> None:
7635 def parse_time(time_spec: str) -> Optional[Union[int, timedelta]]:
7636 time_spec = time_spec.strip()
7637 if time_spec == "*" or time_spec == "anytime":
7638 return None
7639 if time_spec.isdigit():
7640 return int(time_spec) # Input is a Unix time in integer seconds
7641 try:
7642 return timedelta(milliseconds=parse_duration_to_milliseconds(time_spec, regex_suffix=r"\s*ago"))
7643 except ValueError:
7644 try: # If it's not a duration, try parsing as an ISO 8601 datetime
7645 return unixtime_fromisoformat(time_spec)
7646 except ValueError:
7647 parser.error(f"{option_string}: Invalid duration, Unix time, or ISO 8601 datetime: {time_spec}")
7649 assert isinstance(values, list)
7650 assert len(values) > 0
7651 value = values[0].strip()
7652 if value == "notime":
7653 value = "0..0"
7654 if ".." not in value:
7655 parser.error(f"{option_string}: Invalid time range: Missing '..' separator: {value}")
7656 timerange_specs = [parse_time(time_spec) for time_spec in value.split("..", 1)]
7657 rankranges = self.parse_rankranges(parser, values[1:], option_string=option_string)
7658 setattr(namespace, self.dest, [timerange_specs] + rankranges) # for testing only
7659 timerange = self.get_include_snapshot_times(timerange_specs)
7660 add_time_and_rank_snapshot_filter(namespace, self.dest, timerange, rankranges)
7662 @staticmethod
7663 def get_include_snapshot_times(times: List[Union[timedelta, int, None]]) -> UnixTimeRange:
7664 def utc_unix_time_in_seconds(time_spec: Union[timedelta, int, None], default: int) -> Union[timedelta, int]:
7665 if isinstance(time_spec, timedelta):
7666 return time_spec
7667 if isinstance(time_spec, int):
7668 return int(time_spec)
7669 return default
7671 lo, hi = times
7672 if lo is None and hi is None:
7673 return None
7674 lo = utc_unix_time_in_seconds(lo, default=0)
7675 hi = utc_unix_time_in_seconds(hi, default=unixtime_infinity_secs)
7676 if isinstance(lo, int) and isinstance(hi, int):
7677 return (lo, hi) if lo <= hi else (hi, lo)
7678 return lo, hi
7680 @staticmethod
7681 def parse_rankranges(
7682 parser: argparse.ArgumentParser,
7683 values: Any,
7684 option_string: Optional[str] = None,
7685 ) -> List[RankRange]:
7686 def parse_rank(spec: str) -> Tuple[bool, str, int, bool]:
7687 spec = spec.strip()
7688 if not (match := re.fullmatch(r"(all\s*except\s*)?(oldest|latest)\s*(\d+)%?", spec)):
7689 parser.error(f"{option_string}: Invalid rank format: {spec}")
7690 assert match
7691 is_except = bool(match.group(1))
7692 kind = match.group(2)
7693 num = int(match.group(3))
7694 is_percent = spec.endswith("%")
7695 if is_percent and num > 100:
7696 parser.error(f"{option_string}: Invalid rank: Percent must not be greater than 100: {spec}")
7697 return is_except, kind, num, is_percent
7699 rankranges = []
7700 for value in values:
7701 value = value.strip()
7702 if ".." in value:
7703 lo_split, hi_split = value.split("..", 1)
7704 lo = parse_rank(lo_split)
7705 hi = parse_rank(hi_split)
7706 if lo[0] or hi[0]:
7707 # Example: 'all except latest 90..except latest 95' or 'all except latest 90..latest 95'
7708 parser.error(f"{option_string}: Invalid rank range: {value}")
7709 if lo[1] != hi[1]:
7710 # Example: 'latest10..oldest10' and 'oldest10..latest10' may be somewhat unambigous if there are 40
7711 # input snapshots, but they are tricky/not well-defined if there are less than 20 input snapshots.
7712 parser.error(f"{option_string}: Ambiguous rank range: Must not compare oldest with latest: {value}")
7713 else:
7714 hi = parse_rank(value)
7715 is_except, kind, num, is_percent = hi
7716 if is_except:
7717 if is_percent:
7718 # 'all except latest 10%' aka 'oldest 90%' aka 'oldest 0..oldest 90%'
7719 # 'all except oldest 10%' aka 'latest 90%' aka 'latest 0..oldest 90%'
7720 negated_kind = "oldest" if kind == "latest" else "latest"
7721 lo = parse_rank(f"{negated_kind}0")
7722 hi = parse_rank(f"{negated_kind}{100-num}%")
7723 else:
7724 # 'all except latest 90' aka 'latest 90..latest 100%'
7725 # 'all except oldest 90' aka 'oldest 90..oldest 100%'
7726 lo = parse_rank(f"{kind}{num}")
7727 hi = parse_rank(f"{kind}100%")
7728 else:
7729 # 'latest 90' aka 'latest 0..latest 90'
7730 lo = parse_rank(f"{kind}0")
7731 rankranges.append((lo[1:], hi[1:]))
7732 return rankranges
7735#############################################################################
7736@dataclass(order=True)
7737class SnapshotFilter:
7738 name: str
7739 timerange: UnixTimeRange
7740 options: Any = field(compare=False, default=None)
7743def add_snapshot_filter(args: argparse.Namespace, _filter: SnapshotFilter) -> None:
7744 if not hasattr(args, snapshot_filters_var):
7745 args.snapshot_filters_var = [[]]
7746 args.snapshot_filters_var[-1].append(_filter)
7749def add_time_and_rank_snapshot_filter(
7750 args: argparse.Namespace, dst: str, timerange: UnixTimeRange, rankranges: List[RankRange]
7751) -> None:
7752 if timerange is None or len(rankranges) == 0 or any(rankrange[0] == rankrange[1] for rankrange in rankranges):
7753 add_snapshot_filter(args, SnapshotFilter("include_snapshot_times", timerange, None))
7754 else:
7755 assert timerange is not None
7756 add_snapshot_filter(args, SnapshotFilter(dst, timerange, rankranges))
7759def has_timerange_filter(snapshot_filters: List[List[SnapshotFilter]]) -> bool:
7760 """Interacts with add_time_and_rank_snapshot_filter() and optimize_snapshot_filters()"""
7761 return any(f.timerange is not None for snapshot_filter in snapshot_filters for f in snapshot_filter)
7764def optimize_snapshot_filters(snapshot_filters: List[SnapshotFilter]) -> List[SnapshotFilter]:
7765 """Not intended to be a full query execution plan optimizer, but we still apply some basic plan optimizations."""
7766 merge_adjacent_snapshot_filters(snapshot_filters)
7767 merge_adjacent_snapshot_regexes(snapshot_filters)
7768 snapshot_filters = [f for f in snapshot_filters if f.timerange or f.options] # drop noop --include-snapshot-times
7769 reorder_snapshot_time_filters(snapshot_filters)
7770 return snapshot_filters
7773def merge_adjacent_snapshot_filters(snapshot_filters: List[SnapshotFilter]) -> None:
7774 """Merges filter operators of the same kind if they are next to each other and carry an option list, for example
7775 --include-snapshot-ranks and --include-snapshot-regex and --exclude-snapshot-regex. This improves execution perf
7776 and makes handling easier in later stages.
7777 Example: merges --include-snapshot-times-and-ranks 0..9 oldest10% --include-snapshot-times-and-ranks 0..9 latest20%
7778 into --include-snapshot-times-and-ranks 0..9 oldest10% latest20%"""
7779 i = len(snapshot_filters) - 1
7780 while i >= 0:
7781 filter_i = snapshot_filters[i]
7782 if isinstance(filter_i.options, list):
7783 j = i - 1
7784 if j >= 0 and snapshot_filters[j] == filter_i:
7785 lst = snapshot_filters[j].options
7786 assert isinstance(lst, list)
7787 lst += filter_i.options
7788 snapshot_filters.pop(i)
7789 i -= 1
7792def merge_adjacent_snapshot_regexes(snapshot_filters: List[SnapshotFilter]) -> None:
7793 # Merge regex filter operators of the same kind as long as they are within the same group, aka as long as they
7794 # are not separated by a non-regex filter. This improves execution perf and makes handling easier in later stages.
7795 # Example: --include-snapshot-regex .*daily --exclude-snapshot-regex .*weekly --include-snapshot-regex .*hourly
7796 # --exclude-snapshot-regex .*monthly
7797 # gets merged into the following:
7798 # --include-snapshot-regex .*daily .*hourly --exclude-snapshot-regex .*weekly .*monthly
7799 i = len(snapshot_filters) - 1
7800 while i >= 0:
7801 filter_i = snapshot_filters[i]
7802 if filter_i.name in snapshot_regex_filter_names:
7803 assert isinstance(filter_i.options, list)
7804 j = i - 1
7805 while j >= 0 and snapshot_filters[j].name in snapshot_regex_filter_names:
7806 if snapshot_filters[j].name == filter_i.name:
7807 lst = snapshot_filters[j].options
7808 assert isinstance(lst, list)
7809 lst += filter_i.options
7810 snapshot_filters.pop(i)
7811 break
7812 j -= 1
7813 i -= 1
7815 # Merge --include-snapshot-regex and --exclude-snapshot-regex filters that are part of the same group (i.e. next
7816 # to each other) into a single combined filter operator that contains the info of both, and hence all info for the
7817 # group, which makes handling easier in later stages.
7818 # Example: --include-snapshot-regex .*daily .*hourly --exclude-snapshot-regex .*weekly .*monthly
7819 # gets merged into the following: --snapshot-regex(excludes=[.*weekly, .*monthly], includes=[.*daily, .*hourly])
7820 i = len(snapshot_filters) - 1
7821 while i >= 0:
7822 filter_i = snapshot_filters[i]
7823 name = filter_i.name
7824 if name in snapshot_regex_filter_names:
7825 j = i - 1
7826 if j >= 0 and snapshot_filters[j].name in snapshot_regex_filter_names:
7827 filter_j = snapshot_filters[j]
7828 assert filter_j.name != name
7829 snapshot_filters.pop(i)
7830 i -= 1
7831 else:
7832 name_j = next(iter(snapshot_regex_filter_names.difference({name})))
7833 filter_j = SnapshotFilter(name_j, None, [])
7834 sorted_filters = sorted([filter_i, filter_j])
7835 exclude_regexes, include_regexes = sorted_filters[0].options, sorted_filters[1].options
7836 snapshot_filters[i] = SnapshotFilter(snapshot_regex_filter_name, None, (exclude_regexes, include_regexes))
7837 i -= 1
7840def reorder_snapshot_time_filters(snapshot_filters: List[SnapshotFilter]) -> None:
7841 """In an execution plan that contains filter operators based on sort order (the --include-snapshot-times-and-ranks
7842 operator with non-empty ranks), filters cannot freely be reordered without violating correctness, but they can
7843 still be partially reordered for better execution performance. The filter list is partitioned into sections such
7844 that sections are separated by --include-snapshot-times-and-ranks operators with non-empty ranks. Within each
7845 section, we move include_snapshot_times operators aka --include-snapshot-times-and-ranks operators with empty ranks
7846 before --include/exclude-snapshot-regex operators because the former involves fast integer comparisons and the
7847 latter involves more expensive regex matching.
7848 Example: reorders --include-snapshot-regex .*daily --include-snapshot-times-and-ranks 2024-01-01..2024-04-01 into
7849 --include-snapshot-times-and-ranks 2024-01-01..2024-04-01 --include-snapshot-regex .*daily"""
7851 def reorder_time_filters_within_section(i: int, j: int) -> None:
7852 while j > i:
7853 filter_j = snapshot_filters[j]
7854 if filter_j.name == "include_snapshot_times":
7855 snapshot_filters.pop(j)
7856 snapshot_filters.insert(i + 1, filter_j)
7857 j -= 1
7859 i = len(snapshot_filters) - 1
7860 j = i
7861 while i >= 0:
7862 name = snapshot_filters[i].name
7863 if name == "include_snapshot_times_and_ranks":
7864 reorder_time_filters_within_section(i, j)
7865 j = i - 1
7866 i -= 1
7867 reorder_time_filters_within_section(i, j)
7870#############################################################################
7871class LogConfigVariablesAction(argparse.Action):
7872 def __call__(
7873 self,
7874 parser: argparse.ArgumentParser,
7875 namespace: argparse.Namespace,
7876 values: Any,
7877 option_string: Optional[str] = None,
7878 ) -> None:
7879 current_values = getattr(namespace, self.dest, None)
7880 if current_values is None:
7881 current_values = []
7882 for variable in values:
7883 error_msg = validate_log_config_variable(variable)
7884 if error_msg:
7885 parser.error(error_msg)
7886 current_values.append(variable)
7887 setattr(namespace, self.dest, current_values)
7890#############################################################################
7891# class CheckRange is copied from https://gist.github.com/dmitriykovalev/2ab1aa33a8099ef2d514925d84aa89e7/30961300d3f8192f775709c06ff9a5b777475adf
7892# Written by Dmitriy Kovalev
7893#
7894# Licensed under the Apache License, Version 2.0 (the "License");
7895# you may not use this file except in compliance with the License.
7896# You may obtain a copy of the License at
7897#
7898# http://www.apache.org/licenses/LICENSE-2.0
7899#
7900# Unless required by applicable law or agreed to in writing, software
7901# distributed under the License is distributed on an "AS IS" BASIS,
7902# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7903# See the License for the specific language governing permissions and
7904# limitations under the License.
7905#
7906# Allows you to validate open, closed, and half-open intervals on int as well as float arguments.
7907# Each endpoint can be either a number or positive or negative infinity:
7908# [a, b] --> min=a, max=b
7909# [a, b) --> min=a, sup=b
7910# (a, b] --> inf=a, max=b
7911# (a, b) --> inf=a, sup=b
7912# [a, +infinity) --> min=a
7913# (a, +infinity) --> inf=a
7914# (-infinity, b] --> max=b
7915# (-infinity, b) --> sup=b
7916# fmt: off
7917class CheckRange(argparse.Action):
7918 ops = {'inf': operator.gt,
7919 'min': operator.ge,
7920 'sup': operator.lt,
7921 'max': operator.le}
7923 def __init__(self, *args: Any, **kwargs: Any) -> None:
7924 if 'min' in kwargs and 'inf' in kwargs:
7925 raise ValueError('either min or inf, but not both')
7926 if 'max' in kwargs and 'sup' in kwargs:
7927 raise ValueError('either max or sup, but not both')
7929 for name in self.ops:
7930 if name in kwargs:
7931 setattr(self, name, kwargs.pop(name))
7933 super().__init__(*args, **kwargs)
7935 def interval(self) -> str:
7936 if hasattr(self, 'min'):
7937 lo = f'[{self.min}'
7938 elif hasattr(self, 'inf'):
7939 lo = f'({self.inf}'
7940 else:
7941 lo = '(-infinity'
7943 if hasattr(self, 'max'):
7944 up = f'{self.max}]'
7945 elif hasattr(self, 'sup'):
7946 up = f'{self.sup})'
7947 else:
7948 up = '+infinity)'
7950 return f'valid range: {lo}, {up}'
7952 def __call__(
7953 self,
7954 parser: argparse.ArgumentParser,
7955 namespace: argparse.Namespace,
7956 values: Any,
7957 option_string: Optional[str] = None,
7958 ) -> None:
7959 for name, op in self.ops.items():
7960 if hasattr(self, name) and not op(values, getattr(self, name)):
7961 raise argparse.ArgumentError(self, self.interval())
7962 setattr(namespace, self.dest, values)
7963# fmt: on
7966#############################################################################
7967class CheckPercentRange(CheckRange):
7969 def __call__(
7970 self,
7971 parser: argparse.ArgumentParser,
7972 namespace: argparse.Namespace,
7973 values: Any,
7974 option_string: Optional[str] = None,
7975 ) -> None:
7976 assert isinstance(values, str)
7977 original = values
7978 values = values.strip()
7979 is_percent = values.endswith("%")
7980 if is_percent:
7981 values = values[0:-1]
7982 try:
7983 values = float(values)
7984 except ValueError:
7985 parser.error(f"{option_string}: Invalid percentage or number: {original}")
7986 super().__call__(parser, namespace, values, option_string=option_string)
7987 setattr(namespace, self.dest, (getattr(namespace, self.dest), is_percent))
7990#############################################################################
7991class Comparable(Protocol):
7992 def __lt__(self, other: Any) -> bool: # pragma: no cover - behavior defined by implementor
7993 ...
7996T = TypeVar("T", bound=Comparable) # Generic type variable for elements stored in a SmallPriorityQueue
7999class SmallPriorityQueue(Generic[T]):
8000 """A priority queue that can handle updates to the priority of any element that is already contained in the queue, and
8001 does so very efficiently if there are a small number of elements in the queue (no more than thousands), as is the case
8002 for us. Could be implemented using a SortedList via https://github.com/grantjenks/python-sortedcontainers or using an
8003 indexed priority queue via https://github.com/nvictus/pqdict but, to avoid an external dependency, is actually
8004 implemented using a simple yet effective binary search-based sorted list that can handle updates to the priority of
8005 elements that are already contained in the queue, via removal of the element, followed by update of the element, followed
8006 by (re)insertion. Duplicate elements (if any) are maintained in their order of insertion relative to other duplicates."""
8008 def __init__(self, reverse: bool = False) -> None:
8009 self._lst: List[T] = []
8010 self._reverse: bool = reverse
8012 def clear(self) -> None:
8013 self._lst.clear()
8015 def push(self, element: T) -> None:
8016 bisect.insort(self._lst, element)
8018 def pop(self) -> T:
8019 """Removes and return the smallest (or largest if reverse == True) element from the queue."""
8020 return self._lst.pop() if self._reverse else self._lst.pop(0)
8022 def peek(self) -> T:
8023 """Returns the smallest (or largest if reverse == True) element without removing it."""
8024 return self._lst[-1] if self._reverse else self._lst[0]
8026 def remove(self, element: T) -> bool:
8027 """Removes the first occurrence of the specified element from the queue; returns True if the element was contained"""
8028 lst = self._lst
8029 i = bisect.bisect_left(lst, element)
8030 is_contained = i < len(lst) and lst[i] == element
8031 if is_contained:
8032 del lst[i] # is an optimized memmove()
8033 return is_contained
8035 def __len__(self) -> int:
8036 return len(self._lst)
8038 def __contains__(self, element: T) -> bool:
8039 lst = self._lst
8040 i = bisect.bisect_left(lst, element)
8041 return i < len(lst) and lst[i] == element
8043 def __iter__(self) -> Iterator[T]:
8044 return reversed(self._lst) if self._reverse else iter(self._lst)
8046 def __repr__(self) -> str:
8047 return repr(list(reversed(self._lst))) if self._reverse else repr(self._lst)
8050#############################################################################
8051class SynchronizedBool:
8052 """Thread-safe bool."""
8054 def __init__(self, val: bool) -> None:
8055 assert isinstance(val, bool)
8056 self._lock: threading.Lock = threading.Lock()
8057 self._value: bool = val
8059 @property
8060 def value(self) -> bool:
8061 with self._lock:
8062 return self._value
8064 @value.setter
8065 def value(self, new_value: bool) -> None:
8066 with self._lock:
8067 self._value = new_value
8069 def get_and_set(self, new_value: bool) -> bool:
8070 with self._lock:
8071 old_value = self._value
8072 self._value = new_value
8073 return old_value
8075 def compare_and_set(self, expected_value: bool, new_value: bool) -> bool:
8076 with self._lock:
8077 eq = self._value == expected_value
8078 if eq:
8079 self._value = new_value
8080 return eq
8082 def __bool__(self) -> bool:
8083 return self.value
8085 def __repr__(self) -> str:
8086 return repr(self.value)
8088 def __str__(self) -> str:
8089 return str(self.value)
8092#############################################################################
8093K = TypeVar("K")
8094V = TypeVar("V")
8097class SynchronizedDict(Generic[K, V]):
8098 """Thread-safe dict."""
8100 def __init__(self, val: Dict[K, V]) -> None:
8101 assert isinstance(val, dict)
8102 self._lock: threading.Lock = threading.Lock()
8103 self._dict: Dict[K, V] = val
8105 def __getitem__(self, key: K) -> V:
8106 with self._lock:
8107 return self._dict[key]
8109 def __setitem__(self, key: K, value: V) -> None:
8110 with self._lock:
8111 self._dict[key] = value
8113 def __delitem__(self, key: K) -> None:
8114 with self._lock:
8115 self._dict.pop(key)
8117 def __contains__(self, key: K) -> bool:
8118 with self._lock:
8119 return key in self._dict
8121 def __len__(self) -> int:
8122 with self._lock:
8123 return len(self._dict)
8125 def __repr__(self) -> str:
8126 with self._lock:
8127 return repr(self._dict)
8129 def __str__(self) -> str:
8130 with self._lock:
8131 return str(self._dict)
8133 def get(self, key: K, default: Optional[V] = None) -> Optional[V]:
8134 with self._lock:
8135 return self._dict.get(key, default)
8137 def pop(self, key: K, default: Optional[V] = None) -> Optional[V]:
8138 with self._lock:
8139 return self._dict.pop(key, default)
8141 def clear(self) -> None:
8142 with self._lock:
8143 self._dict.clear()
8145 def items(self) -> ItemsView[K, V]:
8146 with self._lock:
8147 return self._dict.copy().items()
8150#############################################################################
8151class _XFinally(contextlib.AbstractContextManager):
8152 def __init__(self, cleanup: Callable[[], None]) -> None:
8153 self._cleanup = cleanup # Zero‑argument callable executed after the `with` block exits.
8155 def __exit__( # type: ignore # need to ignore on python <= 3.8
8156 self, exc_type: Optional[Type[BaseException]], exc: Optional[BaseException], tb: Optional[types.TracebackType]
8157 ) -> bool:
8158 try:
8159 self._cleanup()
8160 except BaseException as cleanup_exc:
8161 if exc is None:
8162 raise # No main error --> propagate cleanup error normally
8163 # Both failed
8164 # if sys.version_info >= (3, 11):
8165 # raise ExceptionGroup("main error and cleanup error", [exc, cleanup_exc]) from None
8166 # <= 3.10: attach so it shows up in traceback but doesn't mask
8167 exc.__context__ = cleanup_exc
8168 return False # reraise original exception
8169 return False # propagate main exception if any
8172def xfinally(cleanup: Callable[[], None]) -> _XFinally:
8173 """Usage: with xfinally(lambda: cleanup()): ...
8174 Returns a context manager that guarantees that cleanup() runs on exit and guarantees any error in cleanup() will never
8175 mask an exception raised earlier inside the body of the `with` block, while still surfacing both problems when possible.
8177 Problem it solves
8178 -----------------
8179 A naive ``try ... finally`` may lose the original exception:
8181 try:
8182 work()
8183 finally:
8184 cleanup() # <-- if this raises an exception, it replaces the real error!
8186 `_XFinally` preserves exception priority:
8188 * Body raises, cleanup succeeds --> original body exception is re‑raised.
8189 * Body raises, cleanup also raises --> re‑raises body exception; cleanup exception is linked via ``__context__``.
8190 * Body succeeds, cleanup raises --> cleanup exception propagates normally.
8192 Example:
8193 -------
8194 >>> with xfinally(reset_logger): # doctest: +SKIP
8195 ... run_tasks()
8197 The single *with* line replaces verbose ``try/except/finally`` boilerplate while preserving full error information.
8198 """
8199 return _XFinally(cleanup)
8202#############################################################################
8203class ProgramValidator:
8204 """These exclusion lists are not complete or exhaustive; they are merely a weak first line of defense, and no substitute
8205 for strong sandboxing mechanisms in additional layers of defense."""
8207 def __init__(self) -> None:
8208 # immutable variables:
8209 self.shell_programs: FrozenSet[str] = frozenset({"bash", "dash", "sh"})
8210 self.sudo_programs: FrozenSet[str] = frozenset({"sudo", "doas"})
8211 self.ssh_programs: FrozenSet[str] = frozenset({"ssh", "hpnssh"})
8212 self.zfs_programs: FrozenSet[str] = frozenset({"zfs"})
8213 self.zpool_programs: FrozenSet[str] = frozenset({"zpool"})
8214 self.compression_programs: FrozenSet[str] = frozenset(
8215 {"zstd", "lz4", "pzstd", "pigz", "gzip", "bzip2", "brotli", "lzma"}
8216 )
8217 self.disallowed_programs: FrozenSet[str] = frozenset(
8218 {
8219 "ansible",
8220 "apt",
8221 "apt-add-repository",
8222 "apt-config",
8223 "apt-get",
8224 "apt-key",
8225 "apt-mark",
8226 "awk",
8227 "aws",
8228 "az",
8229 "btrfs",
8230 "busybox",
8231 "cargo",
8232 "cat",
8233 "cd",
8234 "cfdisk",
8235 "chacl",
8236 "chgpasswd",
8237 "chgroup",
8238 "chmod",
8239 "chown",
8240 "chroot",
8241 "chsh",
8242 "cloud-init",
8243 "conda",
8244 "cp",
8245 "cpan",
8246 "cryptcat",
8247 "cryptsetup",
8248 "csh",
8249 "curl",
8250 "dd",
8251 "delgroup",
8252 "delpart",
8253 "deluser",
8254 "deno",
8255 "dmsetup",
8256 "dnf",
8257 "dpkg",
8258 "echo",
8259 "ed",
8260 "egrep",
8261 "emacs",
8262 "env",
8263 "ethtool",
8264 "eval",
8265 "ex",
8266 "exec",
8267 "fdisk",
8268 "fgrep",
8269 "find",
8270 "firewalld",
8271 "fsck",
8272 "ftp",
8273 "fwupdmgr",
8274 "gawk",
8275 "gcloud",
8276 "gcp",
8277 "gdisk",
8278 "gh",
8279 "git",
8280 "git-lfs",
8281 "git-shell",
8282 "gkill",
8283 "go",
8284 "gparted",
8285 "gpasswd",
8286 "grep",
8287 "grm",
8288 "grmdir",
8289 "groupadd",
8290 "groupdel",
8291 "groupmod",
8292 "gsed",
8293 "gsutil",
8294 "gtar",
8295 "gtimeout",
8296 "halt",
8297 "hdparm",
8298 "head",
8299 "hostname",
8300 "ifconfig",
8301 "init",
8302 "initctl",
8303 "ionice",
8304 "ip",
8305 "iperf",
8306 "iperf3",
8307 "iptables",
8308 "java",
8309 "kill",
8310 "killall",
8311 "ksh",
8312 "less",
8313 "ln",
8314 "losetup",
8315 "ls",
8316 "lvm",
8317 "mawk",
8318 "mconnect",
8319 "mdadm",
8320 "mkdir",
8321 "mkfs",
8322 "mkfs.btrfs",
8323 "mkfs.ext2",
8324 "mkfs.ext3",
8325 "mkfs.ext4",
8326 "mkfs.fat",
8327 "mkfs.msdos",
8328 "mkfs.xfs",
8329 "modprobe",
8330 "more",
8331 "mosh",
8332 "mount",
8333 "mv",
8334 "nano",
8335 "nc",
8336 "neovim",
8337 "nice",
8338 "node",
8339 "nohup",
8340 "ntpd",
8341 "nvme",
8342 "openssl",
8343 "parallel",
8344 "parted",
8345 "partx",
8346 "passwd",
8347 "perl",
8348 "perl5",
8349 "php",
8350 "pip",
8351 "pip3",
8352 "pipx",
8353 "pkill",
8354 "poweroff",
8355 "pvchange",
8356 "pvcreate",
8357 "pvmove",
8358 "pvremove",
8359 "pvresize",
8360 "pyenv",
8361 "python",
8362 "python2",
8363 "python3",
8364 "rclone",
8365 "rcp",
8366 "reboot",
8367 "red",
8368 "renice",
8369 "resize2fs",
8370 "resizepart",
8371 "restic",
8372 "rg",
8373 "rgrep",
8374 "rlogin",
8375 "rm",
8376 "rmdir",
8377 "rnano",
8378 "route",
8379 "rpm",
8380 "rsh",
8381 "rsync",
8382 "ruby",
8383 "runuser",
8384 "rvim",
8385 "scp",
8386 "sdparm",
8387 "sed",
8388 "service",
8389 "setsid",
8390 "sftp",
8391 "sgdisk",
8392 "shutdown",
8393 "sleep",
8394 "smbd",
8395 "smbpasswd",
8396 "socat",
8397 "source",
8398 "ssh-add",
8399 "ssh-agent",
8400 "sshd",
8401 "su",
8402 "swapoff",
8403 "swapon",
8404 "sysctl",
8405 "systemctl",
8406 "systemd",
8407 "tac",
8408 "tail",
8409 "tar",
8410 "tc",
8411 "tclsh",
8412 "tcpdump",
8413 "tcsh",
8414 "tee",
8415 "telnet",
8416 "time",
8417 "timeout",
8418 "tmux",
8419 "touch",
8420 "tree",
8421 "ts-node",
8422 "tune2fs",
8423 "ufw",
8424 "umount",
8425 "unlink",
8426 "update-initramfs",
8427 "useradd",
8428 "userdel",
8429 "usermod",
8430 "uv",
8431 "vi",
8432 "view",
8433 "vim",
8434 "wget",
8435 "wipe",
8436 "wipefs",
8437 "wireshark",
8438 "xargs",
8439 "yum",
8440 "zdb",
8441 "zed",
8442 "zip",
8443 "zsh",
8444 prog_name + "_jobrunner",
8445 prog_name,
8446 }
8447 )
8449 def validate_program(
8450 self,
8451 path: str,
8452 allow_shell: bool = False,
8453 allow_sudo: bool = False,
8454 allow_ssh: bool = False,
8455 allow_zfs: bool = False,
8456 allow_zpool: bool = False,
8457 allow_compression: bool = False,
8458 extra_invalid_chars: str = "",
8459 ) -> None:
8460 for char in SHELL_CHARS + ":" + extra_invalid_chars:
8461 if char in path:
8462 die(f"Program name must not contain a '{char}' character: {path}")
8463 if not allow_shell:
8464 self._validate_program(path, self.shell_programs)
8465 if not allow_sudo:
8466 self._validate_program(path, self.sudo_programs)
8467 if not allow_ssh:
8468 self._validate_program(path, self.ssh_programs)
8469 if not allow_zfs:
8470 self._validate_program(path, self.zfs_programs)
8471 if not allow_zpool:
8472 self._validate_program(path, self.zpool_programs)
8473 if not allow_compression:
8474 self._validate_program(path, self.compression_programs)
8475 self._validate_program(path, self.disallowed_programs)
8477 @staticmethod
8478 def _validate_program(path: str, programs: FrozenSet[str]) -> None:
8479 basename = os.path.basename(path)
8480 if basename in programs or not basename:
8481 die(f"Invalid program name: {path}")
8484#############################################################################
8485if __name__ == "__main__": 8485 ↛ 8486line 8485 didn't jump to line 8486 because the condition on line 8485 was never true
8486 main()