Coverage for bzfs/bzfs.py: 99%
4054 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-14 04:27 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-14 04:27 +0000
1#!/usr/bin/env python3
2#
3# Copyright 2024 Wolfgang Hoschek AT mac DOT com
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
17# /// script
18# requires-python = ">=3.7"
19# dependencies = []
20# ///
22"""
23* The codebase starts with docs, definition of input data and associated argument parsing into a "Params" class.
24* All CLI option/parameter values are reachable from the "Params" class.
25* Control flow starts in main(), far below ..., which kicks off a "Job".
26* A Job runs one or more "tasks" via run_tasks(), each task replicating a separate dataset tree.
27* The core replication algorithm is in run_task() and especially in replicate_dataset().
28* The filter algorithms that apply include/exclude policies are in filter_datasets() and filter_snapshots().
29* The --create-src-snapshots-* and --delete-* and --compare-* algorithms also start in run_task().
30* Consider using an IDE/editor that can open multiple windows for the same file, such as PyCharm or Sublime Text, etc.
31* README.md is mostly auto-generated from the ArgumentParser help texts as the source of "truth", via update_readme.py.
32Simply run that script whenever you change or add ArgumentParser help text.
33"""
35import argparse
36import ast
37import bisect
38import calendar
39import collections
40import concurrent
41import copy
42import fcntl
43import glob
44import hashlib
45import heapq
46import inspect
47import itertools
48import json
49import logging
50import logging.config
51import logging.handlers
52import operator
53import os
54import platform
55import pprint
56import pwd
57import random
58import re
59import selectors
60import shlex
61import shutil
62import signal
63import socket
64import stat
65import subprocess
66import sys
67import tempfile
68import threading
69import time
70import traceback
71from argparse import Namespace
72from collections import defaultdict, deque, Counter, namedtuple
73from concurrent.futures import ThreadPoolExecutor, Future, FIRST_COMPLETED
74from contextlib import redirect_stderr
75from dataclasses import dataclass, field, fields
76from datetime import datetime, timedelta, timezone, tzinfo
77from itertools import groupby
78from logging import Logger
79from math import ceil
80from pathlib import Path
81from subprocess import CalledProcessError, TimeoutExpired
82from typing import Deque, Dict, Iterable, Iterator, List, Sequence, Set, Tuple
83from typing import Any, Callable, Generator, Generic, Optional
84from typing import ItemsView, TextIO, TypeVar, Union
86__version__ = "1.10.0-dev"
87prog_name = "bzfs"
88prog_author = "Wolfgang Hoschek"
89die_status = 3
90still_running_status = 4
91min_python_version = (3, 7)
92if sys.version_info < min_python_version:
93 print(f"ERROR: {prog_name} requires Python version >= {'.'.join(map(str, min_python_version))}!")
94 sys.exit(die_status)
95exclude_dataset_regexes_default = r"(.*/)?[Tt][Ee]?[Mm][Pp][-_]?[0-9]*" # skip tmp datasets by default
96create_src_snapshots_prefix_dflt = prog_name + "_"
97create_src_snapshots_suffix_dflt = "_adhoc"
98disable_prg = "-"
99env_var_prefix = prog_name + "_"
100pv_file_thread_separator = "_"
101dummy_dataset = "dummy"
102zfs_version_is_at_least_2_1_0 = "zfs>=2.1.0"
103zfs_version_is_at_least_2_2_0 = "zfs>=2.2.0"
104zfs_recv_groups = {"zfs_recv_o": "-o", "zfs_recv_x": "-x", "zfs_set": ""}
105snapshot_regex_filter_names = {"include_snapshot_regex", "exclude_snapshot_regex"}
106snapshot_regex_filter_name = "snapshot_regex"
107snapshot_filters_var = "snapshot_filters_var"
108cmp_choices_items = ["src", "dst", "all"]
109inject_dst_pipe_fail_kbytes = 400
110unixtime_infinity_secs = 2**64 # billions of years in the future and to be extra safe, larger than the largest ZFS GUID
111year_with_four_digits_regex = re.compile(r"[1-9][0-9][0-9][0-9]") # regex for empty target shall not match non-empty target
112log_stderr = (logging.INFO + logging.WARN) // 2 # custom log level is halfway in between
113log_stdout = (log_stderr + logging.INFO) // 2 # custom log level is halfway in between
114log_debug = logging.DEBUG
115log_trace = logging.DEBUG // 2 # custom log level is halfway in between
116SHARED = "shared"
117DEDICATED = "dedicated"
118DONT_SKIP_DATASET = ""
119DEVNULL = subprocess.DEVNULL
120PIPE = subprocess.PIPE
123def argument_parser() -> argparse.ArgumentParser:
124 create_src_snapshots_plan_example1 = str({"test": {"": {"adhoc": 1}}}).replace(" ", "")
125 create_src_snapshots_plan_example2 = str({"prod": {"us-west-1": {"hourly": 36, "daily": 31}}}).replace(" ", "")
126 delete_dst_snapshots_except_plan_example1 = str(
127 {
128 "prod": {
129 "onsite": {
130 "secondly": 40,
131 "minutely": 40,
132 "hourly": 36,
133 "daily": 31,
134 "weekly": 12,
135 "monthly": 18,
136 "yearly": 5,
137 }
138 }
139 }
140 ).replace(" ", "")
142 # fmt: off
143 parser = argparse.ArgumentParser(
144 prog=prog_name,
145 allow_abbrev=False,
146 description=f"""
147*{prog_name} is a backup command line tool that reliably replicates ZFS snapshots from a (local or remote)
148source ZFS dataset (ZFS filesystem or ZFS volume) and its descendant datasets to a (local or remote)
149destination ZFS dataset to make the destination dataset a recursively synchronized copy of the source dataset,
150using zfs send/receive/rollback/destroy and ssh tunnel as directed. For example, {prog_name} can be used to
151incrementally replicate all ZFS snapshots since the most recent common snapshot from source to destination,
152in order to help protect against data loss or ransomware.*
154When run for the first time, {prog_name} replicates the dataset and all its snapshots from the source to the
155destination. On subsequent runs, {prog_name} transfers only the data that has changed since the previous run,
156i.e. it incrementally replicates to the destination all intermediate snapshots that have been created on
157the source since the last run. Source ZFS snapshots older than the most recent common snapshot found on the
158destination are auto-skipped.
160Unless {prog_name} is explicitly told to create snapshots on the source, it treats the source as read-only,
161thus the source remains unmodified. With the --dryrun flag, {prog_name} also treats the destination as read-only.
162In normal operation, {prog_name} treats the destination as append-only. Optional CLI flags are available to
163delete destination snapshots and destination datasets as directed, for example to make the destination
164identical to the source if the two have somehow diverged in unforeseen ways. This easily enables
165(re)synchronizing the backup from the production state, as well as restoring the production state from
166backup.
168In the spirit of rsync, {prog_name} supports a variety of powerful include/exclude filters that can be combined to
169select which datasets, snapshots and properties to create, replicate, delete or compare.
171Typically, a `cron` job on the source host runs `{prog_name}` periodically to create new snapshots and prune outdated
172snapshots on the source, whereas another `cron` job on the destination host runs `{prog_name}` periodically to prune
173outdated destination snapshots. Yet another `cron` job runs `{prog_name}` periodically to replicate the recently created
174snapshots from the source to the destination. The frequency of these periodic activities is typically every N milliseconds,
175every second, minute, hour, day, week, month and/or year (or multiples thereof).
177All {prog_name} functions including snapshot creation, replication, deletion, comparison, etc. happily work with any
178snapshots in any format, even created or managed by third party ZFS snapshot management tools, including manual
179zfs snapshot/destroy. All functions can also be used independently. That is, if you wish you can use {prog_name} just
180for creating snapshots, or just for replicating, or just for deleting/pruning, or just for comparing snapshot lists.
182The source 'pushes to' the destination whereas the destination 'pulls from' the source. {prog_name} is installed
183and executed on the 'initiator' host which can be either the host that contains the source dataset (push mode),
184or the destination dataset (pull mode), or both datasets (local mode, no network required, no ssh required),
185or any third-party (even non-ZFS OSX) host as long as that host is able to SSH (via standard 'ssh' OpenSSH CLI) into
186both the source and destination host (pull-push mode). In pull-push mode the source 'zfs send's the data stream
187to the initiator which immediately pipes the stream (without storing anything locally) to the destination
188host that 'zfs receive's it. Pull-push mode means that {prog_name} need not be installed or executed on either
189source or destination host. Only the underlying 'zfs' CLI must be installed on both source and destination host.
190{prog_name} can run as root or non-root user, in the latter case via a) sudo or b) when granted corresponding
191ZFS permissions by administrators via 'zfs allow' delegation mechanism.
193{prog_name} is written in Python and continously runs a wide set of unit tests and integration tests to ensure
194coverage and compatibility with old and new versions of ZFS on Linux, FreeBSD and Solaris, on all Python
195versions >= 3.7 (including latest stable which is currently python-3.13).
197{prog_name} is a stand-alone program with zero required dependencies, consisting of a single file, akin to a
198stand-alone shell script or binary executable. It is designed to be able to run in restricted barebones server
199environments. No external Python packages are required; indeed no Python package management at all is required.
200You can just copy the file wherever you like, for example into /usr/local/bin or similar, and simply run it like
201any stand-alone shell script or binary executable.
203{prog_name} automatically replicates the snapshots of multiple datasets in parallel for best performance.
204Similarly, it quickly deletes (or compares) snapshots of multiple datasets in parallel. Atomic snapshots can be
205created as frequently as every N milliseconds.
207Optionally, {prog_name} applies bandwidth rate-limiting and progress monitoring (via 'pv' CLI) during 'zfs
208send/receive' data transfers. When run across the network, {prog_name} also transparently inserts lightweight
209data compression (via 'zstd -1' CLI) and efficient data buffering (via 'mbuffer' CLI) into the pipeline
210between network endpoints during 'zfs send/receive' network transfers. If one of these utilities is not
211installed this is auto-detected, and the operation continues reliably without the corresponding auxiliary
212feature.
214# Periodic Jobs with bzfs_jobrunner
216The software also ships with the [bzfs_jobrunner](README_bzfs_jobrunner.md) companion program, which is a convenience
217wrapper around `{prog_name}` that simplifies periodic ZFS snapshot creation, replication, and pruning, across source host
218and multiple destination hosts, using a single shared [jobconfig](bzfs_tests/bzfs_job_example.py) script.
220# Quickstart
222* Create adhoc atomic snapshots without a schedule:
224```$ {prog_name} tank1/foo/bar dummy --recursive --skip-replication --create-src-snapshots
225--create-src-snapshots-plan "{create_src_snapshots_plan_example1}"```
227```$ zfs list -t snapshot tank1/foo/bar
229tank1/foo/bar@test_2024-11-06_08:30:05_adhoc
230```
232* Create periodic atomic snapshots on a schedule, every hour and every day, by launching this from a periodic `cron` job:
234```$ {prog_name} tank1/foo/bar dummy --recursive --skip-replication --create-src-snapshots
235--create-src-snapshots-plan "{create_src_snapshots_plan_example2}"```
237```$ zfs list -t snapshot tank1/foo/bar
239tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
241tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly
242```
244Note: A periodic snapshot is created if it is due per the schedule indicated by its suffix (e.g. `_daily` or `_hourly`
245or `_minutely` or `_2secondly` or `_100millisecondly`), or if the --create-src-snapshots-even-if-not-due flag is specified,
246or if the most recent scheduled snapshot is somehow missing. In the latter case {prog_name} immediately creates a snapshot
247(named with the current time, not backdated to the missed time), and then resumes the original schedule. If the suffix is
248`_adhoc` or not a known period then a snapshot is considered non-periodic and is thus created immediately regardless of the
249creation time of any existing snapshot.
251* Replication example in local mode (no network, no ssh), to replicate ZFS dataset tank1/foo/bar to tank2/boo/bar:
253```$ {prog_name} tank1/foo/bar tank2/boo/bar```
255```$ zfs list -t snapshot tank1/foo/bar
257tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
259tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly```
261```$ zfs list -t snapshot tank2/boo/bar
263tank2/boo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
265tank2/boo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly```
267* Same example in pull mode:
269```$ {prog_name} root@host1.example.com:tank1/foo/bar tank2/boo/bar```
271* Same example in push mode:
273```$ {prog_name} tank1/foo/bar root@host2.example.com:tank2/boo/bar```
275* Same example in pull-push mode:
277```$ {prog_name} root@host1:tank1/foo/bar root@host2:tank2/boo/bar```
279* Example in local mode (no network, no ssh) to recursively replicate ZFS dataset tank1/foo/bar and its descendant
280datasets to tank2/boo/bar:
282```$ {prog_name} tank1/foo/bar tank2/boo/bar --recursive```
284```$ zfs list -t snapshot -r tank1/foo/bar
286tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
288tank1/foo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly
290tank1/foo/bar/baz@prod_us-west-1_2024-11-06_08:40:00_daily
292tank1/foo/bar/baz@prod_us-west-1_2024-11-06_08:40:00_hourly```
294```$ zfs list -t snapshot -r tank2/boo/bar
296tank2/boo/bar@prod_us-west-1_2024-11-06_08:30:05_daily
298tank2/boo/bar@prod_us-west-1_2024-11-06_08:30:05_hourly
300tank2/boo/bar/baz@prod_us-west-1_2024-11-06_08:40:00_daily
302tank2/boo/bar/baz@prod_us-west-1_2024-11-06_08:40:00_hourly```
304* Example that makes destination identical to source even if the two have drastically diverged:
306```$ {prog_name} tank1/foo/bar tank2/boo/bar --recursive --force --delete-dst-datasets --delete-dst-snapshots```
308* Replicate all daily snapshots created during the last 7 days, and at the same time ensure that the latest 7 daily
309snapshots (per dataset) are replicated regardless of creation time:
311```$ {prog_name} tank1/foo/bar tank2/boo/bar --recursive --include-snapshot-regex '.*_daily'
312--include-snapshot-times-and-ranks '7 days ago..anytime' 'latest 7'```
314Note: The example above compares the specified times against the standard ZFS 'creation' time property of the snapshots
315(which is a UTC Unix time in integer seconds), rather than against a timestamp that may be part of the snapshot name.
317* Delete all daily snapshots older than 7 days, but ensure that the latest 7 daily snapshots (per dataset) are retained
318regardless of creation time:
320```$ {prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
321--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 7'
322--include-snapshot-times-and-ranks 'anytime..7 days ago'```
324Note: This also prints how many GB of disk space in total would be freed if the command were to be run for real without
325the --dryrun flag.
327* Delete all daily snapshots older than 7 days, but ensure that the latest 7 daily snapshots (per dataset) are retained
328regardless of creation time. Additionally, only delete a snapshot if no corresponding snapshot or bookmark exists in
329the source dataset (same as above except replace the 'dummy' source with 'tank1/foo/bar'):
331```$ {prog_name} tank1/foo/bar tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
332--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 7'
333--include-snapshot-times-and-ranks '7 days ago..anytime'```
335* Delete all daily snapshots older than 7 days, but ensure that the latest 7 daily snapshots (per dataset) are retained
336regardless of creation time. Additionally, only delete a snapshot if no corresponding snapshot exists in the source
337dataset (same as above except append 'no-crosscheck'):
339```$ {prog_name} tank1/foo/bar tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
340--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 7'
341--include-snapshot-times-and-ranks 'anytime..7 days ago' --delete-dst-snapshots-no-crosscheck```
343* Delete all daily bookmarks older than 90 days, but retain the latest 200 daily bookmarks (per dataset) regardless
344of creation time:
346```$ {prog_name} {dummy_dataset} tank1/foo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots=bookmarks
347--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 200'
348--include-snapshot-times-and-ranks 'anytime..90 days ago'```
350* Delete all tmp datasets within tank2/boo/bar:
352```$ {prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-datasets
353--include-dataset-regex '(.*/)?tmp.*' --exclude-dataset-regex '!.*'```
355* Retain all secondly snapshots that were created less than 40 seconds ago, and ensure that the latest 40
356secondly snapshots (per dataset) are retained regardless of creation time. Same for 40 minutely snapshots, 36 hourly
357snapshots, 31 daily snapshots, 12 weekly snapshots, 18 monthly snapshots, and 5 yearly snapshots:
359```$ {prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
360--delete-dst-snapshots-except
361--include-snapshot-regex '.*_secondly' --include-snapshot-times-and-ranks '40 seconds ago..anytime' 'latest 40'
362--new-snapshot-filter-group
363--include-snapshot-regex '.*_minutely' --include-snapshot-times-and-ranks '40 minutes ago..anytime' 'latest 40'
364--new-snapshot-filter-group
365--include-snapshot-regex '.*_hourly' --include-snapshot-times-and-ranks '36 hours ago..anytime' 'latest 36'
366--new-snapshot-filter-group
367--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks '31 days ago..anytime' 'latest 31'
368--new-snapshot-filter-group
369--include-snapshot-regex '.*_weekly' --include-snapshot-times-and-ranks '12 weeks ago..anytime' 'latest 12'
370--new-snapshot-filter-group
371--include-snapshot-regex '.*_monthly' --include-snapshot-times-and-ranks '18 months ago..anytime' 'latest 18'
372--new-snapshot-filter-group
373--include-snapshot-regex '.*_yearly' --include-snapshot-times-and-ranks '5 years ago..anytime' 'latest 5'```
375For convenience, the lengthy command line above can be expressed in a more concise way, like so:
377```$ {prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots
378--delete-dst-snapshots-except-plan "{delete_dst_snapshots_except_plan_example1}"```
380* Compare source and destination dataset trees recursively, for example to check if all recently taken snapshots have
381been successfully replicated by a periodic job. List snapshots only contained in src (tagged with 'src'),
382only contained in dst (tagged with 'dst'), and contained in both src and dst (tagged with 'all'), restricted to hourly
383and daily snapshots taken within the last 7 days, excluding the last 4 hours (to allow for some slack/stragglers),
384excluding temporary datasets:
386```$ {prog_name} tank1/foo/bar tank2/boo/bar --skip-replication --compare-snapshot-lists=src+dst+all --recursive
387--include-snapshot-regex '.*_(hourly|daily)' --include-snapshot-times-and-ranks '7 days ago..4 hours ago'
388--exclude-dataset-regex '(.*/)?tmp.*'```
390If the resulting TSV output file contains zero lines starting with the prefix 'src' and zero lines starting with the
391prefix 'dst' then no source snapshots are missing on the destination, and no destination snapshots are missing
392on the source, indicating that the periodic replication and pruning jobs perform as expected. The TSV output is sorted
393by dataset, and by ZFS creation time within each dataset - the first and last line prefixed with 'all' contains the
394metadata of the oldest and latest common snapshot, respectively. The --compare-snapshot-lists option also directly
395logs various summary stats, such as the metadata of the latest common snapshot, latest snapshots and oldest snapshots,
396as well as the time diff between the latest common snapshot and latest snapshot only in src (and only in dst), as well
397as how many src snapshots and how many GB of data are missing on dst, etc.
399* Example with further options:
401```$ {prog_name} tank1/foo/bar root@host2.example.com:tank2/boo/bar --recursive
402--exclude-snapshot-regex '.*_(secondly|minutely)' --exclude-snapshot-regex 'test_.*'
403--include-snapshot-times-and-ranks '7 days ago..anytime' 'latest 7' --exclude-dataset /tank1/foo/bar/temporary
404--exclude-dataset /tank1/foo/bar/baz/trash --exclude-dataset-regex '(.*/)?private'
405--exclude-dataset-regex '(.*/)?[Tt][Ee]?[Mm][Pp][-_]?[0-9]*' --ssh-dst-private-key /root/.ssh/id_rsa```
406""", formatter_class=argparse.RawTextHelpFormatter)
408 parser.add_argument(
409 "root_dataset_pairs", nargs="+", action=DatasetPairsAction, metavar="SRC_DATASET DST_DATASET",
410 help="SRC_DATASET: "
411 "Source ZFS dataset (and its descendants) that will be replicated. Can be a ZFS filesystem or ZFS volume. "
412 "Format is [[user@]host:]dataset. The host name can also be an IPv4 address (or an IPv6 address where "
413 "each ':' colon character must be replaced with a '|' pipe character for disambiguation). If the "
414 "host name is '-', the dataset will be on the local host, and the corresponding SSH leg will be omitted. "
415 "The same is true if the host is omitted and the dataset does not contain a ':' colon at the same time. "
416 "Local dataset examples: `tank1/foo/bar`, `tank1`, `-:tank1/foo/bar:baz:boo` "
417 "Remote dataset examples: `host:tank1/foo/bar`, `host.example.com:tank1/foo/bar`, "
418 "`root@host:tank`, `root@host.example.com:tank1/foo/bar`, `user@127.0.0.1:tank1/foo/bar:baz:boo`, "
419 "`user@||1:tank1/foo/bar:baz:boo`. "
420 "The first component of the ZFS dataset name is the ZFS pool name, here `tank1`. "
421 "If the option starts with a `+` prefix then dataset names are read from the UTF-8 text file given "
422 "after the `+` prefix, with each line in the file containing a SRC_DATASET and a DST_DATASET, "
423 "separated by a tab character. Example: `+root_dataset_names.txt`, `+/path/to/root_dataset_names.txt`\n\n"
424 "DST_DATASET: "
425 "Destination ZFS dataset for replication and deletion. Has same naming format as SRC_DATASET. During "
426 "replication, destination datasets that do not yet exist are created as necessary, along with their "
427 "parent and ancestors.\n\n"
428 f"*Performance Note:* {prog_name} automatically replicates multiple datasets in parallel. It replicates "
429 "snapshots in parallel across datasets and serially within a dataset. All child datasets of a dataset "
430 "may be processed in parallel. For consistency, processing of a dataset only starts after processing of "
431 "all its ancestor datasets has completed. Further, when a thread is ready to start processing another "
432 "dataset, it chooses the next dataset wrt. case-sensitive sort order from the datasets that are "
433 "currently available for start of processing. Initially, only the roots of the selected dataset subtrees "
434 "are available for start of processing. The degree of parallelism is configurable with the --threads "
435 "option (see below).\n\n")
436 parser.add_argument(
437 "--recursive", "-r", action="store_true",
438 help="During snapshot creation, replication, deletion and comparison, also consider descendant datasets, i.e. "
439 "datasets within the dataset tree, including children, and children of children, etc.\n\n")
440 parser.add_argument(
441 "--include-dataset", action=FileOrLiteralAction, nargs="+", default=[], metavar="DATASET",
442 help="During snapshot creation, replication, deletion and comparison, select any ZFS dataset (and its descendants) "
443 "that is contained within SRC_DATASET (DST_DATASET in case of deletion) if its dataset name is one of the "
444 "given include dataset names but none of the exclude dataset names. If a dataset is excluded its descendants "
445 "are automatically excluded too, and this decision is never reconsidered even for the descendants because "
446 "exclude takes precedence over include.\n\n"
447 "A dataset name is absolute if the specified dataset is prefixed by `/`, e.g. `/tank/baz/tmp`. "
448 "Otherwise the dataset name is relative wrt. source and destination, e.g. `baz/tmp` if the source "
449 "is `tank`.\n\n"
450 "This option is automatically translated to an --include-dataset-regex (see below) and can be "
451 "specified multiple times.\n\n"
452 "If the option starts with a `+` prefix then dataset names are read from the newline-separated "
453 "UTF-8 text file given after the `+` prefix, one dataset per line inside of the text file.\n\n"
454 "Examples: `/tank/baz/tmp` (absolute), `baz/tmp` (relative), "
455 "`+dataset_names.txt`, `+/path/to/dataset_names.txt`\n\n")
456 parser.add_argument(
457 "--exclude-dataset", action=FileOrLiteralAction, nargs="+", default=[], metavar="DATASET",
458 help="Same syntax as --include-dataset (see above) except that the option is automatically translated to an "
459 "--exclude-dataset-regex (see below).\n\n")
460 parser.add_argument(
461 "--include-dataset-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
462 help="During snapshot creation, replication (and deletion) and comparison, select any ZFS dataset (and its "
463 "descendants) that is contained within SRC_DATASET (DST_DATASET in case of deletion) if its relative dataset "
464 "path (e.g. `baz/tmp`) wrt. SRC_DATASET (DST_DATASET in case of deletion) matches at least one of the given "
465 "include regular expressions but none of the exclude regular expressions. "
466 "If a dataset is excluded its descendants are automatically excluded too, and this decision is never "
467 "reconsidered even for the descendants because exclude takes precedence over include.\n\n"
468 "This option can be specified multiple times. "
469 "A leading `!` character indicates logical negation, i.e. the regex matches if the regex with the "
470 "leading `!` character removed does not match.\n\n"
471 "If the option starts with a `+` prefix then regex names are read from the newline-separated "
472 "UTF-8 text file given after the `+` prefix, one regex per line inside of the text file.\n\n"
473 "Default: `.*` (include all datasets).\n\n"
474 "Examples: `baz/tmp`, `(.*/)?doc[^/]*/(private|confidential).*`, `!public`, "
475 "`+dataset_regexes.txt`, `+/path/to/dataset_regexes.txt`\n\n")
476 parser.add_argument(
477 "--exclude-dataset-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
478 help="Same syntax as --include-dataset-regex (see above) except that the default is "
479 f"`{exclude_dataset_regexes_default}` (exclude tmp datasets). Example: `!.*` (exclude no dataset)\n\n")
480 parser.add_argument(
481 "--exclude-dataset-property", default=None, action=NonEmptyStringAction, metavar="STRING",
482 help="The name of a ZFS dataset user property (optional). If this option is specified, the effective value "
483 "(potentially inherited) of that user property is read via 'zfs list' for each selected source dataset "
484 "to determine whether the dataset will be included or excluded, as follows:\n\n"
485 "a) Value is 'true' or '-' or empty string or the property is missing: Include the dataset.\n\n"
486 "b) Value is 'false': Exclude the dataset and its descendants.\n\n"
487 "c) Value is a comma-separated list of host names (no spaces, for example: "
488 "'store001,store002'): Include the dataset if the host name of "
489 f"the host executing {prog_name} is contained in the list, otherwise exclude the dataset and its "
490 "descendants.\n\n"
491 "If a dataset is excluded its descendants are automatically excluded too, and the property values of the "
492 "descendants are ignored because exclude takes precedence over include.\n\n"
493 "Examples: 'syncoid:sync', 'com.example.eng.project.x:backup'\n\n"
494 "*Note:* The use of --exclude-dataset-property is discouraged for most use cases. It is more flexible, "
495 "more powerful, *and* more efficient to instead use a combination of --include/exclude-dataset-regex "
496 "and/or --include/exclude-dataset to achieve the same or better outcome.\n\n")
497 parser.add_argument(
498 "--include-snapshot-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
499 help="During replication, deletion and comparison, select any source ZFS snapshot that has a name (i.e. the part "
500 "after the '@') that matches at least one of the given include regular expressions but none of the "
501 "exclude regular expressions. If a snapshot is excluded this decision is never reconsidered because "
502 "exclude takes precedence over include.\n\n"
503 "This option can be specified multiple times. "
504 "A leading `!` character indicates logical negation, i.e. the regex matches if the regex with the "
505 "leading `!` character removed does not match.\n\n"
506 "Default: `.*` (include all snapshots). "
507 "Examples: `test_.*`, `!prod_.*`, `.*_(hourly|frequent)`, `!.*_(weekly|daily)`\n\n"
508 "*Note:* All --include/exclude-snapshot-* CLI option groups are combined into a mini filter pipeline. "
509 "A filter pipeline is executed in the order given on the command line, left to right. For example if "
510 "--include-snapshot-times-and-ranks (see below) is specified on the command line before "
511 "--include/exclude-snapshot-regex, then --include-snapshot-times-and-ranks will be applied before "
512 "--include/exclude-snapshot-regex. The pipeline results would not always be the same if the order were "
513 "reversed. Order matters.\n\n"
514 "*Note:* During replication, bookmarks are always retained aka selected in order to help find common "
515 "snapshots between source and destination.\n\n")
516 parser.add_argument(
517 "--exclude-snapshot-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
518 help="Same syntax as --include-snapshot-regex (see above) except that the default is to exclude no "
519 "snapshots.\n\n")
520 parser.add_argument(
521 "--include-snapshot-times-and-ranks", action=TimeRangeAndRankRangeAction, nargs="+", default=[],
522 metavar=("TIMERANGE", "RANKRANGE"),
523 help="This option takes as input parameters a time range filter and an optional rank range filter. It "
524 "separately computes the results for each filter and selects the UNION of both results. "
525 "To instead use a pure rank range filter (no UNION), or a pure time range filter (no UNION), simply "
526 "use 'notime' aka '0..0' to indicate an empty time range, or omit the rank range, respectively. "
527 "This option can be specified multiple times.\n\n"
528 "<b>*Replication Example (UNION):* </b>\n\n"
529 "Specify to replicate all daily snapshots created during the last 7 days, "
530 "and at the same time ensure that the latest 7 daily snapshots (per dataset) are replicated regardless "
531 "of creation time, like so: "
532 "`--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks '7 days ago..anytime' 'latest 7'`\n\n"
533 "<b>*Deletion Example (no UNION):* </b>\n\n"
534 "Specify to delete all daily snapshots older than 7 days, but ensure that the "
535 "latest 7 daily snapshots (per dataset) are retained regardless of creation time, like so: "
536 "`--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 7' "
537 "--include-snapshot-times-and-ranks 'anytime..7 days ago'`"
538 "\n\n"
539 "This helps to safely cope with irregular scenarios where no snapshots were created or received within "
540 "the last 7 days, or where more than 7 daily snapshots were created within the last 7 days. It can also "
541 "help to avoid accidental pruning of the last snapshot that source and destination have in common.\n\n"
542 ""
543 "<b>*TIMERANGE:* </b>\n\n"
544 "The ZFS 'creation' time of a snapshot (and bookmark) must fall into this time range in order for the "
545 "snapshot to be included. The time range consists of a 'start' time, followed by a '..' separator, "
546 "followed by an 'end' time. For example '2024-01-01..2024-04-01', or 'anytime..anytime' aka `*..*` aka all "
547 "times, or 'notime' aka '0..0' aka empty time range. Only snapshots (and bookmarks) in the half-open time "
548 "range [start, end) are included; other snapshots (and bookmarks) are excluded. If a snapshot is excluded "
549 "this decision is never reconsidered because exclude takes precedence over include. Each of the two specified "
550 "times can take any of the following forms:\n\n"
551 "* a) `anytime` aka `*` wildcard; represents negative or positive infinity.\n\n"
552 "* b) a non-negative integer representing a UTC Unix time in seconds. Example: 1728109805\n\n"
553 "* c) an ISO 8601 datetime string with or without timezone. Examples: '2024-10-05', "
554 "'2024-10-05T14:48:55', '2024-10-05T14:48:55+02', '2024-10-05T14:48:55-04:30'. If the datetime string "
555 "does not contain time zone info then it is assumed to be in the local time zone. Timezone string support "
556 "requires Python >= 3.11.\n\n"
557 "* d) a duration that indicates how long ago from the current time, using the following syntax: "
558 "a non-negative integer, followed by an optional space, followed by a duration unit that is "
559 "*one* of 'seconds', 'secs', 'minutes', 'mins', 'hours', 'days', 'weeks', 'months', 'years', "
560 "followed by an optional space, followed by the word 'ago'. "
561 "Examples: '0secs ago', '40 mins ago', '36hours ago', '90days ago', '12weeksago'.\n\n"
562 "* Note: This option compares the specified time against the standard ZFS 'creation' time property of the "
563 "snapshot (which is a UTC Unix time in integer seconds), rather than against a timestamp that may be "
564 "part of the snapshot name. You can list the ZFS creation time of snapshots and bookmarks as follows: "
565 "`zfs list -t snapshot,bookmark -o name,creation -s creation -d 1 $SRC_DATASET` (optionally add "
566 "the -p flag to display UTC Unix time in integer seconds).\n\n"
567 "*Note:* During replication, bookmarks are always retained aka selected in order to help find common "
568 "snapshots between source and destination.\n\n"
569 ""
570 "<b>*RANKRANGE:* </b>\n\n"
571 "Specifies to include the N (or N%%) oldest snapshots or latest snapshots, and exclude all other "
572 "snapshots (default: include no snapshots). Snapshots are sorted by creation time (actually, by the "
573 "'createtxg' ZFS property, which serves the same purpose but is more precise). The rank position of a "
574 "snapshot is the zero-based integer position of the snapshot within that sorted list. A rank consists of the "
575 "optional words 'all except' (followed by an optional space), followed by the word 'oldest' or 'latest', "
576 "followed by a non-negative integer, followed by an optional '%%' percent sign. A rank range consists of a "
577 "lower rank, followed by a '..' separator, followed by a higher rank. "
578 "If the optional lower rank is missing it is assumed to be 0. Examples:\n\n"
579 "* 'oldest 10%%' aka 'oldest 0..oldest 10%%' (include the oldest 10%% of all snapshots)\n\n"
580 "* 'latest 10%%' aka 'latest 0..latest 10%%' (include the latest 10%% of all snapshots)\n\n"
581 "* 'all except latest 10%%' aka 'oldest 90%%' aka 'oldest 0..oldest 90%%' (include all snapshots except the "
582 "latest 10%% of all snapshots)\n\n"
583 "* 'oldest 90' aka 'oldest 0..oldest 90' (include the oldest 90 snapshots)\n\n"
584 "* 'latest 90' aka 'latest 0..latest 90' (include the latest 90 snapshots)\n\n"
585 "* 'all except oldest 90' aka 'oldest 90..oldest 100%%' (include all snapshots except the oldest 90 snapshots)"
586 "\n\n"
587 "* 'all except latest 90' aka 'latest 90..latest 100%%' (include all snapshots except the latest 90 snapshots)"
588 "\n\n"
589 "* 'latest 1' aka 'latest 0..latest 1' (include the latest snapshot)\n\n"
590 "* 'all except latest 1' aka 'latest 1..latest 100%%' (include all snapshots except the latest snapshot)\n\n"
591 "* 'oldest 2' aka 'oldest 0..oldest 2' (include the oldest 2 snapshots)\n\n"
592 "* 'all except oldest 2' aka 'oldest 2..oldest 100%%' (include all snapshots except the oldest 2 snapshots)\n\n"
593 "* 'oldest 100%%' aka 'oldest 0..oldest 100%%' (include all snapshots)\n\n"
594 "* 'oldest 0%%' aka 'oldest 0..oldest 0%%' (include no snapshots)\n\n"
595 "* 'oldest 0' aka 'oldest 0..oldest 0' (include no snapshots)\n\n"
596 "*Note:* Percentage calculations are not based on the number of snapshots "
597 "contained in the dataset on disk, but rather based on the number of snapshots arriving at the filter. "
598 "For example, if only two daily snapshots arrive at the filter because a prior filter excludes hourly "
599 "snapshots, then 'latest 10' will only include these two daily snapshots, and 'latest 50%%' will only "
600 "include one of these two daily snapshots.\n\n"
601 "*Note:* During replication, bookmarks are always retained aka selected in order to help find common "
602 "snapshots between source and destination. Bookmarks do not count towards N or N%% wrt. rank.\n\n"
603 "*Note:* If a snapshot is excluded this decision is never reconsidered because exclude takes precedence "
604 "over include.\n\n")
606 def format_dict(dictionary):
607 return f'"{dictionary}"'
609 src_snapshot_plan_example = {
610 "prod": {
611 "onsite": {"secondly": 40, "minutely": 40, "hourly": 36, "daily": 31, "weekly": 12, "monthly": 18, "yearly": 5},
612 "us-west-1": {"secondly": 0, "minutely": 0, "hourly": 36, "daily": 31, "weekly": 12, "monthly": 18, "yearly": 5},
613 "eu-west-1": {"secondly": 0, "minutely": 0, "hourly": 36, "daily": 31, "weekly": 12, "monthly": 18, "yearly": 5},
614 },
615 "test": {
616 "offsite": {"12hourly": 42, "weekly": 12},
617 "onsite": {"100millisecondly": 42},
618 },
619 }
620 parser.add_argument(
621 "--include-snapshot-plan", action=IncludeSnapshotPlanAction, default=None,
622 metavar="DICT_STRING",
623 help="Replication periods to be used if replicating snapshots within the selected destination datasets. "
624 "Has the same format as --create-src-snapshots-plan and --delete-dst-snapshots-except-plan (see below). "
625 "Snapshots that do not match a period will not be replicated. To avoid unexpected surprises, make sure to "
626 "carefully specify ALL snapshot names and periods that shall be replicated, in combination with --dryrun.\n\n"
627 f"Example: `{format_dict(src_snapshot_plan_example)}`. This example will, for the organization 'prod' and the "
628 "intended logical target 'onsite', replicate secondly snapshots that were created less than 40 seconds ago, "
629 "yet replicate the latest 40 secondly snapshots regardless of creation time. Analog for the latest 40 minutely "
630 "snapshots, latest 36 hourly snapshots, etc. "
631 "Note: A zero within a period (e.g. 'hourly': 0) indicates that no snapshots shall be replicated for the given "
632 "period.\n\n"
633 "Note: --include-snapshot-plan is a convenience option that auto-generates a series of the following other "
634 "options: --new-snapshot-filter-group, --include-snapshot-regex, --include-snapshot-times-and-ranks\n\n")
635 parser.add_argument(
636 "--new-snapshot-filter-group", action=NewSnapshotFilterGroupAction, nargs=0,
637 help="Starts a new snapshot filter group containing separate --{include|exclude}-snapshot-* filter options. The "
638 "program separately computes the results for each filter group and selects the UNION of all results. "
639 "This option can be specified multiple times and serves as a separator between groups. Example:\n\n"
640 "Delete all minutely snapshots older than 40 minutes, but ensure that the latest 40 minutely snapshots (per "
641 "dataset) are retained regardless of creation time. Additionally, delete all hourly snapshots older than 36 "
642 "hours, but ensure that the latest 36 hourly snapshots (per dataset) are retained regardless of creation time. "
643 "Additionally, delete all daily snapshots older than 31 days, but ensure that the latest 31 daily snapshots "
644 "(per dataset) are retained regardless of creation time: "
645 f"`{prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication --delete-dst-snapshots "
646 "--include-snapshot-regex '.*_minutely' --include-snapshot-times-and-ranks notime 'all except latest 40' "
647 "--include-snapshot-times-and-ranks 'anytime..40 minutes ago' "
648 "--new-snapshot-filter-group "
649 "--include-snapshot-regex '.*_hourly' --include-snapshot-times-and-ranks notime 'all except latest 36' "
650 "--include-snapshot-times-and-ranks 'anytime..36 hours ago' "
651 "--new-snapshot-filter-group "
652 "--include-snapshot-regex '.*_daily' --include-snapshot-times-and-ranks notime 'all except latest 31' "
653 "--include-snapshot-times-and-ranks 'anytime..31 days ago'`\n\n")
654 parser.add_argument(
655 "--create-src-snapshots", action="store_true",
656 help="Do nothing if the --create-src-snapshots flag is missing. Otherwise, before the replication step (see below), "
657 "atomically create new snapshots of the source datasets selected via --{include|exclude}-dataset* policy. "
658 "The names of the snapshots can be configured via --create-src-snapshots-* suboptions (see below). "
659 "To create snapshots only, without any other processing such as replication, etc, consider using this flag "
660 "together with the --skip-replication flag.\n\n"
661 "A periodic snapshot is created if it is due per the schedule indicated by --create-src-snapshots-plan "
662 "(for example '_daily' or '_hourly' or _'10minutely' or '_2secondly' or '_100millisecondly'), or if the "
663 "--create-src-snapshots-even-if-not-due flag is specified, or if the most recent scheduled snapshot "
664 f"is somehow missing. In the latter case {prog_name} immediately creates a snapshot (tagged with the current "
665 "time, not backdated to the missed time), and then resumes the original schedule.\n\n"
666 "If the snapshot suffix is '_adhoc' or not a known period then a snapshot is considered "
667 "non-periodic and is thus created immediately regardless of the creation time of any existing snapshot.\n\n"
668 "The implementation attempts to fit as many datasets as possible into a single (atomic) 'zfs snapshot' command "
669 "line, using case-sensitive sort order, and using 'zfs snapshot -r' to the extent that this is compatible "
670 "with the actual results of the schedule and the actual results of the --{include|exclude}-dataset* pruning "
671 "policy. The snapshots of all datasets that fit "
672 "within the same single 'zfs snapshot' CLI invocation will be taken within the same ZFS transaction group, and "
673 "correspondingly have identical 'createtxg' ZFS property (but not necessarily identical 'creation' ZFS time "
674 "property as ZFS actually provides no such guarantee), and thus be consistent. Dataset names that can't fit "
675 "into a single command line are spread over multiple command line invocations, respecting the limits that the "
676 "operating system places on the maximum length of a single command line, per `getconf ARG_MAX`.\n\n"
677 f"Note: All {prog_name} functions including snapshot creation, replication, deletion, comparison, etc. happily "
678 "work with any snapshots in any format, even created or managed by third party ZFS snapshot management tools, "
679 "including manual zfs snapshot/destroy.\n\n")
680 parser.add_argument(
681 "--create-src-snapshots-plan", default=None, type=str, metavar="DICT_STRING",
682 help="Creation periods that specify a schedule for when new snapshots shall be created on src within the selected "
683 "datasets. Has the same format as --delete-dst-snapshots-except-plan.\n\n"
684 f"Example: `{format_dict(src_snapshot_plan_example)}`. This example will, for the organization 'prod' and "
685 "the intended logical target 'onsite', create 'secondly' snapshots every second, 'minutely' snapshots every "
686 "minute, hourly snapshots every hour, and so on. "
687 "It will also create snapshots for the targets 'us-west-1' and 'eu-west-1' within the 'prod' organization. "
688 "In addition, it will create snapshots every 12 hours and every week for the 'test' organization, "
689 "and name them as being intended for the 'offsite' replication target. Analog for snapshots that are taken "
690 "every 100 milliseconds within the 'test' organization.\n\n"
691 "The example creates ZFS snapshots with names like "
692 "`prod_onsite_<timestamp>_secondly`, `prod_onsite_<timestamp>_minutely`, "
693 "`prod_us-west-1_<timestamp>_hourly`, `prod_us-west-1_<timestamp>_daily`, "
694 "`prod_eu-west-1_<timestamp>_hourly`, `prod_eu-west-1_<timestamp>_daily`, "
695 "`test_offsite_<timestamp>_12hourly`, `test_offsite_<timestamp>_weekly`, and so on.\n\n"
696 "Note: A period name that is missing indicates that no snapshots shall be created for the given period.\n\n"
697 "The period name can contain an optional positive integer immediately preceding the time period unit, for "
698 "example `_2secondly` or `_10minutely` or `_100millisecondly` to indicate that snapshots are taken every 2 "
699 "seconds, or every 10 minutes, or every 100 milliseconds, respectively.\n\n")
701 def argparser_escape(text: str) -> str:
702 return text.replace('%', '%%')
704 create_src_snapshots_timeformat_dflt = "%Y-%m-%d_%H:%M:%S"
705 parser.add_argument(
706 "--create-src-snapshots-timeformat", default=create_src_snapshots_timeformat_dflt, metavar="STRFTIME_SPEC",
707 help=f"Default is `{argparser_escape(create_src_snapshots_timeformat_dflt)}`. For the strftime format, see "
708 "https://docs.python.org/3.11/library/datetime.html#strftime-strptime-behavior. "
709 f"Examples: `{argparser_escape('%Y-%m-%d_%H:%M:%S.%f')}` (adds microsecond resolution), "
710 f"`{argparser_escape('%Y-%m-%d_%H:%M:%S%z')}` (adds timezone offset), "
711 f"`{argparser_escape('%Y-%m-%dT%H-%M-%S')}` (no colons).\n\n"
712 "The name of the snapshot created on the src is `$org_$target_strftime(--create-src-snapshots-time*)_$period`. "
713 "Example: `tank/foo@prod_us-west-1_2024-09-03_12:26:15_daily`\n\n")
714 parser.add_argument(
715 "--create-src-snapshots-timezone", default="", type=str, metavar="TZ_SPEC",
716 help=f"Default is the local timezone of the system running {prog_name}. When creating a new snapshot on the source, "
717 "fetch the current time in the specified timezone, and feed that time, and the value of "
718 "--create-src-snapshots-timeformat, into the standard strftime() function to generate the timestamp portion "
719 "of the snapshot name. The TZ_SPEC input parameter is of the form 'UTC' or '+HHMM' or '-HHMM' for fixed UTC "
720 "offsets, or an IANA TZ identifier for auto-adjustment to daylight savings time, or the empty string to use "
721 "the local timezone, for example '', 'UTC', '+0000', '+0530', '-0400', 'America/Los_Angeles', 'Europe/Vienna'. "
722 "For a list of valid IANA TZ identifiers see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List"
723 "\n\nTo change the timezone not only for snapshot name creation, but in all respects for the entire program, "
724 "use the standard 'TZ' Unix environment variable, like so: `export TZ=UTC`.\n\n")
725 parser.add_argument(
726 "--create-src-snapshots-even-if-not-due", action="store_true",
727 help="Take snapshots immediately regardless of the creation time of any existing snapshot, even if snapshots "
728 "are periodic and not actually due per the schedule.\n\n")
729 parser.add_argument(
730 "--create-src-snapshots-enable-snapshots-changed-cache", action="store_true",
731 help="Maintain a local cache of recent snapshot creation times, running "
732 "'zfs list -t filesystem,volume -o snapshots_changed' instead of 'zfs list -t snapshot' to determine if a new "
733 "snapshot shall be created on the src. This flag improves performance for high-frequency snapshotting use "
734 "cases. Only relevant if --create-src-snapshots-even-if-not-due is not specified.\n\n")
735 zfs_send_program_opts_default = "--props --raw --compressed"
736 parser.add_argument(
737 "--zfs-send-program-opts", type=str, default=zfs_send_program_opts_default, metavar="STRING",
738 help="Parameters to fine-tune 'zfs send' behaviour (optional); will be passed into 'zfs send' CLI. "
739 "The value is split on runs of one or more whitespace characters. "
740 f"Default is '{zfs_send_program_opts_default}'. To run `zfs send` without options, specify the empty "
741 "string: `--zfs-send-program-opts=''`. "
742 "See https://openzfs.github.io/openzfs-docs/man/master/8/zfs-send.8.html "
743 "and https://github.com/openzfs/zfs/issues/13024\n\n")
744 zfs_recv_program_opts_default = "-u"
745 parser.add_argument(
746 "--zfs-recv-program-opts", type=str, default=zfs_recv_program_opts_default, metavar="STRING",
747 help="Parameters to fine-tune 'zfs receive' behaviour (optional); will be passed into 'zfs receive' CLI. "
748 "The value is split on runs of one or more whitespace characters. "
749 f"Default is '{zfs_recv_program_opts_default}'. To run `zfs receive` without options, specify the empty "
750 "string: `--zfs-recv-program-opts=''`. "
751 "Example: '-u -o canmount=noauto -o readonly=on -x keylocation -x keyformat -x encryption'. "
752 "See https://openzfs.github.io/openzfs-docs/man/master/8/zfs-receive.8.html "
753 "and https://openzfs.github.io/openzfs-docs/man/master/7/zfsprops.7.html\n\n")
754 parser.add_argument(
755 "--zfs-recv-program-opt", action="append", default=[], metavar="STRING",
756 help="Parameter to fine-tune 'zfs receive' behaviour (optional); will be passed into 'zfs receive' CLI. "
757 "The value can contain spaces and is not split. This option can be specified multiple times. Example: `"
758 "--zfs-recv-program-opt=-o "
759 "--zfs-recv-program-opt='org.zfsbootmenu:commandline=ro debug zswap.enabled=1'`\n\n")
760 parser.add_argument(
761 "--force-rollback-to-latest-snapshot", action="store_true",
762 help="Before replication, rollback the destination dataset to its most recent destination snapshot (if there "
763 "is one), via 'zfs rollback', just in case the destination dataset was modified since its most recent "
764 "snapshot. This is much less invasive than the other --force* options (see below).\n\n")
765 parser.add_argument(
766 "--force-rollback-to-latest-common-snapshot", action="store_true",
767 help="Before replication, delete destination ZFS snapshots that are more recent than the most recent common "
768 "snapshot selected on the source ('conflicting snapshots'), via 'zfs rollback'. Do no rollback if no common "
769 "snapshot is selected.\n\n")
770 parser.add_argument(
771 "--force", action="store_true",
772 help="Same as --force-rollback-to-latest-common-snapshot (see above), except that additionally, if no common "
773 "snapshot is selected, then delete all destination snapshots before starting replication, and proceed "
774 "without aborting. Without the --force* flags, the destination dataset is treated as append-only, hence "
775 "no destination snapshot that already exists is deleted, and instead the operation is aborted with an "
776 "error when encountering a conflicting snapshot.\n\n"
777 "Analogy: --force-rollback-to-latest-snapshot is a tiny hammer, whereas "
778 "--force-rollback-to-latest-common-snapshot is a medium sized hammer, --force is a large hammer, and "
779 "--force-destroy-dependents is a very large hammer. "
780 "Consider using the smallest hammer that can fix the problem. No hammer is ever used by default.\n\n")
781 parser.add_argument(
782 "--force-destroy-dependents", action="store_true",
783 help="On destination, --force and --force-rollback-to-latest-common-snapshot and --delete-* will add the "
784 "'-R' flag to their use of 'zfs rollback' and 'zfs destroy', causing them to delete dependents such as "
785 "clones and bookmarks. This can be very destructive and is rarely advisable.\n\n")
786 parser.add_argument(
787 "--force-hard", action="store_true", # deprecated; was renamed to --force-destroy-dependents
788 help=argparse.SUPPRESS)
789 parser.add_argument(
790 "--force-unmount", action="store_true",
791 help="On destination, --force and --force-rollback-to-latest-common-snapshot will add the '-f' flag to their "
792 "use of 'zfs rollback' and 'zfs destroy'.\n\n")
793 parser.add_argument(
794 "--force-once", "--f1", action="store_true",
795 help="Use the --force option or --force-rollback-to-latest-common-snapshot option at most once to resolve a "
796 "conflict, then abort with an error on any subsequent conflict. This helps to interactively resolve "
797 "conflicts, one conflict at a time.\n\n")
798 parser.add_argument(
799 "--skip-parent", action="store_true",
800 help="During replication and deletion, skip processing of the SRC_DATASET and DST_DATASET and only process "
801 "their descendant datasets, i.e. children, and children of children, etc (with --recursive). No dataset "
802 "is processed unless --recursive is also specified. "
803 f"Analogy: `{prog_name} --recursive --skip-parent src dst` is akin to Unix `cp -r src/* dst/` whereas "
804 f" `{prog_name} --recursive --skip-parent --skip-replication --delete-dst-datasets dummy dst` is akin to "
805 f"Unix `rm -r dst/*`\n\n")
806 parser.add_argument(
807 "--skip-missing-snapshots", choices=["fail", "dataset", "continue"], default="dataset", nargs="?",
808 help="During replication, handle source datasets that select no snapshots (and no relevant bookmarks) "
809 "as follows:\n\n"
810 "a) 'fail': Abort with an error.\n\n"
811 "b) 'dataset' (default): Skip the source dataset with a warning. Skip descendant datasets if "
812 "--recursive and destination dataset does not exist. Otherwise skip to the next dataset.\n\n"
813 "c) 'continue': Skip nothing. If destination snapshots exist, delete them (with --force) or abort "
814 "with an error (without --force). If there is no such abort, continue processing with the next dataset. "
815 "Eventually create empty destination dataset and ancestors if they do not yet exist and source dataset "
816 "has at least one descendant that selects at least one snapshot.\n\n")
817 retries_default = 2
818 parser.add_argument(
819 "--retries", type=int, min=0, default=retries_default, action=CheckRange, metavar="INT",
820 help="The maximum number of times a retryable replication or deletion step shall be retried if it fails, for "
821 f"example because of network hiccups (default: {retries_default}, min: 0). "
822 "Also consider this option if a periodic pruning script may simultaneously delete a dataset or "
823 f"snapshot or bookmark while {prog_name} is running and attempting to access it.\n\n")
824 retry_min_sleep_secs_default = 0.125
825 parser.add_argument(
826 "--retry-min-sleep-secs", type=float, min=0, default=retry_min_sleep_secs_default,
827 action=CheckRange, metavar="FLOAT",
828 help=f"The minimum duration to sleep between retries (default: {retry_min_sleep_secs_default}).\n\n")
829 retry_max_sleep_secs_default = 5 * 60
830 parser.add_argument(
831 "--retry-max-sleep-secs", type=float, min=0, default=retry_max_sleep_secs_default,
832 action=CheckRange, metavar="FLOAT",
833 help="The maximum duration to sleep between retries initially starts with --retry-min-sleep-secs (see above), "
834 "and doubles on each retry, up to the final maximum of --retry-max-sleep-secs "
835 f"(default: {retry_max_sleep_secs_default}). On each retry a random sleep time in the "
836 "[--retry-min-sleep-secs, current max] range is picked. The timer resets after each operation.\n\n")
837 retry_max_elapsed_secs_default = 60 * 60
838 parser.add_argument(
839 "--retry-max-elapsed-secs", type=float, min=0, default=retry_max_elapsed_secs_default,
840 action=CheckRange, metavar="FLOAT",
841 help="A single operation (e.g. 'zfs send/receive' of the current dataset, or deletion of a list of snapshots "
842 "within the current dataset) will not be retried (or not retried anymore) once this much time has elapsed "
843 f"since the initial start of the operation, including retries (default: {retry_max_elapsed_secs_default})."
844 " The timer resets after each operation completes or retries exhaust, such that subsequently failing "
845 "operations can again be retried.\n\n")
846 parser.add_argument(
847 "--skip-on-error", choices=["fail", "tree", "dataset"], default="dataset",
848 help="During replication and deletion, if an error is not retryable, or --retries has been exhausted, "
849 "or --skip-missing-snapshots raises an error, proceed as follows:\n\n"
850 "a) 'fail': Abort the program with an error. This mode is ideal for testing, clear "
851 "error reporting, and situations where consistency trumps availability.\n\n"
852 "b) 'tree': Log the error, skip the dataset tree rooted at the dataset for which the error "
853 "occurred, and continue processing the next (sibling) dataset tree. "
854 "Example: Assume datasets tank/user1/foo and tank/user2/bar and an error occurs while processing "
855 "tank/user1. In this case processing skips tank/user1/foo and proceeds with tank/user2.\n\n"
856 "c) 'dataset' (default): Same as 'tree' except if the destination dataset already exists, skip to "
857 "the next dataset instead. "
858 "Example: Assume datasets tank/user1/foo and tank/user2/bar and an error occurs while "
859 "processing tank/user1. In this case processing skips tank/user1 and proceeds with tank/user1/foo "
860 "if the destination already contains tank/user1. Otherwise processing continues with tank/user2. "
861 "This mode is for production use cases that require timely forward progress even in the presence of "
862 "partial failures. For example, assume the job is to backup the home directories or virtual machines "
863 "of thousands of users across an organization. Even if replication of some of the datasets for some "
864 "users fails due too conflicts, busy datasets, etc, the replication job will continue for the "
865 "remaining datasets and the remaining users.\n\n")
866 parser.add_argument(
867 "--skip-replication", action="store_true",
868 help="Skip replication step (see above) and proceed to the optional --delete-dst-datasets step "
869 "immediately (see below).\n\n")
870 parser.add_argument(
871 "--delete-dst-datasets", action="store_true",
872 help="Do nothing if the --delete-dst-datasets option is missing. Otherwise, after successful replication "
873 "step, if any, delete existing destination datasets that are selected via --{include|exclude}-dataset* "
874 "policy yet do not exist within SRC_DATASET (which can be an empty dataset, such as the hardcoded virtual "
875 f"dataset named '{dummy_dataset}'!). Do not recurse without --recursive. With --recursive, never delete "
876 "non-selected dataset subtrees or their ancestors.\n\n"
877 "For example, if the destination contains datasets h1,h2,h3,d1 whereas source only contains h3, "
878 "and the include/exclude policy selects h1,h2,h3,d1, then delete datasets h1,h2,d1 on "
879 "the destination to make it 'the same'. On the other hand, if the include/exclude policy "
880 "only selects h1,h2,h3 then only delete datasets h1,h2 on the destination to make it 'the same'.\n\n"
881 "Example to delete all tmp datasets within tank2/boo/bar: "
882 f"`{prog_name} {dummy_dataset} tank2/boo/bar --dryrun --skip-replication --recursive "
883 "--delete-dst-datasets --include-dataset-regex '(.*/)?tmp.*' --exclude-dataset-regex '!.*'`\n\n")
884 parser.add_argument(
885 "--delete-dst-snapshots", choices=["snapshots", "bookmarks"], default=None, const="snapshots",
886 nargs="?",
887 help="Do nothing if the --delete-dst-snapshots option is missing. Otherwise, after successful "
888 "replication, and successful --delete-dst-datasets step, if any, delete existing destination snapshots "
889 "whose GUID does not exist within the source dataset (which can be an empty dummy dataset!) if the "
890 "destination snapshots are selected by the --include/exclude-snapshot-* policy, and the destination "
891 "dataset is selected via --{include|exclude}-dataset* policy. Does not recurse without --recursive.\n\n"
892 "For example, if the destination dataset contains snapshots h1,h2,h3,d1 (h=hourly, d=daily) whereas "
893 "the source dataset only contains snapshot h3, and the include/exclude policy selects "
894 "h1,h2,h3,d1, then delete snapshots h1,h2,d1 on the destination dataset to make it 'the same'. "
895 "On the other hand, if the include/exclude policy only selects snapshots h1,h2,h3 then only "
896 "delete snapshots h1,h2 on the destination dataset to make it 'the same'.\n\n"
897 "*Note:* To delete snapshots regardless, consider using --delete-dst-snapshots in combination with a "
898 f"source that is an empty dataset, such as the hardcoded virtual dataset named '{dummy_dataset}', like so:"
899 f" `{prog_name} {dummy_dataset} tank2/boo/bar --dryrun --skip-replication --delete-dst-snapshots "
900 "--include-snapshot-regex '.*_daily' --recursive`\n\n"
901 "*Note:* Use --delete-dst-snapshots=bookmarks to delete bookmarks instead of snapshots, in which "
902 "case no snapshots are selected and the --{include|exclude}-snapshot-* filter options treat bookmarks as "
903 "snapshots wrt. selecting.\n\n"
904 "*Performance Note:* --delete-dst-snapshots operates on multiple datasets in parallel (and serially "
905 f"within a dataset), using the same dataset order as {prog_name} replication. "
906 "The degree of parallelism is configurable with the --threads option (see below).\n\n")
907 parser.add_argument(
908 "--delete-dst-snapshots-no-crosscheck", action="store_true",
909 help="This flag indicates that --delete-dst-snapshots=snapshots shall check the source dataset only for "
910 "a snapshot with the same GUID, and ignore whether a bookmark with the same GUID is present in the "
911 "source dataset. Similarly, it also indicates that --delete-dst-snapshots=bookmarks shall check the "
912 "source dataset only for a bookmark with the same GUID, and ignore whether a snapshot with the same GUID "
913 "is present in the source dataset.\n\n")
914 parser.add_argument(
915 "--delete-dst-snapshots-except", action="store_true",
916 help="This flag indicates that the --include/exclude-snapshot-* options shall have inverted semantics for the "
917 "--delete-dst-snapshots option, thus deleting all snapshots except for the selected snapshots (within the "
918 "specified datasets), instead of deleting all selected snapshots (within the specified datasets). In other"
919 " words, this flag enables to specify which snapshots to retain instead of which snapshots to delete.\n\n")
920 parser.add_argument(
921 "--delete-dst-snapshots-except-plan", action=DeleteDstSnapshotsExceptPlanAction, default=None,
922 metavar="DICT_STRING",
923 help="Retention periods to be used if pruning snapshots or bookmarks within the selected destination datasets via "
924 "--delete-dst-snapshots. Has the same format as --create-src-snapshots-plan. "
925 "Snapshots (--delete-dst-snapshots=snapshots) or bookmarks (with --delete-dst-snapshots=bookmarks) that "
926 "do not match a period will be deleted. To avoid unexpected surprises, make sure to carefully specify ALL "
927 "snapshot names and periods that shall be retained, in combination with --dryrun.\n\n"
928 f"Example: `{format_dict(src_snapshot_plan_example)}`. This example will, for the organization 'prod' and "
929 "the intended logical target 'onsite', retain secondly snapshots that were created less than 40 seconds ago, "
930 "yet retain the latest 40 secondly snapshots regardless of creation time. Analog for the latest 40 minutely "
931 "snapshots, latest 36 hourly snapshots, etc. "
932 "It will also retain snapshots for the targets 'us-west-1' and 'eu-west-1' within the 'prod' organization. "
933 "In addition, within the 'test' organization, it will retain snapshots that are created every 12 hours and "
934 "every week as specified, and name them as being intended for the 'offsite' replication target. Analog for "
935 "snapshots that are taken every 100 milliseconds within the 'test' organization. "
936 "All other snapshots within the selected datasets will be deleted - you've been warned!\n\n"
937 "The example scans the selected ZFS datasets for snapshots with names like "
938 "`prod_onsite_<timestamp>_secondly`, `prod_onsite_<timestamp>_minutely`, "
939 "`prod_us-west-1_<timestamp>_hourly`, `prod_us-west-1_<timestamp>_daily`, "
940 "`prod_eu-west-1_<timestamp>_hourly`, `prod_eu-west-1_<timestamp>_daily`, "
941 "`test_offsite_<timestamp>_12hourly`, `test_offsite_<timestamp>_weekly`, and so on, and deletes all snapshots "
942 "that do not match a retention rule.\n\n"
943 "Note: A zero within a period (e.g. 'hourly': 0) indicates that no snapshots shall be retained for the given "
944 "period.\n\n"
945 "Note: --delete-dst-snapshots-except-plan is a convenience option that auto-generates a series of the "
946 "following other options: --delete-dst-snapshots-except, "
947 "--new-snapshot-filter-group, --include-snapshot-regex, --include-snapshot-times-and-ranks\n\n")
948 parser.add_argument(
949 "--delete-empty-dst-datasets", choices=["snapshots", "snapshots+bookmarks"], default=None,
950 const="snapshots+bookmarks", nargs="?",
951 help="Do nothing if the --delete-empty-dst-datasets option is missing or --recursive is missing. Otherwise, "
952 "after successful replication "
953 "step and successful --delete-dst-datasets and successful --delete-dst-snapshots steps, if any, "
954 "delete any selected destination dataset that has no snapshot and no bookmark if all descendants of "
955 "that destination dataset are also selected and do not have a snapshot or bookmark either "
956 "(again, only if the existing destination dataset is selected via --{include|exclude}-dataset* policy). "
957 "Never delete non-selected dataset subtrees or their ancestors.\n\n"
958 "For example, if the destination contains datasets h1,d1, and the include/exclude policy "
959 "selects h1,d1, then check if h1,d1 can be deleted. "
960 "On the other hand, if the include/exclude policy only selects h1 then only check if h1 "
961 "can be deleted.\n\n"
962 "*Note:* Use --delete-empty-dst-datasets=snapshots to delete snapshot-less datasets even if they still "
963 "contain bookmarks.\n\n")
964 cmp_choices_dflt = "+".join(cmp_choices_items)
965 cmp_choices: List[str] = []
966 for i in range(0, len(cmp_choices_items)):
967 cmp_choices += map(lambda item: "+".join(item), itertools.combinations(cmp_choices_items, i + 1))
968 parser.add_argument(
969 "--compare-snapshot-lists", choices=cmp_choices, default=None, const=cmp_choices_dflt, nargs="?",
970 help="Do nothing if the --compare-snapshot-lists option is missing. Otherwise, after successful replication "
971 "step and successful --delete-dst-datasets, --delete-dst-snapshots steps and --delete-empty-dst-datasets "
972 "steps, if any, proceed as follows:\n\n"
973 "Compare source and destination dataset trees recursively wrt. snapshots, for example to check if all "
974 "recently taken snapshots have been successfully replicated by a periodic job.\n\n"
975 "Example: List snapshots only contained in source (tagged with 'src'), only contained in destination "
976 "(tagged with 'dst'), and contained in both source and destination (tagged with 'all'), restricted to "
977 "hourly and daily snapshots taken within the last 7 days, excluding the last 4 hours (to allow for some "
978 "slack/stragglers), excluding temporary datasets: "
979 f"`{prog_name} tank1/foo/bar tank2/boo/bar --skip-replication "
980 "--compare-snapshot-lists=src+dst+all --recursive --include-snapshot-regex '.*_(hourly|daily)' "
981 "--include-snapshot-times-and-ranks '7 days ago..4 hours ago' --exclude-dataset-regex 'tmp.*'`\n\n"
982 "This outputs a TSV file containing the following columns:\n\n"
983 "`location creation_iso createtxg rel_name guid root_dataset rel_dataset name creation written`\n\n"
984 "Example output row:\n\n"
985 "`src 2024-11-06_08:30:05 17435050 /foo@test_2024-11-06_08:30:05_daily 2406491805272097867 tank1/src "
986 "/foo tank1/src/foo@test_2024-10-06_08:30:04_daily 1730878205 24576`\n\n"
987 "If the TSV output file contains zero lines starting with the prefix 'src' and zero lines starting with "
988 "the prefix 'dst' then no source snapshots are missing on the destination, and no destination "
989 "snapshots are missing on the source, indicating that the periodic replication and pruning jobs perform "
990 "as expected. The TSV output is sorted by rel_dataset, and by ZFS creation time within each rel_dataset "
991 "- the first and last line prefixed with 'all' contains the metadata of the oldest and latest common "
992 "snapshot, respectively. Third party tools can use this info for post-processing, for example using "
993 "custom scripts using 'csplit' or duckdb analytics queries.\n\n"
994 "The --compare-snapshot-lists option also directly logs various summary stats, such as the metadata of "
995 "the latest common snapshot, latest snapshots and oldest snapshots, as well as the time diff between the "
996 "latest common snapshot and latest snapshot only in src (and only in dst), as well as how many src "
997 "snapshots and how many GB of data are missing on dst, etc.\n\n"
998 "*Note*: Consider omitting the 'all' flag to reduce noise and instead focus on missing snapshots only, "
999 "like so: --compare-snapshot-lists=src+dst \n\n"
1000 "*Note*: The source can also be an empty dataset, such as the hardcoded virtual dataset named "
1001 f"'{dummy_dataset}'.\n\n"
1002 "*Note*: --compare-snapshot-lists is typically *much* faster than standard 'zfs list -t snapshot' CLI "
1003 "usage because the former issues requests with a higher degree of parallelism than the latter. The "
1004 "degree is configurable with the --threads option (see below).\n\n")
1005 parser.add_argument(
1006 "--dryrun", "-n", choices=["recv", "send"], default=None, const="send", nargs="?",
1007 help="Do a dry run (aka 'no-op') to print what operations would happen if the command were to be executed "
1008 "for real (optional). This option treats both the ZFS source and destination as read-only. "
1009 "Accepts an optional argument for fine tuning that is handled as follows:\n\n"
1010 "a) 'recv': Send snapshot data via 'zfs send' to the destination host and receive it there via "
1011 "'zfs receive -n', which discards the received data there.\n\n"
1012 "b) 'send': Do not execute 'zfs send' and do not execute 'zfs receive'. This is a less 'realistic' form "
1013 "of dry run, but much faster, especially for large snapshots and slow networks/disks, as no snapshot is "
1014 "actually transferred between source and destination. This is the default when specifying --dryrun.\n\n"
1015 "Examples: --dryrun, --dryrun=send, --dryrun=recv\n\n")
1016 parser.add_argument(
1017 "--verbose", "-v", action="count", default=0,
1018 help="Print verbose information. This option can be specified multiple times to increase the level of "
1019 "verbosity. To print what ZFS/SSH operation exactly is happening (or would happen), add the `-v -v` "
1020 "flag, maybe along with --dryrun. All ZFS and SSH commands (even with --dryrun) are logged such that "
1021 "they can be inspected, copy-and-pasted into a terminal shell and run manually to help anticipate or "
1022 "diagnose issues. ERROR, WARN, INFO, DEBUG, TRACE output lines are identified by [E], [W], [I], [D], [T] "
1023 "prefixes, respectively.\n\n")
1024 parser.add_argument(
1025 "--quiet", "-q", action="store_true",
1026 help="Suppress non-error, info, debug, and trace output.\n\n")
1027 parser.add_argument(
1028 "--no-privilege-elevation", "-p", action="store_true",
1029 help="Do not attempt to run state changing ZFS operations 'zfs create/rollback/destroy/send/receive/snapshot' as "
1030 "root (via 'sudo -u root' elevation granted by administrators appending the following to /etc/sudoers: "
1031 "`<NON_ROOT_USER_NAME> ALL=NOPASSWD:/path/to/zfs`\n\n"
1032 "Instead, the --no-privilege-elevation flag is for non-root users that have been granted corresponding "
1033 "ZFS permissions by administrators via 'zfs allow' delegation mechanism, like so: "
1034 "sudo zfs allow -u $SRC_NON_ROOT_USER_NAME snapshot,destroy,send,bookmark,hold $SRC_DATASET; "
1035 "sudo zfs allow -u $DST_NON_ROOT_USER_NAME mount,create,receive,rollback,destroy,canmount,mountpoint,"
1036 "readonly,compression,encryption,keylocation,recordsize $DST_DATASET_OR_POOL.\n\n"
1037 "For extra security $SRC_NON_ROOT_USER_NAME should be different than $DST_NON_ROOT_USER_NAME, i.e. the "
1038 "sending Unix user on the source and the receiving Unix user at the destination should be separate Unix "
1039 "user accounts with separate private keys even if both accounts reside on the same machine, per the "
1040 "principle of least privilege. Further, if you do not plan to use the --force* flags and "
1041 "--delete-* CLI options then ZFS permissions 'rollback,destroy' can "
1042 "be omitted. If you do not plan to customize the respective ZFS dataset property then ZFS permissions "
1043 "'canmount,mountpoint,readonly,compression,encryption,keylocation,recordsize' can be omitted, arriving "
1044 "at the absolutely minimal set of required destination permissions: "
1045 "`mount,create,receive`.\n\n"
1046 "Also see https://openzfs.github.io/openzfs-docs/man/master/8/zfs-allow.8.html#EXAMPLES and "
1047 "https://tinyurl.com/9h97kh8n and "
1048 "https://youtu.be/o_jr13Z9f1k?si=7shzmIQJpzNJV6cq\n\n")
1049 parser.add_argument(
1050 "--no-stream", action="store_true",
1051 help="During replication, only replicate the most recent selected source snapshot of a dataset (using -i "
1052 "incrementals instead of -I incrementals), hence skip all intermediate source snapshots that may exist "
1053 "between that and the most recent common snapshot. If there is no common snapshot also skip all other "
1054 "source snapshots for the dataset, except for the most recent selected source snapshot. This option helps "
1055 "the destination to 'catch up' with the source ASAP, consuming a minimum of disk space, at the expense "
1056 "of reducing reliable options for rolling back to intermediate snapshots in the future.\n\n")
1057 parser.add_argument(
1058 "--no-resume-recv", action="store_true",
1059 help="Replication of snapshots via 'zfs send/receive' can be interrupted by intermittent network hiccups, "
1060 "reboots, hardware issues, etc. Interrupted 'zfs send/receive' operations are retried if the --retries "
1061 f"and --retry-* options enable it (see above). In normal operation {prog_name} automatically retries "
1062 "such that only the portion of the snapshot is transmitted that has not yet been fully received on the "
1063 "destination. For example, this helps to progressively transfer a large individual snapshot over a "
1064 "wireless network in a timely manner despite frequent intermittent network hiccups. This optimization is "
1065 "called 'resume receive' and uses the 'zfs receive -s' and 'zfs send -t' feature.\n\n"
1066 "The --no-resume-recv option disables this optimization such that a retry now retransmits the entire "
1067 "snapshot from scratch, which could slow down or even prohibit progress in case of frequent network "
1068 f"hiccups. {prog_name} automatically falls back to using the --no-resume-recv option if it is "
1069 "auto-detected that the ZFS pool does not reliably support the 'resume receive' optimization.\n\n"
1070 "*Note:* Snapshots that have already been fully transferred as part of the current 'zfs send/receive' "
1071 "operation need not be retransmitted regardless of the --no-resume-recv flag. For example, assume "
1072 "a single 'zfs send/receive' operation is transferring incremental snapshots 1 through 10 via "
1073 "'zfs send -I', but the operation fails while transferring snapshot 10, then snapshots 1 through 9 "
1074 "need not be retransmitted regardless of the --no-resume-recv flag, as these snapshots have already "
1075 "been successfully received at the destination either way.\n\n")
1076 parser.add_argument(
1077 "--no-create-bookmark", action="store_true",
1078 help=f"For increased safety, in normal operation {prog_name} replication behaves as follows wrt. ZFS bookmark "
1079 "creation, if it is autodetected that the source ZFS pool support bookmarks: "
1080 f"Whenever it has successfully completed replication of the most recent source snapshot, {prog_name} "
1081 "creates a ZFS bookmark of that snapshot and attaches it to the source dataset. "
1082 "Bookmarks exist so an incremental stream can continue to be sent from the source dataset without having "
1083 "to keep the already replicated snapshot around on the source dataset until the next upcoming snapshot "
1084 "has been successfully replicated. This way you can send the snapshot from the source dataset to another "
1085 "host, then bookmark the snapshot on the source dataset, then delete the snapshot from the source "
1086 "dataset to save disk space, and then still incrementally send the next upcoming snapshot from the "
1087 "source dataset to the other host by referring to the bookmark.\n\n"
1088 "The --no-create-bookmark option disables this safety feature but is discouraged, because bookmarks "
1089 "are tiny and relatively cheap and help to ensure that ZFS replication can continue even if source and "
1090 "destination dataset somehow have no common snapshot anymore. "
1091 "For example, if a pruning script has accidentally deleted too many (or even all) snapshots on the "
1092 "source dataset in an effort to reclaim disk space, replication can still proceed because it can use "
1093 "the info in the bookmark (the bookmark must still exist in the source dataset) instead of the info in "
1094 "the metadata of the (now missing) source snapshot.\n\n"
1095 "A ZFS bookmark is a tiny bit of metadata extracted from a ZFS snapshot by the 'zfs bookmark' CLI, and "
1096 "attached to a dataset, much like a ZFS snapshot. Note that a ZFS bookmark does not contain user data; "
1097 "instead a ZFS bookmark is essentially a tiny pointer in the form of the GUID of the snapshot and 64-bit "
1098 "transaction group number of the snapshot and creation time of the snapshot, which is sufficient to tell "
1099 "the destination ZFS pool how to find the destination snapshot corresponding to the source bookmark "
1100 "and (potentially already deleted) source snapshot. A bookmark can be fed into 'zfs send' as the "
1101 "source of an incremental send. Note that while a bookmark allows for its snapshot "
1102 "to be deleted on the source after successful replication, it still requires that its snapshot is not "
1103 "somehow deleted prematurely on the destination dataset, so be mindful of that. "
1104 f"By convention, a bookmark created by {prog_name} has the same name as its corresponding "
1105 "snapshot, the only difference being the leading '#' separator instead of the leading '@' separator. "
1106 "Also see https://www.youtube.com/watch?v=LaNgoAZeTww&t=316s.\n\n"
1107 "You can list bookmarks, like so: "
1108 "`zfs list -t bookmark -o name,guid,createtxg,creation -d 1 $SRC_DATASET`, and you can (and should) "
1109 "periodically prune obsolete bookmarks just like snapshots, like so: "
1110 "`zfs destroy $SRC_DATASET#$BOOKMARK`. Typically, bookmarks should be pruned less aggressively "
1111 "than snapshots, and destination snapshots should be pruned less aggressively than source snapshots. "
1112 "As an example starting point, here is a command that deletes all bookmarks older than "
1113 "90 days, but retains the latest 200 bookmarks (per dataset) regardless of creation time: "
1114 f"`{prog_name} {dummy_dataset} tank2/boo/bar --dryrun --recursive --skip-replication "
1115 "--delete-dst-snapshots=bookmarks --include-snapshot-times-and-ranks notime 'all except latest 200' "
1116 "--include-snapshot-times-and-ranks 'anytime..90 days ago'`\n\n")
1117 parser.add_argument(
1118 "--no-use-bookmark", action="store_true",
1119 help=f"For increased safety, in normal replication operation {prog_name} replication also looks for bookmarks "
1120 "(in addition to snapshots) on the source dataset in order to find the most recent common snapshot wrt. the "
1121 "destination dataset, if it is auto-detected that the source ZFS pool support bookmarks. "
1122 "The --no-use-bookmark option disables this safety feature but is discouraged, because bookmarks help "
1123 "to ensure that ZFS replication can continue even if source and destination dataset somehow have no "
1124 "common snapshot anymore.\n\n"
1125 f"Note that it does not matter whether a bookmark was created by {prog_name} or a third party script, "
1126 "as only the GUID of the bookmark and the GUID of the snapshot is considered for comparison, and ZFS "
1127 "guarantees that any bookmark of a given snapshot automatically has the same GUID, transaction group "
1128 "number and creation time as the snapshot. Also note that you can create, delete and prune bookmarks "
1129 f"any way you like, as {prog_name} (without --no-use-bookmark) will happily work with whatever "
1130 "bookmarks currently exist, if any.\n\n")
1132 ssh_cipher_default = "^aes256-gcm@openssh.com" if platform.system() != "SunOS" else ""
1133 # for speed with confidentiality and integrity
1134 # measure cipher perf like so: count=5000; for i in $(seq 1 3); do echo "iteration $i:"; for cipher in $(ssh -Q cipher); do dd if=/dev/zero bs=1M count=$count 2> /dev/null | ssh -c $cipher -p 40999 127.0.0.1 "(time -p cat) > /dev/null" 2>&1 | grep real | awk -v count=$count -v cipher=$cipher '{print cipher ": " count / $2 " MB/s"}'; done; done
1135 # see https://gbe0.com/posts/linux/server/benchmark-ssh-ciphers/
1136 # and https://crypto.stackexchange.com/questions/43287/what-are-the-differences-between-these-aes-ciphers
1137 parser.add_argument(
1138 "--ssh-cipher", type=str, default=ssh_cipher_default, metavar="STRING",
1139 help="SSH cipher specification for encrypting the session (optional); will be passed into ssh -c CLI. "
1140 "--ssh-cipher is a comma-separated list of ciphers listed in order of preference. See the 'Ciphers' "
1141 "keyword in ssh_config(5) for more information: "
1142 f"https://manpages.ubuntu.com/manpages/man5/sshd_config.5.html. Default: `{ssh_cipher_default}`\n\n")
1144 ssh_private_key_file_default = ".ssh/id_rsa"
1145 locations = ["src", "dst"]
1146 for loc in locations:
1147 parser.add_argument(
1148 f"--ssh-{loc}-private-key", action="append", default=[], metavar="FILE",
1149 help=f"Path to SSH private key file on local host to connect to {loc} (optional); will be passed into "
1150 "ssh -i CLI. This option can be specified multiple times. "
1151 f"default: $HOME/{ssh_private_key_file_default}\n\n")
1152 for loc in locations:
1153 parser.add_argument(
1154 f"--ssh-{loc}-user", type=str, metavar="STRING",
1155 help=f"Remote SSH username on {loc} host to connect to (optional). Overrides username given in "
1156 f"{loc.upper()}_DATASET.\n\n")
1157 for loc in locations:
1158 parser.add_argument(
1159 f"--ssh-{loc}-host", type=str, metavar="STRING",
1160 help=f"Remote SSH hostname of {loc} host to connect to (optional). Can also be an IPv4 or IPv6 address. "
1161 f"Overrides hostname given in {loc.upper()}_DATASET.\n\n")
1162 for loc in locations:
1163 parser.add_argument(
1164 f"--ssh-{loc}-port", type=int, metavar="INT",
1165 help=f"Remote SSH port on {loc} host to connect to (optional).\n\n")
1166 for loc in locations:
1167 parser.add_argument(
1168 f"--ssh-{loc}-extra-opts", type=str, default="", metavar="STRING",
1169 help=f"Additional options to be passed to ssh CLI when connecting to {loc} host (optional). "
1170 "The value is split on runs of one or more whitespace characters. "
1171 f"Example: `--ssh-{loc}-extra-opts='-v -v'` to debug ssh config issues.\n\n")
1172 parser.add_argument(
1173 f"--ssh-{loc}-extra-opt", action="append", default=[], metavar="STRING",
1174 help=f"Additional option to be passed to ssh CLI when connecting to {loc} host (optional). The value "
1175 "can contain spaces and is not split. This option can be specified multiple times. "
1176 f"Example: `--ssh-{loc}-extra-opt='-oProxyCommand=nc %%h %%p'` to disable the TCP_NODELAY "
1177 "socket option for OpenSSH.\n\n")
1178 for loc in locations:
1179 parser.add_argument(
1180 f"--ssh-{loc}-config-file", type=str, metavar="FILE",
1181 help=f"Path to SSH ssh_config(5) file to connect to {loc} (optional); will be passed into ssh -F CLI.\n\n")
1182 threads_default = 100 # percent
1183 parser.add_argument(
1184 "--threads", min=1, default=(threads_default, True), action=CheckPercentRange, metavar="INT[%]",
1185 help="The maximum number of threads to use for parallel operations; can be given as a positive integer, "
1186 f"optionally followed by the %% percent character (min: 1, default: {threads_default}%%). Percentages "
1187 "are relative to the number of CPU cores on the machine. Example: 200%% uses twice as many threads as "
1188 "there are cores on the machine; 75%% uses num_threads = num_cores * 0.75. Currently this option only "
1189 "applies to dataset and snapshot replication, --create-src-snapshots, --delete-dst-snapshots, "
1190 "--delete-empty-dst-datasets, and --compare-snapshot-lists. The ideal value for this parameter depends "
1191 "on the use case and its performance requirements, as well as the number of available CPU cores and the "
1192 "parallelism offered by SSDs vs. HDDs, ZFS topology and configuration, as well as the network bandwidth "
1193 "and other workloads simultaneously running on the system. The current default is geared towards a high "
1194 "degreee of parallelism, and as such may perform poorly on HDDs. Examples: 1, 4, 75%%, 150%%\n\n")
1195 maxsessions_dflt = 8
1196 parser.add_argument(
1197 "--max-concurrent-ssh-sessions-per-tcp-connection", type=int, min=1, default=maxsessions_dflt,
1198 action=CheckRange, metavar="INT",
1199 help=f"For best throughput, {prog_name} uses multiple SSH TCP connections in parallel, as indicated by "
1200 "--threads (see above). For best startup latency, each such parallel TCP connection can carry a "
1201 "maximum of S concurrent SSH sessions, where "
1202 f"S=--max-concurrent-ssh-sessions-per-tcp-connection (default: {maxsessions_dflt}, min: 1). "
1203 "Concurrent SSH sessions are mostly used for metadata operations such as listing ZFS datasets and their "
1204 "snapshots. This client-side max sessions parameter must not be higher than the server-side "
1205 "sshd_config(5) MaxSessions parameter (which defaults to 10, see "
1206 "https://manpages.ubuntu.com/manpages/latest/man5/sshd_config.5.html).\n\n"
1207 f"*Note:* For better throughput, {prog_name} uses one dedicated TCP connection per ZFS "
1208 "send/receive operation such that the dedicated connection is never used by any other "
1209 "concurrent SSH session, effectively ignoring the value of the "
1210 "--max-concurrent-ssh-sessions-per-tcp-connection parameter in the ZFS send/receive case.\n\n")
1211 parser.add_argument(
1212 "--bwlimit", default=None, action=NonEmptyStringAction, metavar="STRING",
1213 help="Sets 'pv' bandwidth rate limit for zfs send/receive data transfer (optional). Example: `100m` to cap "
1214 "throughput at 100 MB/sec. Default is unlimited. Also see "
1215 "https://manpages.ubuntu.com/manpages/latest/en/man1/pv.1.html\n\n")
1216 parser.add_argument(
1217 "--daemon-lifetime", default="0 seconds", metavar="DURATION",
1218 # help="Exit the daemon after this much time has elapsed. Default is '0 seconds', i.e. no daemon mode. "
1219 # "Examples: '600 seconds', '86400 seconds', '1000years'")
1220 help=argparse.SUPPRESS)
1221 daemon_frequency_dflt = "minutely"
1222 parser.add_argument(
1223 "--daemon-frequency", default=daemon_frequency_dflt, metavar="STRING",
1224 # help=f"Run a daemon iteration every N time units. Default is '{daemon_frequency_dflt}'. "
1225 # "Examples: '100 millisecondly', '10secondly, 'minutely' to request the daemon to run every 100 milliseconds, "
1226 # "or every 10 seconds, or every minute, respectively. Only has an effect if --daemon-lifetime is non-zero.")
1227 help=argparse.SUPPRESS)
1228 parser.add_argument(
1229 "--no-estimate-send-size", action="store_true",
1230 help=argparse.SUPPRESS)
1232 def hlp(program: str) -> str:
1233 return f"The name or path to the '{program}' executable (optional). Default is '{program}'. "
1235 msg = f"Use '{disable_prg}' to disable the use of this program.\n\n"
1236 parser.add_argument(
1237 "--compression-program", default="zstd", action=NonEmptyStringAction, metavar="STRING",
1238 help=hlp("zstd") + "Examples: 'lz4', 'pigz', 'gzip', '/opt/bin/zstd'. " + msg.rstrip() + " The use is "
1239 "auto-disabled if data is transferred locally instead of via the network. This "
1240 "option is about transparent compression-on-the-wire, not about compression-at-rest.\n\n")
1241 parser.add_argument(
1242 "--compression-program-opts", default="-1", metavar="STRING",
1243 help="The options to be passed to the compression program on the compression step (optional). "
1244 "Default is '-1' (fastest).\n\n")
1245 parser.add_argument(
1246 "--mbuffer-program", default="mbuffer", action=NonEmptyStringAction, metavar="STRING",
1247 help=hlp("mbuffer") + msg.rstrip() + " The use is auto-disabled if data is transferred locally "
1248 "instead of via the network. This tool is used to smooth out the rate "
1249 "of data flow and prevent bottlenecks caused by network latency or "
1250 "speed fluctuation.\n\n")
1251 mbuffer_program_opts_default = "-q -m 128M"
1252 parser.add_argument(
1253 "--mbuffer-program-opts", default=mbuffer_program_opts_default, metavar="STRING",
1254 help=f"Options to be passed to 'mbuffer' program (optional). Default: '{mbuffer_program_opts_default}'.\n\n")
1255 parser.add_argument(
1256 "--ps-program", default="ps", action=NonEmptyStringAction, metavar="STRING",
1257 help=hlp("ps") + msg)
1258 parser.add_argument(
1259 "--pv-program", default="pv", action=NonEmptyStringAction, metavar="STRING",
1260 help=hlp("pv") + msg.rstrip() + " This is used for bandwidth rate-limiting and progress monitoring.\n\n")
1261 pv_program_opts_default = ("--progress --timer --eta --fineta --rate --average-rate --bytes --interval=1 "
1262 "--width=120 --buffer-size=2M")
1263 parser.add_argument(
1264 "--pv-program-opts", default=pv_program_opts_default, metavar="STRING",
1265 help=f"The options to be passed to the 'pv' program (optional). Default: '{pv_program_opts_default}'.\n\n")
1266 parser.add_argument(
1267 "--shell-program", default="sh", action=NonEmptyStringAction, metavar="STRING",
1268 help=hlp("sh") + msg)
1269 parser.add_argument(
1270 "--ssh-program", default="ssh", action=NonEmptyStringAction, metavar="STRING",
1271 help=hlp("ssh") + "Examples: 'hpnssh' or 'ssh' or '/opt/bin/ssh' or wrapper scripts around 'ssh'. " + msg)
1272 parser.add_argument(
1273 "--sudo-program", default="sudo", action=NonEmptyStringAction, metavar="STRING",
1274 help=hlp("sudo") + msg)
1275 parser.add_argument(
1276 "--zfs-program", default="zfs", action=NonEmptyStringAction, metavar="STRING",
1277 help=hlp("zfs") + "\n\n")
1278 parser.add_argument(
1279 "--zpool-program", default="zpool", action=NonEmptyStringAction, metavar="STRING",
1280 help=hlp("zpool") + msg)
1281 parser.add_argument(
1282 "--log-dir", type=str, metavar="DIR",
1283 help=f"Path to the log output directory on local host (optional). Default: $HOME/{prog_name}-logs. The logger "
1284 "that is used by default writes log files there, in addition to the console. The current.dir symlink "
1285 "always points to the subdirectory containing the most recent log file. The current.log symlink "
1286 "always points to the most recent log file. The current.pv symlink always points to the most recent "
1287 "data transfer monitoring log. Run `tail --follow=name --max-unchanged-stats=1` on both symlinks to "
1288 "follow what's currently going on. Parallel replication generates a separate .pv file per thread. To "
1289 "monitor these, run something like "
1290 "`while true; do clear; for f in $(realpath $HOME/bzfs-logs/current/current.pv)*; "
1291 "do tac -s $(printf '\\r') $f | tr '\\r' '\\n' | grep -m1 -v '^$'; done; sleep 1; done`\n\n")
1292 h_fix = ("The path name of the log file on local host is "
1293 "`${--log-dir}/${--log-file-prefix}<timestamp>${--log-file-infix}${--log-file-suffix}-<random>.log`. "
1294 "Example: `--log-file-prefix=zrun_us-west-1_ --log-file-suffix=_daily` will generate log "
1295 "file names such as `zrun_us-west-1_2024-09-03_12:26:15_daily-bl4i1fth.log`\n\n")
1296 parser.add_argument(
1297 "--log-file-prefix", default="zrun_", action=SafeFileNameAction, metavar="STRING",
1298 help="Default is zrun_. " + h_fix)
1299 parser.add_argument(
1300 "--log-file-infix", default="", action=SafeFileNameAction, metavar="STRING",
1301 help="Default is the empty string. " + h_fix)
1302 parser.add_argument(
1303 "--log-file-suffix", default="", action=SafeFileNameAction, metavar="STRING",
1304 help="Default is the empty string. " + h_fix)
1305 parser.add_argument(
1306 "--log-syslog-address", default=None, action=NonEmptyStringAction, metavar="STRING",
1307 help="Host:port of the syslog machine to send messages to (e.g. 'foo.example.com:514' or '127.0.0.1:514'), or "
1308 "the file system path to the syslog socket file on localhost (e.g. '/dev/log'). The default is no "
1309 "address, i.e. do not log anything to syslog by default. See "
1310 "https://docs.python.org/3/library/logging.handlers.html#sysloghandler\n\n")
1311 parser.add_argument(
1312 "--log-syslog-socktype", choices=["UDP", "TCP"], default="UDP",
1313 help="The socket type to use to connect if no local socket file system path is used. Default is 'UDP'.\n\n")
1314 parser.add_argument(
1315 "--log-syslog-facility", type=int, min=0, max=7, default=1, action=CheckRange, metavar="INT",
1316 help="The local facility aka category that identifies msg sources in syslog (default: 1, min=0, max=7).\n\n")
1317 parser.add_argument(
1318 "--log-syslog-prefix", default=prog_name, action=NonEmptyStringAction, metavar="STRING",
1319 help=f"The name to prepend to each message that is sent to syslog; identifies {prog_name} messages as opposed "
1320 f"to messages from other sources. Default is '{prog_name}'.\n\n")
1321 parser.add_argument(
1322 "--log-syslog-level", choices=["CRITICAL", "ERROR", "WARN", "INFO", "DEBUG", "TRACE"],
1323 default="ERROR",
1324 help="Only send messages with equal or higher priority than this log level to syslog. Default is 'ERROR'.\n\n")
1325 parser.add_argument(
1326 "--log-config-file", default=None, action=NonEmptyStringAction, metavar="STRING",
1327 help="The contents of a JSON file that defines a custom python logging configuration to be used (optional). "
1328 "If the option starts with a `+` prefix then the contents are read from the UTF-8 JSON file given "
1329 "after the `+` prefix. Examples: +log_config.json, +/path/to/log_config.json. "
1330 "Here is an example config file that demonstrates usage: "
1331 "https://github.com/whoschek/bzfs/blob/main/bzfs_tests/log_config.json\n\n"
1332 "For more examples see "
1333 "https://stackoverflow.com/questions/7507825/where-is-a-complete-example-of-logging-config-dictconfig "
1334 "and for details see "
1335 "https://docs.python.org/3/library/logging.config.html#configuration-dictionary-schema\n\n"
1336 "*Note:* Lines starting with a # character are ignored as comments within the JSON. Also, if a line ends "
1337 "with a # character the portion between that # character and the preceding # character on the same line "
1338 "is ignored as a comment.\n\n")
1339 parser.add_argument(
1340 "--log-config-var", action=LogConfigVariablesAction, nargs="+", default=[], metavar="NAME:VALUE",
1341 help="User defined variables in the form of zero or more NAME:VALUE pairs (optional). "
1342 "These variables can be used within the JSON passed with --log-config-file (see above) via "
1343 "`${name[:default]}` references, which are substituted (aka interpolated) as follows:\n\n"
1344 "If the variable contains a non-empty CLI value then that value is used. Else if a default value for the "
1345 "variable exists in the JSON file that default value is used. Else the program aborts with an error. "
1346 "Example: In the JSON variable `${syslog_address:/dev/log}`, the variable name is 'syslog_address' "
1347 "and the default value is '/dev/log'. The default value is the portion after the optional : colon "
1348 "within the variable declaration. The default value is used if the CLI user does not specify a non-empty "
1349 "value via --log-config-var, for example via "
1350 "--log-config-var syslog_address:/path/to/socket_file or via "
1351 "--log-config-var syslog_address:[host,port].\n\n"
1352 f"{prog_name} automatically supplies the following convenience variables: "
1353 "`${bzfs.log_level}`, `${bzfs.log_dir}`, `${bzfs.log_file}`, `${bzfs.sub.logger}`, "
1354 "`${bzfs.get_default_log_formatter}`, `${bzfs.timestamp}`. "
1355 "For a complete list see the source code of get_dict_config_logger().\n\n")
1356 parser.add_argument(
1357 "--include-envvar-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
1358 help="On program startup, unset all Unix environment variables for which the full environment variable "
1359 "name matches at least one of the excludes but none of the includes. If an environment variable is "
1360 "included this decision is never reconsidered because include takes precedence over exclude. "
1361 "The purpose is to tighten security and help guard against accidental inheritance or malicious "
1362 "injection of environment variable values that may have unintended effects.\n\n"
1363 "This option can be specified multiple times. "
1364 "A leading `!` character indicates logical negation, i.e. the regex matches if the regex with the "
1365 "leading `!` character removed does not match. "
1366 "The default is to include no environment variables, i.e. to make no exceptions to "
1367 "--exclude-envvar-regex. "
1368 "Example that retains at least these two env vars: "
1369 "`--include-envvar-regex PATH "
1370 f"--include-envvar-regex {env_var_prefix}min_pipe_transfer_size`. "
1371 "Example that retains all environment variables without tightened security: `'.*'`\n\n")
1372 parser.add_argument(
1373 "--exclude-envvar-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
1374 help="Same syntax as --include-envvar-regex (see above) except that the default is to exclude no "
1375 f"environment variables. Example: `{env_var_prefix}.*`\n\n")
1377 for period, label in {"yearly": "years", "monthly": "months", "weekly": "weeks", "daily": "days", "hourly": "hours",
1378 "minutely": "minutes", "secondly": "seconds", "millisecondly": "milliseconds"}.items():
1379 anchor_group = parser.add_argument_group(
1380 f"{period.title()} period anchors", "Use these options to customize when snapshots that happen "
1381 f"every N {label} are scheduled to be created on the source by the --create-src-snapshots option.")
1382 for f in [f for f in fields(PeriodAnchors) if f.name.startswith(period + "_")]:
1383 _min = f.metadata.get("min")
1384 _max = f.metadata.get("max")
1385 anchor_group.add_argument(
1386 "--" + f.name, type=int, min=_min, max=_max, default=f.default, action=CheckRange, metavar="INT",
1387 help=f"{f.metadata.get('help')} ({_min} ≤ x ≤ {_max}, default: {f.default}).\n\n")
1389 for option_name, flag in zfs_recv_groups.items():
1390 grup = option_name.replace("_", "-") # one of zfs_recv_o, zfs_recv_x
1391 flag = "'" + flag + "'" # one of -o or -x
1393 def h(text: str) -> str:
1394 return argparse.SUPPRESS if option_name == "zfs_set" else text
1396 argument_group = parser.add_argument_group(
1397 grup + " (Experimental)",
1398 description=h(f"The following group of parameters specifies additional zfs receive {flag} options that "
1399 "can be used to configure the copying of ZFS dataset properties from the source dataset to "
1400 "its corresponding destination dataset. The 'zfs-recv-o' group of parameters is applied "
1401 "before the 'zfs-recv-x' group."))
1402 target_choices_items = ["full", "incremental"]
1403 target_choices_default = "+".join(target_choices_items)
1404 target_choices = target_choices_items + [target_choices_default]
1405 qq = "'"
1406 argument_group.add_argument(
1407 f"--{grup}-targets", choices=target_choices, default=target_choices_default,
1408 help=h(f"The zfs send phase or phases during which the extra {flag} options are passed to 'zfs receive'. "
1409 "This can be one of the following choices: "
1410 f"{', '.join([f'{qq}{x}{qq}' for x in target_choices])}. "
1411 f"Default is '{target_choices_default}'. "
1412 "A 'full' send is sometimes also known as an 'initial' send.\n\n"))
1413 msg = "Thus, -x opts do not benefit from source != 'local' (which is the default already)." \
1414 if flag == "'-x'" else ""
1415 argument_group.add_argument(
1416 f"--{grup}-sources", action=NonEmptyStringAction, default="local", metavar="STRING",
1417 help=h("The ZFS sources to provide to the 'zfs get -s' CLI in order to fetch the ZFS dataset properties "
1418 f"that will be fed into the --{grup}-include/exclude-regex filter (see below). The sources are in "
1419 "the form of a comma-separated list (no spaces) containing one or more of the following choices: "
1420 "'local', 'default', 'inherited', 'temporary', 'received', 'none', with the default being 'local'. "
1421 f"Uses 'zfs get -p -s ${grup}-sources all $SRC_DATASET' to fetch the "
1422 "properties to copy - https://openzfs.github.io/openzfs-docs/man/master/8/zfs-get.8.html. P.S: Note "
1423 "that the existing 'zfs send --props' option does not filter and that --props only reads properties "
1424 f"from the 'local' ZFS property source (https://github.com/openzfs/zfs/issues/13024). {msg}\n\n"))
1425 argument_group.add_argument(
1426 f"--{grup}-include-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
1427 help=h(f"Take the output properties of --{grup}-sources (see above) and filter them such that we only "
1428 "retain the properties whose name matches at least one of the --include regexes but none of the "
1429 "--exclude regexes. If a property is excluded this decision is never reconsidered because exclude "
1430 f"takes precedence over include. Append each retained property to the list of {flag} options in "
1431 "--zfs-recv-program-opt(s), unless another '-o' or '-x' option with the same name already exists "
1432 "therein. In other words, --zfs-recv-program-opt(s) takes precedence.\n\n"
1433 f"The --{grup}-include-regex option can be specified multiple times. "
1434 "A leading `!` character indicates logical negation, i.e. the regex matches if the regex with the "
1435 "leading `!` character removed does not match. "
1436 "If the option starts with a `+` prefix then regexes are read from the newline-separated "
1437 "UTF-8 text file given after the `+` prefix, one regex per line inside of the text file.\n\n"
1438 f"The default is to include no properties, thus by default no extra {flag} option is appended. "
1439 f"Example: `--{grup}-include-regex recordsize volblocksize`. "
1440 "More examples: `.*` (include all properties), `foo bar myapp:.*` (include three regexes) "
1441 f"`+{grup}_regexes.txt`, `+/path/to/{grup}_regexes.txt`\n\n"))
1442 argument_group.add_argument(
1443 f"--{grup}-exclude-regex", action=FileOrLiteralAction, nargs="+", default=[], metavar="REGEX",
1444 help=h(f"Same syntax as --{grup}-include-regex (see above), and the default is to exclude no properties. "
1445 f"Example: --{grup}-exclude-regex encryptionroot keystatus origin volblocksize volsize\n\n"))
1446 parser.add_argument(
1447 "--version", action="version", version=f"{prog_name}-{__version__}, by {prog_author}",
1448 help="Display version information and exit.\n\n")
1449 parser.add_argument(
1450 "--help, -h", action="help",
1451 help="Show this help message and exit.\n\n")
1452 return parser
1453 # fmt: on
1456#############################################################################
1457class LogParams:
1458 def __init__(self, args: argparse.Namespace):
1459 """Option values for logging; reads from ArgumentParser via args."""
1460 # immutable variables:
1461 if args.quiet:
1462 self.log_level = "ERROR"
1463 elif args.verbose >= 2:
1464 self.log_level = "TRACE"
1465 elif args.verbose >= 1:
1466 self.log_level = "DEBUG"
1467 else:
1468 self.log_level = "INFO"
1469 self.log_config_file = args.log_config_file
1470 self.log_config_vars = dict(var.split(":", 1) for var in args.log_config_var)
1471 self.timestamp: str = datetime.now().isoformat(sep="_", timespec="seconds") # 2024-09-03_12:26:15
1472 self.home_dir: str = get_home_directory()
1473 log_parent_dir: str = args.log_dir if args.log_dir else os.path.join(self.home_dir, prog_name + "-logs")
1474 self.last_modified_cache_dir = os.path.join(log_parent_dir, ".cache", "last_modified")
1475 self.log_dir: str = os.path.join(log_parent_dir, self.timestamp[0 : self.timestamp.index("_")]) # 2024-09-03
1476 os.makedirs(self.log_dir, exist_ok=True)
1477 self.log_file_prefix = args.log_file_prefix
1478 self.log_file_infix = args.log_file_infix
1479 self.log_file_suffix = args.log_file_suffix
1480 fd, self.log_file = tempfile.mkstemp(
1481 suffix=".log",
1482 prefix=f"{self.log_file_prefix}{self.timestamp}{self.log_file_infix}{self.log_file_suffix}-",
1483 dir=self.log_dir,
1484 )
1485 os.close(fd)
1486 self.pv_log_file = self.log_file[0 : -len(".log")] + ".pv"
1487 Path(self.pv_log_file).touch()
1489 # Create/update "current" symlink to current_dir, which is a subdir containing further symlinks to log files.
1490 # For parallel usage, ensures there is no time window when the symlinks are inconsistent or do not exist.
1491 current = "current"
1492 dot_current_dir = os.path.join(log_parent_dir, f".{current}")
1493 current_dir = os.path.join(dot_current_dir, os.path.basename(self.log_file)[0 : -len(".log")])
1494 os.makedirs(current_dir, exist_ok=True)
1495 create_symlink(self.log_file, current_dir, f"{current}.log")
1496 create_symlink(self.pv_log_file, current_dir, f"{current}.pv")
1497 create_symlink(self.log_dir, current_dir, f"{current}.dir")
1498 dst_file = os.path.join(current_dir, current)
1499 os.symlink(os.path.relpath(current_dir, start=log_parent_dir), dst_file)
1500 os.replace(dst_file, os.path.join(log_parent_dir, current)) # atomic rename
1501 delete_stale_files(dot_current_dir, prefix="", secs=60, dirs=True, exclude=os.path.basename(current_dir))
1502 self.params: Params = None
1504 def __repr__(self) -> str:
1505 return str(self.__dict__)
1508#############################################################################
1509RegexList = List[Tuple[re.Pattern, bool]] # Type alias
1510UnixTimeRange = Optional[Tuple[Union[timedelta, int], Union[timedelta, int]]] # Type alias
1511RankRange = Tuple[Tuple[str, int, bool], Tuple[str, int, bool]] # Type alias
1512Tree = Dict[str, Optional[Dict]] # Type alias
1513RemoteConfCacheItem = namedtuple("RemoteConfCacheItem", ["connection_pools", "available_programs", "zpool_features"])
1516#############################################################################
1517class Params:
1518 def __init__(
1519 self,
1520 args: argparse.Namespace,
1521 sys_argv: Optional[List[str]] = None,
1522 log_params: LogParams = None,
1523 log: Logger = None,
1524 inject_params: Optional[Dict[str, bool]] = None,
1525 ):
1526 """Option values for all aspects; reads from ArgumentParser via args."""
1527 # immutable variables:
1528 assert args is not None
1529 self.args: argparse.Namespace = args
1530 self.sys_argv: List[str] = sys_argv if sys_argv is not None else []
1531 assert isinstance(self.sys_argv, list)
1532 self.log_params: LogParams = log_params
1533 self.log: Logger = log
1534 self.inject_params: Dict[str, bool] = inject_params if inject_params is not None else {} # for testing only
1535 self.one_or_more_whitespace_regex: re.Pattern = re.compile(r"\s+")
1536 self.two_or_more_spaces_regex: re.Pattern = re.compile(r" +")
1537 self.unset_matching_env_vars(args)
1539 assert len(args.root_dataset_pairs) > 0
1540 self.root_dataset_pairs: List[Tuple[str, str]] = args.root_dataset_pairs
1541 self.recursive: bool = args.recursive
1542 self.recursive_flag: str = "-r" if args.recursive else ""
1544 self.dry_run: bool = args.dryrun is not None
1545 self.dry_run_recv: str = "-n" if self.dry_run else ""
1546 self.dry_run_destroy: str = self.dry_run_recv
1547 self.dry_run_no_send: bool = args.dryrun == "send"
1548 self.verbose_zfs: bool = args.verbose >= 2
1549 self.verbose_destroy: str = "" if args.quiet else "-v"
1550 self.quiet: bool = args.quiet
1552 self.zfs_send_program_opts: List[str] = self.fix_send_opts(self.split_args(args.zfs_send_program_opts))
1553 zfs_recv_program_opts: List[str] = self.split_args(args.zfs_recv_program_opts)
1554 for extra_opt in args.zfs_recv_program_opt:
1555 zfs_recv_program_opts.append(self.validate_arg(extra_opt, allow_all=True))
1556 self.zfs_recv_program_opts: List[str] = self.fix_recv_opts(zfs_recv_program_opts)
1557 if self.verbose_zfs:
1558 append_if_absent(self.zfs_send_program_opts, "-v")
1559 append_if_absent(self.zfs_recv_program_opts, "-v")
1560 self.zfs_full_recv_opts: List[str] = self.zfs_recv_program_opts.copy()
1561 cpconfigs = [CopyPropertiesConfig(group, flag, args, self) for group, flag in zfs_recv_groups.items()]
1562 self.zfs_recv_o_config, self.zfs_recv_x_config, self.zfs_set_config = cpconfigs
1564 self.force_rollback_to_latest_snapshot: bool = args.force_rollback_to_latest_snapshot
1565 self.force_rollback_to_latest_common_snapshot = SynchronizedBool(args.force_rollback_to_latest_common_snapshot)
1566 self.force: SynchronizedBool = SynchronizedBool(args.force)
1567 self.force_once: bool = args.force_once
1568 self.force_unmount: str = "-f" if args.force_unmount else ""
1569 self.force_hard: str = "-R" if args.force_destroy_dependents else ""
1570 self.force_hard: str = "-R" if args.force_hard else self.force_hard # --force-hard is deprecated
1572 self.skip_parent: bool = args.skip_parent
1573 self.skip_missing_snapshots: str = args.skip_missing_snapshots
1574 self.skip_on_error: str = args.skip_on_error
1575 self.retry_policy: RetryPolicy = RetryPolicy(args, self)
1576 self.skip_replication: bool = args.skip_replication
1577 self.delete_dst_snapshots: bool = args.delete_dst_snapshots is not None
1578 self.delete_dst_bookmarks: bool = args.delete_dst_snapshots == "bookmarks"
1579 self.delete_dst_snapshots_no_crosscheck: bool = args.delete_dst_snapshots_no_crosscheck
1580 self.delete_dst_snapshots_except: bool = args.delete_dst_snapshots_except
1581 self.delete_dst_datasets: bool = args.delete_dst_datasets
1582 self.delete_empty_dst_datasets: bool = args.delete_empty_dst_datasets is not None
1583 self.delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots: bool = (
1584 args.delete_empty_dst_datasets == "snapshots+bookmarks"
1585 )
1586 self.compare_snapshot_lists: Optional[str] = args.compare_snapshot_lists
1587 self.daemon_lifetime_nanos: int = 1_000_000 * parse_duration_to_milliseconds(args.daemon_lifetime)
1588 self.daemon_frequency: str = args.daemon_frequency
1589 self.enable_privilege_elevation: bool = not args.no_privilege_elevation
1590 self.no_stream: bool = args.no_stream
1591 self.resume_recv: bool = not args.no_resume_recv
1592 self.create_bookmark: bool = not args.no_create_bookmark
1593 self.use_bookmark: bool = not args.no_use_bookmark
1595 self.src: Remote = Remote("src", args, self) # src dataset, host and ssh options
1596 self.dst: Remote = Remote("dst", args, self) # dst dataset, host and ssh options
1597 self.create_src_snapshots_config: CreateSrcSnapshotConfig = CreateSrcSnapshotConfig(args, self)
1599 self.compression_program: str = self.program_name(args.compression_program)
1600 self.compression_program_opts: List[str] = self.split_args(args.compression_program_opts)
1601 self.getconf_program: str = self.program_name("getconf") # print number of CPUs on POSIX except Solaris
1602 self.psrinfo_program: str = self.program_name("psrinfo") # print number of CPUs on Solaris
1603 self.mbuffer_program: str = self.program_name(args.mbuffer_program)
1604 self.mbuffer_program_opts: List[str] = self.split_args(args.mbuffer_program_opts)
1605 self.ps_program: str = self.program_name(args.ps_program)
1606 self.pv_program: str = self.program_name(args.pv_program)
1607 self.pv_program_opts: List[str] = self.split_args(args.pv_program_opts)
1608 self.isatty: bool = getenv_bool("isatty", True)
1609 if args.bwlimit:
1610 self.pv_program_opts += [f"--rate-limit={self.validate_arg(args.bwlimit)}"]
1611 self.shell_program_local: str = "sh"
1612 self.shell_program: str = self.program_name(args.shell_program)
1613 self.ssh_program: str = self.program_name(args.ssh_program)
1614 self.sudo_program: str = self.program_name(args.sudo_program)
1615 self.uname_program: str = self.program_name("uname")
1616 self.zfs_program: str = self.program_name(args.zfs_program)
1617 self.zpool_program: str = self.program_name(args.zpool_program)
1619 # no point creating complex shell pipeline commands for tiny data transfers:
1620 self.min_pipe_transfer_size: int = getenv_int("min_pipe_transfer_size", 1024 * 1024)
1621 self.max_datasets_per_batch_on_list_snaps = getenv_int("max_datasets_per_batch_on_list_snaps", 1024)
1622 self.max_datasets_per_minibatch_on_list_snaps = getenv_int("max_datasets_per_minibatch_on_list_snaps", -1)
1623 self.max_snapshots_per_minibatch_on_delete_snaps = getenv_int("max_snapshots_per_minibatch_on_delete_snaps", 2**29)
1624 self.dedicated_tcp_connection_per_zfs_send = getenv_bool("dedicated_tcp_connection_per_zfs_send", True)
1625 self.threads: Tuple[int, bool] = args.threads
1626 self.no_estimate_send_size: bool = args.no_estimate_send_size
1628 self.terminal_columns: int = (
1629 getenv_int("terminal_columns", shutil.get_terminal_size(fallback=(120, 24)).columns)
1630 if self.isatty and self.pv_program != disable_prg and not self.quiet
1631 else 0
1632 )
1634 self.os_cpu_count: int = os.cpu_count()
1635 self.os_geteuid: int = os.geteuid()
1636 self.prog_version: str = __version__
1637 self.python_version: str = sys.version
1638 self.platform_version: str = platform.version()
1639 self.platform_platform: str = platform.platform()
1641 # mutable variables:
1642 snapshot_filters = args.snapshot_filters_var if hasattr(args, snapshot_filters_var) else [[]]
1643 self.snapshot_filters: List[List[SnapshotFilter]] = [optimize_snapshot_filters(f) for f in snapshot_filters]
1644 self.exclude_dataset_property: Optional[str] = args.exclude_dataset_property
1645 self.exclude_dataset_regexes: RegexList = [] # deferred to validate_task() phase
1646 self.include_dataset_regexes: RegexList = [] # deferred to validate_task() phase
1647 self.tmp_exclude_dataset_regexes: RegexList = [] # deferred to validate_task() phase
1648 self.tmp_include_dataset_regexes: RegexList = [] # deferred to validate_task() phase
1649 self.abs_exclude_datasets: List[str] = [] # deferred to validate_task() phase
1650 self.abs_include_datasets: List[str] = [] # deferred to validate_task() phase
1652 self.curr_zfs_send_program_opts: List[str] = []
1653 self.zfs_recv_ox_names: Set[str] = set()
1654 self.available_programs: Dict[str, Dict[str, str]] = {}
1655 self.zpool_features: Dict[str, Dict[str, str]] = {}
1656 self.connection_pools = {}
1658 def split_args(self, text: str, *items, allow_all: bool = False) -> List[str]:
1659 """Splits option string on runs of one or more whitespace into an option list."""
1660 text = text.strip()
1661 opts = self.one_or_more_whitespace_regex.split(text) if text else []
1662 xappend(opts, items)
1663 if not allow_all:
1664 self.validate_quoting(opts)
1665 return opts
1667 def validate_arg(self, opt: str, allow_spaces: bool = False, allow_all: bool = False) -> Optional[str]:
1668 """allow_all permits all characters, including whitespace and quotes. See squote() and dquote()."""
1669 if allow_all or opt is None:
1670 return opt
1671 if any(char.isspace() and (char != " " or not allow_spaces) for char in opt):
1672 die(f"Option must not contain a whitespace character {'other than space' if allow_spaces else ''} : {opt}")
1673 self.validate_quoting([opt])
1674 return opt
1676 @staticmethod
1677 def validate_quoting(opts: List[str]) -> None:
1678 for opt in opts:
1679 if "'" in opt or '"' in opt or "`" in opt:
1680 die(f"Option must not contain a single quote or double quote or backtick character: {opt}")
1682 @staticmethod
1683 def fix_recv_opts(opts: List[str]) -> List[str]:
1684 return fix_send_recv_opts(
1685 opts, exclude_long_opts={"--dryrun"}, exclude_short_opts="n", include_arg_opts={"-o", "-x"}
1686 )
1688 @staticmethod
1689 def fix_send_opts(opts: List[str]) -> List[str]:
1690 return fix_send_recv_opts(
1691 opts,
1692 exclude_long_opts={"--dryrun"},
1693 exclude_short_opts="den",
1694 include_arg_opts={"-X", "--exclude", "--redact"},
1695 exclude_arg_opts={"-i", "-I"},
1696 )
1698 def program_name(self, program: str) -> str:
1699 """For testing: helps simulate errors caused by external programs."""
1700 self.validate_arg(program)
1701 if not program:
1702 die("Program name must not be the empty string")
1703 if self.inject_params.get("inject_unavailable_" + program, False):
1704 return program + "-xxx" # substitute a program that cannot be found on the PATH
1705 if self.inject_params.get("inject_failing_" + program, False):
1706 return "false" # substitute a program that will error out with non-zero return code
1707 return program
1709 def unset_matching_env_vars(self, args: argparse.Namespace) -> None:
1710 exclude_envvar_regexes = compile_regexes(args.exclude_envvar_regex)
1711 include_envvar_regexes = compile_regexes(args.include_envvar_regex)
1712 for envvar_name in list(os.environ.keys()):
1713 if is_included(envvar_name, exclude_envvar_regexes, include_envvar_regexes):
1714 os.environ.pop(envvar_name, None)
1715 self.log.debug("Unsetting b/c envvar regex: %s", envvar_name)
1717 def lock_file_name(self) -> str:
1718 """Makes it such that a job that runs periodically declines to start if the same previous periodic
1719 job is still running without completion yet."""
1720 # fmt: off
1721 key = (tuple(self.root_dataset_pairs), self.args.recursive, self.args.exclude_dataset_property,
1722 tuple(self.args.include_dataset), tuple(self.args.exclude_dataset),
1723 tuple(self.args.include_dataset_regex), tuple(self.args.exclude_dataset_regex),
1724 tuple(tuple(f) for f in self.snapshot_filters), self.args.skip_replication, self.args.create_src_snapshots,
1725 self.args.create_src_snapshots_plan, self.args.create_src_snapshots_timeformat,
1726 self.create_src_snapshots_config.anchors,
1727 self.args.delete_dst_datasets, self.args.delete_dst_snapshots, self.args.delete_dst_snapshots_except,
1728 self.args.delete_empty_dst_datasets,
1729 self.src.basis_ssh_host, self.dst.basis_ssh_host,
1730 self.src.basis_ssh_user, self.dst.basis_ssh_user)
1731 # fmt: on
1732 hash_code = hashlib.sha256(str(key).encode("utf-8")).hexdigest()
1733 return os.path.join(tempfile.gettempdir(), f"{prog_name}-lockfile-{hash_code}.lock")
1735 def dry(self, msg: str) -> str:
1736 return "Dry " + msg if self.dry_run else msg
1739#############################################################################
1740class Remote:
1741 def __init__(self, loc: str, args: argparse.Namespace, p: Params):
1742 """Option values for either location=='src' or location=='dst'; reads from ArgumentParser via args."""
1743 # immutable variables:
1744 assert loc == "src" or loc == "dst"
1745 self.location: str = loc
1746 self.params = p
1747 self.basis_ssh_user: str = getattr(args, f"ssh_{loc}_user")
1748 self.basis_ssh_host: str = getattr(args, f"ssh_{loc}_host")
1749 self.ssh_port: int = getattr(args, f"ssh_{loc}_port")
1750 self.ssh_config_file: str = p.validate_arg(getattr(args, f"ssh_{loc}_config_file"))
1751 self.ssh_cipher: str = p.validate_arg(args.ssh_cipher)
1752 self.ssh_private_key_files: List[str] = [p.validate_arg(key) for key in getattr(args, f"ssh_{loc}_private_key")]
1753 # disable interactive password prompts and X11 forwarding and pseudo-terminal allocation:
1754 self.ssh_extra_opts: List[str] = ["-oBatchMode=yes", "-oServerAliveInterval=0", "-x", "-T"]
1755 self.ssh_extra_opts += p.split_args(getattr(args, f"ssh_{loc}_extra_opts"))
1756 for extra_opt in getattr(args, f"ssh_{loc}_extra_opt"):
1757 self.ssh_extra_opts.append(p.validate_arg(extra_opt, allow_spaces=True))
1758 self.max_concurrent_ssh_sessions_per_tcp_connection: int = args.max_concurrent_ssh_sessions_per_tcp_connection
1759 self.reuse_ssh_connection: bool = getenv_bool("reuse_ssh_connection", True)
1760 if self.reuse_ssh_connection:
1761 self.ssh_socket_dir: str = os.path.join(get_home_directory(), ".ssh", "bzfs")
1762 os.makedirs(os.path.dirname(self.ssh_socket_dir), exist_ok=True)
1763 os.makedirs(self.ssh_socket_dir, mode=stat.S_IRWXU, exist_ok=True) # aka chmod u=rwx,go=
1764 self.socket_prefix = "s"
1765 delete_stale_files(self.ssh_socket_dir, self.socket_prefix)
1766 self.sanitize1_regex = re.compile(r"[\s\\/@$]") # replace whitespace, /, $, \, @ with a ~ tilde char
1767 self.sanitize2_regex = re.compile(r"[^a-zA-Z0-9;:,<.>?~`!%#$^&*+=_-]") # Remove chars not in the allowed set
1769 # mutable variables:
1770 self.root_dataset: str = "" # deferred until run_main()
1771 self.basis_root_dataset: str = "" # deferred until run_main()
1772 self.pool: str = ""
1773 self.sudo: str = ""
1774 self.use_zfs_delegation: bool = False
1775 self.ssh_user: str = ""
1776 self.ssh_host: str = ""
1777 self.ssh_user_host: str = ""
1779 def local_ssh_command(self) -> List[str]:
1780 """Returns the ssh CLI command to run locally in order to talk to the remote host. This excludes the (trailing)
1781 command to run on the remote host, which will be appended later."""
1782 if self.ssh_user_host == "":
1783 return [] # dataset is on local host - don't use ssh
1785 # dataset is on remote host
1786 p = self.params
1787 if p.ssh_program == disable_prg:
1788 die("Cannot talk to remote host because ssh CLI is disabled.")
1789 ssh_cmd = [p.ssh_program] + self.ssh_extra_opts
1790 if self.ssh_config_file:
1791 ssh_cmd += ["-F", self.ssh_config_file]
1792 for ssh_private_key_file in self.ssh_private_key_files:
1793 ssh_cmd += ["-i", ssh_private_key_file]
1794 if self.ssh_cipher:
1795 ssh_cmd += ["-c", self.ssh_cipher]
1796 if self.ssh_port:
1797 ssh_cmd += ["-p", str(self.ssh_port)]
1798 if self.reuse_ssh_connection:
1799 # Performance: reuse ssh connection for low latency startup of frequent ssh invocations via the 'ssh -S' and
1800 # 'ssh -S -M -oControlPersist=60s' options. See https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Multiplexing
1801 # Generate unique private Unix domain socket file name in user's home dir and pass it to 'ssh -S /path/to/socket'
1802 def sanitize(name: str) -> str:
1803 name = self.sanitize1_regex.sub("~", name) # replace whitespace, /, $, \, @ with a ~ tilde char
1804 name = self.sanitize2_regex.sub("", name) # Remove chars not in the allowed set
1805 return name
1807 unique = f"{os.getpid()}@{time.time_ns()}@{random.SystemRandom().randint(0, 999_999_999_999)}"
1808 socket_name = f"{self.socket_prefix}{unique}@{sanitize(self.ssh_host)[:45]}@{sanitize(self.ssh_user)}"
1809 socket_file = os.path.join(self.ssh_socket_dir, socket_name)[: max(100, len(self.ssh_socket_dir) + 10)]
1810 ssh_cmd += ["-S", socket_file]
1811 ssh_cmd += [self.ssh_user_host]
1812 return ssh_cmd
1814 def cache_key(self) -> Tuple:
1815 # fmt: off
1816 return (self.location, self.pool, self.ssh_user_host, self.ssh_port, self.ssh_config_file, self.ssh_cipher,
1817 tuple(self.ssh_private_key_files), tuple(self.ssh_extra_opts))
1818 # fmt: on
1820 def __repr__(self) -> str:
1821 return str(self.__dict__)
1824#############################################################################
1825class CopyPropertiesConfig:
1826 def __init__(self, group: str, flag: str, args: argparse.Namespace, p: Params):
1827 """Option values for --zfs-recv-o* and --zfs-recv-x* option groups; reads from ArgumentParser via args."""
1828 # immutable variables:
1829 grup = group
1830 self.group: str = group
1831 self.flag: str = flag # one of -o or -x
1832 sources: str = p.validate_arg(getattr(args, f"{grup}_sources"))
1833 self.sources: str = ",".join(sorted([s.strip() for s in sources.strip().split(",")])) # canonicalize
1834 self.targets: str = p.validate_arg(getattr(args, f"{grup}_targets"))
1835 self.include_regexes: RegexList = compile_regexes(getattr(args, f"{grup}_include_regex"))
1836 self.exclude_regexes: RegexList = compile_regexes(getattr(args, f"{grup}_exclude_regex"))
1838 def __repr__(self) -> str:
1839 return str(self.__dict__)
1842#############################################################################
1843class RetryPolicy:
1844 def __init__(self, args: argparse.Namespace, p: Params):
1845 """Option values for retries; reads from ArgumentParser via args."""
1846 # immutable variables:
1847 self.retries: int = args.retries
1848 self.min_sleep_secs: float = args.retry_min_sleep_secs
1849 self.max_sleep_secs: float = args.retry_max_sleep_secs
1850 self.max_elapsed_secs: float = args.retry_max_elapsed_secs
1851 self.min_sleep_nanos: int = int(self.min_sleep_secs * 1_000_000_000)
1852 self.max_sleep_nanos: int = int(self.max_sleep_secs * 1_000_000_000)
1853 self.max_elapsed_nanos: int = int(self.max_elapsed_secs * 1_000_000_000)
1854 self.min_sleep_nanos = max(1, self.min_sleep_nanos)
1855 self.max_sleep_nanos = max(self.min_sleep_nanos, self.max_sleep_nanos)
1857 def __repr__(self) -> str:
1858 return (
1859 f"retries: {self.retries}, min_sleep_secs: {self.min_sleep_secs}, "
1860 f"max_sleep_secs: {self.max_sleep_secs}, max_elapsed_secs: {self.max_elapsed_secs}"
1861 )
1864#############################################################################
1865@dataclass
1866class Retry:
1867 count: int
1870#############################################################################
1871@dataclass(frozen=True, order=True)
1872class SnapshotLabel:
1873 """Contains the individual parts that are concatenated into a ZFS snapshot name."""
1875 prefix: str # bzfs_
1876 infix: str # us-west-1_
1877 timestamp: str # 2024-11-06_08:30:05
1878 suffix: str # _hourly
1880 def __str__(self) -> str: # bzfs_us-west-1_2024-11-06_08:30:05_hourly
1881 return f"{self.prefix}{self.infix}{self.timestamp}{self.suffix}"
1883 def validate_label(self, input_text: str) -> None:
1884 name = str(self)
1885 validate_dataset_name(name, input_text)
1886 if "/" in name:
1887 die(f"Invalid ZFS snapshot name: '{name}' for: '{input_text}*'")
1888 for key, value in {"prefix": self.prefix, "infix": self.infix, "suffix": self.suffix}.items():
1889 if key == "prefix":
1890 if not value.endswith("_"):
1891 die(f"Invalid {input_text}{key}: Must end with an underscore character: '{value}'")
1892 if value.count("_") > 1:
1893 die(f"Invalid {input_text}{key}: Must not contain multiple underscore characters: '{value}'")
1894 elif key == "infix":
1895 if value:
1896 if not value.endswith("_"):
1897 die(f"Invalid {input_text}{key}: Must end with an underscore character: '{value}'")
1898 if value.count("_") > 1:
1899 die(f"Invalid {input_text}{key}: Must not contain multiple underscore characters: '{value}'")
1900 elif value:
1901 if not value.startswith("_"):
1902 die(f"Invalid {input_text}{key}: Must start with an underscore character: '{value}'")
1903 if value.count("_") > 1:
1904 die(f"Invalid {input_text}{key}: Must not contain multiple underscore characters: '{value}'")
1907#############################################################################
1908class SnapshotPeriods:
1909 def __init__(self):
1910 self.suffix_milliseconds = {
1911 "yearly": 365 * 86400 * 1000,
1912 "monthly": round(30.5 * 86400 * 1000),
1913 "weekly": 7 * 86400 * 1000,
1914 "daily": 86400 * 1000,
1915 "hourly": 60 * 60 * 1000,
1916 "minutely": 60 * 1000,
1917 "secondly": 1000,
1918 "millisecondly": 1,
1919 }
1920 self.period_labels = {
1921 "yearly": "years",
1922 "monthly": "months",
1923 "weekly": "weeks",
1924 "daily": "days",
1925 "hourly": "hours",
1926 "minutely": "minutes",
1927 "secondly": "seconds",
1928 "millisecondly": "milliseconds",
1929 }
1930 self._suffix_regex0 = re.compile(rf"([1-9][0-9]*)?({'|'.join(self.suffix_milliseconds.keys())})")
1931 self._suffix_regex1 = re.compile("_" + self._suffix_regex0.pattern)
1933 def suffix_to_duration0(self, suffix: str) -> Tuple[int, str]:
1934 return self._suffix_to_duration(suffix, self._suffix_regex0)
1936 def suffix_to_duration1(self, suffix: str) -> Tuple[int, str]:
1937 return self._suffix_to_duration(suffix, self._suffix_regex1)
1939 @staticmethod
1940 def _suffix_to_duration(suffix: str, regex: re.Pattern) -> Tuple[int, str]:
1941 match = regex.fullmatch(suffix)
1942 if match:
1943 duration_amount = int(match.group(1)) if match.group(1) else 1
1944 assert duration_amount > 0
1945 duration_unit = match.group(2)
1946 return duration_amount, duration_unit
1947 else:
1948 return 0, ""
1951#############################################################################
1952class CreateSrcSnapshotConfig:
1953 def __init__(self, args: argparse.Namespace, p: Params):
1954 """Option values for --create-src-snapshots*; reads from ArgumentParser via args."""
1955 # immutable variables:
1956 self.skip_create_src_snapshots: bool = not args.create_src_snapshots
1957 self.create_src_snapshots_even_if_not_due: bool = args.create_src_snapshots_even_if_not_due
1958 self.enable_snapshots_changed_cache: bool = args.create_src_snapshots_enable_snapshots_changed_cache
1959 tz_spec: str = args.create_src_snapshots_timezone if args.create_src_snapshots_timezone else None
1960 self.tz: tzinfo = get_timezone(tz_spec)
1961 self.current_datetime: datetime = current_datetime(tz_spec)
1962 self.timeformat: str = args.create_src_snapshots_timeformat
1963 self.anchors: PeriodAnchors = PeriodAnchors().parse(args)
1965 suffixes: List[str] = []
1966 labels = []
1967 create_src_snapshots_plan = args.create_src_snapshots_plan or str({"bzfs": {"onsite": {"adhoc": 1}}})
1968 for org, target_periods in ast.literal_eval(create_src_snapshots_plan).items():
1969 for target, periods in target_periods.items():
1970 for period_unit, period_amount in periods.items(): # e.g. period_unit can be "10minutely" or "minutely"
1971 if not isinstance(period_amount, int) or period_amount < 0:
1972 die(f"--create-src-snapshots-plan: Period amount must be a non-negative integer: {period_amount}")
1973 if period_amount > 0:
1974 suffix = nsuffix(period_unit)
1975 suffixes.append(suffix)
1976 labels.append(SnapshotLabel(prefix=org + "_", infix=ninfix(target), timestamp="", suffix=suffix))
1977 xperiods = SnapshotPeriods()
1978 if self.skip_create_src_snapshots:
1979 duration_amount, duration_unit = xperiods.suffix_to_duration0(p.daemon_frequency)
1980 if duration_amount <= 0 or not duration_unit:
1981 die(f"Invalid --daemon-frequency: {p.daemon_frequency}")
1982 suffixes = [nsuffix(p.daemon_frequency)]
1983 labels = []
1984 suffix_durations = {suffix: xperiods.suffix_to_duration1(suffix) for suffix in suffixes}
1986 def suffix_key(suffix: str):
1987 duration_amount, duration_unit = suffix_durations[suffix]
1988 duration_milliseconds = duration_amount * xperiods.suffix_milliseconds.get(duration_unit, 0)
1989 if suffix.endswith("hourly") or suffix.endswith("minutely") or suffix.endswith("secondly"):
1990 if duration_milliseconds != 0 and 86400 * 1000 % duration_milliseconds != 0:
1991 die(
1992 "Invalid --create-src-snapshots-plan: Period duration should be a divisor of 86400 seconds "
1993 f"without remainder so that snapshots will be created at the same time of day every day: {suffix}"
1994 )
1995 return duration_milliseconds, suffix
1997 suffixes = sorted(suffixes, key=suffix_key, reverse=True) # take snapshots for dailies before hourlies, and so on
1998 self.suffix_durations: Dict[str, Tuple[int, str]] = {suffix: suffix_durations[suffix] for suffix in suffixes} # sort
1999 suffix_indexes = {suffix: k for k, suffix in enumerate(suffixes)}
2000 labels.sort(key=lambda label: (suffix_indexes[label.suffix], label)) # take snapshots for dailies before hourlies
2001 self._snapshot_labels: List[SnapshotLabel] = labels
2002 for label in self.snapshot_labels():
2003 label.validate_label("--create-src-snapshots-plan ")
2005 def snapshot_labels(self) -> List[SnapshotLabel]:
2006 timeformat = self.timeformat
2007 is_millis = timeformat.endswith("%F") # non-standard hack to append milliseconds
2008 if is_millis:
2009 timeformat = timeformat[0:-1] + "f" # replace %F with %f (append microseconds)
2010 timestamp: str = self.current_datetime.strftime(timeformat)
2011 if is_millis:
2012 timestamp = timestamp[0 : -len("000")] # replace microseconds with milliseconds
2013 timestamp = timestamp.replace("+", "z") # zfs CLI does not accept the '+' character in snapshot names
2014 return [SnapshotLabel(label.prefix, label.infix, timestamp, label.suffix) for label in self._snapshot_labels]
2016 def __repr__(self) -> str:
2017 return str(self.__dict__)
2020#############################################################################
2021def main() -> None:
2022 """API for command line clients."""
2023 try:
2024 run_main(argument_parser().parse_args(), sys.argv)
2025 except subprocess.CalledProcessError as e:
2026 sys.exit(e.returncode)
2029def run_main(args: argparse.Namespace, sys_argv: Optional[List[str]] = None, log: Optional[Logger] = None) -> None:
2030 """API for Python clients; visible for testing; may become a public API eventually."""
2031 Job().run_main(args, sys_argv, log)
2034#############################################################################
2035class Job:
2036 def __init__(self):
2037 self.params: Params
2038 self.all_dst_dataset_exists: Dict[str, Dict[str, bool]] = defaultdict(lambda: defaultdict(bool))
2039 self.dst_dataset_exists: SynchronizedDict[str, bool] = SynchronizedDict({})
2040 self.src_properties: Dict[str, Dict[str, str | int]] = {}
2041 self.all_exceptions: List[str] = []
2042 self.all_exceptions_count = 0
2043 self.max_exceptions_to_summarize = 10000
2044 self.first_exception: Optional[BaseException] = None
2045 self.remote_conf_cache: Dict[Tuple, RemoteConfCacheItem] = {}
2046 self.dedicated_tcp_connection_per_zfs_send: bool = True
2047 self.max_datasets_per_minibatch_on_list_snaps: Dict[str, int] = {}
2048 self.max_workers: Dict[str, int] = {}
2049 self.re_suffix = r"(?:/.*)?" # also match descendants of a matching dataset
2050 self.stats_lock = threading.Lock()
2051 self.num_snapshots_found: int = 0
2052 self.num_snapshots_replicated: int = 0
2053 self.control_persist_secs: int = 90
2054 self.control_persist_margin_secs: int = 2
2055 self.progress_reporter: ProgressReporter = None
2056 self.is_first_replication_task: SynchronizedBool = SynchronizedBool(True)
2057 self.replication_start_time_nanos: int = time.time_ns()
2059 self.is_test_mode: bool = False # for testing only
2060 self.creation_prefix = "" # for testing only
2061 self.isatty: Optional[bool] = None # for testing only
2062 self.use_select: bool = False # for testing only
2063 self.progress_update_intervals: Optional[Tuple[float, float]] = None # for testing only
2064 self.error_injection_triggers: Dict[str, Counter] = {} # for testing only
2065 self.delete_injection_triggers: Dict[str, Counter] = {} # for testing only
2066 self.param_injection_triggers: Dict[str, Dict[str, bool]] = {} # for testing only
2067 self.inject_params: Dict[str, bool] = {} # for testing only
2068 self.injection_lock = threading.Lock() # for testing only
2069 self.max_command_line_bytes: Optional[int] = None # for testing only
2071 def cleanup(self):
2072 """Exit any multiplexed ssh sessions that may be leftover."""
2073 cache_items = self.remote_conf_cache.values()
2074 for i, cache_item in enumerate(cache_items):
2075 cache_item.connection_pools.shutdown(f"{i + 1}/{len(cache_items)}")
2077 def terminate(self, except_current_process=False):
2078 try:
2079 self.cleanup()
2080 finally:
2081 terminate_process_group(except_current_process=except_current_process)
2083 def run_main(self, args: argparse.Namespace, sys_argv: Optional[List[str]] = None, log: Optional[Logger] = None):
2084 assert isinstance(self.error_injection_triggers, dict)
2085 assert isinstance(self.delete_injection_triggers, dict)
2086 assert isinstance(self.inject_params, dict)
2087 log_params = LogParams(args)
2088 try:
2089 log = get_logger(log_params, args, log)
2090 log.info("%s", "Log file is: " + log_params.log_file)
2091 aux_args = []
2092 if getattr(args, "include_snapshot_plan", None):
2093 aux_args += args.include_snapshot_plan
2094 if getattr(args, "delete_dst_snapshots_except_plan", None):
2095 aux_args += args.delete_dst_snapshots_except_plan
2096 if len(aux_args) > 0:
2097 log.info("Auxiliary CLI arguments: %s", " ".join(aux_args))
2098 args = argument_parser().parse_args(xappend(aux_args, "--", args.root_dataset_pairs), namespace=args)
2099 log.info("CLI arguments: %s %s", " ".join(sys_argv or []), f"[euid: {os.geteuid()}]")
2100 log.debug("Parsed CLI arguments: %s", args)
2101 try:
2102 self.params = p = Params(args, sys_argv, log_params, log, self.inject_params)
2103 except SystemExit as e:
2104 log.error("%s", str(e))
2105 raise
2106 log_params.params = p
2107 with open(log_params.log_file, "a", encoding="utf-8") as log_file_fd:
2108 with redirect_stderr(Tee(log_file_fd, sys.stderr)): # send stderr to both logfile and stderr
2109 lock_file = p.lock_file_name()
2110 with open(lock_file, "w") as lock_fd:
2111 try:
2112 # Acquire an exclusive lock; will raise an error if lock is already held by another process.
2113 # The (advisory) lock is auto-released when the process terminates or the fd is closed.
2114 fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) # LOCK_NB ... non-blocking
2115 except BlockingIOError as e:
2116 msg = "Exiting as same previous periodic job is still running without completion yet"
2117 log.error(f"{msg} per %s", lock_file)
2118 raise SystemExit(still_running_status) from e
2119 try:
2120 # On CTRL-C send signal to the entire process group to also terminate child processes
2121 old_sigint_handler = signal.signal(signal.SIGINT, lambda signum, frame: self.terminate())
2122 try:
2123 self.run_tasks()
2124 except BaseException as e:
2125 self.terminate(except_current_process=True)
2126 raise e
2127 finally:
2128 signal.signal(signal.SIGINT, old_sigint_handler) # restore original signal handler
2129 for i in range(2 if self.max_command_line_bytes else 1):
2130 self.cleanup()
2131 finally:
2132 unlink_missing_ok(lock_file) # avoid accumulation of stale lock files
2133 finally:
2134 reset_logger()
2136 def run_tasks(self) -> None:
2137 def log_error_on_exit(error, status_code):
2138 log.error("%s%s", f"Exiting {prog_name} with status code {status_code}. Cause: ", error)
2140 p, log = self.params, self.params.log
2141 try:
2142 self.all_exceptions = []
2143 self.all_exceptions_count = 0
2144 self.first_exception = None
2145 self.remote_conf_cache = {}
2146 self.isatty = self.isatty if self.isatty is not None else p.isatty
2147 self.validate_once()
2148 self.replication_start_time_nanos = time.time_ns()
2149 self.progress_reporter = ProgressReporter(p, self.use_select, self.progress_update_intervals)
2150 try:
2151 daemon_stoptime_nanos = time.time_ns() + p.daemon_lifetime_nanos
2152 while True:
2153 self.progress_reporter.reset()
2154 src, dst = p.src, p.dst
2155 for src_root_dataset, dst_root_dataset in p.root_dataset_pairs:
2156 src.root_dataset = src.basis_root_dataset = src_root_dataset
2157 dst.root_dataset = dst.basis_root_dataset = dst_root_dataset
2158 p.curr_zfs_send_program_opts = p.zfs_send_program_opts.copy()
2159 task_description = f"{src.basis_root_dataset} {p.recursive_flag} --> {dst.basis_root_dataset}"
2160 if len(p.root_dataset_pairs) > 1:
2161 log.info("Starting task: %s", task_description + " ...")
2162 try:
2163 try:
2164 self.maybe_inject_error(cmd=[], error_trigger="retryable_run_tasks")
2165 self.validate_task()
2166 self.run_task()
2167 except RetryableError as retryable_error:
2168 raise retryable_error.__cause__
2169 except (CalledProcessError, TimeoutExpired, SystemExit, UnicodeDecodeError) as e:
2170 if p.skip_on_error == "fail":
2171 raise
2172 log.error("%s", str(e))
2173 self.append_exception(e, "task", task_description)
2174 if not self.sleep_until_next_daemon_iteration(daemon_stoptime_nanos):
2175 break
2176 finally:
2177 self.progress_reporter.stop()
2178 if not p.skip_replication:
2179 self.print_replication_stats(self.replication_start_time_nanos)
2180 error_count = self.all_exceptions_count
2181 if error_count > 0:
2182 msgs = "\n".join([f"{i + 1}/{error_count}: {e}" for i, e in enumerate(self.all_exceptions)])
2183 log.error("%s", f"Tolerated {error_count} errors. Error Summary: \n{msgs}")
2184 raise self.first_exception # reraise first swallowed exception
2185 except subprocess.CalledProcessError as e:
2186 log_error_on_exit(e, e.returncode)
2187 raise
2188 except SystemExit as e:
2189 log_error_on_exit(e, e.code)
2190 raise
2191 except (subprocess.TimeoutExpired, UnicodeDecodeError) as e:
2192 log_error_on_exit(e, die_status)
2193 raise SystemExit(die_status) from e
2194 except re.error as e:
2195 log_error_on_exit(f"{e} within regex '{e.pattern}'", die_status)
2196 raise SystemExit(die_status) from e
2197 finally:
2198 log.info("%s", "Log file was: " + p.log_params.log_file)
2200 log.info("Success. Goodbye!")
2201 print("", end="", file=sys.stderr)
2202 sys.stderr.flush()
2204 def append_exception(self, e: Exception, task_name: str, task_description: str) -> None:
2205 self.first_exception = self.first_exception or e
2206 if len(self.all_exceptions) < self.max_exceptions_to_summarize: # cap max memory consumption
2207 self.all_exceptions.append(str(e))
2208 self.all_exceptions_count += 1
2209 self.params.log.error(f"#{self.all_exceptions_count}: Done with %s: %s", task_name, task_description)
2211 def sleep_until_next_daemon_iteration(self, daemon_stoptime_nanos: int) -> bool:
2212 sleep_nanos = daemon_stoptime_nanos - time.time_ns()
2213 if sleep_nanos <= 0:
2214 return False
2215 self.progress_reporter.pause()
2216 p, log = self.params, self.params.log
2217 config = p.create_src_snapshots_config
2218 curr_datetime = config.current_datetime + timedelta(microseconds=1)
2219 next_snapshotting_event_dt = min(
2220 (
2221 round_datetime_up_to_duration_multiple(curr_datetime, duration_amount, duration_unit, config.anchors)
2222 for duration_amount, duration_unit in config.suffix_durations.values()
2223 ),
2224 default=curr_datetime + timedelta(days=1000 * 365), # infinity
2225 )
2226 offset: timedelta = next_snapshotting_event_dt - datetime.now(config.tz)
2227 offset_nanos = (offset.days * 86400 + offset.seconds) * 1_000_000_000 + offset.microseconds * 1_000
2228 sleep_nanos = min(sleep_nanos, max(0, offset_nanos))
2229 log.info("Daemon sleeping for: %s%s", human_readable_duration(sleep_nanos), f" ... [Log {p.log_params.log_file}]")
2230 time.sleep(sleep_nanos / 1_000_000_000)
2231 config.current_datetime = datetime.now(config.tz)
2232 return daemon_stoptime_nanos - time.time_ns() > 0
2234 def print_replication_stats(self, start_time_nanos: int):
2235 p, log = self.params, self.params.log
2236 elapsed_nanos = time.time_ns() - start_time_nanos
2237 msg = p.dry(f"Replicated {self.num_snapshots_replicated} snapshots in {human_readable_duration(elapsed_nanos)}.")
2238 if self.is_program_available("pv", "local"):
2239 sent_bytes = count_num_bytes_transferred_by_zfs_send(p.log_params.pv_log_file)
2240 sent_bytes_per_sec = round(1_000_000_000 * sent_bytes / elapsed_nanos)
2241 msg += f" zfs sent {human_readable_bytes(sent_bytes)} [{human_readable_bytes(sent_bytes_per_sec)}/s]."
2242 log.info("%s", msg.ljust(p.terminal_columns - len("2024-01-01 23:58:45 [I] ")))
2244 def validate_once(self) -> None:
2245 p = self.params
2246 p.zfs_recv_ox_names = self.recv_option_property_names(p.zfs_recv_program_opts)
2247 for snapshot_filter in p.snapshot_filters:
2248 for _filter in snapshot_filter:
2249 if _filter.name == snapshot_regex_filter_name:
2250 exclude_snapshot_regexes = compile_regexes(_filter.options[0])
2251 include_snapshot_regexes = compile_regexes(_filter.options[1] or [".*"])
2252 _filter.options = (exclude_snapshot_regexes, include_snapshot_regexes)
2254 exclude_regexes = [exclude_dataset_regexes_default]
2255 if len(p.args.exclude_dataset_regex) > 0: # some patterns don't exclude anything
2256 exclude_regexes = [regex for regex in p.args.exclude_dataset_regex if regex != "" and regex != "!.*"]
2257 include_regexes = p.args.include_dataset_regex
2259 # relative datasets need not be compiled more than once as they don't change between tasks
2260 def separate_abs_vs_rel_datasets(datasets: List[str]) -> Tuple[List[str], List[str]]:
2261 abs_datasets, rel_datasets = [], []
2262 for dataset in datasets:
2263 (abs_datasets if dataset.startswith("/") else rel_datasets).append(dataset)
2264 return abs_datasets, rel_datasets
2266 p.abs_exclude_datasets, rel_exclude_datasets = separate_abs_vs_rel_datasets(p.args.exclude_dataset)
2267 p.abs_include_datasets, rel_include_datasets = separate_abs_vs_rel_datasets(p.args.include_dataset)
2268 p.tmp_exclude_dataset_regexes, p.tmp_include_dataset_regexes = (
2269 compile_regexes(exclude_regexes + self.dataset_regexes(rel_exclude_datasets), suffix=self.re_suffix),
2270 compile_regexes(include_regexes + self.dataset_regexes(rel_include_datasets), suffix=self.re_suffix),
2271 )
2273 if p.pv_program != disable_prg:
2274 pv_program_opts_set = set(p.pv_program_opts)
2275 if pv_program_opts_set.isdisjoint({"--bytes", "-b", "--bits", "-8"}):
2276 die("--pv-program-opts must contain one of --bytes or --bits for progress metrics to function.")
2277 if self.isatty and not p.quiet:
2278 for opts in [["--eta", "-e"], ["--fineta", "-I"], ["--average-rate", "-a"]]:
2279 if pv_program_opts_set.isdisjoint(opts):
2280 die(f"--pv-program-opts must contain one of {', '.join(opts)} for progress report line to function.")
2282 def validate_task(self) -> None:
2283 p, log = self.params, self.params.log
2284 src, dst = p.src, p.dst
2285 for remote in [src, dst]:
2286 r, loc = remote, remote.location
2287 validate_user_name(r.basis_ssh_user, f"--ssh-{loc}-user")
2288 validate_host_name(r.basis_ssh_host, f"--ssh-{loc}-host")
2289 validate_port(r.ssh_port, f"--ssh-{loc}-port ")
2290 r.ssh_user, r.ssh_host, r.ssh_user_host, r.pool, r.root_dataset = parse_dataset_locator(
2291 r.basis_root_dataset, user=r.basis_ssh_user, host=r.basis_ssh_host, port=r.ssh_port
2292 )
2293 r.sudo, r.use_zfs_delegation = self.sudo_cmd(r.ssh_user_host, r.ssh_user)
2294 self.dst_dataset_exists = SynchronizedDict(self.all_dst_dataset_exists[dst.ssh_user_host])
2296 if src.ssh_host == dst.ssh_host:
2297 msg = f"src: {src.basis_root_dataset}, dst: {dst.basis_root_dataset}"
2298 if src.root_dataset == dst.root_dataset:
2299 die(f"Source and destination dataset must not be the same! {msg}")
2300 if p.recursive and (
2301 is_descendant(src.root_dataset, of_root_dataset=dst.root_dataset)
2302 or is_descendant(dst.root_dataset, of_root_dataset=src.root_dataset)
2303 ):
2304 die(f"Source and destination dataset trees must not overlap! {msg}")
2306 suffix = self.re_suffix # also match descendants of a matching dataset
2307 p.exclude_dataset_regexes, p.include_dataset_regexes = (
2308 p.tmp_exclude_dataset_regexes + compile_regexes(self.dataset_regexes(p.abs_exclude_datasets), suffix=suffix),
2309 p.tmp_include_dataset_regexes + compile_regexes(self.dataset_regexes(p.abs_include_datasets), suffix=suffix),
2310 )
2311 if len(p.include_dataset_regexes) == 0:
2312 p.include_dataset_regexes = [(re.compile(".*"), False)]
2314 self.detect_available_programs()
2316 zfs_send_program_opts = p.curr_zfs_send_program_opts
2317 if self.is_zpool_feature_enabled_or_active(dst, "feature@large_blocks"):
2318 append_if_absent(zfs_send_program_opts, "--large-block") # solaris-11.4 does not have this feature
2319 if self.is_solaris_zfs(dst):
2320 p.dry_run_destroy = "" # solaris-11.4 knows no 'zfs destroy -n' flag
2321 p.verbose_destroy = "" # solaris-11.4 knows no 'zfs destroy -v' flag
2322 if self.is_solaris_zfs(src): # solaris-11.4 only knows -w compress
2323 zfs_send_program_opts = ["-p" if opt == "--props" else opt for opt in zfs_send_program_opts]
2324 zfs_send_program_opts = fix_solaris_raw_mode(zfs_send_program_opts)
2325 p.curr_zfs_send_program_opts = zfs_send_program_opts
2327 self.max_workers = {}
2328 self.max_datasets_per_minibatch_on_list_snaps = {}
2329 for r in [src, dst]:
2330 cpus = int(p.available_programs[r.location].get("getconf_cpu_count", 8))
2331 threads, is_percent = p.threads
2332 cpus = max(1, round(cpus * threads / 100.0) if is_percent else round(threads))
2333 self.max_workers[r.location] = cpus
2334 bs = max(1, p.max_datasets_per_batch_on_list_snaps) # 1024 by default
2335 max_datasets_per_minibatch = p.max_datasets_per_minibatch_on_list_snaps
2336 if max_datasets_per_minibatch <= 0:
2337 max_datasets_per_minibatch = max(1, bs // cpus)
2338 max_datasets_per_minibatch = min(bs, max_datasets_per_minibatch)
2339 self.max_datasets_per_minibatch_on_list_snaps[r.location] = max_datasets_per_minibatch
2340 log.trace(
2341 "%s",
2342 f"max_datasets_per_batch_on_list_snaps: {p.max_datasets_per_batch_on_list_snaps}, "
2343 f"max_datasets_per_minibatch_on_list_snaps: {max_datasets_per_minibatch}, "
2344 f"max_workers: {self.max_workers[r.location]}, "
2345 f"location: {r.location}",
2346 )
2347 log.trace("Validated Param values: %s", pretty_print_formatter(self.params))
2349 def sudo_cmd(self, ssh_user_host: str, ssh_user: str) -> Tuple[str, bool]:
2350 p = self.params
2351 assert isinstance(ssh_user_host, str)
2352 p.validate_arg(ssh_user_host)
2353 assert isinstance(ssh_user, str)
2354 validate_user_name(ssh_user, ssh_user_host)
2355 assert isinstance(p.sudo_program, str)
2356 p.program_name(p.sudo_program) # validate
2357 assert isinstance(p.enable_privilege_elevation, bool)
2359 is_root = True
2360 if ssh_user_host != "":
2361 if ssh_user == "":
2362 if os.geteuid() != 0:
2363 is_root = False
2364 elif ssh_user != "root":
2365 is_root = False
2366 elif os.geteuid() != 0:
2367 is_root = False
2369 if is_root:
2370 sudo = "" # using sudo in an attempt to make ZFS operations work even if we are not root user?
2371 use_zfs_delegation = False # or instead using 'zfs allow' delegation?
2372 return sudo, use_zfs_delegation
2373 elif p.enable_privilege_elevation:
2374 if p.sudo_program == disable_prg:
2375 die(f"sudo CLI is not available on host: {ssh_user_host or 'localhost'}")
2376 # The '-n' option makes 'sudo' safer and more fail-fast. It avoids having sudo prompt the user for input of any
2377 # kind. If a password is required for the sudo command to run, sudo will display an error message and exit.
2378 return p.sudo_program + " -n", False
2379 else:
2380 return "", True
2382 def run_task(self) -> None:
2383 def filter_src_datasets() -> List[str]: # apply --{include|exclude}-dataset policy
2384 return self.filter_datasets(src, basis_src_datasets) if src_datasets is None else src_datasets
2386 p, log = self.params, self.params.log
2387 src, dst = p.src, p.dst
2388 task_description = f"{src.basis_root_dataset} {p.recursive_flag} --> {dst.basis_root_dataset} ..."
2389 failed = False
2390 src_datasets = None
2391 basis_src_datasets = []
2392 self.src_properties = {}
2393 if not self.is_dummy(src): # find src dataset or all datasets in src dataset tree (with --recursive)
2394 snapshots_changed_avail = (
2395 (not p.create_src_snapshots_config.skip_create_src_snapshots)
2396 and (not p.create_src_snapshots_config.create_src_snapshots_even_if_not_due)
2397 and self.is_snapshots_changed_zfs_property_available(src)
2398 )
2399 props = "volblocksize,recordsize,name"
2400 props = "snapshots_changed," + props if snapshots_changed_avail else props
2401 cmd = p.split_args(
2402 f"{p.zfs_program} list -t filesystem,volume -s name -Hp -o {props} {p.recursive_flag}", src.root_dataset
2403 )
2404 for line in (self.try_ssh_command(src, log_debug, cmd=cmd) or "").splitlines():
2405 cols = line.split("\t")
2406 snapshots_changed, volblocksize, recordsize, src_dataset = cols if snapshots_changed_avail else ["-"] + cols
2407 self.src_properties[src_dataset] = {
2408 "recordsize": int(recordsize) if recordsize != "-" else -int(volblocksize),
2409 "snapshots_changed": int(snapshots_changed) if snapshots_changed and snapshots_changed != "-" else 0,
2410 }
2411 basis_src_datasets.append(src_dataset)
2412 assert not self.is_test_mode or basis_src_datasets == sorted(basis_src_datasets), "List is not sorted"
2414 # Optionally, atomically create a new snapshot of the src datasets selected by --{include|exclude}-dataset* policy.
2415 # The implementation attempts to fit as many datasets as possible into a single (atomic) 'zfs snapshot' command line,
2416 # using case-sensitive sort order, and using 'zfs snapshot -r' to the extent that this is compatible with the
2417 # --{include|exclude}-dataset* pruning policy. The snapshots of all datasets that fit within the same single
2418 # 'zfs snapshot' CLI invocation will be taken within the same ZFS transaction group, and correspondingly have
2419 # identical 'createtxg' ZFS property (but not necessarily identical 'creation' ZFS time property as ZFS actually
2420 # provides no such guarantee), and thus be consistent. Dataset names that can't fit into a single command line are
2421 # spread over multiple command line invocations, respecting the limits that the operating system places on the
2422 # maximum length of a single command line, per `getconf ARG_MAX`.
2423 if not p.create_src_snapshots_config.skip_create_src_snapshots:
2424 log.info(p.dry("--create-src-snapshots: %s"), f"{src.basis_root_dataset} {p.recursive_flag} ...")
2425 if len(basis_src_datasets) == 0:
2426 die(f"Source dataset does not exist: {src.basis_root_dataset}")
2427 src_datasets = filter_src_datasets() # apply include/exclude policy
2428 datasets_to_snapshot: Dict[SnapshotLabel, List[str]] = self.find_datasets_to_snapshot(src_datasets)
2429 datasets_to_snapshot = {label: datasets for label, datasets in datasets_to_snapshot.items() if len(datasets) > 0}
2430 basis_datasets_to_snapshot = datasets_to_snapshot.copy() # shallow copy
2431 commands = {}
2432 for label, datasets in datasets_to_snapshot.items():
2433 cmd = p.split_args(f"{src.sudo} {p.zfs_program} snapshot")
2434 if p.recursive:
2435 # Run 'zfs snapshot -r' on the roots of subtrees if possible, else fallback to non-recursive CLI flavor
2436 root_datasets = self.root_datasets_if_recursive_zfs_snapshot_is_possible(datasets, basis_src_datasets)
2437 if root_datasets is not None:
2438 cmd += ["-r"] # recursive; takes a snapshot of all datasets in the subtree(s)
2439 datasets_to_snapshot[label] = root_datasets
2440 commands[label] = cmd
2441 creation_msg = f"Creating {sum(len(datasets) for datasets in basis_datasets_to_snapshot.values())} snapshots"
2442 log.info(p.dry("--create-src-snapshots: %s"), f"{creation_msg} within {len(src_datasets)} datasets ...")
2443 # create snapshots in large (parallel) batches, without using a command line that's too big for the OS to handle
2444 self.run_ssh_cmd_parallel(
2445 src,
2446 [(commands[lbl], [f"{ds}@{lbl}" for ds in datasets]) for lbl, datasets in datasets_to_snapshot.items()],
2447 fn=lambda cmd, batch: self.run_ssh_command(src, is_dry=p.dry_run, print_stdout=True, cmd=cmd + batch),
2448 max_batch_items=1 if self.is_solaris_zfs(src) else 2**29, # solaris CLI doesn't accept multiple datasets
2449 )
2450 # perf: copy lastmodified time of source dataset into local cache to reduce future 'zfs list -t snapshot' calls
2451 self.update_last_modified_cache(basis_datasets_to_snapshot)
2453 # Optionally, replicate src.root_dataset (optionally including its descendants) to dst.root_dataset
2454 if not p.skip_replication:
2455 if len(basis_src_datasets) == 0:
2456 die(f"Replication: Source dataset does not exist: {src.basis_root_dataset}")
2457 if self.is_dummy(dst):
2458 die(f"Replication: Destination may be a dummy dataset only if exclusively creating snapshots on the source!")
2459 src_datasets = filter_src_datasets() # apply include/exclude policy
2460 log.info("Starting replication task: %s", task_description + f" [{len(src_datasets)} datasets]")
2461 # perf/latency: no need to set up a dedicated TCP connection if no parallel replication is possible
2462 self.dedicated_tcp_connection_per_zfs_send = (
2463 p.dedicated_tcp_connection_per_zfs_send
2464 and min(self.max_workers[p.src.location], self.max_workers[p.dst.location]) > 1
2465 and has_siblings(src_datasets) # siblings can be replicated in parallel
2466 )
2467 self.num_snapshots_found = 0
2468 self.num_snapshots_replicated = 0
2469 start_time_nanos = time.time_ns()
2470 # Run replicate_dataset(dataset) for each dataset, while taking care of errors, retries + parallel execution
2471 failed = self.process_datasets_in_parallel_and_fault_tolerant(
2472 src_datasets,
2473 process_dataset=self.replicate_dataset, # lambda
2474 skip_tree_on_error=lambda dataset: not self.dst_dataset_exists[
2475 replace_prefix(dataset, old_prefix=src.root_dataset, new_prefix=dst.root_dataset)
2476 ],
2477 task_name="Replication",
2478 )
2479 log.info(
2480 p.dry("Replication done: %s"),
2481 f"{task_description} [Replicated {self.num_snapshots_replicated} out of {self.num_snapshots_found} snapshots"
2482 f" within {len(src_datasets)} datasets; took {human_readable_duration(time.time_ns() - start_time_nanos)}]",
2483 )
2485 if failed or not (
2486 p.delete_dst_datasets or p.delete_dst_snapshots or p.delete_empty_dst_datasets or p.compare_snapshot_lists
2487 ):
2488 return
2489 log.info("Listing dst datasets: %s", task_description)
2490 if self.is_dummy(dst):
2491 die(f"Destination may be a dummy dataset only if exclusively creating snapshots on the source!")
2492 cmd = p.split_args(
2493 f"{p.zfs_program} list -t filesystem,volume -s name -Hp -o name", p.recursive_flag, dst.root_dataset
2494 )
2495 basis_dst_datasets = self.try_ssh_command(dst, log_trace, cmd=cmd)
2496 if basis_dst_datasets is None:
2497 log.warning("Destination dataset does not exist: %s", dst.root_dataset)
2498 basis_dst_datasets = []
2499 else:
2500 basis_dst_datasets = basis_dst_datasets.splitlines()
2501 assert not self.is_test_mode or basis_dst_datasets == sorted(basis_dst_datasets), "List is not sorted"
2502 dst_datasets = self.filter_datasets(dst, basis_dst_datasets) # apply include/exclude policy
2504 # Optionally, delete existing destination datasets that do not exist within the source dataset if they are
2505 # included via --{include|exclude}-dataset* policy. Do not recurse without --recursive. With --recursive,
2506 # never delete non-selected dataset subtrees or their ancestors.
2507 if p.delete_dst_datasets and not failed:
2508 log.info(p.dry("--delete-dst-datasets: %s"), task_description)
2509 children = defaultdict(set)
2510 for dst_dataset in basis_dst_datasets: # Compute the direct children of each NON-FILTERED dataset
2511 parent = os.path.dirname(dst_dataset)
2512 children[parent].add(dst_dataset)
2513 to_delete: Set[str] = set()
2514 for dst_dataset in reversed(dst_datasets):
2515 if children[dst_dataset].issubset(to_delete):
2516 to_delete.add(dst_dataset) # all children are deletable, thus the dataset itself is deletable too
2517 to_delete = to_delete.difference(
2518 {replace_prefix(src_dataset, src.root_dataset, dst.root_dataset) for src_dataset in basis_src_datasets}
2519 )
2520 self.delete_datasets(dst, to_delete)
2521 dst_datasets = sorted(set(dst_datasets).difference(to_delete))
2522 basis_dst_datasets = sorted(set(basis_dst_datasets).difference(to_delete))
2524 # Optionally, delete existing destination snapshots that do not exist within the source dataset if they
2525 # are included by the --{include|exclude}-snapshot-* policy, and the destination dataset is included
2526 # via --{include|exclude}-dataset* policy.
2527 if p.delete_dst_snapshots and not failed:
2528 log.info(p.dry("--delete-dst-snapshots: %s"), task_description + f" [{len(dst_datasets)} datasets]")
2529 kind = "bookmark" if p.delete_dst_bookmarks else "snapshot"
2530 filter_needs_creation_time = has_timerange_filter(p.snapshot_filters)
2531 props = self.creation_prefix + "creation,guid,name" if filter_needs_creation_time else "guid,name"
2532 basis_src_datasets_set = set(basis_src_datasets)
2533 num_snapshots_found, num_snapshots_deleted = 0, 0
2535 def delete_destination_snapshots(dst_dataset: str, tid: str, retry: Retry) -> bool: # thread-safe
2536 src_dataset = replace_prefix(dst_dataset, old_prefix=dst.root_dataset, new_prefix=src.root_dataset)
2537 if src_dataset in basis_src_datasets_set and (self.are_bookmarks_enabled(src) or not p.delete_dst_bookmarks):
2538 src_kind = kind
2539 if not p.delete_dst_snapshots_no_crosscheck:
2540 src_kind = "snapshot,bookmark" if self.are_bookmarks_enabled(src) else "snapshot"
2541 src_cmd = p.split_args(f"{p.zfs_program} list -t {src_kind} -d 1 -s name -Hp -o guid", src_dataset)
2542 else:
2543 src_cmd = None
2544 dst_cmd = p.split_args(f"{p.zfs_program} list -t {kind} -d 1 -s createtxg -Hp -o {props}", dst_dataset)
2545 self.maybe_inject_delete(dst, dataset=dst_dataset, delete_trigger="zfs_list_delete_dst_snapshots")
2546 src_snaps_with_guids, dst_snaps_with_guids = self.run_in_parallel( # list src+dst snapshots in parallel
2547 lambda: set(self.run_ssh_command(src, log_trace, cmd=src_cmd).splitlines() if src_cmd else []),
2548 lambda: self.try_ssh_command(dst, log_trace, cmd=dst_cmd),
2549 )
2550 if dst_snaps_with_guids is None:
2551 log.warning("Third party deleted destination: %s", dst_dataset)
2552 return False
2553 dst_snaps_with_guids = dst_snaps_with_guids.splitlines()
2554 num_dst_snaps_with_guids = len(dst_snaps_with_guids)
2555 if p.delete_dst_bookmarks:
2556 replace_in_lines(dst_snaps_with_guids, old="#", new="@") # treat bookmarks as snapshots
2557 dst_snaps_with_guids = self.filter_snapshots(dst_snaps_with_guids, all_except=p.delete_dst_snapshots_except)
2558 if p.delete_dst_bookmarks:
2559 replace_in_lines(dst_snaps_with_guids, old="@", new="#") # restore pre-filtering bookmark state
2560 if filter_needs_creation_time:
2561 dst_snaps_with_guids = cut(field=2, lines=dst_snaps_with_guids)
2562 missing_snapshot_guids = set(cut(field=1, lines=dst_snaps_with_guids)).difference(src_snaps_with_guids)
2563 missing_snapshot_tags = self.filter_lines(dst_snaps_with_guids, missing_snapshot_guids)
2564 separator = "#" if p.delete_dst_bookmarks else "@"
2565 missing_snapshot_tags = cut(field=2, separator=separator, lines=missing_snapshot_tags)
2566 if p.delete_dst_bookmarks:
2567 self.delete_bookmarks(dst, dst_dataset, snapshot_tags=missing_snapshot_tags)
2568 else:
2569 self.delete_snapshots(dst, dst_dataset, snapshot_tags=missing_snapshot_tags)
2570 with self.stats_lock:
2571 nonlocal num_snapshots_found
2572 num_snapshots_found += num_dst_snaps_with_guids
2573 nonlocal num_snapshots_deleted
2574 num_snapshots_deleted += len(missing_snapshot_tags)
2575 return True
2577 # Run delete_destination_snapshots(dataset) for each dataset, while handling errors, retries + parallel exec
2578 if self.are_bookmarks_enabled(dst) or not p.delete_dst_bookmarks:
2579 starttime_nanos = time.time_ns()
2580 failed = self.process_datasets_in_parallel_and_fault_tolerant(
2581 dst_datasets,
2582 process_dataset=delete_destination_snapshots, # lambda
2583 skip_tree_on_error=lambda dataset: False,
2584 task_name="--delete-dst-snapshots",
2585 )
2586 log.info(
2587 p.dry("--delete-dst-snapshots: %s"),
2588 task_description + f" [Deleted {num_snapshots_deleted} out of {num_snapshots_found} {kind}s "
2589 f"within {len(dst_datasets)} datasets; took {human_readable_duration(time.time_ns() - starttime_nanos)}]",
2590 )
2592 # Optionally, delete any existing destination dataset that has no snapshot and no bookmark if all descendants
2593 # of that dataset do not have a snapshot or bookmark either. To do so, we walk the dataset list (conceptually,
2594 # a tree) depth-first (i.e. sorted descending). If a dst dataset has zero snapshots and zero bookmarks and all
2595 # its children are already marked as orphans, then it is itself an orphan, and we mark it as such. Walking in
2596 # a reverse sorted way means that we efficiently check for zero snapshots/bookmarks not just over the direct
2597 # children but the entire tree. Finally, delete all orphan datasets in an efficient batched way.
2598 if p.delete_empty_dst_datasets and p.recursive and not failed:
2599 log.info(p.dry("--delete-empty-dst-datasets: %s"), task_description)
2600 delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots = (
2601 p.delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots and self.are_bookmarks_enabled(dst)
2602 )
2604 # Compute the direct children of each NON-FILTERED dataset. Thus, no non-selected dataset and no ancestor of a
2605 # non-selected dataset will ever be added to the "orphan" set. In other words, this treats non-selected dataset
2606 # subtrees as if they all had snapshots, so non-selected dataset subtrees and their ancestors are guaranteed
2607 # to not get deleted.
2608 children = defaultdict(set)
2609 for dst_dataset in basis_dst_datasets:
2610 parent = os.path.dirname(dst_dataset)
2611 children[parent].add(dst_dataset)
2613 # Find and mark orphan datasets, finally delete them in an efficient way. Using two filter runs instead of one
2614 # filter run is an optimization. The first run only computes candidate orphans, without incurring I/O, to reduce
2615 # the list of datasets for which we list snapshots via 'zfs list -t snapshot ...' from dst_datasets to a subset
2616 # of dst_datasets, which in turn reduces I/O and improves perf. Essentially, this eliminates the I/O to list
2617 # snapshots for ancestors of excluded datasets. The second run computes the real orphans.
2618 btype = "bookmark,snapshot" if delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots else "snapshot"
2619 dst_datasets_having_snapshots: Set[str] = set()
2620 for run in range(0, 2):
2621 orphans: Set[str] = set()
2622 for dst_dataset in reversed(dst_datasets):
2623 if children[dst_dataset].issubset(orphans):
2624 # all children turned out to be orphans, thus the dataset itself could be an orphan
2625 if dst_dataset not in dst_datasets_having_snapshots: # always True during first filter run
2626 orphans.add(dst_dataset)
2627 if run == 0:
2628 # find datasets with >= 1 snapshot; update dst_datasets_having_snapshots for real use in the 2nd run
2629 cmd = p.split_args(f"{p.zfs_program} list -t {btype} -d 1 -S name -Hp -o name")
2630 for datasets_having_snapshots in self.zfs_list_snapshots_in_parallel(
2631 dst, cmd, sorted(orphans), ordered=False
2632 ):
2633 if delete_empty_dst_datasets_if_no_bookmarks_and_no_snapshots:
2634 replace_in_lines(datasets_having_snapshots, old="#", new="@") # treat bookmarks as snapshots
2635 datasets_having_snapshots = set(cut(field=1, separator="@", lines=datasets_having_snapshots))
2636 dst_datasets_having_snapshots.update(datasets_having_snapshots) # union
2637 else:
2638 self.delete_datasets(dst, orphans)
2639 dst_datasets = sorted(set(dst_datasets).difference(orphans))
2641 if p.compare_snapshot_lists and not failed:
2642 log.info("--compare-snapshot-lists: %s", task_description)
2643 if len(basis_src_datasets) == 0 and not self.is_dummy(src):
2644 die(f"Source dataset does not exist: {src.basis_root_dataset}")
2645 src_datasets = filter_src_datasets() # apply include/exclude policy
2646 self.run_compare_snapshot_lists(src_datasets, dst_datasets)
2648 def replicate_dataset(self, src_dataset: str, tid: str, retry: Retry) -> bool:
2649 """Replicates src_dataset (without handling descendants) to dst_dataset (thread-safe)."""
2651 p, log = self.params, self.params.log
2652 src, dst = p.src, p.dst
2653 retry_count = retry.count
2654 dst_dataset = replace_prefix(src_dataset, old_prefix=src.root_dataset, new_prefix=dst.root_dataset)
2655 log.debug(p.dry(f"{tid} Replicating: %s"), f"{src_dataset} --> {dst_dataset} ...")
2657 # list GUID and name for dst snapshots, sorted ascending by createtxg (more precise than creation time)
2658 dst_cmd = p.split_args(f"{p.zfs_program} list -t snapshot -d 1 -s createtxg -Hp -o guid,name", dst_dataset)
2660 # list GUID and name for src snapshots + bookmarks, primarily sort ascending by transaction group (which is more
2661 # precise than creation time), secondarily sort such that snapshots appear after bookmarks for the same GUID.
2662 # Note: A snapshot and its ZFS bookmarks always have the same GUID, creation time and transaction group. A snapshot
2663 # changes its transaction group but retains its creation time and GUID on 'zfs receive' on another pool, i.e.
2664 # comparing createtxg is only meaningful within a single pool, not across pools from src to dst. Comparing creation
2665 # time remains meaningful across pools from src to dst. Creation time is a UTC Unix time in integer seconds.
2666 # Note that 'zfs create', 'zfs snapshot' and 'zfs bookmark' CLIs enforce that snapshot names must not contain a '#'
2667 # char, bookmark names must not contain a '@' char, and dataset names must not contain a '#' or '@' char.
2668 # GUID and creation time also do not contain a '#' or '@' char.
2669 filter_needs_creation_time = has_timerange_filter(p.snapshot_filters)
2670 types = "snapshot,bookmark" if p.use_bookmark and self.are_bookmarks_enabled(src) else "snapshot"
2671 props = self.creation_prefix + "creation,guid,name" if filter_needs_creation_time else "guid,name"
2672 src_cmd = p.split_args(f"{p.zfs_program} list -t {types} -s createtxg -s type -d 1 -Hp -o {props}", src_dataset)
2673 self.maybe_inject_delete(src, dataset=src_dataset, delete_trigger="zfs_list_snapshot_src")
2674 src_snapshots_and_bookmarks, dst_snapshots_with_guids = self.run_in_parallel( # list src+dst snapshots in parallel
2675 lambda: self.try_ssh_command(src, log_trace, cmd=src_cmd),
2676 lambda: self.try_ssh_command(dst, log_trace, cmd=dst_cmd, error_trigger="zfs_list_snapshot_dst"),
2677 )
2678 self.dst_dataset_exists[dst_dataset] = dst_snapshots_with_guids is not None
2679 dst_snapshots_with_guids = dst_snapshots_with_guids.splitlines() if dst_snapshots_with_guids is not None else []
2680 if src_snapshots_and_bookmarks is None:
2681 log.warning("Third party deleted source: %s", src_dataset)
2682 return False # src dataset has been deleted by some third party while we're running - nothing to do anymore
2683 src_snapshots_with_guids: List[str] = src_snapshots_and_bookmarks.splitlines()
2684 src_snapshots_and_bookmarks = None
2685 if len(dst_snapshots_with_guids) == 0 and "bookmark" in types:
2686 # src bookmarks serve no purpose if the destination dataset has no snapshot; ignore them
2687 src_snapshots_with_guids = [snapshot for snapshot in src_snapshots_with_guids if "@" in snapshot]
2688 num_src_snapshots_found = sum(1 for snapshot in src_snapshots_with_guids if "@" in snapshot)
2689 with self.stats_lock:
2690 self.num_snapshots_found += num_src_snapshots_found
2691 # apply include/exclude regexes to ignore irrelevant src snapshots
2692 basis_src_snapshots_with_guids = src_snapshots_with_guids
2693 src_snapshots_with_guids = self.filter_snapshots(src_snapshots_with_guids)
2694 if filter_needs_creation_time:
2695 src_snapshots_with_guids = cut(field=2, lines=src_snapshots_with_guids)
2696 basis_src_snapshots_with_guids = cut(field=2, lines=basis_src_snapshots_with_guids)
2698 # find oldest and latest "true" snapshot, as well as GUIDs of all snapshots and bookmarks.
2699 # a snapshot is "true" if it is not a bookmark.
2700 oldest_src_snapshot = ""
2701 latest_src_snapshot = ""
2702 included_src_guids: Set[str] = set()
2703 for line in src_snapshots_with_guids:
2704 guid, snapshot = line.split("\t", 1)
2705 included_src_guids.add(guid)
2706 if "@" in snapshot:
2707 latest_src_snapshot = snapshot
2708 if not oldest_src_snapshot:
2709 oldest_src_snapshot = snapshot
2710 if len(src_snapshots_with_guids) == 0:
2711 if p.skip_missing_snapshots == "fail":
2712 die(f"Source dataset includes no snapshot: {src_dataset}. Consider using --skip-missing-snapshots=dataset")
2713 elif p.skip_missing_snapshots == "dataset":
2714 log.warning("Skipping source dataset because it includes no snapshot: %s", src_dataset)
2715 if p.recursive and not self.dst_dataset_exists[dst_dataset]:
2716 log.warning("Also skipping descendant datasets as dst dataset does not exist for %s", src_dataset)
2717 return self.dst_dataset_exists[dst_dataset]
2719 log.debug("latest_src_snapshot: %s", latest_src_snapshot)
2720 latest_dst_snapshot = ""
2721 latest_dst_guid = ""
2722 latest_common_src_snapshot = ""
2723 props_cache = {}
2724 done_checking = False
2726 if self.dst_dataset_exists[dst_dataset]:
2727 if len(dst_snapshots_with_guids) > 0:
2728 latest_dst_guid, latest_dst_snapshot = dst_snapshots_with_guids[-1].split("\t", 1)
2729 if p.force_rollback_to_latest_snapshot or p.force:
2730 log.info(p.dry(f"{tid} Rolling back destination to most recent snapshot: %s"), latest_dst_snapshot)
2731 # rollback just in case the dst dataset was modified since its most recent snapshot
2732 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset)
2733 cmd = p.split_args(f"{dst.sudo} {p.zfs_program} rollback", latest_dst_snapshot)
2734 self.try_ssh_command(dst, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=cmd, exists=False)
2735 elif latest_src_snapshot == "":
2736 log.info(f"{tid} Already-up-to-date: %s", dst_dataset)
2737 return True
2739 # find most recent snapshot (or bookmark) that src and dst have in common - we'll start to replicate
2740 # from there up to the most recent src snapshot. any two snapshots are "common" iff their ZFS GUIDs (i.e.
2741 # contents) are equal. See https://github.com/openzfs/zfs/commit/305bc4b370b20de81eaf10a1cf724374258b74d1
2742 def latest_common_snapshot(snapshots_with_guids: List[str], intersect_guids: Set[str]) -> Tuple[str, str]:
2743 """Returns a true snapshot instead of its bookmark with the same GUID, per the sort order previously
2744 used for 'zfs list -s ...'"""
2745 for _line in reversed(snapshots_with_guids):
2746 _guid, _snapshot = _line.split("\t", 1)
2747 if _guid in intersect_guids:
2748 return _guid, _snapshot # can be a snapshot or bookmark
2749 return None, ""
2751 latest_common_guid, latest_common_src_snapshot = latest_common_snapshot(
2752 src_snapshots_with_guids, set(cut(field=1, lines=dst_snapshots_with_guids))
2753 )
2754 log.debug("latest_common_src_snapshot: %s", latest_common_src_snapshot) # is a snapshot or bookmark
2755 log.trace("latest_dst_snapshot: %s", latest_dst_snapshot)
2757 if latest_common_src_snapshot and latest_common_guid != latest_dst_guid:
2758 # found latest common snapshot but dst has an even newer snapshot. rollback dst to that common snapshot.
2759 _, latest_common_dst_snapshot = latest_common_snapshot(dst_snapshots_with_guids, {latest_common_guid})
2760 if not (p.force_rollback_to_latest_common_snapshot or p.force):
2761 die(
2762 f"Conflict: Most recent destination snapshot {latest_dst_snapshot} is more recent than "
2763 f"most recent common snapshot {latest_common_dst_snapshot}. Rollback destination first, "
2764 "for example via --force-rollback-to-latest-common-snapshot (or --force) option."
2765 )
2766 if p.force_once:
2767 p.force.value = False
2768 p.force_rollback_to_latest_common_snapshot.value = False
2769 log.info(
2770 p.dry(f"{tid} Rolling back destination to most recent common snapshot: %s"), latest_common_dst_snapshot
2771 )
2772 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset)
2773 cmd = p.split_args(
2774 f"{dst.sudo} {p.zfs_program} rollback -r {p.force_unmount} {p.force_hard}", latest_common_dst_snapshot
2775 )
2776 try:
2777 self.run_ssh_command(dst, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=cmd)
2778 except (subprocess.CalledProcessError, subprocess.TimeoutExpired, UnicodeDecodeError) as e:
2779 stderr = stderr_to_str(e.stderr) if hasattr(e, "stderr") else ""
2780 no_sleep = self.clear_resumable_recv_state_if_necessary(dst_dataset, stderr)
2781 # op isn't idempotent so retries regather current state from the start of replicate_dataset()
2782 raise RetryableError("Subprocess failed", no_sleep=no_sleep) from e
2784 if latest_src_snapshot and latest_src_snapshot == latest_common_src_snapshot:
2785 log.info(f"{tid} Already up-to-date: %s", dst_dataset)
2786 return True
2788 # endif self.dst_dataset_exists[dst_dataset]
2789 log.debug("latest_common_src_snapshot: %s", latest_common_src_snapshot) # is a snapshot or bookmark
2790 log.trace("latest_dst_snapshot: %s", latest_dst_snapshot)
2791 dry_run_no_send = False
2792 right_just = 7
2794 def format_size(num_bytes: int) -> str:
2795 return human_readable_bytes(num_bytes, separator="").rjust(right_just)
2797 if not latest_common_src_snapshot:
2798 # no common snapshot was found. delete all dst snapshots, if any
2799 if latest_dst_snapshot:
2800 if not p.force:
2801 die(
2802 f"Conflict: No common snapshot found between {src_dataset} and {dst_dataset} even though "
2803 "destination has at least one snapshot. Aborting. Consider using --force option to first "
2804 "delete all existing destination snapshots in order to be able to proceed with replication."
2805 )
2806 if p.force_once:
2807 p.force.value = False
2808 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset)
2809 self.delete_snapshots(dst, dst_dataset, snapshot_tags=cut(2, separator="@", lines=dst_snapshots_with_guids))
2810 if p.dry_run:
2811 # As we're in --dryrun (--force) mode this conflict resolution step (see above) wasn't really executed:
2812 # "no common snapshot was found. delete all dst snapshots". In turn, this would cause the subsequent
2813 # 'zfs receive -n' to fail with "cannot receive new filesystem stream: destination has snapshots; must
2814 # destroy them to overwrite it". So we skip the zfs send/receive step and keep on trucking.
2815 dry_run_no_send = True
2817 # to start with, fully replicate oldest snapshot, which in turn creates a common snapshot
2818 if p.no_stream:
2819 oldest_src_snapshot = latest_src_snapshot
2820 if oldest_src_snapshot:
2821 if not self.dst_dataset_exists[dst_dataset]:
2822 # on destination, create parent filesystem and ancestors if they do not yet exist
2823 dst_dataset_parent = os.path.dirname(dst_dataset)
2824 if not self.dst_dataset_exists[dst_dataset_parent]:
2825 if p.dry_run:
2826 dry_run_no_send = True
2827 if dst_dataset_parent != "": 2827 ↛ 2830line 2827 didn't jump to line 2830 because the condition on line 2827 was always true
2828 self.create_filesystem(dst_dataset_parent)
2830 recv_resume_token, send_resume_opts, recv_resume_opts = self._recv_resume_token(dst_dataset, retry_count)
2831 curr_size = self.estimate_send_size(src, dst_dataset, recv_resume_token, oldest_src_snapshot)
2832 humansize = format_size(curr_size)
2833 if recv_resume_token:
2834 send_opts = send_resume_opts # e.g. ["-t", "1-c740b4779-..."]
2835 else:
2836 send_opts = p.curr_zfs_send_program_opts + [oldest_src_snapshot]
2837 send_cmd = p.split_args(f"{src.sudo} {p.zfs_program} send", send_opts)
2838 recv_opts = p.zfs_full_recv_opts.copy() + recv_resume_opts
2839 recv_opts, set_opts = self.add_recv_property_options(True, recv_opts, src_dataset, props_cache)
2840 recv_cmd = p.split_args(
2841 f"{dst.sudo} {p.zfs_program} receive -F", p.dry_run_recv, recv_opts, dst_dataset, allow_all=True
2842 )
2843 log.info(p.dry(f"{tid} Full send: %s"), f"{oldest_src_snapshot} --> {dst_dataset} ({humansize.strip()}) ...")
2844 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset)
2845 dry_run_no_send = dry_run_no_send or p.dry_run_no_send
2846 self.maybe_inject_params(error_trigger="full_zfs_send_params")
2847 humansize = humansize.rjust(right_just * 3 + 2)
2848 self.run_zfs_send_receive(
2849 src_dataset, dst_dataset, send_cmd, recv_cmd, curr_size, humansize, dry_run_no_send, "full_zfs_send"
2850 )
2851 latest_common_src_snapshot = oldest_src_snapshot # we have now created a common snapshot
2852 if not dry_run_no_send and not p.dry_run:
2853 self.dst_dataset_exists[dst_dataset] = True
2854 with self.stats_lock:
2855 self.num_snapshots_replicated += 1
2856 self.create_zfs_bookmark(src, oldest_src_snapshot, src_dataset)
2857 self.zfs_set(set_opts, dst, dst_dataset)
2858 retry_count = 0
2860 # endif not latest_common_src_snapshot
2861 # finally, incrementally replicate all snapshots from most recent common snapshot until most recent src snapshot
2862 if latest_common_src_snapshot:
2864 def replication_candidates() -> Tuple[List[str], List[str]]:
2865 assert len(basis_src_snapshots_with_guids) > 0
2866 result_snapshots = []
2867 result_guids = []
2868 last_appended_guid = ""
2869 snapshot_itr = reversed(basis_src_snapshots_with_guids)
2870 while True:
2871 guid, snapshot = snapshot_itr.__next__().split("\t", 1)
2872 if "@" in snapshot:
2873 result_snapshots.append(snapshot)
2874 result_guids.append(guid)
2875 last_appended_guid = guid
2876 if snapshot == latest_common_src_snapshot: # latest_common_src_snapshot is a snapshot or bookmark
2877 if guid != last_appended_guid and "@" not in snapshot:
2878 # only appends the src bookmark if it has no snapshot. If the bookmark has a snapshot then that
2879 # snapshot has already been appended, per the sort order previously used for 'zfs list -s ...'
2880 result_snapshots.append(snapshot)
2881 result_guids.append(guid)
2882 break
2883 result_snapshots.reverse()
2884 result_guids.reverse()
2885 assert len(result_snapshots) > 0
2886 assert len(result_snapshots) == len(result_guids)
2887 return result_guids, result_snapshots
2889 # collect the most recent common snapshot (which may be a bookmark) followed by all src snapshots
2890 # (that are not a bookmark) that are more recent than that.
2891 cand_guids, cand_snapshots = replication_candidates()
2892 if len(cand_snapshots) == 1:
2893 # latest_src_snapshot is a (true) snapshot that is equal to latest_common_src_snapshot or LESS recent
2894 # than latest_common_src_snapshot. The latter case can happen if latest_common_src_snapshot is a
2895 # bookmark whose snapshot has been deleted on src.
2896 return True # nothing more tbd
2898 recv_resume_token, send_resume_opts, recv_resume_opts = self._recv_resume_token(dst_dataset, retry_count)
2899 recv_opts = p.zfs_recv_program_opts.copy() + recv_resume_opts
2900 recv_opts, set_opts = self.add_recv_property_options(False, recv_opts, src_dataset, props_cache)
2901 if p.no_stream:
2902 # skip intermediate snapshots
2903 steps_todo = [("-i", latest_common_src_snapshot, latest_src_snapshot, 1)]
2904 else:
2905 # include intermediate src snapshots that pass --{include,exclude}-snapshot-* policy, using
2906 # a series of -i/-I send/receive steps that skip excluded src snapshots.
2907 steps_todo = self.incremental_send_steps_wrapper(
2908 cand_snapshots, cand_guids, included_src_guids, recv_resume_token is not None
2909 )
2910 log.trace("steps_todo: %s", list_formatter(steps_todo, "; "))
2911 estimate_send_sizes = [
2912 self.estimate_send_size(
2913 src, dst_dataset, recv_resume_token if i == 0 else None, incr_flag, from_snap, to_snap
2914 )
2915 for i, (incr_flag, from_snap, to_snap, num_snapshots) in enumerate(steps_todo)
2916 ]
2917 total_size = sum(estimate_send_sizes)
2918 total_num = sum(num_snapshots for incr_flag, from_snap, to_snap, num_snapshots in steps_todo)
2919 done_size = 0
2920 done_num = 0
2921 for i, (incr_flag, from_snap, to_snap, curr_num_snapshots) in enumerate(steps_todo):
2922 curr_size = estimate_send_sizes[i]
2923 humansize = format_size(total_size) + "/" + format_size(done_size) + "/" + format_size(curr_size)
2924 human_num = f"{total_num}/{done_num}/{curr_num_snapshots} snapshots"
2925 if recv_resume_token:
2926 send_opts = send_resume_opts # e.g. ["-t", "1-c740b4779-..."]
2927 else:
2928 send_opts = p.curr_zfs_send_program_opts + [incr_flag, from_snap, to_snap]
2929 send_cmd = p.split_args(f"{src.sudo} {p.zfs_program} send", send_opts)
2930 recv_cmd = p.split_args(
2931 f"{dst.sudo} {p.zfs_program} receive", p.dry_run_recv, recv_opts, dst_dataset, allow_all=True
2932 )
2933 dense_size = p.two_or_more_spaces_regex.sub("", humansize.strip())
2934 log.info(
2935 p.dry(f"{tid} Incremental send {incr_flag}: %s"),
2936 f"{from_snap} .. {to_snap[to_snap.index('@'):]} --> {dst_dataset} ({dense_size}) ({human_num}) ...",
2937 )
2938 done_checking = done_checking or self.check_zfs_dataset_busy(dst, dst_dataset, busy_if_send=False)
2939 if p.dry_run and not self.dst_dataset_exists[dst_dataset]:
2940 dry_run_no_send = True
2941 dry_run_no_send = dry_run_no_send or p.dry_run_no_send
2942 self.maybe_inject_params(error_trigger="incr_zfs_send_params")
2943 self.run_zfs_send_receive(
2944 src_dataset, dst_dataset, send_cmd, recv_cmd, curr_size, humansize, dry_run_no_send, "incr_zfs_send"
2945 )
2946 done_size += curr_size
2947 done_num += curr_num_snapshots
2948 recv_resume_token = None
2949 with self.stats_lock:
2950 self.num_snapshots_replicated += curr_num_snapshots
2951 if i == len(steps_todo) - 1:
2952 self.create_zfs_bookmark(src, to_snap, src_dataset)
2953 self.zfs_set(set_opts, dst, dst_dataset)
2954 return True
2956 def prepare_zfs_send_receive(
2957 self,
2958 src_dataset: str,
2959 send_cmd: List[str],
2960 recv_cmd: List[str],
2961 size_estimate_bytes: int,
2962 size_estimate_human: str,
2963 ) -> Tuple[str, str, str]:
2964 p, log = self.params, self.params.log
2965 send_cmd = " ".join([shlex.quote(item) for item in send_cmd])
2966 recv_cmd = " ".join([shlex.quote(item) for item in recv_cmd])
2968 if self.is_program_available("zstd", "src") and self.is_program_available("zstd", "dst"):
2969 _compress_cmd = self.compress_cmd("src", size_estimate_bytes)
2970 _decompress_cmd = self.decompress_cmd("dst", size_estimate_bytes)
2971 else: # no compression is used if source and destination do not both support compression
2972 _compress_cmd, _decompress_cmd = "cat", "cat"
2974 recordsize = abs(self.src_properties[src_dataset]["recordsize"])
2975 src_buffer = self.mbuffer_cmd("src", size_estimate_bytes, recordsize)
2976 dst_buffer = self.mbuffer_cmd("dst", size_estimate_bytes, recordsize)
2977 local_buffer = self.mbuffer_cmd("local", size_estimate_bytes, recordsize)
2979 pv_src_cmd = ""
2980 pv_dst_cmd = ""
2981 pv_loc_cmd = ""
2982 if p.src.ssh_user_host == "":
2983 pv_src_cmd = self.pv_cmd("local", size_estimate_bytes, size_estimate_human)
2984 elif p.dst.ssh_user_host == "":
2985 pv_dst_cmd = self.pv_cmd("local", size_estimate_bytes, size_estimate_human)
2986 elif _compress_cmd == "cat":
2987 pv_loc_cmd = self.pv_cmd("local", size_estimate_bytes, size_estimate_human) # compression disabled
2988 else:
2989 # pull-push mode with compression enabled: reporting "percent complete" isn't straightforward because
2990 # localhost observes the compressed data instead of the uncompressed data, so we disable the progress bar.
2991 pv_loc_cmd = self.pv_cmd("local", size_estimate_bytes, size_estimate_human, disable_progress_bar=True)
2993 # assemble pipeline running on source leg
2994 src_pipe = ""
2995 if self.inject_params.get("inject_src_pipe_fail", False):
2996 # for testing; initially forward some bytes and then fail
2997 src_pipe = f"{src_pipe} | dd bs=64 count=1 2>/dev/null && false"
2998 if self.inject_params.get("inject_src_pipe_garble", False):
2999 src_pipe = f"{src_pipe} | base64" # for testing; forward garbled bytes
3000 if pv_src_cmd != "" and pv_src_cmd != "cat":
3001 src_pipe = f"{src_pipe} | {pv_src_cmd}"
3002 if _compress_cmd != "cat":
3003 src_pipe = f"{src_pipe} | {_compress_cmd}"
3004 if src_buffer != "cat":
3005 src_pipe = f"{src_pipe} | {src_buffer}"
3006 if src_pipe.startswith(" |"):
3007 src_pipe = src_pipe[2:] # strip leading ' |' part
3008 if self.inject_params.get("inject_src_send_error", False):
3009 send_cmd = f"{send_cmd} --injectedGarbageParameter" # for testing; induce CLI parse error
3010 if src_pipe != "":
3011 src_pipe = f"{send_cmd} | {src_pipe}"
3012 if p.src.ssh_user_host != "":
3013 src_pipe = p.shell_program + " -c " + self.dquote(src_pipe)
3014 else:
3015 src_pipe = send_cmd
3017 # assemble pipeline running on middle leg between source and destination. only enabled for pull-push mode
3018 local_pipe = ""
3019 if local_buffer != "cat":
3020 local_pipe = f"{local_buffer}"
3021 if pv_loc_cmd != "" and pv_loc_cmd != "cat":
3022 local_pipe = f"{local_pipe} | {pv_loc_cmd}"
3023 if local_buffer != "cat":
3024 local_pipe = f"{local_pipe} | {local_buffer}"
3025 if local_pipe.startswith(" |"):
3026 local_pipe = local_pipe[2:] # strip leading ' |' part
3027 if local_pipe != "":
3028 local_pipe = f"| {local_pipe}"
3030 # assemble pipeline running on destination leg
3031 dst_pipe = ""
3032 if dst_buffer != "cat":
3033 dst_pipe = f"{dst_buffer}"
3034 if _decompress_cmd != "cat":
3035 dst_pipe = f"{dst_pipe} | {_decompress_cmd}"
3036 if pv_dst_cmd != "" and pv_dst_cmd != "cat":
3037 dst_pipe = f"{dst_pipe} | {pv_dst_cmd}"
3038 if self.inject_params.get("inject_dst_pipe_fail", False):
3039 # interrupt zfs receive for testing retry/resume; initially forward some bytes and then stop forwarding
3040 dst_pipe = f"{dst_pipe} | dd bs=1024 count={inject_dst_pipe_fail_kbytes} 2>/dev/null"
3041 if self.inject_params.get("inject_dst_pipe_garble", False):
3042 dst_pipe = f"{dst_pipe} | base64" # for testing; forward garbled bytes
3043 if dst_pipe.startswith(" |"):
3044 dst_pipe = dst_pipe[2:] # strip leading ' |' part
3045 if self.inject_params.get("inject_dst_receive_error", False):
3046 recv_cmd = f"{recv_cmd} --injectedGarbageParameter" # for testing; induce CLI parse error
3047 if dst_pipe != "":
3048 dst_pipe = f"{dst_pipe} | {recv_cmd}"
3049 if p.dst.ssh_user_host != "":
3050 dst_pipe = p.shell_program + " -c " + self.dquote(dst_pipe)
3051 else:
3052 dst_pipe = recv_cmd
3054 # If there's no support for shell pipelines, we can't do compression, mbuffering, monitoring and rate-limiting,
3055 # so we fall back to simple zfs send/receive.
3056 if not self.is_program_available("sh", "src"):
3057 src_pipe = send_cmd
3058 if not self.is_program_available("sh", "dst"):
3059 dst_pipe = recv_cmd
3060 if not self.is_program_available("sh", "local"):
3061 local_pipe = ""
3063 src_pipe = self.squote(p.src, src_pipe)
3064 dst_pipe = self.squote(p.dst, dst_pipe)
3065 return src_pipe, local_pipe, dst_pipe
3067 def run_zfs_send_receive(
3068 self,
3069 src_dataset: str,
3070 dst_dataset: str,
3071 send_cmd: List[str],
3072 recv_cmd: List[str],
3073 size_estimate_bytes: int,
3074 size_estimate_human: str,
3075 dry_run_no_send: bool,
3076 error_trigger: Optional[str] = None,
3077 ) -> None:
3078 p, log = self.params, self.params.log
3079 src_pipe, local_pipe, dst_pipe = self.prepare_zfs_send_receive(
3080 src_dataset, send_cmd, recv_cmd, size_estimate_bytes, size_estimate_human
3081 )
3082 conn_pool_name = DEDICATED if self.dedicated_tcp_connection_per_zfs_send else SHARED
3083 src_conn_pool: ConnectionPool = p.connection_pools["src"].pool(conn_pool_name)
3084 src_conn: Connection = src_conn_pool.get_connection()
3085 dst_conn_pool: ConnectionPool = p.connection_pools["dst"].pool(conn_pool_name)
3086 dst_conn: Connection = dst_conn_pool.get_connection()
3087 try:
3088 self.refresh_ssh_connection_if_necessary(p.src, src_conn)
3089 self.refresh_ssh_connection_if_necessary(p.dst, dst_conn)
3090 src_ssh_cmd = " ".join(src_conn.ssh_cmd_quoted)
3091 dst_ssh_cmd = " ".join(dst_conn.ssh_cmd_quoted)
3092 cmd = [p.shell_program_local, "-c", f"{src_ssh_cmd} {src_pipe} {local_pipe} | {dst_ssh_cmd} {dst_pipe}"]
3093 msg = "Would execute: %s" if dry_run_no_send else "Executing: %s"
3094 log.debug(msg, cmd[2].lstrip())
3095 if not dry_run_no_send:
3096 try:
3097 self.maybe_inject_error(cmd=cmd, error_trigger=error_trigger)
3098 process = subprocess.run(cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE, text=True, check=True)
3099 except (subprocess.CalledProcessError, subprocess.TimeoutExpired, UnicodeDecodeError) as e:
3100 no_sleep = False
3101 if not isinstance(e, UnicodeDecodeError):
3102 xprint(log, stderr_to_str(e.stdout), file=sys.stdout)
3103 log.warning("%s", stderr_to_str(e.stderr).rstrip())
3104 if isinstance(e, subprocess.CalledProcessError):
3105 no_sleep = self.clear_resumable_recv_state_if_necessary(dst_dataset, e.stderr)
3106 # op isn't idempotent so retries regather current state from the start of replicate_dataset()
3107 raise RetryableError("Subprocess failed", no_sleep=no_sleep) from e
3108 else:
3109 xprint(log, process.stdout, file=sys.stdout)
3110 xprint(log, process.stderr, file=sys.stderr)
3111 finally:
3112 dst_conn_pool.return_connection(dst_conn)
3113 src_conn_pool.return_connection(src_conn)
3115 def clear_resumable_recv_state_if_necessary(self, dst_dataset: str, stderr: str) -> bool:
3116 def clear_resumable_recv_state() -> bool:
3117 log.warning(p.dry("Aborting an interrupted zfs receive -s, deleting partially received state: %s"), dst_dataset)
3118 cmd = p.split_args(f"{p.dst.sudo} {p.zfs_program} receive -A", dst_dataset)
3119 self.try_ssh_command(p.dst, log_trace, is_dry=p.dry_run, print_stdout=True, cmd=cmd)
3120 log.trace(p.dry("Done Aborting an interrupted zfs receive -s: %s"), dst_dataset)
3121 return True
3123 p, log = self.params, self.params.log
3124 # "cannot resume send: 'wb_src/tmp/src@s1' is no longer the same snapshot used in the initial send"
3125 # "cannot resume send: 'wb_src/tmp/src@s1' used in the initial send no longer exists"
3126 # "cannot resume send: incremental source 0xa000000000000000 no longer exists"
3127 if "cannot resume send" in stderr and (
3128 "is no longer the same snapshot used in the initial send" in stderr
3129 or "used in the initial send no longer exists" in stderr
3130 or re.match(r"incremental source [0-9a-fx]+ no longer exists", stderr)
3131 ):
3132 return clear_resumable_recv_state()
3134 # "cannot receive resume stream: incompatible embedded data stream feature with encrypted receive."
3135 # see https://github.com/openzfs/zfs/issues/12480
3136 # 'cannot receive new filesystem stream: destination xx contains partially-complete state from "zfs receive -s"'
3137 # this indicates that --no-resume-recv detects that dst contains a previously interrupted recv -s
3138 elif "cannot receive" in stderr and (
3139 "cannot receive resume stream: incompatible embedded data stream feature with encrypted receive" in stderr
3140 or 'contains partially-complete state from "zfs receive -s"' in stderr
3141 ):
3142 return clear_resumable_recv_state()
3144 elif ( # this signals normal behavior on interrupt of 'zfs receive -s' if running without --no-resume-recv
3145 "cannot receive new filesystem stream: checksum mismatch or incomplete stream" in stderr
3146 and "Partially received snapshot is saved" in stderr
3147 ):
3148 return True
3150 # "cannot destroy 'wb_dest/tmp/dst@s1': snapshot has dependent clones ... use '-R' to destroy the following
3151 # datasets: wb_dest/tmp/dst/%recv" # see https://github.com/openzfs/zfs/issues/10439#issuecomment-642774560
3152 # This msg indicates a failed 'zfs destroy' via --delete-dst-snapshots. This "clone" is caused by a previously
3153 # interrupted 'zfs receive -s'. The fix used here is to delete the partially received state of said
3154 # 'zfs receive -s' via 'zfs receive -A', followed by an automatic retry, which will now succeed to delete the
3155 # snapshot without user intervention.
3156 elif (
3157 "cannot destroy" in stderr
3158 and "snapshot has dependent clone" in stderr
3159 and "use '-R' to destroy the following dataset" in stderr
3160 and f"\n{dst_dataset}/%recv\n" in stderr
3161 ):
3162 return clear_resumable_recv_state()
3164 # Same cause as above, except that this error can occur during 'zfs rollback'
3165 # Also see https://github.com/openzfs/zfs/blob/master/cmd/zfs/zfs_main.c
3166 elif (
3167 "cannot rollback to" in stderr
3168 and "clones of previous snapshots exist" in stderr
3169 and "use '-R' to force deletion of the following clones and dependents" in stderr
3170 and f"\n{dst_dataset}/%recv\n" in stderr
3171 ):
3172 return clear_resumable_recv_state()
3174 return False
3176 def _recv_resume_token(self, dst_dataset: str, retry_count: int) -> Tuple[Optional[str], List[str], List[str]]:
3177 """Gets recv_resume_token ZFS property from dst_dataset and returns corresponding opts to use for send+recv."""
3178 p, log = self.params, self.params.log
3179 if not p.resume_recv:
3180 return None, [], []
3181 warning = None
3182 if not self.is_zpool_feature_enabled_or_active(p.dst, "feature@extensible_dataset"):
3183 warning = "not available on destination dataset"
3184 elif not self.is_program_available(zfs_version_is_at_least_2_1_0, "dst"):
3185 warning = "unreliable as zfs version is too old" # e.g. zfs-0.8.3 "internal error: Unknown error 1040"
3186 if warning:
3187 log.warning(f"ZFS receive resume feature is {warning}. Falling back to --no-resume-recv: %s", dst_dataset)
3188 return None, [], []
3189 recv_resume_token = None
3190 send_resume_opts = []
3191 if self.dst_dataset_exists[dst_dataset]:
3192 cmd = p.split_args(f"{p.zfs_program} get -Hp -o value -s none receive_resume_token", dst_dataset)
3193 recv_resume_token = self.run_ssh_command(p.dst, log_trace, cmd=cmd).rstrip()
3194 if recv_resume_token == "-" or not recv_resume_token:
3195 recv_resume_token = None
3196 else:
3197 send_resume_opts += ["-n"] if p.dry_run else []
3198 send_resume_opts += ["-v"] if p.verbose_zfs else []
3199 send_resume_opts += ["-t", recv_resume_token]
3200 recv_resume_opts = ["-s"]
3201 return recv_resume_token, send_resume_opts, recv_resume_opts
3203 def mbuffer_cmd(self, loc: str, size_estimate_bytes: int, recordsize: int) -> str:
3204 """If mbuffer command is on the PATH, uses it in the ssh network pipe between 'zfs send' and 'zfs receive' to
3205 smooth out the rate of data flow and prevent bottlenecks caused by network latency or speed fluctuation."""
3206 p = self.params
3207 if (
3208 size_estimate_bytes >= p.min_pipe_transfer_size
3209 and (
3210 (loc == "src" and (p.src.ssh_user_host != "" or p.dst.ssh_user_host != ""))
3211 or (loc == "dst" and (p.src.ssh_user_host != "" or p.dst.ssh_user_host != ""))
3212 or (loc == "local" and p.src.ssh_user_host != "" and p.dst.ssh_user_host != "")
3213 )
3214 and self.is_program_available("mbuffer", loc)
3215 ):
3216 recordsize = max(recordsize, 128 * 1024 if self.is_solaris_zfs_location(loc) else 2 * 1024 * 1024)
3217 return f"{p.mbuffer_program} {' '.join(['-s', str(recordsize)] + p.mbuffer_program_opts)}"
3218 else:
3219 return "cat"
3221 def compress_cmd(self, loc: str, size_estimate_bytes: int) -> str:
3222 """If zstd command is on the PATH, uses it in the ssh network pipe between 'zfs send' and 'zfs receive' to
3223 reduce network bottlenecks by sending compressed data."""
3224 p = self.params
3225 if (
3226 size_estimate_bytes >= p.min_pipe_transfer_size
3227 and (p.src.ssh_user_host != "" or p.dst.ssh_user_host != "")
3228 and self.is_program_available("zstd", loc)
3229 ):
3230 return f"{p.compression_program} {' '.join(p.compression_program_opts)}"
3231 else:
3232 return "cat"
3234 def decompress_cmd(self, loc: str, size_estimate_bytes: int) -> str:
3235 p = self.params
3236 if (
3237 size_estimate_bytes >= p.min_pipe_transfer_size
3238 and (p.src.ssh_user_host != "" or p.dst.ssh_user_host != "")
3239 and self.is_program_available("zstd", loc)
3240 ):
3241 return f"{p.compression_program} -dc"
3242 else:
3243 return "cat"
3245 worker_thread_number_regex: re.Pattern = re.compile(r"ThreadPoolExecutor-\d+_(\d+)")
3247 def pv_cmd(self, loc: str, size_estimate_bytes: int, size_estimate_human: str, disable_progress_bar=False) -> str:
3248 """If pv command is on the PATH, monitors the progress of data transfer from 'zfs send' to 'zfs receive'.
3249 Progress can be viewed via "tail -f $pv_log_file" aka tail -f ~/bzfs-logs/current.pv or similar."""
3250 p = self.params
3251 if self.is_program_available("pv", loc):
3252 size = f"--size={size_estimate_bytes}"
3253 if disable_progress_bar or size_estimate_bytes == 0:
3254 size = ""
3255 readable = shlex.quote(size_estimate_human)
3256 pv_log_file = p.log_params.pv_log_file
3257 thread_name = threading.current_thread().name
3258 match = Job.worker_thread_number_regex.fullmatch(thread_name)
3259 if match:
3260 worker = int(match.group(1))
3261 if worker > 0:
3262 pv_log_file += pv_file_thread_separator + f"{worker:04}"
3263 if self.is_first_replication_task.get_and_set(False):
3264 if self.isatty and not p.quiet:
3265 self.progress_reporter.start()
3266 self.replication_start_time_nanos = time.time_ns()
3267 if self.isatty and not p.quiet:
3268 self.progress_reporter.enqueue_pv_log_file(pv_log_file)
3269 pv_program_opts = p.pv_program_opts
3270 if self.progress_update_intervals is not None: # for testing
3271 pv_program_opts = pv_program_opts + [f"--interval={self.progress_update_intervals[0]}"]
3272 return f"{p.pv_program} {' '.join(pv_program_opts)} --force --name={readable} {size} 2>> {pv_log_file}"
3273 else:
3274 return "cat"
3276 def run_ssh_command(
3277 self, remote: Remote, level: int = -1, is_dry=False, check=True, print_stdout=False, print_stderr=True, cmd=None
3278 ) -> str:
3279 """Runs the given cmd via ssh on the given remote, and returns stdout. The full command is the concatenation
3280 of both the command to run on the localhost in order to talk to the remote host ($remote.local_ssh_command())
3281 and the command to run on the given remote host ($cmd)."""
3282 level = level if level >= 0 else logging.INFO
3283 assert cmd is not None and isinstance(cmd, list) and len(cmd) > 0
3284 p, log = self.params, self.params.log
3285 quoted_cmd = [shlex.quote(arg) for arg in cmd]
3286 conn_pool: ConnectionPool = p.connection_pools[remote.location].pool(SHARED)
3287 conn: Connection = conn_pool.get_connection()
3288 try:
3289 ssh_cmd: List[str] = conn.ssh_cmd
3290 if remote.ssh_user_host != "":
3291 self.refresh_ssh_connection_if_necessary(remote, conn)
3292 cmd = quoted_cmd
3293 msg = "Would execute: %s" if is_dry else "Executing: %s"
3294 log.log(level, msg, list_formatter(conn.ssh_cmd_quoted + quoted_cmd, lstrip=True))
3295 if is_dry:
3296 return ""
3297 try:
3298 process = subprocess.run(ssh_cmd + cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE, text=True, check=check)
3299 except (subprocess.CalledProcessError, subprocess.TimeoutExpired, UnicodeDecodeError) as e:
3300 if not isinstance(e, UnicodeDecodeError): 3300 ↛ 3303line 3300 didn't jump to line 3303 because the condition on line 3300 was always true
3301 xprint(log, stderr_to_str(e.stdout), run=print_stdout, end="")
3302 xprint(log, stderr_to_str(e.stderr), run=print_stderr, end="")
3303 raise
3304 else:
3305 xprint(log, process.stdout, run=print_stdout, end="")
3306 xprint(log, process.stderr, run=print_stderr, end="")
3307 return process.stdout
3308 finally:
3309 conn_pool.return_connection(conn)
3311 def try_ssh_command(
3312 self, remote: Remote, level: int, is_dry=False, print_stdout=False, cmd=None, exists=True, error_trigger=None
3313 ):
3314 """Convenience method that helps retry/react to a dataset or pool that potentially doesn't exist anymore."""
3315 log = self.params.log
3316 try:
3317 self.maybe_inject_error(cmd=cmd, error_trigger=error_trigger)
3318 return self.run_ssh_command(remote, level=level, is_dry=is_dry, print_stdout=print_stdout, cmd=cmd)
3319 except (subprocess.CalledProcessError, subprocess.TimeoutExpired, UnicodeDecodeError) as e:
3320 if not isinstance(e, UnicodeDecodeError): 3320 ↛ 3330line 3320 didn't jump to line 3330 because the condition on line 3320 was always true
3321 stderr = stderr_to_str(e.stderr)
3322 if exists and (
3323 ": dataset does not exist" in stderr
3324 or ": filesystem does not exist" in stderr # solaris 11.4.0
3325 or ": does not exist" in stderr # solaris 11.4.0 'zfs send' with missing snapshot
3326 or ": no such pool" in stderr
3327 ):
3328 return None
3329 log.warning("%s", stderr.rstrip())
3330 raise RetryableError("Subprocess failed") from e
3332 def refresh_ssh_connection_if_necessary(self, remote: Remote, conn) -> None:
3333 conn: Connection = conn
3334 p, log = self.params, self.params.log
3335 if remote.ssh_user_host == "":
3336 return # we're in local mode; no ssh required
3337 if not self.is_program_available("ssh", "local"):
3338 die(f"{p.ssh_program} CLI is not available to talk to remote host. Install {p.ssh_program} first!")
3339 if not remote.reuse_ssh_connection:
3340 return
3341 # Performance: reuse ssh connection for low latency startup of frequent ssh invocations via the 'ssh -S' and
3342 # 'ssh -S -M -oControlPersist=60s' options. See https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Multiplexing
3343 control_persist_limit_nanos = (self.control_persist_secs - self.control_persist_margin_secs) * 1_000_000_000
3344 now = time.time_ns() # no real need to compute this inside the critical section of conn.lock
3345 with conn.lock:
3346 if now - conn.last_refresh_time < control_persist_limit_nanos:
3347 return # ssh master is alive, reuse its TCP connection (this is the common case & the ultra-fast path)
3348 ssh_cmd = conn.ssh_cmd
3349 ssh_socket_cmd = ssh_cmd[0:-1] # omit trailing ssh_user_host
3350 ssh_socket_cmd += ["-O", "check", remote.ssh_user_host]
3351 # extend lifetime of ssh master by $control_persist_secs via 'ssh -O check' if master is still running.
3352 # 'ssh -S /path/to/socket -O check' doesn't talk over the network, hence is still a low latency fast path.
3353 if subprocess.run(ssh_socket_cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE, text=True).returncode == 0:
3354 log.trace("ssh connection is alive: %s", list_formatter(ssh_socket_cmd))
3355 else: # ssh master is not alive; start a new master:
3356 log.trace("ssh connection is not yet alive: %s", list_formatter(ssh_socket_cmd))
3357 ssh_socket_cmd = ssh_cmd[0:-1] # omit trailing ssh_user_host
3358 ssh_socket_cmd += ["-M", f"-oControlPersist={self.control_persist_secs}s", remote.ssh_user_host, "exit"]
3359 log.trace("Executing: %s", list_formatter(ssh_socket_cmd))
3360 process = subprocess.run(ssh_socket_cmd, stdin=DEVNULL, stderr=PIPE, text=True)
3361 if process.returncode != 0:
3362 log.error("%s", process.stderr.rstrip())
3363 die(
3364 f"Cannot ssh into remote host via '{' '.join(ssh_socket_cmd)}'. Fix ssh configuration "
3365 f"first, considering diagnostic log file output from running {prog_name} with: "
3366 "-v -v --ssh-src-extra-opts='-v -v' --ssh-dst-extra-opts='-v -v'"
3367 )
3368 conn.last_refresh_time = time.time_ns()
3370 def maybe_inject_error(self, cmd=None, error_trigger: Optional[str] = None) -> None:
3371 """For testing only; for unit tests to simulate errors during replication and test correct handling of them."""
3372 if error_trigger:
3373 counter = self.error_injection_triggers.get("before")
3374 if counter and self.decrement_injection_counter(counter, error_trigger):
3375 try:
3376 raise CalledProcessError(returncode=1, cmd=" ".join(cmd), stderr=error_trigger + ":dataset is busy")
3377 except subprocess.CalledProcessError as e:
3378 if error_trigger.startswith("retryable_"):
3379 raise RetryableError("Subprocess failed") from e
3380 else:
3381 raise
3383 def maybe_inject_delete(self, remote: Remote, dataset=None, delete_trigger=None) -> None:
3384 """For testing only; for unit tests to delete datasets during replication and test correct handling of that."""
3385 assert delete_trigger
3386 counter = self.delete_injection_triggers.get("before")
3387 if counter and self.decrement_injection_counter(counter, delete_trigger):
3388 p = self.params
3389 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} destroy -r", p.force_unmount, p.force_hard, dataset)
3390 self.run_ssh_command(remote, log_debug, print_stdout=True, cmd=cmd)
3392 def maybe_inject_params(self, error_trigger: str) -> None:
3393 """For testing only; for unit tests to simulate errors during replication and test correct handling of them."""
3394 assert error_trigger
3395 counter = self.error_injection_triggers.get("before")
3396 if counter and self.decrement_injection_counter(counter, error_trigger):
3397 self.inject_params = self.param_injection_triggers[error_trigger]
3398 elif error_trigger in self.param_injection_triggers:
3399 self.inject_params = {}
3401 def decrement_injection_counter(self, counter: Counter, trigger: str) -> bool:
3402 """For testing only."""
3403 with self.injection_lock:
3404 if counter[trigger] <= 0:
3405 return False
3406 counter[trigger] -= 1
3407 return True
3409 def squote(self, remote: Remote, arg: str) -> str:
3410 return arg if remote.ssh_user_host == "" else shlex.quote(arg)
3412 def dquote(self, arg: str) -> str:
3413 """shell-escapes double quotes and backticks, then surrounds with double quotes."""
3414 return '"' + arg.replace('"', '\\"').replace("`", "\\`") + '"'
3416 def filter_datasets(self, remote: Remote, sorted_datasets: List[str]) -> List[str]:
3417 """Returns all datasets (and their descendants) that match at least one of the include regexes but none of the
3418 exclude regexes. Assumes the list of input datasets is sorted. The list of output datasets will be sorted too."""
3419 p, log = self.params, self.params.log
3420 results = []
3421 for i, dataset in enumerate(sorted_datasets):
3422 if i == 0 and p.skip_parent:
3423 continue
3424 rel_dataset = relativize_dataset(dataset, remote.root_dataset)
3425 if rel_dataset.startswith("/"):
3426 rel_dataset = rel_dataset[1:] # strip leading '/' char if any
3427 if is_included(rel_dataset, p.include_dataset_regexes, p.exclude_dataset_regexes):
3428 results.append(dataset)
3429 log.debug("Including b/c dataset regex: %s", dataset)
3430 else:
3431 log.debug("Excluding b/c dataset regex: %s", dataset)
3432 if p.exclude_dataset_property:
3433 results = self.filter_datasets_by_exclude_property(remote, results)
3434 is_debug = p.log.isEnabledFor(log_debug)
3435 for dataset in results:
3436 is_debug and log.debug(f"Finally included {remote.location} dataset: %s", dataset)
3437 if self.is_test_mode:
3438 # Asserts the following: If a dataset is excluded its descendants are automatically excluded too, and this
3439 # decision is never reconsidered even for the descendants because exclude takes precedence over include.
3440 resultset = set(results)
3441 root_datasets = [dataset for dataset in results if os.path.dirname(dataset) not in resultset] # have no parent
3442 for dataset in results: # each dataset belongs to a subtree rooted at one of the roots
3443 assert any(is_descendant(dataset, of_root_dataset=root) for root in root_datasets)
3444 return results
3446 def filter_datasets_by_exclude_property(self, remote: Remote, sorted_datasets: List[str]) -> List[str]:
3447 """Excludes datasets that are marked with a ZFS user property value that, in effect, says 'skip me'."""
3448 p, log = self.params, self.params.log
3449 results = []
3450 localhostname = None
3451 skip_dataset = DONT_SKIP_DATASET
3452 for dataset in sorted_datasets:
3453 if is_descendant(dataset, of_root_dataset=skip_dataset):
3454 # skip_dataset shall be ignored or has been deleted by some third party while we're running
3455 continue # nothing to do anymore for this dataset subtree (note that datasets is sorted)
3456 skip_dataset = DONT_SKIP_DATASET
3457 # TODO perf: on zfs >= 2.3 use json via zfs list -j to safely merge all zfs list's into one 'zfs list' call
3458 cmd = p.split_args(f"{p.zfs_program} list -t filesystem,volume -Hp -o {p.exclude_dataset_property}", dataset)
3459 self.maybe_inject_delete(remote, dataset=dataset, delete_trigger="zfs_list_exclude_property")
3460 property_value = self.try_ssh_command(remote, log_trace, cmd=cmd)
3461 if property_value is None:
3462 log.warning(f"Third party deleted {remote.location}: %s", dataset)
3463 skip_dataset = dataset
3464 else:
3465 reason = ""
3466 property_value = property_value.strip()
3467 if not property_value or property_value == "-" or property_value.lower() == "true":
3468 sync = True
3469 elif property_value.lower() == "false":
3470 sync = False
3471 else:
3472 localhostname = localhostname or socket.gethostname()
3473 sync = any(localhostname == hostname.strip() for hostname in property_value.split(","))
3474 reason = f", localhostname: {localhostname}, hostnames: {property_value}"
3476 if sync:
3477 results.append(dataset)
3478 log.debug("Including b/c dataset prop: %s%s", dataset, reason)
3479 else:
3480 skip_dataset = dataset
3481 log.debug("Excluding b/c dataset prop: %s%s", dataset, reason)
3482 return results
3484 def filter_snapshots(self, basis_snapshots: List[str], all_except: bool = False) -> List[str]:
3485 """Returns all snapshots that pass all include/exclude policies."""
3487 def resolve_timerange(timerange: UnixTimeRange) -> UnixTimeRange:
3488 assert timerange is not None
3489 lo, hi = timerange
3490 if isinstance(lo, timedelta):
3491 lo = ceil(current_unixtime_in_secs - lo.total_seconds())
3492 if isinstance(hi, timedelta):
3493 hi = ceil(current_unixtime_in_secs - hi.total_seconds())
3494 assert isinstance(lo, int)
3495 assert isinstance(hi, int)
3496 return (lo, hi) if lo <= hi else (hi, lo)
3498 p, log = self.params, self.params.log
3499 current_unixtime_in_secs: float = p.create_src_snapshots_config.current_datetime.timestamp()
3500 resultset = set()
3501 for snapshot_filter in p.snapshot_filters:
3502 snapshots = basis_snapshots
3503 for _filter in snapshot_filter:
3504 name = _filter.name
3505 if name == snapshot_regex_filter_name:
3506 snapshots = self.filter_snapshots_by_regex(snapshots, regexes=_filter.options)
3507 elif name == "include_snapshot_times":
3508 timerange = resolve_timerange(_filter.timerange) if _filter.timerange is not None else _filter.timerange
3509 snapshots = self.filter_snapshots_by_creation_time(snapshots, include_snapshot_times=timerange)
3510 else:
3511 assert name == "include_snapshot_times_and_ranks"
3512 timerange = resolve_timerange(_filter.timerange) if _filter.timerange is not None else _filter.timerange
3513 snapshots = self.filter_snapshots_by_creation_time_and_rank(
3514 snapshots, include_snapshot_times=timerange, include_snapshot_ranks=_filter.options
3515 )
3516 resultset.update(snapshots) # union
3517 snapshots = [line for line in basis_snapshots if "#" in line or (line in resultset) != all_except]
3518 is_debug = p.log.isEnabledFor(log_debug)
3519 for snapshot in snapshots:
3520 is_debug and log.debug("Finally included snapshot: %s", snapshot[snapshot.rindex("\t") + 1 :])
3521 return snapshots
3523 def filter_snapshots_by_regex(self, snapshots: List[str], regexes: Tuple[RegexList, RegexList]) -> List[str]:
3524 """Returns all snapshots that match at least one of the include regexes but none of the exclude regexes."""
3525 exclude_snapshot_regexes, include_snapshot_regexes = regexes
3526 p, log = self.params, self.params.log
3527 is_debug = log.isEnabledFor(log_debug)
3528 results = []
3529 for snapshot in snapshots:
3530 i = snapshot.find("@") # snapshot separator
3531 if i < 0:
3532 continue # retain bookmarks to help find common snapshots, apply filter only to snapshots
3533 elif is_included(snapshot[i + 1 :], include_snapshot_regexes, exclude_snapshot_regexes):
3534 results.append(snapshot)
3535 is_debug and log.debug("Including b/c snapshot regex: %s", snapshot[snapshot.rindex("\t") + 1 :])
3536 else:
3537 is_debug and log.debug("Excluding b/c snapshot regex: %s", snapshot[snapshot.rindex("\t") + 1 :])
3538 return results
3540 def filter_snapshots_by_creation_time(self, snaps: List[str], include_snapshot_times: UnixTimeRange) -> List[str]:
3541 p, log = self.params, self.params.log
3542 is_debug = log.isEnabledFor(log_debug)
3543 lo_snaptime, hi_snaptime = include_snapshot_times or (0, unixtime_infinity_secs)
3544 results = []
3545 for snapshot in snaps:
3546 if "@" not in snapshot:
3547 continue # retain bookmarks to help find common snapshots, apply filter only to snapshots
3548 elif lo_snaptime <= int(snapshot[0 : snapshot.index("\t")]) < hi_snaptime:
3549 results.append(snapshot)
3550 is_debug and log.debug("Including b/c creation time: %s", snapshot[snapshot.rindex("\t") + 1 :])
3551 else:
3552 is_debug and log.debug("Excluding b/c creation time: %s", snapshot[snapshot.rindex("\t") + 1 :])
3553 return results
3555 def filter_snapshots_by_creation_time_and_rank(
3556 self, snapshots: List[str], include_snapshot_times: UnixTimeRange, include_snapshot_ranks: List[RankRange]
3557 ) -> List[str]:
3559 def get_idx(rank: Tuple[str, int, bool], n: int) -> int:
3560 kind, num, is_percent = rank
3561 m = round(n * num / 100) if is_percent else min(n, num)
3562 assert kind == "latest" or kind == "oldest"
3563 return m if kind == "oldest" else n - m
3565 assert isinstance(include_snapshot_ranks, list)
3566 assert len(include_snapshot_ranks) > 0
3567 p, log = self.params, self.params.log
3568 is_debug = log.isEnabledFor(log_debug)
3569 lo_time, hi_time = include_snapshot_times or (0, unixtime_infinity_secs)
3570 n = sum(1 for snapshot in snapshots if "@" in snapshot)
3571 for rank_range in include_snapshot_ranks:
3572 lo_rank, hi_rank = rank_range
3573 lo = get_idx(lo_rank, n)
3574 hi = get_idx(hi_rank, n)
3575 lo, hi = (lo, hi) if lo <= hi else (hi, lo)
3576 i = 0
3577 results = []
3578 for snapshot in snapshots:
3579 if "@" not in snapshot:
3580 continue # retain bookmarks to help find common snapshots, apply filter only to snapshots
3581 else:
3582 msg = None
3583 if lo <= i < hi:
3584 msg = "Including b/c snapshot rank: %s"
3585 elif lo_time <= int(snapshot[0 : snapshot.index("\t")]) < hi_time:
3586 msg = "Including b/c creation time: %s"
3587 if msg:
3588 results.append(snapshot)
3589 else:
3590 msg = "Excluding b/c snapshot rank: %s"
3591 is_debug and log.debug(msg, snapshot[snapshot.rindex("\t") + 1 :])
3592 i += 1
3593 snapshots = results
3594 n = hi - lo
3595 return snapshots
3597 def filter_properties(self, props: Dict[str, str], include_regexes, exclude_regexes) -> Dict[str, str]:
3598 """Returns ZFS props whose name matches at least one of the include regexes but none of the exclude regexes."""
3599 p, log = self.params, self.params.log
3600 is_debug = log.isEnabledFor(log_debug)
3601 results = {}
3602 for propname, propvalue in props.items():
3603 if is_included(propname, include_regexes, exclude_regexes):
3604 results[propname] = propvalue
3605 is_debug and log.debug("Including b/c property regex: %s", propname)
3606 else:
3607 is_debug and log.debug("Excluding b/c property regex: %s", propname)
3608 return results
3610 @staticmethod
3611 def filter_lines(input_list: Iterable[str], input_set: Set[str]) -> List[str]:
3612 """For each line in input_list, includes the line if input_set contains the first column field of that line."""
3613 if len(input_set) == 0:
3614 return []
3615 return [line for line in input_list if line[0 : line.index("\t")] in input_set]
3617 def delete_snapshots(self, remote: Remote, dataset: str, snapshot_tags: List[str]) -> None:
3618 if len(snapshot_tags) == 0:
3619 return
3620 p, log = self.params, self.params.log
3621 log.info(p.dry(f"Deleting {len(snapshot_tags)} snapshots within %s: %s"), dataset, snapshot_tags)
3622 # delete snapshots in batches without creating a command line that's too big for the OS to handle
3623 self.run_ssh_cmd_batched(
3624 remote,
3625 self.delete_snapshot_cmd(remote, dataset + "@"),
3626 snapshot_tags,
3627 lambda batch: self.delete_snapshot(remote, dataset, dataset + "@" + ",".join(batch)),
3628 max_batch_items=1 if self.is_solaris_zfs(remote) else self.params.max_snapshots_per_minibatch_on_delete_snaps,
3629 sep=",",
3630 )
3632 def delete_snapshot(self, r: Remote, dataset: str, snaps_to_delete: str) -> None:
3633 p, log = self.params, self.params.log
3634 cmd = self.delete_snapshot_cmd(r, snaps_to_delete)
3635 is_dry = p.dry_run and self.is_solaris_zfs(r) # solaris-11.4 knows no 'zfs destroy -n' flag
3636 try:
3637 self.maybe_inject_error(cmd=cmd, error_trigger="zfs_delete_snapshot")
3638 self.run_ssh_command(r, log_debug, is_dry=is_dry, print_stdout=True, cmd=cmd)
3639 except (subprocess.CalledProcessError, subprocess.TimeoutExpired, UnicodeDecodeError) as e:
3640 stderr = stderr_to_str(e.stderr) if hasattr(e, "stderr") else ""
3641 no_sleep = self.clear_resumable_recv_state_if_necessary(dataset, stderr)
3642 # op isn't idempotent so retries regather current state from the start
3643 raise RetryableError("Subprocess failed", no_sleep=no_sleep) from e
3645 def delete_snapshot_cmd(self, r: Remote, snaps_to_delete: str) -> List[str]:
3646 p = self.params
3647 return p.split_args(
3648 f"{r.sudo} {p.zfs_program} destroy", p.force_hard, p.verbose_destroy, p.dry_run_destroy, snaps_to_delete
3649 )
3651 def delete_bookmarks(self, remote: Remote, dataset: str, snapshot_tags: List[str]) -> None:
3652 if len(snapshot_tags) == 0:
3653 return
3654 # Unfortunately ZFS has no syntax yet to delete multiple bookmarks in a single CLI invocation
3655 p, log = self.params, self.params.log
3656 log.info(
3657 p.dry(f"Deleting {len(snapshot_tags)} bookmarks within %s: %s"), dataset, dataset + "#" + ",".join(snapshot_tags)
3658 )
3659 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} destroy")
3660 self.run_ssh_cmd_parallel(
3661 remote,
3662 [(cmd, [f"{dataset}#{snapshot_tag}"]) for snapshot_tag in snapshot_tags],
3663 lambda _cmd, batch: self.try_ssh_command(
3664 remote, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=_cmd + batch, exists=False
3665 ),
3666 )
3668 def delete_datasets(self, remote: Remote, datasets: Iterable[str]) -> None:
3669 """Deletes the given datasets via zfs destroy -r on the given remote."""
3670 # Impl is batch optimized to minimize CLI + network roundtrips: only need to run zfs destroy if previously
3671 # destroyed dataset (within sorted datasets) is not a prefix (aka ancestor) of current dataset
3672 p, log = self.params, self.params.log
3673 last_deleted_dataset = DONT_SKIP_DATASET
3674 for dataset in sorted(datasets):
3675 if is_descendant(dataset, of_root_dataset=last_deleted_dataset):
3676 continue
3677 log.info(p.dry("Deleting dataset tree: %s"), f"{dataset} ...")
3678 cmd = p.split_args(
3679 f"{remote.sudo} {p.zfs_program} destroy -r {p.force_unmount} {p.force_hard} {p.verbose_destroy}",
3680 p.dry_run_destroy,
3681 dataset,
3682 )
3683 is_dry = p.dry_run and self.is_solaris_zfs(remote) # solaris-11.4 knows no 'zfs destroy -n' flag
3684 self.run_ssh_command(remote, log_debug, is_dry=is_dry, print_stdout=True, cmd=cmd)
3685 last_deleted_dataset = dataset
3687 def create_filesystem(self, filesystem: str) -> None:
3688 # zfs create -p -u $filesystem
3689 # To ensure the filesystems that we create do not get mounted, we apply a separate 'zfs create -p -u'
3690 # invocation for each non-existing ancestor. This is because a single 'zfs create -p -u' applies the '-u'
3691 # part only to the immediate filesystem, rather than to the not-yet existing ancestors.
3692 p, log = self.params, self.params.log
3693 parent = ""
3694 no_mount = "-u" if self.is_program_available(zfs_version_is_at_least_2_1_0, "dst") else ""
3695 for component in filesystem.split("/"):
3696 parent += component
3697 if not self.dst_dataset_exists[parent]:
3698 cmd = p.split_args(f"{p.dst.sudo} {p.zfs_program} create -p", no_mount, parent)
3699 try:
3700 self.run_ssh_command(p.dst, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=cmd)
3701 except subprocess.CalledProcessError as e:
3702 # ignore harmless error caused by 'zfs create' without the -u flag, or by dataset already existing
3703 if ( 3703 ↛ 3709line 3703 didn't jump to line 3709 because the condition on line 3703 was never true
3704 "filesystem successfully created, but it may only be mounted by root" not in e.stderr
3705 and "filesystem successfully created, but not mounted" not in e.stderr # SolarisZFS
3706 and "dataset already exists" not in e.stderr
3707 and "filesystem already exists" not in e.stderr # SolarisZFS?
3708 ):
3709 raise
3710 if not p.dry_run:
3711 self.dst_dataset_exists[parent] = True
3712 parent += "/"
3714 def create_zfs_bookmark(self, remote: Remote, src_snapshot: str, src_dataset: str) -> None:
3715 p, log = self.params, self.params.log
3716 assert "@" in src_snapshot
3717 bookmark = replace_prefix(src_snapshot, old_prefix=f"{src_dataset}@", new_prefix=f"{src_dataset}#")
3718 if p.create_bookmark and self.are_bookmarks_enabled(remote):
3719 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} bookmark", src_snapshot, bookmark)
3720 try:
3721 self.run_ssh_command(remote, log_debug, is_dry=p.dry_run, print_stderr=False, cmd=cmd)
3722 except subprocess.CalledProcessError as e:
3723 # ignore harmless zfs error caused by bookmark with the same name already existing
3724 if ": bookmark exists" not in e.stderr: 3724 ↛ 3725line 3724 didn't jump to line 3725 because the condition on line 3724 was never true
3725 print(e.stderr, file=sys.stderr, end="")
3726 raise
3728 def estimate_send_size(self, remote: Remote, dst_dataset: str, recv_resume_token: str, *items) -> int:
3729 """Estimates num bytes to transfer via 'zfs send'."""
3730 p, log = self.params, self.params.log
3731 if p.no_estimate_send_size or self.is_solaris_zfs(remote):
3732 return 0 # solaris-11.4 does not have a --parsable equivalent
3733 zfs_send_program_opts = ["--parsable" if opt == "-P" else opt for opt in p.curr_zfs_send_program_opts]
3734 zfs_send_program_opts = append_if_absent(zfs_send_program_opts, "-v", "-n", "--parsable")
3735 if recv_resume_token:
3736 zfs_send_program_opts = ["-Pnv", "-t", recv_resume_token]
3737 items = ""
3738 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} send", zfs_send_program_opts, items)
3739 try:
3740 lines = self.try_ssh_command(remote, log_trace, cmd=cmd)
3741 except RetryableError as retryable_error:
3742 if recv_resume_token:
3743 e = retryable_error.__cause__
3744 stderr = stderr_to_str(e.stderr) if hasattr(e, "stderr") else ""
3745 retryable_error.no_sleep = self.clear_resumable_recv_state_if_necessary(dst_dataset, stderr)
3746 # op isn't idempotent so retries regather current state from the start of replicate_dataset()
3747 raise retryable_error
3748 if lines is None: 3748 ↛ 3749line 3748 didn't jump to line 3749 because the condition on line 3748 was never true
3749 return 0 # src dataset or snapshot has been deleted by third party
3750 size = lines.splitlines()[-1]
3751 assert size.startswith("size")
3752 return int(size[size.index("\t") + 1 :])
3754 def dataset_regexes(self, datasets: List[str]) -> List[str]:
3755 src, dst = self.params.src, self.params.dst
3756 results = []
3757 for dataset in datasets:
3758 if dataset.startswith("/"):
3759 # it's an absolute dataset - convert it to a relative dataset
3760 dataset = dataset[1:]
3761 if is_descendant(dataset, of_root_dataset=src.root_dataset):
3762 dataset = relativize_dataset(dataset, src.root_dataset)
3763 elif is_descendant(dataset, of_root_dataset=dst.root_dataset):
3764 dataset = relativize_dataset(dataset, dst.root_dataset)
3765 else:
3766 continue # ignore datasets that make no difference
3767 if dataset.startswith("/"):
3768 dataset = dataset[1:]
3769 if dataset.endswith("/"):
3770 dataset = dataset[0:-1]
3771 if dataset:
3772 regex = re.escape(dataset)
3773 else:
3774 regex = ".*"
3775 results.append(regex)
3776 return results
3778 def run_with_retries(self, policy: RetryPolicy, fn: Callable, *args, **kwargs) -> Any:
3779 """Runs the given function with the given arguments, and retries on failure as indicated by policy."""
3780 log = self.params.log
3781 max_sleep_mark = policy.min_sleep_nanos
3782 retry_count = 0
3783 sysrandom = None
3784 start_time_nanos = time.time_ns()
3785 while True:
3786 try:
3787 return fn(*args, **kwargs, retry=Retry(retry_count)) # Call the target function with provided args
3788 except RetryableError as retryable_error:
3789 elapsed_nanos = time.time_ns() - start_time_nanos
3790 if retry_count < policy.retries and elapsed_nanos < policy.max_elapsed_nanos:
3791 retry_count += 1
3792 if retryable_error.no_sleep and retry_count <= 1:
3793 log.info(f"Retrying [{retry_count}/{policy.retries}] immediately ...")
3794 continue
3795 # pick a random sleep duration within the range [min_sleep_nanos, max_sleep_mark] as delay
3796 sysrandom = sysrandom if sysrandom is not None else random.SystemRandom()
3797 sleep_nanos = sysrandom.randint(policy.min_sleep_nanos, max_sleep_mark)
3798 log.info(f"Retrying [{retry_count}/{policy.retries}] in {human_readable_duration(sleep_nanos)} ...")
3799 time.sleep(sleep_nanos / 1_000_000_000)
3800 max_sleep_mark = min(policy.max_sleep_nanos, 2 * max_sleep_mark) # exponential backoff with cap
3801 else:
3802 if policy.retries > 0:
3803 log.warning(
3804 f"Giving up because the last [{retry_count}/{policy.retries}] retries across "
3805 f"[{elapsed_nanos // 1_000_000_000}/{policy.max_elapsed_nanos // 1_000_000_000}] "
3806 "seconds for the current request failed!"
3807 )
3808 raise retryable_error.__cause__
3810 def incremental_send_steps_wrapper(
3811 self, src_snapshots: List[str], src_guids: List[str], included_guids: Set[str], is_resume: bool
3812 ) -> List[Tuple[str, str, str, int]]:
3813 force_convert_I_to_i = self.params.src.use_zfs_delegation and not getenv_bool("no_force_convert_I_to_i", True)
3814 # force_convert_I_to_i == True implies that:
3815 # If using 'zfs allow' delegation mechanism, force convert 'zfs send -I' to a series of
3816 # 'zfs send -i' as a workaround for zfs issue https://github.com/openzfs/zfs/issues/16394
3817 return self.incremental_send_steps(src_snapshots, src_guids, included_guids, is_resume, force_convert_I_to_i)
3819 def incremental_send_steps(
3820 self, src_snapshots: List[str], src_guids: List[str], included_guids: Set[str], is_resume, force_convert_I_to_i
3821 ) -> List[Tuple[str, str, str, int]]:
3822 """Computes steps to incrementally replicate the given src snapshots with the given src_guids such that we
3823 include intermediate src snapshots that pass the policy specified by --{include,exclude}-snapshot-*
3824 (represented here by included_guids), using an optimal series of -i/-I send/receive steps that skip
3825 excluded src snapshots. The steps are optimal in the sense that no solution with fewer steps exists. A step
3826 corresponds to a single ZFS send/receive operation. Fewer steps translate to better performance, especially
3827 when sending many small snapshots. For example, 1 step that sends 100 small snapshots in a single operation is
3828 much faster than 100 steps that each send only 1 such snapshot per ZFS send/receive operation.
3829 Example: skip hourly snapshots and only include daily shapshots for replication
3830 Example: [d1, h1, d2, d3, d4] (d is daily, h is hourly) --> [d1, d2, d3, d4] via
3831 -i d1:d2 (i.e. exclude h1; '-i' and ':' indicate 'skip intermediate snapshots')
3832 -I d2-d4 (i.e. also include d3; '-I' and '-' indicate 'include intermediate snapshots')
3833 * The force_convert_I_to_i param is necessary as a work-around for https://github.com/openzfs/zfs/issues/16394
3834 * 'zfs send' CLI with a bookmark as starting snapshot does not (yet) support including intermediate
3835 src_snapshots via -I flag per https://github.com/openzfs/zfs/issues/12415. Thus, if the replication source
3836 is a bookmark we convert a -I step to a -i step followed by zero or more -i/-I steps.
3837 * The is_resume param is necessary as 'zfs send -t' does not support sending more than a single snapshot
3838 on resuming a previously interrupted 'zfs receive -s'. Thus, here too, we convert a -I step to a -i step
3839 followed by zero or more -i/-I steps."""
3841 def append_run(i: int, label: str) -> int:
3842 step = ("-I", src_snapshots[start], src_snapshots[i], i - start)
3843 # print(f"{label} {self.send_step_to_str(step)}")
3844 is_not_resume = len(steps) > 0 or not is_resume
3845 if i - start > 1 and not force_convert_I_to_i and "@" in src_snapshots[start] and is_not_resume:
3846 steps.append(step)
3847 elif "@" in src_snapshots[start] and is_not_resume:
3848 for j in range(start, i): # convert -I step to -i steps
3849 steps.append(("-i", src_snapshots[j], src_snapshots[j + 1], 1))
3850 else: # it's a bookmark src or zfs send -t; convert -I step to -i step followed by zero or more -i/-I steps
3851 steps.append(("-i", src_snapshots[start], src_snapshots[start + 1], 1))
3852 i = start + 1
3853 return i - 1
3855 assert len(src_guids) == len(src_snapshots)
3856 assert len(included_guids) >= 0
3857 steps = []
3858 guids = src_guids
3859 n = len(guids)
3860 i = 0
3861 while i < n and guids[i] not in included_guids: # skip hourlies
3862 i += 1
3864 while i < n:
3865 assert guids[i] in included_guids # it's a daily
3866 start = i
3867 i += 1
3868 while i < n and guids[i] in included_guids: # skip dailies
3869 i += 1
3870 if i < n:
3871 if i - start == 1:
3872 # it's a single daily (that was already replicated) followed by an hourly
3873 i += 1
3874 while i < n and guids[i] not in included_guids: # skip hourlies
3875 i += 1
3876 if i < n:
3877 assert start != i
3878 step = ("-i", src_snapshots[start], src_snapshots[i], 1)
3879 # print(f"r1 {self.send_step_to_str(step)}")
3880 steps.append(step)
3881 i -= 1
3882 else: # it's a run of more than one daily
3883 i -= 1
3884 assert start != i
3885 i = append_run(i, "r2")
3886 else: # finish up run of trailing dailies
3887 i -= 1
3888 if start != i:
3889 i = append_run(i, "r3")
3890 i += 1
3891 return steps
3893 @staticmethod
3894 def send_step_to_str(step: Tuple[str, str, str]) -> str:
3895 # return str(step[1]) + ('-' if step[0] == '-I' else ':') + str(step[2])
3896 return str(step)
3898 def zfs_set(self, properties: List[str], remote: Remote, dataset: str) -> None:
3899 """Applies the given property key=value pairs via 'zfs set' CLI to the given dataset on the given remote."""
3900 p, log = self.params, self.params.log
3901 if len(properties) == 0:
3902 return
3903 # set properties in batches without creating a command line that's too big for the OS to handle
3904 cmd = p.split_args(f"{remote.sudo} {p.zfs_program} set")
3905 self.run_ssh_cmd_batched(
3906 remote,
3907 cmd,
3908 properties,
3909 lambda batch: self.run_ssh_command(
3910 remote, log_debug, is_dry=p.dry_run, print_stdout=True, cmd=cmd + batch + [dataset]
3911 ),
3912 max_batch_items=1 if self.is_solaris_zfs(remote) else 2**29, # solaris-11.4 CLI doesn't accept multiple props
3913 )
3915 def zfs_get(
3916 self,
3917 remote: Remote,
3918 dataset: str,
3919 sources: str,
3920 output_columns: str,
3921 propnames: str,
3922 splitlines: bool,
3923 props_cache: Dict[Tuple[str, str, str], Dict[str, str]],
3924 ) -> Dict[str, Optional[str]]:
3925 """Returns the results of 'zfs get' CLI on the given dataset on the given remote."""
3926 if not propnames:
3927 return {}
3928 p, log = self.params, self.params.log
3929 cache_key = (sources, output_columns, propnames)
3930 props = props_cache.get(cache_key)
3931 if props is None:
3932 cmd = p.split_args(f"{p.zfs_program} get -Hp -o {output_columns} -s {sources} {propnames}", dataset)
3933 lines = self.run_ssh_command(remote, log_trace, cmd=cmd)
3934 is_name_value_pair = "," in output_columns
3935 props = {}
3936 # if not splitlines: omit single trailing newline that was appended by 'zfs get' CLI
3937 for line in lines.splitlines() if splitlines else [lines[0:-1]]:
3938 if is_name_value_pair:
3939 propname, propvalue = line.split("\t", 1)
3940 props[propname] = propvalue
3941 else:
3942 props[line] = None
3943 props_cache[cache_key] = props
3944 return props
3946 def add_recv_property_options(
3947 self, full_send: bool, recv_opts: List[str], dataset: str, cache: Dict[Tuple[str, str, str], Dict[str, str]]
3948 ) -> Tuple[List[str], List[str]]:
3949 """Reads the ZFS properties of the given src dataset. Appends zfs recv -o and -x values to recv_opts according to CLI
3950 params, and returns properties to explicitly set on the dst dataset after 'zfs receive' completes successfully."""
3951 p = self.params
3952 set_opts = []
3953 ox_names = p.zfs_recv_ox_names.copy()
3954 for config in [p.zfs_recv_o_config, p.zfs_recv_x_config, p.zfs_set_config]:
3955 if len(config.include_regexes) == 0:
3956 continue # this is the default - it's an instant noop
3957 if (full_send and "full" in config.targets) or (not full_send and "incremental" in config.targets):
3958 # 'zfs get' uses newline as record separator and tab as separator between output columns. A ZFS user property
3959 # may contain newline and tab characters (indeed anything). Together, this means that there is no reliable
3960 # way to determine where a record ends and the next record starts when listing multiple arbitrary records in
3961 # a single 'zfs get' call. Therefore, here we use a separate 'zfs get' call for each ZFS user property.
3962 # TODO: perf: on zfs >= 2.3 use json via zfs get -j to safely merge all zfs gets into one 'zfs get' call
3963 try:
3964 props = self.zfs_get(p.src, dataset, config.sources, "property", "all", True, cache)
3965 props = self.filter_properties(props, config.include_regexes, config.exclude_regexes)
3966 user_propnames = [name for name in props.keys() if ":" in name]
3967 sys_propnames = ",".join([name for name in props.keys() if ":" not in name])
3968 props = self.zfs_get(p.src, dataset, config.sources, "property,value", sys_propnames, True, cache)
3969 for propnames in user_propnames:
3970 props.update(self.zfs_get(p.src, dataset, config.sources, "property,value", propnames, False, cache))
3971 except (subprocess.CalledProcessError, subprocess.TimeoutExpired, UnicodeDecodeError) as e:
3972 raise RetryableError("Subprocess failed") from e
3973 for propname in sorted(props.keys()):
3974 if config is p.zfs_recv_o_config:
3975 if propname not in ox_names:
3976 recv_opts.append("-o")
3977 recv_opts.append(f"{propname}={props[propname]}")
3978 ox_names.add(propname)
3979 elif config is p.zfs_recv_x_config:
3980 if propname not in ox_names:
3981 recv_opts.append("-x")
3982 recv_opts.append(propname)
3983 ox_names.add(propname)
3984 else:
3985 set_opts.append(f"{propname}={props[propname]}")
3986 return recv_opts, set_opts
3988 @staticmethod
3989 def recv_option_property_names(recv_opts: List[str]) -> Set[str]:
3990 """Extracts -o and -x property names that are already specified on the command line. This can be used to check
3991 for dupes because 'zfs receive' does not accept multiple -o or -x options with the same property name."""
3992 propnames = set()
3993 i = 0
3994 n = len(recv_opts)
3995 while i < n:
3996 stripped = recv_opts[i].strip()
3997 if stripped in {"-o", "-x"}:
3998 i += 1
3999 if i == n or recv_opts[i].strip() in {"-o", "-x"}:
4000 die(f"Missing value for {stripped} option in --zfs-receive-program-opt(s): {' '.join(recv_opts)}")
4001 propnames.add(recv_opts[i] if stripped == "-x" else recv_opts[i].split("=", 1)[0])
4002 i += 1
4003 return propnames
4005 def root_datasets_if_recursive_zfs_snapshot_is_possible(
4006 self, datasets: List[str], basis_datasets: List[str]
4007 ) -> Optional[List[str]]:
4008 """Returns the root datasets within the (filtered) `datasets` list if no incompatible pruning is detected. A dataset
4009 within `datasets` is considered a root dataset if it has no parent, i.e. it is not a descendant of any dataset in
4010 `datasets`. Returns `None` if any (unfiltered) dataset in `basis_dataset` that is a descendant of at least one of
4011 the root datasets is missing in `datasets`, indicating that --include/exclude-dataset* or the snapshot schedule
4012 have pruned a dataset in a way that is incompatible with 'zfs snapshot -r' CLI semantics, thus requiring a switch
4013 to the non-recursive 'zfs snapshot snapshot1 .. snapshot N' CLI flavor.
4014 Assumes that set(datasets).issubset(set(basis_datasets)). Also assumes that datasets and basis_datasets are both
4015 sorted (and thus the output root_datasets is sorted too), which is why this algorithm is efficient - O(N) time
4016 complexity. The impl is akin to the merge algorithm of a merge sort, adapted to our specific use case.
4017 See root_datasets_if_recursive_zfs_snapshot_is_possible_slow_but_correct() in the unit test suite for an alternative
4018 impl that's easier to grok."""
4019 datasets_set: Set[str] = set(datasets)
4020 root_datasets: List[str] = self.find_root_datasets(datasets)
4021 len_root_datasets = len(root_datasets)
4022 len_basis_datasets = len(basis_datasets)
4023 i, j = 0, 0
4024 while i < len_root_datasets and j < len_basis_datasets: # walk and "merge" both sorted lists, in sync
4025 if basis_datasets[j] < root_datasets[i]: # irrelevant subtree?
4026 j += 1 # move to the next basis_src_dataset
4027 elif is_descendant(basis_datasets[j], of_root_dataset=root_datasets[i]): # relevant subtree?
4028 if basis_datasets[j] not in datasets_set: # was dataset chopped off by schedule or --incl/exclude-dataset*?
4029 return None # detected filter pruning that is incompatible with 'zfs snapshot -r'
4030 j += 1 # move to the next basis_src_dataset
4031 else:
4032 i += 1 # move to next root dataset; no need to check root_datasets that are nomore (or not yet) reachable
4033 return root_datasets
4035 @staticmethod
4036 def find_root_datasets(sorted_datasets: List[str]) -> List[str]:
4037 """Returns the roots of the subtrees in the (sorted) input datasets. The output root dataset list is sorted, too.
4038 A dataset is a root dataset if it has no parent, i.e. it is not a descendant of any dataset in the input datasets."""
4039 root_datasets = []
4040 skip_dataset = DONT_SKIP_DATASET
4041 for dataset in sorted_datasets:
4042 if is_descendant(dataset, of_root_dataset=skip_dataset):
4043 continue
4044 skip_dataset = dataset
4045 root_datasets.append(dataset)
4046 return root_datasets
4048 def find_datasets_to_snapshot(self, sorted_datasets: List[str]) -> Dict[SnapshotLabel, List[str]]:
4049 """Given a (sorted) list of source datasets, returns a dict where the key is a snapshot name (aka SnapshotLabel, e.g.
4050 bzfs_2024-11-06_08:30:05_hourly) and the value is the (sorted) (sub)list of datasets for which a snapshot needs to
4051 be created with that name, because these datasets are due per the schedule, either because the 'creation' time of
4052 their most recent snapshot with that name pattern is now too old, or such a snapshot does not even exist.
4053 The baseline implementation uses the 'zfs list -t snapshot' CLI to find the most recent snapshots, which is simple
4054 but doesn't scale well with the number of snapshots, at least if the goal is to take snapshots every second.
4055 An alternative, much more scalable, implementation queries the standard ZFS "snapshots_changed" dataset property
4056 (requires zfs >= 2.2.0), in combination with a local cache that stores this property, as well as the creation time
4057 of the most recent snapshot, for each SnapshotLabel and each dataset."""
4059 def cache_get_snapshots_changed(dataset: str, label: SnapshotLabel = None) -> int:
4060 """Like zfs_get_snapshots_changed() but reads from local cache."""
4061 try: # perf: inode metadata reads and writes are fast - ballpark O(200k) ops/sec.
4062 return round(os.stat(self.last_modified_cache_file(dataset, label)).st_mtime)
4063 except FileNotFoundError:
4064 return 0 # harmless
4066 def create_snapshot_if_latest_is_too_old(
4067 datasets_to_snapshot: Dict[SnapshotLabel, List[str]], label: SnapshotLabel, creation_unixtime: int
4068 ):
4069 """Schedules creation of a snapshot for the given label if the label's existing latest snapshot is too old."""
4070 creation_dt = datetime.fromtimestamp(creation_unixtime, tz=config.tz)
4071 log.trace("Latest snapshot creation: %s for %s", creation_dt, label)
4072 duration_amount, duration_unit = config.suffix_durations[label.suffix]
4073 next_event_dt = round_datetime_up_to_duration_multiple(
4074 creation_dt + timedelta(microseconds=1), duration_amount, duration_unit, config.anchors
4075 )
4076 msg = ""
4077 if config.current_datetime >= next_event_dt:
4078 datasets_to_snapshot[label].append(dataset) # mark it as scheduled for snapshot creation
4079 msg = " has passed"
4080 log.info("Next scheduled snapshot time: %s for %s@%s%s", next_event_dt, dataset, label, msg)
4082 p, log = self.params, self.params.log
4083 src, config = p.src, p.create_src_snapshots_config
4084 datasets_to_snapshot = defaultdict(list)
4085 labels = []
4086 for label in config.snapshot_labels():
4087 _duration_amount, _duration_unit = config.suffix_durations[label.suffix]
4088 if _duration_amount == 0 or config.create_src_snapshots_even_if_not_due:
4089 datasets_to_snapshot[label] = sorted_datasets # take snapshot regardless of creation time of existing snaps
4090 else:
4091 labels.append(label)
4092 if len(labels) == 0:
4093 return datasets_to_snapshot # nothing more TBD
4095 # satisfy request from local cache as much as possible
4096 cached_datasets_to_snapshot = defaultdict(list)
4097 if self.is_snapshots_changed_zfs_property_available(src):
4098 sorted_datasets_todo = []
4099 for dataset in sorted_datasets:
4100 cached_snapshots_changed: int = cache_get_snapshots_changed(dataset)
4101 if cached_snapshots_changed == 0:
4102 sorted_datasets_todo.append(dataset) # request cannot be answered from cache
4103 continue
4104 if cached_snapshots_changed != self.src_properties[dataset]["snapshots_changed"]: # get that prop "for free"
4105 self.invalidate_last_modified_cache_dataset(dataset)
4106 sorted_datasets_todo.append(dataset) # request cannot be answered from cache
4107 continue
4108 creation_unixtimes = {}
4109 for label in labels:
4110 creation_unixtime = cache_get_snapshots_changed(dataset, label)
4111 if creation_unixtime == 0:
4112 sorted_datasets_todo.append(dataset) # request cannot be answered from cache
4113 break
4114 creation_unixtimes[label] = creation_unixtime
4115 if len(creation_unixtimes) == len(labels):
4116 for label in labels:
4117 create_snapshot_if_latest_is_too_old(cached_datasets_to_snapshot, label, creation_unixtimes[label])
4118 sorted_datasets = sorted_datasets_todo
4120 # fallback to 'zfs list -t snapshot' for any remaining datasets, as these couldn't be satisfied from local cache
4121 cmd = p.split_args(f"{p.zfs_program} list -t snapshot -d 1 -Hp -o createtxg,creation,name") # sort dataset,createtxg
4122 datasets_with_snapshots = set()
4123 for lines in self.zfs_list_snapshots_in_parallel(src, cmd, sorted_datasets, ordered=False):
4124 # streaming group by dataset name (consumes constant memory only)
4125 for dataset, group in groupby(lines, key=lambda line: line[line.rindex("\t") + 1 : line.index("@")]):
4126 datasets_with_snapshots.add(dataset)
4127 snapshots = sorted( # fetch all snapshots of current dataset and sort by createtxg,creation,name
4128 (int(createtxg), int(creation), name[name.index("@") + 1 :])
4129 for createtxg, creation, name in (line.split("\t", 2) for line in group)
4130 )
4131 assert len(snapshots) > 0
4132 reversed_snapshot_names = [snapshot[-1] for snapshot in reversed(snapshots)]
4133 year_with_4_digits_regex = year_with_four_digits_regex
4134 for label in labels:
4135 infix = label.infix
4136 start = label.prefix + label.infix
4137 end = label.suffix
4138 startlen = len(start)
4139 endlen = len(end)
4140 minlen = startlen + endlen if infix else 4 + startlen + endlen # year_with_four_digits_regex
4141 year_slice = slice(startlen, startlen + 4) # [startlen:startlen+4] # year_with_four_digits_regex
4142 creation_unixtime = 0
4143 for j, s in enumerate(reversed_snapshot_names): # find latest snapshot that matches this label
4144 if (
4145 s.endswith(end)
4146 and s.startswith(start)
4147 and len(s) >= minlen
4148 and (infix or year_with_4_digits_regex.fullmatch(s[year_slice])) # year_with_four_digits_regex
4149 ):
4150 creation_unixtime = snapshots[len(snapshots) - j - 1][1]
4151 break
4152 create_snapshot_if_latest_is_too_old(datasets_to_snapshot, label, creation_unixtime)
4153 datasets_without_snapshots = [dataset for dataset in sorted_datasets if dataset not in datasets_with_snapshots]
4154 for lbl in labels: # merge (sorted) results from local cache + 'zfs list -t snapshot' into (sorted) combined result
4155 datasets_to_snapshot[lbl].sort()
4156 if datasets_without_snapshots or (lbl in cached_datasets_to_snapshot): # +take snaps for snapshot-less datasets
4157 datasets_to_snapshot[lbl] = list(
4158 heapq.merge(datasets_to_snapshot[lbl], cached_datasets_to_snapshot[lbl], datasets_without_snapshots)
4159 ) # inputs to merge() are sorted, and outputs are sorted too
4160 # sort to ensure that we take snapshots for dailies before hourlies, and so on
4161 label_indexes = {label: k for k, label in enumerate(config.snapshot_labels())}
4162 datasets_to_snapshot = dict(sorted(datasets_to_snapshot.items(), key=lambda kv: label_indexes[kv[0]]))
4163 return datasets_to_snapshot
4165 def last_modified_cache_file(self, dataset: str, label: SnapshotLabel = None) -> str:
4166 p = self.params
4167 cache_file = os.path.join(dataset, "=" if label is None else f"{label.prefix}{label.infix}{label.suffix}")
4168 userhost_dir = p.src.ssh_user_host if p.src.ssh_user_host else "-"
4169 return os.path.join(p.log_params.last_modified_cache_dir, userhost_dir, cache_file)
4171 def invalidate_last_modified_cache_dataset(self, dataset: str) -> None:
4172 """Resets the last_modified timestamp of all cache files of the given dataset to zero."""
4173 cache_file = self.last_modified_cache_file(dataset)
4174 if not self.params.dry_run:
4175 try:
4176 os.utime(cache_file, times=(0, 0)) # update this before the other files
4177 for entry in os.scandir(os.path.dirname(cache_file)):
4178 if entry.path != cache_file:
4179 os.utime(entry.path, times=(0, 0))
4180 os.utime(cache_file, times=(0, 0)) # and again after the other files
4181 except FileNotFoundError:
4182 pass # harmless
4184 def update_last_modified_cache(self, datasets_to_snapshot: Dict[SnapshotLabel, List[str]]) -> None:
4185 """perf: copy lastmodified time of source dataset into local cache to reduce future 'zfs list -t snapshot' calls."""
4186 p, log = self.params, self.params.log
4187 src, dst = p.src, p.dst
4188 if not self.is_snapshots_changed_zfs_property_available(src):
4189 return
4190 src_datasets_set = set()
4191 dataset_labels = defaultdict(list)
4192 for label, datasets in datasets_to_snapshot.items():
4193 duration_amount, duration_unit = p.create_src_snapshots_config.suffix_durations[label.suffix]
4194 if duration_amount > 0: # no need to update the cache for adhoc snapshots
4195 src_datasets_set.update(datasets) # union
4196 for dataset in datasets:
4197 dataset_labels[dataset].append(label)
4199 src_datasets: List[str] = sorted(src_datasets_set)
4200 if p.create_src_snapshots_config.create_src_snapshots_even_if_not_due:
4201 snapshots_changed_items = [(src_dataset, 0) for src_dataset in src_datasets]
4202 else:
4203 snapshots_changed_items = self.zfs_get_snapshots_changed(src, src_datasets).items()
4205 for src_dataset, snapshots_changed in snapshots_changed_items:
4206 if snapshots_changed == 0:
4207 self.invalidate_last_modified_cache_dataset(src_dataset)
4208 else:
4209 cache_file = self.last_modified_cache_file(src_dataset)
4210 cache_dir = os.path.dirname(cache_file)
4211 if not p.dry_run:
4212 try:
4213 os.makedirs(cache_dir, exist_ok=True)
4214 set_last_modification_time(cache_file, unixtime_in_secs=snapshots_changed, if_more_recent=True)
4215 for label in dataset_labels[src_dataset]:
4216 set_last_modification_time(
4217 self.last_modified_cache_file(src_dataset, label),
4218 unixtime_in_secs=snapshots_changed,
4219 if_more_recent=True,
4220 )
4221 except FileNotFoundError:
4222 pass # harmless
4224 def zfs_get_snapshots_changed(self, remote: Remote, datasets: List[str]) -> Dict[str, int]:
4225 """Returns the ZFS dataset property "snapshots_changed", which is a UTC Unix time in integer seconds.
4226 See https://openzfs.github.io/openzfs-docs/man/master/7/zfsprops.7.html#snapshots_changed"""
4227 p, log = self.params, self.params.log
4228 assert self.is_snapshots_changed_zfs_property_available(remote)
4229 cmd = p.split_args(f"{p.zfs_program} list -t filesystem,volume -s name -Hp -o snapshots_changed,name")
4230 # cmd = p.split_args(f"{p.zfs_program} get -Hp -o value -s none snapshots_changed")
4231 results = {}
4232 for lines in self.itr_ssh_cmd_parallel(
4233 remote,
4234 [(cmd, datasets)],
4235 lambda _cmd, batch: self.run_ssh_command(remote, log_trace, cmd=_cmd + batch).splitlines(),
4236 ordered=False,
4237 ):
4238 for line in lines:
4239 snapshots_changed, dataset = line.split("\t", 1)
4240 if snapshots_changed == "-" or not snapshots_changed:
4241 snapshots_changed = "0"
4242 results[dataset] = int(snapshots_changed)
4243 return results
4245 @dataclass(order=True)
4246 class ComparableSnapshot:
4247 key: Tuple[str, str] # rel_dataset, guid
4248 cols: List[str] = field(compare=False)
4250 def run_compare_snapshot_lists(self, src_datasets: List[str], dst_datasets: List[str]) -> None:
4251 """Compares source and destination dataset trees recursively wrt. snapshots, for example to check if all recently
4252 taken snapshots have been successfully replicated by a periodic job. Lists snapshots only contained in source
4253 (tagged with 'src'), only contained in destination (tagged with 'dst'), and contained in both source and destination
4254 (tagged with 'all'), in the form of a TSV file, along with other snapshot metadata. Implemented with a time and
4255 space efficient streaming algorithm; easily scales to millions of datasets and any number of snapshots.
4256 Assumes that both src_datasets and dst_datasets are sorted."""
4257 p, log = self.params, self.params.log
4258 src, dst = p.src, p.dst
4259 task = src.root_dataset + " vs. " + dst.root_dataset
4260 tsv_dir = p.log_params.log_file[0 : -len(".log")] + ".cmp"
4261 os.makedirs(tsv_dir, exist_ok=True)
4262 tsv_file = os.path.join(tsv_dir, (src.root_dataset + "%" + dst.root_dataset).replace("/", "~") + ".tsv")
4263 tmp_tsv_file = tsv_file + ".tmp"
4264 compare_snapshot_lists = set(p.compare_snapshot_lists.split("+"))
4265 is_src_dst_all = all(choice in compare_snapshot_lists for choice in cmp_choices_items)
4266 all_src_dst = [loc for loc in ("all", "src", "dst") if loc in compare_snapshot_lists]
4267 is_first_row = True
4268 now = None
4270 def zfs_list_snapshot_iterator(r: Remote, sorted_datasets: List[str]) -> Generator[str, None, None]:
4271 """Lists snapshots sorted by dataset name. All snapshots of a given dataset will be adjacent."""
4272 assert not self.is_test_mode or sorted_datasets == sorted(sorted_datasets), "List is not sorted"
4273 written_zfs_prop = "written" # https://openzfs.github.io/openzfs-docs/man/master/7/zfsprops.7.html#written
4274 if self.is_solaris_zfs(r): # solaris-11.4 zfs does not know the "written" ZFS snapshot property
4275 written_zfs_prop = "type" # for simplicity, fill in the non-integer dummy constant type="snapshot"
4276 props = self.creation_prefix + f"creation,guid,createtxg,{written_zfs_prop},name"
4277 types = "snapshot"
4278 if p.use_bookmark and r.location == "src" and self.are_bookmarks_enabled(r):
4279 types = "snapshot,bookmark"
4280 cmd = p.split_args(f"{p.zfs_program} list -t {types} -d 1 -Hp -o {props}") # sorted by dataset, createtxg
4281 for lines in self.zfs_list_snapshots_in_parallel(r, cmd, sorted_datasets):
4282 yield from lines
4284 def snapshot_iterator(
4285 root_dataset: str, sorted_itr: Generator[str, None, None]
4286 ) -> Generator[Job.ComparableSnapshot, None, None]:
4287 """Splits/groups snapshot stream into distinct datasets, sorts by GUID within a dataset such that any two
4288 snapshots with the same GUID will lie adjacent to each other during the upcoming phase that merges
4289 src snapshots and dst snapshots."""
4290 # streaming group by dataset name (consumes constant memory only)
4291 for dataset, group in groupby(
4292 sorted_itr, key=lambda line: line[line.rindex("\t") + 1 : line.replace("#", "@").index("@")]
4293 ):
4294 snapshots = list(group) # fetch all snapshots of current dataset, e.g. dataset=tank1/src/foo
4295 snapshots = self.filter_snapshots(snapshots) # apply include/exclude policy
4296 snapshots.sort(key=lambda line: line.split("\t", 2)[1]) # stable sort by GUID (2nd remains createtxg)
4297 rel_dataset = relativize_dataset(dataset, root_dataset) # rel_dataset=/foo, root_dataset=tank1/src
4298 last_guid = ""
4299 for line in snapshots:
4300 cols = line.split("\t")
4301 creation, guid, createtxg, written, snapshot_name = cols
4302 if guid == last_guid:
4303 assert "#" in snapshot_name
4304 continue # ignore bookmarks whose snapshot still exists. also ignore dupes of bookmarks
4305 last_guid = guid
4306 if written == "snapshot":
4307 written = "-" # sanitize solaris-11.4 work-around (solaris-11.4 also has no bookmark feature)
4308 cols = [creation, guid, createtxg, written, snapshot_name]
4309 key = (rel_dataset, guid) # ensures src snapshots and dst snapshots with the same GUID will be adjacent
4310 yield Job.ComparableSnapshot(key, cols)
4312 def print_dataset(rel_dataset: str, entries: Iterable[Tuple[str, Job.ComparableSnapshot]]) -> None:
4313 entries = sorted( # fetch all snapshots of current dataset and sort em by creation, createtxg, snapshot_tag
4314 entries,
4315 key=lambda entry: (
4316 int(entry[1].cols[0]),
4317 int(entry[1].cols[2]),
4318 entry[1].cols[-1][entry[1].cols[-1].replace("#", "@").index("@") + 1 :],
4319 ),
4320 )
4322 @dataclass
4323 class SnapshotStats:
4324 snapshot_count: int = field(default=0)
4325 sum_written: int = field(default=0)
4326 snapshot_count_since: int = field(default=0)
4327 sum_written_since: int = field(default=0)
4328 latest_snapshot_idx: int = field(default=None)
4329 latest_snapshot_row_str: str = field(default=None)
4330 latest_snapshot_creation: str = field(default=None)
4331 oldest_snapshot_row_str: str = field(default=None)
4332 oldest_snapshot_creation: str = field(default=None)
4334 # print metadata of snapshots of current dataset to TSV file; custom stats can later be computed from there
4335 stats = defaultdict(SnapshotStats)
4336 header = "location creation_iso createtxg rel_name guid root_dataset rel_dataset name creation written"
4337 nonlocal is_first_row
4338 is_first_row = is_first_row and fd.write(header.replace(" ", "\t") + "\n") and False
4339 for i, entry in enumerate(entries):
4340 loc = location = entry[0]
4341 creation, guid, createtxg, written, name = entry[1].cols
4342 root_dataset = dst.root_dataset if location == cmp_choices_items[1] else src.root_dataset
4343 rel_name = relativize_dataset(name, root_dataset)
4344 creation_iso = isotime_from_unixtime(int(creation))
4345 row = loc, creation_iso, createtxg, rel_name, guid, root_dataset, rel_dataset, name, creation, written
4346 # Example: src 2024-11-06_08:30:05 17435050 /foo@test_2024-11-06_08:30:05_daily 2406491805272097867 tank1/src /foo tank1/src/foo@test_2024-10-06_08:30:04_daily 1730878205 24576
4347 row_str = "\t".join(row)
4348 if not p.dry_run:
4349 fd.write(row_str + "\n")
4350 s = stats[location]
4351 s.snapshot_count += 1
4352 s.sum_written += int(written) if written != "-" else 0
4353 s.latest_snapshot_idx = i
4354 s.latest_snapshot_row_str = row_str
4355 s.latest_snapshot_creation = creation
4356 if not s.oldest_snapshot_row_str:
4357 s.oldest_snapshot_row_str = row_str
4358 s.oldest_snapshot_creation = creation
4360 # for convenience, directly log basic summary stats of current dataset
4361 k = stats["all"].latest_snapshot_idx # defaults to None
4362 k = k if k is not None else -1
4363 for entry in entries[k + 1 :]: # aggregate basic stats since latest common snapshot
4364 location = entry[0]
4365 creation, guid, createtxg, written, name = entry[1].cols
4366 s = stats[location]
4367 s.snapshot_count_since += 1
4368 s.sum_written_since += int(written) if written != "-" else 0
4369 prefix = f"Comparing {rel_dataset}~"
4370 msgs = []
4371 msgs.append(f"{prefix} of {task}")
4372 msgs.append(
4373 f"{prefix} Q: No src snapshots are missing on dst, and no dst snapshots are missing on src, "
4374 "and there is a common snapshot? A: "
4375 + (
4376 "n/a"
4377 if not is_src_dst_all
4378 else str(
4379 stats["src"].snapshot_count == 0
4380 and stats["dst"].snapshot_count == 0
4381 and stats["all"].snapshot_count > 0
4382 )
4383 )
4384 )
4385 nonlocal now
4386 now = now or round(time.time()) # uses the same timestamp across the entire dataset tree
4387 latcom = "latest common snapshot"
4388 for loc in all_src_dst:
4389 s = stats[loc]
4390 msgs.append(f"{prefix} Latest snapshot only in {loc}: {s.latest_snapshot_row_str or 'n/a'}")
4391 msgs.append(f"{prefix} Oldest snapshot only in {loc}: {s.oldest_snapshot_row_str or 'n/a'}")
4392 msgs.append(f"{prefix} Snapshots only in {loc}: {s.snapshot_count}")
4393 msgs.append(f"{prefix} Snapshot data written only in {loc}: {human_readable_bytes(s.sum_written)}")
4394 if loc != "all":
4395 na = None if k >= 0 else "n/a"
4396 msgs.append(f"{prefix} Snapshots only in {loc} since {latcom}: {na or s.snapshot_count_since}")
4397 msgs.append(
4398 f"{prefix} Snapshot data written only in {loc} since {latcom}: "
4399 f"{na or human_readable_bytes(s.sum_written_since)}"
4400 )
4401 all_creation = stats["all"].latest_snapshot_creation
4402 latest = ("latest", s.latest_snapshot_creation)
4403 oldest = ("oldest", s.oldest_snapshot_creation)
4404 for label, s_creation in latest, oldest:
4405 if loc != "all":
4406 hd = "n/a"
4407 if s_creation and k >= 0:
4408 hd = human_readable_duration(int(all_creation) - int(s_creation), unit="s")
4409 msgs.append(f"{prefix} Time diff between {latcom} and {label} snapshot only in {loc}: {hd}")
4410 for label, s_creation in latest, oldest:
4411 hd = "n/a" if not s_creation else human_readable_duration(now - int(s_creation), unit="s")
4412 msgs.append(f"{prefix} Time diff between now and {label} snapshot only in {loc}: {hd}")
4413 log.info("%s", "\n".join(msgs))
4415 # setup streaming pipeline
4416 src_snap_itr = snapshot_iterator(src.root_dataset, zfs_list_snapshot_iterator(src, src_datasets))
4417 dst_snap_itr = snapshot_iterator(dst.root_dataset, zfs_list_snapshot_iterator(dst, dst_datasets))
4418 merge_itr = self.merge_sorted_iterators(cmp_choices_items, p.compare_snapshot_lists, src_snap_itr, dst_snap_itr)
4420 rel_datasets: Dict[str, Set[str]] = defaultdict(set)
4421 for datasets, remote in (src_datasets, src), (dst_datasets, dst):
4422 for dataset in datasets: # rel_dataset=/foo, root_dataset=tank1/src
4423 rel_datasets[remote.location].add(relativize_dataset(dataset, remote.root_dataset))
4424 rel_src_or_dst: List[str] = sorted(rel_datasets["src"].union(rel_datasets["dst"]))
4426 log.debug("%s", f"Temporary TSV output file comparing {task} is: {tmp_tsv_file}")
4427 with open(tmp_tsv_file, "w", encoding="utf-8") as fd:
4428 # streaming group by rel_dataset (consumes constant memory only); entry is a Tuple[str, ComparableSnapshot]
4429 group = groupby(merge_itr, key=lambda entry: entry[1].key[0])
4430 self.print_datasets(group, lambda rel_ds, entries: print_dataset(rel_ds, entries), rel_src_or_dst)
4431 os.rename(tmp_tsv_file, tsv_file)
4432 log.info("%s", f"Final TSV output file comparing {task} is: {tsv_file}")
4434 tsv_file = tsv_file[0 : tsv_file.rindex(".")] + ".rel_datasets_tsv"
4435 tmp_tsv_file = tsv_file + ".tmp"
4436 with open(tmp_tsv_file, "w", encoding="utf-8") as fd:
4437 header = "location rel_dataset src_dataset dst_dataset"
4438 fd.write(header.replace(" ", "\t") + "\n")
4439 src_only: Set[str] = rel_datasets["src"].difference(rel_datasets["dst"])
4440 dst_only: Set[str] = rel_datasets["dst"].difference(rel_datasets["src"])
4441 for rel_dataset in rel_src_or_dst:
4442 loc = "src" if rel_dataset in src_only else "dst" if rel_dataset in dst_only else "all"
4443 src_dataset = src.root_dataset + rel_dataset if rel_dataset not in dst_only else ""
4444 dst_dataset = dst.root_dataset + rel_dataset if rel_dataset not in src_only else ""
4445 row = loc, rel_dataset, src_dataset, dst_dataset # Example: all /foo/bar tank1/src/foo/bar tank2/dst/foo/bar
4446 if not p.dry_run:
4447 fd.write("\t".join(row) + "\n")
4448 os.rename(tmp_tsv_file, tsv_file)
4450 @staticmethod
4451 def print_datasets(group: groupby, fn: Callable[[str, Iterable], None], rel_datasets: Iterable[str]) -> None:
4452 rel_datasets = sorted(rel_datasets)
4453 n = len(rel_datasets)
4454 i = 0
4455 for rel_dataset, entries in group:
4456 while i < n and rel_datasets[i] < rel_dataset:
4457 fn(rel_datasets[i], []) # Also print summary stats for datasets whose snapshot stream is empty
4458 i += 1
4459 assert i >= n or rel_datasets[i] == rel_dataset
4460 i += 1
4461 fn(rel_dataset, entries)
4462 while i < n:
4463 fn(rel_datasets[i], []) # Also print summary stats for datasets whose snapshot stream is empty
4464 i += 1
4466 def merge_sorted_iterators(
4467 self,
4468 choices: List[str], # ["src", "dst", "all"]
4469 choice: str, # Example: "src+dst+all"
4470 src_itr: Generator[ComparableSnapshot, None, None],
4471 dst_itr: Generator[ComparableSnapshot, None, None],
4472 ) -> Generator[Tuple[str, ComparableSnapshot], None, None]:
4473 """This is the typical merge algorithm of a merge sort, slightly adapted to our specific use case."""
4474 assert len(choices) == 3
4475 assert choice
4476 flags = 0
4477 for i, item in enumerate(choices):
4478 if item in choice:
4479 flags |= 1 << i
4480 src_next, dst_next = self.run_in_parallel(lambda: next(src_itr, None), lambda: next(dst_itr, None))
4481 while not (src_next is None and dst_next is None):
4482 if src_next == dst_next:
4483 n = 2
4484 if (flags & (1 << n)) != 0:
4485 yield choices[n], src_next, dst_next
4486 src_next = next(src_itr, None)
4487 dst_next = next(dst_itr, None)
4488 elif src_next is None or (dst_next is not None and dst_next < src_next):
4489 n = 1
4490 if (flags & (1 << n)) != 0:
4491 yield choices[n], dst_next
4492 dst_next = next(dst_itr, None)
4493 else:
4494 n = 0
4495 if (flags & (1 << n)) != 0:
4496 if isinstance(src_next, Job.ComparableSnapshot):
4497 name = src_next.cols[-1]
4498 if "@" in name:
4499 yield choices[n], src_next # include snapshot
4500 else: # ignore src bookmarks for which no snapshot exists in dst; those aren't useful
4501 assert "#" in name
4502 else:
4503 yield choices[n], src_next
4504 src_next = next(src_itr, None)
4506 @staticmethod
4507 def build_dataset_tree(sorted_datasets: List[str]) -> Tree:
4508 """Takes as input a sorted list of datasets and returns a sorted directory tree containing the same dataset names,
4509 in the form of nested dicts."""
4510 tree: Tree = {}
4511 for dataset in sorted_datasets:
4512 current = tree
4513 components = dataset.split("/")
4514 n = len(components) - 1
4515 for i, component in enumerate(components):
4516 child = current.get(component, None)
4517 if child is None:
4518 child = {} if i < n else None # perf: use None to indicate empty leaf dictionary
4519 current[component] = child
4520 current = child
4521 return tree
4523 def process_datasets_in_parallel_and_fault_tolerant(
4524 self,
4525 datasets: List[str],
4526 process_dataset: Callable[[str, str, Retry], bool], # lambda, must be thread-safe
4527 skip_tree_on_error: Callable[[str], bool], # lambda, must be thread-safe
4528 task_name: str,
4529 ) -> bool:
4530 """Runs process_dataset(dataset) for each dataset in datasets, while taking care of error handling and retries
4531 and parallel execution. Assumes that the input dataset list is sorted. All children of a dataset may be
4532 processed in parallel. For consistency (even during parallel dataset replication/deletion), processing of a
4533 dataset only starts after processing of all its ancestor datasets has completed. Further, when a thread is
4534 ready to start processing another dataset, it chooses the "smallest" dataset wrt. case-sensitive sort order
4535 from the datasets that are currently available for start of processing. Initially, only the roots of the
4536 selected dataset subtrees are available for start of processing."""
4537 p, log = self.params, self.params.log
4539 def _process_dataset(dataset: str, tid: str):
4540 start_time_nanos = time.time_ns()
4541 try:
4542 return self.run_with_retries(p.retry_policy, process_dataset, dataset, tid)
4543 finally:
4544 elapsed_nanos = time.time_ns() - start_time_nanos
4545 log.debug(p.dry(f"{tid} {task_name} done: %s took %s"), dataset, human_readable_duration(elapsed_nanos))
4547 def build_dataset_tree_and_find_roots() -> List[Tuple[str, Tree]]:
4548 """For consistency, processing of a dataset only starts after processing of its ancestors has completed."""
4549 tree: Tree = self.build_dataset_tree(datasets) # tree consists of nested dictionaries
4550 skip_dataset = DONT_SKIP_DATASET
4551 roots = []
4552 for dataset in datasets:
4553 if is_descendant(dataset, of_root_dataset=skip_dataset):
4554 continue
4555 skip_dataset = dataset
4556 children = tree
4557 for component in dataset.split("/"):
4558 children = children[component]
4559 roots.append((dataset, children))
4560 return roots
4562 priority_queue: List[Tuple[str, Tree]] = build_dataset_tree_and_find_roots()
4563 heapq.heapify(priority_queue) # same order as sorted()
4564 log.trace("Retry policy: %s", p.retry_policy)
4565 max_workers = min(self.max_workers[p.src.location], self.max_workers[p.dst.location])
4566 with ThreadPoolExecutor(max_workers=max_workers) as executor:
4567 todo_futures: Set[Future] = set()
4568 submitted = 0
4570 def submit_datasets() -> bool:
4571 while len(priority_queue) > 0 and len(todo_futures) < max_workers:
4572 # pick "smallest" dataset (wrt. sort order) available for start of processing; submit to thread pool
4573 node: Tuple[str, Tree] = heapq.heappop(priority_queue)
4574 dataset, children = node
4575 nonlocal submitted
4576 submitted += 1
4577 tid = f"{submitted}/{len(datasets)}"
4578 future = executor.submit(_process_dataset, dataset, tid)
4579 future.node = node
4580 todo_futures.add(future)
4581 return len(todo_futures) > 0
4583 failed = False
4584 while submit_datasets():
4585 done_futures, todo_futures = concurrent.futures.wait(todo_futures, return_when=FIRST_COMPLETED) # blocks
4586 for done_future in done_futures:
4587 dataset, children = done_future.node
4588 try:
4589 no_skip: bool = done_future.result() # does not block as processing has already completed
4590 except (CalledProcessError, subprocess.TimeoutExpired, SystemExit, UnicodeDecodeError) as e:
4591 failed = True
4592 if p.skip_on_error == "fail":
4593 [todo_future.cancel() for todo_future in todo_futures]
4594 terminate_process_group(except_current_process=True)
4595 raise e
4596 no_skip = not (p.skip_on_error == "tree" or skip_tree_on_error(dataset))
4597 log.error("%s", str(e))
4598 self.append_exception(e, task_name, dataset)
4599 if no_skip and children: # make child datasets available for start of processing ...
4600 for child, grandchildren in children.items(): # as processing of parent has now completed
4601 child = f"{dataset}/{child}"
4602 heapq.heappush(priority_queue, (child, grandchildren))
4603 assert len(priority_queue) == 0
4604 return failed
4606 def is_program_available(self, program: str, location: str) -> bool:
4607 return program in self.params.available_programs.get(location, {})
4609 def detect_available_programs(self) -> None:
4610 p = params = self.params
4611 log = p.log
4612 available_programs = params.available_programs
4613 if "local" not in available_programs:
4614 cmd = [p.shell_program_local, "-c", self.find_available_programs()]
4615 available_programs["local"] = dict.fromkeys(
4616 subprocess.run(cmd, stdin=DEVNULL, stdout=PIPE, stderr=sys.stderr, text=True).stdout.splitlines()
4617 )
4618 cmd = [p.shell_program_local, "-c", "exit"]
4619 if subprocess.run(cmd, stdin=DEVNULL, stdout=PIPE, stderr=sys.stderr, text=True).returncode != 0: 4619 ↛ 4620line 4619 didn't jump to line 4620 because the condition on line 4619 was never true
4620 self.disable_program("sh", ["local"])
4622 for r in [p.dst, p.src]:
4623 loc = r.location
4624 remote_conf_cache_key = r.cache_key()
4625 cache_item: Optional[RemoteConfCacheItem] = self.remote_conf_cache.get(remote_conf_cache_key)
4626 if cache_item is not None:
4627 # startup perf: cache avoids ssh connect setup and feature detection roundtrips on revisits to same site
4628 p.connection_pools[loc], available_programs[loc], p.zpool_features[loc] = cache_item
4629 continue
4630 p.connection_pools[loc] = ConnectionPools(
4631 r, {SHARED: r.max_concurrent_ssh_sessions_per_tcp_connection, DEDICATED: 1}
4632 )
4633 self.detect_zpool_features(r)
4634 self.detect_available_programs_remote(r, available_programs, r.ssh_user_host)
4635 self.remote_conf_cache[remote_conf_cache_key] = RemoteConfCacheItem(
4636 p.connection_pools[loc], available_programs[loc], p.zpool_features[loc]
4637 )
4638 if r.use_zfs_delegation and p.zpool_features[loc].get("delegation") == "off":
4639 die(
4640 f"Permission denied as ZFS delegation is disabled for {r.location} "
4641 f"dataset: {r.basis_root_dataset}. Manually enable it via 'sudo zpool set delegation=on {r.pool}'"
4642 )
4644 locations = ["src", "dst", "local"]
4645 if params.compression_program == disable_prg:
4646 self.disable_program("zstd", locations)
4647 if params.mbuffer_program == disable_prg:
4648 self.disable_program("mbuffer", locations)
4649 if params.ps_program == disable_prg:
4650 self.disable_program("ps", locations)
4651 if params.pv_program == disable_prg:
4652 self.disable_program("pv", locations)
4653 if params.shell_program == disable_prg:
4654 self.disable_program("sh", locations)
4655 if params.sudo_program == disable_prg:
4656 self.disable_program("sudo", locations)
4657 if params.zpool_program == disable_prg:
4658 self.disable_program("zpool", locations)
4660 for key, programs in available_programs.items():
4661 for program in list(programs.keys()):
4662 if program.startswith("uname-"):
4663 # uname-Linux foo 5.15.0-69-generic #76-Ubuntu SMP Fri Mar 17 17:19:29 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
4664 # uname-FreeBSD freebsd 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64
4665 # uname-SunOS solaris 5.11 11.4.42.111.0 i86pc i386 i86pc # https://blogs.oracle.com/solaris/post/building-open-source-software-on-oracle-solaris-114-cbe-release
4666 # uname-SunOS solaris 5.11 11.4.0.15.0 i86pc i386 i86pc
4667 # uname-Darwin foo 23.6.0 Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:04 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6020 arm64
4668 programs.pop(program)
4669 uname = program[len("uname-") :]
4670 programs["uname"] = uname
4671 log.trace(f"available_programs[{key}][uname]: %s", uname)
4672 programs["os"] = uname.split(" ")[0] # Linux|FreeBSD|SunOS|Darwin
4673 log.trace(f"available_programs[{key}][os]: %s", programs["os"])
4674 elif program.startswith("default_shell-"):
4675 programs.pop(program)
4676 default_shell = program[len("default_shell-") :]
4677 programs["default_shell"] = default_shell
4678 log.trace(f"available_programs[{key}][default_shell]: %s", default_shell)
4679 validate_default_shell(default_shell, r)
4680 elif program.startswith("getconf_cpu_count-"):
4681 programs.pop(program)
4682 getconf_cpu_count = program[len("getconf_cpu_count-") :]
4683 programs["getconf_cpu_count"] = getconf_cpu_count
4684 log.trace(f"available_programs[{key}][getconf_cpu_count]: %s", getconf_cpu_count)
4686 for key, programs in available_programs.items():
4687 log.debug(f"available_programs[{key}]: %s", list_formatter(programs, separator=", "))
4689 for r in [p.dst, p.src]:
4690 if r.sudo and not self.is_program_available("sudo", r.location):
4691 die(f"{p.sudo_program} CLI is not available on {r.location} host: {r.ssh_user_host or 'localhost'}")
4693 def disable_program(self, program: str, locations: List[str]) -> None:
4694 for location in locations:
4695 self.params.available_programs[location].pop(program, None)
4697 def find_available_programs(self) -> str:
4698 p = self.params
4699 cmds = []
4700 cmds.append(f"command -v echo > /dev/null && echo echo")
4701 cmds.append(f"command -v echo > /dev/null && echo default_shell-$SHELL")
4702 cmds.append(f"command -v {p.zpool_program} > /dev/null && echo zpool")
4703 cmds.append(f"command -v {p.ssh_program} > /dev/null && echo ssh")
4704 cmds.append(f"command -v {p.shell_program} > /dev/null && echo sh")
4705 cmds.append(f"command -v {p.sudo_program} > /dev/null && echo sudo")
4706 cmds.append(f"command -v {p.compression_program} > /dev/null && echo zstd")
4707 cmds.append(f"command -v {p.mbuffer_program} > /dev/null && echo mbuffer")
4708 cmds.append(f"command -v {p.pv_program} > /dev/null && echo pv")
4709 cmds.append(f"command -v {p.ps_program} > /dev/null && echo ps")
4710 # print num CPUs on Solaris:
4711 cmds.append(f"command -v {p.psrinfo_program} > /dev/null && printf getconf_cpu_count- && {p.psrinfo_program} -p")
4712 cmds.append( # print num CPUs on POSIX except Solaris
4713 f"! command -v {p.psrinfo_program} && command -v {p.getconf_program} > /dev/null && printf getconf_cpu_count- && {p.getconf_program} _NPROCESSORS_ONLN"
4714 )
4715 cmds.append(f"command -v {p.uname_program} > /dev/null && printf uname- && {p.uname_program} -a || true")
4716 return "; ".join(cmds)
4718 def detect_available_programs_remote(self, remote: Remote, available_programs: Dict, ssh_user_host: str) -> None:
4719 p, log = self.params, self.params.log
4720 location = remote.location
4721 available_programs_minimum = {"zpool": None, "sudo": None}
4722 available_programs[location] = {}
4723 lines = None
4724 try:
4725 # on Linux, 'zfs --version' returns with zero status and prints the correct info
4726 # on FreeBSD, 'zfs --version' always prints the same (correct) info as Linux, but nonetheless sometimes
4727 # returns with non-zero status (sometimes = if the zfs kernel module is not loaded)
4728 # on Solaris, 'zfs --version' returns with non-zero status without printing useful info as the --version
4729 # option is not known there
4730 lines = self.run_ssh_command(remote, log_trace, print_stderr=False, cmd=[p.zfs_program, "--version"])
4731 assert lines
4732 except (FileNotFoundError, PermissionError): # location is local and program file was not found
4733 die(f"{p.zfs_program} CLI is not available on {location} host: {ssh_user_host or 'localhost'}")
4734 except subprocess.CalledProcessError as e:
4735 if "unrecognized command '--version'" in e.stderr and "run: zfs help" in e.stderr:
4736 available_programs[location]["zfs"] = "notOpenZFS" # solaris-11.4 zfs does not know --version flag
4737 elif not e.stdout.startswith("zfs"): 4737 ↛ 4740line 4737 didn't jump to line 4740 because the condition on line 4737 was always true
4738 die(f"{p.zfs_program} CLI is not available on {location} host: {ssh_user_host or 'localhost'}")
4739 else:
4740 lines = e.stdout # FreeBSD if the zfs kernel module is not loaded
4741 assert lines
4742 if lines:
4743 line = lines.splitlines()[0]
4744 assert line.startswith("zfs")
4745 # Example: zfs-2.1.5~rc5-ubuntu3 -> 2.1.5, zfswin-2.2.3rc5 -> 2.2.3
4746 version = line.split("-")[1].strip()
4747 match = re.fullmatch(r"(\d+\.\d+\.\d+).*", version)
4748 assert match, "Unparsable zfs version string: " + version
4749 version = match.group(1)
4750 available_programs[location]["zfs"] = version
4751 if is_version_at_least(version, "2.1.0"):
4752 available_programs[location][zfs_version_is_at_least_2_1_0] = True
4753 if is_version_at_least(version, "2.2.0"):
4754 available_programs[location][zfs_version_is_at_least_2_2_0] = True
4755 log.trace(f"available_programs[{location}][zfs]: %s", available_programs[location]["zfs"])
4757 if p.shell_program != disable_prg:
4758 try:
4759 cmd = [p.shell_program, "-c", self.find_available_programs()]
4760 available_programs[location].update(
4761 dict.fromkeys(self.run_ssh_command(remote, log_trace, cmd=cmd).splitlines())
4762 )
4763 return
4764 except (FileNotFoundError, PermissionError) as e: # location is local and shell program file was not found
4765 if e.filename != p.shell_program: 4765 ↛ 4766line 4765 didn't jump to line 4766 because the condition on line 4765 was never true
4766 raise
4767 except subprocess.CalledProcessError:
4768 pass
4769 log.warning("%s", f"Failed to find {p.shell_program} on {location}. Continuing with minimal assumptions...")
4770 available_programs[location].update(available_programs_minimum)
4772 def is_solaris_zfs(self, remote: Remote) -> bool:
4773 return self.is_solaris_zfs_location(remote.location)
4775 def is_solaris_zfs_location(self, location: str) -> bool:
4776 if location == "local":
4777 return platform.system() == "SunOS"
4778 return self.params.available_programs[location].get("zfs") == "notOpenZFS"
4780 @staticmethod
4781 def is_dummy(r: Remote) -> bool:
4782 return r.root_dataset == dummy_dataset
4784 def detect_zpool_features(self, remote: Remote) -> None:
4785 p = params = self.params
4786 r, loc, log = remote, remote.location, p.log
4787 lines = []
4788 features = {}
4789 if self.is_dummy(r):
4790 params.zpool_features[loc] = {}
4791 return
4792 if params.zpool_program != disable_prg:
4793 cmd = params.split_args(f"{params.zpool_program} get -Hp -o property,value all", r.pool)
4794 try:
4795 lines = self.run_ssh_command(remote, log_trace, check=False, cmd=cmd).splitlines()
4796 except (FileNotFoundError, PermissionError) as e:
4797 if e.filename != params.zpool_program: 4797 ↛ 4798line 4797 didn't jump to line 4798 because the condition on line 4797 was never true
4798 raise
4799 log.warning(
4800 "%s", f"Failed to detect zpool features on {loc}: {r.pool}. Continuing with minimal assumptions ..."
4801 )
4802 else:
4803 props = {line.split("\t", 1)[0]: line.split("\t", 1)[1] for line in lines}
4804 features = {k: v for k, v in props.items() if k.startswith("feature@") or k == "delegation"}
4805 if len(lines) == 0:
4806 cmd = p.split_args(f"{p.zfs_program} list -t filesystem -Hp -o name -s name", r.pool)
4807 if self.try_ssh_command(remote, log_trace, cmd=cmd) is None:
4808 die(f"Pool does not exist for {loc} dataset: {r.basis_root_dataset}. Manually create the pool first!")
4809 params.zpool_features[loc] = features
4811 def is_zpool_feature_enabled_or_active(self, remote: Remote, feature: str) -> bool:
4812 return self.params.zpool_features[remote.location].get(feature) in ("active", "enabled")
4814 def are_bookmarks_enabled(self, remote: Remote) -> bool:
4815 return self.is_zpool_feature_enabled_or_active(
4816 remote, "feature@bookmark_v2"
4817 ) and self.is_zpool_feature_enabled_or_active(remote, "feature@bookmark_written")
4819 def is_snapshots_changed_zfs_property_available(self, remote: Remote) -> bool:
4820 return (
4821 self.params.create_src_snapshots_config.enable_snapshots_changed_cache
4822 and self.is_program_available(zfs_version_is_at_least_2_2_0, remote.location)
4823 and self.is_zpool_feature_enabled_or_active(remote, "feature@extensible_dataset")
4824 )
4826 def check_zfs_dataset_busy(self, remote: Remote, dataset: str, busy_if_send: bool = True) -> bool:
4827 """Decline to start a state changing ZFS operation that may conflict with other currently running processes.
4828 Instead, retry the operation later and only execute it when it's become safe. For example, decline to start
4829 a 'zfs receive' into a destination dataset if another process is already running another 'zfs receive' into
4830 the same destination dataset. However, it's actually safe to run an incremental 'zfs receive' into a dataset
4831 in parallel with a 'zfs send' out of the very same dataset. This also helps daisy chain use cases where
4832 A replicates to B, and B replicates to C."""
4833 p, log = self.params, self.params.log
4834 if not self.is_program_available("ps", remote.location):
4835 return True
4836 cmd = p.split_args(f"{p.ps_program} -Ao args")
4837 procs = (self.try_ssh_command(remote, log_trace, cmd=cmd) or "").splitlines()
4838 if self.inject_params.get("is_zfs_dataset_busy", False):
4839 procs += ["sudo -n zfs receive -u -o foo:bar=/baz " + dataset] # for unit testing only
4840 if not self.is_zfs_dataset_busy(procs, dataset, busy_if_send=busy_if_send):
4841 return True
4842 op = "zfs {receive" + ("|send" if busy_if_send else "") + "} operation"
4843 try:
4844 die(f"Cannot continue now: Destination is already busy with {op} from another process: {dataset}")
4845 except SystemExit as e:
4846 log.warning("%s", str(e))
4847 raise RetryableError("dst currently busy with zfs mutation op") from e
4849 zfs_dataset_busy_prefix = r"(([^ ]*?/)?(sudo|doas)( +-n)? +)?([^ ]*?/)?zfs (receive|recv"
4850 zfs_dataset_busy_if_mods = re.compile((zfs_dataset_busy_prefix + ") .*").replace("(", "(?:"))
4851 zfs_dataset_busy_if_send = re.compile((zfs_dataset_busy_prefix + "|send) .*").replace("(", "(?:"))
4853 def is_zfs_dataset_busy(self, procs: List[str], dataset: str, busy_if_send: bool) -> bool:
4854 regex = Job.zfs_dataset_busy_if_send if busy_if_send else Job.zfs_dataset_busy_if_mods
4855 suffix = " " + dataset
4856 infix = " " + dataset + "@"
4857 return any(filter(lambda proc: (proc.endswith(suffix) or infix in proc) and regex.fullmatch(proc), procs))
4859 def run_ssh_cmd_batched(
4860 self, r: Remote, cmd: List[str], cmd_args: List[str], fn: Callable[[List[str]], Any], max_batch_items=2**29, sep=" "
4861 ) -> None:
4862 drain(self.itr_ssh_cmd_batched(r, cmd, cmd_args, fn, max_batch_items=max_batch_items, sep=sep))
4864 def itr_ssh_cmd_batched(
4865 self, r: Remote, cmd: List[str], cmd_args: List[str], fn: Callable[[List[str]], Any], max_batch_items=2**29, sep=" "
4866 ) -> Generator[Any, None, None]:
4867 """Runs fn(cmd_args) in batches w/ cmd, without creating a command line that's too big for the OS to handle."""
4868 max_bytes = min(self.get_max_command_line_bytes("local"), self.get_max_command_line_bytes(r.location))
4869 # Max size of a single argument is 128KB on Linux - https://lists.gnu.org/archive/html/bug-bash/2020-09/msg00095.html
4870 max_bytes = max_bytes if sep == " " else min(max_bytes, 131071) # e.g. 'zfs destroy foo@s1,s2,...,sN'
4871 fsenc = sys.getfilesystemencoding()
4872 conn_pool: ConnectionPool = self.params.connection_pools[r.location].pool(SHARED)
4873 conn: Connection = conn_pool.get_connection()
4874 cmd = conn.ssh_cmd + cmd
4875 conn_pool.return_connection(conn)
4876 header_bytes: int = len(" ".join(cmd).encode(fsenc))
4877 total_bytes: int = header_bytes
4878 batch: List[str] = []
4880 def flush() -> Any:
4881 if len(batch) > 0:
4882 return fn(batch)
4884 for cmd_arg in cmd_args:
4885 curr_bytes = len(f"{sep}{cmd_arg}".encode(fsenc))
4886 if total_bytes + curr_bytes > max_bytes or len(batch) >= max_batch_items:
4887 results = flush()
4888 if results is not None:
4889 yield results
4890 batch, total_bytes = [], header_bytes
4891 batch.append(cmd_arg)
4892 total_bytes += curr_bytes
4893 results = flush()
4894 if results is not None:
4895 yield results
4897 def run_ssh_cmd_parallel(
4898 self,
4899 r: Remote,
4900 cmd_args_list: List[Tuple[List[str], List[str]]],
4901 fn: Callable[[List[str], List[str]], Any],
4902 max_batch_items=2**29,
4903 ) -> None:
4904 drain(self.itr_ssh_cmd_parallel(r, cmd_args_list, fn=fn, max_batch_items=max_batch_items, ordered=False))
4906 def itr_ssh_cmd_parallel(
4907 self,
4908 r: Remote,
4909 cmd_args_list: List[Tuple[List[str], List[str]]],
4910 fn: Callable[[List[str], List[str]], Any],
4911 max_batch_items=2**29,
4912 ordered=True,
4913 ) -> Generator:
4914 """Returns output datasets in the same order as the input datasets (not in random order) if ordered == True."""
4915 max_workers = self.max_workers[r.location]
4916 with ThreadPoolExecutor(max_workers=max_workers) as executor:
4917 iterators = [
4918 self.itr_ssh_cmd_batched(
4919 r, cmd, cmd_args, lambda batch: executor.submit(fn, cmd, batch), max_batch_items=max_batch_items
4920 )
4921 for cmd, cmd_args in cmd_args_list
4922 ]
4923 iterator = itertools.chain(*iterators)
4924 # Materialize the next N futures into a buffer, causing submission + parallel execution of their CLI calls
4925 fifo_buffer: deque[Future] = deque(itertools.islice(iterator, max_workers))
4927 if ordered:
4928 while fifo_buffer: # submit the next CLI call whenever the current CLI call returns
4929 curr_future: Future = fifo_buffer.popleft()
4930 next_future: Future = next(iterator, None) # causes the next CLI call to be submitted
4931 if next_future is not None:
4932 fifo_buffer.append(next_future)
4933 yield curr_future.result() # blocks until CLI returns
4934 else:
4935 todo_futures: Set[Future] = set(fifo_buffer)
4936 while todo_futures:
4937 done_futures, todo_futures = concurrent.futures.wait(todo_futures, return_when=FIRST_COMPLETED) # blocks
4938 for done_future in done_futures: # submit the next CLI call whenever a CLI call returns
4939 next_future: Future = next(iterator, None) # causes the next CLI call to be submitted
4940 if next_future is not None:
4941 todo_futures.add(next_future)
4942 yield done_future.result() # does not block as processing has already completed
4943 assert next(iterator, None) is None
4945 def zfs_list_snapshots_in_parallel(self, r: Remote, cmd: List[str], datasets: List[str], ordered=True) -> Generator:
4946 """Runs 'zfs list -t snapshot' on multiple datasets at the same time."""
4947 max_workers = self.max_workers[r.location]
4948 return self.itr_ssh_cmd_parallel(
4949 r,
4950 [(cmd, datasets)],
4951 fn=lambda cmd, batch: (self.try_ssh_command(r, log_trace, cmd=cmd + batch) or "").splitlines(),
4952 max_batch_items=min(
4953 self.max_datasets_per_minibatch_on_list_snaps[r.location],
4954 max(
4955 len(datasets) // (max_workers if r.ssh_user_host else max_workers * 8),
4956 max_workers if r.ssh_user_host else 1,
4957 ),
4958 ),
4959 ordered=ordered,
4960 )
4962 @staticmethod
4963 def run_in_parallel(fn1: Callable[[], Any], fn2: Callable[[], Any]) -> Tuple[Any, Any]:
4964 """perf: Runs both I/O functions in parallel/concurrently."""
4965 with ThreadPoolExecutor(max_workers=1) as executor:
4966 future: Future = executor.submit(fn2) # async fn2
4967 result1 = fn1() # blocks until fn1 call returns
4968 result2 = future.result() # blocks until fn2 call returns
4969 return result1, result2
4971 def get_max_command_line_bytes(self, location: str, os_name: Optional[str] = None) -> int:
4972 """Remote flavor of os.sysconf("SC_ARG_MAX") - size(os.environb) - safety margin"""
4973 os_name = os_name if os_name else self.params.available_programs[location].get("os")
4974 if os_name == "Linux":
4975 arg_max = 2 * 1024 * 1024
4976 elif os_name == "FreeBSD":
4977 arg_max = 256 * 1024
4978 elif os_name == "SunOS":
4979 arg_max = 1 * 1024 * 1024
4980 elif os_name == "Darwin":
4981 arg_max = 1 * 1024 * 1024
4982 elif os_name == "Windows":
4983 arg_max = 32 * 1024
4984 else:
4985 arg_max = 256 * 1024 # unknown
4987 environ_size = 4 * 1024 # typically is 1-4 KB
4988 safety_margin = (8 * 2 * 4 + 4) * 1024 if arg_max >= 1 * 1024 * 1024 else 8 * 1024
4989 max_bytes = max(4 * 1024, arg_max - environ_size - safety_margin)
4990 if self.max_command_line_bytes is not None:
4991 return self.max_command_line_bytes # for testing only
4992 else:
4993 return max_bytes
4996#############################################################################
4997@dataclass(order=True, repr=False)
4998class Connection:
4999 """Represents the ability to multiplex N=capacity concurrent SSH sessions over the same TCP connection."""
5001 free: int # sort order evens out the number of concurrent sessions among the TCP connections
5002 last_modified: int # LIFO: tiebreaker favors latest returned conn as that's most alive and hot
5004 def __init__(self, remote: Remote, max_concurrent_ssh_sessions_per_tcp_connection: int, cid: int):
5005 assert max_concurrent_ssh_sessions_per_tcp_connection > 0
5006 self.capacity: int = max_concurrent_ssh_sessions_per_tcp_connection
5007 self.free: int = max_concurrent_ssh_sessions_per_tcp_connection
5008 self.last_modified: int = 0
5009 self.cid: int = cid
5010 self.ssh_cmd: List[str] = remote.local_ssh_command()
5011 self.ssh_cmd_quoted: List[str] = [shlex.quote(item) for item in self.ssh_cmd]
5012 self.lock: threading.Lock = threading.Lock()
5013 self.last_refresh_time: int = 0
5015 def __repr__(self) -> str:
5016 return str({"free": self.free, "cid": self.cid})
5018 def increment_free(self, value: int) -> None:
5019 self.free += value
5020 assert self.free >= 0
5021 assert self.free <= self.capacity
5023 def is_full(self) -> bool:
5024 return self.free <= 0
5026 def update_last_modified(self, last_modified: int) -> None:
5027 self.last_modified = last_modified
5029 def shutdown(self, msg_prefix: str, p: Params) -> None:
5030 ssh_cmd = self.ssh_cmd
5031 if ssh_cmd:
5032 ssh_socket_cmd = ssh_cmd[0:-1] + ["-O", "exit", ssh_cmd[-1]]
5033 is_trace = p.log.isEnabledFor(log_trace)
5034 is_trace and p.log.trace(f"Executing {msg_prefix}: %s", " ".join([shlex.quote(x) for x in ssh_socket_cmd]))
5035 process = subprocess.run(ssh_socket_cmd, stdin=DEVNULL, stderr=PIPE, text=True)
5036 if process.returncode != 0:
5037 p.log.trace("%s", process.stderr.rstrip())
5040#############################################################################
5041class ConnectionPool:
5042 """Fetch a TCP connection for use in an SSH session, use it, finally return it back to the pool for future reuse."""
5044 def __init__(self, remote: Remote, max_concurrent_ssh_sessions_per_tcp_connection: int):
5045 assert max_concurrent_ssh_sessions_per_tcp_connection > 0
5046 self.remote: Remote = copy.copy(remote) # shallow copy for immutability (Remote is mutable)
5047 self.capacity: int = max_concurrent_ssh_sessions_per_tcp_connection
5048 self.priority_queue: SmallPriorityQueue = SmallPriorityQueue(reverse=True) # sorted by #free slots and last_modified
5049 self.last_modified: int = 0 # monotonically increasing sequence number
5050 self.cid: int = 0 # monotonically increasing connection number
5051 self._lock: threading.Lock = threading.Lock()
5053 def get_connection(self) -> Connection:
5054 with self._lock:
5055 conn = self.priority_queue.pop() if len(self.priority_queue) > 0 else None
5056 if conn is None or conn.is_full():
5057 if conn is not None:
5058 self.priority_queue.push(conn)
5059 conn = Connection(self.remote, self.capacity, self.cid) # add a new connection
5060 self.last_modified += 1
5061 conn.update_last_modified(self.last_modified) # LIFO tiebreaker favors latest conn as that's most alive
5062 self.cid += 1
5063 conn.increment_free(-1)
5064 self.priority_queue.push(conn)
5065 return conn
5067 def return_connection(self, conn: Connection) -> None:
5068 assert conn is not None
5069 with self._lock:
5070 # update priority = remove conn from queue, update priority, finally reinsert updated conn into queue
5071 self.priority_queue.remove(conn, assert_is_contained=True)
5072 conn.increment_free(1)
5073 self.last_modified += 1
5074 conn.update_last_modified(self.last_modified) # LIFO tiebreaker favors latest conn as that's most alive
5075 self.priority_queue.push(conn)
5077 def shutdown(self, msg_prefix: str) -> None:
5078 if self.remote.reuse_ssh_connection:
5079 for conn in self.priority_queue:
5080 conn.shutdown(msg_prefix, self.remote.params)
5081 self.priority_queue.clear()
5083 def __repr__(self) -> str:
5084 with self._lock:
5085 queue = self.priority_queue
5086 return str({"capacity": self.capacity, "queue_len": len(queue), "cid": self.cid, "queue": queue})
5089#############################################################################
5090class ConnectionPools:
5091 """A bunch of named connection pools with various multiplexing capacities."""
5093 def __init__(self, remote: Remote, capacities: Dict[str, int]):
5094 self.pools = {name: ConnectionPool(remote, capacity) for name, capacity in capacities.items()}
5096 def __repr__(self) -> str:
5097 return str(self.pools)
5099 def pool(self, name: str) -> ConnectionPool:
5100 return self.pools[name]
5102 def shutdown(self, msg_prefix: str) -> None:
5103 for name, pool in self.pools.items():
5104 pool.shutdown(msg_prefix + "/" + name)
5107#############################################################################
5108class ProgressReporter:
5109 """Periodically prints progress updates to the same console status line, which is helpful if the program runs in an
5110 interactive Unix terminal session. Tails the 'pv' output log files that are being written to by (parallel) replication,
5111 and extracts aggregate progress and throughput metrics from them, such as MB, MB/s, ETA, etc. Periodically prints these
5112 metrics to the console status line (but not to the log file), and in doing so "visually overwrites" the previous status
5113 line, via appending a \r carriage return control char rather than a \n newline char. Does not print a status line if the
5114 Unix environment var 'bzfs_isatty' is set to 'false', in order not to confuse programs that scrape redirected stdout.
5115 Example console status line:
5116 2025-01-17 01:23:04 [I] zfs sent 41.7 GiB 0:00:46 [963 MiB/s] [907 MiB/s] [==========> ] 80% ETA 0:00:04 ETA 01:23:08"""
5118 def __init__(self, p: Params, use_select: bool, progress_update_intervals: Optional[Tuple[float, float]], fail=False):
5119 # immutable variables:
5120 self.params: Params = p
5121 self.use_select: bool = use_select
5122 self.progress_update_intervals = progress_update_intervals
5123 self.inject_error: bool = fail # for testing only
5125 # mutable variables:
5126 self.thread: threading.Thread = None
5127 self.exception: BaseException = None
5128 self.lock: threading.Lock = threading.Lock()
5129 self.sleeper: InterruptibleSleep = InterruptibleSleep(self.lock) # sleeper shares lock with reporter
5130 self.file_name_queue: Set[str] = set()
5131 self.file_name_set: Set[str] = set()
5132 self.is_resetting = True
5133 self.is_pausing = False
5135 def start(self) -> None:
5136 with self.lock:
5137 assert self.thread is None
5138 self.thread = threading.Thread(target=lambda: self._run(), name="progress_reporter", daemon=True)
5139 self.thread.start()
5141 def stop(self) -> None:
5142 """Blocks until reporter is stopped, then reraises any exception that may have happened during log processing."""
5143 self.sleeper.interrupt()
5144 t = self.thread
5145 if t is not None:
5146 t.join()
5147 e = self.exception
5148 if e is not None:
5149 raise e # reraise exception in current thread
5151 def pause(self) -> None:
5152 with self.lock:
5153 self.is_pausing = True
5155 def reset(self) -> None:
5156 with self.lock:
5157 self.is_resetting = True
5159 def enqueue_pv_log_file(self, pv_log_file: str) -> None:
5160 """Tells progress reporter thread to also monitor and tail the given pv log file."""
5161 with self.lock:
5162 if pv_log_file not in self.file_name_set:
5163 self.file_name_queue.add(pv_log_file)
5165 def _run(self) -> None:
5166 log = self.params.log
5167 try:
5168 fds: List[TextIO] = []
5169 try:
5170 selector = selectors.SelectSelector() if self.use_select else selectors.PollSelector()
5171 try:
5172 self._run_internal(fds, selector)
5173 finally:
5174 selector.close()
5175 finally:
5176 for fd in fds:
5177 fd.close()
5178 except BaseException as e:
5179 self.exception = e # will be reraised in stop()
5180 log.error("%s%s", "ProgressReporter:\n", "".join(traceback.TracebackException.from_exception(e).format()))
5182 @dataclass
5183 class TransferStat:
5184 @dataclass(order=True)
5185 class ETA: # Estimated time of arrival
5186 timestamp_nanos: int # sorted by future time at which current zfs send/recv transfer is estimated to complete
5187 seq_nr: int # tiebreaker wrt. sort order
5188 line_tail: str = field(compare=False) # trailing pv log line part w/ progress bar, duration ETA, timestamp ETA
5190 bytes_in_flight: int
5191 eta: ETA
5193 def _run_internal(self, fds: List[TextIO], selector: selectors.BaseSelector) -> None:
5195 @dataclass
5196 class Sample:
5197 sent_bytes: int
5198 timestamp_nanos: int
5200 update_interval_secs, sliding_window_secs = (
5201 self.progress_update_intervals if self.progress_update_intervals is not None else self.get_update_intervals()
5202 )
5203 update_interval_nanos: int = round(update_interval_secs * 1_000_000_000)
5204 sliding_window_nanos: int = round(sliding_window_secs * 1_000_000_000)
5205 sleep_nanos = round(update_interval_nanos / 2.5)
5206 etas: List = []
5207 while True:
5208 empty_file_name_queue: Set[str] = set()
5209 with self.lock:
5210 if self.sleeper.is_stopping:
5211 return
5212 # progress reporter thread picks up pv log files that so far aren't being tailed
5213 n = len(self.file_name_queue)
5214 m = len(self.file_name_set)
5215 self.file_name_set.update(self.file_name_queue) # union
5216 assert len(self.file_name_set) == n + m # aka assert (previous) file_name_set.isdisjoint(file_name_queue)
5217 local_file_name_queue = self.file_name_queue
5218 self.file_name_queue = empty_file_name_queue # exchange buffers
5219 is_pausing = self.is_pausing
5220 self.is_pausing = False
5221 is_resetting = self.is_resetting
5222 self.is_resetting = False
5223 if is_pausing:
5224 next_update_nanos = time.time_ns() + 1000 * 365 * 86400 * 1_000_000_000 # infinity
5225 if is_resetting:
5226 sent_bytes, last_status_len = 0, 0
5227 num_lines, num_readables = 0, 0
5228 start_time_nanos = time.time_ns()
5229 next_update_nanos = start_time_nanos + update_interval_nanos
5230 latest_samples: Deque[Sample] = deque([Sample(0, start_time_nanos)]) # sliding window w/ recent measurements
5231 for pv_log_file in local_file_name_queue:
5232 Path(pv_log_file).touch()
5233 fd = open(pv_log_file, mode="r", newline="", encoding="utf-8")
5234 fds.append(fd)
5235 eta = self.TransferStat.ETA(timestamp_nanos=0, seq_nr=-len(fds), line_tail="")
5236 selector.register(fd, selectors.EVENT_READ, data=(iter(fd), self.TransferStat(bytes_in_flight=0, eta=eta)))
5237 etas.append(eta)
5238 readables = selector.select(timeout=0) # 0 indicates "don't block"
5239 has_line = False
5240 curr_time_nanos = time.time_ns()
5241 for selector_key, _ in readables: # for each file that's ready for non-blocking read
5242 num_readables += 1
5243 key: selectors.SelectorKey = selector_key
5244 iter_fd, s = key.data
5245 for line in iter_fd: # aka iter(fd)
5246 sent_bytes += self.update_transfer_stat(line, s, curr_time_nanos)
5247 num_lines += 1
5248 has_line = True
5249 if curr_time_nanos >= next_update_nanos:
5250 elapsed_nanos = curr_time_nanos - start_time_nanos
5251 msg0, msg3 = self.format_sent_bytes(sent_bytes, elapsed_nanos) # throughput etc since replication start time
5252 msg1 = self.format_duration(elapsed_nanos) # duration since replication start time
5253 oldest: Sample = latest_samples[0] # throughput etc, over sliding window
5254 _, msg2 = self.format_sent_bytes(sent_bytes - oldest.sent_bytes, curr_time_nanos - oldest.timestamp_nanos)
5255 msg4 = max(etas).line_tail if len(etas) > 0 else "" # progress bar, ETAs
5256 timestamp = datetime.now().isoformat(sep=" ", timespec="seconds") # 2024-09-03 12:26:15
5257 status_line = f"{timestamp} [I] zfs sent {msg0} {msg1} {msg2} {msg3} {msg4}"
5258 status_line = status_line.ljust(last_status_len) # "overwrite" trailing chars of previous status with spaces
5260 # The Unix console skips back to the beginning of the console line when it sees this \r control char:
5261 sys.stdout.write(f"{status_line}\r")
5262 sys.stdout.flush()
5264 # log.trace("\nnum_lines: %s, num_readables: %s", num_lines, num_readables)
5265 last_status_len = len(status_line.rstrip())
5266 next_update_nanos += update_interval_nanos
5267 latest_samples.append(Sample(sent_bytes, curr_time_nanos))
5268 if elapsed_nanos >= sliding_window_nanos:
5269 latest_samples.popleft() # slide the sliding window containing recent measurements
5270 elif not has_line:
5271 # Avoid burning CPU busily spinning on I/O readiness as fds are almost always ready for non-blocking read
5272 # even if no new pv log line has been written. Yet retain ability to wake up immediately on reporter.stop().
5273 self.sleeper.sleep(min(sleep_nanos, next_update_nanos - curr_time_nanos))
5274 if self.inject_error:
5275 raise ValueError("Injected ProgressReporter error") # for testing only
5277 def update_transfer_stat(self, line: str, s: TransferStat, curr_time_nanos: int) -> int:
5278 num_bytes, s.eta.timestamp_nanos, s.eta.line_tail = self.parse_pv_line(line, curr_time_nanos)
5279 bytes_in_flight = s.bytes_in_flight
5280 s.bytes_in_flight = num_bytes if line.endswith("\r") else 0 # intermediate vs. final status update of each transfer
5281 return num_bytes - bytes_in_flight
5283 no_rates_regex = re.compile(r".*/s\s*[)\]]?\s*") # matches until end of last pv rate, e.g. "834MiB/s]" or "834MiB/s)"
5284 # time remaining --eta "ETA 00:00:39" or "ETA 2+0:00:39" or "ETA 2:0:00:39", followed by trailing --fineta timestamp ETA
5285 time_remaining_eta_regex = re.compile(r".*?ETA\s*((\d+)[+:])?(\d\d?):(\d\d):(\d\d).*(ETA|FIN).*")
5287 @staticmethod
5288 def parse_pv_line(line: str, curr_time_nanos: int) -> Tuple[int, int, str]:
5289 assert isinstance(line, str)
5290 if ":" in line:
5291 line = line.split(":", 1)[1].strip()
5292 sent_bytes, line = pv_size_to_bytes(line)
5293 line = ProgressReporter.no_rates_regex.sub("", line.lstrip(), 1) # remove pv --timer, --rate, --average-rate
5294 match = ProgressReporter.time_remaining_eta_regex.fullmatch(line) # extract pv --eta duration
5295 if match:
5296 _, days, hours, minutes, secs, _ = match.groups()
5297 time_remaining_secs = (86400 * int(days) if days else 0) + int(hours) * 3600 + int(minutes) * 60 + int(secs)
5298 curr_time_nanos += time_remaining_secs * 1_000_000_000 # ETA timestamp = now + time remaining duration
5299 return sent_bytes, curr_time_nanos, line
5300 return 0, curr_time_nanos, ""
5302 @staticmethod
5303 def format_sent_bytes(num_bytes: int, duration_nanos: int) -> Tuple[str, str]:
5304 bytes_per_sec = round(1_000_000_000 * num_bytes / max(1, duration_nanos))
5305 return f"{human_readable_bytes(num_bytes, precision=2)}", f"[{human_readable_bytes(bytes_per_sec, precision=2)}/s]"
5307 @staticmethod
5308 def format_duration(duration_nanos: int) -> str:
5309 total_seconds = round(duration_nanos / 1_000_000_000)
5310 hours, remainder = divmod(total_seconds, 3600)
5311 minutes, seconds = divmod(remainder, 60)
5312 return f"{hours}:{minutes:02d}:{seconds:02d}"
5314 def get_update_intervals(self) -> Tuple[float, float]:
5315 parser = argparse.ArgumentParser(allow_abbrev=False)
5316 parser.add_argument("--interval", "-i", type=float, default=1)
5317 parser.add_argument("--average-rate-window", "-m", type=float, default=30)
5318 args, _ = parser.parse_known_args(args=self.params.pv_program_opts)
5319 interval = min(60 * 60, max(args.interval, 0.1))
5320 return interval, min(60 * 60, max(args.average_rate_window, interval))
5323#############################################################################
5324class InterruptibleSleep:
5325 """Provides a sleep(timeout) function that can be interrupted by another thread."""
5327 def __init__(self, lock=None):
5328 self.is_stopping: bool = False
5329 self._lock = lock if lock is not None else threading.Lock()
5330 self._condition = threading.Condition(self._lock)
5332 def sleep(self, duration_nanos: int) -> None:
5333 """Delays the current thread by the given number of nanoseconds."""
5334 end_time_nanos = time.time_ns() + duration_nanos
5335 with self._lock:
5336 while not self.is_stopping:
5337 duration_nanos = end_time_nanos - time.time_ns()
5338 if duration_nanos <= 0:
5339 return
5340 self._condition.wait(timeout=duration_nanos / 1_000_000_000) # release, then block until notified or timeout
5342 def interrupt(self) -> None:
5343 """Wakes up currently sleeping threads and makes any future sleep()s a noop."""
5344 with self._lock:
5345 if not self.is_stopping:
5346 self.is_stopping = True
5347 self._condition.notify_all()
5350#############################################################################
5351def fix_send_recv_opts(
5352 opts: List[str],
5353 exclude_long_opts: Set[str],
5354 exclude_short_opts: str,
5355 include_arg_opts: Set[str],
5356 exclude_arg_opts: Set[str] = set(),
5357) -> List[str]:
5358 """These opts are instead managed via bzfs CLI args --dryrun, etc."""
5359 assert "-" not in exclude_short_opts
5360 results = []
5361 i = 0
5362 n = len(opts)
5363 while i < n:
5364 opt = opts[i]
5365 i += 1
5366 if opt in exclude_arg_opts: # example: {"-X", "--exclude"}
5367 i += 1
5368 continue
5369 elif opt in include_arg_opts: # example: {"-o", "-x"}
5370 results.append(opt)
5371 if i < n:
5372 results.append(opts[i])
5373 i += 1
5374 elif opt not in exclude_long_opts: # example: {"--dryrun", "--verbose"}
5375 if opt.startswith("-") and opt != "-" and not opt.startswith("--"):
5376 for char in exclude_short_opts: # example: "den"
5377 opt = opt.replace(char, "")
5378 if opt == "-":
5379 continue
5380 results.append(opt)
5381 return results
5384def fix_solaris_raw_mode(lst: List[str]) -> List[str]:
5385 lst = ["-w" if opt == "--raw" else opt for opt in lst]
5386 lst = ["compress" if opt == "--compressed" else opt for opt in lst]
5387 i = lst.index("-w") if "-w" in lst else -1
5388 if i >= 0:
5389 i += 1
5390 if i == len(lst) or (lst[i] != "none" and lst[i] != "compress"):
5391 lst.insert(i, "none")
5392 return lst
5395def delete_stale_files(root_dir: str, prefix: str, secs: int = 31 * 24 * 60 * 60, dirs=False, exclude=None) -> None:
5396 """Cleans up obsolete files. For example caused by abnormal termination, OS crash."""
5397 now = time.time()
5398 for entry in os.scandir(root_dir):
5399 if entry.name == exclude or not entry.name.startswith(prefix):
5400 continue
5401 try:
5402 if ((dirs and entry.is_dir()) or (not dirs and not entry.is_dir())) and now - entry.stat().st_mtime >= secs:
5403 if dirs:
5404 shutil.rmtree(entry.path, ignore_errors=True)
5405 else:
5406 os.remove(entry.path)
5407 except FileNotFoundError:
5408 pass # harmless
5411def die(msg: str) -> None:
5412 ex = SystemExit(msg)
5413 ex.code = die_status
5414 raise ex
5417def cut(field: int = -1, separator: str = "\t", lines: List[str] = None) -> List[str]:
5418 """Retains only column number 'field' in a list of TSV/CSV lines; Analog to Unix 'cut' CLI command."""
5419 assert isinstance(lines, list)
5420 if field == 1:
5421 return [line[0 : line.index(separator)] for line in lines]
5422 elif field == 2:
5423 return [line[line.index(separator) + 1 :] for line in lines]
5424 else:
5425 raise ValueError("Unsupported parameter value")
5428def has_siblings(sorted_datasets: List[str]) -> bool:
5429 """Returns whether the (sorted) list of input datasets contains any siblings."""
5430 skip_dataset = DONT_SKIP_DATASET
5431 parents: Set[str] = set()
5432 for dataset in sorted_datasets:
5433 assert dataset
5434 parent = os.path.dirname(dataset)
5435 if parent in parents:
5436 return True # I have a sibling if my parent already has another child
5437 parents.add(parent)
5438 if is_descendant(dataset, of_root_dataset=skip_dataset):
5439 continue
5440 if skip_dataset != DONT_SKIP_DATASET:
5441 return True # I have a sibling if I am a root dataset and another root dataset already exists
5442 skip_dataset = dataset
5443 return False
5446def is_descendant(dataset: str, of_root_dataset: str) -> bool:
5447 return f"{dataset}/".startswith(f"{of_root_dataset}/")
5450def relativize_dataset(dataset: str, root_dataset: str) -> str:
5451 """Converts an absolute dataset path to a relative dataset path wrt root_dataset
5452 Example: root_dataset=tank/foo, dataset=tank/foo/bar/baz --> relative_path=/bar/baz"""
5453 return dataset[len(root_dataset) :]
5456def replace_prefix(s: str, old_prefix: str, new_prefix: str) -> str:
5457 """In a string s, replaces a leading old_prefix string with new_prefix. Assumes the leading string is present."""
5458 return new_prefix + s[len(old_prefix) :]
5461def replace_in_lines(lines: List[str], old: str, new: str) -> None:
5462 for i in range(len(lines)):
5463 lines[i] = lines[i].replace(old, new)
5466def is_included(name: str, include_regexes: RegexList, exclude_regexes: RegexList) -> bool:
5467 """Returns True if the name matches at least one of the include regexes but none of the exclude regexes;
5468 else False. A regex that starts with a `!` is a negation - the regex matches if the regex without the
5469 `!` prefix does not match."""
5470 for regex, is_negation in exclude_regexes:
5471 is_match = regex.fullmatch(name) if regex.pattern != ".*" else True
5472 if is_negation:
5473 is_match = not is_match
5474 if is_match:
5475 return False
5477 for regex, is_negation in include_regexes:
5478 is_match = regex.fullmatch(name) if regex.pattern != ".*" else True
5479 if is_negation:
5480 is_match = not is_match
5481 if is_match:
5482 return True
5484 return False
5487def compile_regexes(regexes: List[str], suffix: str = "") -> RegexList:
5488 assert isinstance(regexes, list)
5489 compiled_regexes = []
5490 for regex in regexes:
5491 if suffix: # disallow non-trailing end-of-str symbol in dataset regexes to ensure descendants will also match
5492 if regex.endswith("\\$"):
5493 pass # trailing literal $ is ok
5494 elif regex.endswith("$"):
5495 regex = regex[0:-1] # ok because all users of compile_regexes() call re.fullmatch()
5496 elif "$" in regex:
5497 raise re.error("Must not use non-trailing '$' character", regex)
5498 is_negation = regex.startswith("!")
5499 if is_negation:
5500 regex = regex[1:]
5501 regex = replace_capturing_groups_with_non_capturing_groups(regex)
5502 if regex != ".*" or not (suffix.startswith("(") and suffix.endswith(")?")):
5503 regex = f"{regex}{suffix}"
5504 compiled_regexes.append((re.compile(regex), is_negation))
5505 return compiled_regexes
5508def replace_capturing_groups_with_non_capturing_groups(regex: str) -> str:
5509 """Replaces regex capturing groups with non-capturing groups for better matching performance.
5510 Example: '(.*/)?tmp(foo|bar)(?!public)\\(' --> '(?:.*/)?tmp(?:foo|bar)(?!public)\\()'
5511 Aka replaces brace '(' followed by a char other than question mark '?', but not preceded by a backslash
5512 with the replacement string '(?:'
5513 Also see https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups"""
5514 # pattern = re.compile(r'(?<!\\)\((?!\?)')
5515 # return pattern.sub('(?:', regex)
5516 i = len(regex) - 2
5517 while i >= 0:
5518 i = regex.rfind("(", 0, i + 1)
5519 if i >= 0 and regex[i] == "(" and (regex[i + 1] != "?") and (i == 0 or regex[i - 1] != "\\"):
5520 regex = f"{regex[0:i]}(?:{regex[i + 1:]}"
5521 i -= 1
5522 return regex
5525def getenv_any(key: str, default=None) -> str:
5526 """All shell environment variable names used for configuration start with this prefix."""
5527 return os.getenv(env_var_prefix + key, default)
5530def getenv_int(key: str, default: int) -> int:
5531 return int(getenv_any(key, default))
5534def getenv_bool(key: str, default: bool = False) -> bool:
5535 return getenv_any(key, str(default).lower()).strip().lower() == "true"
5538P = TypeVar("P")
5541def find_match(
5542 seq: Sequence[P],
5543 predicate: Callable[[P], bool],
5544 start: Optional[int] = None,
5545 end: Optional[int] = None,
5546 reverse: bool = False,
5547 raises: Union[bool, str, Callable[[], str]] = False, # raises: bool | str | Callable = False, # python >= 3.10
5548) -> int:
5549 """Returns the integer index within seq of the first item (or last item if reverse==True) that matches the given
5550 predicate condition. If no matching item is found returns -1 or ValueError, depending on the raises parameter,
5551 which is a bool indicating whether to raise an error, or a string containing the error message, but can also be a
5552 Callable/lambda in order to support efficient deferred generation of error messages.
5553 Analog to str.find(), including slicing semantics with parameters start and end.
5554 For example, seq can be a list, tuple or str.
5556 Example usage:
5557 lst = ["a", "b", "-c", "d"]
5558 i = find_match(lst, lambda arg: arg.startswith("-"), start=1, end=3, reverse=True)
5559 if i >= 0:
5560 ...
5561 i = find_match(lst, lambda arg: arg.startswith("-"), raises=f"Tag {tag} not found in {file}")
5562 i = find_match(lst, lambda arg: arg.startswith("-"), raises=lambda: f"Tag {tag} not found in {file}")
5563 """
5564 offset = 0 if start is None else start if start >= 0 else len(seq) + start
5565 if start is not None or end is not None:
5566 seq = seq[start:end]
5567 for i, item in enumerate(reversed(seq) if reverse else seq):
5568 if predicate(item):
5569 if reverse:
5570 return len(seq) - i - 1 + offset
5571 else:
5572 return i + offset
5573 if raises is False or raises is None:
5574 return -1
5575 if raises is True:
5576 raise ValueError("No matching item found in sequence")
5577 if callable(raises):
5578 raises = raises()
5579 raise ValueError(raises)
5582def xappend(lst, *items) -> List[str]:
5583 """Appends each of the items to the given list if the item is "truthy", e.g. not None and not an empty string.
5584 If an item is an iterable does so recursively, flattening the output."""
5585 for item in items:
5586 if isinstance(item, str) or not isinstance(item, collections.abc.Iterable):
5587 if item:
5588 lst.append(item)
5589 else:
5590 xappend(lst, *item)
5591 return lst
5594def human_readable_bytes(size: float, separator=" ", precision=None, long=False) -> str:
5595 sign = "-" if size < 0 else ""
5596 s = abs(size)
5597 units = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", "RiB", "QiB")
5598 i = 0
5599 long_form = f" ({size} bytes)" if long else ""
5600 while s >= 1024 and i < len(units) - 1:
5601 s /= 1024
5602 i += 1
5603 formatted_num = human_readable_float(s) if precision is None else f"{s:.{precision}f}"
5604 return f"{sign}{formatted_num}{separator}{units[i]}{long_form}"
5607def human_readable_duration(duration: float, unit="ns", separator=" ", precision=None, long=False) -> str:
5608 sign = "-" if duration < 0 else ""
5609 t = abs(duration)
5610 units = ("ns", "μs", "ms", "s", "m", "h", "d")
5611 seconds = (1 / 1_000_000_000, 1 / 1_000_000, 1 / 1000, 1, 60, 60 * 60, 60 * 60 * 24)
5612 i = units.index(unit)
5613 long_form = f" ({round(duration * seconds[i])} seconds)" if long else ""
5614 while t >= 1000 and i < 3:
5615 t /= 1000
5616 i += 1
5617 if i >= 3:
5618 while t >= 60 and i < 5:
5619 t /= 60
5620 i += 1
5621 if i >= 5:
5622 while t >= 24 and i < len(units) - 1:
5623 t /= 24
5624 i += 1
5625 formatted_num = human_readable_float(t) if precision is None else f"{t:.{precision}f}"
5626 return f"{sign}{formatted_num}{separator}{units[i]}{long_form}"
5629def human_readable_float(number: float) -> str:
5630 """If the number has one digit before the decimal point (0 <= abs(number) < 10):
5631 Round and use two decimals after the decimal point (e.g., 3.14559 --> "3.15").
5633 If the number has two digits before the decimal point (10 <= abs(number) < 100):
5634 Round and use one decimal after the decimal point (e.g., 12.36 --> "12.4").
5636 If the number has three or more digits before the decimal point (abs(number) >= 100):
5637 Round and use zero decimals after the decimal point (e.g., 123.556 --> "124").
5639 Ensure no unnecessary trailing zeroes are retained: Example: 1.500 --> "1.5", 1.00 --> "1"
5640 """
5641 abs_number = abs(number)
5642 precision = 2 if abs_number < 10 else 1 if abs_number < 100 else 0
5643 if precision == 0:
5644 return str(round(number))
5645 result = f"{number:.{precision}f}"
5646 assert "." in result
5647 result = result.rstrip("0").rstrip(".") # Remove trailing zeros and trailing decimal point if empty
5648 return "0" if result == "-0" else result
5651def parse_duration_to_milliseconds(duration: str, regex_suffix: str = "") -> int:
5652 unit_milliseconds = {
5653 "milliseconds": 1,
5654 "millis": 1,
5655 "seconds": 1000,
5656 "secs": 1000,
5657 "minutes": 60 * 1000,
5658 "mins": 60 * 1000,
5659 "hours": 60 * 60 * 1000,
5660 "days": 86400 * 1000,
5661 "weeks": 7 * 86400 * 1000,
5662 "months": round(30.5 * 86400 * 1000),
5663 "years": round(365 * 86400 * 1000),
5664 }
5665 match = re.fullmatch(
5666 r"(\d+)\s*(milliseconds|millis|seconds|secs|minutes|mins|hours|days|weeks|months|years)" + regex_suffix, duration
5667 )
5668 if not match:
5669 raise ValueError("Invalid duration format")
5670 quantity = int(match.group(1))
5671 unit = match.group(2)
5672 return quantity * unit_milliseconds[unit]
5675def get_home_directory() -> str:
5676 """Reliably detects home dir without using HOME env var."""
5677 # thread-safe version of: os.environ.pop('HOME', None); os.path.expanduser('~')
5678 return pwd.getpwuid(os.getuid()).pw_dir
5681def create_symlink(src: str, dst_dir: str, dst: str) -> None:
5682 rel_path = os.path.relpath(src, start=dst_dir)
5683 os.symlink(rel_path, os.path.join(dst_dir, dst))
5686def is_version_at_least(version_str: str, min_version_str: str) -> bool:
5687 """Checks if the version string is at least the minimum version string."""
5688 return tuple(map(int, version_str.split("."))) >= tuple(map(int, min_version_str.split(".")))
5691def tail(file, n: int) -> Sequence[str]:
5692 if not os.path.isfile(file):
5693 return []
5694 with open(file, "r", encoding="utf-8") as fd:
5695 return deque(fd, maxlen=n)
5698def append_if_absent(lst: List, *items) -> List:
5699 for item in items:
5700 if item not in lst:
5701 lst.append(item)
5702 return lst
5705def stderr_to_str(stderr) -> str:
5706 """Workaround for https://github.com/python/cpython/issues/87597"""
5707 return stderr if not isinstance(stderr, bytes) else stderr.decode("utf-8")
5710def xprint(log: Logger, value, run: bool = True, end: str = "\n", file=None) -> None:
5711 if run and value:
5712 value = value if end else value.rstrip()
5713 level = log_stdout if file is sys.stdout else log_stderr
5714 log.log(level, "%s", value)
5717def unlink_missing_ok(file: str) -> None: # workaround for compat with python < 3.8
5718 try:
5719 Path(file).unlink()
5720 except FileNotFoundError:
5721 pass
5724def set_last_modification_time(path: str, unixtime_in_secs: int, if_more_recent=False) -> None:
5725 if not os.path.exists(path):
5726 with open(path, "a"):
5727 pass
5728 elif if_more_recent and unixtime_in_secs <= round(os.stat(path).st_mtime):
5729 return
5730 os.utime(path, times=(unixtime_in_secs, unixtime_in_secs))
5733def drain(iterable: Iterable) -> None:
5734 deque(iterable, maxlen=0)
5737def nsuffix(s: str) -> str:
5738 return "_" + s if s else ""
5741def ninfix(s: str) -> str:
5742 return s + "_" if s else ""
5745def unixtime_fromisoformat(datetime_str: str) -> int:
5746 """Converts an ISO 8601 datetime string into a UTC Unix time in integer seconds. If the datetime string does not
5747 contain time zone info then it is assumed to be in the local time zone."""
5748 return int(datetime.fromisoformat(datetime_str).timestamp())
5751def isotime_from_unixtime(unixtime_in_seconds: int) -> str:
5752 """Converts a UTC Unix time in integer seconds into an ISO 8601 datetime string in the local time zone.
5753 Example: 2024-09-03_12:26:15"""
5754 tz = timezone.utc # outputs time in UTC
5755 tz = None # outputs time in local time zone
5756 dt = datetime.fromtimestamp(unixtime_in_seconds, tz=tz)
5757 return dt.isoformat(sep="_", timespec="seconds")
5760def current_datetime(tz_spec: str = None, now_fn: Callable[[Optional[tzinfo]], datetime] = None) -> datetime:
5761 """Returns a datetime that is the current time in the given timezone, or in the local timezone if tz_spec is absent."""
5762 now_fn = now_fn or datetime.now
5763 return now_fn(get_timezone(tz_spec))
5766def get_timezone(tz_spec: str = None) -> tzinfo:
5767 """Returns the given timezone, or the local timezone if the timezone spec is absent. The optional timezone spec is of
5768 the form "UTC" or "+HH:MM" or "-HH:MM" for fixed UTC offsets."""
5769 if tz_spec is None:
5770 tz = None # i.e. local timezone
5771 elif tz_spec == "UTC":
5772 tz = timezone.utc
5773 else:
5774 match = re.fullmatch(r"([+-])(\d\d):?(\d\d)", tz_spec)
5775 if match:
5776 sign, hours, minutes = match.groups()
5777 offset = int(hours) * 60 + int(minutes)
5778 offset = -offset if sign == "-" else offset
5779 tz = timezone(timedelta(minutes=offset))
5780 elif "/" in tz_spec and sys.version_info >= (3, 9):
5781 from zoneinfo import ZoneInfo # requires python >= 3.9
5783 tz = ZoneInfo(tz_spec) # Standard IANA timezone. Example: "Europe/Vienna"
5784 else:
5785 raise ValueError(f"Invalid timezone specification: {tz_spec}")
5786 return tz
5789metadata_month = {"min": 1, "max": 12, "help": "The month within a year"}
5790metadata_weekday = {"min": 0, "max": 6, "help": "The weekday within a week: 0=Sunday, 1=Monday, ..., 6=Saturday"}
5791metadata_day = {"min": 1, "max": 31, "help": "The day within a month"}
5792metadata_hour = {"min": 0, "max": 23, "help": "The hour within a day"}
5793metadata_minute = {"min": 0, "max": 59, "help": "The minute within an hour"}
5794metadata_second = {"min": 0, "max": 59, "help": "The second within a minute"}
5795metadata_millisecond = {"min": 0, "max": 999, "help": "The millisecond within a second"}
5796metadata_microsecond = {"min": 0, "max": 999, "help": "The microsecond within a millisecond"}
5799@dataclass
5800class PeriodAnchors:
5801 # The anchors for a given duration unit are computed as follows:
5802 # yearly: Anchor(dt) = latest T where T <= dt and T == Start of January 1 of dt + anchor.yearly_* vars
5803 yearly_month: int = field(default=1, metadata=metadata_month) # 1 <= x <= 12
5804 yearly_monthday: int = field(default=1, metadata=metadata_day) # 1 <= x <= 31
5805 yearly_hour: int = field(default=0, metadata=metadata_hour) # 0 <= x <= 23
5806 yearly_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
5807 yearly_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
5809 # monthly: Anchor(dt) = latest T where T <= dt && T == Start of first day of month of dt + anchor.monthly_* vars
5810 monthly_monthday: int = field(default=1, metadata=metadata_day) # 1 <= x <= 31
5811 monthly_hour: int = field(default=0, metadata=metadata_hour) # 0 <= x <= 23
5812 monthly_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
5813 monthly_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
5815 # weekly: Anchor(dt) = latest T where T <= dt && T == Latest midnight from Sunday to Monday of dt + anchor.weekly_* vars
5816 weekly_weekday: int = field(default=0, metadata=metadata_weekday) # 0 <= x <= 7
5817 weekly_hour: int = field(default=0, metadata=metadata_hour) # 0 <= x <= 23
5818 weekly_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
5819 weekly_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
5821 # daily: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.daily_* vars
5822 daily_hour: int = field(default=0, metadata=metadata_hour) # 0 <= x <= 23
5823 daily_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
5824 daily_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
5826 # hourly: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.hourly_* vars
5827 hourly_minute: int = field(default=0, metadata=metadata_minute) # 0 <= x <= 59
5828 hourly_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
5830 # minutely: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.minutely_* vars
5831 minutely_second: int = field(default=0, metadata=metadata_second) # 0 <= x <= 59
5833 # secondly: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.secondly_* vars
5834 secondly_millisecond: int = field(default=0, metadata=metadata_millisecond) # 0 <= x <= 999
5836 # secondly: Anchor(dt) = latest T where T <= dt && T == Latest midnight of dt + anchor.millisecondly_* vars
5837 millisecondly_microsecond: int = field(default=0, metadata=metadata_microsecond) # 0 <= x <= 999
5839 def parse(self, args: argparse.Namespace):
5840 for f in fields(PeriodAnchors):
5841 setattr(self, f.name, getattr(args, f.name))
5842 return self
5845def round_datetime_up_to_duration_multiple(
5846 dt: datetime, duration_amount: int, duration_unit: str, anchors: PeriodAnchors = PeriodAnchors()
5847) -> datetime:
5848 """Given a timezone-aware datetime and a duration, returns a datetime (in the same timezone) that is greater than or
5849 equal to dt, and rounded up (ceiled) and snapped to an anchor plus a multiple of the duration. The snapping is done
5850 relative to the anchors object and the rules defined therein.
5851 Supported units: "millisecondly", "secondly", "minutely", "hourly", "daily", "weekly", "monthly", "yearly".
5852 If dt is already exactly on a boundary (i.e. exactly on a multiple), it is returned unchanged.
5853 Examples:
5854 Default hourly anchor is midnight
5855 14:00:00, 1 hours --> 14:00:00
5856 14:05:01, 1 hours --> 15:00:00
5857 15:05:01, 1 hours --> 16:00:00
5858 16:05:01, 1 hours --> 17:00:00
5859 23:55:01, 1 hours --> 00:00:00 on the next day
5860 14:05:01, 2 hours --> 16:00:00
5861 15:00:00, 2 hours --> 16:00:00
5862 15:05:01, 2 hours --> 16:00:00
5863 16:00:00, 2 hours --> 16:00:00
5864 16:05:01, 2 hours --> 18:00:00
5865 23:55:01, 2 hours --> 00:00:00 on the next day
5866 """
5868 def add_months(dt: datetime, months: int) -> datetime:
5869 total_month = dt.month - 1 + months
5870 new_year = dt.year + total_month // 12
5871 new_month = total_month % 12 + 1
5872 last_day = calendar.monthrange(new_year, new_month)[1] # last valid day of the current month
5873 return dt.replace(year=new_year, month=new_month, day=min(dt.day, last_day))
5875 def add_years(dt: datetime, years: int) -> datetime:
5876 new_year = dt.year + years
5877 last_day = calendar.monthrange(new_year, dt.month)[1] # last valid day of the current month
5878 return dt.replace(year=new_year, day=min(dt.day, last_day))
5880 def get_anchor(anchor: datetime, dt: datetime, period: timedelta) -> datetime:
5881 """Adjusts anchor downward by one period if anchor is in the future relative to dt."""
5882 if anchor > dt:
5883 diff = anchor - period
5884 assert diff <= dt
5885 return diff
5886 return anchor
5888 if duration_amount == 0:
5889 return dt
5891 period = None
5892 if duration_unit == "millisecondly":
5893 anchor = get_anchor(
5894 dt.replace(hour=0, minute=0, second=0, microsecond=anchors.millisecondly_microsecond),
5895 dt,
5896 timedelta(milliseconds=1),
5897 )
5898 period = timedelta(milliseconds=duration_amount)
5899 elif duration_unit == "secondly":
5900 anchor = get_anchor(
5901 dt.replace(hour=0, minute=0, second=0, microsecond=anchors.secondly_millisecond * 1000), dt, timedelta(seconds=1)
5902 )
5903 period = timedelta(seconds=duration_amount)
5904 elif duration_unit == "minutely":
5905 anchor = get_anchor(dt.replace(second=anchors.minutely_second, microsecond=0), dt, timedelta(minutes=1))
5906 period = timedelta(minutes=duration_amount)
5907 elif duration_unit == "hourly":
5908 daily_base = dt.replace(hour=0, minute=0, second=0, microsecond=0)
5909 anchor = get_anchor(
5910 daily_base + timedelta(minutes=anchors.hourly_minute, seconds=anchors.hourly_second), dt, timedelta(days=1)
5911 )
5912 period = timedelta(hours=duration_amount)
5913 elif duration_unit == "daily":
5914 daily_base = dt.replace(hour=0, minute=0, second=0, microsecond=0)
5915 anchor = get_anchor(
5916 daily_base + timedelta(hours=anchors.daily_hour, minutes=anchors.daily_minute, seconds=anchors.daily_second),
5917 dt,
5918 timedelta(days=1),
5919 )
5920 period = timedelta(days=duration_amount)
5921 elif duration_unit == "weekly":
5922 daily_base = dt.replace(hour=0, minute=0, second=0, microsecond=0)
5923 anchor = daily_base + timedelta(
5924 hours=anchors.weekly_hour, minutes=anchors.weekly_minute, seconds=anchors.weekly_second
5925 )
5926 # Convert cron weekday (0=Sunday, 1=Monday, ..., 6=Saturday) to Python's weekday (0=Monday, ..., 6=Sunday)
5927 target_py_weekday = (anchors.weekly_weekday - 1) % 7
5928 diff_days = (anchor.weekday() - target_py_weekday) % 7
5929 anchor = get_anchor(anchor - timedelta(days=diff_days), dt, timedelta(days=7))
5930 period = timedelta(weeks=duration_amount)
5932 if period is not None: # "millisecondly", "secondly", "minutely", "hourly", "daily", "weekly"
5933 delta = dt - anchor
5934 period_micros = (period.days * 86400 + period.seconds) * 1_000_000 + period.microseconds
5935 delta_micros = (delta.days * 86400 + delta.seconds) * 1_000_000 + delta.microseconds
5936 remainder = delta_micros % period_micros
5937 if remainder == 0:
5938 return dt
5939 return dt + timedelta(microseconds=period_micros - remainder)
5941 elif duration_unit == "monthly":
5942 last_day = calendar.monthrange(dt.year, dt.month)[1] # last valid day of the current month
5943 anchor = dt.replace( # Compute the base anchor for the month ensuring the day is valid
5944 day=min(anchors.monthly_monthday, last_day),
5945 hour=anchors.monthly_hour,
5946 minute=anchors.monthly_minute,
5947 second=anchors.monthly_second,
5948 microsecond=0,
5949 )
5950 if anchor > dt:
5951 anchor = add_months(anchor, -1)
5952 diff_months = (dt.year - anchor.year) * 12 + (dt.month - anchor.month)
5953 anchor_boundary = add_months(anchor, duration_amount * (diff_months // duration_amount))
5954 if anchor_boundary < dt:
5955 anchor_boundary = add_months(anchor_boundary, duration_amount)
5956 return anchor_boundary
5958 elif duration_unit == "yearly":
5959 last_day = calendar.monthrange(dt.year, anchors.yearly_month)[1] # last valid day for anchor month in current year
5960 anchor = dt.replace( # Compute the base yearly anchor candidate for the current year, ensuring the day is valid
5961 month=anchors.yearly_month,
5962 day=min(anchors.yearly_monthday, last_day),
5963 hour=anchors.yearly_hour,
5964 minute=anchors.yearly_minute,
5965 second=anchors.yearly_second,
5966 microsecond=0,
5967 )
5968 if anchor > dt:
5969 anchor = anchor.replace(year=anchor.year - 1)
5970 diff_years = dt.year - anchor.year
5971 anchor_boundary = add_years(anchor, duration_amount * (diff_years // duration_amount))
5972 if anchor_boundary < dt:
5973 anchor_boundary = add_years(anchor_boundary, duration_amount)
5974 return anchor_boundary
5976 else:
5977 raise ValueError(f"Unsupported duration unit: {duration_unit}")
5980def terminate_process_group(except_current_process=False):
5981 """Sends signal to the entire process group to also terminate child processes started via subprocess.run()"""
5982 signum = signal.SIGTERM
5983 old_signal_handler = (
5984 signal.signal(signum, lambda signalnum, frame: None) # temporarily disable signal handler on current process
5985 if except_current_process
5986 else signal.getsignal(signum)
5987 )
5988 try:
5989 is_test = any("unittest" in frame.filename for frame in inspect.stack())
5990 is_test or os.killpg(os.getpgrp(), signum) # avoid confusing python's unit test framework with killpg()
5991 finally:
5992 signal.signal(signum, old_signal_handler) # reenable and restore original handler
5995arabic_decimal_separator = "\u066b" # "٫"
5996pv_size_to_bytes_regex = re.compile(rf"(\d+[.,{arabic_decimal_separator}]?\d*)\s*([KMGTPEZYRQ]?)(i?)([Bb])(.*)")
5999def pv_size_to_bytes(size: str) -> Tuple[int, str]: # example inputs: "800B", "4.12 KiB", "510 MiB", "510 MB", "4Gb", "2TiB"
6000 match = pv_size_to_bytes_regex.fullmatch(size)
6001 if match:
6002 number = float(match.group(1).replace(",", ".").replace(arabic_decimal_separator, "."))
6003 i = "KMGTPEZYRQ".index(match.group(2)) if match.group(2) else -1
6004 m = 1024 if match.group(3) == "i" else 1000
6005 b = 1 if match.group(4) == "B" else 8
6006 line_tail = match.group(5)
6007 if line_tail and line_tail.startswith("/s"):
6008 raise ValueError("Invalid pv_size: " + size) # stems from 'pv --rate' or 'pv --average-rate'
6009 size_in_bytes = round(number * (m ** (i + 1)) / b)
6010 return size_in_bytes, line_tail
6011 else:
6012 return 0, "" # skip partial or bad 'pv' log file line (pv process killed while writing?)
6015def count_num_bytes_transferred_by_zfs_send(basis_pv_log_file: str) -> int:
6016 """Scrapes the .pv log file(s) and sums up the 'pv --bytes' column."""
6018 def parse_pv_line(line: str) -> int:
6019 if ":" in line:
6020 col = line.split(":", 1)[1].strip()
6021 num_bytes, _ = pv_size_to_bytes(col)
6022 return num_bytes
6023 return 0
6025 total_bytes = 0
6026 files = [basis_pv_log_file] + glob.glob(basis_pv_log_file + pv_file_thread_separator + "[0-9]*")
6027 for file in files:
6028 if os.path.isfile(file):
6029 with open(file, mode="r", newline="", encoding="utf-8") as fd:
6030 line = None
6031 for line in fd:
6032 if line.endswith("\r"):
6033 continue # skip all but the most recent status update of each transfer
6034 total_bytes += parse_pv_line(line)
6035 line = None
6036 if line is not None:
6037 total_bytes += parse_pv_line(line)
6038 return total_bytes
6041def parse_dataset_locator(input_text: str, validate: bool = True, user: str = None, host: str = None, port: int = None):
6042 def convert_ipv6(hostname: str) -> str: # support IPv6 without getting confused by host:dataset colon separator ...
6043 return hostname.replace("|", ":") # ... and any colons that may be part of a (valid) ZFS dataset name
6045 user_undefined = user is None
6046 if user is None:
6047 user = ""
6048 host_undefined = host is None
6049 if host is None:
6050 host = ""
6051 host = convert_ipv6(host)
6052 user_host, dataset, pool = "", "", ""
6054 # Input format is [[user@]host:]dataset
6055 # 1234 5 6
6056 match = re.fullmatch(r"(((([^@]*)@)?([^:]+)):)?(.*)", input_text, re.DOTALL)
6057 if match: 6057 ↛ 6074line 6057 didn't jump to line 6074 because the condition on line 6057 was always true
6058 if user_undefined:
6059 user = match.group(4) or ""
6060 if host_undefined:
6061 host = match.group(5) or ""
6062 host = convert_ipv6(host)
6063 if host == "-":
6064 host = ""
6065 dataset = match.group(6) or ""
6066 i = dataset.find("/")
6067 pool = dataset[0:i] if i >= 0 else dataset
6069 if user and host:
6070 user_host = f"{user}@{host}"
6071 elif host:
6072 user_host = host
6074 if validate:
6075 validate_user_name(user, input_text)
6076 validate_host_name(host, input_text)
6077 validate_port(port, f"Invalid port number: '{port}' for: '{input_text}' - ")
6078 validate_dataset_name(dataset, input_text)
6080 return user, host, user_host, pool, dataset
6083def validate_dataset_name(dataset: str, input_text: str) -> None:
6084 # 'zfs create' CLI does not accept dataset names that are empty or start or end in a slash, etc.
6085 # Also see https://github.com/openzfs/zfs/issues/439#issuecomment-2784424
6086 # and https://github.com/openzfs/zfs/issues/8798
6087 # and (by now nomore accurate): https://docs.oracle.com/cd/E26505_01/html/E37384/gbcpt.html
6088 if (
6089 dataset in ["", ".", ".."]
6090 or "//" in dataset
6091 or dataset.startswith("/")
6092 or dataset.endswith("/")
6093 or dataset.startswith("./")
6094 or dataset.startswith("../")
6095 or dataset.endswith("/.")
6096 or dataset.endswith("/..")
6097 or "/./" in dataset
6098 or "/../" in dataset
6099 or '"' in dataset
6100 or any(char in "'@#`%$^&*+=|,\\" for char in dataset)
6101 or any(char.isspace() and char != " " for char in dataset)
6102 or not dataset[0].isalpha()
6103 ):
6104 die(f"Invalid ZFS dataset name: '{dataset}' for: '{input_text}'")
6107def validate_user_name(user: str, input_text: str) -> None:
6108 if user and (".." in user or any(c.isspace() or c == '"' or c == "'" or c in "/@`" for c in user)):
6109 die(f"Invalid user name: '{user}' for: '{input_text}'")
6112def validate_host_name(host: str, input_text: str) -> None:
6113 if host and (".." in host or any(c.isspace() or c == '"' or c == "'" or c in "/@`" for c in host)):
6114 die(f"Invalid host name: '{host}' for: '{input_text}'")
6117def validate_port(port: int, message: str) -> None:
6118 if isinstance(port, int):
6119 port = str(port)
6120 if port and not port.isdigit():
6121 die(message + f"must be empty or a positive integer: '{port}'")
6124def validate_default_shell(path_to_default_shell: str, r: Remote) -> None:
6125 if path_to_default_shell.endswith("/csh") or path_to_default_shell.endswith("/tcsh"):
6126 # On some old FreeBSD systems the default shell is still csh. Also see https://www.grymoire.com/unix/CshTop10.txt
6127 die(
6128 f"Cowardly refusing to proceed because {prog_name} is not compatible with csh-style quoting of special "
6129 f"characters. The safe workaround is to first manually set 'sh' instead of '{path_to_default_shell}' as "
6130 f"the default shell of the Unix user on {r.location} host: {r.ssh_user_host or 'localhost'}, like so: "
6131 "chsh -s /bin/sh YOURUSERNAME"
6132 )
6135def list_formatter(iterable: Iterable, separator=" ", lstrip=False): # For lazy/noop evaluation in disabled log levels
6136 class CustomListFormatter:
6137 def __str__(self):
6138 s = separator.join(map(str, iterable))
6139 return s.lstrip() if lstrip else s
6141 return CustomListFormatter()
6144def pretty_print_formatter(obj_to_format): # For lazy/noop evaluation in disabled log levels
6145 class PrettyPrintFormatter:
6146 def __str__(self):
6147 return pprint.pformat(vars(obj_to_format))
6149 return PrettyPrintFormatter()
6152def reset_logger() -> None:
6153 """Remove and close logging handlers (and close their files) and reset loggers to default state."""
6154 for log in [logging.getLogger(__name__), logging.getLogger(get_logger_subname())]:
6155 for handler in log.handlers.copy():
6156 log.removeHandler(handler)
6157 handler.flush()
6158 handler.close()
6159 for _filter in log.filters.copy():
6160 log.removeFilter(_filter)
6161 log.setLevel(logging.NOTSET)
6162 log.propagate = True
6165def get_logger_subname() -> str:
6166 return __name__ + ".sub" # the logger name for use by --log-config-file
6169def get_logger(log_params: LogParams, args: argparse.Namespace, log: Optional[Logger] = None) -> Logger:
6170 _log_trace = log_trace
6171 if not hasattr(logging.Logger, "trace"): # add convenience function for custom log level to the logger
6172 logging.Logger.trace = lambda self, msg, *arguments: (
6173 self._log(_log_trace, msg, arguments) if self.isEnabledFor(_log_trace) else None
6174 )
6175 logging.addLevelName(log_trace, "TRACE")
6176 logging.addLevelName(log_stderr, "STDERR")
6177 logging.addLevelName(log_stdout, "STDOUT")
6179 if log is not None:
6180 assert isinstance(log, Logger)
6181 return log # use third party provided logger object
6182 elif args.log_config_file:
6183 log = get_dict_config_logger(log_params, args) # use logger defined in config file, and afterwards ...
6184 # ... add our own handlers unless matching handlers are already present
6185 default_log = get_default_logger(log_params, args)
6186 return log if args.log_config_file else default_log
6189def get_default_logger(log_params: LogParams, args: argparse.Namespace) -> Logger:
6190 sublog = logging.getLogger(get_logger_subname())
6191 log = logging.getLogger(__name__)
6192 log.setLevel(log_params.log_level)
6193 log.propagate = False # don't propagate log messages up to the root logger to avoid emitting duplicate messages
6195 if not any(isinstance(h, logging.StreamHandler) and h.stream in [sys.stdout, sys.stderr] for h in sublog.handlers):
6196 handler = logging.StreamHandler(stream=sys.stdout)
6197 handler.setFormatter(get_default_log_formatter(log_params=log_params))
6198 handler.setLevel(log_params.log_level)
6199 log.addHandler(handler)
6201 abs_log_file = os.path.abspath(log_params.log_file)
6202 if not any(isinstance(h, logging.FileHandler) and h.baseFilename == abs_log_file for h in sublog.handlers):
6203 handler = logging.FileHandler(log_params.log_file, encoding="utf-8")
6204 handler.setFormatter(get_default_log_formatter())
6205 handler.setLevel(log_params.log_level)
6206 log.addHandler(handler)
6208 address = args.log_syslog_address
6209 if address: # optionally, also log to local or remote syslog
6210 address, socktype = get_syslog_address(address, args.log_syslog_socktype)
6211 log_syslog_prefix = str(args.log_syslog_prefix).strip().replace("%", "") # sanitize
6212 handler = logging.handlers.SysLogHandler(address=address, facility=args.log_syslog_facility, socktype=socktype)
6213 handler.setFormatter(get_default_log_formatter(prefix=log_syslog_prefix + " "))
6214 handler.setLevel(args.log_syslog_level)
6215 log.addHandler(handler)
6216 if handler.level < sublog.getEffectiveLevel():
6217 log_level_name = logging.getLevelName(sublog.getEffectiveLevel())
6218 log.warning(
6219 "%s",
6220 f"No messages with priority lower than {log_level_name} will be sent to syslog because syslog "
6221 f"log level {args.log_syslog_level} is lower than overall log level {log_level_name}.",
6222 )
6224 # perf: tell logging framework not to gather unnecessary expensive info for each log record
6225 logging.logProcesses = False
6226 logging.logThreads = False
6227 logging.logMultiprocessing = False
6228 return log
6231def get_default_log_formatter(prefix: str = "", log_params: LogParams = None) -> logging.Formatter:
6232 level_prefixes = {
6233 logging.CRITICAL: "[C] CRITICAL:",
6234 logging.ERROR: "[E] ERROR:",
6235 logging.WARNING: "[W]",
6236 logging.INFO: "[I]",
6237 logging.DEBUG: "[D]",
6238 log_trace: "[T]",
6239 }
6240 _log_stderr = log_stderr
6241 _log_stdout = log_stdout
6242 terminal_cols = [0 if log_params is None else None] # 'None' indicates "configure value later"
6244 class DefaultLogFormatter(logging.Formatter):
6245 def format(self, record) -> str:
6246 levelno = record.levelno
6247 if levelno != _log_stderr and levelno != _log_stdout: # emit stdout and stderr "as-is" (no formatting)
6248 timestamp = datetime.now().isoformat(sep=" ", timespec="seconds") # 2024-09-03 12:26:15
6249 ts_level = f"{timestamp} {level_prefixes.get(levelno, '')} "
6250 msg = record.msg
6251 i = msg.find("%s")
6252 msg = ts_level + msg
6253 if i >= 1:
6254 i += len(ts_level)
6255 msg = msg[0:i].ljust(54) + msg[i:] # right-pad msg if record.msg contains "%s" unless at start
6256 if record.args:
6257 msg = msg % record.args
6258 msg = prefix + msg
6259 else:
6260 msg = prefix + super().format(record)
6262 cols = terminal_cols[0]
6263 if cols is None:
6264 cols = self.ljust_cols()
6265 msg = msg.ljust(cols) # w/ progress line, "overwrite" trailing chars of previous msg with spaces
6266 return msg
6268 @staticmethod
6269 def ljust_cols() -> int:
6270 # lock-free yet thread-safe late configuration-based init for prettier ProgressReporter output
6271 # log_params.params and available_programs are not fully initialized yet before detect_available_programs() ends
6272 cols = 0
6273 p = log_params.params
6274 if p is not None and "local" in p.available_programs:
6275 if "pv" in p.available_programs["local"]:
6276 cols = p.terminal_columns
6277 assert cols is not None
6278 terminal_cols[0] = cols # finally, resolve to use this specific value henceforth
6279 return cols
6281 return DefaultLogFormatter()
6284def get_syslog_address(address: str, log_syslog_socktype: str) -> Tuple:
6285 socktype = None
6286 address = address.strip()
6287 if ":" in address:
6288 host, port = address.rsplit(":", 1)
6289 address = (host.strip(), int(port.strip()))
6290 socktype = socket.SOCK_DGRAM if log_syslog_socktype == "UDP" else socket.SOCK_STREAM # for TCP
6291 return address, socktype
6294def get_dict_config_logger(log_params: LogParams, args: argparse.Namespace) -> Logger:
6295 prefix = prog_name + "."
6296 log_config_vars = {
6297 prefix + "sub.logger": get_logger_subname(),
6298 prefix + "get_default_log_formatter": __name__ + ".get_default_log_formatter",
6299 prefix + "log_level": log_params.log_level,
6300 prefix + "log_dir": log_params.log_dir,
6301 prefix + "log_file": os.path.basename(log_params.log_file),
6302 prefix + "timestamp": log_params.timestamp,
6303 prefix + "dryrun": "dryrun" if args.dryrun else "",
6304 }
6305 log_config_vars.update(log_params.log_config_vars) # merge variables passed into CLI with convenience variables
6307 log_config_file_str = log_params.log_config_file
6308 if log_config_file_str.startswith("+"):
6309 with open(log_config_file_str[1:], "r", encoding="utf-8") as fd:
6310 log_config_file_str = fd.read()
6312 def remove_json_comments(config_str: str) -> str: # not standard but practical
6313 lines = []
6314 for line in config_str.splitlines():
6315 stripped = line.strip()
6316 if stripped.startswith("#"):
6317 line = "" # replace comment line with empty line to preserve line numbering
6318 elif stripped.endswith("#"):
6319 i = line.rfind("#", 0, line.rindex("#"))
6320 if i >= 0:
6321 line = line[0:i] # strip line-ending comment
6322 lines.append(line)
6323 return "\n".join(lines)
6325 def substitute_log_config_vars(config_str: str, log_config_variables: Dict[str, str]) -> str:
6326 """Substitute ${name[:default]} placeholders within JSON with values from log_config_variables"""
6328 def substitute_fn(match: re.Match) -> str:
6329 varname = match.group(1)
6330 error_msg = validate_log_config_variable_name(varname)
6331 if error_msg:
6332 raise ValueError(error_msg)
6333 replacement = log_config_variables.get(varname)
6334 if not replacement:
6335 default = match.group(3)
6336 if default is None:
6337 raise ValueError("Missing default value in JSON for empty log config variable: ${" + varname + "}")
6338 replacement = default
6339 replacement = json.dumps(replacement) # JSON escape special chars such as newlines, quotes, etc
6340 assert len(replacement) >= 2
6341 assert replacement.startswith('"')
6342 assert replacement.endswith('"')
6343 return replacement[1:-1] # strip surrounding quotes added by dumps()
6345 pattern = re.compile(r"\$\{([^}:]*?)(:([^}]*))?}") # Any char except } and :, followed by optional default part
6346 return pattern.sub(substitute_fn, config_str)
6348 log_config_file_str = remove_json_comments(log_config_file_str)
6349 if not log_config_file_str.strip().startswith("{"):
6350 log_config_file_str = "{\n" + log_config_file_str # lenient JSON parsing
6351 if not log_config_file_str.strip().endswith("}"):
6352 log_config_file_str = log_config_file_str + "\n}" # lenient JSON parsing
6353 log_config_file_str = substitute_log_config_vars(log_config_file_str, log_config_vars)
6354 if args is not None and args.verbose >= 2:
6355 print("[T] Substituted log_config_file_str:\n" + log_config_file_str, flush=True)
6356 log_config_dict = json.loads(log_config_file_str)
6357 logging.config.dictConfig(log_config_dict)
6358 return logging.getLogger(get_logger_subname())
6361def validate_log_config_variable(var: str) -> str:
6362 if not var.strip():
6363 return "Invalid log config NAME:VALUE variable. Variable must not be empty: " + var
6364 if ":" not in var:
6365 return "Invalid log config NAME:VALUE variable. Variable is missing a colon character: " + var
6366 return validate_log_config_variable_name(var[0 : var.index(":")])
6369def validate_log_config_variable_name(name: str):
6370 if not name:
6371 return "Invalid log config variable name. Name must not be empty: " + name
6372 bad_chars = "${} " + '"' + "'"
6373 if any(char in bad_chars for char in name):
6374 return f"Invalid log config variable name. Name must not contain forbidden {bad_chars} characters: " + name
6375 if any(char.isspace() for char in name):
6376 return "Invalid log config variable name. Name must not contain whitespace: " + name
6377 return None
6380#############################################################################
6381class RetryableError(Exception):
6382 """Indicates that the task that caused the underlying exception can be retried and might eventually succeed."""
6384 def __init__(self, message, no_sleep: bool = False):
6385 super().__init__(message)
6386 self.no_sleep: bool = no_sleep
6389#############################################################################
6390class Tee:
6391 def __init__(self, *files):
6392 self.files = files
6394 def write(self, obj) -> None:
6395 for file in self.files:
6396 file.write(obj)
6397 file.flush() # Ensure each write is flushed immediately
6399 def flush(self) -> None:
6400 for file in self.files:
6401 file.flush()
6403 def fileno(self) -> int:
6404 return self.files[0].fileno()
6407#############################################################################
6408class NonEmptyStringAction(argparse.Action):
6409 def __call__(self, parser, namespace, values, option_string=None):
6410 values = values.strip()
6411 if values == "":
6412 parser.error(f"{option_string}: Empty string is not valid")
6413 setattr(namespace, self.dest, values)
6416#############################################################################
6417class DatasetPairsAction(argparse.Action):
6418 def __call__(self, parser, namespace, values, option_string=None):
6419 datasets = []
6420 for value in values:
6421 if not value.startswith("+"):
6422 datasets.append(value)
6423 else:
6424 try:
6425 with open(value[1:], "r", encoding="utf-8") as fd:
6426 for line in fd.read().splitlines():
6427 if not line.strip() or line.startswith("#"):
6428 continue # skip empty lines and comment lines
6429 splits = line.split("\t", 1)
6430 if len(splits) <= 1:
6431 parser.error("Line must contain tab-separated SRC_DATASET and DST_DATASET: " + line)
6432 src_root_dataset, dst_root_dataset = splits
6433 if not src_root_dataset.strip() or not dst_root_dataset.strip():
6434 parser.error("SRC_DATASET and DST_DATASET must not be empty or whitespace-only:" + line)
6435 datasets.append(src_root_dataset)
6436 datasets.append(dst_root_dataset)
6437 except FileNotFoundError:
6438 parser.error(f"File not found: {value[1:]}")
6440 if len(datasets) % 2 != 0:
6441 parser.error(f"Each SRC_DATASET must have a corresponding DST_DATASET: {datasets}")
6442 root_dataset_pairs = [(datasets[i], datasets[i + 1]) for i in range(0, len(datasets), 2)]
6443 setattr(namespace, self.dest, root_dataset_pairs)
6446#############################################################################
6447class SafeFileNameAction(argparse.Action):
6448 def __call__(self, parser, namespace, values, option_string=None):
6449 if ".." in values or "/" in values or "\\" in values:
6450 parser.error(f"Invalid file name '{values}': must not contain '..' or '/' or '\\'.")
6451 setattr(namespace, self.dest, values)
6454#############################################################################
6455class NewSnapshotFilterGroupAction(argparse.Action):
6456 def __call__(self, parser, args, values, option_string=None):
6457 if not hasattr(args, snapshot_filters_var):
6458 args.snapshot_filters_var = [[]]
6459 elif len(args.snapshot_filters_var[-1]) > 0:
6460 args.snapshot_filters_var.append([])
6463#############################################################################
6464class FileOrLiteralAction(argparse.Action):
6465 def __call__(self, parser, namespace, values, option_string=None):
6466 current_values = getattr(namespace, self.dest, None)
6467 if current_values is None:
6468 current_values = []
6469 extra_values = []
6470 for value in values:
6471 if not value.startswith("+"):
6472 extra_values.append(value)
6473 else:
6474 try:
6475 with open(value[1:], "r", encoding="utf-8") as fd:
6476 for line in fd.read().splitlines():
6477 if not line.strip() or line.startswith("#"):
6478 continue # skip empty lines and comment lines
6479 extra_values.append(line)
6480 except FileNotFoundError:
6481 parser.error(f"File not found: {value[1:]}")
6482 current_values += extra_values
6483 setattr(namespace, self.dest, current_values)
6484 if self.dest in snapshot_regex_filter_names:
6485 add_snapshot_filter(namespace, SnapshotFilter(self.dest, None, extra_values))
6488#############################################################################
6489class IncludeSnapshotPlanAction(argparse.Action):
6490 def __call__(self, parser, namespace, values, option_string=None):
6491 opts = getattr(namespace, self.dest, None)
6492 opts = [] if opts is None else opts
6493 # The bzfs_include_snapshot_plan_excludes_outdated_snapshots env var flag is a work-around for (rare) replication
6494 # situations where a common snapshot cannot otherwise be found because bookmarks are disabled and a common
6495 # snapshot is actually available but not included by the --include-snapshot-plan policy chosen by the user, and the
6496 # user cannot change the content of the --include-snapshot-plan for some reason. The flag makes replication work even
6497 # in this scenario, at the expense of including (and thus replicating) old snapshots that will immediately be deleted
6498 # on the destination by the next pruning action. In a proper production setup, it should never be necessary to set
6499 # the flag to 'False'.
6500 include_snapshot_times_and_ranks = getenv_bool("include_snapshot_plan_excludes_outdated_snapshots", True)
6501 if not self._add_opts(opts, include_snapshot_times_and_ranks, parser, values, option_string=option_string):
6502 opts += ["--new-snapshot-filter-group", "--include-snapshot-regex=!.*"]
6503 setattr(namespace, self.dest, opts)
6505 def _add_opts(self, opts: List[str], include_snapshot_times_and_ranks: bool, parser, values, option_string=None) -> bool:
6506 """Generates extra options to be parsed later during second parse_args() pass, within run_main()"""
6507 xperiods = SnapshotPeriods()
6508 has_at_least_one_filter_clause = False
6509 for org, target_periods in ast.literal_eval(values).items():
6510 for target, periods in target_periods.items():
6511 for period_unit, period_amount in periods.items(): # e.g. period_unit can be "10minutely" or "minutely"
6512 if not isinstance(period_amount, int) or period_amount < 0:
6513 parser.error(f"{option_string}: Period amount must be a non-negative integer: {period_amount}")
6514 infix = re.escape(ninfix(target)) if target else year_with_four_digits_regex.pattern # disambiguate
6515 regex = f"{re.escape(org)}_{infix}.*{re.escape(nsuffix(period_unit))}"
6516 opts += ["--new-snapshot-filter-group", f"--include-snapshot-regex={regex}"]
6517 if include_snapshot_times_and_ranks:
6518 duration_amount, duration_unit = xperiods.suffix_to_duration0(period_unit) # --> 10, "minutely"
6519 duration_unit_label = xperiods.period_labels.get(duration_unit) # duration_unit_label = "minutes"
6520 opts += [
6521 "--include-snapshot-times-and-ranks",
6522 (
6523 "notime"
6524 if duration_unit_label is None or duration_amount * period_amount == 0
6525 else f"{duration_amount * period_amount}{duration_unit_label}ago..anytime"
6526 ),
6527 f"latest{period_amount}",
6528 ]
6529 has_at_least_one_filter_clause = True
6530 return has_at_least_one_filter_clause
6533#############################################################################
6534class DeleteDstSnapshotsExceptPlanAction(IncludeSnapshotPlanAction):
6535 def __call__(self, parser, namespace, values, option_string=None):
6536 opts = getattr(namespace, self.dest, None)
6537 opts = [] if opts is None else opts
6538 opts += ["--delete-dst-snapshots-except"]
6539 if not self._add_opts(opts, True, parser, values, option_string=option_string):
6540 parser.error(
6541 f"{option_string}: Cowardly refusing to delete all snapshots on "
6542 f"--delete-dst-snapshots-except-plan='{values}' (which means 'retain no snapshots' aka "
6543 "'delete all snapshots'). Assuming this is an unintended pilot error rather than intended carnage. "
6544 "Aborting. If this is really what is intended, use `--delete-dst-snapshots --include-snapshot-regex=.*` "
6545 "instead to force the deletion."
6546 )
6547 setattr(namespace, self.dest, opts)
6550#############################################################################
6551class TimeRangeAndRankRangeAction(argparse.Action):
6552 def __call__(self, parser, namespace, values, option_string=None):
6553 def parse_time(time_spec):
6554 time_spec = time_spec.strip()
6555 if time_spec == "*" or time_spec == "anytime":
6556 return None
6557 if time_spec.isdigit():
6558 return int(time_spec) # Input is a Unix time in integer seconds
6559 try:
6560 return timedelta(milliseconds=parse_duration_to_milliseconds(time_spec, regex_suffix=r"\s*ago"))
6561 except ValueError:
6562 try: # If it's not a duration, try parsing as an ISO 8601 datetime
6563 return unixtime_fromisoformat(time_spec)
6564 except ValueError:
6565 parser.error(f"{option_string}: Invalid duration, Unix time, or ISO 8601 datetime: {time_spec}")
6567 assert isinstance(values, list)
6568 assert len(values) > 0
6569 value = values[0].strip()
6570 if value == "notime":
6571 value = "0..0"
6572 if ".." not in value:
6573 parser.error(f"{option_string}: Invalid time range: Missing '..' separator: {value}")
6574 timerange = [parse_time(time_spec) for time_spec in value.split("..", 1)]
6575 rankranges = self.parse_rankranges(parser, values[1:], option_string=option_string)
6576 setattr(namespace, self.dest, [timerange] + rankranges) # for testing only
6577 timerange = self.get_include_snapshot_times(timerange)
6578 add_time_and_rank_snapshot_filter(namespace, self.dest, timerange, rankranges)
6580 @staticmethod
6581 def get_include_snapshot_times(times) -> UnixTimeRange:
6582 def utc_unix_time_in_seconds(time_spec: Union[timedelta, int], default: int) -> int:
6583 if isinstance(time_spec, timedelta):
6584 return time_spec
6585 if isinstance(time_spec, int):
6586 return int(time_spec)
6587 return default
6589 lo, hi = times
6590 if lo is None and hi is None:
6591 return None
6592 lo = utc_unix_time_in_seconds(lo, default=0)
6593 hi = utc_unix_time_in_seconds(hi, default=unixtime_infinity_secs)
6594 if isinstance(lo, int) and isinstance(hi, int):
6595 return (lo, hi) if lo <= hi else (hi, lo)
6596 return lo, hi
6598 @staticmethod
6599 def parse_rankranges(parser, values, option_string=None) -> List[RankRange]:
6600 def parse_rank(spec):
6601 spec = spec.strip()
6602 match = re.fullmatch(r"(all\s*except\s*)?(oldest|latest)\s*(\d+)%?", spec)
6603 if not match:
6604 parser.error(f"{option_string}: Invalid rank format: {spec}")
6605 is_except = bool(match.group(1))
6606 kind = match.group(2)
6607 num = int(match.group(3))
6608 is_percent = spec.endswith("%")
6609 if is_percent and num > 100:
6610 parser.error(f"{option_string}: Invalid rank: Percent must not be greater than 100: {spec}")
6611 return is_except, kind, num, is_percent
6613 rankranges = []
6614 for value in values:
6615 value = value.strip()
6616 if ".." in value:
6617 lo_split, hi_split = value.split("..", 1)
6618 lo = parse_rank(lo_split)
6619 hi = parse_rank(hi_split)
6620 if lo[0] or hi[0]:
6621 # Example: 'all except latest 90..except latest 95' or 'all except latest 90..latest 95'
6622 parser.error(f"{option_string}: Invalid rank range: {value}")
6623 if lo[1] != hi[1]:
6624 # Example: 'latest10..oldest10' and 'oldest10..latest10' may be somewhat unambigous if there are 40
6625 # input snapshots, but they are tricky/not well-defined if there are less than 20 input snapshots.
6626 parser.error(f"{option_string}: Ambiguous rank range: Must not compare oldest with latest: {value}")
6627 else:
6628 hi = parse_rank(value)
6629 is_except, kind, num, is_percent = hi
6630 if is_except:
6631 if is_percent:
6632 # 'all except latest 10%' aka 'oldest 90%' aka 'oldest 0..oldest 90%'
6633 # 'all except oldest 10%' aka 'latest 90%' aka 'latest 0..oldest 90%'
6634 negated_kind = "oldest" if kind == "latest" else "latest"
6635 lo = parse_rank(f"{negated_kind}0")
6636 hi = parse_rank(f"{negated_kind}{100-num}%")
6637 else:
6638 # 'all except latest 90' aka 'latest 90..latest 100%'
6639 # 'all except oldest 90' aka 'oldest 90..oldest 100%'
6640 lo = parse_rank(f"{kind}{num}")
6641 hi = parse_rank(f"{kind}100%")
6642 else:
6643 # 'latest 90' aka 'latest 0..latest 90'
6644 lo = parse_rank(f"{kind}0")
6645 rankranges.append((lo[1:], hi[1:]))
6646 return rankranges
6649#############################################################################
6650@dataclass(order=True)
6651class SnapshotFilter:
6652 name: str
6653 timerange: UnixTimeRange
6654 options: Any = field(compare=False, default=None)
6657def add_snapshot_filter(args: argparse.Namespace, _filter: SnapshotFilter) -> None:
6658 if not hasattr(args, snapshot_filters_var):
6659 args.snapshot_filters_var = [[]]
6660 args.snapshot_filters_var[-1].append(_filter)
6663def add_time_and_rank_snapshot_filter(args: Namespace, dst: str, timerange: UnixTimeRange, rankranges: List[RankRange]):
6664 if timerange is None or len(rankranges) == 0 or any(rankrange[0] == rankrange[1] for rankrange in rankranges):
6665 add_snapshot_filter(args, SnapshotFilter("include_snapshot_times", timerange, None))
6666 else:
6667 assert timerange is not None
6668 add_snapshot_filter(args, SnapshotFilter(dst, timerange, rankranges))
6671def has_timerange_filter(snapshot_filters: List[List[SnapshotFilter]]) -> bool:
6672 """Interacts with add_time_and_rank_snapshot_filter() and optimize_snapshot_filters()"""
6673 return any(f.timerange is not None for snapshot_filter in snapshot_filters for f in snapshot_filter)
6676def optimize_snapshot_filters(snapshot_filters: List[SnapshotFilter]) -> List[SnapshotFilter]:
6677 """Not intended to be a full query execution plan optimizer, but we still apply some basic plan optimizations."""
6678 merge_adjacent_snapshot_filters(snapshot_filters)
6679 merge_adjacent_snapshot_regexes(snapshot_filters)
6680 snapshot_filters = [f for f in snapshot_filters if f.timerange or f.options] # drop noop --include-snapshot-times
6681 reorder_snapshot_time_filters(snapshot_filters)
6682 return snapshot_filters
6685def merge_adjacent_snapshot_filters(snapshot_filters: List[SnapshotFilter]) -> None:
6686 """Merges filter operators of the same kind if they are next to each other and carry an option list, for example
6687 --include-snapshot-ranks and --include-snapshot-regex and --exclude-snapshot-regex. This improves execution perf
6688 and makes handling easier in later stages.
6689 Example: merges --include-snapshot-times-and-ranks 0..9 oldest10% --include-snapshot-times-and-ranks 0..9 latest20%
6690 into --include-snapshot-times-and-ranks 0..9 oldest10% latest20%"""
6691 i = len(snapshot_filters) - 1
6692 while i >= 0:
6693 filter_i = snapshot_filters[i]
6694 if isinstance(filter_i.options, list):
6695 j = i - 1
6696 if j >= 0 and snapshot_filters[j] == filter_i:
6697 lst = snapshot_filters[j].options
6698 assert isinstance(lst, list)
6699 lst += filter_i.options
6700 snapshot_filters.pop(i)
6701 i -= 1
6704def merge_adjacent_snapshot_regexes(snapshot_filters: List[SnapshotFilter]) -> None:
6705 # Merge regex filter operators of the same kind as long as they are within the same group, aka as long as they
6706 # are not separated by a non-regex filter. This improves execution perf and makes handling easier in later stages.
6707 # Example: --include-snapshot-regex .*daily --exclude-snapshot-regex .*weekly --include-snapshot-regex .*hourly
6708 # --exclude-snapshot-regex .*monthly
6709 # gets merged into the following:
6710 # --include-snapshot-regex .*daily .*hourly --exclude-snapshot-regex .*weekly .*monthly
6711 i = len(snapshot_filters) - 1
6712 while i >= 0:
6713 filter_i = snapshot_filters[i]
6714 if filter_i.name in snapshot_regex_filter_names:
6715 assert isinstance(filter_i.options, list)
6716 j = i - 1
6717 while j >= 0 and snapshot_filters[j].name in snapshot_regex_filter_names:
6718 if snapshot_filters[j].name == filter_i.name:
6719 lst = snapshot_filters[j].options
6720 assert isinstance(lst, list)
6721 lst += filter_i.options
6722 snapshot_filters.pop(i)
6723 break
6724 j -= 1
6725 i -= 1
6727 # Merge --include-snapshot-regex and --exclude-snapshot-regex filters that are part of the same group (i.e. next
6728 # to each other) into a single combined filter operator that contains the info of both, and hence all info for the
6729 # group, which makes handling easier in later stages.
6730 # Example: --include-snapshot-regex .*daily .*hourly --exclude-snapshot-regex .*weekly .*monthly
6731 # gets merged into the following: --snapshot-regex(excludes=[.*weekly, .*monthly], includes=[.*daily, .*hourly])
6732 i = len(snapshot_filters) - 1
6733 while i >= 0:
6734 filter_i = snapshot_filters[i]
6735 name = filter_i.name
6736 if name in snapshot_regex_filter_names:
6737 j = i - 1
6738 if j >= 0 and snapshot_filters[j].name in snapshot_regex_filter_names:
6739 filter_j = snapshot_filters[j]
6740 assert filter_j.name != name
6741 snapshot_filters.pop(i)
6742 i -= 1
6743 else:
6744 name_j = next(iter(snapshot_regex_filter_names.difference({name})))
6745 filter_j = SnapshotFilter(name_j, None, [])
6746 sorted_filters = sorted([filter_i, filter_j])
6747 exclude_regexes, include_regexes = sorted_filters[0].options, sorted_filters[1].options
6748 snapshot_filters[i] = SnapshotFilter(snapshot_regex_filter_name, None, (exclude_regexes, include_regexes))
6749 i -= 1
6752def reorder_snapshot_time_filters(snapshot_filters: List[SnapshotFilter]) -> None:
6753 """In an execution plan that contains filter operators based on sort order (the --include-snapshot-times-and-ranks
6754 operator with non-empty ranks), filters cannot freely be reordered without violating correctness, but they can
6755 still be partially reordered for better execution performance. The filter list is partitioned into sections such
6756 that sections are separated by --include-snapshot-times-and-ranks operators with non-empty ranks. Within each
6757 section, we move include_snapshot_times operators aka --include-snapshot-times-and-ranks operators with empty ranks
6758 before --include/exclude-snapshot-regex operators because the former involves fast integer comparisons and the
6759 latter involves more expensive regex matching.
6760 Example: reorders --include-snapshot-regex .*daily --include-snapshot-times-and-ranks 2024-01-01..2024-04-01 into
6761 --include-snapshot-times-and-ranks 2024-01-01..2024-04-01 --include-snapshot-regex .*daily"""
6763 def reorder_time_filters_within_section(i: int, j: int):
6764 while j > i:
6765 filter_j = snapshot_filters[j]
6766 if filter_j.name == "include_snapshot_times":
6767 snapshot_filters.pop(j)
6768 snapshot_filters.insert(i + 1, filter_j)
6769 j -= 1
6771 i = len(snapshot_filters) - 1
6772 j = i
6773 while i >= 0:
6774 name = snapshot_filters[i].name
6775 if name == "include_snapshot_times_and_ranks":
6776 reorder_time_filters_within_section(i, j)
6777 j = i - 1
6778 i -= 1
6779 reorder_time_filters_within_section(i, j)
6782#############################################################################
6783class LogConfigVariablesAction(argparse.Action):
6784 def __call__(self, parser, namespace, values, option_string=None):
6785 current_values = getattr(namespace, self.dest, None)
6786 if current_values is None:
6787 current_values = []
6788 for variable in values:
6789 error_msg = validate_log_config_variable(variable)
6790 if error_msg:
6791 parser.error(error_msg)
6792 current_values.append(variable)
6793 setattr(namespace, self.dest, current_values)
6796#############################################################################
6797# class CheckRange is copied from https://gist.github.com/dmitriykovalev/2ab1aa33a8099ef2d514925d84aa89e7/30961300d3f8192f775709c06ff9a5b777475adf
6798# Written by Dmitriy Kovalev
6799#
6800# Licensed under the Apache License, Version 2.0 (the "License");
6801# you may not use this file except in compliance with the License.
6802# You may obtain a copy of the License at
6803#
6804# http://www.apache.org/licenses/LICENSE-2.0
6805#
6806# Unless required by applicable law or agreed to in writing, software
6807# distributed under the License is distributed on an "AS IS" BASIS,
6808# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6809# See the License for the specific language governing permissions and
6810# limitations under the License.
6811#
6812# Allows you to validate open, closed, and half-open intervals on int as well as float arguments.
6813# Each endpoint can be either a number or positive or negative infinity:
6814# [a, b] --> min=a, max=b
6815# [a, b) --> min=a, sup=b
6816# (a, b] --> inf=a, max=b
6817# (a, b) --> inf=a, sup=b
6818# [a, +infinity) --> min=a
6819# (a, +infinity) --> inf=a
6820# (-infinity, b] --> max=b
6821# (-infinity, b) --> sup=b
6822# fmt: off
6823class CheckRange(argparse.Action):
6824 ops = {'inf': operator.gt,
6825 'min': operator.ge,
6826 'sup': operator.lt,
6827 'max': operator.le}
6829 def __init__(self, *args, **kwargs):
6830 if 'min' in kwargs and 'inf' in kwargs:
6831 raise ValueError('either min or inf, but not both')
6832 if 'max' in kwargs and 'sup' in kwargs:
6833 raise ValueError('either max or sup, but not both')
6835 for name in self.ops:
6836 if name in kwargs:
6837 setattr(self, name, kwargs.pop(name))
6839 super().__init__(*args, **kwargs)
6841 def interval(self):
6842 if hasattr(self, 'min'):
6843 l = f'[{self.min}'
6844 elif hasattr(self, 'inf'):
6845 l = f'({self.inf}'
6846 else:
6847 l = '(-infinity'
6849 if hasattr(self, 'max'):
6850 u = f'{self.max}]'
6851 elif hasattr(self, 'sup'):
6852 u = f'{self.sup})'
6853 else:
6854 u = '+infinity)'
6856 return f'valid range: {l}, {u}'
6858 def __call__(self, parser, namespace, values, option_string=None):
6859 for name, op in self.ops.items():
6860 if hasattr(self, name) and not op(values, getattr(self, name)):
6861 raise argparse.ArgumentError(self, self.interval())
6862 setattr(namespace, self.dest, values)
6863# fmt: on
6866#############################################################################
6867class CheckPercentRange(CheckRange):
6869 def __call__(self, parser, namespace, values, option_string=None):
6870 assert isinstance(values, str)
6871 original = values
6872 values = values.strip()
6873 is_percent = values.endswith("%")
6874 if is_percent:
6875 values = values[0:-1]
6876 try:
6877 values = float(values)
6878 except ValueError:
6879 parser.error(f"{option_string}: Invalid percentage or number: {original}")
6880 super().__call__(parser, namespace, values, option_string=option_string)
6881 setattr(namespace, self.dest, (getattr(namespace, self.dest), is_percent))
6884#############################################################################
6885T = TypeVar("T") # Generic type variable for elements stored in a SmallPriorityQueue
6888class SmallPriorityQueue(Generic[T]):
6889 """A priority queue that can handle updates to the priority of any element that is already contained in the queue, and
6890 does so very efficiently if there are a small number of elements in the queue (no more than thousands), as is the case
6891 for us. Could be implemented using a SortedList via https://github.com/grantjenks/python-sortedcontainers or using an
6892 indexed priority queue via https://github.com/nvictus/pqdict but, to avoid an external dependency, is actually
6893 implemented using a simple yet effective binary search-based sorted list that can handle updates to the priority of
6894 elements that are already contained in the queue, via removal of the element, followed by update of the element, followed
6895 by (re)insertion. Do not underestimate the real-world performance of an optimized memmove() and optimized binary search.
6896 Note: Duplicate elements (if any) are maintained in their order of insertion relative to other duplicates."""
6898 def __init__(self, reverse: bool = False) -> None:
6899 self._lst: List[T] = []
6900 self._reverse: bool = reverse
6902 def clear(self) -> None:
6903 self._lst.clear()
6905 def push(self, element: T) -> None:
6906 bisect.insort(self._lst, element)
6908 def pop(self) -> T:
6909 """Removes and return the smallest (or largest if reverse == True) element from the queue."""
6910 return self._lst.pop() if self._reverse else self._lst.pop(0)
6912 def peek(self) -> T:
6913 """Returns the smallest (or largest if reverse == True) element without removing it."""
6914 return self._lst[-1] if self._reverse else self._lst[0]
6916 def remove(self, element: T, assert_is_contained: bool = False) -> None:
6917 """Removes the first occurrence of the specified element from the queue. The element must be contained."""
6918 lst = self._lst
6919 i = bisect.bisect_left(lst, element)
6920 if assert_is_contained:
6921 assert i < len(lst) and lst[i] == element
6922 del lst[i] # do not underestimate the real-world performance of an optimized memmove()
6924 def __len__(self) -> int:
6925 return len(self._lst)
6927 def __contains__(self, element: T) -> bool:
6928 lst = self._lst
6929 i = bisect.bisect_left(lst, element)
6930 return i < len(lst) and lst[i] == element
6932 def __iter__(self) -> Iterator[T]:
6933 return reversed(self._lst) if self._reverse else iter(self._lst)
6935 def __repr__(self) -> str:
6936 return repr(list(reversed(self._lst))) if self._reverse else repr(self._lst)
6939#############################################################################
6940class SynchronizedBool:
6941 def __init__(self, val: bool):
6942 assert isinstance(val, bool)
6943 self._lock: threading.Lock = threading.Lock()
6944 self._value: bool = val
6946 @property
6947 def value(self) -> bool:
6948 with self._lock:
6949 return self._value
6951 @value.setter
6952 def value(self, new_value: bool) -> None:
6953 with self._lock:
6954 self._value = new_value
6956 def get_and_set(self, new_value: bool) -> bool:
6957 with self._lock:
6958 old_value = self._value
6959 self._value = new_value
6960 return old_value
6962 def compare_and_set(self, expected_value: bool, new_value: bool) -> bool:
6963 with self._lock:
6964 eq = self._value == expected_value
6965 if eq:
6966 self._value = new_value
6967 return eq
6969 def __bool__(self) -> bool:
6970 return self.value
6972 def __repr__(self) -> str:
6973 return repr(self.value)
6975 def __str__(self) -> str:
6976 return str(self.value)
6979#############################################################################
6980K = TypeVar("K")
6981V = TypeVar("V")
6984class SynchronizedDict(Generic[K, V]):
6985 def __init__(self, val: Dict[K, V]):
6986 assert isinstance(val, dict)
6987 self._lock: threading.Lock = threading.Lock()
6988 self._dict: Dict[K, V] = val
6990 def __getitem__(self, key: K) -> V:
6991 with self._lock:
6992 return self._dict[key]
6994 def __setitem__(self, key: K, value: V) -> None:
6995 with self._lock:
6996 self._dict[key] = value
6998 def __delitem__(self, key: K) -> None:
6999 with self._lock:
7000 self._dict.pop(key)
7002 def __contains__(self, key: K) -> bool:
7003 with self._lock:
7004 return key in self._dict
7006 def __len__(self) -> int:
7007 with self._lock:
7008 return len(self._dict)
7010 def __repr__(self) -> str:
7011 with self._lock:
7012 return repr(self._dict)
7014 def __str__(self) -> str:
7015 with self._lock:
7016 return str(self._dict)
7018 def get(self, key: K, default: Optional[V] = None) -> Optional[V]:
7019 with self._lock:
7020 return self._dict.get(key, default)
7022 def pop(self, key: K, default: Optional[V] = None) -> V:
7023 with self._lock:
7024 return self._dict.pop(key, default)
7026 def clear(self) -> None:
7027 with self._lock:
7028 self._dict.clear()
7030 def items(self) -> ItemsView[K, V]:
7031 with self._lock:
7032 return self._dict.copy().items()
7035#############################################################################
7036if __name__ == "__main__": 7036 ↛ 7037line 7036 didn't jump to line 7037 because the condition on line 7036 was never true
7037 main()