mirror of
https://github.com/netinvent/npbackup.git
synced 2025-10-26 05:16:55 +08:00
Implement minimum_backup_size_error alert
This commit is contained in:
parent
ad0785f3eb
commit
13593ba199
4 changed files with 72 additions and 56 deletions
|
|
@ -160,7 +160,7 @@ empty_config_dict = {
|
|||
"exclude_files_larger_than": None,
|
||||
"additional_parameters": None,
|
||||
"additional_backup_only_parameters": None,
|
||||
"minimum_backup_size_error": "10M", # TODO
|
||||
"minimum_backup_size_error": "10", # In megabytes
|
||||
"pre_exec_commands": [],
|
||||
"pre_exec_per_command_timeout": 3600,
|
||||
"pre_exec_failure_is_fatal": False,
|
||||
|
|
|
|||
|
|
@ -36,9 +36,13 @@ logger = logging.getLogger()
|
|||
|
||||
def metric_writer(
|
||||
repo_config: dict, restic_result: bool, result_string: str, dry_run: bool
|
||||
):
|
||||
) -> bool:
|
||||
backup_too_small = False
|
||||
minimum_backup_size_error = repo_config.g("backup_opts.minimum_backup_size_error")
|
||||
try:
|
||||
labels = {}
|
||||
labels = {
|
||||
"npversion": f"{NAME}{VERSION}"
|
||||
}
|
||||
if repo_config.g("prometheus.metrics"):
|
||||
labels["instance"] = repo_config.g("prometheus.instance")
|
||||
labels["backup_job"] = repo_config.g("prometheus.backup_job")
|
||||
|
|
@ -46,23 +50,18 @@ def metric_writer(
|
|||
no_cert_verify = repo_config.g("prometheus.no_cert_verify")
|
||||
destination = repo_config.g("prometheus.destination")
|
||||
prometheus_additional_labels = repo_config.g("prometheus.additional_labels")
|
||||
minimum_backup_size_error = repo_config.g("backup_opts.minimum_backup_size_error") # TODO
|
||||
|
||||
if not isinstance(prometheus_additional_labels, list):
|
||||
prometheus_additional_labels = [prometheus_additional_labels]
|
||||
|
||||
# Configure labels
|
||||
label_string = ",".join(
|
||||
[f'{key}="{value}"' for key, value in labels.items() if value]
|
||||
)
|
||||
try:
|
||||
if prometheus_additional_labels:
|
||||
for additional_label in prometheus_additional_labels:
|
||||
if additional_label:
|
||||
try:
|
||||
label, value = additional_label.split("=")
|
||||
label_string += ',{}="{}"'.format(
|
||||
label.strip(), value.strip()
|
||||
)
|
||||
labels[label.strip()] = value.strip()
|
||||
except ValueError:
|
||||
logger.error(
|
||||
'Bogus additional label "{}" defined in configuration.'.format(
|
||||
|
|
@ -73,47 +72,46 @@ def metric_writer(
|
|||
logger.error("Bogus additional labels defined in configuration.")
|
||||
logger.debug("Trace:", exc_info=True)
|
||||
|
||||
label_string += ',npversion="{}{}"'.format(NAME, VERSION)
|
||||
|
||||
# If result was a str, we need to transform it into json first
|
||||
if isinstance(result_string, str):
|
||||
restic_result = restic_str_output_to_json(restic_result, result_string)
|
||||
# If result was a str, we need to transform it into json first
|
||||
if isinstance(result_string, str):
|
||||
restic_result = restic_str_output_to_json(restic_result, result_string)
|
||||
|
||||
errors, metrics = restic_json_to_prometheus(
|
||||
restic_result=restic_result, output=restic_result, labels=label_string
|
||||
)
|
||||
if errors or not restic_result:
|
||||
logger.error("Restic finished with errors.")
|
||||
if destination:
|
||||
logger.debug("Uploading metrics to {}".format(destination))
|
||||
if destination.lower().startswith("http"):
|
||||
try:
|
||||
authentication = (
|
||||
repo_config.g("prometheus.http_username"),
|
||||
repo_config.g("prometheus.http_password"),
|
||||
)
|
||||
except KeyError:
|
||||
logger.info("No metrics authentication present.")
|
||||
authentication = None
|
||||
if not dry_run:
|
||||
upload_metrics(
|
||||
destination, authentication, no_cert_verify, metrics
|
||||
)
|
||||
else:
|
||||
logger.info("Not uploading metrics in dry run mode")
|
||||
errors, metrics, backup_too_small = restic_json_to_prometheus(
|
||||
restic_result=restic_result, restic_json=restic_result, labels=labels, minimum_backup_size_error=minimum_backup_size_error
|
||||
)
|
||||
if errors or not restic_result:
|
||||
logger.error("Restic finished with errors.")
|
||||
if repo_config.g("prometheus.metrics") and destination:
|
||||
logger.debug("Uploading metrics to {}".format(destination))
|
||||
if destination.lower().startswith("http"):
|
||||
try:
|
||||
authentication = (
|
||||
repo_config.g("prometheus.http_username"),
|
||||
repo_config.g("prometheus.http_password"),
|
||||
)
|
||||
except KeyError:
|
||||
logger.info("No metrics authentication present.")
|
||||
authentication = None
|
||||
if not dry_run:
|
||||
upload_metrics(
|
||||
destination, authentication, no_cert_verify, metrics
|
||||
)
|
||||
else:
|
||||
try:
|
||||
with open(destination, "w") as file_handle:
|
||||
for metric in metrics:
|
||||
file_handle.write(metric + "\n")
|
||||
except OSError as exc:
|
||||
logger.error(
|
||||
"Cannot write metrics file {}: {}".format(destination, exc)
|
||||
)
|
||||
logger.info("Not uploading metrics in dry run mode")
|
||||
else:
|
||||
try:
|
||||
with open(destination, "w") as file_handle:
|
||||
for metric in metrics:
|
||||
file_handle.write(metric + "\n")
|
||||
except OSError as exc:
|
||||
logger.error(
|
||||
"Cannot write metrics file {}: {}".format(destination, exc)
|
||||
)
|
||||
except KeyError as exc:
|
||||
logger.info("Metrics not configured: {}".format(exc))
|
||||
except OSError as exc:
|
||||
logger.error("Cannot write metric file: ".format(exc))
|
||||
return backup_too_small
|
||||
|
||||
|
||||
class NPBackupRunner:
|
||||
|
|
@ -1016,12 +1014,16 @@ class NPBackupRunner:
|
|||
)
|
||||
|
||||
self.write_logs(f"Restic output:\n{self.restic_runner.backup_result_content}", level="debug")
|
||||
|
||||
# Extract backup size from result_string
|
||||
|
||||
# Metrics will not be in json format, since we need to diag cloud issues until
|
||||
metrics_analyzer_result = metric_writer(
|
||||
# there is a fix for https://github.com/restic/restic/issues/4155
|
||||
backup_too_small = metric_writer(
|
||||
self.repo_config, result, self.restic_runner.backup_result_content, self.restic_runner.dry_run
|
||||
)
|
||||
print(backup_too_small)
|
||||
if backup_too_small:
|
||||
self.write_logs("Backup is smaller than expected", level="error")
|
||||
|
||||
post_exec_commands_success = True
|
||||
if post_exec_commands:
|
||||
|
|
@ -1044,7 +1046,7 @@ class NPBackupRunner:
|
|||
)
|
||||
|
||||
operation_result = (
|
||||
result and pre_exec_commands_success and post_exec_commands_success
|
||||
result and pre_exec_commands_success and post_exec_commands_success and not backup_too_small
|
||||
)
|
||||
msg = f"Operation finished with {'success' if operation_result else 'failure'}"
|
||||
self.write_logs(msg, level="info" if operation_result else "error",
|
||||
|
|
|
|||
|
|
@ -168,14 +168,14 @@ def restic_str_output_to_json(
|
|||
|
||||
|
||||
def restic_json_to_prometheus(
|
||||
restic_json, labels: dict = None
|
||||
) -> Tuple[bool, List[str]]:
|
||||
restic_result: bool, restic_json: dict, labels: dict = None, minimum_backup_size_error: float = None,
|
||||
) -> Tuple[bool, List[str], bool]:
|
||||
"""
|
||||
Transform a restic JSON result into prometheus metrics
|
||||
"""
|
||||
_labels = []
|
||||
for key, value in labels.items():
|
||||
_labels.append(f'{key}="{value}"')
|
||||
_labels.append(f'{key.strip()}="{value.strip()}"')
|
||||
labels = ",".join(_labels)
|
||||
|
||||
# Take last line of restic output
|
||||
|
|
@ -222,7 +222,20 @@ def restic_json_to_prometheus(
|
|||
if "duration" in key:
|
||||
key += "_seconds"
|
||||
prom_metrics.append(f'restic_{key}{{{labels},action="backup"}} {value}')
|
||||
return prom_metrics
|
||||
|
||||
backup_too_small = False
|
||||
if minimum_backup_size_error:
|
||||
if restic_json["data_added"] < minimum_backup_size_error:
|
||||
backup_too_small = True
|
||||
good_backup = restic_result and not backup_too_small
|
||||
|
||||
prom_metrics.append(
|
||||
'restic_backup_failure{{{},timestamp="{}"}} {}'.format(
|
||||
labels, int(datetime.utcnow().timestamp()), 1 if not good_backup else 0
|
||||
)
|
||||
)
|
||||
|
||||
return good_backup, prom_metrics, backup_too_small
|
||||
|
||||
|
||||
def restic_output_2_metrics(restic_result, output, labels=None):
|
||||
|
|
|
|||
|
|
@ -133,7 +133,7 @@ def test_restic_str_output_to_json():
|
|||
json_metrics = restic_str_output_to_json(True, output)
|
||||
assert json_metrics["errors"] == False
|
||||
#print(json_metrics)
|
||||
prom_metrics = restic_json_to_prometheus(json_metrics, labels)
|
||||
_, prom_metrics, _ = restic_json_to_prometheus(True, json_metrics, labels)
|
||||
|
||||
#print(f"Parsed result:\n{prom_metrics}")
|
||||
for expected_result in expected_results_V2:
|
||||
|
|
@ -155,7 +155,7 @@ def test_restic_json_output():
|
|||
for version, json_output in restic_json_outputs.items():
|
||||
print(f"Testing V2 direct restic --json output from version {version}")
|
||||
restic_json = json.loads(json_output)
|
||||
prom_metrics = restic_json_to_prometheus(restic_json, labels)
|
||||
_, prom_metrics, _ = restic_json_to_prometheus(True, restic_json, labels)
|
||||
#print(f"Parsed result:\n{prom_metrics}")
|
||||
for expected_result in expected_results_V2:
|
||||
match_found = False
|
||||
|
|
@ -195,14 +195,15 @@ def test_real_restic_output():
|
|||
|
||||
|
||||
exit_code, output = command_runner(f"{restic_binary} init --repository-version 2", live_output=True)
|
||||
# Just backup current directory
|
||||
cmd = f"{restic_binary} backup {api_arg} ."
|
||||
exit_code, output = command_runner(cmd, timeout=60, live_output=True)
|
||||
exit_code, output = command_runner(cmd, timeout=120, live_output=True)
|
||||
assert exit_code == 0, "Failed to run restic"
|
||||
if not api_arg:
|
||||
restic_json = restic_str_output_to_json(True, output)
|
||||
else:
|
||||
restic_json = output
|
||||
prom_metrics = restic_json_to_prometheus(restic_json, labels)
|
||||
_, prom_metrics, _ = restic_json_to_prometheus(True, restic_json, labels)
|
||||
#print(f"Parsed result:\n{prom_metrics}")
|
||||
for expected_result in expected_results_V2:
|
||||
match_found = False
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue