Mirror of https://github.com/netinvent/npbackup.git, synced 2025-10-29 23:06:17 +08:00

Merge pull request #163 from netinvent/backup_metrics_decorator
Backup metrics decorator

Commit f4f813375e
3 changed files with 102 additions and 83 deletions
@@ -7,7 +7,7 @@ __intname__ = "npbackup.gui.core.runner"
 __author__ = "Orsiris de Jong"
 __copyright__ = "Copyright (C) 2022-2025 NetInvent"
 __license__ = "GPL-3.0-only"
-__build__ = "2025051101"
+__build__ = "2025052301"


 from typing import Optional, Callable, Union, List, Tuple

@@ -92,7 +92,9 @@ def metric_analyser(
     result_string: str,
     operation: str,
     dry_run: bool,
-    is_first_metrics_run: bool,
+    append_metrics_file: bool,
+    exec_time: Optional[float] = None,
+    analyze_only: bool = False,
 ) -> Tuple[bool, bool]:
     """
     Tries to get operation success and backup too small booleans from restic output

@@ -185,53 +187,50 @@ def metric_analyser(
             )
         except (ValueError, TypeError):
             pass
-        # Add exec time
-        try:
-            exec_time = os.environ.get("NPBACKUP_EXEC_TIME", None)
-            exec_time = float(exec_time)
-            metrics.append(
-                f'npbackup_exec_time{{{labels},action="{operation}",repo_name="{repo_name}",timestamp="{int(datetime.now(timezone.utc).timestamp())}"}} {exec_time}'
-            )
-        except (ValueError, TypeError):
-            logger.warning("Cannot get exec time from environment")
-
-        logger.debug("Metrics computed:\n{}".format("\n".join(metrics)))
-        if destination and dry_run:
-            logger.info("Dry run mode. Not sending metrics.")
-        elif destination:
-            logger.debug("Sending metrics to {}".format(destination))
-            dest = destination.lower()
-            if dest.startswith("http"):
-                if not "metrics" in dest:
-                    logger.error(
-                        "Destination does not contain 'metrics' keyword. Not uploading."
-                    )
-                    return backup_too_small
-                if not "job" in dest:
-                    logger.error(
-                        "Destination does not contain 'job' keyword. Not uploading."
-                    )
-                    return backup_too_small
-                try:
-                    authentication = (
-                        repo_config.g("prometheus.http_username"),
-                        repo_config.g("prometheus.http_password"),
-                    )
-                except KeyError:
-                    logger.info("No metrics authentication present.")
-                    authentication = None
-
-                # Fix for #150, job name needs to be unique in order to avoid overwriting previous job in push gateway
-                destination = (
-                    f"{destination}___repo_name={repo_name}___action={operation}"
-                )
-                upload_metrics(destination, authentication, no_cert_verify, metrics)
-            else:
-                write_metrics_file(
-                    destination, metrics, append=not is_first_metrics_run
-                )
-        else:
-            logger.debug("No metrics destination set. Not sending metrics")
+        if isinstance(exec_time, (int, float)):
+            try:
+                metrics.append(
+                    f'npbackup_exec_time{{{labels},action="{operation}",repo_name="{repo_name}",timestamp="{int(datetime.now(timezone.utc).timestamp())}"}} {exec_time}'
+                )
+            except (ValueError, TypeError):
+                logger.warning("Cannot get exec time from environment")
+
+        if not analyze_only:
+            logger.debug("Metrics computed:\n{}".format("\n".join(metrics)))
+            if destination and dry_run:
+                logger.info("Dry run mode. Not sending metrics.")
+            elif destination:
+                logger.debug("Sending metrics to {}".format(destination))
+                dest = destination.lower()
+                if dest.startswith("http"):
+                    if not "metrics" in dest:
+                        logger.error(
+                            "Destination does not contain 'metrics' keyword. Not uploading."
+                        )
+                        return backup_too_small
+                    if not "job" in dest:
+                        logger.error(
+                            "Destination does not contain 'job' keyword. Not uploading."
+                        )
+                        return backup_too_small
+                    try:
+                        authentication = (
+                            repo_config.g("prometheus.http_username"),
+                            repo_config.g("prometheus.http_password"),
+                        )
+                    except KeyError:
+                        logger.info("No metrics authentication present.")
+                        authentication = None
+
+                    # Fix for #150, job name needs to be unique in order to avoid overwriting previous job in push gateway
+                    destination = (
+                        f"{destination}___repo_name={repo_name}___action={operation}"
+                    )
+                    upload_metrics(destination, authentication, no_cert_verify, metrics)
+                else:
+                    write_metrics_file(destination, metrics, append=append_metrics_file)
+            else:
+                logger.debug("No metrics destination set. Not sending metrics")
     except KeyError as exc:
         logger.info("Metrics error: {}".format(exc))
         logger.debug("Trace:", exc_info=True)
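
For context on the "metrics"/"job" URL checks and the #150 fix above: a Prometheus push gateway groups pushed metrics by job name, so two repos pushing under the same job silently overwrite each other, which is why the destination gets a ___repo_name=...___action=... suffix. Below is a minimal sketch of what an uploader like upload_metrics plausibly does; the body is an assumption for illustration, only the call signature appears in the diff.

import requests

def upload_metrics(destination: str, authentication, no_cert_verify: bool, metrics: list):
    # destination is a full push gateway URL containing /metrics/job/<job>, e.g.
    # https://pushgw.example/metrics/job/npbackup___repo_name=repo1___action=backup
    payload = "\n".join(metrics) + "\n"  # Prometheus text format requires a trailing newline
    response = requests.post(
        destination, data=payload, auth=authentication, verify=not no_cert_verify
    )
    response.raise_for_status()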

@@ -293,7 +292,7 @@ class NPBackupRunner:
         self.warnings_for_json = []

         self._produce_metrics = True
-        self._is_first_metrics_run = True
+        self._append_metrics_file = False
         self._canceled = False

     @property

@@ -449,14 +448,14 @@ class NPBackupRunner:
         self._produce_metrics = value

     @property
-    def is_first_metrics_run(self):
-        return self._is_first_metrics_run
+    def append_metrics_file(self):
+        return self._append_metrics_file

-    @is_first_metrics_run.setter
-    def is_first_metrics_run(self, value):
+    @append_metrics_file.setter
+    def append_metrics_file(self, value):
         if not isinstance(value, bool):
-            raise ValueError("is_first_metrics_run value {value} is not a boolean")
-        self._is_first_metrics_run = value
+            raise ValueError(f"append_metrics_file value {value} is not a boolean")
+        self._append_metrics_file = value

     @property
     def exec_time(self):

@@ -533,10 +532,6 @@ class NPBackupRunner:
             self.write_logs(
                 f"Runner took {self.exec_time} seconds for {fn.__name__}", level="info"
             )
-            try:
-                os.environ["NPBACKUP_EXEC_TIME"] = str(self.exec_time)
-            except OSError:
-                pass
             return result

         return wrapper
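
The removed lines above drop the NPBACKUP_EXEC_TIME environment hand-off: execution time now reaches metric_analyser as an explicit exec_time argument instead. A minimal sketch of the decorator this tail belongs to, reconstructed from the visible write_logs call and the timing code removed further down; the wrapper scaffolding is an assumption.

import functools
from datetime import datetime, timezone

def exec_timer(fn):
    # Time a runner method and store the duration on the instance so the
    # @metrics wrapper can read self.exec_time after the call returns
    @functools.wraps(fn)
    def wrapper(self, *args, **kwargs):
        start_time = datetime.now(timezone.utc)
        result = fn(self, *args, **kwargs)
        self.exec_time = (datetime.now(timezone.utc) - start_time).total_seconds()
        self.write_logs(
            f"Runner took {self.exec_time} seconds for {fn.__name__}", level="info"
        )
        return result
    return wrapper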

@@ -764,12 +759,17 @@ class NPBackupRunner:
                 metric_analyser(
                     self.repo_config,
                     False,
                     None,
                     self.restic_runner.backup_result_content,
                     fn.__name__,
                     self.dry_run,
-                    self.is_first_metrics_run,
+                    self.append_metrics_file,
+                    self.exec_time,
+                    analyze_only=False,
                 )
-                self.is_first_metrics_run = False
+                # We need to reset backup result content once it's parsed
+                self.restic_runner.backup_result_content = None
+                # We need to append to metric file once we begin writing to it
+                self.append_metrics_file = True
             if self.json_output:
                 js = {
                     "result": False,

@@ -791,16 +791,21 @@ class NPBackupRunner:
             # pylint: disable=E1102 (not-callable)
             result = fn(self, *args, **kwargs)
             # pylint: disable=E1101 (no-member)
-            if self._produce_metrics:
+            if self.produce_metrics:
                 metric_analyser(
                     self.repo_config,
                     result,
                     None,
                     self.restic_runner.backup_result_content,
                     fn.__name__,
                     self.dry_run,
-                    self.is_first_metrics_run,
+                    self.append_metrics_file,
+                    self.exec_time,
+                    analyze_only=False,
                 )
-                self.is_first_metrics_run = False
+                # We need to reset backup result content once it's parsed
+                self.restic_runner.backup_result_content = None
+                # We need to append to metric file once we begin writing to it
+                self.append_metrics_file = True
             else:
                 self.write_logs(
                     f"Metrics disabled for call {fn.__name__}", level="debug"
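
The append_metrics_file flag replaces is_first_metrics_run with inverted semantics: the first write of a run truncates the metrics text file, every subsequent operation appends to it. A sketch of what write_metrics_file plausibly does with that flag; only the function name and the append keyword come from the diff, the body is an assumption.

def write_metrics_file(destination: str, metrics: list, append: bool = False):
    # Truncate on the first write of a run, append for every following operation,
    # so node_exporter's textfile collector sees metrics for all operations
    mode = "a" if append else "w"
    with open(destination, mode, encoding="utf-8") as file_handle:
        file_handle.write("\n".join(metrics) + "\n")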

@@ -1260,6 +1265,7 @@ class NPBackupRunner:
     @threaded
     @close_queues
     @catch_exceptions
+    @metrics
     @exec_timer
     @check_concurrency
     @has_permission
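
Ordering matters in this stack: Python applies decorators bottom-up, so @exec_timer wraps the method before @metrics does, and by the time the @metrics wrapper analyses the finished call, self.exec_time is already set. A tiny standalone illustration of that evaluation order (the names are generic, not NPBackup's):

def outer(fn):
    def wrapper(*args, **kwargs):
        result = fn(*args, **kwargs)
        print("outer runs last and can observe inner's side effects")
        return result
    return wrapper

def inner(fn):
    def wrapper(*args, **kwargs):
        print("inner runs closest to the function")
        return fn(*args, **kwargs)
    return wrapper

@outer  # applied last, wraps everything (like @metrics)
@inner  # applied first, wraps the function directly (like @exec_timer)
def task():
    return 42

task()  # prints inner's message, then outer's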

@@ -1546,14 +1552,6 @@ class NPBackupRunner:
             post_exec_failure_is_fatal,
         )

-        # So we must duplicate @exec_time code here since we must call @metrics manually
-        # because it will need restic output from backup function
-        self.exec_time = (datetime.now(timezone.utc) - start_time).total_seconds()
-        try:
-            os.environ["NPBACKUP_EXEC_TIME"] = str(self.exec_time)
-        except OSError:
-            pass
-
         # Extract backup size from result_string
         # Metrics will not be in json format, since we need to diag cloud issues until
         # there is a fix for https://github.com/restic/restic/issues/4155

@@ -1563,9 +1561,10 @@ class NPBackupRunner:
                 self.restic_runner.backup_result_content,
                 "backup",
                 self.restic_runner.dry_run,
-                self.is_first_metrics_run,
+                self.append_metrics_file,
+                self.exec_time,
+                analyze_only=True,
             )
-            self.is_first_metrics_run = False

             if backup_too_small:
                 self.write_logs(

@@ -1586,7 +1585,6 @@ class NPBackupRunner:
                 ignore_additional_json=True,
             )

-        housekeeping_result = True
         if operation_result:
             post_backup_housekeeping_percent_chance = self.repo_config.g(
                 "backup_opts.post_backup_housekeeping_percent_chance"

@@ -1627,11 +1625,18 @@ class NPBackupRunner:
-                        __check_concurrency=False,
+                        check_concurrency=False,
                     )
+                    if not housekeeping_result:
+                        self.write_logs(
+                            "After backup housekeeping failed", level="error"
+                        )

-        if not operation_result or not housekeeping_result:
+        # housekeeping has its own metrics, so we won't include them in the operational result of the backup
+        if not operation_result:
             # patch result if json
             if isinstance(result, dict):
                 result["result"] = False
             else:
                 result = False
             # Don't overwrite backend output in case of failure
-        return self.convert_to_json_output(result)
+        return self.convert_to_json_output(result, msg)


@@ -467,7 +467,10 @@ def restore_window(
         __autoclose=True,
         __no_lock=__no_lock,
     )
-    return result["result"]
+    try:
+        return result["result"]
+    except TypeError:
+        return result

     left_col = [
         [
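
The same guard is applied to backup() and forget_snapshot() in the next two hunks. The runner returns either a JSON dict shaped like {"result": bool, ...} or a bare bool depending on its json_output setting, and subscripting a bool raises TypeError, hence the fallback. As a standalone sketch:

def unwrap_result(result):
    # result is either {"result": bool, ...} or a plain bool
    try:
        return result["result"]
    except TypeError:
        # bools and None are not subscriptable, so pass them through unchanged
        return result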

@@ -520,7 +523,10 @@ def backup(repo_config: dict) -> bool:
         __backend_binary=backend_binary,
         __no_lock=__no_lock,
     )
-    return result["result"]
+    try:
+        return result["result"]
+    except TypeError:
+        return result


 def forget_snapshot(repo_config: dict, snapshot_ids: List[str]) -> bool:

@@ -536,7 +542,10 @@ def forget_snapshot(repo_config: dict, snapshot_ids: List[str]) -> bool:
         __backend_binary=backend_binary,
         __no_lock=__no_lock,
     )
-    return result["result"]
+    try:
+        return result["result"]
+    except TypeError:
+        return result


 def _main_gui(viewer_mode: bool):


@@ -6,8 +6,8 @@ __intname__ = "restic_metrics"
 __author__ = "Orsiris de Jong"
 __copyright__ = "Copyright (C) 2022-2025 NetInvent"
 __license__ = "BSD-3-Clause"
-__version__ = "2.0.2"
-__build__ = "2024030501"
+__version__ = "2.0.3"
+__build__ = "2025052301"
 __description__ = (
     "Converts restic command line output to a text file node_exporter can scrape"
 )

@@ -185,6 +185,11 @@ def restic_json_to_prometheus(
         _labels.append(f'{key.strip()}="{value.strip()}"')
     labels = ",".join(_labels)

+    # If restic_json is a bool, just fail
+    if isinstance(restic_json, bool):
+        logger.error("Backup data could not be analyzed.")
+        return False, [], False
+
     # Take last line of restic output
     if isinstance(restic_json, str):
         found = False

@@ -195,7 +200,7 @@ def restic_json_to_prometheus(
                 break
         if not found:
             logger.critical("Bogus data given. No message_type: summary found")
-            return False, [], True
+            return False, [], False

     if not isinstance(restic_json, dict):
         try:
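
For reference, the summary the parser looks for is the message_type: summary object that restic emits as the last line of a --json backup run; output without one is the "bogus data" case above. A hedged sketch of that scan, with invented field values for illustration:

import json

raw_output = (
    '{"message_type":"status","percent_done":1.0}\n'
    '{"message_type":"summary","files_new":3,"total_bytes_processed":1048576}'
)

summary = None
for line in reversed(raw_output.splitlines()):  # the summary normally comes last
    try:
        decoded = json.loads(line)
    except json.JSONDecodeError:
        continue
    if decoded.get("message_type") == "summary":
        summary = decoded
        break

if summary is None:
    print("Bogus data given. No message_type: summary found")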