Merge pull request #163 from netinvent/backup_metrics_decorator

Backup metrics decorator
Orsiris de Jong 2025-06-10 14:37:58 +02:00 committed by GitHub
commit f4f813375e
3 changed files with 102 additions and 83 deletions


@@ -7,7 +7,7 @@ __intname__ = "npbackup.gui.core.runner"
__author__ = "Orsiris de Jong"
__copyright__ = "Copyright (C) 2022-2025 NetInvent"
__license__ = "GPL-3.0-only"
__build__ = "2025051101"
__build__ = "2025052301"
from typing import Optional, Callable, Union, List, Tuple
@@ -92,7 +92,9 @@ def metric_analyser(
result_string: str,
operation: str,
dry_run: bool,
is_first_metrics_run: bool,
append_metrics_file: bool,
exec_time: Optional[float] = None,
analyze_only: bool = False,
) -> Tuple[bool, bool]:
"""
Tries to get operation success and backup too small booleans from restic output
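
For reference, the exec time sample that metric_analyser appends follows the Prometheus exposition format. A minimal standalone sketch of building such a line (the f-string mirrors the one in the diff; the example label string is an assumption):

from datetime import datetime, timezone

def build_exec_time_metric(labels: str, operation: str, repo_name: str, exec_time: float) -> str:
    # One gauge sample: metric name, label set in braces, then the value
    timestamp = int(datetime.now(timezone.utc).timestamp())
    return (
        f'npbackup_exec_time{{{labels},action="{operation}",'
        f'repo_name="{repo_name}",timestamp="{timestamp}"}} {exec_time}'
    )

# build_exec_time_metric('instance="myhost"', "backup", "default", 42.7) yields eg:
# npbackup_exec_time{instance="myhost",action="backup",repo_name="default",timestamp="1717000000"} 42.7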
@@ -185,53 +187,50 @@
)
except (ValueError, TypeError):
pass
# Add exec time
try:
exec_time = os.environ.get("NPBACKUP_EXEC_TIME", None)
exec_time = float(exec_time)
metrics.append(
f'npbackup_exec_time{{{labels},action="{operation}",repo_name="{repo_name}",timestamp="{int(datetime.now(timezone.utc).timestamp())}"}} {exec_time}'
)
except (ValueError, TypeError):
logger.warning("Cannot get exec time from environment")
logger.debug("Metrics computed:\n{}".format("\n".join(metrics)))
if destination and dry_run:
logger.info("Dry run mode. Not sending metrics.")
elif destination:
logger.debug("Sending metrics to {}".format(destination))
dest = destination.lower()
if dest.startswith("http"):
if not "metrics" in dest:
logger.error(
"Destination does not contain 'metrics' keyword. Not uploading."
)
return backup_too_small
if not "job" in dest:
logger.error(
"Destination does not contain 'job' keyword. Not uploading."
)
return backup_too_small
try:
authentication = (
repo_config.g("prometheus.http_username"),
repo_config.g("prometheus.http_password"),
)
except KeyError:
logger.info("No metrics authentication present.")
authentication = None
# Fix for #150, job name needs to be unique in order to avoid overwriting previous job in push gateway
destination = (
f"{destination}___repo_name={repo_name}___action={operation}"
if isinstance(exec_time, (int, float)):
try:
metrics.append(
f'npbackup_exec_time{{{labels},action="{operation}",repo_name="{repo_name}",timestamp="{int(datetime.now(timezone.utc).timestamp())}"}} {exec_time}'
)
upload_metrics(destination, authentication, no_cert_verify, metrics)
except (ValueError, TypeError):
logger.warning("Cannot get exec time from environment")
if not analyze_only:
logger.debug("Metrics computed:\n{}".format("\n".join(metrics)))
if destination and dry_run:
logger.info("Dry run mode. Not sending metrics.")
elif destination:
logger.debug("Sending metrics to {}".format(destination))
dest = destination.lower()
if dest.startswith("http"):
if not "metrics" in dest:
logger.error(
"Destination does not contain 'metrics' keyword. Not uploading."
)
return backup_too_small
if not "job" in dest:
logger.error(
"Destination does not contain 'job' keyword. Not uploading."
)
return backup_too_small
try:
authentication = (
repo_config.g("prometheus.http_username"),
repo_config.g("prometheus.http_password"),
)
except KeyError:
logger.info("No metrics authentication present.")
authentication = None
# Fix for #150, job name needs to be unique in order to avoid overwriting previous job in push gateway
destination = (
f"{destination}___repo_name={repo_name}___action={operation}"
)
upload_metrics(destination, authentication, no_cert_verify, metrics)
else:
write_metrics_file(destination, metrics, append=append_metrics_file)
else:
write_metrics_file(
destination, metrics, append=not is_first_metrics_run
)
else:
logger.debug("No metrics destination set. Not sending metrics")
logger.debug("No metrics destination set. Not sending metrics")
except KeyError as exc:
logger.info("Metrics error: {}".format(exc))
logger.debug("Trace:", exc_info=True)
@@ -293,7 +292,7 @@ class NPBackupRunner:
self.warnings_for_json = []
self._produce_metrics = True
self._is_first_metrics_run = True
self._append_metrics_file = False
self._canceled = False
@property
@@ -449,14 +448,14 @@
self._produce_metrics = value
@property
def is_first_metrics_run(self):
return self._is_first_metrics_run
def append_metrics_file(self):
return self._append_metrics_file
@is_first_metrics_run.setter
def is_first_metrics_run(self, value):
@append_metrics_file.setter
def append_metrics_file(self, value):
if not isinstance(value, bool):
raise ValueError("is_first_metrics_run value {value} is not a boolean")
self._is_first_metrics_run = value
raise ValueError("append_metrics_file value {value} is not a boolean")
self._append_metrics_file = value
@property
def exec_time(self):
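
append_metrics_file starts out False so the first operation of a run truncates any stale metrics file, then it is flipped to True so later operations append. A minimal sketch of a write helper honoring that flag (the real write_metrics_file implementation may differ):

def write_metrics_file(path: str, metrics: list, append: bool = False) -> None:
    # Truncate on the first write of a run, append afterwards, so all
    # operations of one run end up in a single scrapeable text file
    mode = "a" if append else "w"
    with open(path, mode, encoding="utf-8") as file_handle:
        file_handle.write("\n".join(metrics) + "\n")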
@@ -533,10 +532,6 @@
self.write_logs(
f"Runner took {self.exec_time} seconds for {fn.__name__}", level="info"
)
try:
os.environ["NPBACKUP_EXEC_TIME"] = str(self.exec_time)
except OSError:
pass
return result
return wrapper
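
With the NPBACKUP_EXEC_TIME environment variable removed, the execution time now travels as a runner attribute that @metrics can read directly. A simplified sketch of such a timing decorator (logging and error handling omitted):

from datetime import datetime, timezone
from functools import wraps

def exec_timer(fn):
    @wraps(fn)
    def wrapper(self, *args, **kwargs):
        start_time = datetime.now(timezone.utc)
        result = fn(self, *args, **kwargs)
        # Stored on the instance instead of the environment, so decorators
        # applied above this one can read it after fn returns
        self.exec_time = (datetime.now(timezone.utc) - start_time).total_seconds()
        return result
    return wrapper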
@@ -764,12 +759,17 @@
metric_analyser(
self.repo_config,
False,
None,
self.restic_runner.backup_result_content,
fn.__name__,
self.dry_run,
self.is_first_metrics_run,
self.append_metrics_file,
self.exec_time,
analyze_only=False,
)
self.is_first_metrics_run = False
# We need to reset backup result content once it's parsed
self.restic_runner.backup_result_content = None
# We need to append to metric file once we begin writing to it
self.append_metrics_file = True
if self.json_output:
js = {
"result": False,
@@ -791,16 +791,21 @@
# pylint: disable=E1102 (not-callable)
result = fn(self, *args, **kwargs)
# pylint: disable=E1101 (no-member)
if self._produce_metrics:
if self.produce_metrics:
metric_analyser(
self.repo_config,
result,
None,
self.restic_runner.backup_result_content,
fn.__name__,
self.dry_run,
self.is_first_metrics_run,
self.append_metrics_file,
self.exec_time,
analyze_only=False,
)
self.is_first_metrics_run = False
# We need to reset backup result content once it's parsed
self.restic_runner.backup_result_content = None
# We need to append to metric file once we begin writing to it
self.append_metrics_file = True
else:
self.write_logs(
f"Metrics disabled for call {fn.__name__}", level="debug"
@@ -1260,6 +1265,7 @@ class NPBackupRunner:
@threaded
@close_queues
@catch_exceptions
@metrics
@exec_timer
@check_concurrency
@has_permission
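
Since decorators apply bottom-up, @exec_timer wraps the method before @metrics does, so self.exec_time is already set by the time the metrics wrapper resumes after the call. A tiny standalone demonstration of that ordering:

def metrics_like(fn):
    def wrapper(*args, **kwargs):
        result = fn(*args, **kwargs)
        print("metrics: runs last, timing is available")
        return result
    return wrapper

def exec_timer_like(fn):
    def wrapper(*args, **kwargs):
        result = fn(*args, **kwargs)
        print("exec_timer: records duration first")
        return result
    return wrapper

@metrics_like
@exec_timer_like
def operation():
    print("operation body")

operation()
# operation body
# exec_timer: records duration first
# metrics: runs last, timing is available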
@@ -1546,14 +1552,6 @@
post_exec_failure_is_fatal,
)
# So we must duplicate @exec_timer code here since we must call @metrics manually
# because it will need restic output from the backup function
self.exec_time = (datetime.now(timezone.utc) - start_time).total_seconds()
try:
os.environ["NPBACKUP_EXEC_TIME"] = str(self.exec_time)
except OSError:
pass
# Extract backup size from result_string
# Metrics will not be in json format, since we need to diag cloud issues until
# there is a fix for https://github.com/restic/restic/issues/4155
@@ -1563,9 +1561,10 @@
self.restic_runner.backup_result_content,
"backup",
self.restic_runner.dry_run,
self.is_first_metrics_run,
self.append_metrics_file,
self.exec_time,
analyze_only=True,
)
self.is_first_metrics_run = False
if backup_too_small:
self.write_logs(
@@ -1586,7 +1585,6 @@
ignore_additional_json=True,
)
housekeeping_result = True
if operation_result:
post_backup_housekeeping_percent_chance = self.repo_config.g(
"backup_opts.post_backup_housekeeping_percent_chance"
@@ -1627,11 +1625,18 @@
__check_concurrency=False,
check_concurrency=False,
)
if not housekeeping_result:
self.write_logs(
"After backup housekeeping failed", level="error"
)
if not operation_result or not housekeeping_result:
# housekeeping has its own metrics, so we won't include them in the operational result of the backup
if not operation_result:
# patch result if json
if isinstance(result, dict):
result["result"] = False
else:
result = False
# Don't overwrite backend output in case of failure
return self.convert_to_json_output(result)
return self.convert_to_json_output(result, msg)


@@ -467,7 +467,10 @@ def restore_window(
__autoclose=True,
__no_lock=__no_lock,
)
return result["result"]
try:
return result["result"]
except TypeError:
return result
left_col = [
[
@@ -520,7 +523,10 @@ def backup(repo_config: dict) -> bool:
__backend_binary=backend_binary,
__no_lock=__no_lock,
)
return result["result"]
try:
return result["result"]
except TypeError:
return result
def forget_snapshot(repo_config: dict, snapshot_ids: List[str]) -> bool:
@@ -536,7 +542,10 @@ def forget_snapshot(repo_config: dict, snapshot_ids: List[str]) -> bool:
__backend_binary=backend_binary,
__no_lock=__no_lock,
)
return result["result"]
try:
return result["result"]
except TypeError:
return result
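
The three identical try/except blocks above cope with the runner returning either a json dict or a bare boolean. A hypothetical shared helper (not part of this PR) would capture the pattern:

from typing import Union

def result_to_bool(result: Union[dict, bool]) -> bool:
    # In json mode the runner returns {"result": True/False, ...};
    # otherwise it returns a plain bool, which raises TypeError on subscript
    try:
        return result["result"]
    except TypeError:
        return result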
def _main_gui(viewer_mode: bool):


@@ -6,8 +6,8 @@ __intname__ = "restic_metrics"
__author__ = "Orsiris de Jong"
__copyright__ = "Copyright (C) 2022-2025 NetInvent"
__license__ = "BSD-3-Clause"
__version__ = "2.0.2"
__build__ = "2024030501"
__version__ = "2.0.3"
__build__ = "2025052301"
__description__ = (
"Converts restic command line output to a text file node_exporter can scrape"
)
@@ -185,6 +185,11 @@ def restic_json_to_prometheus(
_labels.append(f'{key.strip()}="{value.strip()}"')
labels = ",".join(_labels)
# If restic_json is a bool, just fail
if isinstance(restic_json, bool):
logger.error("Backup data could not be analayzed.")
return False, [], False
# Take last line of restic output
if isinstance(restic_json, str):
found = False
@@ -195,7 +200,7 @@
break
if not found:
logger.critical("Bogus data given. No message_type: summary found")
return False, [], True
return False, [], False
if not isinstance(restic_json, dict):
try:
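
For context, restic --json prints one JSON object per line and ends a backup with a "message_type": "summary" object, which is what the scan above looks for. A minimal sketch of that scan (assuming the same line-oriented restic output):

import json
import logging

logger = logging.getLogger(__name__)

def find_summary(restic_output: str):
    # Scan the line-oriented JSON output; the summary object carries the
    # totals (files processed, bytes added, snapshot_id) used for metrics
    for line in restic_output.splitlines():
        try:
            candidate = json.loads(line)
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict) and candidate.get("message_type") == "summary":
            return candidate
    logger.critical("Bogus data given. No message_type: summary found")
    return None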