From b9dce2243310ef2c627fe7244b62824e5d44b541 Mon Sep 17 00:00:00 2001
From: Christopher Bartz <christopher.bartz@canonical.com>
Date: Fri, 25 Oct 2024 15:02:27 +0200
Subject: [PATCH] Include number of expected runners in reconciliation metric
 (#396)

* pin branch

* only run metrics test - REVERT ME

* bump stable version test_charm_upgrade

* adapt tests

* issue expected_runners for LXD

* fix unit test

* update changelog

* fix unit test

* fix reactive integration test

* update dashboard to include expected number of runners

* clear metrics log before yield

* only add expected runners to time series

* update docs

* Revert "only run metrics test - REVERT ME"

This reverts commit 11fdbafda7803f06bc4bb790d495beff3e169dfd.

* lint

* pin correct commit

* add more explanation for panel

* update Available runners panel to include max expected
---
 docs/changelog.md                          |  4 ++
 docs/reference/cos.md                      |  4 +-
 requirements.txt                           |  2 +-
 src/grafana_dashboards/metrics.json        | 75 +++++++++++++++++++---
 src/runner_manager.py                      |  4 ++
 tests/integration/helpers/charm_metrics.py | 11 +++-
 tests/integration/test_charm_upgrade.py    |  2 +-
 tests/integration/test_reactive.py         |  3 +
 tests/unit/test_lxd_runner_manager.py      |  4 +-
 9 files changed, 93 insertions(+), 16 deletions(-)

diff --git a/docs/changelog.md b/docs/changelog.md
index 9269199ed..9952d3263 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,5 +1,9 @@
 # Changelog
 
+### 2024-10-24
+
+- Add "expected_runners" to reconciliation metric.
+
 ### 2024-10-23
 
 - Fixed the wrong dateformat usage in the server uniqueness check.
diff --git a/docs/reference/cos.md b/docs/reference/cos.md
index d1fcdd736..e1a0c4d26 100644
--- a/docs/reference/cos.md
+++ b/docs/reference/cos.md
@@ -12,8 +12,8 @@ The "GitHub Self-Hosted Runner Metrics" metrics dashboard presents the following
 
 - General: Displays general metrics about the charm and runners, such as:
   - Lifecycle counters: Tracks the frequency of Runner initialisation, start, stop, and crash events.
-  - Available runners: A horizontal bar graph showing the number of runners available during the last reconciliation event. Note: This data is updated after each reconciliation event and is not real-time.
-  - Runners after reconciliation: A time series graph showing the number of runners marked as active/idle during the last reconciliation event over time. Note: This data is updated after each reconciliation event and is not real-time.
+  - Available runners: A horizontal bar graph showing the number of runners available (and max expected) during the last reconciliation event. Note: This data is updated after each reconciliation event and is not real-time. 
+  - Runners after reconciliation: A time series graph showing the number of runners marked as active/idle and the number of expected runners during the last reconciliation event over time. Note: This data is updated after each reconciliation event and is not real-time.
   - Duration observations: Each data point aggregates the last hour and shows the 50th, 90th, 95th percentile and maximum durations for:
       - Runner installation
       - Runner idle duration
diff --git a/requirements.txt b/requirements.txt
index 44e6fd952..ddf8da4fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,4 @@ cosl ==0.0.15
 # juju 3.1.2.0 depends on pyyaml<=6.0 and >=5.1.2
 PyYAML ==6.0.*
 pyOpenSSL==24.2.1
-github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@398b4ff4740476f815cd7582e28b8b3a953cf845
+github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@6f276fd0fc95ed3550005a0b394225d1b506f7dd
diff --git a/src/grafana_dashboards/metrics.json b/src/grafana_dashboards/metrics.json
index 96aec6dc1..8bc61762e 100644
--- a/src/grafana_dashboards/metrics.json
+++ b/src/grafana_dashboards/metrics.json
@@ -24,7 +24,7 @@
   "editable": true,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 0,
-  "id": 313,
+  "id": 319,
   "links": [],
   "liveNow": false,
   "panels": [
@@ -176,7 +176,7 @@
         "type": "loki",
         "uid": "${lokids}"
       },
-      "description": "Shows the number of available runners per application. Note: Available equals idle after reconciliation and is not a real-time metric as it is only updated after each reconciliation interval for each unit.",
+      "description": "Shows the number of available (and max expected) runners per application. Note: Available equals idle after reconciliation and is not a real-time metric as it is only updated after each reconciliation interval for each unit.\n\nNote that there is no maximum expected number of reactive runners.",
       "fieldConfig": {
         "defaults": {
           "color": {
@@ -202,7 +202,6 @@
               "mode": "off"
             }
           },
-          "decimals": 0,
           "mappings": [],
           "thresholds": {
             "mode": "absolute",
@@ -216,10 +215,42 @@
                 "value": 80
               }
             ]
-          },
-          "unit": "short"
+          }
         },
-        "overrides": []
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Value #A"
+            },
+            "properties": [
+              {
+                "id": "displayName",
+                "value": "Available"
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Value #B"
+            },
+            "properties": [
+              {
+                "id": "displayName",
+                "value": "Max possible"
+              },
+              {
+                "id": "custom.axisPlacement",
+                "value": "left"
+              },
+              {
+                "id": "custom.fillOpacity",
+                "value": 0
+              }
+            ]
+          }
+        ]
       },
       "gridPos": {
         "h": 8,
@@ -237,11 +268,11 @@
           "calcs": [],
           "displayMode": "list",
           "placement": "right",
-          "showLegend": false
+          "showLegend": true
         },
         "orientation": "horizontal",
         "showValue": "always",
-        "stacking": "none",
+        "stacking": "normal",
         "tooltip": {
           "mode": "single",
           "sort": "none"
@@ -261,6 +292,18 @@
           "legendFormat": "",
           "queryType": "instant",
           "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${lokids}"
+          },
+          "editorMode": "code",
+          "expr": "sum by(flavor)(last_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",expected_runners=\"expected_runners\",flavor=\"flavor\" | __error__=\"\" | event=\"reconciliation\" | unwrap expected_runners[$__range]))",
+          "hide": false,
+          "legendFormat": "",
+          "queryType": "instant",
+          "refId": "B"
         }
       ],
       "title": "Available Runners",
@@ -635,7 +678,7 @@
         "type": "loki",
         "uid": "${lokids}"
       },
-      "description": "This panel totals the number of active/idle runners reported by charm units after reconciliation.  Note that this is not a real-time metric and should only be used to identify trends or to investigate problems. A given point in time shows the aggregation of the last reported values from the reconciliation. This means, for example, that if the graph shows 10 idle runners at a particular time, it doesn't mean that there are 10 idle runners at that time, just that this is the most recently reported value. Also note that runners who are reported as idle at the time of reconciliation may become active immediately afterwards. And active runners may become offline/inactive.",
+      "description": "This panel totals the number of active/idle/total expected runners reported by charm units after reconciliation.  Note that this is not a real-time metric and should only be used to identify trends or to investigate problems. A given point in time shows the aggregation of the last reported values from the reconciliation. This means, for example, that if the graph shows 10 idle runners at a particular time, it doesn't mean that there are 10 idle runners at that time, just that this is the most recently reported value. Also note that runners who are reported as idle at the time of reconciliation may become active immediately afterwards. And active runners may become offline/inactive.\n\nThere is no expected number of runners for runners in reactive mode, which means that the total number of idle + active runners could be greater than expected if reactive applications are not filtered out in this view.",
       "fieldConfig": {
         "defaults": {
           "color": {
@@ -733,6 +776,18 @@
           "legendFormat": "Active",
           "queryType": "range",
           "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${lokids}"
+          },
+          "editorMode": "code",
+          "expr": "sum by(filename)(last_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",expected_runners=\"expected_runners\",flavor=\"flavor\" | __error__=\"\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap expected_runners[60m]))",
+          "hide": false,
+          "legendFormat": "Expected",
+          "queryType": "range",
+          "refId": "B"
         }
       ],
       "title": "Runners after Reconciliation",
@@ -2056,6 +2111,6 @@
   "timepicker": {},
   "timezone": "",
   "title": "GitHub Self-Hosted Runner Metrics",
-  "version": 16,
+  "version": 17,
   "weekStart": ""
 }
diff --git a/src/runner_manager.py b/src/runner_manager.py
index 6e806a6f9..727fc6782 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -362,6 +362,7 @@ def _issue_reconciliation_metric(
         metric_stats: IssuedMetricEventsStats,
         reconciliation_start_ts: float,
         reconciliation_end_ts: float,
+        expected_quantity: int,
     ) -> None:
         """Issue reconciliation metric.
 
@@ -369,6 +370,7 @@ def _issue_reconciliation_metric(
             metric_stats: The stats of issued metric events.
             reconciliation_start_ts: The timestamp of when reconciliation started.
             reconciliation_end_ts: The timestamp of when reconciliation ended.
+            expected_quantity: The expected quantity of runners.
         """
         runners = self._get_runners()
         runner_states = self._get_runner_health_states()
@@ -398,6 +400,7 @@ def _issue_reconciliation_metric(
                     - metric_stats.get(metric_events.RunnerStop, 0),
                     idle_runners=idle_online_count + idle_offline_count,
                     active_runners=active_count,
+                    expected_runners=expected_quantity,
                     duration=reconciliation_end_ts - reconciliation_start_ts,
                 )
             )
@@ -582,6 +585,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
                 metric_stats=metric_stats,
                 reconciliation_start_ts=start_ts,
                 reconciliation_end_ts=end_ts,
+                expected_quantity=quantity,
             )
         return delta
 
diff --git a/tests/integration/helpers/charm_metrics.py b/tests/integration/helpers/charm_metrics.py
index 0ca7a5e84..6baea6990 100644
--- a/tests/integration/helpers/charm_metrics.py
+++ b/tests/integration/helpers/charm_metrics.py
@@ -168,7 +168,10 @@ async def cancel_workflow_run(
 
 
 async def assert_events_after_reconciliation(
-    app: Application, github_repository: Repository, post_job_status: PostJobStatus
+    app: Application,
+    github_repository: Repository,
+    post_job_status: PostJobStatus,
+    reactive_mode: bool = False,
 ):
     """Assert that the RunnerStart, RunnerStop and Reconciliation metric is logged.
 
@@ -176,6 +179,8 @@ async def assert_events_after_reconciliation(
         app: The charm to assert the events for.
         github_repository: The github repository to assert the events for.
         post_job_status: The expected post job status of the reconciliation event.
+        reactive_mode: Whether the charm manages reactive runners,
+         this changes the expected events.
     """
     unit = app.units[0]
 
@@ -221,6 +226,10 @@ async def assert_events_after_reconciliation(
             assert metric_log.get("crashed_runners") == 0
             assert metric_log.get("idle_runners") >= 0
             assert metric_log.get("active_runners") >= 0
+            if not reactive_mode:
+                assert metric_log.get("expected_runners") >= 0
+            else:
+                assert "expected_runners" not in metric_log
 
 
 async def wait_for_runner_to_be_marked_offline(
diff --git a/tests/integration/test_charm_upgrade.py b/tests/integration/test_charm_upgrade.py
index eb12dd591..3759a767e 100644
--- a/tests/integration/test_charm_upgrade.py
+++ b/tests/integration/test_charm_upgrade.py
@@ -40,7 +40,7 @@ async def test_charm_upgrade(
     assert: the charm is upgraded successfully.
     """
     latest_stable_path = tmp_path / "github-runner.charm"
-    latest_stable_revision = 256  # update this value every release to stable.
+    latest_stable_revision = 282  # update this value every release to stable.
     # download the charm and inject lxd profile for testing
     retcode, stdout, stderr = await ops_test.juju(
         "download",
diff --git a/tests/integration/test_reactive.py b/tests/integration/test_reactive.py
index 869214ec3..89c82dc66 100644
--- a/tests/integration/test_reactive.py
+++ b/tests/integration/test_reactive.py
@@ -21,6 +21,7 @@
 from charm_state import VIRTUAL_MACHINES_CONFIG_NAME
 from tests.integration.helpers.charm_metrics import (
     assert_events_after_reconciliation,
+    clear_metrics_log,
     get_metrics_log,
 )
 from tests.integration.helpers.common import (
@@ -47,6 +48,7 @@ async def app_fixture(
 
     await app_for_reactive.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "1"})
     await reconcile(app_for_reactive, app_for_reactive.model)
+    await clear_metrics_log(app_for_reactive.units[0])
 
     yield app_for_reactive
 
@@ -400,4 +402,5 @@ async def _assert_metrics_are_logged(app: Application, github_repository: Reposi
         app=app,
         github_repository=github_repository,
         post_job_status=PostJobStatus.NORMAL,
+        reactive_mode=True,
     )
diff --git a/tests/unit/test_lxd_runner_manager.py b/tests/unit/test_lxd_runner_manager.py
index 0b50a7943..b55757622 100644
--- a/tests/unit/test_lxd_runner_manager.py
+++ b/tests/unit/test_lxd_runner_manager.py
@@ -438,8 +438,9 @@ def mock_get_runners():
         unhealthy=(),
     )
 
+    quantity = random.randint(0, 5)
     runner_manager.reconcile(
-        quantity=random.randint(0, 5), resources=VirtualMachineResources(2, "7GiB", "10Gib")
+        quantity=quantity, resources=VirtualMachineResources(2, "7GiB", "10Gib")
     )
 
     issue_event_mock.assert_any_call(
@@ -449,6 +450,7 @@ def mock_get_runners():
             crashed_runners=1,
             idle_runners=2,
             active_runners=1,
+            expected_runners=quantity,
             duration=0,
         )
     )