Skip to content

Commit

Permalink
Include number of expected runners in reconciliation metric (#396)
Browse files Browse the repository at this point in the history
* pin branch

* only run metrics test - REVERT ME

* bump stable version test_charm_upgrade

* adapt tests

* issue expected_runners for LXD

* fix unit test

* update changelog

* fix unit test

* fix reactive integration test

* update dashboard to include expected number of runners

* clear metrics log before yield

* only add expected runners to time series

* update docs

* Revert "only run metrics test - REVERT ME"

This reverts commit 11fdbaf.

* lint

* pin correct commit

* add more explanation for panel

* update Available runners panel to include max expected
  • Loading branch information
cbartz authored Oct 25, 2024
1 parent aa9f7b3 commit b9dce22
Show file tree
Hide file tree
Showing 9 changed files with 93 additions and 16 deletions.
4 changes: 4 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

### 2024-10-24

- Add "expected_runners" to reconciliation metric.

### 2024-10-23

- Fixed the wrong dateformat usage in the server uniqueness check.
Expand Down
4 changes: 2 additions & 2 deletions docs/reference/cos.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ The "GitHub Self-Hosted Runner Metrics" metrics dashboard presents the following

- General: Displays general metrics about the charm and runners, such as:
- Lifecycle counters: Tracks the frequency of Runner initialisation, start, stop, and crash events.
- Available runners: A horizontal bar graph showing the number of runners available during the last reconciliation event. Note: This data is updated after each reconciliation event and is not real-time.
- Runners after reconciliation: A time series graph showing the number of runners marked as active/idle during the last reconciliation event over time. Note: This data is updated after each reconciliation event and is not real-time.
- Available runners: A horizontal bar graph showing the number of runners available (and max expected) during the last reconciliation event. Note: This data is updated after each reconciliation event and is not real-time.
- Runners after reconciliation: A time series graph showing the number of runners marked as active/idle and the number of expected runners during the last reconciliation event over time. Note: This data is updated after each reconciliation event and is not real-time.
- Duration observations: Each data point aggregates the last hour and shows the 50th, 90th, 95th percentile and maximum durations for:
- Runner installation
- Runner idle duration
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ cosl ==0.0.15
# juju 3.1.2.0 depends on pyyaml<=6.0 and >=5.1.2
PyYAML ==6.0.*
pyOpenSSL==24.2.1
github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@398b4ff4740476f815cd7582e28b8b3a953cf845
github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@6f276fd0fc95ed3550005a0b394225d1b506f7dd
75 changes: 65 additions & 10 deletions src/grafana_dashboards/metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 313,
"id": 319,
"links": [],
"liveNow": false,
"panels": [
Expand Down Expand Up @@ -176,7 +176,7 @@
"type": "loki",
"uid": "${lokids}"
},
"description": "Shows the number of available runners per application. Note: Available equals idle after reconciliation and is not a real-time metric as it is only updated after each reconciliation interval for each unit.",
"description": "Shows the number of available (and max expected) runners per application. Note: Available equals idle after reconciliation and is not a real-time metric as it is only updated after each reconciliation interval for each unit.\n\nNote that there is no maximum expected number of reactive runners.",
"fieldConfig": {
"defaults": {
"color": {
Expand All @@ -202,7 +202,6 @@
"mode": "off"
}
},
"decimals": 0,
"mappings": [],
"thresholds": {
"mode": "absolute",
Expand All @@ -216,10 +215,42 @@
"value": 80
}
]
},
"unit": "short"
}
},
"overrides": []
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Value #A"
},
"properties": [
{
"id": "displayName",
"value": "Available"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Value #B"
},
"properties": [
{
"id": "displayName",
"value": "Max possible"
},
{
"id": "custom.axisPlacement",
"value": "left"
},
{
"id": "custom.fillOpacity",
"value": 0
}
]
}
]
},
"gridPos": {
"h": 8,
Expand All @@ -237,11 +268,11 @@
"calcs": [],
"displayMode": "list",
"placement": "right",
"showLegend": false
"showLegend": true
},
"orientation": "horizontal",
"showValue": "always",
"stacking": "none",
"stacking": "normal",
"tooltip": {
"mode": "single",
"sort": "none"
Expand All @@ -261,6 +292,18 @@
"legendFormat": "",
"queryType": "instant",
"refId": "A"
},
{
"datasource": {
"type": "loki",
"uid": "${lokids}"
},
"editorMode": "code",
"expr": "sum by(flavor)(last_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",expected_runners=\"expected_runners\",flavor=\"flavor\" | __error__=\"\" | event=\"reconciliation\" | unwrap expected_runners[$__range]))",
"hide": false,
"legendFormat": "",
"queryType": "instant",
"refId": "B"
}
],
"title": "Available Runners",
Expand Down Expand Up @@ -635,7 +678,7 @@
"type": "loki",
"uid": "${lokids}"
},
"description": "This panel totals the number of active/idle runners reported by charm units after reconciliation. Note that this is not a real-time metric and should only be used to identify trends or to investigate problems. A given point in time shows the aggregation of the last reported values from the reconciliation. This means, for example, that if the graph shows 10 idle runners at a particular time, it doesn't mean that there are 10 idle runners at that time, just that this is the most recently reported value. Also note that runners who are reported as idle at the time of reconciliation may become active immediately afterwards. And active runners may become offline/inactive.",
"description": "This panel totals the number of active/idle/total expected runners reported by charm units after reconciliation. Note that this is not a real-time metric and should only be used to identify trends or to investigate problems. A given point in time shows the aggregation of the last reported values from the reconciliation. This means, for example, that if the graph shows 10 idle runners at a particular time, it doesn't mean that there are 10 idle runners at that time, just that this is the most recently reported value. Also note that runners who are reported as idle at the time of reconciliation may become active immediately afterwards. And active runners may become offline/inactive.\n\nThere is no expected number of runners for runners in reactive mode, which means that the total number of idle + active runners could be greater than expected if reactive applications are not filtered out in this view.",
"fieldConfig": {
"defaults": {
"color": {
Expand Down Expand Up @@ -733,6 +776,18 @@
"legendFormat": "Active",
"queryType": "range",
"refId": "A"
},
{
"datasource": {
"type": "loki",
"uid": "${lokids}"
},
"editorMode": "code",
"expr": "sum by(filename)(last_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",expected_runners=\"expected_runners\",flavor=\"flavor\" | __error__=\"\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap expected_runners[60m]))",
"hide": false,
"legendFormat": "Expected",
"queryType": "range",
"refId": "B"
}
],
"title": "Runners after Reconciliation",
Expand Down Expand Up @@ -2056,6 +2111,6 @@
"timepicker": {},
"timezone": "",
"title": "GitHub Self-Hosted Runner Metrics",
"version": 16,
"version": 17,
"weekStart": ""
}
4 changes: 4 additions & 0 deletions src/runner_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,13 +362,15 @@ def _issue_reconciliation_metric(
metric_stats: IssuedMetricEventsStats,
reconciliation_start_ts: float,
reconciliation_end_ts: float,
expected_quantity: int,
) -> None:
"""Issue reconciliation metric.
Args:
metric_stats: The stats of issued metric events.
reconciliation_start_ts: The timestamp of when reconciliation started.
reconciliation_end_ts: The timestamp of when reconciliation ended.
expected_quantity: The expected quantity of runners.
"""
runners = self._get_runners()
runner_states = self._get_runner_health_states()
Expand Down Expand Up @@ -398,6 +400,7 @@ def _issue_reconciliation_metric(
- metric_stats.get(metric_events.RunnerStop, 0),
idle_runners=idle_online_count + idle_offline_count,
active_runners=active_count,
expected_runners=expected_quantity,
duration=reconciliation_end_ts - reconciliation_start_ts,
)
)
Expand Down Expand Up @@ -582,6 +585,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int:
metric_stats=metric_stats,
reconciliation_start_ts=start_ts,
reconciliation_end_ts=end_ts,
expected_quantity=quantity,
)
return delta

Expand Down
11 changes: 10 additions & 1 deletion tests/integration/helpers/charm_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,19 @@ async def cancel_workflow_run(


async def assert_events_after_reconciliation(
app: Application, github_repository: Repository, post_job_status: PostJobStatus
app: Application,
github_repository: Repository,
post_job_status: PostJobStatus,
reactive_mode: bool = False,
):
"""Assert that the RunnerStart, RunnerStop and Reconciliation metric is logged.
Args:
app: The charm to assert the events for.
github_repository: The github repository to assert the events for.
post_job_status: The expected post job status of the reconciliation event.
reactive_mode: Whether the charm manages reactive runners,
this changes the expected events.
"""
unit = app.units[0]

Expand Down Expand Up @@ -221,6 +226,10 @@ async def assert_events_after_reconciliation(
assert metric_log.get("crashed_runners") == 0
assert metric_log.get("idle_runners") >= 0
assert metric_log.get("active_runners") >= 0
if not reactive_mode:
assert metric_log.get("expected_runners") >= 0
else:
assert "expected_runners" not in metric_log


async def wait_for_runner_to_be_marked_offline(
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_charm_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async def test_charm_upgrade(
assert: the charm is upgraded successfully.
"""
latest_stable_path = tmp_path / "github-runner.charm"
latest_stable_revision = 256 # update this value every release to stable.
latest_stable_revision = 282 # update this value every release to stable.
# download the charm and inject lxd profile for testing
retcode, stdout, stderr = await ops_test.juju(
"download",
Expand Down
3 changes: 3 additions & 0 deletions tests/integration/test_reactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from charm_state import VIRTUAL_MACHINES_CONFIG_NAME
from tests.integration.helpers.charm_metrics import (
assert_events_after_reconciliation,
clear_metrics_log,
get_metrics_log,
)
from tests.integration.helpers.common import (
Expand All @@ -47,6 +48,7 @@ async def app_fixture(

await app_for_reactive.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "1"})
await reconcile(app_for_reactive, app_for_reactive.model)
await clear_metrics_log(app_for_reactive.units[0])

yield app_for_reactive

Expand Down Expand Up @@ -400,4 +402,5 @@ async def _assert_metrics_are_logged(app: Application, github_repository: Reposi
app=app,
github_repository=github_repository,
post_job_status=PostJobStatus.NORMAL,
reactive_mode=True,
)
4 changes: 3 additions & 1 deletion tests/unit/test_lxd_runner_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,8 +438,9 @@ def mock_get_runners():
unhealthy=(),
)

quantity = random.randint(0, 5)
runner_manager.reconcile(
quantity=random.randint(0, 5), resources=VirtualMachineResources(2, "7GiB", "10Gib")
quantity=quantity, resources=VirtualMachineResources(2, "7GiB", "10Gib")
)

issue_event_mock.assert_any_call(
Expand All @@ -449,6 +450,7 @@ def mock_get_runners():
crashed_runners=1,
idle_runners=2,
active_runners=1,
expected_runners=quantity,
duration=0,
)
)
Expand Down

0 comments on commit b9dce22

Please sign in to comment.