Skip to content

Commit

Permalink
Catch reconcile errors (#399)
Browse files Browse the repository at this point in the history
* test on push - REVERT ME

* pin branch

* outcomment needs

* handle reconcile error

* Revert "test on push - REVERT ME"

This reverts commit 2584a98.

* Revert "outcomment needs"

This reverts commit 3880e33.

* rename _reconcile_runners to _reconcile_lxd_runners

* remove outdated comment

* remove outdated comment

* add charm unit tests

* increase coverage

* lint

* update changelog

* remove TODO

* Set MaintenanceStatus and improve err msg

* pin websockets<14.0

* WIP checkin

* go into ActiveStatus

* go into ActiveStatus also for LXD
  • Loading branch information
cbartz authored Nov 15, 2024
1 parent 70234da commit 4e2d879
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 31 deletions.
5 changes: 5 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog


### 2024-11-15

- Catch ReconcileError and set appropriate message in unit status.

### 2024-11-13

- Align the README with the one in https://github.com/canonical/is-charms-template-repo.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ omit = [
]

[tool.coverage.report]
fail_under = 83
fail_under = 84
show_missing = true

[tool.pytest.ini_options]
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ cosl ==0.0.15
# juju 3.1.2.0 depends on pyyaml<=6.0 and >=5.1.2
PyYAML ==6.0.*
pyOpenSSL==24.2.1
github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@6f276fd0fc95ed3550005a0b394225d1b506f7dd
github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@cfb8f74b93d6f29acce36cfc87ccab6d7c19a344
86 changes: 62 additions & 24 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
# pylint: disable=too-many-lines

"""Charm for creating and managing GitHub self-hosted runner instances."""

from utilities import bytes_with_unit_to_kib, execute_command, remove_residual_venv_dirs, retry

# This is a workaround for https://bugs.launchpad.net/juju/+bug/2058335
Expand All @@ -30,6 +29,7 @@
import ops
from charms.data_platform_libs.v0.data_interfaces import DatabaseRequires
from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from github_runner_manager.errors import ReconcileError
from github_runner_manager.manager.cloud_runner_manager import (
GitHubRunnerConfig,
SupportServiceConfig,
Expand Down Expand Up @@ -114,6 +114,13 @@
RUNNER_MANAGER_USER = "runner-manager"
RUNNER_MANAGER_GROUP = "runner-manager"

ACTIVE_STATUS_RECONCILIATION_FAILED_MSG = "Last reconciliation failed."
FAILED_TO_RECONCILE_RUNNERS_MSG = "Failed to reconcile runners"
FAILED_RECONCILE_ACTION_ERR_MSG = (
"Failed to reconcile runners. Look at the juju logs for more information."
)


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -527,8 +534,7 @@ def _on_start(self, _: StartEvent) -> None:
if not self._get_set_image_ready_status():
return
runner_scaler = self._get_runner_scaler(state)
runner_scaler.reconcile(state.runner_config.virtual_machines)
self.unit.status = ActiveStatus()
self._reconcile_openstack_runners(runner_scaler, state.runner_config.virtual_machines)
return

runner_manager = self._get_runner_manager(state)
Expand All @@ -540,14 +546,14 @@ def _on_start(self, _: StartEvent) -> None:
self.unit.status = MaintenanceStatus("Starting runners")
try:
runner_manager.flush(LXDFlushMode.FLUSH_IDLE)
self._reconcile_runners(
self._reconcile_lxd_runners(
runner_manager,
state.runner_config.virtual_machines,
state.runner_config.virtual_machine_resources,
)
except RunnerError as err:
logger.exception("Failed to start runners")
self.unit.status = MaintenanceStatus(f"Failed to start runners: {err}")
self.unit.status = ActiveStatus(f"Failed to start runners: {err}")
return

self.unit.status = ActiveStatus()
Expand Down Expand Up @@ -623,7 +629,7 @@ def _on_upgrade_charm(self, _: UpgradeCharmEvent) -> None:
runner_manager = self._get_runner_manager(state)
logger.info("Flushing the runners...")
runner_manager.flush(LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK)
self._reconcile_runners(
self._reconcile_lxd_runners(
runner_manager,
state.runner_config.virtual_machines,
state.runner_config.virtual_machine_resources,
Expand Down Expand Up @@ -668,9 +674,10 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901
if state.charm_config.token != self._stored.token:
runner_scaler = self._get_runner_scaler(state)
runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE)
runner_scaler.reconcile(state.runner_config.virtual_machines)
self._reconcile_openstack_runners(
runner_scaler, state.runner_config.virtual_machines
)
# TODO: 2024-04-12: Flush on token changes.
self.unit.status = ActiveStatus()
return

self._refresh_firewall(state)
Expand All @@ -679,7 +686,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901
if state.charm_config.token != self._stored.token:
runner_manager.flush(LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK)
self._stored.token = state.charm_config.token
self._reconcile_runners(
self._reconcile_lxd_runners(
runner_manager,
state.runner_config.virtual_machines,
state.runner_config.virtual_machine_resources,
Expand Down Expand Up @@ -768,8 +775,7 @@ def _trigger_reconciliation(self) -> None:
if not self._get_set_image_ready_status():
return
runner_scaler = self._get_runner_scaler(state)
runner_scaler.reconcile(state.runner_config.virtual_machines)
self.unit.status = ActiveStatus()
self._reconcile_openstack_runners(runner_scaler, state.runner_config.virtual_machines)
return

runner_manager = self._get_runner_manager(state)
Expand All @@ -782,7 +788,7 @@ def _trigger_reconciliation(self) -> None:
if all(not info.busy for info in runner_info):
self._update_kernel(now=True)

self._reconcile_runners(
self._reconcile_lxd_runners(
runner_manager,
state.runner_config.virtual_machines,
state.runner_config.virtual_machine_resources,
Expand Down Expand Up @@ -859,7 +865,15 @@ def _on_reconcile_runners_action(self, event: ActionEvent) -> None:
return
runner_scaler = self._get_runner_scaler(state)

delta = runner_scaler.reconcile(state.runner_config.virtual_machines)
self.unit.status = MaintenanceStatus("Reconciling runners")
try:
delta = runner_scaler.reconcile(state.runner_config.virtual_machines)
except ReconcileError:
logger.exception(FAILED_TO_RECONCILE_RUNNERS_MSG)
self.unit.status = ActiveStatus(ACTIVE_STATUS_RECONCILIATION_FAILED_MSG)
event.fail(FAILED_RECONCILE_ACTION_ERR_MSG)
return

self.unit.status = ActiveStatus()
event.set_results({"delta": {"virtual-machines": delta}})
return
Expand All @@ -870,7 +884,7 @@ def _on_reconcile_runners_action(self, event: ActionEvent) -> None:
runner_manager, state.charm_config.token, state.proxy_config
)

delta = self._reconcile_runners(
delta = self._reconcile_lxd_runners(
runner_manager,
state.runner_config.virtual_machines,
state.runner_config.virtual_machine_resources,
Expand All @@ -893,14 +907,22 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None:
runner_scaler = self._get_runner_scaler(state)
flushed = runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE)
logger.info("Flushed %s runners", flushed)
delta = runner_scaler.reconcile(state.runner_config.virtual_machines)
self.unit.status = MaintenanceStatus("Reconciling runners")
try:
delta = runner_scaler.reconcile(state.runner_config.virtual_machines)
except ReconcileError:
logger.exception(FAILED_TO_RECONCILE_RUNNERS_MSG)
self.unit.status = ActiveStatus(ACTIVE_STATUS_RECONCILIATION_FAILED_MSG)
event.fail(FAILED_RECONCILE_ACTION_ERR_MSG)
return
self.unit.status = ActiveStatus()
event.set_results({"delta": {"virtual-machines": delta}})
return

runner_manager = self._get_runner_manager(state)

runner_manager.flush(LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK)
delta = self._reconcile_runners(
delta = self._reconcile_lxd_runners(
runner_manager,
state.runner_config.virtual_machines,
state.runner_config.virtual_machine_resources,
Expand Down Expand Up @@ -946,10 +968,10 @@ def _on_stop(self, _: StopEvent) -> None:
runner_manager = self._get_runner_manager(state)
runner_manager.flush(LXDFlushMode.FLUSH_BUSY)

def _reconcile_runners(
def _reconcile_lxd_runners(
self, runner_manager: LXDRunnerManager, num: int, resources: VirtualMachineResources
) -> Dict[str, Any]:
"""Reconcile the current runners state and intended runner state.
"""Reconcile the current runners state and intended runner state for LXD mode.
Args:
runner_manager: For querying and managing the runner state.
Expand All @@ -975,6 +997,22 @@ def _reconcile_runners(
self.unit.status = ActiveStatus()
return {"delta": {"virtual-machines": delta_virtual_machines}}

def _reconcile_openstack_runners(self, runner_scaler: RunnerScaler, num: int) -> None:
"""Reconcile the current runners state and intended runner state for OpenStack mode.
Args:
runner_scaler: Scaler used to scale the amount of runners.
num: Target number of runners.
"""
self.unit.status = MaintenanceStatus("Reconciling runners")
try:
runner_scaler.reconcile(num)
except ReconcileError:
logger.exception(FAILED_TO_RECONCILE_RUNNERS_MSG)
self.unit.status = ActiveStatus(ACTIVE_STATUS_RECONCILIATION_FAILED_MSG)
else:
self.unit.status = ActiveStatus()

def _install_repo_policy_compliance(self, proxy_config: ProxyConfig) -> bool:
"""Install latest version of repo_policy_compliance service.
Expand Down Expand Up @@ -1193,15 +1231,17 @@ def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None:
if not self._get_set_image_ready_status():
return
runner_scaler = self._get_runner_scaler(state)
# TODO: 2024-04-12: Should be flush idle.
runner_scaler.flush()
runner_scaler.reconcile(state.runner_config.virtual_machines)
try:
runner_scaler.reconcile(state.runner_config.virtual_machines)
except ReconcileError:
logger.exception(FAILED_TO_RECONCILE_RUNNERS_MSG)
return

self._refresh_firewall(state)
runner_manager = self._get_runner_manager(state)
runner_manager.flush(LXDFlushMode.FLUSH_IDLE)
self._reconcile_runners(
self._reconcile_lxd_runners(
runner_manager,
state.runner_config.virtual_machines,
state.runner_config.virtual_machine_resources,
Expand Down Expand Up @@ -1240,9 +1280,7 @@ def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None:

runner_scaler = self._get_runner_scaler(state)
runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE)
runner_scaler.reconcile(state.runner_config.virtual_machines)
self.unit.status = ActiveStatus()
return
self._reconcile_openstack_runners(runner_scaler, state.runner_config.virtual_machines)

def _get_set_image_ready_status(self) -> bool:
"""Check if image is ready for Openstack and charm status accordingly.
Expand Down
92 changes: 87 additions & 5 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,19 @@

import pytest
import yaml
from github_runner_manager.errors import ReconcileError
from github_runner_manager.manager.runner_scaler import RunnerScaler
from github_runner_manager.types_.github import GitHubOrg, GitHubRepo, GitHubRunnerStatus
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, StatusBase, WaitingStatus
from ops.testing import Harness

from charm import GithubRunnerCharm, catch_action_errors, catch_charm_errors
from charm import (
ACTIVE_STATUS_RECONCILIATION_FAILED_MSG,
FAILED_RECONCILE_ACTION_ERR_MSG,
GithubRunnerCharm,
catch_action_errors,
catch_charm_errors,
)
from charm_state import (
GROUP_CONFIG_NAME,
IMAGE_INTEGRATION_NAME,
Expand Down Expand Up @@ -296,6 +304,27 @@ def test_on_flush_runners_action_fail(harness: Harness, runner_binary_path: Path
)


def test_on_flush_runners_reconcile_error_fail(harness: Harness):
"""
arrange: Set up charm with Openstack mode and ReconcileError.
act: Run flush runner action.
assert: Action fails with generic message and goes in ActiveStatus.
"""
state_mock = MagicMock()
state_mock.instance_type = InstanceType.OPENSTACK
harness.charm._setup_state = MagicMock(return_value=state_mock)

runner_scaler_mock = MagicMock(spec=RunnerScaler)
runner_scaler_mock.reconcile.side_effect = ReconcileError("mock error")
harness.charm._get_runner_scaler = MagicMock(return_value=runner_scaler_mock)

mock_event = MagicMock()
harness.charm._on_flush_runners_action(mock_event)
mock_event.fail.assert_called_with(FAILED_RECONCILE_ACTION_ERR_MSG)
assert harness.charm.unit.status.name == ActiveStatus.name
assert harness.charm.unit.status.message == ACTIVE_STATUS_RECONCILIATION_FAILED_MSG


def test_on_flush_runners_action_success(harness: Harness, runner_binary_path: Path):
"""
arrange: Set up charm without runner binary downloaded.
Expand All @@ -308,6 +337,61 @@ def test_on_flush_runners_action_success(harness: Harness, runner_binary_path: P
mock_event.set_results.assert_called()


def test_on_reconcile_runners_action_reconcile_error_fail(
harness: Harness, monkeypatch: pytest.MonkeyPatch
):
"""
arrange: Set up charm with Openstack mode, ready image, and ReconcileError.
act: Run reconcile runners action.
assert: Action fails with generic message and goes in ActiveStatus
"""
state_mock = MagicMock()
state_mock.instance_type = InstanceType.OPENSTACK
harness.charm._setup_state = MagicMock(return_value=state_mock)

runner_scaler_mock = MagicMock(spec=RunnerScaler)
runner_scaler_mock.reconcile.side_effect = ReconcileError("mock error")
harness.charm._get_runner_scaler = MagicMock(return_value=runner_scaler_mock)
monkeypatch.setattr(
OpenstackImage,
"from_charm",
MagicMock(return_value=OpenstackImage(id="test", tags=["test"])),
)

mock_event = MagicMock()
harness.charm._on_reconcile_runners_action(mock_event)

mock_event.fail.assert_called_with(FAILED_RECONCILE_ACTION_ERR_MSG)
assert harness.charm.unit.status.name == ActiveStatus.name
assert harness.charm.unit.status.message == ACTIVE_STATUS_RECONCILIATION_FAILED_MSG


def test_on_reconcile_runners_reconcile_error(harness: Harness, monkeypatch: pytest.MonkeyPatch):
"""
arrange: Set up charm with Openstack mode, ready image, and ReconcileError.
act: Trigger reconcile_runners event.
assert: Unit goes into ActiveStatus with error message.
"""
state_mock = MagicMock()
state_mock.instance_type = InstanceType.OPENSTACK
harness.charm._setup_state = MagicMock(return_value=state_mock)

runner_scaler_mock = MagicMock(spec=RunnerScaler)
runner_scaler_mock.reconcile.side_effect = ReconcileError("mock error")
harness.charm._get_runner_scaler = MagicMock(return_value=runner_scaler_mock)
monkeypatch.setattr(
OpenstackImage,
"from_charm",
MagicMock(return_value=OpenstackImage(id="test", tags=["test"])),
)

mock_event = MagicMock()
harness.charm._on_reconcile_runners(mock_event)

assert harness.charm.unit.status.name == ActiveStatus.name
assert harness.charm.unit.status.message == ACTIVE_STATUS_RECONCILIATION_FAILED_MSG


@pytest.mark.parametrize(
"hook",
[
Expand Down Expand Up @@ -653,11 +737,9 @@ def test_on_start_failure(self, run, wt, mkdir, rm):
harness.update_config({PATH_CONFIG_NAME: "mockorg/repo", TOKEN_CONFIG_NAME: "mocktoken"})
harness.begin()

harness.charm._reconcile_runners = raise_runner_error
harness.charm._reconcile_lxd_runners = raise_runner_error
harness.charm.on.start.emit()
assert harness.charm.unit.status == MaintenanceStatus(
"Failed to start runners: mock error"
)
assert harness.charm.unit.status == ActiveStatus("Failed to start runners: mock error")

@patch("charm.LXDRunnerManager")
@patch("charm.RunnerScaler")
Expand Down

0 comments on commit 4e2d879

Please sign in to comment.