From 3e1ece86263e2f0e2813ab461d0e157283efe10c Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 17 Aug 2022 16:56:12 -0300 Subject: [PATCH] Remove Patroni stop/restart workaround (#14) * Remove Patroni stop/restart workaround * Update test * Fix CI workflow * Add update status interval change --- src/charm.py | 38 -------------------- src/patroni.py | 21 ----------- tests/integration/test_charm.py | 2 -- tests/unit/test_charm.py | 62 +++------------------------------ tests/unit/test_patroni.py | 22 ------------ 5 files changed, 4 insertions(+), 141 deletions(-) diff --git a/src/charm.py b/src/charm.py index a29761c7a8..bb3317a4a6 100755 --- a/src/charm.py +++ b/src/charm.py @@ -400,22 +400,6 @@ def _on_get_primary(self, event: ActionEvent) -> None: logger.error(f"failed to get primary with error {e}") def _on_update_status(self, _) -> None: - # Until https://github.com/canonical/pebble/issues/6 is fixed, - # we need to use the logic below to restart the leader - # and a stuck replica after a failover/switchover. - try: - state = self._patroni.get_postgresql_state() - if state == "restarting": - # Restart the stuck replica. - self._restart_postgresql_service() - elif state == "starting" or state == "stopping": - # Force a primary change when the current primary is stuck. - self.force_primary_change() - except RetryError as e: - logger.error("failed to check PostgreSQL state") - self.unit.status = BlockedStatus(f"failed to check PostgreSQL state with error {e}") - return - # Display an active status message if the current unit is the primary. try: if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name: @@ -423,13 +407,6 @@ def _on_update_status(self, _) -> None: except (RetryError, ConnectionError) as e: logger.error(f"failed to get primary with error {e}") - def _restart_postgresql_service(self) -> None: - """Restart PostgreSQL and Patroni.""" - self.unit.status = MaintenanceStatus(f"restarting {self._postgresql_service} service") - container = self.unit.get_container("postgresql") - container.restart(self._postgresql_service) - self.unit.status = ActiveStatus() - @property def _patroni(self): """Returns an instance of the Patroni object.""" @@ -541,21 +518,6 @@ def _unit_name_to_pod_name(self, unit_name: str) -> str: """ return unit_name.replace("/", "-") - def force_primary_change(self) -> None: - """Force primary changes immediately. - - This function is needed to handle cases related to - https://github.com/canonical/pebble/issues/6 . - When a fail-over is started, Patroni gets stuck on the primary - change because of some zombie process that are not cleaned by Pebble. - """ - # Change needed to force an immediate failover. - self._patroni.change_master_start_timeout(0) - # Restart the stuck previous leader (will trigger an immediate failover). - self._restart_postgresql_service() - # Revert configuration to default. - self._patroni.change_master_start_timeout(None) - if __name__ == "__main__": main(PostgresqlOperatorCharm, use_juju_for_storage=True) diff --git a/src/patroni.py b/src/patroni.py index 580780e067..9010b95b70 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -40,17 +40,6 @@ def __init__( self._storage_path = storage_path self._planned_units = planned_units - def change_master_start_timeout(self, seconds: int) -> None: - """Change master start timeout configuration. - - Args: - seconds: number of seconds to set in master_start_timeout configuration. 
- """ - requests.patch( - f"http://{self._endpoint}:8008/config", - json={"master_start_timeout": seconds}, - ) - def get_primary(self, unit_name_pattern=False) -> str: """Get primary instance. @@ -72,16 +61,6 @@ def get_primary(self, unit_name_pattern=False) -> str: break return primary - @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) - def get_postgresql_state(self) -> str: - """Get PostgreSQL state. - - Returns: - running, restarting or stopping. - """ - r = requests.get(f"http://{self._endpoint}:8008/health") - return r.json()["state"] - @property @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def cluster_members(self) -> set: diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index 8267c71719..960176c6ac 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -45,8 +45,6 @@ async def test_build_and_deploy(ops_test: OpsTest): await ops_test.model.deploy( charm, resources=resources, application_name=APP_NAME, num_units=len(UNIT_IDS), trust=True ) - # Change update status hook interval to be triggered more often - # (it's required to handle https://github.com/canonical/postgresql-k8s-operator/issues/3). await ops_test.model.set_config({"update-status-hook-interval": "5s"}) await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index f5c230b078..286357fa75 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -2,11 +2,11 @@ # See LICENSE file for licensing details. import unittest -from unittest.mock import Mock, call, patch +from unittest.mock import Mock, patch from lightkube import codecs from lightkube.resources.core_v1 import Pod -from ops.model import ActiveStatus, BlockedStatus +from ops.model import ActiveStatus from ops.testing import Harness from tenacity import RetryError @@ -130,41 +130,19 @@ def test_fail_to_get_primary(self, _get_primary): _get_primary.assert_called_once() mock_event.set_results.assert_not_called() - @patch("ops.model.Container.restart") - def test_restart_postgresql_service(self, _restart): - self.charm._restart_postgresql_service() - _restart.assert_called_once_with(self._postgresql_service) - self.assertEqual( - self.harness.model.unit.status, - ActiveStatus(), - ) - @patch_network_get(private_address="1.1.1.1") @patch("charm.Patroni.get_primary") - @patch("charm.PostgresqlOperatorCharm._restart_postgresql_service") - @patch("charm.Patroni.change_master_start_timeout") - @patch("charm.Patroni.get_postgresql_state") def test_on_update_status( self, - _get_postgresql_state, - _change_master_start_timeout, - _restart_postgresql_service, _get_primary, ): - _get_postgresql_state.side_effect = ["running", "running", "restarting", "stopping"] _get_primary.side_effect = [ "postgresql-k8s/1", self.charm.unit.name, - self.charm.unit.name, - self.charm.unit.name, ] - # Test running status. - self.charm.on.update_status.emit() - _change_master_start_timeout.assert_not_called() - _restart_postgresql_service.assert_not_called() - # Check primary message not being set (current unit is not the primary). + self.charm.on.update_status.emit() _get_primary.assert_called_once() self.assertNotEqual( self.harness.model.unit.status, @@ -178,41 +156,9 @@ def test_on_update_status( ActiveStatus("Primary"), ) - # Test restarting status. 
- self.charm.on.update_status.emit() - _change_master_start_timeout.assert_not_called() - _restart_postgresql_service.assert_called_once() - - # Create a manager mock to check the correct order of the calls. - manager = Mock() - manager.attach_mock(_change_master_start_timeout, "c") - manager.attach_mock(_restart_postgresql_service, "r") - - # Test stopping status. - _restart_postgresql_service.reset_mock() - self.charm.on.update_status.emit() - expected_calls = [call.c(0), call.r(), call.c(None)] - self.assertEqual(manager.mock_calls, expected_calls) - - @patch_network_get(private_address="1.1.1.1") - @patch("charm.Patroni.get_primary") - @patch("charm.PostgresqlOperatorCharm._restart_postgresql_service") - @patch("charm.Patroni.get_postgresql_state") - def test_on_update_status_with_error_on_postgresql_status_check( - self, _get_postgresql_state, _restart_postgresql_service, _ - ): - _get_postgresql_state.side_effect = [RetryError("fake error")] - self.charm.on.update_status.emit() - _restart_postgresql_service.assert_not_called() - self.assertEqual( - self.harness.model.unit.status, - BlockedStatus("failed to check PostgreSQL state with error RetryError[fake error]"), - ) - @patch_network_get(private_address="1.1.1.1") @patch("charm.Patroni.get_primary") - @patch("charm.Patroni.get_postgresql_state") - def test_on_update_status_with_error_on_get_primary(self, _, _get_primary): + def test_on_update_status_with_error_on_get_primary(self, _get_primary): _get_primary.side_effect = [RetryError("fake error")] with self.assertLogs("charm", "ERROR") as logs: diff --git a/tests/unit/test_patroni.py b/tests/unit/test_patroni.py index b66a4705a6..5fec3da7b9 100644 --- a/tests/unit/test_patroni.py +++ b/tests/unit/test_patroni.py @@ -18,28 +18,6 @@ def setUp(self): "postgresql-k8s-0", "postgresql-k8s-0", "test-model", 1, STORAGE_PATH ) - @patch("requests.patch") - def test_change_master_start_timeout(self, _patch): - # Test with an initial timeout value. - self.patroni.change_master_start_timeout(0) - _patch.assert_called_once_with( - "http://postgresql-k8s-0:8008/config", json={"master_start_timeout": 0} - ) - - # Test with another timeout value. - _patch.reset_mock() - self.patroni.change_master_start_timeout(300) - _patch.assert_called_once_with( - "http://postgresql-k8s-0:8008/config", json={"master_start_timeout": 300} - ) - - @patch("requests.get") - def test_get_postgresql_state(self, _get): - _get.return_value.json.return_value = {"state": "running"} - state = self.patroni.get_postgresql_state() - self.assertEqual(state, "running") - _get.assert_called_once_with("http://postgresql-k8s-0:8008/health") - @patch("requests.get") def test_get_primary(self, _get): # Mock Patroni cluster API.
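
Reviewer note (editorial addendum, not part of the patch): the deleted helpers (`get_postgresql_state`, `change_master_start_timeout`, `force_primary_change`, `_restart_postgresql_service`) were thin wrappers around Patroni's REST API that forced an immediate failover while Pebble left zombie PostgreSQL processes behind; with that workaround gone, `_on_update_status` only reports whether the unit is the primary. The snippet below is a minimal standalone sketch of the two REST calls the removed code relied on, shown only for reference while reviewing the removal. `PATRONI_HOST` is a placeholder, and the script is not charm code; the port, paths, and payloads come from the removed lines above.

#!/usr/bin/env python3
# Standalone sketch of the Patroni REST calls the removed workaround used.
# PATRONI_HOST is an assumed placeholder; the charm derived the endpoint from the unit's pod name.
import requests

PATRONI_HOST = "postgresql-k8s-0"

# What get_postgresql_state() did: read the member state ("running", "restarting", ...)
# from Patroni's health endpoint.
state = requests.get(f"http://{PATRONI_HOST}:8008/health").json()["state"]
print(f"member state: {state}")

# What change_master_start_timeout() did: patch the cluster configuration.
# Setting master_start_timeout to 0 forces an immediate failover away from a stuck
# primary; patching it back to None (JSON null) restores the default behaviour.
requests.patch(
    f"http://{PATRONI_HOST}:8008/config",
    json={"master_start_timeout": 0},
)
# ... the workaround restarted the stuck unit's postgresql service at this point ...
requests.patch(
    f"http://{PATRONI_HOST}:8008/config",
    json={"master_start_timeout": None},
)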