Remove Patroni stop/restart workaround (#14)
* Remove Patroni stop/restart workaround

* Update test

* Fix CI workflow

* Add update status interval change
marceloneppel authored Aug 17, 2022
1 parent 62c636f commit 3e1ece8
Showing 5 changed files with 4 additions and 141 deletions.
38 changes: 0 additions & 38 deletions src/charm.py
@@ -400,36 +400,13 @@ def _on_get_primary(self, event: ActionEvent) -> None:
logger.error(f"failed to get primary with error {e}")

def _on_update_status(self, _) -> None:
# Until https://github.com/canonical/pebble/issues/6 is fixed,
# we need to use the logic below to restart the leader
# and a stuck replica after a failover/switchover.
try:
state = self._patroni.get_postgresql_state()
if state == "restarting":
# Restart the stuck replica.
self._restart_postgresql_service()
elif state == "starting" or state == "stopping":
# Force a primary change when the current primary is stuck.
self.force_primary_change()
except RetryError as e:
logger.error("failed to check PostgreSQL state")
self.unit.status = BlockedStatus(f"failed to check PostgreSQL state with error {e}")
return

# Display an active status message if the current unit is the primary.
try:
if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name:
self.unit.status = ActiveStatus("Primary")
except (RetryError, ConnectionError) as e:
logger.error(f"failed to get primary with error {e}")

def _restart_postgresql_service(self) -> None:
"""Restart PostgreSQL and Patroni."""
self.unit.status = MaintenanceStatus(f"restarting {self._postgresql_service} service")
container = self.unit.get_container("postgresql")
container.restart(self._postgresql_service)
self.unit.status = ActiveStatus()

@property
def _patroni(self):
"""Returns an instance of the Patroni object."""
@@ -541,21 +518,6 @@ def _unit_name_to_pod_name(self, unit_name: str) -> str:
"""
return unit_name.replace("/", "-")

def force_primary_change(self) -> None:
"""Force primary changes immediately.
This function is needed to handle cases related to
https://github.com/canonical/pebble/issues/6 .
When a fail-over is started, Patroni gets stuck on the primary
change because of some zombie processes that are not cleaned by Pebble.
"""
# Change needed to force an immediate failover.
self._patroni.change_master_start_timeout(0)
# Restart the stuck previous leader (will trigger an immediate failover).
self._restart_postgresql_service()
# Revert configuration to default.
self._patroni.change_master_start_timeout(None)


if __name__ == "__main__":
main(PostgresqlOperatorCharm, use_juju_for_storage=True)
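
With the workaround gone, the update-status handler reduces to the primary check kept in the context lines above. A minimal sketch of the resulting method, assuming the surrounding charm class and imports stay as shown in the diff:

def _on_update_status(self, _) -> None:
    # Display an active status message if the current unit is the primary.
    try:
        if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name:
            self.unit.status = ActiveStatus("Primary")
    except (RetryError, ConnectionError) as e:
        logger.error(f"failed to get primary with error {e}")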
21 changes: 0 additions & 21 deletions src/patroni.py
@@ -40,17 +40,6 @@ def __init__(
self._storage_path = storage_path
self._planned_units = planned_units

def change_master_start_timeout(self, seconds: int) -> None:
"""Change master start timeout configuration.
Args:
seconds: number of seconds to set in master_start_timeout configuration.
"""
requests.patch(
f"http://{self._endpoint}:8008/config",
json={"master_start_timeout": seconds},
)

def get_primary(self, unit_name_pattern=False) -> str:
"""Get primary instance.
@@ -72,16 +61,6 @@ def get_primary(self, unit_name_pattern=False) -> str:
break
return primary

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_postgresql_state(self) -> str:
"""Get PostgreSQL state.
Returns:
running, restarting or stopping.
"""
r = requests.get(f"http://{self._endpoint}:8008/health")
return r.json()["state"]

@property
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def cluster_members(self) -> set:
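
For reference, the removed helpers drove the workaround through Patroni's REST API on port 8008. A standalone sketch of those two calls, with the endpoint host taken from the unit-test fixture as a placeholder:

import requests

endpoint = "http://postgresql-k8s-0:8008"  # placeholder host from the unit tests

# Equivalent of the removed change_master_start_timeout(seconds).
requests.patch(f"{endpoint}/config", json={"master_start_timeout": 0})

# Equivalent of the removed get_postgresql_state(): returns running, restarting or stopping.
state = requests.get(f"{endpoint}/health").json()["state"]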
2 changes: 0 additions & 2 deletions tests/integration/test_charm.py
@@ -45,8 +45,6 @@ async def test_build_and_deploy(ops_test: OpsTest):
await ops_test.model.deploy(
charm, resources=resources, application_name=APP_NAME, num_units=len(UNIT_IDS), trust=True
)
# Change update status hook interval to be triggered more often
# (it's required to handle https://github.com/canonical/postgresql-k8s-operator/issues/3).
await ops_test.model.set_config({"update-status-hook-interval": "5s"})

await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000)
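
The integration test keeps the faster update-status cadence but drops the comment tying it to the old workaround issue. If a test wants to relax the cadence once the deployment settles, a hedged sketch (the restored interval value is an assumption, not part of this commit):

await ops_test.model.set_config({"update-status-hook-interval": "5s"})
await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000)
# Assumption: relax the interval again after the model is idle.
await ops_test.model.set_config({"update-status-hook-interval": "5m"})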
62 changes: 4 additions & 58 deletions tests/unit/test_charm.py
@@ -2,11 +2,11 @@
# See LICENSE file for licensing details.

import unittest
from unittest.mock import Mock, call, patch
from unittest.mock import Mock, patch

from lightkube import codecs
from lightkube.resources.core_v1 import Pod
from ops.model import ActiveStatus, BlockedStatus
from ops.model import ActiveStatus
from ops.testing import Harness
from tenacity import RetryError

@@ -130,41 +130,19 @@ def test_fail_to_get_primary(self, _get_primary):
_get_primary.assert_called_once()
mock_event.set_results.assert_not_called()

@patch("ops.model.Container.restart")
def test_restart_postgresql_service(self, _restart):
self.charm._restart_postgresql_service()
_restart.assert_called_once_with(self._postgresql_service)
self.assertEqual(
self.harness.model.unit.status,
ActiveStatus(),
)

@patch_network_get(private_address="1.1.1.1")
@patch("charm.Patroni.get_primary")
@patch("charm.PostgresqlOperatorCharm._restart_postgresql_service")
@patch("charm.Patroni.change_master_start_timeout")
@patch("charm.Patroni.get_postgresql_state")
def test_on_update_status(
self,
_get_postgresql_state,
_change_master_start_timeout,
_restart_postgresql_service,
_get_primary,
):
_get_postgresql_state.side_effect = ["running", "running", "restarting", "stopping"]
_get_primary.side_effect = [
"postgresql-k8s/1",
self.charm.unit.name,
self.charm.unit.name,
self.charm.unit.name,
]

# Test running status.
self.charm.on.update_status.emit()
_change_master_start_timeout.assert_not_called()
_restart_postgresql_service.assert_not_called()

# Check primary message not being set (current unit is not the primary).
self.charm.on.update_status.emit()
_get_primary.assert_called_once()
self.assertNotEqual(
self.harness.model.unit.status,
@@ -178,41 +156,9 @@
ActiveStatus("Primary"),
)

# Test restarting status.
self.charm.on.update_status.emit()
_change_master_start_timeout.assert_not_called()
_restart_postgresql_service.assert_called_once()

# Create a manager mock to check the correct order of the calls.
manager = Mock()
manager.attach_mock(_change_master_start_timeout, "c")
manager.attach_mock(_restart_postgresql_service, "r")

# Test stopping status.
_restart_postgresql_service.reset_mock()
self.charm.on.update_status.emit()
expected_calls = [call.c(0), call.r(), call.c(None)]
self.assertEqual(manager.mock_calls, expected_calls)

@patch_network_get(private_address="1.1.1.1")
@patch("charm.Patroni.get_primary")
@patch("charm.PostgresqlOperatorCharm._restart_postgresql_service")
@patch("charm.Patroni.get_postgresql_state")
def test_on_update_status_with_error_on_postgresql_status_check(
self, _get_postgresql_state, _restart_postgresql_service, _
):
_get_postgresql_state.side_effect = [RetryError("fake error")]
self.charm.on.update_status.emit()
_restart_postgresql_service.assert_not_called()
self.assertEqual(
self.harness.model.unit.status,
BlockedStatus("failed to check PostgreSQL state with error RetryError[fake error]"),
)

@patch_network_get(private_address="1.1.1.1")
@patch("charm.Patroni.get_primary")
@patch("charm.Patroni.get_postgresql_state")
def test_on_update_status_with_error_on_get_primary(self, _, _get_primary):
def test_on_update_status_with_error_on_get_primary(self, _get_primary):
_get_primary.side_effect = [RetryError("fake error")]

with self.assertLogs("charm", "ERROR") as logs:
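
After these deletions, the unit test no longer needs the Patroni state and restart mocks. A sketch of the trimmed test_on_update_status, under the assumption that only the two primary-check emissions remain:

@patch_network_get(private_address="1.1.1.1")
@patch("charm.Patroni.get_primary")
def test_on_update_status(self, _get_primary):
    _get_primary.side_effect = ["postgresql-k8s/1", self.charm.unit.name]

    # Check primary message not being set (current unit is not the primary).
    self.charm.on.update_status.emit()
    self.assertNotEqual(self.harness.model.unit.status, ActiveStatus("Primary"))

    # Check primary message being set (current unit is the primary).
    self.charm.on.update_status.emit()
    self.assertEqual(self.harness.model.unit.status, ActiveStatus("Primary"))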
22 changes: 0 additions & 22 deletions tests/unit/test_patroni.py
@@ -18,28 +18,6 @@ def setUp(self):
"postgresql-k8s-0", "postgresql-k8s-0", "test-model", 1, STORAGE_PATH
)

@patch("requests.patch")
def test_change_master_start_timeout(self, _patch):
# Test with an initial timeout value.
self.patroni.change_master_start_timeout(0)
_patch.assert_called_once_with(
"http://postgresql-k8s-0:8008/config", json={"master_start_timeout": 0}
)

# Test with another timeout value.
_patch.reset_mock()
self.patroni.change_master_start_timeout(300)
_patch.assert_called_once_with(
"http://postgresql-k8s-0:8008/config", json={"master_start_timeout": 300}
)

@patch("requests.get")
def test_get_postgresql_state(self, _get):
_get.return_value.json.return_value = {"state": "running"}
state = self.patroni.get_postgresql_state()
self.assertEqual(state, "running")
_get.assert_called_once_with("http://postgresql-k8s-0:8008/health")

@patch("requests.get")
def test_get_primary(self, _get):
# Mock Patroni cluster API.
