Skip to content

Commit

Permalink
feat: add hugepages into the HostMemoryFull alert expression (#230)
Browse files Browse the repository at this point in the history
* feat: add hugepages into the HostMemoryFull alert expression

* flip expression

* tox fmt and cos_agent bump

* fix scenario

* address pr comments
  • Loading branch information
lucabello authored Jan 14, 2025
1 parent 20af835 commit 4972c50
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 6 deletions.
7 changes: 3 additions & 4 deletions lib/charms/grafana_agent/v0/cos_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ class _MetricsEndpointDict(TypedDict):

LIBID = "dc15fa84cef84ce58155fb84f6c6213a"
LIBAPI = 0
LIBPATCH = 12
LIBPATCH = 13

PYDEPS = ["cosl", "pydantic"]

Expand Down Expand Up @@ -767,7 +767,7 @@ def is_ready(self, relation: Optional[Relation] = None):
"""Is this endpoint ready?"""
relation = relation or self._relation
if not relation:
logger.debug(f"no relation on {self._relation_name !r}: tracing not ready")
logger.debug(f"no relation on {self._relation_name!r}: tracing not ready")
return False
if relation.data is None:
logger.error(f"relation data is None for {relation}")
Expand Down Expand Up @@ -1029,8 +1029,7 @@ def _get_requested_protocols(self, relation: Relation):
if len(units) > 1:
# should never happen
raise ValueError(
f"unexpected error: subordinate relation {relation} "
f"should have exactly one unit"
f"unexpected error: subordinate relation {relation} should have exactly one unit"
)

unit = next(iter(units), None)
Expand Down
8 changes: 7 additions & 1 deletion src/prometheus_alert_rules/memory.rules
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@ groups:
LABELS = {{ $labels }}
The 5-minute-ahead prediction is made as a linear regression from the last 30 minutes of data.
- alert: HostMemoryFull
expr: avg_over_time(node_memory_MemUsed_percentage[1m]) > 95
# Averaging each metric first and then combining the averages is more robust (less noisy) than computing the average at the end
expr: |
100 * avg_over_time(node_memory_MemFree_bytes[1m]) /
(
avg_over_time(node_memory_MemTotal_bytes[1m])
- avg_over_time(node_memory_Hugetlb_bytes[1m])
) < 10
for: 2m
labels:
severity: critical
Expand Down
2 changes: 1 addition & 1 deletion tests/scenario/test_alert_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def test_metrics_alert_rule_labels(charm_config):
)
for group in alert_rules["groups"]:
for rule in group["rules"]:
if "grafana-agent_alertgroup_alerts" in group["name"]:
if "grafana_agent_alertgroup_alerts" in group["name"]:
assert (
rule["labels"]["juju_application"] == "primary"
or rule["labels"]["juju_application"] == "subordinate"
Expand Down

0 comments on commit 4972c50

Please sign in to comment.