From 3a5550ab28cea24230441ee5e33b346ad9a0f6ee Mon Sep 17 00:00:00 2001 From: Chris Smith <1979423+chris13524@users.noreply.github.com> Date: Thu, 29 Feb 2024 16:54:02 -0700 Subject: [PATCH] fix: increase relay latency alarm threshold (#391) --- .../panels/app/http_request_latency.libsonnet | 21 ++++++++++++++++++- .../relay_incoming_message_latency.libsonnet | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/terraform/monitoring/panels/app/http_request_latency.libsonnet b/terraform/monitoring/panels/app/http_request_latency.libsonnet index 7d578426..82ceb3de 100644 --- a/terraform/monitoring/panels/app/http_request_latency.libsonnet +++ b/terraform/monitoring/panels/app/http_request_latency.libsonnet @@ -21,6 +21,25 @@ local targets = grafana.targets; message = '%(env)s - HTTP request latency too high' % { env: vars.environment }, notifications = vars.notifications, noDataState = 'no_data', + conditions = [ + grafana.alertCondition.new( + evaluatorParams = [ 10000 ], + evaluatorType = 'gt', + operatorType = 'or', + queryRefId = 'HttpRequestLatency', + queryTimeStart = '5m', + queryTimeEnd = 'now', + reducerType = grafana.alert_reducers.Avg + ), + ], + )) + + .setAlert(vars.environment, grafana.alert.new( + namespace = vars.namespace, + name = '%(env)s - HTTP (filtered) request latency too high' % { env: vars.environment }, + message = '%(env)s - HTTP (filtered) request latency too high' % { env: vars.environment }, + notifications = vars.notifications, + noDataState = 'no_data', conditions = [ grafana.alertCondition.new( evaluatorParams = [ 2000 ], @@ -44,7 +63,7 @@ local targets = grafana.targets; .addTarget(targets.prometheus( datasource = ds.prometheus, - expr = 'sum by (aws_ecs_task_revision, method, endpoint) (rate(http_request_latency_sum{endpoint!="/:project_id/subscribers"}[$__rate_interval])) / sum by (aws_ecs_task_revision, method, endpoint) (rate(http_request_latency_count{endpoint!="/:project_id/subscribers"}[$__rate_interval]))', + 
expr = 'sum by (aws_ecs_task_revision, method, endpoint) (rate(http_request_latency_sum{endpoint!~"^(/:project_id/subscribers|/v1/relay-webhook)"}[$__rate_interval])) / sum by (aws_ecs_task_revision, method, endpoint) (rate(http_request_latency_count{endpoint!~"^(/:project_id/subscribers|/v1/relay-webhook)"}[$__rate_interval]))', legendFormat = '{{method}} {{endpoint}} r{{aws_ecs_task_revision}}', exemplar = false, refId = 'FilteredHttpRequestLatency', diff --git a/terraform/monitoring/panels/app/relay_incoming_message_latency.libsonnet b/terraform/monitoring/panels/app/relay_incoming_message_latency.libsonnet index 55ff5c61..d6e5d82f 100644 --- a/terraform/monitoring/panels/app/relay_incoming_message_latency.libsonnet +++ b/terraform/monitoring/panels/app/relay_incoming_message_latency.libsonnet @@ -23,7 +23,7 @@ local targets = grafana.targets; noDataState = 'no_data', conditions = [ grafana.alertCondition.new( - evaluatorParams = [ 5000 ], + evaluatorParams = [ 10000 ], evaluatorType = 'gt', operatorType = 'or', queryRefId = 'RelayIncomingMessageLatency',