From 902c8dea8a311234d7408364b3d22d3b0c2b076b Mon Sep 17 00:00:00 2001 From: Cosmin Tupangiu Date: Tue, 14 Jan 2025 14:56:58 +0100 Subject: [PATCH] metrics: Add basic API, agent and ova metrics Metrics for api, agent and ova are implemented. For API (agent and api_server) a middleware for chi router is used giving basic metrics. Agent: count the agents by status OVA: count the downloads by status (failed or success) Basic grafana dashboard is added. Signed-off-by: Cosmin Tupangiu --- Makefile | 6 + cmd/planner-api/run.go | 12 + dashboard/grafana.json | 1593 +++++++++++++++++++++ deploy/observability.yml | 44 + go.mod | 10 +- go.sum | 11 +- internal/agent/service/collector.go | 1 + internal/api_server/agentserver/server.go | 6 + internal/api_server/metrics_server.go | 54 + internal/api_server/server.go | 6 + internal/service/agent/handler.go | 34 + internal/service/image.go | 5 + pkg/metrics/handler.go | 25 + pkg/metrics/metrics.go | 71 + 14 files changed, 1871 insertions(+), 7 deletions(-) create mode 100644 dashboard/grafana.json create mode 100644 deploy/observability.yml create mode 100644 internal/api_server/metrics_server.go create mode 100644 pkg/metrics/handler.go create mode 100644 pkg/metrics/metrics.go diff --git a/Makefile b/Makefile index c86b1f9..4ceef1a 100644 --- a/Makefile +++ b/Makefile @@ -144,6 +144,12 @@ deploy-on-openshift: oc create secret generic migration-planner-secret -n ${MIGRATION_PLANNER_NAMESPACE} --from-literal=config_server=http://$$config_server --from-literal=config_server_ui=https://$$config_server_ui/migrate/wizard || true ls deploy/k8s | awk '! 
/secret|service|template/' | xargs -I {} oc apply -n ${MIGRATION_PLANNER_NAMESPACE} -f deploy/k8s/{} +deploy-local-obs: + @podman play kube --network host deploy/observability.yml + +undeploy-local-obs: + @podman kube down deploy/observability.yml + undeploy-on-openshift: oc delete route planner || true oc delete route planner-agent || true diff --git a/cmd/planner-api/run.go b/cmd/planner-api/run.go index 5de6de3..587a4f9 100644 --- a/cmd/planner-api/run.go +++ b/cmd/planner-api/run.go @@ -96,6 +96,18 @@ var runCmd = &cobra.Command{ } }() + go func() { + defer cancel() + listener, err := newListener("0.0.0.0:8080") + if err != nil { + zap.S().Named("metrics_server").Fatalf("creating listener: %s", err) + } + metricsServer := apiserver.NewMetricServer("0.0.0.0:8080", listener) + if err := metricsServer.Run(ctx); err != nil { + zap.S().Named("metrics_server").Fatalf("failed to run metrics server: %s", err) + } + }() + <-ctx.Done() _ = ep.Close() diff --git a/dashboard/grafana.json b/dashboard/grafana.json new file mode 100644 index 0000000..ea20c0b --- /dev/null +++ b/dashboard/grafana.json @@ -0,0 +1,1593 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 22, + "panels": [], + "title": "API overall", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "The percentage of time that the API is available (not returning 500 Internal Service Errors).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "max": 1, + "min": 0, + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(increase(chi_requests_total{service=\"api_server\",code!~\"5..|0\"}[$__range]))\n/\nsum(increase(chi_requests_total{service=\"api_server\"}[$__range]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Availability (selected time) ", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "Number of requests per second based on response code", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + 
"unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 4, + "y": 1 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (code) (rate(chi_requests_total{service=\"api_server\"}[$__range]))", + "legendFormat": "{{code}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "Number of requests per second based on response code", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 11, + "y": 1 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": 
"sum(rate(chi_requests_total{service=\"api_server\", code!~\"2..\"}[$__range])) / sum(rate(chi_requests_total{service=\"api_server\"}[$__range]))", + "legendFormat": "non 2xx", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "editorMode": "code", + "expr": "sum(rate(chi_requests_total{service=\"api_server\", code=~\"5.0\"}[$__range])) / sum(rate(chi_requests_total{service=\"api_server\"}[$__range]))", + "hide": false, + "instant": false, + "legendFormat": "5xx", + "range": true, + "refId": "B" + } + ], + "title": "Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "The request duration within which the API have served 99%, 95%, 50% of requests", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 17, + "y": 1 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.4.0", + 
"targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(chi_request_duration_milliseconds_bucket{service=\"api_server\", code!~\"5..|0\"}[5m])))", + "legendFormat": "99th", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by (le) (rate(chi_request_duration_milliseconds_bucket{service=\"api_server\", code!~\"5..|0\"}[5m])))", + "hide": false, + "instant": false, + "legendFormat": "90th", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le) (rate(chi_request_duration_milliseconds_bucket{service=\"api_server\", code!~\"5..|0\"}[5m])))", + "hide": false, + "instant": false, + "legendFormat": "50th", + "range": true, + "refId": "C" + } + ], + "title": "Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "The percentage of time that the API responds within 300ms.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 5 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": 
"sum(increase(chi_request_duration_milliseconds_bucket{service=\"api_server\", code!~\"5..|0\",le=\"300.0\"}[$__range]))\n/\nsum(increase(chi_request_duration_milliseconds_count{service=\"api_server\", code!~\"5..|0\"}[$__range]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Latency (<300ms)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 13, + "panels": [], + "title": "Agent API overall", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "The percentage of time that the API is available (not returning 500 Internal Service Errors).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 10 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(increase(chi_requests_total{service=~\"agent_server\",code!~\"5..|0\",path!~\"/health\"}[$__range]))\n/\nsum(increase(chi_requests_total{service=~\"agent_server\",path!~\"/health\"}[$__range]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Availability (selected time) ", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "Number of requests per second based on response 
code", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 4, + "y": 10 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (code) (rate(chi_requests_total{service=\"agent_server\",path!~\"/health\"}[$__range]))", + "legendFormat": "{{code}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "Number of requests per second based on response code", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { 
+ "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 11, + "y": 10 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(chi_requests_total{service=\"agent_server\",path!~\"/health\", code!~\"2..\"}[$__range])) / sum(rate(chi_requests_total{service=\"agent_server\",path!~\"/health\"}[$__range]))", + "legendFormat": "non 2xx", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "editorMode": "code", + "expr": "sum(rate(chi_requests_total{service=\"agent_server\",path!~\"/health\", code=~\"5.0\"}[$__range])) / sum(rate(chi_requests_total{service=\"agent_server\",path!~\"/health\"}[$__range]))", + "hide": false, + "instant": false, + "legendFormat": "5xx", + "range": true, + "refId": "B" + } + ], + "title": "Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "The request duration within which the API have served 99%, 95%, 50% of requests", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 17, + "y": 10 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(chi_request_duration_milliseconds_bucket{service=\"agent_server\", code!~\"5..|0\"}[5m])))", + "legendFormat": "99th", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by (le) (rate(chi_request_duration_milliseconds_bucket{service=\"agent_server\", code!~\"5..|0\"}[5m])))", + "hide": false, + "instant": false, + "legendFormat": "90th", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le) (rate(chi_request_duration_milliseconds_bucket{service=\"agent_server\", code!~\"5..|0\"}[5m])))", + "hide": false, + "instant": false, + "legendFormat": "50th", + "range": true, + "refId": "C" + } + ], + "title": "Duration", + "type": "timeseries" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "The percentage of time that the API responds within 300ms.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 14 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(increase(chi_request_duration_milliseconds_bucket{service=\"agent_server\", code!~\"5..|0\",path!~\"/health\",le=\"300.0\"}[$__range]))\n/\nsum(increase(chi_request_duration_milliseconds_count{service=\"agent_server\", code!~\"5..|0\",path!~\"/health\"}[$__range]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Latency (<300ms)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 7, + "panels": [], + "title": "Ova download metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 19 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": 
"none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "increase(assisted_migration_ova_downloads{state=\"successful\"}[5m])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total downloads in the last 5m", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 20, + "x": 4, + "y": 19 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "increase(assisted_migration_ova_downloads{state=\"successful\"}[$__range])", + "legendFormat": "{{state}}", + "range": true, + 
"refId": "A" + } + ], + "title": "Ova downloads over time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 23 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "max_over_time(assisted_migration_ova_downloads{state=\"successful\"}[24h])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Successfull downloads in the last 24h", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 2, + "panels": [], + "title": "Agent metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "Current number of agents in up-to-date state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 28 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + 
"values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "assisted_migration_agent_status_count{state=\"up-to-date\"}", + "legendFormat": "{{state}}", + "range": true, + "refId": "A" + } + ], + "title": "Agents in up-to-date state", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "Current number of agents in gathering-inventory state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 28 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": { + "titleSize": 20 + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "assisted_migration_agent_status_count{state=\"gathering-initial-inventory\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Agents in gathering-inventory state", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 3, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": 
false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 18, + "x": 6, + "y": 28 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "editorMode": "code", + "expr": "assisted_migration_agent_status_count", + "legendFormat": "{{state}}", + "range": true, + "refId": "A" + } + ], + "title": "Histogram of agents' states", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "Current number of agents in waiting-for-credential state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 32 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + 
"expr": 
"assisted_migration_agent_status_count{state=\"waiting-for-credentials\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Agents in waiting-for-credential state", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe9y7urz5l5hcc" + }, + "description": "Current number of agents in error state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 32 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "assisted_migration_agent_status_count{state=\"error\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Agents in error state", + "type": "stat" + } + ], + "preload": false, + "refresh": "5s", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "assisted_migrations", + "uid": "de9yb0qpl8dmoc", + "version": 33, + "weekStart": "" +} + diff --git a/deploy/observability.yml b/deploy/observability.yml new file mode 100644 index 0000000..8d57130 --- /dev/null +++ b/deploy/observability.yml @@ -0,0 +1,44 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config +data: + prometheus.yml: | + global: + scrape_interval: 15s + + scrape_configs: + - job_name: 'am' + metrics_path: /metrics + static_configs: + - targets: [ + 
'localhost:8080', + ] +--- +apiVersion: v1 +kind: Pod +metadata: + name: prometheus-streamer +spec: + containers: + - name: prometheus + image: docker.io/prom/prometheus + volumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: grafana + image: docker.io/grafana/grafana-oss + volumeMounts: + - name: grafana-data + mountPath: /var/lib/grafana + volumes: + - name: prometheus-config + configMap: + name: prometheus-config + items: + - key: prometheus.yml + path: prometheus.yml + - name: grafana-data + persistentVolumeClaim: + claimName: grafana-data diff --git a/go.mod b/go.mod index 23c4bbd..539cd30 100644 --- a/go.mod +++ b/go.mod @@ -23,10 +23,12 @@ require ( github.com/onsi/ginkgo/v2 v2.15.0 github.com/onsi/gomega v1.32.0 github.com/openshift/assisted-image-service v0.0.0-20240827125623-ad5c4b36a817 + github.com/prometheus/client_golang v1.20.5 github.com/sirupsen/logrus v1.9.3 github.com/spf13/cobra v1.8.1 github.com/spf13/pflag v1.0.6-0.20210604193023-d5e0c0615ace github.com/thoas/go-funk v0.9.3 + github.com/toshi0607/chi-prometheus v0.1.4 github.com/vmware/govmomi v0.39.0 go.uber.org/zap v1.27.0 golang.org/x/sync v0.10.0 @@ -44,9 +46,11 @@ require ( github.com/ajg/form v1.5.1 // indirect github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/aws/aws-sdk-go v1.50.25 // indirect + github.com/beorn7/perks v1.0.1 // indirect github.com/bytedance/sonic v1.12.6 // indirect github.com/bytedance/sonic/loader v0.2.1 // indirect github.com/cavaliercoder/go-cpio v0.0.0-20180626203310-925f9528c45e // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/clarketm/json v1.17.1 // indirect github.com/cloudevents/sdk-go/protocol/kafka_sarama/v2 v2.15.2 // indirect github.com/cloudwego/base64x v0.1.4 // indirect @@ -127,6 +131,9 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pkg/xattr v0.4.9 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 
// indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 // indirect github.com/stretchr/testify v1.9.0 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect @@ -137,13 +144,12 @@ require ( golang.org/x/arch v0.12.0 // indirect golang.org/x/crypto v0.31.0 // indirect golang.org/x/net v0.33.0 // indirect - golang.org/x/oauth2 v0.17.0 // indirect + golang.org/x/oauth2 v0.21.0 // indirect golang.org/x/sys v0.28.0 // indirect golang.org/x/term v0.27.0 // indirect golang.org/x/text v0.21.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - google.golang.org/appengine v1.6.8 // indirect google.golang.org/protobuf v1.36.1 // indirect gopkg.in/djherbis/times.v1 v1.3.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index bf48001..83b1af7 100644 --- a/go.sum +++ b/go.sum @@ -280,6 +280,8 @@ github.com/kubev2v/forklift v0.0.0-20241129095927-4890e072e015 h1:lKoe5Sy+faux6g github.com/kubev2v/forklift v0.0.0-20241129095927-4890e072e015/go.mod h1:fHaGLhv09dWXKv0/0GNl3rgLe/KH5Y6IyG6eGLYaA6k= github.com/kubev2v/migration-event-streamer v0.0.0-20241125102656-9cdf9e64a16b h1:xOHUPs9sVGie2EpTZDfSsxUPZHMBgm8XYvthdAMzJD4= github.com/kubev2v/migration-event-streamer v0.0.0-20241125102656-9cdf9e64a16b/go.mod h1:xpo9o779xi1mM0142E8KqVc205ahAP1wswoN+DKbX8E= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= github.com/leosunmo/zapchi v0.2.0 h1:BSX9FIcPbgVBgMgVBAfN0CrLWv012tjFcqSTfDDUYyY= @@ -399,6 +401,8 @@ 
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/thoas/go-funk v0.9.3 h1:7+nAEx3kn5ZJcnDm2Bh23N2yOtweO14bi//dvRtgLpw= github.com/thoas/go-funk v0.9.3/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q= +github.com/toshi0607/chi-prometheus v0.1.4 h1:5KpqJrmdvMvbfU0JiL9ghOTbe8S9sgHDCCQvXgnyoJo= +github.com/toshi0607/chi-prometheus v0.1.4/go.mod h1:E++tBjqpDsvGWjLYdcFd5rvqJ7HG8wwBux+M6gyIL/Q= github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= @@ -480,8 +484,8 @@ golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.17.0 h1:6m3ZPmLEFdVxKKWnKq4VqZ60gutO35zm+zrAHVmHyDQ= -golang.org/x/oauth2 v0.17.0/go.mod h1:OzPDGQiuQMguemayvdylqddI7qcD9lnSDb+1FiwQ5HA= +golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= +golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -534,7 +538,6 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= @@ -568,8 +571,6 @@ gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= -google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= diff --git a/internal/agent/service/collector.go b/internal/agent/service/collector.go index aa4b2eb..9be5c30 100644 --- a/internal/agent/service/collector.go +++ b/internal/agent/service/collector.go @@ -136,6 +136,7 @@ func (c *Collector) run() { } zap.S().Named("collector").Infof("Create inventory") + inv := createBasicInventoryObj(about.InstanceUuid, vms, collector, hosts, clusters) zap.S().Named("collector").Infof("Run the validation of VMs") diff --git a/internal/api_server/agentserver/server.go b/internal/api_server/agentserver/server.go index 506cd9d..190bca2 100644 --- 
a/internal/api_server/agentserver/server.go +++ b/internal/api_server/agentserver/server.go @@ -19,6 +19,7 @@ import ( "github.com/kubev2v/migration-planner/internal/store" "github.com/leosunmo/zapchi" oapimiddleware "github.com/oapi-codegen/nethttp-middleware" + chiprometheus "github.com/toshi0607/chi-prometheus" "go.uber.org/zap" ) @@ -71,7 +72,12 @@ func (s *AgentServer) Run(ctx context.Context) error { } router := chi.NewRouter() + + metricMiddleware := chiprometheus.New("agent_server") + metricMiddleware.MustRegisterDefault() + router.Use( + metricMiddleware.Handler, authenticator.Authenticator, middleware.RequestID, zapchi.Logger(zap.S(), "router_agent"), diff --git a/internal/api_server/metrics_server.go b/internal/api_server/metrics_server.go new file mode 100644 index 0000000..7eb0513 --- /dev/null +++ b/internal/api_server/metrics_server.go @@ -0,0 +1,54 @@ +package apiserver + +import ( + "context" + "errors" + "net" + "net/http" + + "github.com/go-chi/chi" + "github.com/kubev2v/migration-planner/pkg/metrics" + "go.uber.org/zap" +) + +type MetricServer struct { + bindAddress string + httpServer *http.Server + listener net.Listener +} + +func NewMetricServer(bindAddress string, listener net.Listener) *MetricServer { + router := chi.NewRouter() + + prometheusMetricHandler := metrics.NewPrometheusMetricsHandler() + router.Handle("/metrics", prometheusMetricHandler.Handler()) + + s := &MetricServer{ + bindAddress: bindAddress, + listener: listener, + httpServer: &http.Server{ + Addr: bindAddress, + Handler: router, + }, + } + + return s +} + +func (m *MetricServer) Run(ctx context.Context) error { + go func() { + <-ctx.Done() + ctxTimeout, cancel := context.WithTimeout(context.Background(), gracefulShutdownTimeout) + defer cancel() + + m.httpServer.SetKeepAlivesEnabled(false) + _ = m.httpServer.Shutdown(ctxTimeout) + zap.S().Named("metrics_server").Info("metrics server terminated") + }() + + zap.S().Named("metrics_server").Infof("serving metrics: %s", 
m.bindAddress) + if err := m.httpServer.Serve(m.listener); err != nil && !errors.Is(err, net.ErrClosed) { + return err + } + return nil +} diff --git a/internal/api_server/server.go b/internal/api_server/server.go index 414d36e..38d6b31 100644 --- a/internal/api_server/server.go +++ b/internal/api_server/server.go @@ -20,6 +20,7 @@ import ( "github.com/kubev2v/migration-planner/internal/store" "github.com/leosunmo/zapchi" oapimiddleware "github.com/oapi-codegen/nethttp-middleware" + chiprometheus "github.com/toshi0607/chi-prometheus" "go.uber.org/zap" ) @@ -82,7 +83,12 @@ func (s *Server) Run(ctx context.Context) error { } router := chi.NewRouter() + + metricMiddleware := chiprometheus.New("api_server") + metricMiddleware.MustRegisterDefault() + router.Use( + metricMiddleware.Handler, authenticator.Authenticator, middleware.RequestID, zapchi.Logger(zap.S(), "router_api"), diff --git a/internal/service/agent/handler.go b/internal/service/agent/handler.go index 3cdd3cc..69c4c3c 100644 --- a/internal/service/agent/handler.go +++ b/internal/service/agent/handler.go @@ -13,6 +13,7 @@ import ( "github.com/kubev2v/migration-planner/internal/events" "github.com/kubev2v/migration-planner/internal/service/mappers" "github.com/kubev2v/migration-planner/internal/store" + "github.com/kubev2v/migration-planner/pkg/metrics" "go.uber.org/zap" ) @@ -170,9 +171,42 @@ func (h *AgentServiceHandler) UpdateAgentStatus(ctx context.Context, request age zap.S().Named("agent_handler").Errorw("failed to write event", "error", err, "event_kind", kind) } + // must not block here. 
+ // don't care about errors or context + go h.updateMetrics() + return agentServer.UpdateAgentStatus200Response{}, nil } +// updateMetrics refreshes the agent metrics for every agent state. +// It lists all the agents and updates the metric of each agent state with the number of agents in it. +func (h *AgentServiceHandler) updateMetrics() { + agents, err := h.store.Agent().List(context.TODO(), store.NewAgentQueryFilter(), store.NewAgentQueryOptions()) + if err != nil { + zap.S().Named("agent_handler").Warnf("failed to update agent metrics: %s", err) + return + } + // holds the total number of agents by state + // set defaults + states := map[string]int{ + string(api.AgentStatusUpToDate): 0, + string(api.AgentStatusError): 0, + string(api.AgentStatusWaitingForCredentials): 0, + string(api.AgentStatusGatheringInitialInventory): 0, + } + for _, a := range agents { + if count, ok := states[a.Status]; ok { + count += 1 + states[a.Status] = count + continue + } + states[a.Status] = 1 + } + for k, v := range states { + metrics.UpdateAgentStateCounterMetric(k, v) + } +} + func (h *AgentServiceHandler) newAgentEvent(agent api.Agent) (string, io.Reader) { event := events.AgentEvent{ AgentID: agent.Id, diff --git a/internal/service/image.go b/internal/service/image.go index f87d0c3..070f12e 100644 --- a/internal/service/image.go +++ b/internal/service/image.go @@ -10,6 +10,7 @@ import ( "github.com/kubev2v/migration-planner/internal/api/server" "github.com/kubev2v/migration-planner/internal/auth" "github.com/kubev2v/migration-planner/internal/image" + "github.com/kubev2v/migration-planner/pkg/metrics" ) func (h *ServiceHandler) GetImage(ctx context.Context, request server.GetImageRequestObject) (server.GetImageResponseObject, error) { @@ -36,8 +37,12 @@ func (h *ServiceHandler) GetImage(ctx context.Context, request server.GetImageRe // Generate the OVA image if err := ova.Generate(); err != nil { + metrics.IncreaseOvaDownloadsTotalMetric("failed") return server.GetImage500JSONResponse{Message: fmt.Sprintf("error generating image %s", err)},
nil } + + metrics.IncreaseOvaDownloadsTotalMetric("successful") + return server.GetImage200ApplicationoctetStreamResponse{Body: bytes.NewReader([]byte{})}, nil } diff --git a/pkg/metrics/handler.go b/pkg/metrics/handler.go new file mode 100644 index 0000000..a3b67ed --- /dev/null +++ b/pkg/metrics/handler.go @@ -0,0 +1,25 @@ +package metrics + +import ( + "net/http" + + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +type prometheusMetricsHandler struct { +} + +// NewPrometheusMetricsHandler returns a handler that proxies requests to the prometheus handler. +func NewPrometheusMetricsHandler() *prometheusMetricsHandler { + return &prometheusMetricsHandler{} +} + +// Handler returns an http.Handler that delegates every request to promhttp.Handler(). + +func (h *prometheusMetricsHandler) Handler() http.Handler { + handler := promhttp.Handler() + + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handler.ServeHTTP(w, r) + }) +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 0000000..ab7033c --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,71 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +const ( + assistedMigration = "assisted_migration" + + // Ova metrics + ovaDownloadsTotal = "ova_downloads_total" + + // Agent metrics + AgentStatusCount = "agent_status_count" + + // Labels + agentStateLabel = "state" + ovaDownloadStatusLabel = "state" +) + +var agentStateCountLabels = []string{ + agentStateLabel, +} + +var ovaDownloadTotalLabels = []string{ + ovaDownloadStatusLabel, +} + +/** +* Metrics definition +**/ +var ovaDownloadsTotalMetric = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: assistedMigration, + Name: ovaDownloadsTotal, + Help: "number of total ova downloads", + }, + ovaDownloadTotalLabels, +) + +var agentStatusCountMetric = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: assistedMigration, + Name: AgentStatusCount, + Help: "metrics to record the number of agents
in each status", + }, + agentStateCountLabels, +) + +func IncreaseOvaDownloadsTotalMetric(state string) { + labels := prometheus.Labels{ + ovaDownloadStatusLabel: state, + } + ovaDownloadsTotalMetric.With(labels).Inc() +} + +func UpdateAgentStateCounterMetric(state string, count int) { + labels := prometheus.Labels{ + agentStateLabel: state, + } + agentStatusCountMetric.With(labels).Set(float64(count)) +} + +func init() { + registerMetrics() +} + +func registerMetrics() { + prometheus.MustRegister(ovaDownloadsTotalMetric) + prometheus.MustRegister(agentStatusCountMetric) +}