Skip to content

Commit

Permalink
Merge pull request #83 from scality/feature/COSI-65-instrument-cosi-d…
Browse files Browse the repository at this point in the history
…rover-with-gprc-metrics

COSI-65, COSI-46, COSI-21: Add GRPC Metrics Instrumentation and Documentation Updates
  • Loading branch information
anurag4DSB authored Jan 7, 2025
2 parents 004bebe + 3006ca8 commit 2c61eca
Show file tree
Hide file tree
Showing 20 changed files with 985 additions and 16 deletions.
100 changes: 100 additions & 0 deletions .github/scripts/e2e_tests_metrics.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/bin/bash
set -o errexit

# Kubernetes coordinates of the metrics service, and the ports used by the
# local port-forward that exposes it to this script.
NAMESPACE="scality-object-storage"
SERVICE="scality-cosi-driver-metrics"
LOCAL_PORT=8080
TARGET_PORT=8080

# Everything the script prints is also appended to this log file; make sure
# its parent directory exists before the first write.
LOG_FILE=".github/e2e_tests/artifacts/logs/e2e_tests/metrics_service.log"
mkdir -p "${LOG_FILE%/*}"

# Name of the Prometheus metric whose per-method samples are validated.
GRPC_METHOD_TO_TEST="grpc_server_msg_sent_total"

# Expected sample value per gRPC method, taken from positional arguments
# $1..$5 (in this order). Omitted arguments fall back to the defaults below.
EXPECTED_CREATE_BUCKET="${1:-0}"
EXPECTED_DELETE_BUCKET="${2:-0}"
EXPECTED_GET_INFO="${3:-1}"
EXPECTED_GRANT_ACCESS="${4:-0}"
EXPECTED_REVOKE_ACCESS="${5:-0}"

#######################################
# Report a failure and abort the script.
# Globals:   LOG_FILE (appended to), BASH_COMMAND (read)
# Outputs:   two diagnostic lines on stdout, mirrored into LOG_FILE
# Returns:   never returns; exits the shell with status 1
#######################################
error_handler() {
  printf '%s\n' "An error occurred during the metrics test. Check the log file for details." | tee -a "$LOG_FILE"
  printf 'Failed command: %s\n' "$BASH_COMMAND" | tee -a "$LOG_FILE"
  exit 1
}

# Invoke the handler whenever a command fails in a way that raises ERR
trap error_handler ERR

#######################################
# Log a command line, run it, and mirror its combined stdout/stderr into the
# log file.
# Globals:   LOG_FILE (appended to)
# Arguments: the command to run followed by its arguments
# Outputs:   "Running: <command>" followed by the command's output, on stdout
#            and appended to LOG_FILE
# Returns:   the exit status of the command itself
#######################################
log_and_run() {
  echo "Running: $*" | tee -a "$LOG_FILE"
  "$@" 2>&1 | tee -a "$LOG_FILE"
  # A pipeline normally reports the status of its last stage (tee), which
  # would hide a failing command from `set -e` and the ERR trap. Report the
  # status of the command that was actually requested instead.
  return "${PIPESTATUS[0]}"
}

# List services so the log shows whether the target service exists
log_and_run kubectl get svc --all-namespaces

# Scratch file for the scraped metrics (mktemp avoids a predictable /tmp name)
METRICS_FILE="$(mktemp)"
PORT_FORWARD_PID=""

# Terminate the kubectl port-forward process, if one is running, and reap it.
stop_port_forward() {
  if [[ -n "$PORT_FORWARD_PID" ]]; then
    kill "$PORT_FORWARD_PID" 2> /dev/null || true
    wait "$PORT_FORWARD_PID" 2> /dev/null || true
    PORT_FORWARD_PID=""
  fi
}

# Release the port-forward and the scratch file on every exit path.
cleanup() {
  stop_port_forward
  rm -f -- "$METRICS_FILE"
}
trap cleanup EXIT

# Port-forward the metrics service. kubectl is started directly (not through
# log_and_run) so that $! is the PID of kubectl itself rather than of a
# wrapper subshell; its output goes to the log file only.
echo "Running: kubectl port-forward -n $NAMESPACE svc/$SERVICE $LOCAL_PORT:$TARGET_PORT" | tee -a "$LOG_FILE"
kubectl port-forward -n "$NAMESPACE" svc/"$SERVICE" "$LOCAL_PORT":"$TARGET_PORT" >> "$LOG_FILE" 2>&1 &
PORT_FORWARD_PID=$!

# Poll until the forwarded port accepts connections. Give up if kubectl has
# exited or after max_attempts polls (300 * 0.1s = ~30s) instead of spinning
# until the CI job itself times out.
attempts=0
max_attempts=300
until nc -z localhost "$LOCAL_PORT" > /dev/null 2>&1; do
  if ! kill -0 "$PORT_FORWARD_PID" 2> /dev/null; then
    echo "Error: kubectl port-forward exited before localhost:$LOCAL_PORT became reachable" | tee -a "$LOG_FILE"
    exit 1
  fi
  attempts=$((attempts + 1))
  if [[ "$attempts" -ge "$max_attempts" ]]; then
    echo "Error: timed out waiting for port-forward on localhost:$LOCAL_PORT" | tee -a "$LOG_FILE"
    exit 1
  fi
  sleep 0.1
done

# Fetch metrics. curl writes straight to the scratch file so that no log
# banner ends up mixed into the scraped data; --fail turns HTTP errors into a
# non-zero exit status.
echo "Running: curl -sS --fail http://localhost:$LOCAL_PORT/metrics" | tee -a "$LOG_FILE"
if ! curl -sS --fail -o "$METRICS_FILE" "http://localhost:$LOCAL_PORT/metrics" 2>> "$LOG_FILE"; then
  echo "Error: failed to fetch metrics from http://localhost:$LOCAL_PORT/metrics" | tee -a "$LOG_FILE"
  exit 1
fi
log_and_run cat "$METRICS_FILE"

echo "Running: kill $PORT_FORWARD_PID" | tee -a "$LOG_FILE"
stop_port_forward

# Keep only the lines that mention the metric under test
if ! METRICS_OUTPUT=$(grep -F -- "$GRPC_METHOD_TO_TEST" "$METRICS_FILE"); then
  echo "Error: no $GRPC_METHOD_TO_TEST samples found in the metrics output" | tee -a "$LOG_FILE"
  exit 1
fi
echo "gRPC Metrics fetched successfully:" | tee -a "$LOG_FILE"
echo "$METRICS_OUTPUT" | tee -a "$LOG_FILE"

# Validate metrics
echo "Validating gRPC Server Metrics..." | tee -a "$LOG_FILE"
seen_methods=" " # space-delimited list of methods that had a sample
while IFS= read -r line; do
  # "# HELP" / "# TYPE" metadata lines also contain the metric name; they
  # carry no sample, so skip them.
  if [[ -z "$line" || "$line" == \#* ]]; then
    continue
  fi

  # Extract the grpc_method label and the sample value (last field)
  method=$(echo "$line" | sed -n 's/.*grpc_method="\([^"]*\)".*/\1/p')
  value=$(echo "$line" | awk '{print $NF}')

  # Determine the expected value based on the grpc_method
  case "$method" in
    "DriverCreateBucket")
      expected_value=$EXPECTED_CREATE_BUCKET
      ;;
    "DriverDeleteBucket")
      expected_value=$EXPECTED_DELETE_BUCKET
      ;;
    "DriverGetInfo")
      expected_value=$EXPECTED_GET_INFO
      ;;
    "DriverGrantBucketAccess")
      expected_value=$EXPECTED_GRANT_ACCESS
      ;;
    "DriverRevokeBucketAccess")
      expected_value=$EXPECTED_REVOKE_ACCESS
      ;;
    *)
      echo "Unknown method: $method. Skipping validation." | tee -a "$LOG_FILE"
      continue
      ;;
  esac
  seen_methods+="$method "

  # Display method, value, and expected value
  echo "Method: $method, Value: $value, Expected: $expected_value" | tee -a "$LOG_FILE"

  # A non-integer value would make the arithmetic comparison below error out
  # and evaluate as "false", i.e. silently pass; reject it explicitly.
  if [[ ! "$value" =~ ^[0-9]+$ ]]; then
    echo "Error: $method has a non-integer value ($value). Expected: $expected_value" | tee -a "$LOG_FILE"
    exit 1
  fi

  # Perform validation
  if [[ "$value" -ne "$expected_value" ]]; then
    echo "Error: $method has an unexpected value ($value). Expected: $expected_value" | tee -a "$LOG_FILE"
    exit 1
  fi
done <<< "$METRICS_OUTPUT"

# A method that was expected to have been called but has no sample at all
# must fail the test too; an absent sample is only acceptable when 0 calls
# were expected.
for expectation in \
  "DriverCreateBucket:$EXPECTED_CREATE_BUCKET" \
  "DriverDeleteBucket:$EXPECTED_DELETE_BUCKET" \
  "DriverGetInfo:$EXPECTED_GET_INFO" \
  "DriverGrantBucketAccess:$EXPECTED_GRANT_ACCESS" \
  "DriverRevokeBucketAccess:$EXPECTED_REVOKE_ACCESS"; do
  method=${expectation%%:*}
  expected_value=${expectation#*:}
  if [[ "$expected_value" != "0" && "$seen_methods" != *" $method "* ]]; then
    echo "Error: no $GRPC_METHOD_TO_TEST sample found for $method. Expected: $expected_value" | tee -a "$LOG_FILE"
    exit 1
  fi
done

echo "Metrics validation successful!" | tee -a "$LOG_FILE"
12 changes: 12 additions & 0 deletions .github/workflows/e2e-feature-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,18 @@ jobs:
run: |
.github/scripts/e2e_tests_brownfield_use_case.sh
# The script takes the expected number of requests for each API as positional arguments,
# in this order: CREATE_BUCKET, DELETE_BUCKET, GET_INFO, GRANT_ACCESS, REVOKE_ACCESS.
# The example below checks for these API counts:
# - 2 CREATE_BUCKET
# - 1 DELETE_BUCKET
# - 1 GET_INFO
# - 2 GRANT_ACCESS
# - 2 REVOKE_ACCESS
- name: E2E tests for metrics using API call metrics generated from above tests
run: |
.github/scripts/e2e_tests_metrics.sh 2 1 1 2 2
- name: "Delay completion"
if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}
uses: scality/actions/[email protected]
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/helm-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,18 @@ jobs:
run: |
.github/scripts/verify_helm_install.sh
# The script takes the expected number of requests for each API as positional arguments,
# in this order: CREATE_BUCKET, DELETE_BUCKET, GET_INFO, GRANT_ACCESS, REVOKE_ACCESS.
# The example below checks for these API counts:
# - 0 CREATE_BUCKET
# - 0 DELETE_BUCKET
# - 1 GET_INFO
# - 0 GRANT_ACCESS
# - 0 REVOKE_ACCESS
- name: Verify metrics for healthcheck route
run: |
.github/scripts/e2e_tests_metrics.sh 0 0 1 0 0
- name: "Delay completion"
if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}
uses: scality/actions/[email protected]
Expand Down
47 changes: 38 additions & 9 deletions cmd/scality-cosi-driver/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,32 @@ import (
"context"
"flag"
"fmt"
"strings"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/scality/cosi-driver/pkg/driver"
"k8s.io/klog/v2"

"github.com/scality/cosi-driver/pkg/grpcfactory"
"github.com/scality/cosi-driver/pkg/metrics"
)

const (
provisionerName = "scality.com"
defaultDriverPrefix = "cosi"
provisionerName = "scality.com"
defaultDriverAddress = "unix:///var/lib/cosi/cosi.sock"
defaultDriverPrefix = "cosi"
defaultMetricsPath = "/metrics"
defaultMetricsPrefix = "scality_cosi_driver"
defaultMetricsAddress = ":8080"
)

var (
driverAddress = flag.String("driver-address", "unix:///var/lib/cosi/cosi.sock", "driver address for the socket")
driverPrefix = flag.String("driver-prefix", "", "prefix for COSI driver, e.g. <prefix>.scality.com")
driverAddress = flag.String("driver-address", defaultDriverAddress, "driver address for the socket file, default: unix:///var/lib/cosi/cosi.sock")
driverPrefix = flag.String("driver-prefix", defaultDriverPrefix, "prefix for COSI driver, e.g. <prefix>.scality.com, default: cosi")
driverMetricsAddress = flag.String("driver-metrics-address", defaultMetricsAddress, "The address (hostname:port) to expose Prometheus metrics, default: 0.0.0.0:8080")
driverMetricsPath = flag.String("driver-metrics-path", defaultMetricsPath, "path for the metrics endpoint, default: /metrics")
driverMetricsPrefix = flag.String("driver-custom-metrics-prefix", defaultMetricsPrefix, "prefix for the metrics, default: scality_cosi_driver")
)

func init() {
Expand All @@ -44,17 +55,29 @@ func init() {
}
flag.Parse()

if *driverPrefix == "" {
*driverPrefix = defaultDriverPrefix
klog.Warning("No driver prefix provided, using default prefix")
// check if driverMetricsPath starts with / if not add it
if !strings.HasPrefix(*driverMetricsPath, "/") {
*driverMetricsPath = "/" + *driverMetricsPath
}

klog.InfoS("COSI driver startup configuration", "driverAddress", *driverAddress, "driverPrefix", *driverPrefix)
klog.InfoS("COSI driver startup configuration",
"driverAddress", *driverAddress,
"driverPrefix", *driverPrefix,
"driverMetricsPath", *driverMetricsPath,
"driverMetricsPrefix", *driverMetricsPrefix,
"driverMetricsAddress", *driverMetricsAddress,
)
}

func run(ctx context.Context) error {
registry := prometheus.NewRegistry()
driverName := *driverPrefix + "." + provisionerName

metricsServer, err := metrics.StartMetricsServerWithRegistry(*driverMetricsAddress, registry, *driverMetricsPath)
if err != nil {
return fmt.Errorf("failed to start metrics server: %w", err)
}

identityServer, bucketProvisioner, err := driver.CreateDriver(ctx, driverName)
if err != nil {
return fmt.Errorf("failed to initialize Scality driver: %w", err)
Expand All @@ -65,5 +88,11 @@ func run(ctx context.Context) error {
return fmt.Errorf("failed to start the provisioner server: %w", err)
}

return server.Run(ctx)
err = server.Run(ctx, registry)
shutdownCtx, _ := context.WithTimeout(context.Background(), 5*time.Second)
if shutdownErr := metricsServer.Shutdown(shutdownCtx); shutdownErr != nil {
klog.ErrorS(shutdownErr, "Failed to gracefully shutdown metrics server")
}

return err
}
34 changes: 34 additions & 0 deletions docs/development/run-cosi-driver-locally.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,40 @@ grpcurl -plaintext -proto cosi.proto -import-path ./proto -unix -d '{
}' ./cosi.sock cosi.v1alpha1.Provisioner.DriverGrantBucketAccess
```

- DriverDeleteBucket gRPC API

```sh
grpcurl -plaintext -proto cosi.proto -import-path ./proto -unix -d '{
"bucket_id": "example-bucket"
}' ./cosi.sock cosi.v1alpha1.Provisioner.DriverDeleteBucket
```

- DriverRevokeBucketAccess gRPC API

```sh
grpcurl -plaintext -proto cosi.proto -import-path ./proto -unix -d '{
"accountId": "user-name",
"bucketId": "example-bucket"
}' ./cosi.sock cosi.v1alpha1.Provisioner.DriverRevokeBucketAccess
```

## Metrics

Query metrics using the `localhost:8080/metrics` endpoint.

Example:

```sh
curl -s localhost:8080/metrics | grep grpc_server_msg_sent_total
# HELP grpc_server_msg_sent_total Total number of gRPC stream messages sent by the server.
# TYPE grpc_server_msg_sent_total counter
grpc_server_msg_sent_total{grpc_method="DriverCreateBucket",grpc_service="cosi.v1alpha1.Provisioner",grpc_type="unary"} 2
grpc_server_msg_sent_total{grpc_method="DriverDeleteBucket",grpc_service="cosi.v1alpha1.Provisioner",grpc_type="unary"} 0
grpc_server_msg_sent_total{grpc_method="DriverGetInfo",grpc_service="cosi.v1alpha1.Identity",grpc_type="unary"} 3
grpc_server_msg_sent_total{grpc_method="DriverGrantBucketAccess",grpc_service="cosi.v1alpha1.Provisioner",grpc_type="unary"} 0
grpc_server_msg_sent_total{grpc_method="DriverRevokeBucketAccess",grpc_service="cosi.v1alpha1.Provisioner",grpc_type="unary"} 0
```

## Troubleshooting

- Socket Not Found: If /var/lib/cosi/cosi.sock is not created, ensure the COSI driver started correctly by checking its logs.
Expand Down
17 changes: 17 additions & 0 deletions docs/driver-params.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,20 @@ The table below details the configuration parameters for BucketClass, which dete
| `tlsCert`| The name of the secret containing the TLS certificate (optional). | `string` | No |

[Example](../cosi-examples/s3-secret-for-cosi.yaml)

## Deployment Parameters for COSI Driver

Below are the deployment parameters for configuring the COSI driver, which can be passed as flags or environment variables.

| **Parameter** | **Description** | **Default Value** | **Required** |
|---------------------------------|----------------------------------------------------------------|----------------------------------|--------------|
| `driver-address` | The socket file address for the COSI driver. | `unix:///var/lib/cosi/cosi.sock` | Yes |
| `driver-prefix` | The prefix for the COSI driver (e.g., `<prefix>.scality.com`). | `cosi` | No |
| `driver-metrics-address` | The address to expose Prometheus metrics. | `:8080` | No |
| `driver-metrics-path` | The HTTP path for exposing metrics. | `/metrics` | No |
| `driver-custom-metrics-prefix` | The prefix for metrics collected by the COSI driver. | `scality_cosi_driver` | No |

### Notes

- If `driver-metrics-path` does not start with `/`, the driver automatically prepends one.
- Prometheus metrics are exposed for monitoring at the address and path specified.
35 changes: 35 additions & 0 deletions docs/metrics-overview.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# COSI Driver Metrics Documentation

This document provides an overview of the Prometheus metrics exposed by the COSI driver. These metrics are designed to help open-source users monitor the performance and operations of the COSI driver. The metrics cover gRPC server calls.

## Metrics Overview

Metrics are exposed at the `/metrics` endpoint (configurable via the `--driver-metrics-path` flag) on the address configured via the `--driver-metrics-address` flag (default: `:8080`). These metrics are Prometheus-compatible and can be used to create dashboards for observability.

---

## gRPC Default Metrics

The COSI driver exposes default gRPC server metrics to monitor RPC activity.

| Metric Name | Description | Labels |
|---------------------------------|------------------------------------------------------------|--------------------------------------------|
| `grpc_server_started_total` | Total number of RPCs started on the server. | `grpc_method`, `grpc_service`, `grpc_type` |
| `grpc_server_handled_total` | Total number of RPCs completed on the server. | `grpc_method`, `grpc_service`, `grpc_type`, `grpc_code` |
| `grpc_server_msg_received_total`| Total number of messages received by the server. | `grpc_method`, `grpc_service`, `grpc_type` |
| `grpc_server_msg_sent_total` | Total number of messages sent by the server. | `grpc_method`, `grpc_service`, `grpc_type` |
| `grpc_server_handling_seconds` | Time taken for RPC calls to be handled by the server. | `grpc_method`, `grpc_service`, `grpc_type` |

### Example gRPC Methods

- Methods: `DriverCreateBucket`, `DriverDeleteBucket`, `DriverGetInfo`, `DriverGrantBucketAccess`, `DriverRevokeBucketAccess`
- Services: `cosi.v1alpha1.Provisioner`, `cosi.v1alpha1.Identity`

```sh
grpc_server_started_total{grpc_method="DriverGetInfo",grpc_service="cosi.v1alpha1.Identity",grpc_type="unary"} 2
```

## Additional Resources

- [gRPC-Go Prometheus Metrics](https://github.com/grpc-ecosystem/go-grpc-middleware)
- [Default Prometheus Metrics](https://pkg.go.dev/github.com/prometheus/client_golang/prometheus#pkg-subdirectories)
9 changes: 9 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ go 1.22.6
require (
github.com/aws/aws-sdk-go-v2/credentials v1.17.47
github.com/aws/smithy-go v1.22.1
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1
github.com/onsi/ginkgo/v2 v2.22.0
github.com/onsi/gomega v1.36.1
github.com/prometheus/client_golang v1.14.0
google.golang.org/grpc v1.69.0
k8s.io/client-go v0.31.3
k8s.io/klog/v2 v2.130.1
Expand All @@ -28,6 +30,13 @@ require (
github.com/aws/aws-sdk-go-v2/service/sso v1.24.7 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.6 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.33.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
)

Expand Down
Loading

0 comments on commit 2c61eca

Please sign in to comment.