Skip to content

Commit

Permalink
Add work around step to the functional test setup function to use a n…
Browse files Browse the repository at this point in the history
…ew client to load newly deployed CRDs

Add work around step to the functional test setup function to use a new client to load newly deployed CRDs
  • Loading branch information
jvoravong committed Dec 20, 2024
1 parent b49ef63 commit c4cde75
Show file tree
Hide file tree
Showing 8 changed files with 227 additions and 17 deletions.
12 changes: 12 additions & 0 deletions .chloggen/fix-operator-install-operations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix
# The name of the component, or a single word describing the area of concern, (e.g. agent, clusterReceiver, gateway, operator, chart, other)
component: operator
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Additional fixes for issues where sometimes certificates and Instrumentation opentelemetry.io/v1alpha1 are installed too early
# One or more tracking issues related to the change
issues: [1586]
# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ kind: Certificate
metadata:
annotations:
helm.sh/hook: post-install,post-upgrade
helm.sh/hook-weight: "1"
helm.sh/hook-weight: "3"
labels:
helm.sh/chart: operator-0.71.2
app.kubernetes.io/name: operator
Expand Down Expand Up @@ -34,7 +34,7 @@ kind: Issuer
metadata:
annotations:
helm.sh/hook: post-install,post-upgrade
helm.sh/hook-weight: "1"
helm.sh/hook-weight: "2"
labels:
helm.sh/chart: operator-0.71.2
app.kubernetes.io/name: operator
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
---
# Source: splunk-otel-collector/templates/operator/job-startupapicheck.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: default-splunk-otel-collector-operator-startupapicheck
namespace: default
labels:
app.kubernetes.io/name: splunk-otel-collector
helm.sh/chart: splunk-otel-collector-0.113.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: default
app.kubernetes.io/version: "0.113.0"
app: splunk-otel-collector
component: otel-operator
chart: splunk-otel-collector-0.113.0
release: default
heritage: Helm
app.kubernetes.io/component: otel-operator
annotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-weight": "4"
spec:
template:
spec:
containers:
- name: startupapicheck
image: registry.access.redhat.com/ubi9/ubi:latest
env:
- name: MANAGER_METRICS_SERVICE_CLUSTERIP
value: "default-splunk-otel-collector-operator"
- name: MANAGER_METRICS_SERVICE_PORT
value: "8443"
- name: WEBHOOK_SERVICE_CLUSTERIP
value: "default-splunk-otel-collector-operator-webhook"
- name: WEBHOOK_SERVICE_PORT
value: "443"
command:
- sh
- -c
- |
operator_service_checked=false
operator_webhook_service_checked=false
for i in $(seq 1 300); do
# Checks for the Operator and Operator Webhook service availability using curl
# The services are ready when curl receives an HTTP 400 error response.
if [ "$operator_service_checked" = false ]; then
curl_output_operator=$(curl -s -o /dev/null -w "%{http_code}" "$MANAGER_METRICS_SERVICE_CLUSTERIP:$MANAGER_METRICS_SERVICE_PORT")
if [ "$curl_output_operator" = "400" ]; then
operator_service_checked=true
fi
fi
if [ "$operator_webhook_service_checked" = false ]; then
curl_output_webhook=$(curl -s -o /dev/null -w "%{http_code}" "$WEBHOOK_SERVICE_CLUSTERIP:$WEBHOOK_SERVICE_PORT")
if [ "$curl_output_webhook" = "400" ]; then
operator_webhook_service_checked=true
fi
fi
echo "Attempt $i: Operator Service=${operator_service_checked}, Operator Webhook Service=${operator_webhook_service_checked}"
sleep 1
if [ "$operator_service_checked" = true ] && [ "$operator_webhook_service_checked" = true ]; then
echo "All checks passed."
exit 0
fi
done
echo "Timeout reached. Not all checks completed successfully."
exit 1
restartPolicy: Never
41 changes: 32 additions & 9 deletions functional_tests/functional_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
"bytes"
"context"
"fmt"
"k8s.io/cli-runtime/pkg/genericclioptions"
"k8s.io/client-go/discovery"
"os"
"path/filepath"
"regexp"
Expand Down Expand Up @@ -94,6 +96,20 @@ type sinks struct {
tracesConsumer *consumertest.TracesSink
}

type cacheInvalidatingRESTClientGetter struct {
genericclioptions.RESTClientGetter
log func(string, ...interface{})
}

func (c *cacheInvalidatingRESTClientGetter) ToDiscoveryClient() (discovery.CachedDiscoveryInterface, error) {
discoveryClient, err := c.RESTClientGetter.ToDiscoveryClient()
if err == nil {
c.log("Clearing discovery cache")
discoveryClient.Invalidate()
}
return discoveryClient, err
}

func setupOnce(t *testing.T) *sinks {
setupRun.Do(func() {
// create an API server
Expand Down Expand Up @@ -184,24 +200,31 @@ func deployChartsAndApps(t *testing.T) {
err = yaml.Unmarshal(buf.Bytes(), &values)
require.NoError(t, err)

// When using the Helm sdk, there can be cache invalidation issues we can get around with this,
// Helm cli users don't face this issue. Deploying CRD instances via Helm templates without
// and a CR in a hook post-install will fail without this.
// Related issue: https://github.com/helm/helm/issues/6452
// Draft of upstream fix: https://github.com/jvoravong/helm/pull/1/files
actionConfig := new(action.Configuration)
if err := actionConfig.Init(kube.GetConfig(testKubeConfig, "", "default"), "default", os.Getenv("HELM_DRIVER"), func(format string, v ...interface{}) {
wrappedGetter := &cacheInvalidatingRESTClientGetter{
RESTClientGetter: kube.GetConfig(testKubeConfig, "", "default"),
log: func(format string, args ...interface{}) {
fmt.Printf(format+"\n", args...)
},
}

if err := actionConfig.Init(wrappedGetter, "default", os.Getenv("HELM_DRIVER"), func(format string, v ...interface{}) {
t.Logf(format+"\n", v...)
}); err != nil {
require.NoError(t, err)
}

install := action.NewInstall(actionConfig)
install.Namespace = "default"
install.ReleaseName = "sock"
install.Timeout = 5 * time.Minute

_, err = install.Run(chart, values)
if err != nil {
t.Logf("error reported during helm install: %v\n", err)
retryUpgrade := action.NewUpgrade(actionConfig)
retryUpgrade.Namespace = "default"
retryUpgrade.Install = true
_, err = retryUpgrade.Run("sock", chart, values)
require.NoError(t, err)
}

waitForAllDeploymentsToStart(t, client)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
{{/*Create the startupapicheck job image name.*/}}
{{- define "splunk-otel-collector.operator.instrumentation-job.image" -}}
{{- printf "%s:%s" .Values.instrumentation.jobs.startupapicheck.repository .Values.instrumentation.jobs.startupapicheck.tag | trimSuffix ":" -}}
{{- end -}}

{{/*
Helper to ensure the correct usage of the Splunk OpenTelemetry Collector Operator.
- Checks for a valid endpoint for exporting telemetry data.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
{{- if and .Values.operator.enabled .Values.instrumentation.jobs.startupapicheck.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
name: {{ template "splunk-otel-collector.fullname" . }}-operator-startupapicheck
namespace: {{ template "splunk-otel-collector.namespace" . }}
labels:
{{- include "splunk-otel-collector.commonLabels" . | nindent 4 }}
app: {{ template "splunk-otel-collector.name" . }}
component: otel-operator
chart: {{ template "splunk-otel-collector.chart" . }}
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
app.kubernetes.io/component: otel-operator
annotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-weight": "4"
spec:
template:
spec:
containers:
- name: startupapicheck
image: {{ template "splunk-otel-collector.operator.instrumentation-job.image" . }}
env:
- name: MANAGER_METRICS_SERVICE_CLUSTERIP
value: "{{ template "splunk-otel-collector.fullname" . }}-operator"
- name: MANAGER_METRICS_SERVICE_PORT
value: "8443"
- name: WEBHOOK_SERVICE_CLUSTERIP
value: "{{ template "splunk-otel-collector.fullname" . }}-operator-webhook"
- name: WEBHOOK_SERVICE_PORT
value: "443"
command:
- sh
- -c
- |
operator_service_checked=false
operator_webhook_service_checked=false
for i in $(seq 1 300); do
# Checks for the Operator and Operator Webhook service availability using curl
# The services are ready when curl receives an HTTP 400 error response.
if [ "$operator_service_checked" = false ]; then
curl_output_operator=$(curl -s -o /dev/null -w "%{http_code}" "$MANAGER_METRICS_SERVICE_CLUSTERIP:$MANAGER_METRICS_SERVICE_PORT")
if [ "$curl_output_operator" = "400" ]; then
operator_service_checked=true
fi
fi
if [ "$operator_webhook_service_checked" = false ]; then
curl_output_webhook=$(curl -s -o /dev/null -w "%{http_code}" "$WEBHOOK_SERVICE_CLUSTERIP:$WEBHOOK_SERVICE_PORT")
if [ "$curl_output_webhook" = "400" ]; then
operator_webhook_service_checked=true
fi
fi
echo "Attempt $i: Operator Service=${operator_service_checked}, Operator Webhook Service=${operator_webhook_service_checked}"
sleep 1
if [ "$operator_service_checked" = true ] && [ "$operator_webhook_service_checked" = true ]; then
echo "All checks passed."
exit 0
fi
done
echo "Timeout reached. Not all checks completed successfully."
exit 1
restartPolicy: Never
{{ end }}
21 changes: 21 additions & 0 deletions helm-charts/splunk-otel-collector/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -1681,6 +1681,27 @@
}
},
"required": ["repository", "tag"]
},
"jobs": {
"type": "object",
"additionalProperties": false,
"properties": {
"startupapicheck": {
"type": "object",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean"
},
"repository": {
"type": "string"
},
"tag": {
"type": "string"
}
}
}
}
}
}
},
Expand Down
19 changes: 13 additions & 6 deletions helm-charts/splunk-otel-collector/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -975,7 +975,6 @@ image:
# The policy that specifies when the user wants the Universal Base images to be pulled
pullPolicy: IfNotPresent


################################################################################
# Extra system configuration
################################################################################
Expand Down Expand Up @@ -1178,13 +1177,14 @@ operator:
enabled: false
admissionWebhooks:
certManager:
# Annotate the certificate and issuer to ensure they are created after the cert-manager CRDs have been installed.
certificateAnnotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-weight": "1"
# Annotate the issuer and certificate to ensure they are created after the cert-manager CRDs
# have been installed and cert-manager is ready.
issuerAnnotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-weight": "1"
"helm.sh/hook-weight": "2"
certificateAnnotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-weight": "3"
# Collector deployment via the operator is not supported at this time.
# The collector image repository is specified here to meet operator subchart constraints.
manager:
Expand Down Expand Up @@ -1271,6 +1271,13 @@ instrumentation:
# - name: NGINX_ENV_VAR
# value: nginx_value
# Auto-instrumentation Libraries (End)
# Jobs related to installing, uninstalling, or managing Operator Instrumentation.
jobs:
# Enables waiting for the operator to be available and ready before deploying instrumentation as a hook post-install step.
startupapicheck:
enabled: true
repository: registry.access.redhat.com/ubi9/ubi
tag: latest

# The cert-manager is a CNCF application deployed as a subchart and used for supporting operators that require TLS certificates.
# Full list of Helm value configurations: https://artifacthub.io/packages/helm/cert-manager/cert-manager?modal=values
Expand Down

0 comments on commit c4cde75

Please sign in to comment.