From a0d6cec3c5e018e362199697e6dad83d5c9d9993 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 1 Nov 2024 10:26:07 +0100 Subject: [PATCH] Fix discrepency metrics <> rebootAsRequired Without this patch, one metric could say "reboot is required" while the rebootAsRequired tick did not run (long period for example). This is a problem, as it leads to misexpectations: "Why did the system not reboot, while the metrics indicate a reboot was required". This solves it by inlining the metrics management within the rebootAsRequired goroutine. --- cmd/kured/main.go | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index b3412932a..57789de77 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -283,7 +283,6 @@ func main() { lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay) go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client) - go maintainRebootRequiredMetric(nodeID, rebootChecker) http.Handle("/metrics", promhttp.Handler()) log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil)) @@ -468,17 +467,6 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error { return nil } -func maintainRebootRequiredMetric(nodeID string, checker checkers.Checker) { - for { - if checker.RebootRequired() { - rebootRequiredGauge.WithLabelValues(nodeID).Set(1) - } else { - rebootRequiredGauge.WithLabelValues(nodeID).Set(0) - } - time.Sleep(time.Minute) - } -} - func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error { node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { @@ -610,7 +598,9 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) // Remove taint immediately during startup to quickly allow scheduling again. + // Also update the metric at startup (after lock shenanigans) if !checker.RebootRequired() { + rebootRequiredGauge.WithLabelValues(nodeID).Set(0) preferNoScheduleTaint.Disable() } @@ -625,9 +615,11 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. if !checker.RebootRequired() { log.Infof("Reboot not required") + rebootRequiredGauge.WithLabelValues(nodeID).Set(0) preferNoScheduleTaint.Disable() continue } + rebootRequiredGauge.WithLabelValues(nodeID).Set(1) node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil {