diff --git a/docs/node-termination.md b/docs/node-termination.md index 2e8e99ac..9b48c617 100644 --- a/docs/node-termination.md +++ b/docs/node-termination.md @@ -52,3 +52,15 @@ metadata: annotations: atlassian.com/no-delete: "testing some long running bpf tracing" ``` + +## Force Tainting + +Force Tainting provides a mechanism for Escalator to promptly remove nodes tainted by an external system, or administrator. + +This is implemented by applying a taint to the node, with the following key: + +``` +atlassian.com/escalator-force +``` + +The node will be removed as soon as all non-daemonset pods are completed. This is useful when nodes must be removed asap, but running pods should not be killed. \ No newline at end of file diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index edbdf6db..57a40ba0 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -31,7 +31,8 @@ type NodeGroupState struct { scaleUpLock scaleLock // used for tracking which nodes are tainted. testing when in dry mode - taintTracker []string + taintTracker []string + forceTaintTracker []string // used for tracking scale delta across runs, useful for reducing hysteresis scaleDelta int @@ -54,11 +55,12 @@ type Opts struct { // scaleOpts provides options for a scale function // wraps options that would be passed as args type scaleOpts struct { - nodes []*v1.Node - taintedNodes []*v1.Node - untaintedNodes []*v1.Node - nodeGroup *NodeGroupState - nodesDelta int + nodes []*v1.Node + taintedNodes []*v1.Node + forceTaintedNodes []*v1.Node + untaintedNodes []*v1.Node + nodeGroup *NodeGroupState + nodesDelta int } // NewController creates a new controller with the specified options @@ -116,36 +118,52 @@ func (c *Controller) dryMode(nodeGroup *NodeGroupState) bool { } // filterNodes separates nodes between tainted and untainted nodes -func (c *Controller) filterNodes(nodeGroup *NodeGroupState, allNodes []*v1.Node) (untaintedNodes, taintedNodes, cordonedNodes []*v1.Node) { +func (c *Controller) filterNodes(nodeGroup *NodeGroupState, allNodes []*v1.Node) (untaintedNodes, taintedNodes, forceTaintedNodes, cordonedNodes []*v1.Node) { untaintedNodes = make([]*v1.Node, 0, len(allNodes)) taintedNodes = make([]*v1.Node, 0, len(allNodes)) + forceTaintedNodes = make([]*v1.Node, 0, len(allNodes)) cordonedNodes = make([]*v1.Node, 0, len(allNodes)) for _, node := range allNodes { + var tainted bool + var forceTainted bool + var untainted bool + if c.dryMode(nodeGroup) { - var contains bool for _, name := range nodeGroup.taintTracker { if node.Name == name { - contains = true + tainted = true break } } - if !contains { - untaintedNodes = append(untaintedNodes, node) - } else { - taintedNodes = append(taintedNodes, node) + + for _, name := range nodeGroup.forceTaintTracker { + if node.Name == name { + forceTainted = true + break + } } + + untainted = !forceTainted && !tainted + } else { // If the node is Unschedulable (cordoned), separate it out from the tainted/untainted if node.Spec.Unschedulable { cordonedNodes = append(cordonedNodes, node) continue } - if _, tainted := k8s.GetToBeRemovedTaint(node); !tainted { - untaintedNodes = append(untaintedNodes, node) - } else { - taintedNodes = append(taintedNodes, node) - } + + _, forceTainted = k8s.GetToBeForceRemovedTaint(node) + _, tainted = k8s.GetToBeRemovedTaint(node) + untainted = !forceTainted && !tainted + } + + if forceTainted { + forceTaintedNodes = append(forceTaintedNodes, node) + } else if tainted { + taintedNodes = append(taintedNodes, node) + } else if untainted { + untaintedNodes = append(untaintedNodes, node) } } @@ -210,7 +228,7 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState) } // Filter into untainted and tainted nodes - untaintedNodes, taintedNodes, cordonedNodes := c.filterNodes(nodeGroup, allNodes) + untaintedNodes, taintedNodes, forceTaintedNodes, cordonedNodes := c.filterNodes(nodeGroup, allNodes) // Metrics and Logs log.WithField("nodegroup", nodegroup).Infof("pods total: %v", len(pods)) @@ -218,12 +236,14 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState) log.WithField("nodegroup", nodegroup).Infof("cordoned nodes remaining total: %v", len(cordonedNodes)) log.WithField("nodegroup", nodegroup).Infof("nodes remaining untainted: %v", len(untaintedNodes)) log.WithField("nodegroup", nodegroup).Infof("nodes remaining tainted: %v", len(taintedNodes)) + log.WithField("nodegroup", nodegroup).Infof("nodes remaining force tainted: %v", len(forceTaintedNodes)) log.WithField("nodegroup", nodegroup).Infof("Minimum Node: %v", nodeGroup.Opts.MinNodes) log.WithField("nodegroup", nodegroup).Infof("Maximum Node: %v", nodeGroup.Opts.MaxNodes) metrics.NodeGroupNodes.WithLabelValues(nodegroup).Set(float64(len(allNodes))) metrics.NodeGroupNodesCordoned.WithLabelValues(nodegroup).Set(float64(len(cordonedNodes))) metrics.NodeGroupNodesUntainted.WithLabelValues(nodegroup).Set(float64(len(untaintedNodes))) metrics.NodeGroupNodesTainted.WithLabelValues(nodegroup).Set(float64(len(taintedNodes))) + metrics.NodeGroupNodesForceTainted.WithLabelValues(nodegroup).Set(float64(len(forceTaintedNodes))) metrics.NodeGroupPods.WithLabelValues(nodegroup).Set(float64(len(pods))) // We want to be really simple right now so we don't do anything if we are outside the range of allowed nodes @@ -288,11 +308,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState) if len(untaintedNodes) < nodeGroup.Opts.MinNodes { log.WithField("nodegroup", nodegroup).Warn("There are less untainted nodes than the minimum") result, err := c.ScaleUp(scaleOpts{ - nodes: allNodes, - nodesDelta: nodeGroup.Opts.MinNodes - len(untaintedNodes), - nodeGroup: nodeGroup, - taintedNodes: taintedNodes, - untaintedNodes: untaintedNodes, + nodes: allNodes, + nodesDelta: nodeGroup.Opts.MinNodes - len(untaintedNodes), + nodeGroup: nodeGroup, + taintedNodes: taintedNodes, + forceTaintedNodes: forceTaintedNodes, + untaintedNodes: untaintedNodes, }) if err != nil { log.WithField("nodegroup", nodegroup).Error(err) @@ -382,10 +403,21 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState) log.WithField("nodegroup", nodegroup).Debugf("Delta: %v", nodesDelta) scaleOptions := scaleOpts{ - nodes: allNodes, - taintedNodes: taintedNodes, - untaintedNodes: untaintedNodes, - nodeGroup: nodeGroup, + nodes: allNodes, + taintedNodes: taintedNodes, + forceTaintedNodes: forceTaintedNodes, + untaintedNodes: untaintedNodes, + nodeGroup: nodeGroup, + } + + // Check for nodes tainted for force removal + var forceRemoved int + var forceActionErr error + forceRemoved, forceActionErr = c.TryRemoveForceTaintedNodes(scaleOptions) + log.WithField("nodegroup", nodegroup).Infof("Reaper: There were %v empty nodes force deleted this round", forceRemoved) + + if forceActionErr != nil { + log.WithField("nodegroup", nodegroup).Error(forceActionErr) } // Perform a scale up, do nothing or scale down based on the nodes delta diff --git a/pkg/controller/controller_scale_node_group_test.go b/pkg/controller/controller_scale_node_group_test.go index 325da2ad..739fece8 100644 --- a/pkg/controller/controller_scale_node_group_test.go +++ b/pkg/controller/controller_scale_node_group_test.go @@ -126,7 +126,7 @@ func TestUntaintNodeGroupMinNodes(t *testing.T) { _, err = controller.scaleNodeGroup(nodeGroup.Name, nodeGroupsState[nodeGroup.Name]) assert.NoError(t, err) - untainted, tainted, _ := controller.filterNodes(nodeGroupsState[nodeGroup.Name], nodes) + untainted, tainted, _, _ := controller.filterNodes(nodeGroupsState[nodeGroup.Name], nodes) // Ensure that the tainted nodes where untainted assert.Equal(t, minNodes, len(untainted)) assert.Equal(t, 0, len(tainted)) @@ -193,7 +193,7 @@ func TestUntaintNodeGroupMaxNodes(t *testing.T) { _, err = controller.scaleNodeGroup(nodeGroup.Name, nodeGroupsState[nodeGroup.Name]) require.NoError(t, err) - untainted, tainted, _ := controller.filterNodes(nodeGroupsState[nodeGroup.Name], nodes) + untainted, tainted, _, _ := controller.filterNodes(nodeGroupsState[nodeGroup.Name], nodes) // Ensure that the tainted nodes where untainted assert.Equal(t, maxNodes, len(untainted)) assert.Equal(t, 0, len(tainted)) diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index 24586a99..3085465f 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -105,6 +105,10 @@ func TestControllerFilterNodes(t *testing.T) { Name: "n6", Tainted: false, }), + 6: test.BuildTestNode(test.NodeOpts{ + Name: "n7", + ForceTainted: true, + }), } type args struct { @@ -113,11 +117,12 @@ func TestControllerFilterNodes(t *testing.T) { master bool } tests := []struct { - name string - args args - wantUntaintedNodes []*v1.Node - wantTaintedNodes []*v1.Node - wantCordonedNodes []*v1.Node + name string + args args + wantUntaintedNodes []*v1.Node + wantTaintedNodes []*v1.Node + wantForceTaintedNodes []*v1.Node + wantCordonedNodes []*v1.Node }{ { "basic filter not drymode", @@ -132,6 +137,7 @@ func TestControllerFilterNodes(t *testing.T) { }, []*v1.Node{nodes[1], nodes[3], nodes[5]}, []*v1.Node{nodes[0], nodes[2], nodes[4]}, + []*v1.Node{nodes[6]}, []*v1.Node{}, }, { @@ -141,13 +147,15 @@ func TestControllerFilterNodes(t *testing.T) { Opts: NodeGroupOptions{ DryMode: true, }, - taintTracker: []string{"n1", "n3", "n5"}, + taintTracker: []string{"n1", "n3", "n5"}, + forceTaintTracker: []string{"n7"}, }, nodes, true, }, []*v1.Node{nodes[1], nodes[3], nodes[5]}, []*v1.Node{nodes[0], nodes[2], nodes[4]}, + []*v1.Node{nodes[6]}, []*v1.Node{}, }, } @@ -158,9 +166,10 @@ func TestControllerFilterNodes(t *testing.T) { DryMode: tt.args.master, }, } - gotUntaintedNodes, gotTaintedNodes, gotCordonedNodes := c.filterNodes(tt.args.nodeGroup, tt.args.allNodes) + gotUntaintedNodes, gotTaintedNodes, gotForceTaintedNodes, gotCordonedNodes := c.filterNodes(tt.args.nodeGroup, tt.args.allNodes) assert.Equal(t, tt.wantUntaintedNodes, gotUntaintedNodes) assert.Equal(t, tt.wantTaintedNodes, gotTaintedNodes) + assert.Equal(t, tt.wantForceTaintedNodes, gotForceTaintedNodes) assert.Equal(t, tt.wantCordonedNodes, gotCordonedNodes) }) } diff --git a/pkg/controller/scale_down.go b/pkg/controller/scale_down.go index 609869f1..7d9a8d96 100644 --- a/pkg/controller/scale_down.go +++ b/pkg/controller/scale_down.go @@ -45,6 +45,26 @@ func safeFromDeletion(node *v1.Node) (string, bool) { return "", false } +// TryRemoveForceTaintedNodes attempts to remove nodes that are +// * Tainted with force remove and empty +func (c *Controller) TryRemoveForceTaintedNodes(opts scaleOpts) (int, error) { + var toBeDeleted []*v1.Node + drymode := c.dryMode(opts.nodeGroup) + + for _, candidate := range opts.forceTaintedNodes { + if k8s.NodeEmpty(candidate, opts.nodeGroup.NodeInfoMap) { + if !drymode { + toBeDeleted = append(toBeDeleted, candidate) + } + log.WithField("drymode", drymode).WithField("nodegroup", opts.nodeGroup.Opts.Name).Infof("Node %v, %v ready to be force deleted", candidate.Name, candidate.Spec.ProviderID) + } else { + log.WithField("drymode", drymode).WithField("nodegroup", opts.nodeGroup.Opts.Name).Infof("Node %v, %v not ready to be force deleted", candidate.Name, candidate.Spec.ProviderID) + } + } + + return TryDeleteNodes(c, opts, toBeDeleted) +} + // TryRemoveTaintedNodes attempts to remove nodes are // * tainted and empty // * have passed their grace period @@ -98,6 +118,10 @@ func (c *Controller) TryRemoveTaintedNodes(opts scaleOpts) (int, error) { } } + return TryDeleteNodes(c, opts, toBeDeleted) +} + +func TryDeleteNodes(c *Controller, opts scaleOpts, toBeDeleted []*v1.Node) (int, error) { if len(toBeDeleted) > 0 { podsRemaining := 0 for _, nodeToBeDeleted := range toBeDeleted { diff --git a/pkg/controller/scale_down_test.go b/pkg/controller/scale_down_test.go index 1ee2a2cc..44747896 100644 --- a/pkg/controller/scale_down_test.go +++ b/pkg/controller/scale_down_test.go @@ -87,6 +87,7 @@ func TestControllerScaleDownTaint(t *testing.T) { scaleOpts{ nodes, []*v1.Node{}, + []*v1.Node{}, nodes, nodeGroupsState["example"], 2, @@ -102,6 +103,7 @@ func TestControllerScaleDownTaint(t *testing.T) { scaleOpts{ nodes, []*v1.Node{}, + []*v1.Node{}, nodes, nodeGroupsState["example"], 4, @@ -117,6 +119,7 @@ func TestControllerScaleDownTaint(t *testing.T) { scaleOpts{ nodes[:2], []*v1.Node{}, + []*v1.Node{}, nodes[:2], nodeGroupsState["example"], 4, @@ -132,6 +135,7 @@ func TestControllerScaleDownTaint(t *testing.T) { scaleOpts{ nodes[:3], []*v1.Node{}, + []*v1.Node{}, nodes[:3], nodeGroupsState["default"], 4, @@ -147,6 +151,7 @@ func TestControllerScaleDownTaint(t *testing.T) { scaleOpts{ nodes, []*v1.Node{}, + []*v1.Node{}, nodes, nodeGroupsState["default"], 4, @@ -451,6 +456,7 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { scaleOpts{ nodes, taintedNodes, + []*v1.Node{}, untaintedNodes, nodeGroupsState[testNodeGroup.ID()], 0, // not used in TryRemoveTaintedNodes @@ -464,6 +470,7 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { scaleOpts{ nodes, taintedNodes, + []*v1.Node{}, untaintedNodes, nodeGroupsState[testNodeGroup.ID()], 0, // not used in TryRemoveTaintedNodes @@ -477,6 +484,7 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { scaleOpts{ nodes, []*v1.Node{}, + []*v1.Node{}, nodes, nodeGroupsState[testNodeGroup.ID()], 0, // not used in TryRemoveTaintedNodes @@ -503,4 +511,95 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { assert.Equal(t, tt.want, got) }) } + + forceTests := []struct { + name string + opts scaleOpts + annotateFirstTainted bool + want int + wantErr bool + }{ + { + "test none force tainted", + scaleOpts{ + nodes, + []*v1.Node{}, + []*v1.Node{}, + nodes, + nodeGroupsState[testNodeGroup.ID()], + 0, // not used in TryRemoveTaintedNodes + }, + false, + 0, + false, + }, + { + "test one tainted", + scaleOpts{ + nodes, + []*v1.Node{nodes[0]}, + []*v1.Node{}, + nodes[1:], + nodeGroupsState[testNodeGroup.ID()], + 0, // not used in TryRemoveTaintedNodes + }, + false, + 0, + false, + }, + { + "test one force tainted", + scaleOpts{ + nodes, + []*v1.Node{}, + []*v1.Node{nodes[0]}, + nodes[1:], + nodeGroupsState[testNodeGroup.ID()], + 0, // not used in TryRemoveTaintedNodes + }, + false, + -1, + false, + }, + { + "test one force tainted remaining tainted", + scaleOpts{ + nodes, + nodes[1:], + []*v1.Node{nodes[0]}, + []*v1.Node{}, + nodeGroupsState[testNodeGroup.ID()], + 0, // not used in TryRemoveTaintedNodes + }, + false, + -1, + false, + }, + { + "test all force tainted", + scaleOpts{ + nodes, + []*v1.Node{}, + nodes, + []*v1.Node{}, + nodeGroupsState[testNodeGroup.ID()], + 0, // not used in TryRemoveTaintedNodes + }, + false, + -len(nodes), + false, + }, + } + + for _, tt := range forceTests { + t.Run(tt.name, func(t *testing.T) { + got, err := c.TryRemoveForceTaintedNodes(tt.opts) + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + assert.Equal(t, tt.want, got) + }) + } } diff --git a/pkg/k8s/taint.go b/pkg/k8s/taint.go index f79a1705..f99f13fe 100644 --- a/pkg/k8s/taint.go +++ b/pkg/k8s/taint.go @@ -29,6 +29,9 @@ var TaintEffectTypes = map[apiv1.TaintEffect]bool{ const ( // ToBeRemovedByAutoscalerKey specifies the key the autoscaler uses to taint nodes as MARKED ToBeRemovedByAutoscalerKey = "atlassian.com/escalator" + + // ToBeForceRemovedByAutoscalerKey specifies the key used to mark a node for force removal + ToBeForceRemovedByAutoscalerKey = "atlassian.com/escalator-force" ) // AddToBeRemovedTaint takes a k8s node and adds the ToBeRemovedByAutoscaler taint to the node @@ -86,6 +89,17 @@ func GetToBeRemovedTaint(node *apiv1.Node) (apiv1.Taint, bool) { return apiv1.Taint{}, false } +// GetToBeRemovedTaint returns whether the node is tainted with the ToBeForceRemoved taint +// and the taint associated +func GetToBeForceRemovedTaint(node *apiv1.Node) (apiv1.Taint, bool) { + for _, taint := range node.Spec.Taints { + if taint.Key == ToBeForceRemovedByAutoscalerKey { + return taint, true + } + } + return apiv1.Taint{}, false +} + // GetToBeRemovedTime returns the time the node was tainted // result will be nil if does not exist func GetToBeRemovedTime(node *apiv1.Node) (*time.Time, error) { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 3cec3c87..b6b948c6 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -36,6 +36,15 @@ var ( }, []string{"node_group"}, ) + // NodeGroupNodesForceTainted nodes considered by specific node groups that are force tainted + NodeGroupNodesForceTainted = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_group_force_tainted_nodes", + Namespace: NAMESPACE, + Help: "nodes considered by specific node groups that are force tainted", + }, + []string{"node_group"}, + ) // NodeGroupNodesCordoned nodes considered by specific node groups NodeGroupNodesCordoned = prometheus.NewGaugeVec( prometheus.GaugeOpts{ diff --git a/pkg/test/builder.go b/pkg/test/builder.go index d4ff7ded..e2a85930 100644 --- a/pkg/test/builder.go +++ b/pkg/test/builder.go @@ -16,13 +16,14 @@ import ( // NodeOpts minimal options for configuring a node object in testing type NodeOpts struct { - Name string - CPU int64 - Mem int64 - LabelKey string - LabelValue string - Creation time.Time - Tainted bool + Name string + CPU int64 + Mem int64 + LabelKey string + LabelValue string + Creation time.Time + Tainted bool + ForceTainted bool } // BuildFakeClient creates a fake client @@ -114,6 +115,12 @@ func BuildTestNode(opts NodeOpts) *apiv1.Node { Effect: apiv1.TaintEffectNoSchedule, }) } + if opts.ForceTainted { + taints = append(taints, apiv1.Taint{ + Key: "atlassian.com/escalator-force", + Effect: apiv1.TaintEffectNoSchedule, + }) + } node := &apiv1.Node{ ObjectMeta: metav1.ObjectMeta{