koord-manager: enhance batch resource configuration and updating (#1703)
Signed-off-by: saintube <[email protected]>
saintube authored Oct 19, 2023
1 parent 3270ae7 commit 543358d
Showing 22 changed files with 1,468 additions and 394 deletions.
35 changes: 29 additions & 6 deletions apis/configuration/slo_controller_config.go
Expand Up @@ -173,11 +173,29 @@ func (in *NodeExtensionStrategy) DeepCopy() *NodeExtensionStrategy {
return out
}

// CalculatePolicy defines the policy for calculating the resource overcommitment.
// Default is "usage".
type CalculatePolicy string

const (
CalculateByPodUsage CalculatePolicy = "usage"
// CalculateByPodUsage is the calculation policy based on the pod resource usage.
// When policy="usage", the low-priority (LP) resources are calculated according to the high-priority (HP) pods'
// usages, so LP pods can reclaim the requested but unused resources of the HP pods.
// It is the default policy, where the resources are over-committed between priority bands.
CalculateByPodUsage CalculatePolicy = "usage"
// CalculateByPodRequest is the calculation policy based on the pod resource request.
// When policy="request", the low-priority (LP) resources are calculated according to the high-priority (HP)
// pods' requests, so LP pods can allocate the unallocated resources of the HP pods but can NOT reclaim the
// requested but unused resources of the HP pods.
// It is the policy where the resources are NOT over-committed between priority bands.
CalculateByPodRequest CalculatePolicy = "request"
// CalculateByPodMaxUsageRequest is the calculation policy based on the maximum of the pod usage and request.
// When policy="maxUsageRequest", the low-priority (LP) resources are calculated according to the sum over the
// high-priority (HP) pods of the maximum of each pod's usage and request, so LP pods can allocate only the
// resources both unallocated and unused by the HP pods.
// It is the conservative policy, where the resources are NOT over-committed between priority bands and the HP
// pods' usage is also protected from the overcommitment.
CalculateByPodMaxUsageRequest CalculatePolicy = "maxUsageRequest"
)
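The behavioral difference between the three policies can be sketched with a small helper that computes how much of a resource a single HP pod "occupies" under each policy. This is a hypothetical illustration only — the real controller aggregates node-level metrics rather than calling such a per-pod function:

```go
package main

import "fmt"

type CalculatePolicy string

const (
	CalculateByPodUsage           CalculatePolicy = "usage"
	CalculateByPodRequest         CalculatePolicy = "request"
	CalculateByPodMaxUsageRequest CalculatePolicy = "maxUsageRequest"
)

// hpPodAccounted returns the amount of a resource (in milli-units) that a
// high-priority pod occupies under the given policy. Whatever is accounted
// here is NOT available for the low-priority (Batch) tier.
func hpPodAccounted(policy CalculatePolicy, request, usage int64) int64 {
	switch policy {
	case CalculateByPodRequest:
		return request
	case CalculateByPodMaxUsageRequest:
		if usage > request {
			return usage
		}
		return request
	default: // CalculateByPodUsage, the default policy
		return usage
	}
}

func main() {
	// An HP pod requests 4000 milli-CPU but only uses 1500.
	fmt.Println(hpPodAccounted(CalculateByPodUsage, 4000, 1500))           // 1500: LP can reclaim the unused 2500
	fmt.Println(hpPodAccounted(CalculateByPodRequest, 4000, 1500))         // 4000: no overcommitment across bands
	fmt.Println(hpPodAccounted(CalculateByPodMaxUsageRequest, 4000, 1500)) // 4000: usage additionally protected
}
```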

// +k8s:deepcopy-gen=true
Expand Down Expand Up @@ -215,12 +233,17 @@ type ColocationStrategy struct {
MetricAggregatePolicy *slov1alpha1.AggregatePolicy `json:"metricAggregatePolicy,omitempty"`
MetricMemoryCollectPolicy *slov1alpha1.NodeMemoryCollectPolicy `json:"metricMemoryCollectPolicy,omitempty"`

CPUReclaimThresholdPercent *int64 `json:"cpuReclaimThresholdPercent,omitempty" validate:"omitempty,min=0,max=100"`
CPUReclaimThresholdPercent *int64 `json:"cpuReclaimThresholdPercent,omitempty" validate:"omitempty,min=0,max=100"`
// CPUCalculatePolicy determines the calculation policy of the CPU resources for the Batch pods.
// Supported: "usage" (default), "maxUsageRequest".
CPUCalculatePolicy *CalculatePolicy `json:"cpuCalculatePolicy,omitempty"`
MemoryReclaimThresholdPercent *int64 `json:"memoryReclaimThresholdPercent,omitempty" validate:"omitempty,min=0,max=100"`
MemoryCalculatePolicy *CalculatePolicy `json:"memoryCalculatePolicy,omitempty"`
DegradeTimeMinutes *int64 `json:"degradeTimeMinutes,omitempty" validate:"omitempty,min=1"`
UpdateTimeThresholdSeconds *int64 `json:"updateTimeThresholdSeconds,omitempty" validate:"omitempty,min=1"`
ResourceDiffThreshold *float64 `json:"resourceDiffThreshold,omitempty" validate:"omitempty,gt=0,max=1"`
// MemoryCalculatePolicy determines the calculation policy of the memory resources for the Batch pods.
// Supported: "usage" (default), "request", "maxUsageRequest".
MemoryCalculatePolicy *CalculatePolicy `json:"memoryCalculatePolicy,omitempty"`
DegradeTimeMinutes *int64 `json:"degradeTimeMinutes,omitempty" validate:"omitempty,min=1"`
UpdateTimeThresholdSeconds *int64 `json:"updateTimeThresholdSeconds,omitempty" validate:"omitempty,min=1"`
ResourceDiffThreshold *float64 `json:"resourceDiffThreshold,omitempty" validate:"omitempty,gt=0,max=1"`

// MidCPUThresholdPercent defines the maximum percentage of the Mid-tier cpu resource dividing the node allocatable.
// MidCPUAllocatable <= NodeCPUAllocatable * MidCPUThresholdPercent / 100.
Expand Down
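For reference, an operator would select these policies in the slo-controller-config ConfigMap. The snippet below is a hypothetical sketch — the namespace, data key, and values are illustrative assumptions; only the field names follow the struct's JSON tags above:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: slo-controller-config
  namespace: koordinator-system   # assumed namespace
data:
  colocation-config: |
    {
      "enable": true,
      "cpuReclaimThresholdPercent": 60,
      "cpuCalculatePolicy": "maxUsageRequest",
      "memoryReclaimThresholdPercent": 65,
      "memoryCalculatePolicy": "request"
    }
```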
5 changes: 5 additions & 0 deletions apis/configuration/zz_generated.deepcopy.go


33 changes: 33 additions & 0 deletions apis/extension/node_colocation.go
@@ -0,0 +1,33 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package extension

const (
// AnnotationNodeColocationStrategy denotes the annotation key of the node colocation strategy.
// The value is a ColocationStrategy. It takes precedence over the ColocationStrategy in the slo-controller-config.
// An illegal value will be ignored.
AnnotationNodeColocationStrategy = NodeDomainPrefix + "/colocation-strategy"

// LabelCPUReclaimRatio denotes the CPU reclaim ratio of a node. The value is a float number.
// It takes precedence over the CPUReclaimThresholdPercent in the slo-controller-config and the node annotations.
// An illegal value will be ignored.
LabelCPUReclaimRatio = NodeDomainPrefix + "/cpu-reclaim-ratio"
// LabelMemoryReclaimRatio denotes the memory reclaim ratio of a node. The value is a float number.
// It takes precedence over the MemoryReclaimThresholdPercent in the slo-controller-config and the node annotations.
// An illegal value will be ignored.
LabelMemoryReclaimRatio = NodeDomainPrefix + "/memory-reclaim-ratio"
)
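Assuming `NodeDomainPrefix` resolves to `node.koordinator.sh` (as elsewhere in the `extension` package), a node-level override might look like the following sketch; the node name and all values are illustrative:

```yaml
apiVersion: v1
kind: Node
metadata:
  name: example-node
  annotations:
    # Node-scoped ColocationStrategy; takes precedence over the cluster-level config.
    node.koordinator.sh/colocation-strategy: '{"enable": true, "cpuReclaimThresholdPercent": 60}'
  labels:
    # Reclaim ratios as float strings; take precedence over the threshold percents.
    node.koordinator.sh/cpu-reclaim-ratio: "0.7"
    node.koordinator.sh/memory-reclaim-ratio: "0.8"
```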
2 changes: 1 addition & 1 deletion docs/images/noderesource-framework.svg
6 changes: 3 additions & 3 deletions pkg/slo-controller/config/colocation_cm_event_handler.go
Expand Up @@ -100,7 +100,7 @@ func (p *ColocationHandlerForConfigMapEvent) syncConfig(configMap *corev1.Config
err := json.Unmarshal([]byte(configStr), &newCfg)
if err != nil {
// if the controller restarts, the cache will be unavailable; otherwise use the old config
klog.Errorf("syncConfig failed! parse colocation error then use old Cfg ,configmap %s/%s, err: %s",
klog.Errorf("syncConfig failed since parsing the colocation config failed, use the old config, configmap %s/%s, err: %s",
sloconfig.ConfigNameSpace, sloconfig.SLOCtrlConfigMap, err)
p.recorder.Eventf(configMap, "Warning", ReasonColocationConfigUnmarshalFailed, "failed to unmarshal colocation config, err: %s", err)
p.cfgCache.errorStatus = true
Expand All @@ -115,7 +115,7 @@ func (p *ColocationHandlerForConfigMapEvent) syncConfig(configMap *corev1.Config

if !sloconfig.IsColocationStrategyValid(&newCfg.ColocationStrategy) {
// if the controller restarts, the cache will be unavailable; otherwise use the old config
klog.Errorf("syncConfig failed! invalid cluster config,%+v", newCfg.ColocationStrategy)
klog.Errorf("syncConfig failed since the cluster config is invalid, %+v", newCfg.ColocationStrategy)
p.cfgCache.errorStatus = true
return false
}
Expand All @@ -126,7 +126,7 @@ func (p *ColocationHandlerForConfigMapEvent) syncConfig(configMap *corev1.Config
mergedNodeStrategyInterface, _ := util.MergeCfg(clusterStrategyCopy, &nodeStrategy.ColocationStrategy)
newNodeStrategy := *mergedNodeStrategyInterface.(*configuration.ColocationStrategy)
if !sloconfig.IsColocationStrategyValid(&newNodeStrategy) {
klog.Errorf("syncConfig failed! invalid node config,then use clusterCfg,nodeCfg:%+v", nodeStrategy)
klog.Errorf("syncConfig failed since the node config is invalid, use clusterCfg, nodeCfg:%+v", nodeStrategy)
newCfg.NodeConfigs[index].ColocationStrategy = *newCfg.ColocationStrategy.DeepCopy()
} else {
newCfg.NodeConfigs[index].ColocationStrategy = newNodeStrategy
Expand Down
21 changes: 18 additions & 3 deletions pkg/slo-controller/config/colocation_cm_event_handler_test.go
Expand Up @@ -38,7 +38,9 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
oldCfg.MemoryReclaimThresholdPercent = pointer.Int64(40)
memoryCalcPolicyByUsage := configuration.CalculateByPodUsage
memoryCalcPolicyByRequest := configuration.CalculateByPodRequest
var defaultNodeMemoryCollectPolicy slov1alpha1.NodeMemoryCollectPolicy = slov1alpha1.UsageWithoutPageCache
cpuCalcPolicyByUsage := configuration.CalculateByPodUsage
cpuCalcPolicyNew := configuration.CalculatePolicy("")
var defaultNodeMemoryCollectPolicy = slov1alpha1.UsageWithoutPageCache

type fields struct {
config *colocationCfgCache
Expand Down Expand Up @@ -199,6 +201,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(20),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(70),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(15),
Expand Down Expand Up @@ -289,6 +292,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(20),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand All @@ -311,6 +315,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(20),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand Down Expand Up @@ -355,6 +360,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(20),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand All @@ -377,6 +383,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(60),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand Down Expand Up @@ -411,6 +418,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(60),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand Down Expand Up @@ -509,6 +517,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(20),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand All @@ -533,6 +542,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(20),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand All @@ -559,6 +569,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(60),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand Down Expand Up @@ -598,7 +609,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
"\"cpuReclaimThresholdPercent\":70,\"memoryReclaimThresholdPercent\":80,\"memoryCalculatePolicy\":\"request\"," +
"\"updateTimeThresholdSeconds\":300," +
"\"degradeTimeMinutes\":5,\"resourceDiffThreshold\":0.1,\"nodeConfigs\":[{\"nodeSelector\":" +
"{\"matchLabels\":{\"xxx\":\"yyy\"}},\"name\":\"xxx-yyy\",\"enable\":true,\"cpuReclaimThresholdPercent\":60}]}",
"{\"matchLabels\":{\"xxx\":\"yyy\"}},\"name\":\"xxx-yyy\",\"enable\":true,\"cpuReclaimThresholdPercent\":60, \"cpuCalculatePolicy\": \"\"}]}",
},
}},
wantChanged: true,
Expand All @@ -610,6 +621,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricReportIntervalSeconds: pointer.Int64(20),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByRequest,
DegradeTimeMinutes: pointer.Int64(5),
Expand Down Expand Up @@ -640,6 +652,7 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
MetricMemoryCollectPolicy: &defaultNodeMemoryCollectPolicy,
//change
CPUReclaimThresholdPercent: pointer.Int64(60),
CPUCalculatePolicy: &cpuCalcPolicyNew,
},
},
},
Expand All @@ -666,7 +679,8 @@ func Test_syncColocationConfigIfChanged(t *testing.T) {
func Test_IsCfgAvailable(t *testing.T) {
defaultConfig := sloconfig.DefaultColocationCfg()
memoryCalcPolicyByUsage := configuration.CalculateByPodUsage
var defaultNodeMemoryCollectPolicy slov1alpha1.NodeMemoryCollectPolicy = slov1alpha1.UsageWithoutPageCache
cpuCalcPolicyByUsage := configuration.CalculateByPodUsage
var defaultNodeMemoryCollectPolicy = slov1alpha1.UsageWithoutPageCache
type fields struct {
config *colocationCfgCache
configMap *corev1.ConfigMap
Expand Down Expand Up @@ -736,6 +750,7 @@ func Test_IsCfgAvailable(t *testing.T) {
MetricAggregateDurationSeconds: pointer.Int64(60),
MetricAggregatePolicy: sloconfig.DefaultColocationStrategy().MetricAggregatePolicy,
CPUReclaimThresholdPercent: pointer.Int64(70),
CPUCalculatePolicy: &cpuCalcPolicyByUsage,
MemoryReclaimThresholdPercent: pointer.Int64(80),
MemoryCalculatePolicy: &memoryCalcPolicyByUsage,
DegradeTimeMinutes: pointer.Int64(5),
Expand Down
43 changes: 33 additions & 10 deletions pkg/slo-controller/noderesource/framework/README.md
Expand Up @@ -44,13 +44,27 @@ type ResourceItem struct {
}
```

- **Prepare**: It prepares the Node object with the calculated result `NodeResource`. Before the Preparing, it is
- **PreUpdate**: It allows the plugin to preprocess the calculated results before the Node is updated.
For example, a plugin can prepare and update some objects like CRDs before updating the Node. The plugin can also
mutate the internal NodeResource object according to the fully calculated results.
It differs from the Prepare stage in that a NodePreUpdatePlugin is invoked only once per loop (so the plugin
should consider implementing its own retry logic if needed), while the NodePreparePlugin is not expected to update
other objects or mutate the NodeResource.

```go
type NodePreUpdatePlugin interface {
Plugin
PreUpdate(strategy *ColocationStrategy, node *Node, nr *NodeResource) error
}
```
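As a sketch of what a NodePreUpdatePlugin implementation could look like (the framework types are stubbed out here with hypothetical fields; the real interfaces live in the noderesource framework package):

```go
package main

import "fmt"

// Minimal stand-ins for the framework types; names follow the README but the
// concrete fields are illustrative assumptions.
type ColocationStrategy struct{}
type Node struct{ Name string }
type NodeResource struct{ Resources map[string]int64 }

type Plugin interface{ Name() string }

type NodePreUpdatePlugin interface {
	Plugin
	PreUpdate(strategy *ColocationStrategy, node *Node, nr *NodeResource) error
}

// demoPlugin mutates the NodeResource once per loop before the Node update,
// e.g. clamping a calculated value so a negative result is never published.
type demoPlugin struct{}

func (p *demoPlugin) Name() string { return "demo" }

func (p *demoPlugin) PreUpdate(_ *ColocationStrategy, node *Node, nr *NodeResource) error {
	if nr.Resources["batch-cpu"] < 0 {
		nr.Resources["batch-cpu"] = 0
	}
	return nil
}

func main() {
	var plugin NodePreUpdatePlugin = &demoPlugin{}
	nr := &NodeResource{Resources: map[string]int64{"batch-cpu": -500}}
	_ = plugin.PreUpdate(nil, &Node{Name: "node-1"}, nr)
	fmt.Println(nr.Resources["batch-cpu"]) // 0
}
```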

- **Prepare**: It prepares the Node object with the calculated result `NodeResource`. Before the update, it is
invoked after the Calculate stage so as to allow the plugin to retry when the client update conflicts.

```go
type NodePreparePlugin interface {
Plugin
Execute(strategy *ColocationStrategy, node *Node, nr *NodeResource) error
Prepare(strategy *ColocationStrategy, node *Node, nr *NodeResource) error
}

type NodeResource struct {
Expand All @@ -62,17 +76,17 @@ type NodeResource struct {
}
```

- **NeedSync**: It checks if the newly-prepared Node object should be synchronized to the kube-apiserver. To be more
specific, there are two types of NeedSync plugins for different client update methods, where one can determine whether
the node status should be updated and another determines whether node metadata should be updated.
- **NodeCheck**: It checks whether the newly-prepared Node object should be synchronized to the kube-apiserver. To be
more specific, there are currently two types of check plugins for different client update methods: one determines
whether the node status should be updated, and the other determines whether the node metadata should be updated.

```go
type NodeSyncPlugin interface {
type NodeStatusCheckPlugin interface {
Plugin
NeedSync(strategy *ColocationStrategy, oldNode, newNode *Node) (bool, string)
}

type NodeMetaSyncPlugin interface {
type NodeMetaCheckPlugin interface {
Plugin
NeedSyncMeta(strategy *ColocationStrategy, oldNode, newNode *Node) (bool, string)
}
Expand All @@ -85,7 +99,11 @@ Here is the workflow of how the node resource controller handles a dequeued
## Example: Batch Resource Plugin

The default `BatchResource` plugin is responsible for calculating and updating the Batch-tier resources.
It implements the stages `Calculate`, `Reset`, `Prepare` and `NeedSync`:
It implements the stages `Setup`, `Calculate`/`Reset`, `PreUpdate`, `Prepare` and `NodeStatusCheck`:

**Setup**:

In the initialization, the plugin sets up the kube client and adds a watch for the NodeResourceTopology.

**Calculate**:

Expand All @@ -98,12 +116,17 @@ batchAllocatable := nodeAllocatable * thresholdPercent - podUsage(HP) - systemUs

Besides, the plugin implements the `Reset` method to clean up the Batch resources when the node colocation is disabled.
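The `Calculate` formula above can be sketched in Go. The function names and the clamping-to-zero behavior are illustrative assumptions, not the plugin's exact code:

```go
package main

import "fmt"

// batchAllocatable sketches the Batch resource formula from the README:
//
//	batchAllocatable = nodeAllocatable * thresholdPercent - podUsage(HP) - systemUsage
//
// All quantities are in milli-CPU (or bytes for memory); thresholdPercent is
// the reclaim threshold, e.g. CPUReclaimThresholdPercent.
func batchAllocatable(nodeAllocatable, thresholdPercent, hpPodAccounted, systemUsage int64) int64 {
	allocatable := nodeAllocatable*thresholdPercent/100 - hpPodAccounted - systemUsage
	if allocatable < 0 {
		return 0 // never advertise negative Batch capacity
	}
	return allocatable
}

func main() {
	// A 32-core node with a 65% threshold, HP pods accounting for 12 cores,
	// and 2 cores of system usage.
	fmt.Println(batchAllocatable(32000, 65, 12000, 2000)) // 6800
}
```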

**PreUpdate**:

Before updating the Node object, the plugin updates the zone-level Batch resources of the NRT (NodeResourceTopology)
according to the calculated results from the `Calculate` stage.

**Prepare**:

The plugin sets the extended resources `kubernetes.io/batch-cpu`, `kubernetes.io/batch-memory` in the
`node.status.allocatable` according to the calculated results from the `Calculate` or `Reset` stage.
`node.status.allocatable` according to the calculated results from the `Calculate`/`Reset` stage.

**NeedSync**:
**NodeStatusCheck**:

The plugin checks the extended resources `kubernetes.io/batch-cpu` and `kubernetes.io/batch-memory` of the prepared node
and the old node. If the node's Batch resources have not been updated for too long or the calculated results change
Expand Down
