Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

descheduler: check nodemetrics cr is expired or not when descheduling #1721

Merged
merged 1 commit into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pkg/descheduler/apis/config/types_loadaware.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ type LowNodeLoadArgs struct {
// By default, NumberOfNodes is set to zero.
NumberOfNodes int32

// NodeMetricExpirationSeconds indicates the NodeMetric expiration in seconds.
// When a node's NodeMetric has expired, the node is considered abnormal and should not be considered by the descheduler plugin.
// Default is 180 seconds.
NodeMetricExpirationSeconds *int64

// Naming this one differently since namespaces are still
// considered while considering resources used by pods
// but then filtered out before eviction
Expand Down
7 changes: 6 additions & 1 deletion pkg/descheduler/apis/config/v1alpha2/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ import (
)

const (
defaultMigrationControllerMaxConcurrentReconciles = 1
defaultMigrationControllerMaxConcurrentReconciles = 1
defaultNodeMetricExpirationSeconds int64 = 180

defaultMaxMigratingPerNode = 2
defaultMigrationJobMode = sev1alpha1.PodMigrationJobModeReservationFirst
Expand Down Expand Up @@ -263,6 +264,10 @@ func SetDefaults_LowNodeLoadArgs(obj *LowNodeLoadArgs) {
obj.AnomalyCondition.ConsecutiveAbnormalities = defaultLoadAnomalyCondition.ConsecutiveAbnormalities
}

if obj.NodeMetricExpirationSeconds == nil {
obj.NodeMetricExpirationSeconds = pointer.Int64(defaultNodeMetricExpirationSeconds)
}

defaultResourceWeights := map[corev1.ResourceName]int64{
corev1.ResourceCPU: 1,
corev1.ResourceMemory: 1,
Expand Down
13 changes: 8 additions & 5 deletions pkg/descheduler/apis/config/v1alpha2/defaults_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ func TestSetDefaults_LowNodeLoadArgs(t *testing.T) {
NodeFit: pointer.Bool(false),
},
expected: &LowNodeLoadArgs{
NodeFit: pointer.Bool(false),
AnomalyCondition: defaultLoadAnomalyCondition,
NodeFit: pointer.Bool(false),
NodeMetricExpirationSeconds: pointer.Int64(defaultNodeMetricExpirationSeconds),
AnomalyCondition: defaultLoadAnomalyCondition,
ResourceWeights: map[corev1.ResourceName]int64{
corev1.ResourceCPU: 1,
corev1.ResourceMemory: 1,
Expand All @@ -56,7 +57,8 @@ func TestSetDefaults_LowNodeLoadArgs(t *testing.T) {
},
},
expected: &LowNodeLoadArgs{
NodeFit: pointer.Bool(true),
NodeFit: pointer.Bool(true),
NodeMetricExpirationSeconds: pointer.Int64(defaultNodeMetricExpirationSeconds),
AnomalyCondition: &LoadAnomalyCondition{
Timeout: &metav1.Duration{Duration: 10 * time.Second},
ConsecutiveAbnormalities: defaultLoadAnomalyCondition.ConsecutiveAbnormalities,
Expand All @@ -82,8 +84,9 @@ func TestSetDefaults_LowNodeLoadArgs(t *testing.T) {
},
},
expected: &LowNodeLoadArgs{
NodeFit: pointer.Bool(true),
AnomalyCondition: defaultLoadAnomalyCondition,
NodeFit: pointer.Bool(true),
NodeMetricExpirationSeconds: pointer.Int64(defaultNodeMetricExpirationSeconds),
AnomalyCondition: defaultLoadAnomalyCondition,
LowThresholds: ResourceThresholds{
corev1.ResourceCPU: 30,
corev1.ResourceMemory: 30,
Expand Down
5 changes: 5 additions & 0 deletions pkg/descheduler/apis/config/v1alpha2/types_loadaware.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ type LowNodeLoadArgs struct {
// By default, NumberOfNodes is set to zero.
NumberOfNodes *int32 `json:"numberOfNodes,omitempty"`

// NodeMetricExpirationSeconds indicates the NodeMetric expiration in seconds.
// When a node's NodeMetric has expired, the node is considered abnormal and should not be considered by the descheduler plugin.
// Default is 180 seconds.
NodeMetricExpirationSeconds *int64 `json:"nodeMetricExpirationSeconds,omitempty"`

// Naming this one differently since namespaces are still
// considered while considering resources used by pods
// but then filtered out before eviction
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pkg/descheduler/apis/config/v1alpha2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ func ValidateLowLoadUtilizationArgs(path *field.Path, args *deschedulerconfig.Lo
allErrs = append(allErrs, field.Invalid(path.Child("numberOfNodes"), args.NumberOfNodes, "must be greater than or equal to 0"))
}

if args.NodeMetricExpirationSeconds != nil && *args.NodeMetricExpirationSeconds <= 0 {
allErrs = append(allErrs, field.Invalid(field.NewPath("nodeMetricExpiredSeconds"), *args.NodeMetricExpirationSeconds, "nodeMetricExpiredSeconds should be a positive value"))
}

if args.EvictableNamespaces != nil && len(args.EvictableNamespaces.Include) > 0 && len(args.EvictableNamespaces.Exclude) > 0 {
allErrs = append(allErrs, field.Invalid(path.Child("evictableNamespaces"), args.EvictableNamespaces, "only one of Include/Exclude namespaces can be set"))
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/descheduler/apis/config/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ func (pl *LowNodeLoad) processOneNodePool(ctx context.Context, nodePool *desched

lowThresholds, highThresholds := newThresholds(nodePool.UseDeviationThresholds, nodePool.LowThresholds, nodePool.HighThresholds)
resourceNames := getResourceNames(lowThresholds)
nodeUsages := getNodeUsage(nodes, resourceNames, pl.nodeMetricLister, pl.handle.GetPodsAssignedToNodeFunc())
nodeUsages := getNodeUsage(nodes, resourceNames, pl.nodeMetricLister, pl.handle.GetPodsAssignedToNodeFunc(), pl.args.NodeMetricExpirationSeconds)
nodeThresholds := getNodeThresholds(nodeUsages, lowThresholds, highThresholds, resourceNames, nodePool.UseDeviationThresholds)
lowNodes, sourceNodes := classifyNodes(nodeUsages, nodeThresholds, lowThresholdFilter, highThresholdFilter)

Expand Down
16 changes: 13 additions & 3 deletions pkg/descheduler/framework/plugins/loadaware/utilization_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ import (
"context"
"fmt"
"sort"
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"

Expand Down Expand Up @@ -127,7 +129,7 @@ func resourceThreshold(nodeCapacity corev1.ResourceList, resourceName corev1.Res
return resource.NewQuantity(resourceCapacityFraction(resourceCapacityQuantity.Value()), resourceCapacityQuantity.Format)
}

func getNodeUsage(nodes []*corev1.Node, resourceNames []corev1.ResourceName, nodeMetricLister slolisters.NodeMetricLister, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) map[string]*NodeUsage {
func getNodeUsage(nodes []*corev1.Node, resourceNames []corev1.ResourceName, nodeMetricLister slolisters.NodeMetricLister, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc, nodeMetricExpirationSeconds *int64) map[string]*NodeUsage {
nodeUsages := map[string]*NodeUsage{}
for _, v := range nodes {
pods, err := podutil.ListPodsOnANode(v.Name, getPodsAssignedToNode, nil)
Expand All @@ -141,8 +143,10 @@ func getNodeUsage(nodes []*corev1.Node, resourceNames []corev1.ResourceName, nod
klog.ErrorS(err, "Failed to get NodeMetric", "node", klog.KObj(v))
continue
}
// TODO(joseph): We should check if NodeMetric is expired.
if nodeMetric.Status.NodeMetric == nil {
// We should check if NodeMetric is expired.
if nodeMetric.Status.NodeMetric == nil || nodeMetricExpirationSeconds != nil &&
isNodeMetricExpired(nodeMetric.Status.UpdateTime, *nodeMetricExpirationSeconds) {
klog.ErrorS(err, "NodeMetric has expired", "node", klog.KObj(v), "effective period", time.Duration(*nodeMetricExpirationSeconds)*time.Second)
continue
}

Expand Down Expand Up @@ -415,6 +419,12 @@ func isNodeUnderutilized(usage, thresholds map[corev1.ResourceName]*resource.Qua
return true
}

// isNodeMetricExpired reports whether a NodeMetric last updated at
// lastUpdateTime should be treated as expired.
//
// A nil lastUpdateTime is always reported as expired. Otherwise the metric is
// expired only when nodeMetricExpirationSeconds is positive and at least that
// many seconds have elapsed since lastUpdateTime; a non-positive value
// disables the age check.
func isNodeMetricExpired(lastUpdateTime *metav1.Time, nodeMetricExpirationSeconds int64) bool {
	// Without an update timestamp there is nothing to measure against.
	if lastUpdateTime == nil {
		return true
	}
	// A non-positive expiration means the age of the metric is not checked.
	if nodeMetricExpirationSeconds <= 0 {
		return false
	}
	ttl := time.Duration(nodeMetricExpirationSeconds) * time.Second
	return time.Since(lastUpdateTime.Time) >= ttl
}

func getResourceNames(thresholds ResourceThresholds) []corev1.ResourceName {
names := make([]corev1.ResourceName, 0, len(thresholds))
for resourceName := range thresholds {
Expand Down
Loading