From 7601ae40c04bae5850bd8a3e13a8c166a4069ba8 Mon Sep 17 00:00:00 2001 From: Joseph Date: Fri, 3 Nov 2023 14:56:30 +0800 Subject: [PATCH] scheduler: numa-aware scheduling supports selecting numa node by score (#1726) Signed-off-by: Joseph --- pkg/scheduler/apis/config/types.go | 10 +- pkg/scheduler/apis/config/v1beta2/defaults.go | 15 ++ pkg/scheduler/apis/config/v1beta2/types.go | 12 +- .../config/v1beta2/zz_generated.conversion.go | 2 + .../config/v1beta2/zz_generated.deepcopy.go | 5 + .../apis/config/zz_generated.deepcopy.go | 5 + .../frameworkext/topologymanager/policy.go | 26 +- .../policy_best_effort_test.go | 4 +- .../topologymanager/policy_none_test.go | 4 +- .../topologymanager/policy_restricted_test.go | 4 +- .../policy_single_numa_node.go | 4 +- .../policy_single_numa_node_test.go | 2 +- .../plugins/nodenumaresource/plugin.go | 12 +- .../plugins/nodenumaresource/plugin_test.go | 239 +++++++++++++++++- .../nodenumaresource/resource_manager.go | 63 +++-- .../nodenumaresource/resource_manager_test.go | 2 +- .../plugins/nodenumaresource/scoring_test.go | 71 ++---- .../plugins/nodenumaresource/topology_hint.go | 1 + .../plugins/nodenumaresource/util.go | 6 +- 19 files changed, 383 insertions(+), 104 deletions(-) diff --git a/pkg/scheduler/apis/config/types.go b/pkg/scheduler/apis/config/types.go index 20505d510..9ce87b788 100644 --- a/pkg/scheduler/apis/config/types.go +++ b/pkg/scheduler/apis/config/types.go @@ -103,8 +103,14 @@ type ScoringStrategy struct { type NodeNUMAResourceArgs struct { metav1.TypeMeta + // DefaultCPUBindPolicy represents the default CPU bind policy. + // If it is empty and the Pod does not declare a binding policy, + // the core will not be bound to the LSE/LSR type Pod. DefaultCPUBindPolicy CPUBindPolicy - ScoringStrategy *ScoringStrategy + // ScoringStrategy is used to configure the scoring strategy of the Node-level. 
+ ScoringStrategy *ScoringStrategy + // NUMAScoringStrategy is used to configure the scoring strategy of the NUMANode-level + NUMAScoringStrategy *ScoringStrategy } // CPUBindPolicy defines the CPU binding policy @@ -132,7 +138,7 @@ const ( CPUExclusivePolicyNUMANodeLevel CPUExclusivePolicy = extension.CPUExclusivePolicyNUMANodeLevel ) -// NUMAAllocateStrategy indicates how to choose satisfied NUMA Nodes +// NUMAAllocateStrategy indicates how to choose satisfied NUMA Nodes during binding CPUs type NUMAAllocateStrategy = extension.NUMAAllocateStrategy const ( diff --git a/pkg/scheduler/apis/config/v1beta2/defaults.go b/pkg/scheduler/apis/config/v1beta2/defaults.go index d1f3b0df5..e0f0a8ed9 100644 --- a/pkg/scheduler/apis/config/v1beta2/defaults.go +++ b/pkg/scheduler/apis/config/v1beta2/defaults.go @@ -118,6 +118,21 @@ func SetDefaults_NodeNUMAResourceArgs(obj *NodeNUMAResourceArgs) { }, } } + if obj.NUMAScoringStrategy == nil { + obj.NUMAScoringStrategy = &ScoringStrategy{ + Type: LeastAllocated, + Resources: []schedconfigv1beta2.ResourceSpec{ + { + Name: string(corev1.ResourceCPU), + Weight: 1, + }, + { + Name: string(corev1.ResourceMemory), + Weight: 1, + }, + }, + } + } } func SetDefaults_ReservationArgs(obj *ReservationArgs) { diff --git a/pkg/scheduler/apis/config/v1beta2/types.go b/pkg/scheduler/apis/config/v1beta2/types.go index 4f7028167..89897ae62 100644 --- a/pkg/scheduler/apis/config/v1beta2/types.go +++ b/pkg/scheduler/apis/config/v1beta2/types.go @@ -98,8 +98,14 @@ type ScoringStrategy struct { type NodeNUMAResourceArgs struct { metav1.TypeMeta - DefaultCPUBindPolicy *CPUBindPolicy `json:"defaultCPUBindPolicy,omitempty"` - ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"` + // DefaultCPUBindPolicy represents the default CPU bind policy. + // If it is empty and the Pod does not declare a binding policy, + // the core will not be bound to the LSE/LSR type Pod. + DefaultCPUBindPolicy *CPUBindPolicy `json:"defaultCPUBindPolicy,omitempty"` + // ScoringStrategy is used to configure the scoring strategy of the node-level. 
+ ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"` + // NUMAScoringStrategy is used to configure the scoring strategy of the NUMANode-level + NUMAScoringStrategy *ScoringStrategy `json:"numaScoringStrategy,omitempty"` } // CPUBindPolicy defines the CPU binding policy @@ -127,7 +133,7 @@ const ( CPUExclusivePolicyNUMANodeLevel CPUExclusivePolicy = extension.CPUExclusivePolicyNUMANodeLevel ) -// NUMAAllocateStrategy indicates how to choose satisfied NUMA Nodes +// NUMAAllocateStrategy indicates how to choose satisfied NUMA Nodes during binding CPUs type NUMAAllocateStrategy = extension.NUMAAllocateStrategy const ( diff --git a/pkg/scheduler/apis/config/v1beta2/zz_generated.conversion.go b/pkg/scheduler/apis/config/v1beta2/zz_generated.conversion.go index ed76b5596..597459cb1 100644 --- a/pkg/scheduler/apis/config/v1beta2/zz_generated.conversion.go +++ b/pkg/scheduler/apis/config/v1beta2/zz_generated.conversion.go @@ -298,6 +298,7 @@ func autoConvert_v1beta2_NodeNUMAResourceArgs_To_config_NodeNUMAResourceArgs(in return err } out.ScoringStrategy = (*config.ScoringStrategy)(unsafe.Pointer(in.ScoringStrategy)) + out.NUMAScoringStrategy = (*config.ScoringStrategy)(unsafe.Pointer(in.NUMAScoringStrategy)) return nil } @@ -311,6 +312,7 @@ func autoConvert_config_NodeNUMAResourceArgs_To_v1beta2_NodeNUMAResourceArgs(in return err } out.ScoringStrategy = (*ScoringStrategy)(unsafe.Pointer(in.ScoringStrategy)) + out.NUMAScoringStrategy = (*ScoringStrategy)(unsafe.Pointer(in.NUMAScoringStrategy)) return nil } diff --git a/pkg/scheduler/apis/config/v1beta2/zz_generated.deepcopy.go b/pkg/scheduler/apis/config/v1beta2/zz_generated.deepcopy.go index 458398baa..c467279b6 100644 --- a/pkg/scheduler/apis/config/v1beta2/zz_generated.deepcopy.go +++ b/pkg/scheduler/apis/config/v1beta2/zz_generated.deepcopy.go @@ -277,6 +277,11 @@ func (in *NodeNUMAResourceArgs) DeepCopyInto(out *NodeNUMAResourceArgs) { *out = new(ScoringStrategy) (*in).DeepCopyInto(*out) } + if in.NUMAScoringStrategy != nil { + in, out := &in.NUMAScoringStrategy, &out.NUMAScoringStrategy + *out = new(ScoringStrategy) + (*in).DeepCopyInto(*out) + } return } diff --git a/pkg/scheduler/apis/config/zz_generated.deepcopy.go b/pkg/scheduler/apis/config/zz_generated.deepcopy.go index 5e015e78e..b0cdf0303 100644 --- a/pkg/scheduler/apis/config/zz_generated.deepcopy.go +++ b/pkg/scheduler/apis/config/zz_generated.deepcopy.go @@ -254,6 +254,11 @@ func (in *NodeNUMAResourceArgs) DeepCopyInto(out *NodeNUMAResourceArgs) { *out = new(ScoringStrategy) (*in).DeepCopyInto(*out) } + if in.NUMAScoringStrategy != nil { + in, out := &in.NUMAScoringStrategy, &out.NUMAScoringStrategy + *out = new(ScoringStrategy) + (*in).DeepCopyInto(*out) + } return } diff --git a/pkg/scheduler/frameworkext/topologymanager/policy.go b/pkg/scheduler/frameworkext/topologymanager/policy.go index 3d0592112..ac1e2f763 100644 --- a/pkg/scheduler/frameworkext/topologymanager/policy.go +++ b/pkg/scheduler/frameworkext/topologymanager/policy.go @@ -36,6 +36,9 @@ type NUMATopologyHint struct { // Preferred is set to true when the NUMANodeAffinity encodes a preferred // allocation for the Pod. It is set to false otherwise. Preferred bool + // Score is the weight of this hint. For the same Affinity, + // the one with higher weight will be used first. + Score int64 } // IsEqual checks if NUMATopologyHint are equal @@ -85,7 +88,7 @@ func mergePermutation(numaNodes []int, permutation []NUMATopologyHint) NUMATopol mergedAffinity := bitmask.And(defaultAffinity, numaAffinities...) 
// Build a mergedHint from the merged affinity mask, indicating if an // preferred allocation was used to generate the affinity mask or not. - return NUMATopologyHint{mergedAffinity, preferred} + return NUMATopologyHint{mergedAffinity, preferred, 0} } func filterProvidersHints(providersHints []map[string][]NUMATopologyHint) [][]NUMATopologyHint { @@ -97,7 +100,7 @@ func filterProvidersHints(providersHints []map[string][]NUMATopologyHint) [][]NU // If hints is nil, insert a single, preferred any-numa hint into allProviderHints. if len(hints) == 0 { klog.V(5).Infof("[topologymanager] Hint Provider has no preference for NUMA affinity with any resource") - allProviderHints = append(allProviderHints, []NUMATopologyHint{{nil, true}}) + allProviderHints = append(allProviderHints, []NUMATopologyHint{{nil, true, 0}}) continue } @@ -105,13 +108,13 @@ func filterProvidersHints(providersHints []map[string][]NUMATopologyHint) [][]NU for resource := range hints { if hints[resource] == nil { klog.V(5).Infof("[topologymanager] Hint Provider has no preference for NUMA affinity with resource '%s'", resource) - allProviderHints = append(allProviderHints, []NUMATopologyHint{{nil, true}}) + allProviderHints = append(allProviderHints, []NUMATopologyHint{{nil, true, 0}}) continue } if len(hints[resource]) == 0 { klog.V(5).Infof("[topologymanager] Hint Provider has no possible NUMA affinities for resource '%s'", resource) - allProviderHints = append(allProviderHints, []NUMATopologyHint{{nil, false}}) + allProviderHints = append(allProviderHints, []NUMATopologyHint{{nil, false, 0}}) continue } @@ -129,7 +132,7 @@ func mergeFilteredHints(numaNodes []int, filteredHints [][]NUMATopologyHint) NUM // Set the bestHint to return from this function as {nil false}. // This will only be returned if no better hint can be found when // merging hints from each hint provider. - bestHint := NUMATopologyHint{defaultAffinity, false} + bestHint := NUMATopologyHint{defaultAffinity, false, 0} iterateAllProviderTopologyHints(filteredHints, func(permutation []NUMATopologyHint) { // Get the NUMANodeAffinity from each hint in the permutation and see if any // of them encode unpreferred allocations. @@ -140,6 +143,14 @@ func mergeFilteredHints(numaNodes []int, filteredHints [][]NUMATopologyHint) NUM return } + for _, v := range permutation { + if v.NUMANodeAffinity != nil && mergedHint.NUMANodeAffinity.IsEqual(v.NUMANodeAffinity) { + if v.Score > mergedHint.Score { + mergedHint.Score = v.Score + } + } + } + // If the current bestHint is non-preferred and the new mergedHint is // preferred, always choose the preferred hint over the non-preferred one. if mergedHint.Preferred && !bestHint.Preferred { @@ -158,6 +169,11 @@ func mergeFilteredHints(numaNodes []int, filteredHints [][]NUMATopologyHint) NUM // mergedHints that have a narrower NUMANodeAffinity than the // NUMANodeAffinity in the current bestHint. 
if !mergedHint.NUMANodeAffinity.IsNarrowerThan(bestHint.NUMANodeAffinity) { + if mergedHint.NUMANodeAffinity.Count() == bestHint.NUMANodeAffinity.Count() { + if mergedHint.Score > bestHint.Score { + bestHint = mergedHint + } + } return } diff --git a/pkg/scheduler/frameworkext/topologymanager/policy_best_effort_test.go b/pkg/scheduler/frameworkext/topologymanager/policy_best_effort_test.go index 8d9ed7886..260c573a1 100644 --- a/pkg/scheduler/frameworkext/topologymanager/policy_best_effort_test.go +++ b/pkg/scheduler/frameworkext/topologymanager/policy_best_effort_test.go @@ -29,12 +29,12 @@ func TestPolicyBestEffortCanAdmitPodResult(t *testing.T) { }{ { name: "Preferred is set to false in topology hints", - hint: NUMATopologyHint{nil, false}, + hint: NUMATopologyHint{nil, false, 0}, expected: true, }, { name: "Preferred is set to true in topology hints", - hint: NUMATopologyHint{nil, true}, + hint: NUMATopologyHint{nil, true, 0}, expected: true, }, } diff --git a/pkg/scheduler/frameworkext/topologymanager/policy_none_test.go b/pkg/scheduler/frameworkext/topologymanager/policy_none_test.go index e030631ba..33482427d 100644 --- a/pkg/scheduler/frameworkext/topologymanager/policy_none_test.go +++ b/pkg/scheduler/frameworkext/topologymanager/policy_none_test.go @@ -47,12 +47,12 @@ func TestPolicyNoneCanAdmitPodResult(t *testing.T) { }{ { name: "Preferred is set to false in topology hints", - hint: NUMATopologyHint{nil, false}, + hint: NUMATopologyHint{nil, false, 0}, expected: true, }, { name: "Preferred is set to true in topology hints", - hint: NUMATopologyHint{nil, true}, + hint: NUMATopologyHint{nil, true, 0}, expected: true, }, } diff --git a/pkg/scheduler/frameworkext/topologymanager/policy_restricted_test.go b/pkg/scheduler/frameworkext/topologymanager/policy_restricted_test.go index 6c5a6de86..f5eef4953 100644 --- a/pkg/scheduler/frameworkext/topologymanager/policy_restricted_test.go +++ b/pkg/scheduler/frameworkext/topologymanager/policy_restricted_test.go @@ -47,12 +47,12 @@ func TestPolicyRestrictedCanAdmitPodResult(t *testing.T) { }{ { name: "Preferred is set to false in topology hints", - hint: NUMATopologyHint{nil, false}, + hint: NUMATopologyHint{nil, false, 0}, expected: false, }, { name: "Preferred is set to true in topology hints", - hint: NUMATopologyHint{nil, true}, + hint: NUMATopologyHint{nil, true, 0}, expected: true, }, } diff --git a/pkg/scheduler/frameworkext/topologymanager/policy_single_numa_node.go b/pkg/scheduler/frameworkext/topologymanager/policy_single_numa_node.go index 00a57be30..7c8e853af 100644 --- a/pkg/scheduler/frameworkext/topologymanager/policy_single_numa_node.go +++ b/pkg/scheduler/frameworkext/topologymanager/policy_single_numa_node.go @@ -64,13 +64,13 @@ func filterSingleNumaHints(allResourcesHints [][]NUMATopologyHint) [][]NUMATopol func (p *singleNumaNodePolicy) Merge(providersHints []map[string][]NUMATopologyHint) (NUMATopologyHint, bool) { filteredHints := filterProvidersHints(providersHints) - // Filter to only include don't cares and hints with a single NUMA node. + // Filter to only include don't care and hints with a single NUMA node. singleNumaHints := filterSingleNumaHints(filteredHints) bestHint := mergeFilteredHints(p.numaNodes, singleNumaHints) defaultAffinity, _ := bitmask.NewBitMask(p.numaNodes...) 
if bestHint.NUMANodeAffinity.IsEqual(defaultAffinity) { - bestHint = NUMATopologyHint{nil, bestHint.Preferred} + bestHint = NUMATopologyHint{nil, bestHint.Preferred, 0} } admit := p.canAdmitPodResult(&bestHint) diff --git a/pkg/scheduler/frameworkext/topologymanager/policy_single_numa_node_test.go b/pkg/scheduler/frameworkext/topologymanager/policy_single_numa_node_test.go index 671423bbb..a87c7123d 100644 --- a/pkg/scheduler/frameworkext/topologymanager/policy_single_numa_node_test.go +++ b/pkg/scheduler/frameworkext/topologymanager/policy_single_numa_node_test.go @@ -30,7 +30,7 @@ func TestPolicySingleNumaNodeCanAdmitPodResult(t *testing.T) { }{ { name: "Preferred is set to false in topology hints", - hint: NUMATopologyHint{nil, false}, + hint: NUMATopologyHint{nil, false, 0}, expected: false, }, } diff --git a/pkg/scheduler/plugins/nodenumaresource/plugin.go b/pkg/scheduler/plugins/nodenumaresource/plugin.go index 10e87e54b..27a84ac63 100644 --- a/pkg/scheduler/plugins/nodenumaresource/plugin.go +++ b/pkg/scheduler/plugins/nodenumaresource/plugin.go @@ -74,6 +74,7 @@ type Plugin struct { pluginArgs *schedulingconfig.NodeNUMAResourceArgs nrtLister topologylister.NodeResourceTopologyLister scorer *resourceAllocationScorer + numaScorer *resourceAllocationScorer resourceManager ResourceManager topologyOptionsManager TopologyOptionsManager @@ -114,6 +115,14 @@ func NewWithOptions(args runtime.Object, handle framework.Handle, opts ...Option if !exists { return nil, fmt.Errorf("scoring strategy %s is not supported", strategy) } + scorer := scorePlugin(pluginArgs) + + strategy = pluginArgs.NUMAScoringStrategy.Type + scorePlugin, exists = resourceStrategyTypeMap[strategy] + if !exists { + return nil, fmt.Errorf("numa scoring strategy %s is not supported", strategy) + } + numaScorer := scorePlugin(pluginArgs) options := &pluginOptions{} for _, optFnc := range opts { @@ -144,7 +153,8 @@ func NewWithOptions(args runtime.Object, handle framework.Handle, opts ...Option handle: handle, pluginArgs: pluginArgs, nrtLister: nrtLister, - scorer: scorePlugin(pluginArgs), + scorer: scorer, + numaScorer: numaScorer, resourceManager: options.resourceManager, topologyOptionsManager: options.topologyOptionsManager, }, nil diff --git a/pkg/scheduler/plugins/nodenumaresource/plugin_test.go b/pkg/scheduler/plugins/nodenumaresource/plugin_test.go index 00f60165d..768d471b3 100644 --- a/pkg/scheduler/plugins/nodenumaresource/plugin_test.go +++ b/pkg/scheduler/plugins/nodenumaresource/plugin_test.go @@ -33,11 +33,12 @@ import ( "k8s.io/client-go/informers" kubefake "k8s.io/client-go/kubernetes/fake" apiresource "k8s.io/kubernetes/pkg/api/v1/resource" + k8sschedconfig "k8s.io/kubernetes/pkg/scheduler/apis/config" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" "k8s.io/kubernetes/pkg/scheduler/framework/runtime" - schedulertesting "k8s.io/kubernetes/pkg/scheduler/testing" + st "k8s.io/kubernetes/pkg/scheduler/testing" "k8s.io/utils/pointer" "github.com/koordinator-sh/koordinator/apis/extension" @@ -49,6 +50,7 @@ import ( "github.com/koordinator-sh/koordinator/pkg/scheduler/apis/config/v1beta2" "github.com/koordinator-sh/koordinator/pkg/scheduler/frameworkext" "github.com/koordinator-sh/koordinator/pkg/scheduler/frameworkext/topologymanager" + "github.com/koordinator-sh/koordinator/pkg/util/bitmask" "github.com/koordinator-sh/koordinator/pkg/util/cpuset" ) @@ -109,13 +111,13 @@ func (f 
*testSharedLister) Get(nodeName string) (*framework.NodeInfo, error) { } func makeNode(name string, capacity map[corev1.ResourceName]string, cpuAmpRatio extension.Ratio) *corev1.Node { - node := schedulertesting.MakeNode().Name(name).Capacity(capacity).Obj() + node := st.MakeNode().Name(name).Capacity(capacity).Obj() _, _ = extension.SetNodeResourceAmplificationRatio(node, corev1.ResourceCPU, cpuAmpRatio) return node } func makePodOnNode(request map[corev1.ResourceName]string, node string, isCPUSet bool) *corev1.Pod { - pod := schedulertesting.MakePod().Req(request).Node(node).Priority(extension.PriorityProdValueMax).Obj() + pod := st.MakePod().Req(request).Node(node).Priority(extension.PriorityProdValueMax).Obj() if isCPUSet { pod.Labels = map[string]string{ extension.LabelPodQoS: string(extension.QoSLSR), @@ -171,9 +173,9 @@ func newPluginTestSuit(t *testing.T, pods []*corev1.Pod, nodes []*corev1.Node) * }) }) - registeredPlugins := []schedulertesting.RegisterPluginFunc{ - schedulertesting.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - schedulertesting.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + registeredPlugins := []st.RegisterPluginFunc{ + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), } cs := kubefake.NewSimpleClientset() @@ -183,7 +185,7 @@ func newPluginTestSuit(t *testing.T, pods []*corev1.Pod, nodes []*corev1.Node) * } informerFactory := informers.NewSharedInformerFactory(cs, 0) snapshot := newTestSharedLister(pods, nodes) - fh, err := schedulertesting.NewFramework( + fh, err := st.NewFramework( registeredPlugins, "koord-scheduler", runtime.WithClientSet(cs), @@ -1521,3 +1523,226 @@ func Test_appendResourceSpecIfMissed(t *testing.T) { }) } } + +func TestFilterWithNUMANodeScoring(t *testing.T) { + mostAllocatedStrategy := &schedulingconfig.ScoringStrategy{ + Type: schedulingconfig.MostAllocated, + Resources: []k8sschedconfig.ResourceSpec{ + { + Name: string(corev1.ResourceCPU), + Weight: 1, + }, + { + Name: string(corev1.ResourceMemory), + Weight: 1, + }, + }, + } + leastAllocatedStrategy := &schedulingconfig.ScoringStrategy{ + Type: schedulingconfig.LeastAllocated, + Resources: []k8sschedconfig.ResourceSpec{ + { + Name: string(corev1.ResourceCPU), + Weight: 1, + }, + { + Name: string(corev1.ResourceMemory), + Weight: 1, + }, + }, + } + tests := []struct { + name string + node *corev1.Node + numaNodeCount int + requestedPod *corev1.Pod + existingPods map[int][]*corev1.Pod + numaScoringStrategy *schedulingconfig.ScoringStrategy + wantAffinity bitmask.BitMask + }{ + { + name: "single numa nodes and select most allocated", + node: st.MakeNode().Name("test-node-1"). + Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). + Label(extension.LabelNUMATopologyPolicy, string(extension.NUMATopologyPolicySingleNUMANode)). + Obj(), + numaNodeCount: 2, + requestedPod: st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "40Gi"}).Obj(), + existingPods: map[int][]*corev1.Pod{ + 0: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "8Gi"}).Obj(), + }, + 1: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "40", "memory": "8Gi"}).Obj(), + }, + }, + numaScoringStrategy: mostAllocatedStrategy, + wantAffinity: func() bitmask.BitMask { + mask, _ := bitmask.NewBitMask(1) + return mask + }(), + }, + { + name: "single numa nodes and select least allocated", + node: st.MakeNode().Name("test-node-1"). 
+ Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). + Label(extension.LabelNUMATopologyPolicy, string(extension.NUMATopologyPolicySingleNUMANode)). + Obj(), + numaNodeCount: 2, + requestedPod: st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "40Gi"}).Obj(), + existingPods: map[int][]*corev1.Pod{ + 0: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "8Gi"}).Obj(), + }, + 1: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "40", "memory": "8Gi"}).Obj(), + }, + }, + numaScoringStrategy: leastAllocatedStrategy, + wantAffinity: func() bitmask.BitMask { + mask, _ := bitmask.NewBitMask(0) + return mask + }(), + }, + { + name: "single numa nodes and only one node can be used", + node: st.MakeNode().Name("test-node-1"). + Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). + Label(extension.LabelNUMATopologyPolicy, string(extension.NUMATopologyPolicySingleNUMANode)). + Obj(), + numaNodeCount: 2, + requestedPod: st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "40Gi"}).Obj(), + existingPods: map[int][]*corev1.Pod{ + 0: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "8Gi"}).Obj(), + }, + 1: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "52", "memory": "8Gi"}).Obj(), + }, + }, + numaScoringStrategy: leastAllocatedStrategy, + wantAffinity: func() bitmask.BitMask { + mask, _ := bitmask.NewBitMask(0) + return mask + }(), + }, + { + name: "restricted numa nodes and select most allocated and preferred", + node: st.MakeNode().Name("test-node-1"). + Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). + Label(extension.LabelNUMATopologyPolicy, string(extension.NUMATopologyPolicyRestricted)). + Obj(), + numaNodeCount: 4, + requestedPod: st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "40Gi"}).Obj(), + existingPods: map[int][]*corev1.Pod{ + 0: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "24", "memory": "8Gi"}).Obj(), + }, + 1: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "23", "memory": "8Gi"}).Obj(), + }, + 2: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "8Gi"}).Obj(), + }, + 3: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "8", "memory": "8Gi"}).Obj(), + }, + }, + numaScoringStrategy: mostAllocatedStrategy, + wantAffinity: func() bitmask.BitMask { + mask, _ := bitmask.NewBitMask(3) + return mask + }(), + }, + { + name: "restricted numa nodes and select least allocated and preferred", + node: st.MakeNode().Name("test-node-1"). + Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). + Label(extension.LabelNUMATopologyPolicy, string(extension.NUMATopologyPolicyRestricted)). 
+ Obj(), + numaNodeCount: 4, + requestedPod: st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "40Gi"}).Obj(), + existingPods: map[int][]*corev1.Pod{ + 0: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "24", "memory": "8Gi"}).Obj(), + }, + 1: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "23", "memory": "8Gi"}).Obj(), + }, + 2: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "4", "memory": "8Gi"}).Obj(), + }, + 3: { + st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "8", "memory": "8Gi"}).Obj(), + }, + }, + numaScoringStrategy: leastAllocatedStrategy, + wantAffinity: func() bitmask.BitMask { + mask, _ := bitmask.NewBitMask(2) + return mask + }(), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + suit := newPluginTestSuit(t, nil, []*corev1.Node{tt.node}) + if tt.numaScoringStrategy != nil { + suit.nodeNUMAResourceArgs.NUMAScoringStrategy = tt.numaScoringStrategy + } + p, err := suit.proxyNew(suit.nodeNUMAResourceArgs, suit.Handle) + assert.NoError(t, err) + pl := p.(*Plugin) + + if tt.numaNodeCount > 0 { + numaNodeResource := corev1.ResourceList{} + for resourceName, quantity := range tt.node.Status.Allocatable { + if resourceName == corev1.ResourceCPU { + numaNodeResource[resourceName] = *resource.NewMilliQuantity(quantity.MilliValue()/int64(tt.numaNodeCount), resource.DecimalSI) + } else if resourceName == corev1.ResourceMemory { + numaNodeResource[resourceName] = *resource.NewQuantity(quantity.Value()/int64(tt.numaNodeCount), resource.BinarySI) + } else { + numaNodeResource[resourceName] = *resource.NewQuantity(quantity.Value()/int64(tt.numaNodeCount), resource.DecimalSI) + } + } + pl.topologyOptionsManager.UpdateTopologyOptions(tt.node.Name, func(options *TopologyOptions) { + cores := tt.node.Status.Allocatable.Cpu().MilliValue() / 1000 / 2 / int64(tt.numaNodeCount) + options.CPUTopology = buildCPUTopologyForTest(tt.numaNodeCount, 1, int(cores), 2) + for i := 0; i < tt.numaNodeCount; i++ { + options.NUMANodeResources = append(options.NUMANodeResources, NUMANodeResource{ + Node: i, + Resources: numaNodeResource.DeepCopy(), + }) + } + }) + } + for numaNode, pods := range tt.existingPods { + for _, v := range pods { + id := uuid.NewUUID() + pl.resourceManager.Update(tt.node.Name, &PodAllocation{ + UID: id, + Namespace: "default", + Name: string(id), + NUMANodeResources: []NUMANodeResource{ + { + Node: numaNode, + Resources: v.Spec.Containers[0].Resources.Requests, + }, + }, + }) + } + } + + cycleState := framework.NewCycleState() + _, status := pl.PreFilter(context.TODO(), cycleState, tt.requestedPod) + assert.True(t, status.IsSuccess()) + + nodeInfo, err := suit.Handle.SnapshotSharedLister().NodeInfos().Get(tt.node.Name) + assert.NoError(t, err) + status = pl.Filter(context.TODO(), cycleState, tt.requestedPod, nodeInfo) + assert.True(t, status.IsSuccess()) + hint := topologymanager.GetStore(cycleState).GetAffinity(tt.node.Name) + assert.Equal(t, tt.wantAffinity.GetBits(), hint.NUMANodeAffinity.GetBits()) + }) + } +} diff --git a/pkg/scheduler/plugins/nodenumaresource/resource_manager.go b/pkg/scheduler/plugins/nodenumaresource/resource_manager.go index edb0dff68..54cfff32d 100644 --- a/pkg/scheduler/plugins/nodenumaresource/resource_manager.go +++ b/pkg/scheduler/plugins/nodenumaresource/resource_manager.go @@ -31,6 +31,7 @@ import ( schedulingconfig "github.com/koordinator-sh/koordinator/pkg/scheduler/apis/config" "github.com/koordinator-sh/koordinator/pkg/scheduler/frameworkext/topologymanager" + 
"github.com/koordinator-sh/koordinator/pkg/util" "github.com/koordinator-sh/koordinator/pkg/util/bitmask" "github.com/koordinator-sh/koordinator/pkg/util/cpuset" ) @@ -59,6 +60,7 @@ type ResourceOptions struct { reusableResources map[int]corev1.ResourceList hint topologymanager.NUMATopologyHint topologyOptions TopologyOptions + numaScorer *resourceAllocationScorer } type resourceManager struct { @@ -127,11 +129,7 @@ func (c *resourceManager) GetTopologyHints(node *corev1.Node, pod *corev1.Pod, o return nil, err } - nodes := make([]int, 0, len(topologyOptions.NUMANodeResources)) - for _, v := range topologyOptions.NUMANodeResources { - nodes = append(nodes, v.Node) - } - result := generateResourceHints(nodes, options.requests, totalAvailable) + result := generateResourceHints(topologyOptions.NUMANodeResources, options.requests, totalAvailable, options.numaScorer) hints := make(map[string][]topologymanager.NUMATopologyHint) for k, v := range result { hints[k] = v @@ -381,29 +379,45 @@ func (c *resourceManager) getAvailableNUMANodeResources(nodeName string, topolog return totalAvailable, totalAllocated, nil } -func generateResourceHints(numaNodes []int, podRequests corev1.ResourceList, totalAvailable map[int]corev1.ResourceList) map[string][]topologymanager.NUMATopologyHint { +func generateResourceHints(numaNodeResources []NUMANodeResource, podRequests corev1.ResourceList, totalAvailable map[int]corev1.ResourceList, numaScorer *resourceAllocationScorer) map[string][]topologymanager.NUMATopologyHint { // Initialize minAffinitySize to include all NUMA Cells. - minAffinitySize := len(numaNodes) + minAffinitySizeMap := map[corev1.ResourceName]*int{} + for resourceName := range podRequests { + size := len(numaNodeResources) + minAffinitySizeMap[resourceName] = &size + } hints := map[string][]topologymanager.NUMATopologyHint{} + + numaNodes := make([]int, 0, len(numaNodeResources)) + for _, v := range numaNodeResources { + numaNodes = append(numaNodes, v.Node) + } + + podRequestResources := framework.NewResource(podRequests) bitmask.IterateBitMasks(numaNodes, func(mask bitmask.BitMask) { maskBits := mask.GetBits() - available := make(corev1.ResourceList) + total := make(corev1.ResourceList) for _, nodeID := range maskBits { - available = quotav1.Add(available, totalAvailable[nodeID]) - } - if satisfied, _ := quotav1.LessThanOrEqual(podRequests, available); !satisfied { - return + util.AddResourceList(available, totalAvailable[nodeID]) + for _, v := range numaNodeResources { + if v.Node == nodeID { + util.AddResourceList(total, v.Resources) + break + } + } } - // set the minimum amount of NUMA nodes that can satisfy the resources requests - if mask.Count() < minAffinitySize { - minAffinitySize = mask.Count() + var score int64 + if numaScorer != nil { + requested := quotav1.SubtractWithNonNegativeResult(total, available) + score, _ = numaScorer.score(framework.NewResource(requested), framework.NewResource(total), podRequestResources) } - for resourceName := range podRequests { - if _, ok := available[resourceName]; !ok { + for resourceName, request := range podRequests { + minAffinitySize := minAffinitySizeMap[resourceName] + if !shouldGenerateHint(total[resourceName], available[resourceName], request, mask.Count(), minAffinitySize) { continue } if _, ok := hints[string(resourceName)]; !ok { @@ -412,6 +426,7 @@ func generateResourceHints(numaNodes []int, podRequests corev1.ResourceList, tot hints[string(resourceName)] = append(hints[string(resourceName)], topologymanager.NUMATopologyHint{ 
NUMANodeAffinity: mask, Preferred: false, + Score: score, }) } }) @@ -419,6 +434,7 @@ func generateResourceHints(numaNodes []int, podRequests corev1.ResourceList, tot // update hints preferred according to multiNUMAGroups, in case when it wasn't provided, the default // behavior to prefer the minimal amount of NUMA nodes will be used for resourceName := range podRequests { + minAffinitySize := *minAffinitySizeMap[resourceName] for i, hint := range hints[string(resourceName)] { hints[string(resourceName)][i].Preferred = len(hint.NUMANodeAffinity.GetBits()) == minAffinitySize } @@ -427,6 +443,19 @@ func generateResourceHints(numaNodes []int, podRequests corev1.ResourceList, tot return hints } +func shouldGenerateHint(total resource.Quantity, available resource.Quantity, request resource.Quantity, nodeCount int, minAffinitySize *int) bool { + if total.Cmp(request) < 0 { + return false + } + if nodeCount < *minAffinitySize { + *minAffinitySize = nodeCount + } + if available.Cmp(request) < 0 { + return false + } + return true +} + func filterAvailableCPUsByRequiredCPUBindPolicy(policy schedulingconfig.CPUBindPolicy, availableCPUs cpuset.CPUSet, cpuDetails CPUDetails, cpusPerCore int) cpuset.CPUSet { if policy == schedulingconfig.CPUBindPolicyFullPCPUs { cpuDetails.KeepOnly(availableCPUs) diff --git a/pkg/scheduler/plugins/nodenumaresource/resource_manager_test.go b/pkg/scheduler/plugins/nodenumaresource/resource_manager_test.go index 01675b023..582c09398 100644 --- a/pkg/scheduler/plugins/nodenumaresource/resource_manager_test.go +++ b/pkg/scheduler/plugins/nodenumaresource/resource_manager_test.go @@ -928,7 +928,7 @@ func TestResourceManagerGetTopologyHint(t *testing.T) { mask, _ := bitmask.NewBitMask(0, 1) return mask }(), - Preferred: true, + Preferred: false, }, }, }, diff --git a/pkg/scheduler/plugins/nodenumaresource/scoring_test.go b/pkg/scheduler/plugins/nodenumaresource/scoring_test.go index 183ef5db1..c965a529c 100644 --- a/pkg/scheduler/plugins/nodenumaresource/scoring_test.go +++ b/pkg/scheduler/plugins/nodenumaresource/scoring_test.go @@ -111,7 +111,7 @@ func TestNUMANodeScore(t *testing.T) { "test-node-1": 2, "test-node-2": 1, }, - requestedPod: st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "54", "memory": "40Gi"}).Obj(), + requestedPod: st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "50", "memory": "40Gi"}).Obj(), strategy: &schedulerconfig.ScoringStrategy{ Type: schedulerconfig.MostAllocated, Resources: []config.ResourceSpec{ @@ -128,52 +128,11 @@ func TestNUMANodeScore(t *testing.T) { expectedScores: []framework.NodeScore{ { Name: "test-node-1", - Score: 33, + Score: 63, }, { Name: "test-node-2", - Score: 57, - }, - }, - }, - { - name: "restricted numa nodes score with same capacity", - nodes: []*corev1.Node{ - st.MakeNode().Name("test-node-1"). - Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). - Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicyRestricted)). - Obj(), - st.MakeNode().Name("test-node-2"). - Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). - Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicyRestricted)). 
- Obj(), - }, - numaNodeCounts: map[string]int{ - "test-node-1": 2, - "test-node-2": 2, - }, - requestedPod: st.MakePod().Req(map[corev1.ResourceName]string{"cpu": "54", "memory": "40Gi"}).Obj(), - strategy: &schedulerconfig.ScoringStrategy{ - Type: schedulerconfig.MostAllocated, - Resources: []config.ResourceSpec{ - { - Name: string(corev1.ResourceCPU), - Weight: 1, - }, - { - Name: string(corev1.ResourceMemory), - Weight: 1, - }, - }, - }, - expectedScores: []framework.NodeScore{ - { - Name: "test-node-1", - Score: 33, - }, - { - Name: "test-node-2", - Score: 33, + Score: 54, }, }, }, @@ -182,15 +141,15 @@ func TestNUMANodeScore(t *testing.T) { nodes: []*corev1.Node{ st.MakeNode().Name("test-node-1"). Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). - Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicyRestricted)). + Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicySingleNUMANode)). Obj(), st.MakeNode().Name("test-node-2"). Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). - Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicyRestricted)). + Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicySingleNUMANode)). Obj(), st.MakeNode().Name("test-node-3"). Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). - Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicyRestricted)). + Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicySingleNUMANode)). Obj(), }, numaNodeCounts: map[string]int{ @@ -220,15 +179,15 @@ func TestNUMANodeScore(t *testing.T) { expectedScores: []framework.NodeScore{ { Name: "test-node-1", - Score: 26, + Score: 19, }, { Name: "test-node-2", - Score: 39, + Score: 19, }, { Name: "test-node-3", - Score: 65, + Score: 19, }, }, }, @@ -237,15 +196,15 @@ func TestNUMANodeScore(t *testing.T) { nodes: []*corev1.Node{ st.MakeNode().Name("test-node-1"). Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). - Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicyRestricted)). + Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicySingleNUMANode)). Obj(), st.MakeNode().Name("test-node-2"). Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). - Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicyRestricted)). + Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicySingleNUMANode)). Obj(), st.MakeNode().Name("test-node-3"). Capacity(map[corev1.ResourceName]string{"cpu": "104", "memory": "256Gi"}). - Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicyRestricted)). + Label(apiext.LabelNUMATopologyPolicy, string(apiext.NUMATopologyPolicySingleNUMANode)). Obj(), }, numaNodeCounts: map[string]int{ @@ -285,15 +244,15 @@ func TestNUMANodeScore(t *testing.T) { expectedScores: []framework.NodeScore{ { Name: "test-node-1", - Score: 29, + Score: 23, }, { Name: "test-node-2", - Score: 52, + Score: 27, }, { Name: "test-node-3", - Score: 65, + Score: 34, }, }, }, diff --git a/pkg/scheduler/plugins/nodenumaresource/topology_hint.go b/pkg/scheduler/plugins/nodenumaresource/topology_hint.go index ae4958eda..c25d321b7 100644 --- a/pkg/scheduler/plugins/nodenumaresource/topology_hint.go +++ b/pkg/scheduler/plugins/nodenumaresource/topology_hint.go @@ -54,6 +54,7 @@ func (p *Plugin) GetPodTopologyHints(ctx context.Context, cycleState *framework. 
if err != nil { return nil, framework.AsStatus(err) } + resourceOptions.numaScorer = p.numaScorer hints, err := p.resourceManager.GetTopologyHints(node, pod, resourceOptions) if err != nil { return nil, framework.NewStatus(framework.Unschedulable, "node(s) Insufficient NUMA Node resources") diff --git a/pkg/scheduler/plugins/nodenumaresource/util.go b/pkg/scheduler/plugins/nodenumaresource/util.go index a2f7e446b..b4eccf845 100644 --- a/pkg/scheduler/plugins/nodenumaresource/util.go +++ b/pkg/scheduler/plugins/nodenumaresource/util.go @@ -24,9 +24,9 @@ import ( ) func GetDefaultNUMAAllocateStrategy(pluginArgs *schedulingconfig.NodeNUMAResourceArgs) schedulingconfig.NUMAAllocateStrategy { - numaAllocateStrategy := schedulingconfig.NUMAMostAllocated - if pluginArgs != nil && pluginArgs.ScoringStrategy != nil && pluginArgs.ScoringStrategy.Type == schedulingconfig.LeastAllocated { - numaAllocateStrategy = schedulingconfig.NUMALeastAllocated + numaAllocateStrategy := schedulingconfig.NUMALeastAllocated + if pluginArgs != nil && pluginArgs.NUMAScoringStrategy != nil && pluginArgs.NUMAScoringStrategy.Type == schedulingconfig.MostAllocated { + numaAllocateStrategy = schedulingconfig.NUMAMostAllocated } return numaAllocateStrategy }
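
A minimal sketch (not part of the patch above) of how the new NUMAScoringStrategy field can be configured alongside the existing node-level ScoringStrategy. It only uses types and functions touched by this change (NodeNUMAResourceArgs, ScoringStrategy, ResourceSpec, GetDefaultNUMAAllocateStrategy); the standalone main package and the choice of MostAllocated at the NUMA level are illustrative assumptions, not settings mandated by the patch.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	k8sschedconfig "k8s.io/kubernetes/pkg/scheduler/apis/config"

	schedulingconfig "github.com/koordinator-sh/koordinator/pkg/scheduler/apis/config"
	"github.com/koordinator-sh/koordinator/pkg/scheduler/plugins/nodenumaresource"
)

func main() {
	// Node-level scoring keeps the LeastAllocated (spreading) behavior, while the
	// NUMA-level strategy is switched to MostAllocated so the plugin bin-packs
	// NUMA nodes. This split is an assumed example configuration.
	args := &schedulingconfig.NodeNUMAResourceArgs{
		ScoringStrategy: &schedulingconfig.ScoringStrategy{
			Type: schedulingconfig.LeastAllocated,
			Resources: []k8sschedconfig.ResourceSpec{
				{Name: string(corev1.ResourceCPU), Weight: 1},
				{Name: string(corev1.ResourceMemory), Weight: 1},
			},
		},
		NUMAScoringStrategy: &schedulingconfig.ScoringStrategy{
			Type: schedulingconfig.MostAllocated,
			Resources: []k8sschedconfig.ResourceSpec{
				{Name: string(corev1.ResourceCPU), Weight: 1},
				{Name: string(corev1.ResourceMemory), Weight: 1},
			},
		},
	}
	// After this patch, the default NUMA allocate strategy follows
	// NUMAScoringStrategy instead of ScoringStrategy, so this prints the
	// MostAllocated direction for CPU allocation.
	fmt.Println(nodenumaresource.GetDefaultNUMAAllocateStrategy(args))
}

The same settings can be expressed declaratively through the scoringStrategy and numaScoringStrategy fields added to the v1beta2 NodeNUMAResourceArgs in this patch; when numaScoringStrategy is omitted, SetDefaults_NodeNUMAResourceArgs falls back to LeastAllocated for CPU and memory with weight 1.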