Skip to content

Commit

Permalink
koordlet: add core sched apis
Browse files Browse the repository at this point in the history
Signed-off-by: saintube <[email protected]>
  • Loading branch information
saintube committed Nov 13, 2023
1 parent f197bbc commit 952d4d9
Show file tree
Hide file tree
Showing 9 changed files with 427 additions and 21 deletions.
27 changes: 27 additions & 0 deletions apis/slo/v1alpha1/nodeslo_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,28 @@ import (
// CPUQOS enables cpu qos features.
// It groups the CPU QoS knobs of a QoS class. Which knob takes effect depends on the cpuPolicy noted on each
// field: GroupIdentity applies when cpuPolicy = "groupIdentity", while SchedIdle and CoreExpeller apply when
// cpuPolicy = "coreSched".
// All fields are optional pointers and are omitted from the JSON form when unset. The validate tags bound
// GroupIdentity to [-1, 2] and SchedIdle to [0, 1].
type CPUQOS struct {
	// group identity value for pods, default = 0
	// NOTE: It takes effect if cpuPolicy = "groupIdentity".
	GroupIdentity *int64 `json:"groupIdentity,omitempty" validate:"omitempty,min=-1,max=2"`
	// cpu.idle value for pods, default = 0.
	// `1` means using SCHED_IDLE.
	// CGroup Idle (introduced since mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
	// NOTE: It takes effect if cpuPolicy = "coreSched".
	SchedIdle *int64 `json:"schedIdle,omitempty" validate:"omitempty,min=0,max=1"`
	// whether pods of the QoS class can expel the cgroup idle pods at the SMT-level. default = false
	// If set to true, pods of this QoS will use a dedicated core sched group for noise clean with the SchedIdle pods.
	// NOTE: It takes effect if cpuPolicy = "coreSched".
	CoreExpeller *bool `json:"coreExpeller,omitempty"`
}

// CPUQOSPolicy names the mechanism that is applied to ensure the CPU QoS.
// The values declared below are CPUQOSPolicyGroupIdentity and CPUQOSPolicyCoreSched.
type CPUQOSPolicy string

const (
	// CPUQOSPolicyGroupIdentity indicates that the Group Identity is applied to ensure the CPU QoS.
	CPUQOSPolicyGroupIdentity CPUQOSPolicy = "groupIdentity"
	// CPUQOSPolicyCoreSched indicates that the Linux Core Scheduling and the CGroup Idle are applied to ensure the
	// CPU QoS.
	CPUQOSPolicyCoreSched CPUQOSPolicy = "coreSched"
)

// MemoryQOS enables memory qos features.
type MemoryQOS struct {
// memcg qos
Expand Down Expand Up @@ -185,7 +204,15 @@ type ResourceQOS struct {
ResctrlQOS *ResctrlQOSCfg `json:"resctrlQOS,omitempty"`
}

// ResourceQOSPolicies selects the policies applied for the pod QoS.
// It currently carries only the CPU QoS policy; the field is optional and omitted from the JSON form when unset.
type ResourceQOSPolicies struct {
	// applied policy for the CPU QoS, default = "groupIdentity"
	CPUPolicy *CPUQOSPolicy `json:"cpuPolicy,omitempty"`
}

type ResourceQOSStrategy struct {
// Policies of pod QoS.
Policies *ResourceQOSPolicies `json:"policies,omitempty"`

// ResourceQOS for LSR pods.
LSRClass *ResourceQOS `json:"lsrClass,omitempty"`

Expand Down
38 changes: 38 additions & 0 deletions apis/slo/v1alpha1/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"encoding/json"

corev1 "k8s.io/api/core/v1"
"k8s.io/utils/pointer"

apiext "github.com/koordinator-sh/koordinator/apis/extension"
)
Expand Down Expand Up @@ -64,3 +65,40 @@ func GetPodMemoryQoSConfig(pod *corev1.Pod) (*PodMemoryQOSConfig, error) {
}
return &cfg, nil
}

const (
	// AnnotationCoreSchedGroupID is the annotation key of the group ID of the Linux Core Scheduling.
	// Value should be a valid UUID or the none value "0".
	// When the value is a valid UUID, pods on the node with that group ID and the same CoreExpelled status will be
	// assigned the same core sched cookie.
	// When the value is the none value "0", the pod will be reset to the default core sched cookie `0`.
	// When the annotation is missing but the node-level strategy enables the core sched, the pod will be assigned an
	// internal group according to the pod's UID.
	//
	// Core Sched: https://docs.kernel.org/admin-guide/hw-vuln/core-scheduling.html
	// When the Core Sched is enabled, pods with different core sched group IDs take different core sched cookies,
	// which means they will not run on the same SMT core at the same time. If a pod disables the core sched, it
	// takes the default core sched cookie (0), so it is still force-idled rather than running on the same SMT core
	// concurrently with the core-sched-enabled pods.
	// NOTE(review): the force-idle sentence is reworded from an ambiguous original; confirm against the kernel doc.
	// In addition, the CoreExpelled status configured in ResourceQOS also gives pods an individual cookie, separate
	// from pods of other QoS classes, by adding a suffix to the group ID. So pods of different QoS classes take
	// different cookies when their CoreExpelled statuses differ, even if their group IDs are the same.
	// NOTE(review): "CoreExpelled" presumably refers to the CoreExpeller field of CPUQOS; confirm.
	AnnotationCoreSchedGroupID = apiext.DomainPrefix + "core-sched-group-id"

	// CoreSchedGroupIDNone is the none value of the core sched group ID, which indicates that the core sched is
	// disabled for the pod. The pod will be reset to the system-default cookie `0`.
	CoreSchedGroupIDNone = "0"
)

// GetCoreSchedGroupID looks up the core sched group ID in the given pod annotations.
// It returns the raw value of the AnnotationCoreSchedGroupID annotation together with a flag that is true when
// the value equals CoreSchedGroupIDNone, i.e. the pod explicitly disables the core sched.
// When the annotation is absent (including a nil annotations map), it returns an empty string and a nil flag.
func GetCoreSchedGroupID(annotations map[string]string) (string, *bool) {
	// Indexing a nil map is safe and simply reports the key as missing, so no separate nil guard is needed.
	if groupID, found := annotations[AnnotationCoreSchedGroupID]; found {
		return groupID, pointer.Bool(groupID == CoreSchedGroupIDNone)
	}
	return "", nil
}
35 changes: 35 additions & 0 deletions apis/slo/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

97 changes: 87 additions & 10 deletions config/crd/bases/slo.koordinator.sh_nodeslos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down Expand Up @@ -387,12 +401,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down Expand Up @@ -598,12 +626,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down Expand Up @@ -809,12 +851,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down Expand Up @@ -948,6 +1004,13 @@ spec:
type: integer
type: object
type: object
policies:
description: Policies of pod QoS.
properties:
cpuPolicy:
description: applied policy for the CPU QoS, default = "groupIdentity"
type: string
type: object
systemClass:
description: ResourceQOS for system pods
properties:
Expand Down Expand Up @@ -1020,12 +1083,26 @@ spec:
cpuQOS:
description: CPUQOSCfg stores node-level config of cpu qos
properties:
coreExpeller:
description: 'whether pods of the QoS class can expel
the cgroup idle pods at the SMT-level. default = false
If set to true, pods of this QoS will use a dedicated
core sched group for noise clean with the SchedIdle
pods. NOTE: It takes effect if cpuPolicy = "coreSched".'
type: boolean
enable:
description: Enable indicates whether the cpu qos is enabled.
type: boolean
groupIdentity:
description: group identity value for pods, default =
0
description: 'group identity value for pods, default =
0 NOTE: It takes effect if cpuPolicy = "groupIdentity".'
format: int64
type: integer
schedIdle:
description: 'cpu.idle value for pods, default = 0. `1`
means using SCHED_IDLE. CGroup Idle (introduced since
mainline Linux 5.15): https://lore.kernel.org/lkml/162971078674.25758.15464079371945307825.tip-bot2@tip-bot2/#r
NOTE: It takes effect if cpuPolicy = "coreSched".'
format: int64
type: integer
type: object
Expand Down
37 changes: 26 additions & 11 deletions pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,31 @@ func (r *bvtRule) getHostQOSBvtValue(qosClass ext.QoSClass) int64 {

func (b *bvtPlugin) parseRule(mergedNodeSLOIf interface{}) (bool, error) {
mergedNodeSLO := mergedNodeSLOIf.(*slov1alpha1.NodeSLOSpec)
qosStrategy := mergedNodeSLO.ResourceQOSStrategy

// check if bvt is enabled
enable := *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.Enable ||
*mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.Enable ||
*mergedNodeSLO.ResourceQOSStrategy.BEClass.CPUQOS.Enable
// default policy enables
isPolicyGroupIdentity := qosStrategy.Policies == nil || qosStrategy.Policies.CPUPolicy == nil ||
len(*qosStrategy.Policies.CPUPolicy) <= 0 || *qosStrategy.Policies.CPUPolicy == slov1alpha1.CPUQOSPolicyGroupIdentity
// check if bvt (group identity) is enabled
lsrEnabled := isPolicyGroupIdentity && *qosStrategy.LSRClass.CPUQOS.Enable
lsEnabled := isPolicyGroupIdentity && *qosStrategy.LSClass.CPUQOS.Enable
beEnabled := isPolicyGroupIdentity && *qosStrategy.BEClass.CPUQOS.Enable

// setting pod rule by qos config
lsrValue := *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.CPUQOS.GroupIdentity
lsValue := *mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.GroupIdentity
beValue := *mergedNodeSLO.ResourceQOSStrategy.BEClass.CPUQOS.GroupIdentity
// Group Identity should be reset if the CPU QOS disables (already merged in states informer) or the CPU QoS policy
// is not "groupIdentity".
lsrValue := *sloconfig.NoneCPUQOS().GroupIdentity
if lsrEnabled {
lsrValue = *qosStrategy.LSRClass.CPUQOS.GroupIdentity
}
lsValue := *sloconfig.NoneCPUQOS().GroupIdentity
if lsEnabled {
lsValue = *qosStrategy.LSClass.CPUQOS.GroupIdentity
}
beValue := *sloconfig.NoneCPUQOS().GroupIdentity
if beEnabled {
beValue = *qosStrategy.BEClass.CPUQOS.GroupIdentity
}

// setting besteffort according to BE
besteffortDirVal := beValue
Expand All @@ -95,18 +110,18 @@ func (b *bvtPlugin) parseRule(mergedNodeSLOIf interface{}) (bool, error) {
burstableDirVal := lsValue
burstablePodVal := lsValue

// NOTICE guaranteed root dir must set as 0 until kernel supported
// NOTE: guaranteed root dir must set as 0 until kernel supported
guaranteedDirVal := *sloconfig.NoneCPUQOS().GroupIdentity
// setting guaranteed pod enabled if LS or LSR enabled
guaranteedPodVal := *sloconfig.NoneCPUQOS().GroupIdentity
if *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.Enable {
if lsrEnabled {
guaranteedPodVal = lsrValue
} else if *mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.Enable {
} else if lsEnabled {
guaranteedPodVal = lsValue
}

newRule := &bvtRule{
enable: enable,
enable: lsrEnabled || lsEnabled || beEnabled,
podQOSParams: map[ext.QoSClass]int64{
ext.QoSLSE: lsrValue,
ext.QoSLSR: lsrValue,
Expand Down
Loading

0 comments on commit 952d4d9

Please sign in to comment.