Skip to content

Commit

Permalink
koordlet: report host application resource usage for calculating batc… (
Browse files Browse the repository at this point in the history
#1733)

Signed-off-by: 佑祎 <[email protected]>
  • Loading branch information
zwzhang0107 authored Nov 8, 2023
1 parent 06eedc1 commit f197bbc
Show file tree
Hide file tree
Showing 29 changed files with 1,720 additions and 124 deletions.
7 changes: 7 additions & 0 deletions pkg/koordlet/metriccache/metric_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ var (
NodeMemoryWithHotPageUsageMetric = defaultMetricFactory.New(NodeMemoryWithHotPageUsage)
PodMemoryWithHotPageUsageMetric = defaultMetricFactory.New(PodMemoryWithHotPageUsage).withPropertySchema(MetricPropertyPodUID)
ContainerMemoryWithHotPageUsageMetric = defaultMetricFactory.New(ContainerMemoryWithHotPageUsage).withPropertySchema(MetricPropertyContainerID)
HostAppMemoryWithHotPageUsageMetric = defaultMetricFactory.New(HostAppMemoryWithHotPageUsage).withPropertySchema(MetricPropertyHostAppName)
NodeMemoryColdPageSizeMetric = defaultMetricFactory.New(NodeMemoryColdPageSize)
PodMemoryColdPageSizeMetric = defaultMetricFactory.New(PodMemoryColdPageSize).withPropertySchema(MetricPropertyPodUID)
HostAppMemoryColdPageSizeMetric = defaultMetricFactory.New(HostAppMemoryColdPageSize).withPropertySchema(MetricPropertyHostAppName)
ContainerMemoryColdPageSizeMetric = defaultMetricFactory.New(ContainerMemoryColdPageSize).withPropertySchema(MetricPropertyContainerID)

// CPI
Expand All @@ -65,4 +67,9 @@ var (

// BE
NodeBEMetric = defaultMetricFactory.New(NodeMetricBE).withPropertySchema(MetricPropertyBEResource, MetricPropertyBEAllocation)

// Host Application
HostAppCPUUsageMetric = defaultMetricFactory.New(HostAppCPUUsage).withPropertySchema(MetricPropertyHostAppName)
HostAppMemoryUsageMetric = defaultMetricFactory.New(HostAppMemoryUsage).withPropertySchema(MetricPropertyHostAppName)
HostAppMemoryUsageWithPageCacheMetric = defaultMetricFactory.New(HostAppMemoryWithPageCacheUsage).withPropertySchema(MetricPropertyHostAppName)
)
24 changes: 16 additions & 8 deletions pkg/koordlet/metriccache/metric_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,6 @@ const (
// NodeBE
NodeMetricBE MetricKind = "node_be"

PriorityMetricCPUUsage MetricKind = "priority_cpu_usage"
PriorityMetricCPURealLimit MetricKind = "priority_cpu_real_limit"
PriorityMetricCPURequest MetricKind = "priority_cpu_request"

PodMetricCPUUsage MetricKind = "pod_cpu_usage"
PodMetricMemoryUsage MetricKind = "pod_memory_usage"
PodMemoryWithPageCacheUsage MetricKind = "pod_memory_usage_with_page_cache"
Expand All @@ -70,6 +66,10 @@ const (
PodMetricCPUThrottled MetricKind = "pod_cpu_throttled"
ContainerMetricCPUThrottled MetricKind = "container_cpu_throttled"

HostAppCPUUsage MetricKind = "host_application_cpu_usage"
HostAppMemoryUsage MetricKind = "host_application_memory_usage"
HostAppMemoryWithPageCacheUsage MetricKind = "host_application_memory_usage_with_page_cache"

// CPI
ContainerMetricCPI MetricKind = "container_cpi"

Expand All @@ -83,7 +83,9 @@ const (
NodeMemoryWithHotPageUsage MetricKind = "node_memory_with_hot_page_usage"
PodMemoryWithHotPageUsage MetricKind = "pod_memory_with_hot_page_usage"
ContainerMemoryWithHotPageUsage MetricKind = "container_memory_with_hot_page_usage"
HostAppMemoryWithHotPageUsage MetricKind = "host_application_memory_with_hot_page_usage"
NodeMemoryColdPageSize MetricKind = "node_memory_cold_page_size"
HostAppMemoryColdPageSize MetricKind = "host_application_memory_cold_page_size"
PodMemoryColdPageSize MetricKind = "pod_memory_cold_page_size"
ContainerMemoryColdPageSize MetricKind = "container_memory_cold_page_size"
)
Expand All @@ -106,6 +108,8 @@ const (

MetricPropertyBEResource MetricProperty = "be_resource"
MetricPropertyBEAllocation MetricProperty = "be_allocation"

MetricPropertyHostAppName MetricProperty = "host_app_name"
)

// MetricPropertyValue is the property value
Expand All @@ -124,10 +128,10 @@ const (
PSIDegreeFull MetricPropertyValue = "full"
PSIDegreeSome MetricPropertyValue = "some"

BEResourceCPU MetricPropertyValue = "cpu"
BEResouceAllocationUsage MetricPropertyValue = "usage"
BEResouceAllocationRealLimit MetricPropertyValue = "real-limit"
BEResouceAllocationRequest MetricPropertyValue = "request"
BEResourceCPU MetricPropertyValue = "cpu"
BEResourceAllocationUsage MetricPropertyValue = "usage"
BEResourceAllocationRealLimit MetricPropertyValue = "real-limit"
BEResourceAllocationRequest MetricPropertyValue = "request"
)

// MetricPropertiesFunc is a collection of functions generating metric property k-v, for metric sample generation and query
Expand All @@ -142,6 +146,7 @@ var MetricPropertiesFunc = struct {
PodGPU func(string, string, string) map[MetricProperty]string
ContainerGPU func(string, string, string) map[MetricProperty]string
NodeBE func(string, string) map[MetricProperty]string
HostApplication func(string) map[MetricProperty]string
}{
Pod: func(podUID string) map[MetricProperty]string {
return map[MetricProperty]string{MetricPropertyPodUID: podUID}
Expand Down Expand Up @@ -173,6 +178,9 @@ var MetricPropertiesFunc = struct {
NodeBE: func(beResource, beResourceAllocation string) map[MetricProperty]string {
return map[MetricProperty]string{MetricPropertyBEResource: beResource, MetricPropertyBEAllocation: beResourceAllocation}
},
HostApplication: func(appName string) map[MetricProperty]string {
return map[MetricProperty]string{MetricPropertyHostAppName: appName}
},
}

// point is the struct to describe metric
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,11 @@ func (b *beResourceCollector) collectBECPUResourceMetric() {

collectTime := time.Now()
beLimit, err01 := metriccache.NodeBEMetric.GenerateSample(
metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResouceAllocationRealLimit)), collectTime, float64(realMilliLimit))
metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResourceAllocationRealLimit)), collectTime, float64(realMilliLimit))
beRequest, err02 := metriccache.NodeBEMetric.GenerateSample(
metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResouceAllocationRequest)), collectTime, float64(beCPUMilliRequest))
metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResourceAllocationRequest)), collectTime, float64(beCPUMilliRequest))
beUsage, err03 := metriccache.NodeBEMetric.GenerateSample(
metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResouceAllocationUsage)), collectTime, float64(beCPUUsageMilliCores))
metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResourceAllocationUsage)), collectTime, float64(beCPUUsageMilliCores))

if err01 != nil || err02 != nil || err03 != nil {
klog.Errorf("failed to collect node BECPU, beLimitGenerateSampleErr: %v, beRequestGenerateSampleErr: %v, beUsageGenerateSampleErr: %v", err01, err02, err03)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ func Test_collectBECPUResourceMetric(t *testing.T) {
querier, err := collector.metricCache.Querier(oldStartTime, now)
assert.NoError(t, err)

beCPUUsageProperties := metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResouceAllocationUsage))
beCPURequestProperties := metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResouceAllocationRequest))
beCPURealLimitProperties := metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResouceAllocationRealLimit))
beCPUUsageProperties := metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResourceAllocationUsage))
beCPURequestProperties := metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResourceAllocationRequest))
beCPURealLimitProperties := metriccache.MetricPropertiesFunc.NodeBE(string(metriccache.BEResourceCPU), string(metriccache.BEResourceAllocationRealLimit))

beCPUUsageQueryMeta, err := metriccache.NodeBEMetric.BuildQueryMeta(beCPUUsageProperties)
assert.NoError(t, err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,18 @@ import (
"github.com/koordinator-sh/koordinator/pkg/util"
)

var (
// timeNow is the clock function the collectors in this file call to obtain
// the collect timestamp attached to generated metric samples.
// NOTE(review): presumably kept as a variable so tests can substitute a fixed clock — confirm.
timeNow = time.Now
)

// kidledcoldPageCollector gathers cold page size and memory-usage-with-hot-page
// metrics for the node, pods, containers and host applications, and appends the
// resulting samples to the metric store.
type kidledcoldPageCollector struct {
collectInterval time.Duration
started *atomic.Bool
// cgroupReader reads the cold page usage of a cgroup directory.
cgroupReader resourceexecutor.CgroupReader
// statesInformer supplies the NodeSLO that lists host applications.
statesInformer statesinformer.StatesInformer
// podFilter is the delegate behind FilterPod.
podFilter framework.PodFilter
// appendableDB provides the appender used to store collected samples.
appendableDB metriccache.Appendable
metricDB metriccache.MetricCache
metricDB metriccache.MetricCache // TODO remove redundant var
coldBoundary int
}

Expand Down Expand Up @@ -95,6 +99,12 @@ func (k *kidledcoldPageCollector) collectColdPageInfo() {
}
coldPageMetrics = append(coldPageMetrics, podsColdPageInfoMetric...)

hostAppsColdPageInfoMetric, err := k.collectHostAppsColdPageInfo()
if err != nil {
klog.Warningf("generate host application cold page info metrics failed, err %v", err)
}
coldPageMetrics = append(coldPageMetrics, hostAppsColdPageInfoMetric...)

appender := k.appendableDB.Appender()
if err := appender.Append(coldPageMetrics); err != nil {
klog.ErrorS(err, "Append node metrics error")
Expand All @@ -111,7 +121,7 @@ func (k *kidledcoldPageCollector) collectColdPageInfo() {

func (k *kidledcoldPageCollector) collectNodeColdPageInfo() ([]metriccache.MetricSample, error) {
coldPageMetrics := make([]metriccache.MetricSample, 0)
collectTime := time.Now()
collectTime := timeNow()
nodeColdPageBytes, err := k.cgroupReader.ReadMemoryColdPageUsage("")
if err != nil {
return nil, err
Expand All @@ -133,6 +143,7 @@ func (k *kidledcoldPageCollector) collectNodeColdPageInfo() ([]metriccache.Metri
return nil, err
}
coldPageMetrics = append(coldPageMetrics, memUsageWithHotPageMetrics)

klog.V(4).Infof("collectNodeResUsed finished, count %v, memUsageWithHotPage[%v], coldPageSize[%v]",
len(coldPageMetrics), memUsageWithHotPageValue, nodeColdPageBytes)
return coldPageMetrics, nil
Expand All @@ -150,7 +161,7 @@ func (k *kidledcoldPageCollector) collectPodsColdPageInfo() ([]metriccache.Metri
klog.V(5).Infof("skip collect pod %s, reason: %s", podKey, msg)
continue
}
collectTime := time.Now()
collectTime := timeNow()
podCgroupDir := meta.CgroupDir
podColdPageBytes, err := k.cgroupReader.ReadMemoryColdPageUsage(podCgroupDir)
if err != nil {
Expand All @@ -169,7 +180,7 @@ func (k *kidledcoldPageCollector) collectPodsColdPageInfo() ([]metriccache.Metri
}
coldMetrics = append(coldMetrics, podColdPageMetrics)

podMemUsageWithHotPageBytes, err := koordletutil.GetPodMemUsageWithHotPageCache(k.cgroupReader, podCgroupDir, podColdPageBytes)
podMemUsageWithHotPageBytes, err := koordletutil.GetCgroupMemUsageWithHotPageCache(k.cgroupReader, podCgroupDir, podColdPageBytes)
if err != nil {
klog.Warningf("failed to collect pod usage for Memory err: %s pod: %s/%s", err, pod.Namespace, pod.Name)
continue
Expand Down Expand Up @@ -199,7 +210,7 @@ func (k *kidledcoldPageCollector) collectContainersColdPageInfo(meta *statesinfo
for i := range pod.Status.ContainerStatuses {
containerStat := &pod.Status.ContainerStatuses[i]
containerKey := fmt.Sprintf("%s/%s/%s", pod.Namespace, pod.Name, containerStat.Name)
collectTime := time.Now()
collectTime := timeNow()
if len(containerStat.ContainerID) == 0 {
klog.Warningf("container %s id is empty, maybe not ready, skip this round", containerKey)
continue
Expand All @@ -222,7 +233,7 @@ func (k *kidledcoldPageCollector) collectContainersColdPageInfo(meta *statesinfo
}
coldMetrics = append(coldMetrics, containerColdPageMetrics)

containerMemUsageWithHotPageBytes, err := koordletutil.GetContainerMemUsageWithHotPageCache(k.cgroupReader, containerCgroupDir, containerColdPageBytes)
containerMemUsageWithHotPageBytes, err := koordletutil.GetCgroupMemUsageWithHotPageCache(k.cgroupReader, containerCgroupDir, containerColdPageBytes)
if err != nil {
return nil, err
}
Expand All @@ -240,6 +251,48 @@ func (k *kidledcoldPageCollector) collectContainersColdPageInfo(meta *statesinfo
return coldMetrics, nil
}

// collectHostAppsColdPageInfo generates cold page metrics for every host
// application declared in the NodeSLO spec.
//
// For each host application it reads the cold page usage of the application's
// cgroup and emits a HostAppMemoryColdPageSizeMetric sample, then asks
// koordletutil.GetCgroupMemUsageWithHotPageCache for the usage-with-hot-page
// value and emits a HostAppMemoryWithHotPageUsageMetric sample. Both samples of
// one application share a single collect timestamp. An application whose cgroup
// values cannot be read is logged and skipped.
//
// It returns the generated samples, or a nil slice and an error when the
// NodeSLO is unavailable or a sample cannot be generated.
func (k *kidledcoldPageCollector) collectHostAppsColdPageInfo() ([]metriccache.MetricSample, error) {
	nodeSLO := k.statesInformer.GetNodeSLO()
	if nodeSLO == nil {
		return nil, fmt.Errorf("get nil nodeslo during collect host application cold page info")
	}

	hostApps := nodeSLO.Spec.HostApplications
	// Each host application contributes up to two samples.
	coldMetrics := make([]metriccache.MetricSample, 0, 2*len(hostApps))
	// count tracks the applications for which both samples were produced.
	count := 0
	for i := range hostApps {
		// Index into the slice instead of taking the address of a range copy.
		hostApp := &hostApps[i]
		collectTime := timeNow()
		cgroupDir := koordletutil.GetHostAppCgroupRelativePath(hostApp)

		coldPageBytes, err := k.cgroupReader.ReadMemoryColdPageUsage(cgroupDir)
		if err != nil {
			klog.Warningf("can not get cold page info from memory.idle_page_stats file for host application %s, err: %v",
				hostApp.Name, err)
			continue
		}
		coldPageMetrics, err := metriccache.HostAppMemoryColdPageSizeMetric.GenerateSample(
			metriccache.MetricPropertiesFunc.HostApplication(hostApp.Name), collectTime, float64(coldPageBytes))
		if err != nil {
			return nil, err
		}
		coldMetrics = append(coldMetrics, coldPageMetrics)

		memUsageWithHotPageBytes, err := koordletutil.GetCgroupMemUsageWithHotPageCache(k.cgroupReader, cgroupDir, coldPageBytes)
		if err != nil {
			// The cold page sample appended above is kept; only the usage sample is missing.
			klog.Warningf("failed to collect host application %v usage for memory err: %s", hostApp.Name, err)
			continue
		}
		memUsageWithHotPageMetrics, err := metriccache.HostAppMemoryWithHotPageUsageMetric.GenerateSample(
			metriccache.MetricPropertiesFunc.HostApplication(hostApp.Name), collectTime, float64(memUsageWithHotPageBytes))
		if err != nil {
			return nil, err
		}
		coldMetrics = append(coldMetrics, memUsageWithHotPageMetrics)
		count++
	}

	// Report the number of declared applications, not the number of samples.
	klog.V(4).Infof("collectHostAppsColdPageInfo finished, host application num %d, collected %d", len(hostApps), count)
	return coldMetrics, nil
}

// FilterPod hands the pod meta to the collector's configured podFilter and
// returns that filter's boolean verdict and message unchanged.
func (k *kidledcoldPageCollector) FilterPod(meta *statesinformer.PodMeta) (bool, string) {
	verdict, reason := k.podFilter.FilterPod(meta)
	return verdict, reason
}
Loading

0 comments on commit f197bbc

Please sign in to comment.