Skip to content

Commit

Permalink
koordlet: report gpu device topology (#1775)
Browse files Browse the repository at this point in the history
Signed-off-by: Joseph <[email protected]>
  • Loading branch information
eahydra authored Dec 12, 2023
1 parent 688cdd3 commit c927e11
Show file tree
Hide file tree
Showing 8 changed files with 237 additions and 10 deletions.
29 changes: 25 additions & 4 deletions pkg/koordlet/metricsadvisor/devices/gpu/collector_gpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ type device struct {
Minor int32 // index starting from 0
DeviceUUID string
MemoryTotal uint64
NodeID int32
PCIE string
BusID string
Device nvml.Device
}

Expand Down Expand Up @@ -114,10 +117,21 @@ func (g *gpuDeviceManager) initGPUData() error {
if ret != nvml.SUCCESS {
return fmt.Errorf("unable to get device memory info: %v", nvml.ErrorString(ret))
}
pciInfo, ret := gpudevice.GetPciInfo()
if ret != nvml.SUCCESS {
return fmt.Errorf("unable to get pci info: %v", nvml.ErrorString(ret))
}
nodeID, pcie, busID, err := parseGPUPCIInfo(pciInfo.BusIdLegacy)
if err != nil {
return err
}
devices[deviceIndex] = &device{
DeviceUUID: uuid,
Minor: int32(minor),
MemoryTotal: memory.Total,
NodeID: nodeID,
PCIE: pcie,
BusID: busID,
Device: gpudevice,
}
}
Expand All @@ -134,7 +148,14 @@ func (g *gpuDeviceManager) deviceInfos() metriccache.Devices {
defer g.RUnlock()
gpuDevices := util.GPUDevices{}
for _, device := range g.devices {
gpuDevices = append(gpuDevices, util.GPUDeviceInfo{UUID: device.DeviceUUID, Minor: device.Minor, MemoryTotal: device.MemoryTotal})
gpuDevices = append(gpuDevices, util.GPUDeviceInfo{
UUID: device.DeviceUUID,
Minor: device.Minor,
MemoryTotal: device.MemoryTotal,
NodeID: device.NodeID,
PCIE: device.PCIE,
BusID: device.BusID,
})
}

return gpuDevices
Expand Down Expand Up @@ -179,7 +200,7 @@ func (g *gpuDeviceManager) getNodeGPUUsage() []metriccache.MetricSample {
return gpuMetrics
}

func (g *gpuDeviceManager) getPodOrContinerTotalGPUUsageOfPIDs(id string, isPodID bool, pids []uint32) []metriccache.MetricSample {
func (g *gpuDeviceManager) getPodOrContainerTotalGPUUsageOfPIDs(id string, isPodID bool, pids []uint32) []metriccache.MetricSample {
if id == "" {
klog.Warning("id is empty")
return nil
Expand Down Expand Up @@ -261,7 +282,7 @@ func (g *gpuDeviceManager) getPodGPUUsage(uid, podParentDir string, cs []corev1.
if err != nil {
return nil, fmt.Errorf("failed to get pid, error: %v", err)
}
return g.getPodOrContinerTotalGPUUsageOfPIDs(uid, true, pids), nil
return g.getPodOrContainerTotalGPUUsageOfPIDs(uid, true, pids), nil
}

func (g *gpuDeviceManager) getContainerGPUUsage(containerID, podParentDir string, c *corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
Expand All @@ -273,7 +294,7 @@ func (g *gpuDeviceManager) getContainerGPUUsage(containerID, podParentDir string
if err != nil {
return nil, fmt.Errorf("failed to get pid, error: %v", err)
}
return g.getPodOrContinerTotalGPUUsageOfPIDs(containerID, false, currentPIDs), nil
return g.getPodOrContainerTotalGPUUsageOfPIDs(containerID, false, currentPIDs), nil
}

func (g *gpuDeviceManager) collectGPUUsage() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ func Test_gpuUsageDetailRecord_getPodOrContinerTotalGPUUsageOfPIDs(t *testing.T)
devices: tt.fields.devices,
processesMetrics: tt.fields.processesMetrics,
}
got := g.getPodOrContinerTotalGPUUsageOfPIDs(tt.args.id, tt.args.isPodID, tt.args.pids)
got := g.getPodOrContainerTotalGPUUsageOfPIDs(tt.args.id, tt.args.isPodID, tt.args.pids)
assert.Equal(t, tt.want, got)
})
}
Expand Down
83 changes: 83 additions & 0 deletions pkg/koordlet/metricsadvisor/devices/gpu/helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gpu

import (
"bytes"
"fmt"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"

"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
)

var (
pcieRegexp = regexp.MustCompile(`pci\d{4}:[0-9a-fA-F]{2}`)
)

func parseGPUPCIInfo(busIdLegacy [16]int8) (int32, string, string, error) {
busIDBuilder := &strings.Builder{}
for _, v := range busIdLegacy {
if v != 0 {
busIDBuilder.WriteByte(byte(v))
}
}
busID := strings.ToLower(busIDBuilder.String())
nodeID, err := getNUMANodeID(busID)
if err != nil {
return 0, "", "", fmt.Errorf("failed to parse NUMA Node ID, err: %w", err)
}
pcie, err := getPCIERootComplexID(busID)
if err != nil {
return 0, "", "", fmt.Errorf("failed to parse PCIE ID, err: %w", err)
}
return nodeID, pcie, busID, nil
}

func getPCIERootComplexID(bdf string) (string, error) {
path, err := filepath.EvalSymlinks(filepath.Join(system.GetPCIDeviceDir(), bdf))
if err != nil {
return "", err
}
return parsePCIEID(path), err
}

func parsePCIEID(path string) string {
result := pcieRegexp.FindAllStringSubmatch(path, -1)
if len(result) == 0 || len(result[0]) == 0 {
return ""
}
return result[0][0]
}

func getNUMANodeID(bdf string) (int32, error) {
data, err := os.ReadFile(filepath.Join(system.GetPCIDeviceDir(), bdf, "numa_node"))
if err != nil {
return -1, err
}
nodeID, err := strconv.Atoi(string(bytes.TrimSpace(data)))
if err != nil {
return 0, err
}
if nodeID == -1 {
nodeID = 0
}
return int32(nodeID), nil
}
87 changes: 87 additions & 0 deletions pkg/koordlet/metricsadvisor/devices/gpu/helper_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gpu

import (
"fmt"
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"

"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
)

func Test_parseGPUPCIInfo(t *testing.T) {
tests := []struct {
name string
busID string
nodeID int32
pcie string
wantNode int32
wantPCIE string
wantBusID string
wantErr bool
}{
{
name: "numa node -1",
busID: "0000:00:07.0",
nodeID: -1,
pcie: "pci0000:00",
wantNode: 0,
wantPCIE: "pci0000:00",
wantBusID: "0000:00:07.0",
},
{
name: "numa node 1",
busID: "0000:00:07.0",
nodeID: 1,
pcie: "pci0000:00",
wantNode: 1,
wantPCIE: "pci0000:00",
wantBusID: "0000:00:07.0",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
helper := system.NewFileTestUtil(t)
defer helper.Cleanup()

pciDeviceDir := system.GetPCIDeviceDir()
gpuDeviceDir := filepath.Join(pciDeviceDir, tt.pcie, tt.busID)
assert.NoError(t, os.MkdirAll(gpuDeviceDir, 0700))
assert.NoError(t, os.WriteFile(filepath.Join(gpuDeviceDir, "numa_node"), []byte(fmt.Sprintf("%d\n", tt.nodeID)), 0700))

symbolicLink := filepath.Join(pciDeviceDir, tt.busID)
assert.NoError(t, os.Symlink(gpuDeviceDir, symbolicLink))

var busIdLegacy [16]int8
for i, v := range tt.busID {
busIdLegacy[i] = int8(v)
}
nodeID, pcie, busID, err := parseGPUPCIInfo(busIdLegacy)
if (err != nil) && !tt.wantErr {
t.Errorf("expect wantErr=%v but got err=%v", tt.wantErr, err)
return
}
assert.Equal(t, tt.wantNode, nodeID)
assert.Equal(t, tt.wantPCIE, pcie)
assert.Equal(t, tt.wantBusID, busID)
})
}
}
12 changes: 12 additions & 0 deletions pkg/koordlet/statesinformer/impl/states_device_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,17 @@ func (s *statesInformer) buildGPUDevice() []schedulingv1alpha1.DeviceInfo {
health = false
}
s.gpuMutex.RUnlock()

var topology *schedulingv1alpha1.DeviceTopology
if gpu.NodeID >= 0 && gpu.PCIE != "" && gpu.BusID != "" {
topology = &schedulingv1alpha1.DeviceTopology{
SocketID: -1,
NodeID: gpu.NodeID,
PCIEID: gpu.PCIE,
BusID: gpu.BusID,
}
}

deviceInfos = append(deviceInfos, schedulingv1alpha1.DeviceInfo{
UUID: gpu.UUID,
Minor: &gpu.Minor,
Expand All @@ -172,6 +183,7 @@ func (s *statesInformer) buildGPUDevice() []schedulingv1alpha1.DeviceInfo {
extension.ResourceGPUMemory: *resource.NewQuantity(int64(gpu.MemoryTotal), resource.BinarySI),
extension.ResourceGPUMemoryRatio: *resource.NewQuantity(100, resource.DecimalSI),
},
Topology: topology,
})
}
return deviceInfos
Expand Down
26 changes: 22 additions & 4 deletions pkg/koordlet/statesinformer/impl/states_device_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ func Test_reportGPUDevice(t *testing.T) {
gpuDeviceInfo = []koordletutil.GPUDeviceInfo{
{UUID: "1", Minor: 1, MemoryTotal: 8000},
{UUID: "2", Minor: 2, MemoryTotal: 10000},
{UUID: "3", Minor: 3, MemoryTotal: 8000, BusID: "0000:00:08.0", NodeID: 0, PCIE: "pci0000:00"},
}
mockMetricCache.EXPECT().Get(koordletutil.GPUDeviceType).Return(gpuDeviceInfo, true)
r := &statesInformer{
Expand Down Expand Up @@ -88,23 +89,40 @@ func Test_reportGPUDevice(t *testing.T) {
extension.ResourceGPUMemoryRatio: *resource.NewQuantity(100, resource.DecimalSI),
},
},
{
UUID: "3",
Minor: pointer.Int32(3),
Type: schedulingv1alpha1.GPU,
Health: true,
Resources: map[corev1.ResourceName]resource.Quantity{
extension.ResourceGPUCore: *resource.NewQuantity(100, resource.DecimalSI),
extension.ResourceGPUMemory: *resource.NewQuantity(8000, resource.BinarySI),
extension.ResourceGPUMemoryRatio: *resource.NewQuantity(100, resource.DecimalSI),
},
Topology: &schedulingv1alpha1.DeviceTopology{
SocketID: -1,
NodeID: 0,
PCIEID: "pci0000:00",
BusID: "0000:00:08.0",
},
},
}
device, err := fakeClient.Get(context.TODO(), "test", metav1.GetOptions{})
assert.Equal(t, nil, err)
assert.Equal(t, device.Spec.Devices, expectedDevices)

gpuDeviceInfo = append(gpuDeviceInfo, koordletutil.GPUDeviceInfo{
UUID: "3",
Minor: 3,
UUID: "4",
Minor: 4,
MemoryTotal: 10000,
})

mockMetricCache.EXPECT().Get(koordletutil.GPUDeviceType).Return(gpuDeviceInfo, true)
r.reportDevice()

expectedDevices = append(expectedDevices, schedulingv1alpha1.DeviceInfo{
UUID: "3",
Minor: pointer.Int32(3),
UUID: "4",
Minor: pointer.Int32(4),
Type: schedulingv1alpha1.GPU,
Health: true,
Resources: map[corev1.ResourceName]resource.Quantity{
Expand Down
3 changes: 3 additions & 0 deletions pkg/koordlet/util/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,7 @@ type GPUDeviceInfo struct {
// Minor represents the Minor number of Devices, starting from 0
Minor int32 `json:"minor,omitempty"`
MemoryTotal uint64 `json:"memory-total,omitempty"`
NodeID int32 `json:"nodeID"`
PCIE string `json:"pcie,omitempty"`
BusID string `json:"busID,omitempty"`
}
5 changes: 4 additions & 1 deletion pkg/koordlet/util/system/system_file.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ const (

KernelSchedGroupIdentityEnable = "kernel/sched_group_identity_enabled"

SysNUMASubDir = "bus/node/devices"
SysNUMASubDir = "bus/node/devices"
SysPCIDeviceDir = "bus/pci/devices"

SysCPUSMTActiveSubPath = "devices/system/cpu/smt/active"
SysIntelPStateNoTurboSubPath = "devices/system/cpu/intel_pstate/no_turbo"
Expand Down Expand Up @@ -118,6 +119,8 @@ func GetProcSysFilePath(file string) string {
return filepath.Join(Conf.ProcRootDir, SysctlSubDir, file)
}

func GetPCIDeviceDir() string { return filepath.Join(Conf.SysRootDir, SysPCIDeviceDir) }

var _ utilsysctl.Interface = &ProcSysctl{}

// ProcSysctl implements Interface by reading and writing files under /proc/sys
Expand Down

0 comments on commit c927e11

Please sign in to comment.