Skip to content

Commit

Permalink
Add GPU list to /about/usage/system
Browse files Browse the repository at this point in the history
  • Loading branch information
philogicae committed Nov 27, 2024
1 parent 6836d6d commit b0bcf57
Showing 1 changed file with 68 additions and 1 deletion.
69 changes: 68 additions & 1 deletion src/aleph/vm/orchestrator/resources.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import math
import subprocess
from datetime import datetime, timezone
from enum import Enum
from functools import lru_cache
from typing import List, Literal, Optional

import cpuinfo
import psutil
from aiohttp import web
from aleph_message.models import ItemHash
from aleph_message.models.abstract import HashableModel
from aleph_message.models.execution.environment import CpuProperties
from pydantic import BaseModel, Field
from pydantic import BaseModel, Extra, Field

from aleph.vm.conf import settings
from aleph.vm.sevclient import SevClient
Expand Down Expand Up @@ -69,8 +73,36 @@ class UsagePeriod(BaseModel):
duration_seconds: float


class GpuProperties(HashableModel):
"""GPU properties."""

vendor: str = Field(description="GPU vendor name")
device_name: str = Field(description="GPU vendor card name")
device_class: Literal["0300", "0302"] = Field(
description="GPU device class. Look at https://admin.pci-ids.ucw.cz/read/PD/03"
)
device_id: str = Field(description="GPU vendor & device ids")

class Config:
extra = Extra.forbid


class GpuDeviceClass(str, Enum):
VGA_COMPATIBLE_CONTROLLER = "0300"
_3D_CONTROLLER = "0302"


def is_gpu_device_class(device_class: str) -> bool:
try:
GpuDeviceClass(device_class)
return True
except ValueError:
return False


class MachineProperties(BaseModel):
cpu: CpuProperties
gpu: Optional[List[GpuProperties]]


class MachineUsage(BaseModel):
Expand All @@ -82,6 +114,39 @@ class MachineUsage(BaseModel):
active: bool = True


def parse_gpu_device_info(line) -> Optional[GpuProperties]:
"""Parse GPU device info from a line of lspci output."""

device = line.split(' "', maxsplit=1)[1]
device_class, device_vendor, device_info = device.split('" "', maxsplit=2)
device_class = device_class.split("[", maxsplit=1)[1][:-1]
vendor, vendor_id = device_vendor.split(" [", maxsplit=1)
device_name = device_info.split('"', maxsplit=1)[0]
device_name, model_id = device_name.split(" [", maxsplit=1)
device_id = f"{vendor_id[:-1]}:{model_id[:-1]}"

return (
GpuProperties(
vendor=vendor,
device_name=device_name,
device_class=device_class,
device_id=device_id,
)
if is_gpu_device_class(device_class)
else None
)


def get_gpu_info() -> Optional[List[GpuProperties]]:
"""Get GPU info using lspci command."""

result = subprocess.run(["lspci", "-mmnnn"], capture_output=True, text=True, check=True)
gpu_devices = list(
{device for line in result.stdout.split("\n") if line and (device := parse_gpu_device_info(line)) is not None}
)
return gpu_devices if gpu_devices else None


@lru_cache
def get_machine_properties() -> MachineProperties:
"""Fetch machine properties such as architecture, CPU vendor, ...
Expand All @@ -90,6 +155,7 @@ def get_machine_properties() -> MachineProperties:
In the future, some properties may have to be fetched from within a VM.
"""
cpu_info = cpuinfo.get_cpu_info() # Slow
gpu_info = get_gpu_info()
return MachineProperties(
cpu=CpuProperties(
architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
Expand All @@ -105,6 +171,7 @@ def get_machine_properties() -> MachineProperties:
)
),
),
gpu=gpu_info,
)


Expand Down

0 comments on commit b0bcf57

Please sign in to comment.