Skip to content

Commit

Permalink
Fix: Improve code organization and structure.
Browse files Browse the repository at this point in the history
  • Loading branch information
nesitor committed Dec 3, 2024
1 parent 523bc44 commit 2cf7e70
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 22 deletions.
33 changes: 23 additions & 10 deletions src/aleph/vm/orchestrator/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from aleph.vm.conf import settings
from aleph.vm.pool import VmPool
from aleph.vm.resources import GpuProperties
from aleph.vm.resources import GpuDevice
from aleph.vm.sevclient import SevClient
from aleph.vm.utils import (
check_amd_sev_es_supported,
Expand Down Expand Up @@ -74,8 +74,11 @@ class UsagePeriod(BaseModel):

# Host hardware description; MachineUsage carries one instance in its `properties` field.
class MachineProperties(BaseModel):
# CPU description, built by get_machine_properties() from the cpuinfo data.
cpu: CpuProperties
# NOTE(review): the two GPU fields below look like the pre-commit (removed) side of this
# diff view; GPU data appears to be reported through GpuProperties instead — confirm.
gpu: Optional[List[GpuProperties]]
available_gpus: Optional[List[GpuProperties]]


# GPU inventory model; get_machine_gpus() builds one from the VM pool.
class GpuProperties(BaseModel):
# Filled from pool.gpus.
devices: Optional[List[GpuDevice]]
# Filled from pool.get_available_gpus().
available_devices: Optional[List[GpuDevice]]


class MachineUsage(BaseModel):
Expand All @@ -84,20 +87,30 @@ class MachineUsage(BaseModel):
disk: DiskUsage
period: UsagePeriod
properties: MachineProperties
gpu: GpuProperties
active: bool = True


def get_machine_gpus(request: web.Request) -> GpuProperties:
    """Build the GPU section of the usage report from the application's VM pool.

    Args:
        request: Incoming request; its application must hold the pool under
            the ``"vm_pool"`` key.

    Returns:
        A ``GpuProperties`` whose ``devices`` is the pool's ``gpus`` attribute and
        whose ``available_devices`` is the result of ``get_available_gpus()``.
    """
    vm_pool: VmPool = request.app["vm_pool"]
    # Keyword arguments are evaluated left to right: ``gpus`` is read first,
    # then ``get_available_gpus()`` is called.
    return GpuProperties(
        devices=vm_pool.gpus,
        available_devices=vm_pool.get_available_gpus(),
    )


@lru_cache
def get_machine_properties(request: web.Request) -> MachineProperties:
def get_machine_properties() -> MachineProperties:
"""Fetch machine properties such as architecture, CPU vendor, ...
These should not change while the supervisor is running.
In the future, some properties may have to be fetched from within a VM.
"""
cpu_info = cpuinfo.get_cpu_info() # Slow
pool: VmPool = request.app["vm_pool"]
gpus = pool.gpus
available_gpus = pool.get_available_gpus()

return MachineProperties(
cpu=CpuProperties(
architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
Expand All @@ -113,15 +126,14 @@ def get_machine_properties(request: web.Request) -> MachineProperties:
)
),
),
gpu=gpus,
available_gpus=available_gpus,
)


@cors_allow_all
async def about_system_usage(request: web.Request):
"""Public endpoint to expose information about the system usage."""
period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0)
machine_properties = get_machine_properties()

usage: MachineUsage = MachineUsage(
cpu=CpuUsage(
Expand All @@ -141,7 +153,8 @@ async def about_system_usage(request: web.Request):
start_timestamp=period_start,
duration_seconds=60,
),
properties=get_machine_properties(request),
properties=machine_properties,
gpu=get_machine_gpus(request)
)

return web.json_response(text=usage.json(exclude_none=True))
Expand Down
11 changes: 6 additions & 5 deletions src/aleph/vm/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager
from aleph.vm.network.hostnetwork import Network, make_ipv6_allocator
from aleph.vm.orchestrator.metrics import get_execution_records
from aleph.vm.resources import GpuProperties, get_gpu_info
from aleph.vm.resources import GpuDevice, get_gpu_devices
from aleph.vm.systemd import SystemDManager
from aleph.vm.utils import get_message_executable_content
from aleph.vm.vm_type import VmType
Expand All @@ -43,7 +43,7 @@ class VmPool:
snapshot_manager: SnapshotManager | None = None
systemd_manager: SystemDManager
creation_lock: asyncio.Lock
gpus: List[GpuProperties] = []
gpus: List[GpuDevice] = []

def __init__(self, loop: asyncio.AbstractEventLoop):
self.executions = {}
Expand Down Expand Up @@ -83,7 +83,7 @@ def setup(self) -> None:

if settings.ENABLE_GPU_SUPPORT:
logger.debug("Detecting GPU devices ...")
self.available_gpus = get_gpu_info()
self.gpus = get_gpu_devices()

def teardown(self) -> None:
"""Stop the VM pool and the network properly."""
Expand Down Expand Up @@ -288,8 +288,9 @@ def get_instance_executions(self) -> Iterable[VmExecution]:
)
return executions or []

def get_available_gpus(self) -> Iterable[GpuProperties]:
available_gpus = self.available_gpus
def get_available_gpus(self) -> Iterable[GpuDevice]:
    """Return the GPUs considered available.

    Currently this is simply ``self.gpus``; an empty list is returned when that
    attribute is empty or ``None``.
    """
    # TODO: Exclude GPUs already used by current executions from the result.
    return self.gpus or []

def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]:
Expand Down
15 changes: 8 additions & 7 deletions src/aleph/vm/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
from enum import Enum
from typing import List, Optional

from pydantic import BaseModel, Extra, Field
from aleph_message.models import HashableModel
from pydantic import Extra, Field


# Device class identifiers for GPU entries; parse_gpu_device_info() converts the
# class string it extracts with GpuDeviceClass(...).
# NOTE(review): the values look like PCI class codes (0300 VGA, 0302 3D) — confirm.
class GpuDeviceClass(str, Enum):
VGA_COMPATIBLE_CONTROLLER = "0300"
# Presumably prefixed with an underscore because an identifier cannot start with a digit.
_3D_CONTROLLER = "0302"


class GpuProperties(BaseModel):
class GpuDevice(HashableModel):
"""GPU properties."""

vendor: str = Field(description="GPU vendor name")
Expand Down Expand Up @@ -56,7 +57,7 @@ def is_kernel_enabled_gpu(pci_host: str) -> bool:
return False


def parse_gpu_device_info(line: str) -> Optional[GpuProperties]:
def parse_gpu_device_info(line: str) -> Optional[GpuDevice]:
"""Parse GPU device info from a line of lspci output."""

pci_host, device = line.split(' "', maxsplit=1)
Expand All @@ -72,15 +73,15 @@ def parse_gpu_device_info(line: str) -> Optional[GpuProperties]:

device_class = GpuDeviceClass(device_class)

vendor, vendor_id = device_vendor.split(" [", maxsplit=1)
vendor, vendor_id = device_vendor.rsplit(" [", maxsplit=1)
vendor_id = vendor_id[:-1]
vendor_name = get_vendor_name(vendor_id)
device_name = device_info.split('"', maxsplit=1)[0]
device_name, model_id = device_name.split(" [", maxsplit=1)
device_name, model_id = device_name.rsplit(" [", maxsplit=1)
model_id = model_id[:-1]
device_id = f"{vendor_id}:{model_id}"

return GpuProperties(
return GpuDevice(
pci_host=pci_host,
vendor=vendor_name,
device_name=device_name,
Expand All @@ -89,7 +90,7 @@ def parse_gpu_device_info(line: str) -> Optional[GpuProperties]:
)


def get_gpu_info() -> Optional[List[GpuProperties]]:
def get_gpu_devices() -> Optional[List[GpuDevice]]:
"""Get GPU info using lspci command."""

result = subprocess.run(["lspci", "-mmnnn"], capture_output=True, text=True, check=True)
Expand Down

0 comments on commit 2cf7e70

Please sign in to comment.