From 6d4f201afe9aa3fbd0b17791e7c628ba072975b8 Mon Sep 17 00:00:00 2001 From: Fengping Hu Date: Mon, 29 Jan 2024 16:38:57 -0600 Subject: [PATCH] fix gpu request counts and use generic label --- binderhub/resources.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/binderhub/resources.py b/binderhub/resources.py index a552ecae8..f90585046 100644 --- a/binderhub/resources.py +++ b/binderhub/resources.py @@ -67,7 +67,7 @@ def get_gpu_availability(self, product=None, memory=None): elif memory: nodes = api.list_node(label_selector='gpu=true,nvidia.com/gpu.memory=%s' %memory) else: - nodes = api.list_node(label_selector='gpu=true') + nodes = api.list_node(label_selector='nvidia.com/gpu.product') for node in nodes.items: product = node.metadata.labels['nvidia.com/gpu.product'] memory = int(node.metadata.labels['nvidia.com/gpu.memory']) @@ -80,9 +80,10 @@ def get_gpu_availability(self, product=None, memory=None): gpu['total_requests'] = 0 pods = api.list_pod_for_all_namespaces(field_selector='spec.nodeName=%s' %node.metadata.name).items for pod in pods: - requests = pod.spec.containers[0].resources.requests - if requests: - gpu['total_requests'] += int(requests.get('nvidia.com/gpu', 0)) + for container in pod.spec.containers: + requests = container.resources.requests + if requests: + gpu['total_requests'] += int(requests.get('nvidia.com/gpu', 0)) gpu['available'] = max(gpu['count'] - gpu['total_requests'], 0) return sorted(gpus.values(), key=lambda gpu : gpu['memory'])