Skip to content

Commit

Permalink
Add support for setting libcuda verbosity
Browse files Browse the repository at this point in the history
Resolves: Project-HAMi#544
Signed-off-by: Tigran Grigoryan <[email protected]>
  • Loading branch information
4gt-104 committed Oct 19, 2024
1 parent 28f20a6 commit c9b52c9
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 15 deletions.
1 change: 1 addition & 0 deletions charts/hami/templates/scheduler/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ spec:
- --resource-cores={{ .Values.resourceCores }}
- --resource-mem-percentage={{ .Values.resourceMemPercentage }}
- --resource-priority={{ .Values.resourcePriority }}
- --libcuda-log-verbosity-level={{ .Values.libcudaLogVerbosity }}
- --http_bind=0.0.0.0:443
- --cert_file=/tls/tls.crt
- --key_file=/tls/tls.key
Expand Down
2 changes: 2 additions & 0 deletions charts/hami/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ resourceMem: "nvidia.com/gpumem"
resourceMemPercentage: "nvidia.com/gpumem-percentage"
resourceCores: "nvidia.com/gpucores"
resourcePriority: "nvidia.com/priority"
## Sets the LIBCUDA_LOG_LEVEL environment variable in containers to control libcuda log verbosity
libcudaLogVerbosity: "2"

#MLU Parameters
mluResourceName: "cambricon.com/vmlu"
Expand Down
44 changes: 29 additions & 15 deletions pkg/device/nvidia/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,14 @@ const (
)

var (
ResourceName string
ResourceMem string
ResourceCores string
ResourceMemPercentage string
ResourcePriority string
DebugMode bool
OverwriteEnv bool
ResourceName string
ResourceMem string
ResourceCores string
ResourceMemPercentage string
ResourcePriority string
LIBCUDALogVerbosityLevel string
DebugMode bool
OverwriteEnv bool
)

type NvidiaGPUDevices struct {
Expand All @@ -78,6 +79,7 @@ func ParseConfig(fs *flag.FlagSet) {
fs.StringVar(&ResourceMemPercentage, "resource-mem-percentage", "nvidia.com/gpumem-percentage", "gpu memory fraction to allocate")
fs.StringVar(&ResourceCores, "resource-cores", "nvidia.com/gpucores", "cores percentage to use")
fs.StringVar(&ResourcePriority, "resource-priority", "vgputaskpriority", "vgpu task priority 0 for high and 1 for low")
fs.StringVar(&LIBCUDALogVerbosityLevel, "libcuda-log-verbosity-level", "2", "verbosity level of LIBCUDA")
fs.BoolVar(&OverwriteEnv, "overwrite-env", false, "If set NVIDIA_VISIBLE_DEVICES=none to pods with no-gpu allocation")
}

Expand Down Expand Up @@ -136,18 +138,32 @@ func (dev *NvidiaGPUDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, e
return nodedevices, nil
}

// setOrUpdateEnvVar ensures that ctr carries an environment variable called
// name whose literal value is value.
//
// Every existing entry with that name is overwritten in place, so its position
// in ctr.Env is preserved; if no entry matches, a new one is appended. The
// container is modified through the pointer and nothing is returned.
func setOrUpdateEnvVar(ctr *corev1.Container, name string, value string) {
	found := false
	for i := range ctr.Env {
		if ctr.Env[i].Name != name {
			continue
		}
		// Replace the whole entry instead of assigning only Value: an entry
		// that was defined through ValueFrom would otherwise end up with both
		// Value and ValueFrom set, which Kubernetes rejects as invalid.
		ctr.Env[i] = corev1.EnvVar{Name: name, Value: value}
		// Keep scanning so that a later duplicate of the same name cannot
		// retain a stale value.
		found = true
	}
	if found {
		return
	}
	ctr.Env = append(ctr.Env, corev1.EnvVar{Name: name, Value: value})
}

func (dev *NvidiaGPUDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
/*gpu related */
priority, ok := ctr.Resources.Limits[corev1.ResourceName(ResourcePriority)]
if ok {
ctr.Env = append(ctr.Env, corev1.EnvVar{
Name: api.TaskPriority,
Value: fmt.Sprint(priority.Value()),
})
setOrUpdateEnvVar(ctr, api.TaskPriority, fmt.Sprint(priority.Value()))
}

_, resourceNameOK := ctr.Resources.Limits[corev1.ResourceName(ResourceName)]
if resourceNameOK {
setOrUpdateEnvVar(ctr, "LIBCUDA_LOG_LEVEL", LIBCUDALogVerbosityLevel)
return resourceNameOK, nil
}

Expand All @@ -159,14 +175,12 @@ func (dev *NvidiaGPUDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Po
if config.DefaultResourceNum > 0 {
ctr.Resources.Limits[corev1.ResourceName(ResourceName)] = *resource.NewQuantity(int64(config.DefaultResourceNum), resource.BinarySI)
resourceNameOK = true
setOrUpdateEnvVar(ctr, "LIBCUDA_LOG_LEVEL", LIBCUDALogVerbosityLevel)
}
}

if !resourceNameOK && OverwriteEnv {
ctr.Env = append(ctr.Env, corev1.EnvVar{
Name: "NVIDIA_VISIBLE_DEVICES",
Value: "none",
})
setOrUpdateEnvVar(ctr, "NVIDIA_VISIBLE_DEVICES", "none")
}
return resourceNameOK, nil
}
Expand Down

0 comments on commit c9b52c9

Please sign in to comment.