diff --git a/deploy/helm-charts/Chart.lock b/deploy/helm-charts/Chart.lock index e2a37ac..d73d14a 100644 --- a/deploy/helm-charts/Chart.lock +++ b/deploy/helm-charts/Chart.lock @@ -1,6 +1,9 @@ dependencies: - name: argo-workflows repository: https://argoproj.github.io/argo-helm - version: 0.33.1 -digest: sha256:bc9fd492011835b2ebb1d418f860eda28691ab8805d3215be2f488fc00cfe236 -generated: "2023-09-07T10:32:16.447050486-07:00" + version: 0.33.3 +- name: nvidia-device-plugin + repository: https://nvidia.github.io/k8s-device-plugin + version: 0.14.1 +digest: sha256:d8e2875bf6b1affdb6bacda1b011a731bb4163165d6fa27b767a76a327597751 +generated: "2023-09-22T18:47:17.529533776-07:00" diff --git a/deploy/helm-charts/Chart.yaml b/deploy/helm-charts/Chart.yaml index bec470c..d9f2a8a 100644 --- a/deploy/helm-charts/Chart.yaml +++ b/deploy/helm-charts/Chart.yaml @@ -33,3 +33,6 @@ dependencies: - name: argo-workflows version: 0.33.3 repository: https://argoproj.github.io/argo-helm + - name: nvidia-device-plugin + version: 0.14.1 + repository: https://nvidia.github.io/k8s-device-plugin diff --git a/deploy/helm-charts/docs/01.installation.md b/deploy/helm-charts/docs/01.installation.md index 7ac35bc..7574afe 100644 --- a/deploy/helm-charts/docs/01.installation.md +++ b/deploy/helm-charts/docs/01.installation.md @@ -14,7 +14,7 @@ sudo apt-mark hold kubelet kubeadm kubectl curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash # Install Argo CLI -curl -sLO https://github.com/argoproj/argo-workflows/releases/download/v3.4.10/argo-linux-amd64.gz +curl -sLO https://github.com/argoproj/argo-workflows/releases/download/v3.4.11/argo-linux-amd64.gz gunzip argo-linux-amd64.gz chmod +x argo-linux-amd64 sudo mv ./argo-linux-amd64 /usr/local/bin/argo @@ -26,9 +26,11 @@ Select one of the following Kubernetes distribution: ### [k3s](https://k3s.io/) -See [Requirements](https://docs.k3s.io/installation/requirements) for K3s for hardware requirements. 
+See [Requirements](https://docs.k3s.io/installation/requirements) for the K3s hardware requirements, and [NVIDIA Container Runtime Support](https://docs.k3s.io/advanced#nvidia-container-runtime-support) for the steps to enable the NVIDIA container runtime. ```bash +sudo apt install -y nvidia-container-runtime cuda-drivers-fabricmanager-515 nvidia-headless-515-server + curl -sfL https://get.k3s.io | sh -s - --flannel-backend host-gw --service-node-port-range 104-32767 --flannel-external-ip # Copy default configuration @@ -42,6 +44,11 @@ sudo chown $(id -u):$(id -g) $HOME/.kube/config For detail installation instructions with GPU support, see [cloud-native-stack](https://github.com/NVIDIA/cloud-native-stack/tree/master/install-guides). ```bash +# Disable swap +sudo nano /etc/fstab +# Add a # before all the lines that start with /swap and save the file. + + sudo kubeadm init --pod-network-cidr=192.168.0.0/16 # Copy default configuration mkdir -p $HOME/.kube @@ -78,6 +85,11 @@ my-system Ready control-plane 73s v1.28.1 If modifying the port range is not an option, update the port numbers inside `values.yaml` to be in the range. +## Install NVIDIA Container Toolkit + +Follow the [instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) to install and configure NVIDIA Container Toolkit with your container runtime. + + ## Build & download MONAI Deploy dependencies ```bash @@ -105,8 +117,10 @@ Use the following commands to install MONAI Deploy Helm charts and its dependenc - Postgres - archives Argo jobs (can be disabled in `values.yaml` > `argo-workflows` > `controller` > `persistence` > `archive=false`) ```bash -helm upgrade -i monai-deploy . # default/current namespace -helm upgrade -i monai-deploy -n my-space . # install in namespace "my-namespace" +# default/current namespace +helm upgrade -i monai-deploy . +# install in namespace "my-space" +helm upgrade -i monai-deploy -n my-space . 
``` > **Note** diff --git a/deploy/helm-charts/docs/04.Uninstallation.md b/deploy/helm-charts/docs/04.Uninstallation.md index fcadd17..75bd933 100644 --- a/deploy/helm-charts/docs/04.Uninstallation.md +++ b/deploy/helm-charts/docs/04.Uninstallation.md @@ -29,6 +29,6 @@ sudo rm -rf ~/.kube ## Uninstall Tools ```bash -sudo apt-get purge -y kubeadm kubectl kubelet kubernetes-cni kube* helm +sudo apt-get purge -y kubeadm kubectl kubelet kubernetes-cni kube* sudo apt-get autoremove -y ``` diff --git a/deploy/helm-charts/files/sample-workflows/liver-seg-template.yml b/deploy/helm-charts/files/sample-workflows/liver-seg-template.yml index 1f10c79..c078bac 100644 --- a/deploy/helm-charts/files/sample-workflows/liver-seg-template.yml +++ b/deploy/helm-charts/files/sample-workflows/liver-seg-template.yml @@ -59,7 +59,7 @@ spec: - /bin/sh args: - '-c' - - date -Ins && python3 -u /opt/monai/app/app.py && date -Ins + - date -Ins && time python3 -u /opt/monai/app/app.py && date -Ins env: - name: "MONAI_INPUTPATH" value: "/var/monai/input/" @@ -69,3 +69,6 @@ spec: value: "/opt/monai/models/" - name: "MONAI_WORKDIR" value: "/var/monai/" + resources: + limits: + nvidia.com/gpu: 1 diff --git a/deploy/helm-charts/values.yaml b/deploy/helm-charts/values.yaml index 9442fee..e834d50 100644 --- a/deploy/helm-charts/values.yaml +++ b/deploy/helm-charts/values.yaml @@ -262,6 +262,8 @@ tolerations: [] affinity: {} +nvidia-device-plugin: + allowDefaultNamespace: true ### Argo Workflow ### argo-workflows: