-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathcifar10-job-template.yml
43 lines (43 loc) · 1.15 KB
/
cifar10-job-template.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
---
# Job template: runs `python cifar10_train.py` once (parallelism 1, no restart)
# inside the NVIDIA PyTorch container.
#
# `$ITEM` appears in the Job name and in the JOB_ID env var. It is presumably
# substituted by an external templating step (sed/envsubst or similar) before
# the manifest is submitted, one Job per item -- TODO confirm against the
# launcher script.
apiVersion: batch/v1
kind: Job
metadata:
  name: cifar10-single-job-$ITEM
spec:
  parallelism: 1
  template:
    metadata:
      name: cifar10-single-job
      labels:
        app: cifar10-single-job
    spec:
      # Never restart a failed container in place.
      restartPolicy: Never
      containers:
        - name: pytorch
          image: nvcr.io/nvidia/pytorch:18.11-py3
          # Same path as the cifar10-training volume mount below, so the
          # relative script name in `args` resolves inside that volume.
          workingDir: /cifar10-training
          env:
            # Quoted so the substituted value stays a string even if it
            # looks like a number or boolean.
            - name: JOB_ID
              value: "$ITEM"
          command: ["bash"]
          args: ["-c", "python cifar10_train.py"]
          # NOTE(review): `computeResourceRequests` is not a field of the
          # upstream core/v1 Container; it looks like it targets a
          # vendor-extended API server and refers by name to the
          # `computeResources` entry at the bottom of this pod spec. On
          # upstream Kubernetes the GPU would instead be requested via
          # `resources.limits` on the container -- confirm the target cluster
          # before changing.
          computeResourceRequests: ["nvidia-gpu"]
          volumeMounts:
            - name: cifar10-training
              mountPath: /cifar10-training
            - name: cifar10-dataset
              mountPath: /datasets
      volumes:
        # Training code, taken from the master branch of the repository
        # below; `directory: .` places the checkout at the volume root.
        # NOTE(review): the `gitRepo` volume type is deprecated upstream;
        # consider an initContainer that clones into an emptyDir.
        - name: cifar10-training
          gitRepo:
            repository: https://github.com/NVIDIA-developer-blog/kubernetes-hyperparam-exp.git
            revision: master
            directory: .
        # Dataset served over NFS.
        # NOTE(review): server IP and export path are hard-coded to one
        # environment; adjust for the target cluster.
        - name: cifar10-dataset
          nfs:
            server: 10.110.114.22
            path: /home/sprasanna/Projects/training-datasets
      # Named resource set with a limit of one `nvidia.com/gpu`; its name
      # matches the container's `computeResourceRequests` entry above.
      # NOTE(review): nesting under the pod spec is reconstructed and this
      # field is not part of the upstream PodSpec -- confirm.
      computeResources:
        - name: "nvidia-gpu"
          resources:
            limits:
              nvidia.com/gpu: 1