Skip to content

Commit

Permalink
Changes to CPU config for better CPU usage.
Browse files Browse the repository at this point in the history
  • Loading branch information
RoshaniN committed Jan 15, 2025
1 parent 8dfdc81 commit af889a4
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 16 deletions.
7 changes: 5 additions & 2 deletions src/xpk/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ def get_total_chips_requested_from_args(
num_chips = system.vms_per_slice * system.chips_per_vm * args.num_nodes
else:
num_chips = system.vms_per_slice * system.chips_per_vm * args.num_slices

xpk_print("ROSHANI NUM_CHIPS = ", num_chips)
return num_chips


Expand Down Expand Up @@ -2312,7 +2312,10 @@ def get_main_container_resources(
return gpu_resources_yaml.format(system=system)

if system.accelerator_type == AcceleratorType['CPU']:
return ''
# return ''
setup = int(system.chips_per_vm) * 0.95
xpk_print("ROSHANIN NUM_CHIPS2 ", setup)
return f'{resource_type}: {setup}'

return f'{resource_type}: {system.chips_per_vm}'

Expand Down
28 changes: 14 additions & 14 deletions src/xpk/core/system_characteristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1266,7 +1266,7 @@ def get_system_characteristics_by_device_type(
1,
'N/A',
'm1-megamem-96',
1,
96,
AcceleratorType['CPU'],
'm1-megamem-96-1',
),
Expand All @@ -1276,7 +1276,7 @@ def get_system_characteristics_by_device_type(
1,
'N/A',
'n2-standard-64',
1,
64,
AcceleratorType['CPU'],
'n2-standard-64-1',
),
Expand All @@ -1286,7 +1286,7 @@ def get_system_characteristics_by_device_type(
1,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-1',
),
Expand All @@ -1295,7 +1295,7 @@ def get_system_characteristics_by_device_type(
2,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-2',
),
Expand All @@ -1304,7 +1304,7 @@ def get_system_characteristics_by_device_type(
4,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-4',
),
Expand All @@ -1313,7 +1313,7 @@ def get_system_characteristics_by_device_type(
8,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-8',
),
Expand All @@ -1322,7 +1322,7 @@ def get_system_characteristics_by_device_type(
16,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-16',
),
Expand All @@ -1331,7 +1331,7 @@ def get_system_characteristics_by_device_type(
32,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-32',
),
Expand All @@ -1340,7 +1340,7 @@ def get_system_characteristics_by_device_type(
64,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-64',
),
Expand All @@ -1349,7 +1349,7 @@ def get_system_characteristics_by_device_type(
128,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-128',
),
Expand All @@ -1358,7 +1358,7 @@ def get_system_characteristics_by_device_type(
256,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-256',
),
Expand All @@ -1367,7 +1367,7 @@ def get_system_characteristics_by_device_type(
512,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-512',
),
Expand All @@ -1376,7 +1376,7 @@ def get_system_characteristics_by_device_type(
1024,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-1024',
),
Expand All @@ -1385,7 +1385,7 @@ def get_system_characteristics_by_device_type(
2048,
'N/A',
'n2-standard-32',
1,
32,
AcceleratorType['CPU'],
'n2-standard-32-2048',
),
Expand Down
114 changes: 114 additions & 0 deletions src/xpk/core/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# JobSet manifest for a single-replica, single-pod workload (replicas: 1,
# parallelism: 1, completions: 1) that runs the MaxText Mixtral 8x7b
# end-to-end test script inside one container.
# NOTE(review): this looks like rendered xpk workload output checked in for
# manual testing of the CPU resource limit -- confirm it belongs in the repo.
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: roshanin-test-5
  labels:
    kueue.x-k8s.io/queue-name: multislice-queue  # Name of the LocalQueue
    xpk.google.com/workload: roshanin-test-5
  annotations:
    alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  # 1:1 job replica to node pool assignment
spec:
  ttlSecondsAfterFinished: 43200
  failurePolicy:
    maxRestarts: 0
  replicatedJobs:
    - name: slice-job
      replicas: 1
      template:
        spec:
          parallelism: 1  # Equal to the number of VMs per slice
          completions: 1  # Same as the above.
          backoffLimit: 0  # When any pod fails, the job is failed
          template:
            metadata:
              labels:
                # Kept identical to metadata.labels above so pods can be
                # selected by the same workload name as the JobSet.
                xpk.google.com/workload: roshanin-test-5
            spec:
              schedulerName: default-scheduler
              restartPolicy: Never
              affinity:
                nodeAffinity:
                  requiredDuringSchedulingIgnoredDuringExecution:
                    nodeSelectorTerms:
                      - matchExpressions:
                          # Keep the workload off the cluster's default pool.
                          - key: cloud.google.com/gke-nodepool
                            operator: NotIn
                            values:
                              - default-pool

              # Intentionally empty: no accelerator/machine labels were
              # rendered for this workload.
              nodeSelector:

              priorityClassName: medium
              hostNetwork: true
              dnsPolicy: ClusterFirstWithHostNet
              terminationGracePeriodSeconds: 30
              containers:
                - name: jax-tpu
                  image: gcr.io/tpu-prod-env-multipod/maxtext_jax_stable_stack:2025-01-13

                  env:
                    # JOBSET_NAME must be declared before any variable that
                    # expands $(JOBSET_NAME); undefined references are left
                    # as literal text by Kubernetes.
                    - name: JOBSET_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
                    - name: REPLICATED_JOB_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
                    - name: JOB_INDEX
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['jobset.sigs.k8s.io/job-index']
                    - name: JOB_COMPLETION_INDEX
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
                    - name: PROCESSES_IN_JOB
                      value: "1"
                    - name: JAX_PROCESS_COUNT
                      value: "1"

                    - name: GCS_OUTPUT
                      value: "gs://ml-auto-solutions/output/sparsity_diffusion_devx/maxtext/chained_tests_mixtral-8x7b_stable-2025-01-13-05-00-09/"
                    - name: JAX_COORDINATOR_ADDRESS
                      value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"

                  ports:
                    - containerPort: 8471
                    - containerPort: 8080
                    - containerPort: 1234
                  securityContext:
                    privileged: true
                  command:
                    - bash
                    - -c
                    # Runs the test in a background subshell, forwards SIGTERM
                    # to it, polls until it exits, then propagates its exit code.
                    - |
                      echo XPK Start: $(date);
                      _sigterm() (kill -SIGTERM $! 2>/dev/null;);
                      trap _sigterm SIGTERM;
                      (set -xue;export BASE_OUTPUT_PATH=$GCS_OUTPUT; bash end_to_end/tpu/mixtral/8x7b/1_test_mixtral.sh) & PID=$!;
                      while kill -0 $PID 2>/dev/null;
                      do sleep 5;
                      done;
                      wait $PID;
                      EXIT_CODE=$?;
                      echo XPK End: $(date);
                      echo EXIT_CODE=$EXIT_CODE;
                      if [ "$EXIT_CODE" = 143 ]; then
                        exit $EXIT_CODE
                      fi
                      exit $EXIT_CODE
                  resources:
                    limits:
                      # NOTE(review): the CPU limit is empty in the original
                      # manifest. Presumably it was rendered before CPU
                      # resources were populated -- fill in the intended
                      # quantity (or drop the limit) before applying.
                      cpu:

                  volumeMounts:
                    - mountPath: /dev/shm
                      name: dshm-2

              volumes:
                # Memory-backed emptyDir mounted at /dev/shm in the container.
                - emptyDir:
                    medium: Memory
                  name: dshm-2

0 comments on commit af889a4

Please sign in to comment.