diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml
index 0f623c61..9f0307af 100644
--- a/.github/workflows/build_tests.yaml
+++ b/.github/workflows/build_tests.yaml
@@ -140,6 +140,8 @@ jobs:
       run: |
         make install
         echo $PWD/bin >> "$GITHUB_PATH"
+    - name: install kubectl and auth plugin
+      run: gcloud components install kubectl && gcloud components install gke-gcloud-auth-plugin
     - name: Check xpk installation
       run: xpk --help
     - name: Create a private Pathways-enabled XPK Cluster with 2x $TPU_TYPE nodepools. Larger num-nodes to avoid master resizing.
diff --git a/Makefile b/Makefile
index 952feddb..b1b7eec7 100644
--- a/Makefile
+++ b/Makefile
@@ -18,7 +18,7 @@ BIN_PATH=$(PROJECT_DIR)/bin
 install: check-python check-gcloud install-kueuectl install-kjob pip-install
 
 .PHONY: install-dev
-install-dev: check-python check-gcloud mkdir-bin install-kubectl install-kueuectl install-kjob pip-install install-pytest
+install-dev: check-python check-gcloud mkdir-bin install-kueuectl install-kjob pip-install install-pytest
 
 .PHONY: pip-install
 pip-install:
@@ -36,7 +36,7 @@ run-integrationtests:
 	pytest src/xpk/core/tests/integration/
 
 .PHONY: install-kjob
-install-kjob: install-kubectl
+install-kjob: mkdir-bin
 	docker build -f tools/Dockerfile-kjob -t $(KJOB_DOCKER_IMG) tools/
 	docker run -idt --name $(KJOB_DOCKER_CONTAINER) $(KJOB_DOCKER_IMG)
 	docker cp $(KJOB_DOCKER_CONTAINER):/kjob/bin/kubectl-kjob $(BIN_PATH)/kubectl-kjob
@@ -47,13 +47,8 @@ install-kjob: install-kubectl
 mkdir-bin:
 	mkdir -p $(BIN_PATH)
 
-.PHONY: install-kubectl
-install-kubectl: mkdir-bin
-	gcloud components install kubectl
-	gcloud components install gke-gcloud-auth-plugin
-
 .PHONY: install-kueuectl
-install-kueuectl: install-kubectl
+install-kueuectl: mkdir-bin
 	curl -Lo $(BIN_PATH)/kubectl-kueue $(KUEUECTL_URL)
 	chmod +x $(BIN_PATH)/kubectl-kueue
diff --git a/README.md b/README.md
index 7fc13e18..470593fa 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,8 @@ Following tools must be installed:
 - gcloud (install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the))
   - Run `gcloud init`
   - [Authenticate](https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login) to Google Cloud
+- kubectl (install from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl))
+  - Install `gke-gcloud-auth-plugin` from [here](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin)
 - docker ([installation instruction](https://docs.docker.com/engine/install/))
   - Run `gcloud auth configure-docker` to ensure images can be uploaded to registry
 - make - please run below command.
@@ -84,7 +86,6 @@ Following tools must be installed:
   apt-get -y install make
   ```
 In addition, below dependencies will be installed with `make install` command:
-- kubectl (install from [here](https://kubernetes.io/docs/tasks/tools/))
 - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
 - kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
 
diff --git a/pyproject.toml b/pyproject.toml
index cddeec3c..5ee87a21 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,8 @@ dependencies = [
     "tabulate",
     "ruamel.yaml",
     "pyyaml",
-    "docker"
+    "docker",
+    "packaging"
 ]
 
 [project.urls]
diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py
index bc26e4e8..b2a81bfe 100644
--- a/src/xpk/commands/workload.py
+++ b/src/xpk/commands/workload.py
@@ -214,6 +214,7 @@
   kueue.x-k8s.io/queue-name: {local_queue_name}  # Name of the LocalQueue
   xpk.google.com/workload: {args.workload}
 spec:
+  ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
   failurePolicy:
     maxRestarts: {args.max_restarts}
   successPolicy:
@@ -254,6 +255,27 @@
             volumeMounts:
             - mountPath: /tmp
               name: shared-tmp
+          initContainers:
+          # TODO(sujinesh): We should make this optional and only part of the
+          # workload if the user provides the image/enables remote python.
+          - name: remote-python-sidecar
+            image: {args.remote_python_sidecar_image}
+            imagePullPolicy: Always
+            command:
+            - "bash"
+            - "-c"
+            - |
+              python start_remote_python.py
+            securityContext:
+              privileged: true
+            volumeMounts:
+            - mountPath: /tmp
+              name: shared-tmp
+            ports:
+            - containerPort: 50051
+            env:
+            - name: GRPC_SERVER_ADDRESS
+              value: "0.0.0.0:50051"
           nodeSelector:
             {accelerator_label}
             {machine_label}
diff --git a/src/xpk/core/blueprint/blueprint_generator.py b/src/xpk/core/blueprint/blueprint_generator.py
index a5576b47..39b12c13 100644
--- a/src/xpk/core/blueprint/blueprint_generator.py
+++ b/src/xpk/core/blueprint/blueprint_generator.py
@@ -195,7 +195,7 @@ def generate_a3_mega_blueprint(
                 "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
                 "config_template_vars": {"num_chips": f"{num_chips}"},
             },
-            "jobset": {"install": True},
+            "jobset": {"install": True, "version": "v0.7.2"},
         },
     )
 
@@ -482,7 +482,13 @@ def generate_a3_ultra_blueprint(
         use=[net_0_id],
         settings={
             "release_channel": "RAPID",
-            "min_master_version": "1.31.4-gke.1072000",
+            "version_prefix": "1.31.",
+            "maintenance_exclusions": [{
+                "name": "no-minor-or-node-upgrades-indefinite",
+                "start_time": "2024-12-01T00:00:00Z",
+                "end_time": "2025-12-22T00:00:00Z",
+                "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
+            }],
             "prefix_with_deployment_name": False,
             "name_suffix": cluster_name,
             "system_node_pool_machine_type": system_node_pool_machine_type,
@@ -567,7 +573,7 @@
                 "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
                 "config_template_vars": {"num_chips": f"{num_chips}"},
             },
-            "jobset": {"install": True, "version": "v0.7.1"},
+            "jobset": {"install": True, "version": "v0.7.2"},
             "apply_manifests": [
                 {"source": nccl_installer_path},
                 {"source": mlgru_disable_path},
diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py
index b0c8095b..9562e3f2 100644
--- a/src/xpk/core/core.py
+++ b/src/xpk/core/core.py
@@ -2311,6 +2311,9 @@ def get_main_container_resources(
   if system.accelerator_type == AcceleratorType['GPU']:
     return gpu_resources_yaml.format(system=system)
 
+  if system.accelerator_type == AcceleratorType['CPU']:
+    return ''
+
   return f'{resource_type}: {system.chips_per_vm}'
 
 
diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py
index 1f02e860..cc138c74 100644
--- a/src/xpk/core/kueue.py
+++ b/src/xpk/core/kueue.py
@@ -15,6 +15,7 @@
 """
 
 from argparse import Namespace
+from packaging.version import Version
 from ..utils.file import write_tmp_file
 from ..utils.console import xpk_print, xpk_exit
 from .commands import run_command_with_updates, run_command_with_updates_retry, run_command_for_value
@@ -168,6 +169,46 @@ def verify_kueuectl(args: Namespace) -> None:
     xpk_exit(verify_kueuectl_installed_code)
 
 
+def delete_multikueueconfigs_definitions(args) -> int:
+  command = 'kubectl delete crd multikueueconfigs.kueue.x-k8s.io'
+  task = 'Delete multikueueconfigs crds'
+  return_code = run_command_with_updates_retry(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} returned ERROR {return_code}')
+  return return_code
+
+
+def delete_multikueueclusters_definitions(args) -> int:
+  command = 'kubectl delete crd multikueueclusters.kueue.x-k8s.io'
+  task = 'Delete multikueueclusters crds'
+  return_code = run_command_with_updates_retry(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} returned ERROR {return_code}')
+  return return_code
+
+
+def get_kueue_version(args) -> tuple[int, str]:
+  """Get the version of Kueue installed on the cluster.
+
+  Returns:
+    Tuple of (return code, Kueue version taken from the manager image tag).
+  """
+  command = 'kubectl kueue version'
+  task = 'Get kueue version on server'
+  return_code, val = run_command_for_value(command, task, args)
+  if return_code != 0:
+    return return_code, ''
+  # The first line of the output is the kueuectl client version; when Kueue is
+  # installed on the cluster, the second line reports the controller manager
+  # image, whose tag is the Kueue version.
+  lines = val.splitlines()
+  if len(lines) < 2:
+    return 1, ''
+  server_version_line = lines[1]
+  manager_image_version = server_version_line.split(':')[-1]
+  return return_code, manager_image_version
+
+
 def install_kueue_on_cluster(args) -> int:
   """Install Kueue on the cluster.
@@ -177,6 +218,20 @@
   Returns:
     0 if successful and 1 otherwise.
   """
+
+  err_code, kueue_version_installed = get_kueue_version(args)
+  if err_code == 0:
+    if Version(kueue_version_installed) < Version('v0.9.0') and Version(
+        KUEUE_VERSION
+    ) >= Version('v0.9.0'):
+      xpk_print('Upgrading kueue on cluster from version < 0.9.0.')
+      upgrade_code = delete_multikueueclusters_definitions(args)
+      if upgrade_code != 0:
+        return upgrade_code
+      upgrade_code = delete_multikueueconfigs_definitions(args)
+      if upgrade_code != 0:
+        return upgrade_code
+
   command = (
       'kubectl apply --server-side --force-conflicts -f'
       f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml'
diff --git a/src/xpk/core/tests/data/a3_mega.yaml b/src/xpk/core/tests/data/a3_mega.yaml
index c3a50eb2..a13779e5 100644
--- a/src/xpk/core/tests/data/a3_mega.yaml
+++ b/src/xpk/core/tests/data/a3_mega.yaml
@@ -101,6 +101,7 @@ deployment_groups:
         config_template_vars: {num_chips: "16"}
       jobset:
         install: true
+        version: v0.7.2
 
   - !DeploymentModule
     id: workload_configmap
diff --git a/src/xpk/core/tests/data/a3_ultra.yaml b/src/xpk/core/tests/data/a3_ultra.yaml
index 8258cfc9..6983e769 100644
--- a/src/xpk/core/tests/data/a3_ultra.yaml
+++ b/src/xpk/core/tests/data/a3_ultra.yaml
@@ -86,7 +86,12 @@ deployment_groups:
     use: [gke-a3-ultra-net-0]
     settings:
       release_channel: "RAPID"
-      min_master_version: "1.31.4-gke.1072000"
+      version_prefix: "1.31."
+      maintenance_exclusions:
+      - name: no-minor-or-node-upgrades-indefinite
+        start_time: "2024-12-01T00:00:00Z"
+        end_time: "2025-12-22T00:00:00Z"
+        exclusion_scope: NO_MINOR_OR_NODE_UPGRADES
       prefix_with_deployment_name: false
       name_suffix: gke-a3-ultra
       system_node_pool_machine_type: "e2-standard-16"
@@ -139,7 +144,7 @@
         num_chips: "16"
       jobset:
         install: true
-        version: v0.7.1
+        version: v0.7.2
       apply_manifests:
       - source: $(ghpc_stage("xpk-gke-a3-ultra"))/nccl-installer.yaml
       - source: $(ghpc_stage("xpk-gke-a3-ultra"))/mlgru-disable.yaml
diff --git a/src/xpk/parser/workload.py b/src/xpk/parser/workload.py
index f12b4224..a4f0c149 100644
--- a/src/xpk/parser/workload.py
+++ b/src/xpk/parser/workload.py
@@ -114,15 +114,6 @@ def set_workload_parsers(workload_parser):
       ),
   )
 
-  workload_create_parser_optional_arguments.add_argument(
-      '--ttl-seconds-after-finished',
-      type=int,
-      default=12 * 60 * 60,
-      help=(
-          'Set the number of seconds to clean up finished Jobsets (either'
-          ' Complete or Failed). This is by default set to 12 hours.'
-      ),
-  )
   workload_create_parser_optional_arguments.add_argument(
       '--num-nodes',
       type=int,
@@ -493,6 +484,15 @@ def add_shared_workload_create_optional_arguments(args_parsers):
           ' Defaults to 0.'
       ),
   )
+  custom_parser.add_argument(
+      '--ttl-seconds-after-finished',
+      type=int,
+      default=12 * 60 * 60,
+      help=(
+          'Set the number of seconds to clean up finished Jobsets (either'
+          ' Complete or Failed). This is by default set to 12 hours.'
+      ),
+  )
   custom_parser.add_argument(
       '-tgps',
       '--termination-grace-period-seconds',
@@ -503,6 +503,14 @@
           ' event or deletion request.Defaults to 30 seconds.'
       ),
   )
+  custom_parser.add_argument(
+      '--remote-python-sidecar-image',
+      type=str,
+      default='TempImageLocation',
+      help=(
+          'Remote Python sidecar image to use.'
+      ),
+  )
   custom_parser.add_argument(
       '--enable-debug-logs',
       action='store_true',
diff --git a/xpk-large-scale-guide.sh b/xpk-large-scale-guide.sh
index 56ea3859..8c6ba6dd 100644
--- a/xpk-large-scale-guide.sh
+++ b/xpk-large-scale-guide.sh
@@ -121,10 +121,8 @@ export CLUSTER_ARGUMENTS=" \
  --subnetwork=${SUBNET_NAME} \
  --scopes=storage-full,gke-default \
  --enable-ip-alias \
- --enable-private-nodes \
  --master-ipv4-cidr 172.16.0.32/28 \
  --cluster-ipv4-cidr=10.224.0.0/12 \
- --no-enable-master-authorized-networks \
 "
 
 export TPU_NODEPOOL_ARGUMENTS=" \
@@ -148,13 +146,13 @@ echo python3 xpk.py cluster create \
  --num-slices="${NUMSLICES}" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \
- --custom-tpu-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
+ --custom-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
 
 # example output ...
 # python3 xpk.py cluster create --cluster NAME \
 # --tpu-type=v5litepod-256 --num-slices=4 \
 # --host-maintenance-interval=PERIODIC \
-# --custom-cluster-arguments=" --network=NETWORK --subnetwork=SUBNET --scopes=storage-full,gke-default --enable-ip-alias --enable-private-nodes --master-ipv4-cidr 172.16.0.32/28 --cluster-ipv4-cidr=10.224.0.0/12 --no-enable-master-authorized-networks"
-# --custom-tpu-nodepool-arguments=" --scopes=storage-full,gke-default --enable-gvnic --max-pods-per-node 15 --disk-size=50"
+# --custom-cluster-arguments=" --network=NETWORK --subnetwork=SUBNET --scopes=storage-full,gke-default --enable-ip-alias --master-ipv4-cidr 172.16.0.32/28 --cluster-ipv4-cidr=10.224.0.0/12"
+# --custom-nodepool-arguments=" --scopes=storage-full,gke-default --enable-gvnic --max-pods-per-node 15 --disk-size=50"
 
@@ -168,7 +166,7 @@ python3 xpk.py cluster create \
  --num-slices="${NUMSLICES}" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \
- --custom-tpu-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
+ --custom-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
 
 # This process takes around 4 minutes with 4 slices of v5e-256.
 
@@ -263,7 +261,7 @@ python3 xpk.py cluster create \
  --num-slices="${NUMSLICES}" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \
- --custom-tpu-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
+ --custom-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
 
 ##### STEP 4 ###################################################
 ##### PASS Cluster name and Project ID to Google POCs ##########
@@ -330,14 +328,14 @@ echo python3 xpk.py cluster create \
  --num-slices="${NUMSLICES}" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \
- --custom-tpu-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
+ --custom-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
 
 # example output ...
 # python3 xpk.py cluster create --cluster NAME \
 # --tpu-type=v5litepod-256 --num-slices=64 \
 # --host-maintenance-interval=PERIODIC \
-# --custom-cluster-arguments=" --network=NETWORK --subnetwork=SUBNET --scopes=storage-full,gke-default --enable-ip-alias --enable-private-nodes --master-ipv4-cidr 172.16.0.32/28 --cluster-ipv4-cidr=10.224.0.0/12 --no-enable-master-authorized-networks"
-# --custom-tpu-nodepool-arguments=" --scopes=storage-full,gke-default --enable-gvnic --max-pods-per-node 15 --disk-size=50"
+# --custom-cluster-arguments=" --network=NETWORK --subnetwork=SUBNET --scopes=storage-full,gke-default --enable-ip-alias --master-ipv4-cidr 172.16.0.32/28 --cluster-ipv4-cidr=10.224.0.0/12"
+# --custom-nodepool-arguments=" --scopes=storage-full,gke-default --enable-gvnic --max-pods-per-node 15 --disk-size=50"
 
 ##### 5C #####################
 # Scale up to NUMSLICES (64 in the provided case) V5e-256s.
@@ -348,7 +346,7 @@ python3 xpk.py cluster create \
  --num-slices="${NUMSLICES}" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \
- --custom-tpu-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
+ --custom-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}"
 
 ###############################
 ##### 5C - POTENTIAL ERRORS ###