From 32ff651206f68930fca97a49ebf65f68ebd33cb8 Mon Sep 17 00:00:00 2001
From: Jon Huhn
Date: Thu, 3 Oct 2024 13:32:40 -0500
Subject: [PATCH] add HorizontalPodAutoscaler stub to catalog

Add a reusable hpa kustomize component plus per-blueprint
HorizontalPodAutoscaler patches for the jetstream and vllm GKE
blueprints. The component stays commented out in each kustomization
until the metric wiring and scaling best practices are finalized.
---
 .../core/deployment/components/hpa/hpa.yaml   | 10 ++++++++++
 .../components/hpa/kustomization.yaml         |  6 ++++++
 .../jetstream/gemma-7b-it/gke/hpa.patch.yaml  | 20 +++++++++++++++++++
 .../gemma-7b-it/gke/kustomization.yaml        |  6 ++++++
 .../jetstream/llama3-8b/gke/hpa.patch.yaml    | 20 +++++++++++++++++++
 .../llama3-8b/gke/kustomization.yaml          |  6 ++++++
 .../vllm/gemma-2b/gke/hpa.patch.yaml          | 17 ++++++++++++++++
 .../vllm/gemma-2b/gke/kustomization.yaml      |  6 ++++++
 .../vllm/llama3-8b/gke/hpa.patch.yaml         | 17 ++++++++++++++++
 .../vllm/llama3-8b/gke/kustomization.yaml     |  6 ++++++
 10 files changed, 114 insertions(+)
 create mode 100644 serving-catalog/core/deployment/components/hpa/hpa.yaml
 create mode 100644 serving-catalog/core/deployment/components/hpa/kustomization.yaml
 create mode 100644 serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml

diff --git a/serving-catalog/core/deployment/components/hpa/hpa.yaml b/serving-catalog/core/deployment/components/hpa/hpa.yaml
new file mode 100644
index 0000000..74815e7
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/hpa.yaml
@@ -0,0 +1,10 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: singlehost-inference-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: singlehost-inference-deployment-blueprint
+  # TODO: also include stabilization windows, tolerance thresholds, etc.
diff --git a/serving-catalog/core/deployment/components/hpa/kustomization.yaml b/serving-catalog/core/deployment/components/hpa/kustomization.yaml
new file mode 100644
index 0000000..ed69ab0
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/kustomization.yaml
@@ -0,0 +1,6 @@
+# kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1alpha1
+kind: Component
+
+resources:
+- hpa.yaml
diff --git a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml
new file mode 100644
index 0000000..7291ce3
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml
@@ -0,0 +1,20 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: gemma-7b-it-jetstream-inference-server
+  name: gemma-7b-it-jetstream-hpa
+spec:
+  # TODO: add best practices as defined at:
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling-tpu
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: jetstream-token-latency-ms
+      target:
+        type: AverageValue
+        averageValue: 50
diff --git a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
index 57be8f8..47cce52 100644
--- a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 
 components:
 - ../../../components/gke/resources/tpu/v5e-2x4
+# - ../../../components/hpa # HPA is a work-in-progress
 
 patches:
 - path: deployment.patch.yaml
@@ -15,3 +16,8 @@ patches:
 - path: job.patch.yaml
   target:
     kind: Job
+- options:
+    allowNameChange: true
+  path: hpa.patch.yaml
+  target:
+    kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..4d83636
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml
@@ -0,0 +1,20 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: llama3-8b-jetstream-inference-server
+  name: llama3-8b-jetstream-hpa
+spec:
+  # TODO: add best practices as defined at:
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling-tpu
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: jetstream-token-latency-ms
+      target:
+        type: AverageValue
+        averageValue: 50
diff --git a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
index 57be8f8..47cce52 100644
--- a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 
 components:
 - ../../../components/gke/resources/tpu/v5e-2x4
+# - ../../../components/hpa # HPA is a work-in-progress
 
 patches:
 - path: deployment.patch.yaml
@@ -15,3 +16,8 @@ patches:
 - path: job.patch.yaml
   target:
     kind: Job
+- options:
+    allowNameChange: true
+  path: hpa.patch.yaml
+  target:
+    kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..8542ad6
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml
@@ -0,0 +1,17 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: gemma-2b-vllm-inference-server
+  name: gemma-2b-vllm-hpa
+spec:
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: vllm-token-latency-ms
+      target:
+        type: AverageValue
+        averageValue: 50
diff --git a/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml b/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
index 56515de..a47059c 100644
--- a/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
@@ -7,8 +7,14 @@ resources:
 
 components:
 - ../../../components/gke/resources/gpu/1-L4
+# - ../../../components/hpa # HPA is a work-in-progress
 
 patches:
 - path: deployment.patch.yaml
   target:
     kind: Deployment
+- options:
+    allowNameChange: true
+  path: hpa.patch.yaml
+  target:
+    kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..66044ac
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml
@@ -0,0 +1,17 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: llama3-8b-vllm-inference-server
+  name: llama3-8b-vllm-hpa
+spec:
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: vllm-token-latency-ms
+      target:
+        type: AverageValue
+        averageValue: 50
diff --git a/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml b/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
index 56515de..a47059c 100644
--- a/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
@@ -7,8 +7,14 @@ resources:
 
 components:
 - ../../../components/gke/resources/gpu/1-L4
+# - ../../../components/hpa # HPA is a work-in-progress
 
 patches:
 - path: deployment.patch.yaml
   target:
     kind: Deployment
+- options:
+    allowNameChange: true
+  path: hpa.patch.yaml
+  target:
+    kind: HorizontalPodAutoscaler
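
Reviewer note (not part of the commit): the TODO in
serving-catalog/core/deployment/components/hpa/hpa.yaml refers to the
autoscaling/v2 behavior field. A minimal sketch of what the stub might grow
into, with illustrative values rather than tuned recommendations:

    spec:
      behavior:
        scaleDown:
          stabilizationWindowSeconds: 300  # act on the lowest recommendation seen over the last 5 min
          policies:
          - type: Pods
            value: 1
            periodSeconds: 60              # remove at most one replica per minute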
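To try the stub locally, uncomment the hpa component line in a blueprint's
kustomization.yaml and render the blueprint offline (this assumes a checkout
of this repository and a kubectl whose bundled kustomize supports components):

    kubectl kustomize serving-catalog/core/deployment/vllm/gemma-2b/gke

The Pods metrics (jetstream-token-latency-ms, vllm-token-latency-ms) also
assume a custom-metrics adapter exports them; without one the rendered HPA
will record FailedGetPodsMetric events and hold the current replica count.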