diff --git a/serving-catalog/core/deployment/components/hpa/base/hpa.yaml b/serving-catalog/core/deployment/components/hpa/base/hpa.yaml
new file mode 100644
index 0000000..74815e7
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/base/hpa.yaml
@@ -0,0 +1,10 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: singlehost-inference-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: singlehost-inference-deployment-blueprint
+  # TODO: also include stabilizing-windows, tolerance threshold, etc.
diff --git a/serving-catalog/core/deployment/components/hpa/base/kustomization.yaml b/serving-catalog/core/deployment/components/hpa/base/kustomization.yaml
new file mode 100644
index 0000000..ed69ab0
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/base/kustomization.yaml
@@ -0,0 +1,6 @@
+# kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1alpha1
+kind: Component
+
+resources:
+- hpa.yaml
diff --git a/serving-catalog/core/deployment/components/hpa/jetstream/token-latency/hpa.patch.yaml b/serving-catalog/core/deployment/components/hpa/jetstream/token-latency/hpa.patch.yaml
new file mode 100644
index 0000000..5c38495
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/jetstream/token-latency/hpa.patch.yaml
@@ -0,0 +1,17 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: singlehost-inference-hpa
+spec:
+  # TODO: add best practices as defined at:
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling-tpu
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: jetstream-token-latency-ms
+      target:
+        type: AverageValue
diff --git a/serving-catalog/core/deployment/components/hpa/jetstream/token-latency/kustomization.yaml b/serving-catalog/core/deployment/components/hpa/jetstream/token-latency/kustomization.yaml
new file mode 100644
index 0000000..f25ba19
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/jetstream/token-latency/kustomization.yaml
@@ -0,0 +1,11 @@
+# kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1alpha1
+kind: Component
+
+components:
+  - ../../base
+
+patches:
+  - path: hpa.patch.yaml
+    target:
+      kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/components/hpa/vllm/token-latency/hpa.patch.yaml b/serving-catalog/core/deployment/components/hpa/vllm/token-latency/hpa.patch.yaml
new file mode 100644
index 0000000..fb0791c
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/vllm/token-latency/hpa.patch.yaml
@@ -0,0 +1,14 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: singlehost-inference-hpa
+spec:
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: vllm-token-latency-ms
+      target:
+        type: AverageValue
diff --git a/serving-catalog/core/deployment/components/hpa/vllm/token-latency/kustomization.yaml b/serving-catalog/core/deployment/components/hpa/vllm/token-latency/kustomization.yaml
new file mode 100644
index 0000000..f25ba19
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/vllm/token-latency/kustomization.yaml
@@ -0,0 +1,11 @@
+# kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1alpha1
+kind: Component
+
+components:
+  - ../../base
+
+patches:
+  - path: hpa.patch.yaml
+    target:
+      kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml
new file mode 100644
index 0000000..89052c6
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml
@@ -0,0 +1,10 @@
+- op: add
+  path: /metadata/name
+  value: gemma-7b-it-jetstream-hpa
+- op: add
+  path: /metadata/labels
+  value:
+    app: gemma-7b-it-jetstream-inference-server
+- op: add
+  path: /spec/metrics/0/pods/target/averageValue
+  value: 50
diff --git a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
index 57be8f8..4684303 100644
--- a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 
 components:
   - ../../../components/gke/resources/tpu/v5e-2x4
+  # - ../../../components/hpa/jetstream/token-latency # HPA is a work-in-progress
 
 patches:
   - path: deployment.patch.yaml
@@ -15,3 +16,8 @@ patches:
   - path: job.patch.yaml
     target:
       kind: Job
+  - options:
+      allowNameChange: true
+    path: hpa.patch.yaml
+    target:
+      kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..9c65a32
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml
@@ -0,0 +1,10 @@
+- op: add
+  path: /metadata/name
+  value: llama3-8b-jetstream-hpa
+- op: add
+  path: /metadata/labels
+  value:
+    app: llama3-8b-jetstream-inference-server
+- op: add
+  path: /spec/metrics/0/pods/target/averageValue
+  value: 50
diff --git a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
index 57be8f8..4684303 100644
--- a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 
 components:
   - ../../../components/gke/resources/tpu/v5e-2x4
+  # - ../../../components/hpa/jetstream/token-latency # HPA is a work-in-progress
 
 patches:
   - path: deployment.patch.yaml
@@ -15,3 +16,8 @@ patches:
   - path: job.patch.yaml
     target:
       kind: Job
+  - options:
+      allowNameChange: true
+    path: hpa.patch.yaml
+    target:
+      kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..ed0b8e7
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml
@@ -0,0 +1,10 @@
+- op: add
+  path: /metadata/name
+  value: gemma-2b-vllm-hpa
+- op: add
+  path: /metadata/labels
+  value:
+    app: gemma-2b-vllm-inference-server
+- op: add
+  path: /spec/metrics/0/pods/target/averageValue
+  value: 50
diff --git a/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml b/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
index 56515de..10333bf 100644
--- a/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
@@ -7,8 +7,14 @@ resources:
 
 components:
   - ../../../components/gke/resources/gpu/1-L4
+  # - ../../../components/hpa/vllm/token-latency # HPA is a work-in-progress
 
 patches:
   - path: deployment.patch.yaml
     target:
       kind: Deployment
+  - options:
+      allowNameChange: true
+    path: hpa.patch.yaml
+    target:
+      kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/vllm/llama3-70b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/llama3-70b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..7142681
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/llama3-70b/gke/hpa.patch.yaml
@@ -0,0 +1,10 @@
+- op: add
+  path: /metadata/name
+  value: llama3-70b-vllm-hpa
+- op: add
+  path: /metadata/labels
+  value:
+    app: llama3-70b-vllm-inference-server
+- op: add
+  path: /spec/metrics/0/pods/target/averageValue
+  value: 50
diff --git a/serving-catalog/core/deployment/vllm/llama3-70b/gke/kustomization.yaml b/serving-catalog/core/deployment/vllm/llama3-70b/gke/kustomization.yaml
index 9a33e24..f8e194e 100644
--- a/serving-catalog/core/deployment/vllm/llama3-70b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/llama3-70b/gke/kustomization.yaml
@@ -7,8 +7,14 @@ resources:
 
 components:
   - ../../../components/gke/resources/gpu/8-L4
+  # - ../../../components/hpa/vllm/token-latency # HPA is a work-in-progress
 
 patches:
   - path: deployment.patch.yaml
     target:
       kind: Deployment
+  - options:
+      allowNameChange: true
+    path: hpa.patch.yaml
+    target:
+      kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..eb0e5ad
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml
@@ -0,0 +1,10 @@
+- op: add
+  path: /metadata/name
+  value: llama3-8b-vllm-hpa
+- op: add
+  path: /metadata/labels
+  value:
+    app: llama3-8b-vllm-inference-server
+- op: add
+  path: /spec/metrics/0/pods/target/averageValue
+  value: 50
diff --git a/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml b/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
index 56515de..10333bf 100644
--- a/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
@@ -7,8 +7,14 @@ resources:
 
 components:
   - ../../../components/gke/resources/gpu/1-L4
+  # - ../../../components/hpa/vllm/token-latency # HPA is a work-in-progress
 
 patches:
   - path: deployment.patch.yaml
     target:
       kind: Deployment
+  - options:
+      allowNameChange: true
+    path: hpa.patch.yaml
+    target:
+      kind: HorizontalPodAutoscaler
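For reference, below is a minimal sketch of what the composed HPA for the vllm/gemma-2b overlay should render to once the commented-out component is re-enabled: the base hpa.yaml supplies the scaleTargetRef, the token-latency component adds the replica bounds and Pods metric, and the per-model JSON patch sets the name, labels, and averageValue. The behavior block is an illustrative assumption for the stabilization-window TODO in the base file, not part of this change, and its values would need tuning per workload.

apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: gemma-2b-vllm-hpa
  labels:
    app: gemma-2b-vllm-inference-server
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    # Name from the base component; the overlay's deployment.patch.yaml
    # (not shown in this diff) may rename the target.
    name: singlehost-inference-deployment-blueprint
  minReplicas: 1
  maxReplicas: 10
  behavior:                              # illustrative sketch for the base TODO
    scaleDown:
      stabilizationWindowSeconds: 300    # assumed: wait 5 min before scaling down
      policies:
      - type: Pods
        value: 1                         # assumed: remove at most 1 pod per minute
        periodSeconds: 60
  metrics:
  - type: Pods
    pods:
      metric:
        name: vllm-token-latency-ms      # per-pod custom metric from the component
      target:
        type: AverageValue
        averageValue: "50"               # set by the per-model JSON patch

The composed output can be inspected with kubectl kustomize serving-catalog/core/deployment/vllm/gemma-2b/gke. Note that the tolerance threshold mentioned in the same TODO is a controller-level setting (the kube-controller-manager flag --horizontal-pod-autoscaler-tolerance, default 0.1) rather than a field on the autoscaling/v2 resource.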