From 5efe829292ba8236d7ed589140b6f492b4040ced Mon Sep 17 00:00:00 2001
From: Jon Huhn
Date: Thu, 3 Oct 2024 13:32:40 -0500
Subject: [PATCH] add HorizontalPodAutoscaler stub to catalog

---
# Review notes (free text placed before the diffstat; not part of any hunk):
# * New files: one base HorizontalPodAutoscaler (base/hpa.yaml) plus one
#   hpa.patch.yaml per runtime directory (jetstream, vllm) and per model directory.
# * base/hpa.yaml only sets scaleTargetRef (apps/v1 Deployment
#   singlehost-inference-deployment-blueprint). Replica bounds and metrics come
#   from the runtime-level patches: minReplicas 1, maxReplicas 10 and a single
#   Pods metric with an AverageValue target of 50.
# * The model-level patches only set an `app` label and a model-specific HPA name.
# * Every kustomization.yaml hunk adds its HPA entry commented out under
#   "HPA is still a work-in-progress", so none of the new files is referenced
#   by a kustomization in this patch.
# * NOTE(review): the Pods metric names (jetstream-token-latency-ms,
#   vllm-token-latency-ms) are not defined anywhere in this patch; confirm they
#   exist in the metrics pipeline before uncommenting the HPA entries.
# * NOTE(review): the line layout and indentation of this patch were restored
#   from a whitespace-collapsed copy; verify against the original commit before
#   applying.
 serving-catalog/core/deployment/base/hpa.yaml         |  9 +++++++++
 .../core/deployment/base/kustomization.yaml           |  1 +
 .../core/deployment/jetstream/base/hpa.patch.yaml     | 15 +++++++++++++++
 .../deployment/jetstream/base/kustomization.yaml      |  6 ++++++
 .../jetstream/gemma-7b-it/base/hpa.patch.yaml         |  6 ++++++
 .../jetstream/gemma-7b-it/base/kustomization.yaml     |  6 ++++++
 .../jetstream/llama3-8b/base/hpa.patch.yaml           |  6 ++++++
 .../jetstream/llama3-8b/base/kustomization.yaml       |  6 ++++++
 .../core/deployment/vllm/base/hpa.patch.yaml          | 15 +++++++++++++++
 .../core/deployment/vllm/base/kustomization.yaml      |  6 ++++++
 .../deployment/vllm/gemma-2b/base/hpa.patch.yaml      |  6 ++++++
 .../vllm/gemma-2b/base/kustomization.yaml             |  6 ++++++
 .../deployment/vllm/llama3-8b/base/hpa.patch.yaml     |  6 ++++++
 .../vllm/llama3-8b/base/kustomization.yaml            |  6 ++++++
 14 files changed, 100 insertions(+)
 create mode 100644 serving-catalog/core/deployment/base/hpa.yaml
 create mode 100644 serving-catalog/core/deployment/jetstream/base/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/jetstream/gemma-7b-it/base/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/jetstream/llama3-8b/base/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/vllm/base/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/vllm/gemma-2b/base/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/vllm/llama3-8b/base/hpa.patch.yaml

diff --git a/serving-catalog/core/deployment/base/hpa.yaml b/serving-catalog/core/deployment/base/hpa.yaml
new file mode 100644
index 0000000..c54850a
--- /dev/null
+++ b/serving-catalog/core/deployment/base/hpa.yaml
@@ -0,0 +1,9 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: singlehost-inference-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: singlehost-inference-deployment-blueprint
diff --git a/serving-catalog/core/deployment/base/kustomization.yaml b/serving-catalog/core/deployment/base/kustomization.yaml
index 3097586..72ce5bc 100644
--- a/serving-catalog/core/deployment/base/kustomization.yaml
+++ b/serving-catalog/core/deployment/base/kustomization.yaml
@@ -4,3 +4,4 @@ kind: Kustomization
 
 resources:
 - deployment.yaml
+# - hpa.yaml # HPA is still a work-in-progress
diff --git a/serving-catalog/core/deployment/jetstream/base/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/base/hpa.patch.yaml
new file mode 100644
index 0000000..aa440c6
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/base/hpa.patch.yaml
@@ -0,0 +1,15 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: jetstream-hpa-blueprint
+spec:
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+    - type: Pods
+      pods:
+        metric:
+          name: jetstream-token-latency-ms
+        target:
+          type: AverageValue
+          averageValue: 50
diff --git a/serving-catalog/core/deployment/jetstream/base/kustomization.yaml b/serving-catalog/core/deployment/jetstream/base/kustomization.yaml
index 8dffffe..228e759 100644
--- a/serving-catalog/core/deployment/jetstream/base/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/base/kustomization.yaml
@@ -13,3 +13,9 @@ patches:
       group: apps
       version: v1
       kind: Deployment
+  # HPA is still a work-in-progress
+  # - path: hpa.patch.yaml
+  #   target:
+  #     group: autoscaling
+  #     version: v2
+  #     kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/jetstream/gemma-7b-it/base/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/gemma-7b-it/base/hpa.patch.yaml
new file mode 100644
index 0000000..0901bb8
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/gemma-7b-it/base/hpa.patch.yaml
@@ -0,0 +1,6 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: gemma-7b-it-jetstream-inference-server
+  name: gemma-7b-it-jetstream-hpa
diff --git a/serving-catalog/core/deployment/jetstream/gemma-7b-it/base/kustomization.yaml b/serving-catalog/core/deployment/jetstream/gemma-7b-it/base/kustomization.yaml
index b235d6f..a14bc77 100644
--- a/serving-catalog/core/deployment/jetstream/gemma-7b-it/base/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/gemma-7b-it/base/kustomization.yaml
@@ -11,6 +11,12 @@ patches:
       kind: Deployment
     options:
       allowNameChange: true
+  # HPA is still a work-in-progress
+  # - path: hpa.patch.yaml
+  #   target:
+  #     kind: HorizontalPodAutoscaler
+  #   options:
+  #     allowNameChange: true
   - path: job.patch.yaml
     target:
       kind: Job
diff --git a/serving-catalog/core/deployment/jetstream/llama3-8b/base/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/llama3-8b/base/hpa.patch.yaml
new file mode 100644
index 0000000..d3553d9
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/llama3-8b/base/hpa.patch.yaml
@@ -0,0 +1,6 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: llama3-8b-jetstream-inference-server
+  name: llama3-8b-jetstream-hpa
diff --git a/serving-catalog/core/deployment/jetstream/llama3-8b/base/kustomization.yaml b/serving-catalog/core/deployment/jetstream/llama3-8b/base/kustomization.yaml
index b235d6f..a14bc77 100644
--- a/serving-catalog/core/deployment/jetstream/llama3-8b/base/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/llama3-8b/base/kustomization.yaml
@@ -11,6 +11,12 @@ patches:
       kind: Deployment
     options:
       allowNameChange: true
+  # HPA is still a work-in-progress
+  # - path: hpa.patch.yaml
+  #   target:
+  #     kind: HorizontalPodAutoscaler
+  #   options:
+  #     allowNameChange: true
   - path: job.patch.yaml
     target:
       kind: Job
diff --git a/serving-catalog/core/deployment/vllm/base/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/base/hpa.patch.yaml
new file mode 100644
index 0000000..518622c
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/base/hpa.patch.yaml
@@ -0,0 +1,15 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: vllm-hpa-blueprint
+spec:
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+    - type: Pods
+      pods:
+        metric:
+          name: vllm-token-latency-ms
+        target:
+          type: AverageValue
+          averageValue: 50
diff --git a/serving-catalog/core/deployment/vllm/base/kustomization.yaml b/serving-catalog/core/deployment/vllm/base/kustomization.yaml
index 6a4a488..7ad4ed6 100644
--- a/serving-catalog/core/deployment/vllm/base/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/base/kustomization.yaml
@@ -12,3 +12,9 @@ patches:
       group: apps
       version: v1
       kind: Deployment
+  # HPA is still a work-in-progress
+  # - path: hpa.patch.yaml
+  #   target:
+  #     group: autoscaling
+  #     version: v2
+  #     kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/vllm/gemma-2b/base/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/gemma-2b/base/hpa.patch.yaml
new file mode 100644
index 0000000..c54d08e
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/gemma-2b/base/hpa.patch.yaml
@@ -0,0 +1,6 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: gemma-2b-vllm-inference-server
+  name: gemma-2b-vllm-hpa
diff --git a/serving-catalog/core/deployment/vllm/gemma-2b/base/kustomization.yaml b/serving-catalog/core/deployment/vllm/gemma-2b/base/kustomization.yaml
index f5845bd..17bf17e 100644
--- a/serving-catalog/core/deployment/vllm/gemma-2b/base/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/gemma-2b/base/kustomization.yaml
@@ -11,6 +11,12 @@ patches:
       kind: Deployment
     options:
       allowNameChange: true
+  # HPA is still a work-in-progress
+  # - path: hpa.patch.yaml
+  #   target:
+  #     kind: HorizontalPodAutoscaler
+  #   options:
+  #     allowNameChange: true
   - path: service.patch.yaml
     target:
       kind: Service
diff --git a/serving-catalog/core/deployment/vllm/llama3-8b/base/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/llama3-8b/base/hpa.patch.yaml
new file mode 100644
index 0000000..dd11ffb
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/llama3-8b/base/hpa.patch.yaml
@@ -0,0 +1,6 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: llama3-8b-vllm-inference-server
+  name: llama3-8b-vllm-hpa
diff --git a/serving-catalog/core/deployment/vllm/llama3-8b/base/kustomization.yaml b/serving-catalog/core/deployment/vllm/llama3-8b/base/kustomization.yaml
index 2dc82cb..3145c75 100644
--- a/serving-catalog/core/deployment/vllm/llama3-8b/base/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/llama3-8b/base/kustomization.yaml
@@ -11,6 +11,12 @@ patches:
   path: deployment.patch.yaml
   target:
     kind: Deployment
+# HPA is still a work-in-progress
+# - options:
+#     allowNameChange: true
+#   path: hpa.patch.yaml
+#   target:
+#     kind: HorizontalPodAutoscaler
 - options:
     allowNameChange: true
   path: service.patch.yaml