From 32ff651206f68930fca97a49ebf65f68ebd33cb8 Mon Sep 17 00:00:00 2001
From: Jon Huhn
Date: Thu, 3 Oct 2024 13:32:40 -0500
Subject: [PATCH] add HorizontalPodAutoscaler stub to catalog

Add a reusable hpa kustomize component plus per-blueprint
HorizontalPodAutoscaler patches for the jetstream and vllm GKE
blueprints. The component stays commented out in each kustomization
until the metric wiring and scaling best practices are finalized.
---
 .../core/deployment/components/hpa/hpa.yaml   | 10 ++++++++++
 .../components/hpa/kustomization.yaml         |  6 ++++++
 .../jetstream/gemma-7b-it/gke/hpa.patch.yaml  | 20 +++++++++++++++++++
 .../gemma-7b-it/gke/kustomization.yaml        |  6 ++++++
 .../jetstream/llama3-8b/gke/hpa.patch.yaml    | 20 +++++++++++++++++++
 .../llama3-8b/gke/kustomization.yaml          |  6 ++++++
 .../vllm/gemma-2b/gke/hpa.patch.yaml          | 17 ++++++++++++++++
 .../vllm/gemma-2b/gke/kustomization.yaml      |  6 ++++++
 .../vllm/llama3-8b/gke/hpa.patch.yaml         | 17 ++++++++++++++++
 .../vllm/llama3-8b/gke/kustomization.yaml     |  6 ++++++
 10 files changed, 114 insertions(+)
 create mode 100644 serving-catalog/core/deployment/components/hpa/hpa.yaml
 create mode 100644 serving-catalog/core/deployment/components/hpa/kustomization.yaml
 create mode 100644 serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml
 create mode 100644 serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml

diff --git a/serving-catalog/core/deployment/components/hpa/hpa.yaml b/serving-catalog/core/deployment/components/hpa/hpa.yaml
new file mode 100644
index 0000000..74815e7
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/hpa.yaml
@@ -0,0 +1,10 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: singlehost-inference-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: singlehost-inference-deployment-blueprint
+  # TODO: also include stabilization windows, tolerance thresholds, etc.
diff --git a/serving-catalog/core/deployment/components/hpa/kustomization.yaml b/serving-catalog/core/deployment/components/hpa/kustomization.yaml
new file mode 100644
index 0000000..ed69ab0
--- /dev/null
+++ b/serving-catalog/core/deployment/components/hpa/kustomization.yaml
@@ -0,0 +1,6 @@
+# kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1alpha1
+kind: Component
+
+resources:
+- hpa.yaml
diff --git a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml
new file mode 100644
index 0000000..7291ce3
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/hpa.patch.yaml
@@ -0,0 +1,20 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: gemma-7b-it-jetstream-inference-server
+  name: gemma-7b-it-jetstream-hpa
+spec:
+  # TODO: add best practices as defined at:
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling-tpu
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: jetstream-token-latency-ms
+      target:
+        type: AverageValue
+        averageValue: 50
diff --git a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
index 57be8f8..47cce52 100644
--- a/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/gemma-7b-it/gke/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 
 components:
 - ../../../components/gke/resources/tpu/v5e-2x4
+# - ../../../components/hpa # HPA is a work-in-progress
 
 patches:
 - path: deployment.patch.yaml
@@ -15,3 +16,8 @@ patches:
 - path: job.patch.yaml
   target:
     kind: Job
+- options:
+    allowNameChange: true
+  path: hpa.patch.yaml
+  target:
+    kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..4d83636
--- /dev/null
+++ b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/hpa.patch.yaml
@@ -0,0 +1,20 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: llama3-8b-jetstream-inference-server
+  name: llama3-8b-jetstream-hpa
+spec:
+  # TODO: add best practices as defined at:
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling-tpu
+  # - https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/autoscaling
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: jetstream-token-latency-ms
+      target:
+        type: AverageValue
+        averageValue: 50
diff --git a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
index 57be8f8..47cce52 100644
--- a/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/jetstream/llama3-8b/gke/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 
 components:
 - ../../../components/gke/resources/tpu/v5e-2x4
+# - ../../../components/hpa # HPA is a work-in-progress
 
 patches:
 - path: deployment.patch.yaml
@@ -15,3 +16,8 @@ patches:
 - path: job.patch.yaml
   target:
     kind: Job
+- options:
+    allowNameChange: true
+  path: hpa.patch.yaml
+  target:
+    kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..8542ad6
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/gemma-2b/gke/hpa.patch.yaml
@@ -0,0 +1,17 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: gemma-2b-vllm-inference-server
+  name: gemma-2b-vllm-hpa
+spec:
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: vllm-token-latency-ms
+      target:
+        type: AverageValue
+        averageValue: 50
diff --git a/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml b/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
index 56515de..a47059c 100644
--- a/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/gemma-2b/gke/kustomization.yaml
@@ -7,8 +7,14 @@ resources:
 
 components:
 - ../../../components/gke/resources/gpu/1-L4
+# - ../../../components/hpa # HPA is a work-in-progress
 
 patches:
 - path: deployment.patch.yaml
   target:
     kind: Deployment
+- options:
+    allowNameChange: true
+  path: hpa.patch.yaml
+  target:
+    kind: HorizontalPodAutoscaler
diff --git a/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml b/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml
new file mode 100644
index 0000000..66044ac
--- /dev/null
+++ b/serving-catalog/core/deployment/vllm/llama3-8b/gke/hpa.patch.yaml
@@ -0,0 +1,17 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  labels:
+    app: llama3-8b-vllm-inference-server
+  name: llama3-8b-vllm-hpa
+spec:
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: vllm-token-latency-ms
+      target:
+        type: AverageValue
+        averageValue: 50
diff --git a/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml b/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
index 56515de..a47059c 100644
--- a/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
+++ b/serving-catalog/core/deployment/vllm/llama3-8b/gke/kustomization.yaml
@@ -7,8 +7,14 @@ resources:
 
 components:
 - ../../../components/gke/resources/gpu/1-L4
+# - ../../../components/hpa # HPA is a work-in-progress
 
 patches:
 - path: deployment.patch.yaml
   target:
     kind: Deployment
+- options:
+    allowNameChange: true
+  path: hpa.patch.yaml
+  target:
+    kind: HorizontalPodAutoscaler
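
Reviewer note (not part of the commit): the TODO in
serving-catalog/core/deployment/components/hpa/hpa.yaml refers to the
autoscaling/v2 behavior field. A minimal sketch of what the stub might grow
into, with illustrative values rather than tuned recommendations:

    spec:
      behavior:
        scaleDown:
          stabilizationWindowSeconds: 300  # act on the lowest recommendation seen over the last 5 min
          policies:
          - type: Pods
            value: 1
            periodSeconds: 60              # remove at most one replica per minute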
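To try the stub locally, uncomment the hpa component line in a blueprint's
kustomization.yaml and render the blueprint offline (this assumes a checkout
of this repository and a kubectl whose bundled kustomize supports components):

    kubectl kustomize serving-catalog/core/deployment/vllm/gemma-2b/gke

The Pods metrics (jetstream-token-latency-ms, vllm-token-latency-ms) also
assume a custom-metrics adapter exports them; without one the rendered HPA
will record FailedGetPodsMetric events and hold the current replica count.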