Merge pull request #3008 from danswer-ai/horizontal_slack

Add Functional Horizontal scaling for Slack
onyx-dot-app · Nov 6, 2024 · faeb9f0 · faeb9f0
2 parents da6ed5b + 25f5c12
commit faeb9f0
Show file tree

Hide file tree

Showing 8 changed files with 341 additions and 93 deletions.
diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
@@ -225,6 +225,9 @@ class DanswerRedisLocks:
     PRUNING_LOCK_PREFIX = "da_lock:pruning"
     INDEXING_METADATA_PREFIX = "da_metadata:indexing"
 
+    SLACK_BOT_LOCK = "da_lock:slack_bot"
+    SLACK_BOT_HEARTBEAT_PREFIX = "da_heartbeat:slack_bot"
+
 
 class DanswerCeleryPriority(int, Enum):
     HIGHEST = 0

diff --git a/backend/danswer/danswerbot/slack/config.py b/backend/danswer/danswerbot/slack/config.py
@@ -1,3 +1,5 @@
+import os
+
 from sqlalchemy.orm import Session
 
 from danswer.db.models import SlackBotConfig
@@ -48,3 +50,16 @@ def validate_channel_names(
                 )
 
     return cleaned_channel_names
+
+
+# Scaling configurations for multi-tenant Slack bot handling
+TENANT_LOCK_EXPIRATION = 1800  # How long a pod can hold exclusive access to a tenant before other pods can acquire it
+TENANT_HEARTBEAT_INTERVAL = (
+    60  # How often pods send heartbeats to indicate they are still processing a tenant
+)
+TENANT_HEARTBEAT_EXPIRATION = 180  # How long before a tenant's heartbeat expires, allowing other pods to take over
+TENANT_ACQUISITION_INTERVAL = (
+    60  # How often pods attempt to acquire unprocessed tenants
+)
+
+MAX_TENANTS_PER_POD = int(os.getenv("MAX_TENANTS_PER_POD", 50))
diff --git a/backend/danswer/danswerbot/slack/listener.py b/backend/danswer/danswerbot/slack/listener.py
diff --git a/backend/danswer/db/engine.py b/backend/danswer/db/engine.py
@@ -331,11 +331,13 @@ def get_session_with_tenant(
     Generate a database session bound to a connection with the appropriate tenant schema set.
     This preserves the tenant ID across the session and reverts to the previous tenant ID
     after the session is closed.
+    If tenant ID is set, we save the previous tenant ID from the context var to set
+    after the session is closed. The value `None` evaluates to the default schema.
     """
     engine = get_sqlalchemy_engine()
 
     # Store the previous tenant ID
-    previous_tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get()
+    previous_tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() or POSTGRES_DEFAULT_SCHEMA
 
     if tenant_id is None:
         tenant_id = previous_tenant_id

diff --git a/backend/ee/danswer/db/usage_export.py b/backend/ee/danswer/db/usage_export.py
@@ -66,7 +66,7 @@ def get_all_empty_chat_message_entries(
             return
 
         yield message_skeletons
-        initial_id = message_skeletons[-1].message_id
+        initial_id = message_skeletons[-1].chat_session_id
 
 
 def get_all_usage_reports(db_session: Session) -> list[UsageReportMetadata]:

diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
@@ -81,3 +81,4 @@ stripe==10.12.0
 urllib3==2.2.3
 mistune==0.8.4
 sentry-sdk==2.14.0
+prometheus_client==0.21.0
diff --git a/deployment/kubernetes/slackbot-service-deployment.yaml b/deployment/kubernetes/slackbot-service-deployment.yaml
@@ -0,0 +1,80 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: slack-bot-deployment
+  labels:
+    app: slack-bot
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: slack-bot
+  template:
+    metadata:
+      labels:
+        app: slack-bot
+    spec:
+      containers:
+        - name: slack-bot
+          image: danswer/danswer-backend:latest
+          imagePullPolicy: IfNotPresent
+          command: ["python", "danswer/danswerbot/slack/listener.py"]
+          ports:
+            - containerPort: 8000
+          resources:
+            requests:
+              cpu: "100m"
+              memory: "200Mi"
+            limits:
+              cpu: "500m"
+              memory: "500Mi"
+          readinessProbe:
+            httpGet:
+              path: /metrics
+              port: 8000
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /metrics
+              port: 8000
+            initialDelaySeconds: 15
+            periodSeconds: 20
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: slack-bot-service
+  labels:
+    app: slack-bot
+spec:
+  selector:
+    app: slack-bot
+  ports:
+    # Port exposed for Prometheus metrics
+    - protocol: TCP
+      port: 8000
+      targetPort: 8000
+  type: ClusterIP
+
+---
+apiVersion: autoscaling/v2beta2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: slack-bot-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: slack-bot-deployment
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+    - type: Pods
+      pods:
+        metric:
+          name: active_tenants
+        target:
+          type: AverageValue
+          averageValue: "40"
diff --git a/web/src/app/admin/assistants/AssistantEditor.tsx b/web/src/app/admin/assistants/AssistantEditor.tsx
@@ -5,6 +5,7 @@ import { generateRandomIconShape, createSVG } from "@/lib/assistantIconUtils";
 import { CCPairBasicInfo, DocumentSet, User } from "@/lib/types";
 import { Separator } from "@/components/ui/separator";
 import { Button } from "@/components/ui/button";
+import { Textarea } from "@/components/ui/textarea";
 import { IsPublicGroupSelector } from "@/components/IsPublicGroupSelector";
 import {
   ArrayHelpers,
@@ -1102,7 +1103,9 @@ export function AssistantEditor({
                                               w-full 
                                               py-2 
                                               px-3 
+                                              min-h-12
                                               mr-4
+                                              line-clamp-
                                           `}
                                             as="textarea"
                                             autoComplete="off"