tenant seeding docs (#2925)

* tenant seeding docs
* k
onyx-dot-app · Oct 27, 2024 · 53e9165 · 53e9165
1 parent 179dc41
commit 53e9165
Show file tree

Hide file tree

Showing 7 changed files with 11 additions and 8 deletions.
diff --git a/backend/danswer/background/celery/apps/beat.py b/backend/danswer/background/celery/apps/beat.py
@@ -78,6 +78,7 @@ def on_setup_logging(
     },
 ]
 
+
 # Build the celery beat schedule dynamically
 beat_schedule = {}
 

diff --git a/backend/danswer/document_index/vespa/indexing_utils.py b/backend/danswer/document_index/vespa/indexing_utils.py
@@ -57,7 +57,6 @@ def _does_document_exist(
     chunk. This checks for whether the chunk exists already in the index"""
     doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
     doc_fetch_response = http_client.get(doc_url)
-
     if doc_fetch_response.status_code == 404:
         return False
 

diff --git a/backend/danswer/document_index/vespa_constants.py b/backend/danswer/document_index/vespa_constants.py
@@ -29,6 +29,7 @@
 # main search application
 VESPA_APP_CONTAINER_URL = VESPA_CLOUD_URL or f"http://{VESPA_HOST}:{VESPA_PORT}"
 
+
 # danswer_chunk below is defined in vespa/app_configs/schemas/danswer_chunk.sd
 DOCUMENT_ID_ENDPOINT = (
     f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid"

diff --git a/backend/danswer/main.py b/backend/danswer/main.py
@@ -184,7 +184,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
 
         # If we are multi-tenant, we need to only set up initial public tables
         with Session(engine) as db_session:
-            setup_danswer(db_session)
+            setup_danswer(db_session, None)
     else:
         setup_multitenant_danswer()
 

diff --git a/backend/danswer/seeding/load_docs.py b/backend/danswer/seeding/load_docs.py
@@ -39,6 +39,7 @@
 
 def _create_indexable_chunks(
     preprocessed_docs: list[dict],
+    tenant_id: str | None,
 ) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]:
     ids_to_documents = {}
     chunks = []
@@ -80,7 +81,7 @@ def _create_indexable_chunks(
                 mini_chunk_embeddings=[],
             ),
             title_embedding=preprocessed_doc["title_embedding"],
-            tenant_id=None,
+            tenant_id=tenant_id,
             access=default_public_access,
             document_sets=set(),
             boost=DEFAULT_BOOST,
@@ -90,7 +91,7 @@ def _create_indexable_chunks(
     return list(ids_to_documents.values()), chunks
 
 
-def seed_initial_documents(db_session: Session) -> None:
+def seed_initial_documents(db_session: Session, tenant_id: str | None) -> None:
     """
     Seed initial documents so users don't have an empty index to start
 
@@ -177,7 +178,7 @@ def seed_initial_documents(db_session: Session) -> None:
     )
     processed_docs = json.load(open(initial_docs_path))
 
-    docs, chunks = _create_indexable_chunks(processed_docs)
+    docs, chunks = _create_indexable_chunks(processed_docs, tenant_id)
 
     index_doc_batch_prepare(
         document_batch=docs,
@@ -198,6 +199,7 @@ def seed_initial_documents(db_session: Session) -> None:
 
     # Retries here because the index may take a few seconds to become ready
     # as we just sent over the Vespa schema and there is a slight delay
+
     index_with_retries = retry_builder()(document_index.index)
     index_with_retries(chunks=chunks)
 

diff --git a/backend/danswer/setup.py b/backend/danswer/setup.py
@@ -59,7 +59,7 @@
 logger = setup_logger()
 
 
-def setup_danswer(db_session: Session) -> None:
+def setup_danswer(db_session: Session, tenant_id: str | None) -> None:
     """
     Setup Danswer for a particular tenant. In the Single Tenant case, it will set it up for the default schema
     on server startup. In the MT case, it will be called when the tenant is created.
@@ -148,7 +148,7 @@ def setup_danswer(db_session: Session) -> None:
     # update multipass indexing setting based on GPU availability
     update_default_multipass_indexing(db_session)
 
-    seed_initial_documents(db_session)
+    seed_initial_documents(db_session, tenant_id)
 
 
 def translate_saved_search_settings(db_session: Session) -> None:

diff --git a/backend/ee/danswer/server/tenants/api.py b/backend/ee/danswer/server/tenants/api.py
@@ -59,7 +59,7 @@ def create_tenant(
         run_alembic_migrations(tenant_id)
 
         with get_session_with_tenant(tenant_id) as db_session:
-            setup_danswer(db_session)
+            setup_danswer(db_session, tenant_id)
 
         add_users_to_tenant([email], tenant_id)
-Original file line number
+Diff line change
@@ Expand Up / @@ -78,6 +78,7 @@ def on_setup_logging( @@
         },
     ]
     # Build the celery beat schedule dynamically
     beat_schedule = {}
@@ Expand Down @@