diff --git a/backend/danswer/background/celery/apps/beat.py b/backend/danswer/background/celery/apps/beat.py index 8ddc17efc52..f88295fa139 100644 --- a/backend/danswer/background/celery/apps/beat.py +++ b/backend/danswer/background/celery/apps/beat.py @@ -78,6 +78,7 @@ def on_setup_logging( }, ] + # Build the celery beat schedule dynamically beat_schedule = {} diff --git a/backend/danswer/document_index/vespa/indexing_utils.py b/backend/danswer/document_index/vespa/indexing_utils.py index 8ecdc22672b..aafc6bf4efe 100644 --- a/backend/danswer/document_index/vespa/indexing_utils.py +++ b/backend/danswer/document_index/vespa/indexing_utils.py @@ -57,7 +57,6 @@ def _does_document_exist( chunk. This checks for whether the chunk exists already in the index""" doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}" doc_fetch_response = http_client.get(doc_url) - if doc_fetch_response.status_code == 404: return False diff --git a/backend/danswer/document_index/vespa_constants.py b/backend/danswer/document_index/vespa_constants.py index d4a36ef9725..30039922f1a 100644 --- a/backend/danswer/document_index/vespa_constants.py +++ b/backend/danswer/document_index/vespa_constants.py @@ -29,6 +29,7 @@ # main search application VESPA_APP_CONTAINER_URL = VESPA_CLOUD_URL or f"http://{VESPA_HOST}:{VESPA_PORT}" + # danswer_chunk below is defined in vespa/app_configs/schemas/danswer_chunk.sd DOCUMENT_ID_ENDPOINT = ( f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid" diff --git a/backend/danswer/main.py b/backend/danswer/main.py index 65036c43ea4..ae18ab3ccf2 100644 --- a/backend/danswer/main.py +++ b/backend/danswer/main.py @@ -184,7 +184,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator: # If we are multi-tenant, we need to only set up initial public tables with Session(engine) as db_session: - setup_danswer(db_session) + setup_danswer(db_session, None) else: setup_multitenant_danswer() diff --git a/backend/danswer/seeding/load_docs.py b/backend/danswer/seeding/load_docs.py index 2756e7ddf61..2e9c13b10ba 100644 --- a/backend/danswer/seeding/load_docs.py +++ b/backend/danswer/seeding/load_docs.py @@ -39,6 +39,7 @@ def _create_indexable_chunks( preprocessed_docs: list[dict], + tenant_id: str | None, ) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]: ids_to_documents = {} chunks = [] @@ -80,7 +81,7 @@ def _create_indexable_chunks( mini_chunk_embeddings=[], ), title_embedding=preprocessed_doc["title_embedding"], - tenant_id=None, + tenant_id=tenant_id, access=default_public_access, document_sets=set(), boost=DEFAULT_BOOST, @@ -90,7 +91,7 @@ def _create_indexable_chunks( return list(ids_to_documents.values()), chunks -def seed_initial_documents(db_session: Session) -> None: +def seed_initial_documents(db_session: Session, tenant_id: str | None) -> None: """ Seed initial documents so users don't have an empty index to start @@ -177,7 +178,7 @@ def seed_initial_documents(db_session: Session) -> None: ) processed_docs = json.load(open(initial_docs_path)) - docs, chunks = _create_indexable_chunks(processed_docs) + docs, chunks = _create_indexable_chunks(processed_docs, tenant_id) index_doc_batch_prepare( document_batch=docs, @@ -198,6 +199,7 @@ def seed_initial_documents(db_session: Session) -> None: # Retries here because the index may take a few seconds to become ready # as we just sent over the Vespa schema and there is a slight delay + index_with_retries = retry_builder()(document_index.index) index_with_retries(chunks=chunks) diff --git a/backend/danswer/setup.py b/backend/danswer/setup.py index 8b6bede7563..7abd7482a78 100644 --- a/backend/danswer/setup.py +++ b/backend/danswer/setup.py @@ -59,7 +59,7 @@ logger = setup_logger() -def setup_danswer(db_session: Session) -> None: +def setup_danswer(db_session: Session, tenant_id: str | None) -> None: """ Setup Danswer for a particular tenant. In the Single Tenant case, it will set it up for the default schema on server startup. In the MT case, it will be called when the tenant is created. @@ -148,7 +148,7 @@ def setup_danswer(db_session: Session) -> None: # update multipass indexing setting based on GPU availability update_default_multipass_indexing(db_session) - seed_initial_documents(db_session) + seed_initial_documents(db_session, tenant_id) def translate_saved_search_settings(db_session: Session) -> None: diff --git a/backend/ee/danswer/server/tenants/api.py b/backend/ee/danswer/server/tenants/api.py index 792d486947d..66485975f31 100644 --- a/backend/ee/danswer/server/tenants/api.py +++ b/backend/ee/danswer/server/tenants/api.py @@ -59,7 +59,7 @@ def create_tenant( run_alembic_migrations(tenant_id) with get_session_with_tenant(tenant_id) as db_session: - setup_danswer(db_session) + setup_danswer(db_session, tenant_id) add_users_to_tenant([email], tenant_id)