Skip to content

Commit

Permalink
tenant seeding docs (#2925)
Browse files Browse the repository at this point in the history
* tenant seeding docs

* k
  • Loading branch information
pablonyx authored Oct 27, 2024
1 parent 179dc41 commit 53e9165
Show file tree
Hide file tree
Showing 7 changed files with 11 additions and 8 deletions.
1 change: 1 addition & 0 deletions backend/danswer/background/celery/apps/beat.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def on_setup_logging(
},
]


# Build the celery beat schedule dynamically
beat_schedule = {}

Expand Down
1 change: 0 additions & 1 deletion backend/danswer/document_index/vespa/indexing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ def _does_document_exist(
chunk. This checks whether the chunk already exists in the index"""
doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
doc_fetch_response = http_client.get(doc_url)

if doc_fetch_response.status_code == 404:
return False

Expand Down
1 change: 1 addition & 0 deletions backend/danswer/document_index/vespa_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# main search application
VESPA_APP_CONTAINER_URL = VESPA_CLOUD_URL or f"http://{VESPA_HOST}:{VESPA_PORT}"


# danswer_chunk below is defined in vespa/app_configs/schemas/danswer_chunk.sd
DOCUMENT_ID_ENDPOINT = (
f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid"
Expand Down
2 changes: 1 addition & 1 deletion backend/danswer/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:

# If we are multi-tenant, we need to only set up initial public tables
with Session(engine) as db_session:
setup_danswer(db_session)
setup_danswer(db_session, None)
else:
setup_multitenant_danswer()

Expand Down
8 changes: 5 additions & 3 deletions backend/danswer/seeding/load_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

def _create_indexable_chunks(
preprocessed_docs: list[dict],
tenant_id: str | None,
) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]:
ids_to_documents = {}
chunks = []
Expand Down Expand Up @@ -80,7 +81,7 @@ def _create_indexable_chunks(
mini_chunk_embeddings=[],
),
title_embedding=preprocessed_doc["title_embedding"],
tenant_id=None,
tenant_id=tenant_id,
access=default_public_access,
document_sets=set(),
boost=DEFAULT_BOOST,
Expand All @@ -90,7 +91,7 @@ def _create_indexable_chunks(
return list(ids_to_documents.values()), chunks


def seed_initial_documents(db_session: Session) -> None:
def seed_initial_documents(db_session: Session, tenant_id: str | None) -> None:
"""
Seed initial documents so users don't have an empty index to start
Expand Down Expand Up @@ -177,7 +178,7 @@ def seed_initial_documents(db_session: Session) -> None:
)
processed_docs = json.load(open(initial_docs_path))

docs, chunks = _create_indexable_chunks(processed_docs)
docs, chunks = _create_indexable_chunks(processed_docs, tenant_id)

index_doc_batch_prepare(
document_batch=docs,
Expand All @@ -198,6 +199,7 @@ def seed_initial_documents(db_session: Session) -> None:

# Retry here because the index may take a few seconds to become ready:
# we just sent over the Vespa schema, and there is a slight delay

index_with_retries = retry_builder()(document_index.index)
index_with_retries(chunks=chunks)

Expand Down
4 changes: 2 additions & 2 deletions backend/danswer/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
logger = setup_logger()


def setup_danswer(db_session: Session) -> None:
def setup_danswer(db_session: Session, tenant_id: str | None) -> None:
"""
Set up Danswer for a particular tenant. In the single-tenant case, it is set up for the default schema
on server startup. In the multi-tenant (MT) case, it is called when the tenant is created.
Expand Down Expand Up @@ -148,7 +148,7 @@ def setup_danswer(db_session: Session) -> None:
# update multipass indexing setting based on GPU availability
update_default_multipass_indexing(db_session)

seed_initial_documents(db_session)
seed_initial_documents(db_session, tenant_id)


def translate_saved_search_settings(db_session: Session) -> None:
Expand Down
2 changes: 1 addition & 1 deletion backend/ee/danswer/server/tenants/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def create_tenant(
run_alembic_migrations(tenant_id)

with get_session_with_tenant(tenant_id) as db_session:
setup_danswer(db_session)
setup_danswer(db_session, tenant_id)

add_users_to_tenant([email], tenant_id)

Expand Down

0 comments on commit 53e9165

Please sign in to comment.