Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tenant seeding docs #2925

Merged
merged 2 commits into from
Oct 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/danswer/background/celery/apps/beat.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def on_setup_logging(
},
]


# Build the celery beat schedule dynamically
beat_schedule = {}

Expand Down
1 change: 0 additions & 1 deletion backend/danswer/document_index/vespa/indexing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ def _does_document_exist(
chunk. This checks whether the chunk already exists in the index"""
doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
doc_fetch_response = http_client.get(doc_url)

if doc_fetch_response.status_code == 404:
return False

Expand Down
1 change: 1 addition & 0 deletions backend/danswer/document_index/vespa_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# main search application
VESPA_APP_CONTAINER_URL = VESPA_CLOUD_URL or f"http://{VESPA_HOST}:{VESPA_PORT}"


# danswer_chunk below is defined in vespa/app_configs/schemas/danswer_chunk.sd
DOCUMENT_ID_ENDPOINT = (
f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid"
Expand Down
2 changes: 1 addition & 1 deletion backend/danswer/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:

# Not multi-tenant: set up Danswer for the default schema here.
# (If we are multi-tenant, we only need to set up the initial public tables; see the else branch.)
with Session(engine) as db_session:
setup_danswer(db_session)
setup_danswer(db_session, None)
else:
setup_multitenant_danswer()

Expand Down
8 changes: 5 additions & 3 deletions backend/danswer/seeding/load_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

def _create_indexable_chunks(
preprocessed_docs: list[dict],
tenant_id: str | None,
) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]:
ids_to_documents = {}
chunks = []
Expand Down Expand Up @@ -80,7 +81,7 @@ def _create_indexable_chunks(
mini_chunk_embeddings=[],
),
title_embedding=preprocessed_doc["title_embedding"],
tenant_id=None,
tenant_id=tenant_id,
access=default_public_access,
document_sets=set(),
boost=DEFAULT_BOOST,
Expand All @@ -90,7 +91,7 @@ def _create_indexable_chunks(
return list(ids_to_documents.values()), chunks


def seed_initial_documents(db_session: Session) -> None:
def seed_initial_documents(db_session: Session, tenant_id: str | None) -> None:
"""
Seed initial documents so users don't have an empty index to start

Expand Down Expand Up @@ -177,7 +178,7 @@ def seed_initial_documents(db_session: Session) -> None:
)
processed_docs = json.load(open(initial_docs_path))

docs, chunks = _create_indexable_chunks(processed_docs)
docs, chunks = _create_indexable_chunks(processed_docs, tenant_id)

index_doc_batch_prepare(
document_batch=docs,
Expand All @@ -198,6 +199,7 @@ def seed_initial_documents(db_session: Session) -> None:

# Retry here because the index may take a few seconds to become ready:
# we have just sent over the Vespa schema, and there is a slight delay before it is usable

index_with_retries = retry_builder()(document_index.index)
index_with_retries(chunks=chunks)

Expand Down
4 changes: 2 additions & 2 deletions backend/danswer/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
logger = setup_logger()


def setup_danswer(db_session: Session) -> None:
def setup_danswer(db_session: Session, tenant_id: str | None) -> None:
"""
Set up Danswer for a particular tenant. In the single-tenant case, it sets up the default schema
on server startup. In the multi-tenant (MT) case, it is called when the tenant is created.
Expand Down Expand Up @@ -147,7 +147,7 @@ def setup_danswer(db_session: Session) -> None:
# update multipass indexing setting based on GPU availability
update_default_multipass_indexing(db_session)

seed_initial_documents(db_session)
seed_initial_documents(db_session, tenant_id)


def translate_saved_search_settings(db_session: Session) -> None:
Expand Down
2 changes: 1 addition & 1 deletion backend/ee/danswer/server/tenants/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def create_tenant(
run_alembic_migrations(tenant_id)

with get_session_with_tenant(tenant_id) as db_session:
setup_danswer(db_session)
setup_danswer(db_session, tenant_id)

add_users_to_tenant([email], tenant_id)

Expand Down
Loading