From 3fdd233e84521270865f3a34bc579452c87c7ba6 Mon Sep 17 00:00:00 2001 From: rkuo-danswer Date: Wed, 2 Oct 2024 18:57:25 -0700 Subject: [PATCH] delete directly via selection instead of making multiple calls to get chunk ids and delete each one (#2666) --- .../danswer/background/connector_deletion.py | 2 +- backend/danswer/document_index/interfaces.py | 10 +++ backend/danswer/document_index/vespa/index.py | 61 +++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/backend/danswer/background/connector_deletion.py b/backend/danswer/background/connector_deletion.py index 983a3c129ba..84b696dd8e4 100644 --- a/backend/danswer/background/connector_deletion.py +++ b/backend/danswer/background/connector_deletion.py @@ -148,7 +148,7 @@ def document_by_cc_pair_cleanup_task( if count == 1: # count == 1 means this is the only remaining cc_pair reference to the doc # delete it from vespa and the db - document_index.delete(doc_ids=[document_id]) + document_index.delete_single(doc_id=document_id) delete_documents_complete__no_commit( db_session=db_session, document_ids=[document_id], diff --git a/backend/danswer/document_index/interfaces.py b/backend/danswer/document_index/interfaces.py index eaa34b37752..b499d696743 100644 --- a/backend/danswer/document_index/interfaces.py +++ b/backend/danswer/document_index/interfaces.py @@ -156,6 +156,16 @@ class Deletable(abc.ABC): Class must implement the ability to delete document by their unique document ids. """ + @abc.abstractmethod + def delete_single(self, doc_id: str) -> None: + """ + Given a single document id, hard delete it from the document index + + Parameters: + - doc_id: document id as specified by the connector + """ + raise NotImplementedError + @abc.abstractmethod def delete(self, doc_ids: list[str]) -> None: """ diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py index 700f8860fb5..467260ed619 100644 --- a/backend/danswer/document_index/vespa/index.py +++ b/backend/danswer/document_index/vespa/index.py @@ -13,6 +13,7 @@ import httpx import requests +from danswer.configs.app_configs import DOCUMENT_INDEX_NAME from danswer.configs.chat_configs import DOC_TIME_DECAY from danswer.configs.chat_configs import NUM_RETURNED_HITS from danswer.configs.chat_configs import TITLE_CONTENT_RATIO @@ -479,6 +480,66 @@ def delete(self, doc_ids: list[str]) -> None: document_ids=doc_ids, index_name=index_name, http_client=http_client ) + def delete_single(self, doc_id: str) -> None: + """Possibly faster overall than the delete method due to using a single + delete call with a selection query.""" + + # Vespa deletion is poorly documented ... luckily we found this + # https://docs.vespa.ai/en/operations/batch-delete.html#example + + doc_id = replace_invalid_doc_id_characters(doc_id) + + # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for + # indexing / updates / deletes since we have to make a large volume of requests. + index_names = [self.index_name] + if self.secondary_index_name: + index_names.append(self.secondary_index_name) + + with httpx.Client(http2=True) as http_client: + for index_name in index_names: + params = httpx.QueryParams( + { + "selection": f"{index_name}.document_id=='{doc_id}'", + "cluster": DOCUMENT_INDEX_NAME, + } + ) + + total_chunks_deleted = 0 + while True: + try: + resp = http_client.delete( + f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}", + params=params, + ) + resp.raise_for_status() + except httpx.HTTPStatusError as e: + logger.error( + f"Failed to delete chunk, details: {e.response.text}" + ) + raise + + resp_data = resp.json() + + if "documentCount" in resp_data: + chunks_deleted = resp_data["documentCount"] + total_chunks_deleted += chunks_deleted + + # Check for continuation token to handle pagination + if "continuation" not in resp_data: + break # Exit loop if no continuation token + + if not resp_data["continuation"]: + break # Exit loop if continuation token is empty + + params = params.set("continuation", resp_data["continuation"]) + + logger.debug( + f"VespaIndex.delete_single: " + f"index={index_name} " + f"doc={doc_id} " + f"chunks_deleted={total_chunks_deleted}" + ) + def id_based_retrieval( self, chunk_requests: list[VespaChunkRequest],