From 2a492c3155aa72b0046338e92969cde426247323 Mon Sep 17 00:00:00 2001 From: "Richard Kuo (Danswer)" Date: Thu, 14 Nov 2024 19:42:49 -0800 Subject: [PATCH] better logging and rough cut at testing --- backend/danswer/indexing/indexing_pipeline.py | 20 ++++--- .../connector/test_connector_deletion.py | 53 +++++++++++++++++++ 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py index 688650b2936..f23bf91d2f1 100644 --- a/backend/danswer/indexing/indexing_pipeline.py +++ b/backend/danswer/indexing/indexing_pipeline.py @@ -238,15 +238,21 @@ def index_doc_batch_prepare( else documents ) - # Create a record in the DB for every updateable document. + # for all updatable docs, upsert into the DB # Does not include doc_updated_at which is also used to indicate a successful update - _upsert_documents_in_db( - documents=updatable_docs, - index_attempt_metadata=index_attempt_metadata, - db_session=db_session, + if updatable_docs: + _upsert_documents_in_db( + documents=updatable_docs, + index_attempt_metadata=index_attempt_metadata, + db_session=db_session, + ) + + logger.info( + f"Upserted {len(updatable_docs)} changed docs out of " + f"{len(documents)} total docs into the DB" ) - # Upsert the document to cc pair relationship for all documents + # for all docs, upsert the document to cc pair relationship upsert_document_by_connector_credential_pair( db_session, index_attempt_metadata.connector_id, @@ -254,8 +260,6 @@ def index_doc_batch_prepare( document_ids, ) - logger.info(f"Upserted {len(updatable_docs)} documents into the DB") - # No docs to process because the batch is empty or every doc was already indexed if not updatable_docs: return None diff --git a/backend/tests/integration/tests/connector/test_connector_deletion.py b/backend/tests/integration/tests/connector/test_connector_deletion.py index 663aedfc335..b14a75e0045 100644 --- a/backend/tests/integration/tests/connector/test_connector_deletion.py +++ b/backend/tests/integration/tests/connector/test_connector_deletion.py @@ -48,6 +48,59 @@ def test_connector_creation(reset: None) -> None: assert cc_pair_info.creator_email == admin_user.email +# TODO(rkuo): will enable this once i have credentials on github +# def test_overlapping_connector_creation(reset: None) -> None: +# # Creating an admin user (first user created is automatically an admin) +# admin_user: DATestUser = UserManager.create(name="admin_user") + +# config = { +# "wiki_base": os.environ["CONFLUENCE_TEST_SPACE_URL"], +# "space": os.environ["CONFLUENCE_TEST_SPACE"], +# "is_cloud": True, +# "page_id": "", +# } + +# credential = { +# "confluence_username": os.environ["CONFLUENCE_USER_NAME"], +# "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"], +# } + +# # store the time before we create the connector so that we know after +# # when the indexing should have started +# now = datetime.now(timezone.utc) + +# # create connector +# cc_pair_1 = CCPairManager.create_from_scratch( +# source=DocumentSource.CONFLUENCE, +# connector_specific_config=config, +# credential_json=credential, +# user_performing_action=admin_user, +# ) + +# CCPairManager.wait_for_indexing( +# cc_pair_1, now, timeout=60, user_performing_action=admin_user +# ) + +# cc_pair_2 = CCPairManager.create_from_scratch( +# source=DocumentSource.CONFLUENCE, +# connector_specific_config=config, +# credential_json=credential, +# user_performing_action=admin_user, +# ) + +# CCPairManager.wait_for_indexing( +# cc_pair_2, now, timeout=60, user_performing_action=admin_user +# ) + +# info_1 = CCPairManager.get_single(cc_pair_1.id) +# assert info_1 + +# info_2 = CCPairManager.get_single(cc_pair_2.id) +# assert info_2 + +# assert info_1.num_docs_indexed == info_2.num_docs_indexed + + def test_connector_deletion(reset: None, vespa_client: vespa_fixture) -> None: # Creating an admin user (first user created is automatically an admin) admin_user: DATestUser = UserManager.create(name="admin_user")