From 40c98859afc24ee558cefa0e0dc7b603d9d36507 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 3 May 2024 17:30:19 +0100 Subject: [PATCH 01/10] Remove langchain from index operations - This is preparation for adding an additional image vector field for advanced image processing - I've tried to keep changes to a minimum - Still using langchain in the question and answer tool for now - Increased unit test coverage Required by https://github.com/Azure-Samples/chat-with-your-data-solution-accelerator/issues/748 --- .../batch/utilities/common/SourceDocument.py | 29 +- .../utilities/helpers/AzureSearchHelper.py | 128 +++++++-- .../batch/utilities/helpers/LLMHelper.py | 11 +- .../helpers/embedders/PushEmbedder.py | 87 ++++-- .../utilities/search/AzureSearchHandler.py | 78 +++-- .../IntegratedVectorizationSearchHandler.py | 26 +- code/backend/batch/utilities/search/Search.py | 41 +-- .../utilities/search/SearchHandlerBase.py | 8 +- .../utilities/tools/QuestionAnswerTool.py | 29 +- code/tests/common/test_SourceDocument.py | 29 -- code/tests/functional/backend_api/conftest.py | 9 +- .../tests/default/test_conversation_custom.py | 53 +++- ...est_response_with_search_documents_tool.py | 8 +- .../test_response_without_tool_call.py | 4 +- .../test_AzureSearchHandler.py | 182 +++++++++--- ...st_IntegratedVectorizationSearchHandler.py | 63 +++-- code/tests/search_utilities/test_Search.py | 87 ++---- .../test_IntegratedVectorizationEmbedder.py | 17 +- .../helpers/test_AzureSearchHelper.py | 267 +++++++++++++++--- .../tests/utilities/helpers/test_LLMHelper.py | 44 +++ .../utilities/helpers/test_PushEmbedder.py | 257 +++++++++++++---- .../utilities/test_QuestionAnswerTool.py | 107 +++---- 22 files changed, 1093 insertions(+), 471 deletions(-) diff --git a/code/backend/batch/utilities/common/SourceDocument.py b/code/backend/batch/utilities/common/SourceDocument.py index 296dc90eb..94a31319a 100644 --- a/code/backend/batch/utilities/common/SourceDocument.py +++ b/code/backend/batch/utilities/common/SourceDocument.py @@ -3,7 +3,6 @@ import json from urllib.parse import urlparse, quote from ..helpers.AzureBlobStorageClient import AzureBlobStorageClient -from langchain.docstore.document import Document class SourceDocument: @@ -30,6 +29,20 @@ def __init__( def __str__(self): return f"SourceDocument(id={self.id}, title={self.title}, source={self.source}, chunk={self.chunk}, offset={self.offset}, page_number={self.page_number}, chunk_id={self.chunk_id})" + def __eq__(self, other): + if isinstance(self, other.__class__): + return ( + self.id == other.id + and self.content == other.content + and self.source == other.source + and self.title == other.title + and self.chunk == other.chunk + and self.offset == other.offset + and self.page_number == other.page_number + and self.chunk_id == other.chunk_id + ) + return False + def to_json(self): return json.dumps(self, cls=SourceDocumentEncoder) @@ -80,20 +93,6 @@ def from_metadata( chunk_id=metadata.get("chunk_id"), ) - def convert_to_langchain_document(self): - return Document( - page_content=self.content, - metadata={ - "id": self.id, - "source": self.source, - "title": self.title, - "chunk": self.chunk, - "offset": self.offset, - "page_number": self.page_number, - "chunk_id": self.chunk_id, - }, - ) - def get_filename(self, include_path=False): filename = self.source.replace("_SAS_TOKEN_PLACEHOLDER_", "").replace( "http://", "" diff --git a/code/backend/batch/utilities/helpers/AzureSearchHelper.py b/code/backend/batch/utilities/helpers/AzureSearchHelper.py index 4d14d2ad6..c07f32ecf 100644 --- a/code/backend/batch/utilities/helpers/AzureSearchHelper.py +++ b/code/backend/batch/utilities/helpers/AzureSearchHelper.py @@ -1,13 +1,34 @@ +import logging +from typing import Union from langchain.vectorstores.azuresearch import AzureSearch +from azure.core.credentials import AzureKeyCredential +from azure.identity import DefaultAzureCredential +from azure.search.documents import SearchClient +from azure.search.documents.indexes import SearchIndexClient from azure.search.documents.indexes.models import ( + ExhaustiveKnnAlgorithmConfiguration, + ExhaustiveKnnParameters, + HnswAlgorithmConfiguration, + HnswParameters, SearchableField, SearchField, SearchFieldDataType, + SearchIndex, + SemanticConfiguration, + SemanticField, + SemanticPrioritizedFields, + SemanticSearch, SimpleField, + VectorSearch, + VectorSearchAlgorithmKind, + VectorSearchAlgorithmMetric, + VectorSearchProfile, ) from .LLMHelper import LLMHelper from .EnvHelper import EnvHelper +logger = logging.getLogger(__name__) + class AzureSearchHelper: _search_dimension: int | None = None @@ -16,6 +37,37 @@ def __init__(self): self.llm_helper = LLMHelper() self.env_helper = EnvHelper() + search_credential = self._search_credential() + self.search_client = self._create_search_client(search_credential) + self.search_index_client = self._create_search_index_client(search_credential) + self.create_index() + + def _search_credential(self): + if self.env_helper.is_auth_type_keys(): + return AzureKeyCredential(self.env_helper.AZURE_SEARCH_KEY) + else: + return DefaultAzureCredential() + + def _create_search_client( + self, search_credential: Union[AzureKeyCredential, DefaultAzureCredential] + ) -> SearchClient: + return SearchClient( + endpoint=self.env_helper.AZURE_SEARCH_SERVICE, + index_name=self.env_helper.AZURE_SEARCH_INDEX, + credential=search_credential, + ) + + def _create_search_index_client( + self, search_credential: Union[AzureKeyCredential, DefaultAzureCredential] + ): + return SearchIndexClient( + endpoint=self.env_helper.AZURE_SEARCH_SERVICE, credential=search_credential + ) + + def get_search_client(self) -> SearchClient: + self.create_index() + return self.search_client + @property def search_dimensions(self) -> int: if AzureSearchHelper._search_dimension is None: @@ -24,7 +76,7 @@ def search_dimensions(self) -> int: ) return AzureSearchHelper._search_dimension - def get_vector_store(self): + def create_index(self): fields = [ SimpleField( name="id", @@ -70,25 +122,67 @@ def get_vector_store(self): ), ] - return AzureSearch( - azure_search_endpoint=self.env_helper.AZURE_SEARCH_SERVICE, - azure_search_key=( - self.env_helper.AZURE_SEARCH_KEY - if self.env_helper.AZURE_AUTH_TYPE == "keys" - else None - ), - index_name=self.env_helper.AZURE_SEARCH_INDEX, - embedding_function=self.llm_helper.get_embedding_model().embed_query, + index = SearchIndex( + name=self.env_helper.AZURE_SEARCH_INDEX, fields=fields, - search_type=( - "semantic_hybrid" - if self.env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH - else "hybrid" + semantic_search=( + SemanticSearch( + configurations=[ + SemanticConfiguration( + name=self.env_helper.AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG, + prioritized_fields=SemanticPrioritizedFields( + title_field=None, + content_fields=[SemanticField(field_name="content")], + ), + ) + ] + ) + ), + vector_search=VectorSearch( + algorithms=[ + HnswAlgorithmConfiguration( + name="default", + parameters=HnswParameters( + metric=VectorSearchAlgorithmMetric.COSINE + ), + kind=VectorSearchAlgorithmKind.HNSW, + ), + ExhaustiveKnnAlgorithmConfiguration( + name="default_exhaustive_knn", + kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN, + parameters=ExhaustiveKnnParameters( + metric=VectorSearchAlgorithmMetric.COSINE + ), + ), + ], + profiles=[ + VectorSearchProfile( + name="myHnswProfile", + algorithm_configuration_name="default", + ), + VectorSearchProfile( + name="myExhaustiveKnnProfile", + algorithm_configuration_name="default_exhaustive_knn", + ), + ], ), - semantic_configuration_name=self.env_helper.AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG, - user_agent="langchain chatwithyourdata-sa", ) + if self._index_not_exists(self.env_helper.AZURE_SEARCH_INDEX): + try: + logger.info( + f"Creating or updating index {self.env_helper.AZURE_SEARCH_INDEX}" + ) + self.search_index_client.create_index(index) + except Exception as e: + logger.exception("Error Creating index") + raise e + + def _index_not_exists(self, index_name: str) -> bool: + return index_name not in [ + name for name in self.search_index_client.list_index_names() + ] + def get_conversation_logger(self): fields = [ SimpleField( @@ -152,7 +246,7 @@ def get_conversation_logger(self): azure_search_endpoint=self.env_helper.AZURE_SEARCH_SERVICE, azure_search_key=( self.env_helper.AZURE_SEARCH_KEY - if self.env_helper.AZURE_AUTH_TYPE == "keys" + if self.env_helper.is_auth_type_keys() else None ), index_name=self.env_helper.AZURE_SEARCH_CONVERSATIONS_LOG_INDEX, diff --git a/code/backend/batch/utilities/helpers/LLMHelper.py b/code/backend/batch/utilities/helpers/LLMHelper.py index 74d9652c7..e2133bcf5 100644 --- a/code/backend/batch/utilities/helpers/LLMHelper.py +++ b/code/backend/batch/utilities/helpers/LLMHelper.py @@ -1,5 +1,5 @@ from openai import AzureOpenAI -from typing import cast +from typing import List, Union, cast from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion @@ -98,6 +98,15 @@ def get_embedding_model(self): azure_ad_token_provider=self.token_provider, ) + def generate_embeddings(self, input: Union[str, list[int]]) -> List[float]: + return ( + self.openai_client.embeddings.create( + input=[input], model=self.embedding_model + ) + .data[0] + .embedding + ) + def get_chat_completion_with_functions( self, messages: list[dict], functions: list[dict], function_call: str = "auto" ): diff --git a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py index 3b2eb12ad..9f9463012 100644 --- a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py +++ b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py @@ -1,6 +1,9 @@ +import json import logging from typing import List +from ...helpers import LLMHelper + from ..AzureBlobStorageClient import AzureBlobStorageClient from ..config.EmbeddingConfig import EmbeddingConfig @@ -17,39 +20,75 @@ class PushEmbedder(EmbedderBase): def __init__(self, blob_client: AzureBlobStorageClient): + self.llm_helper = LLMHelper() + self.azure_search_helper = AzureSearchHelper() + self.document_loading = DocumentLoading() + self.document_chunking = DocumentChunking() self.blob_client = blob_client config = ConfigHelper.get_active_config_or_default() - self.processor_map = {} + self.embedding_configs = {} for processor in config.document_processors: ext = processor.document_type.lower() - self.processor_map[ext] = processor + self.embedding_configs[ext] = processor def embed_file(self, source_url: str, file_name: str): file_extension = file_name.split(".")[-1] - processor = self.processor_map.get(file_extension) - self.__embed(source_url=source_url, processor=processor) + embedding_config = self.embedding_configs.get(file_extension) + self.__embed(source_url=source_url, embedding_config=embedding_config) if file_extension != "url": self.blob_client.upsert_blob_metadata( file_name, {"embeddings_added": "true"} ) - def __embed(self, source_url: str, processor: EmbeddingConfig): - vector_store_helper = AzureSearchHelper() - vector_store = vector_store_helper.get_vector_store() - if not processor.use_advanced_image_processing: - try: - document_loading = DocumentLoading() - document_chunking = DocumentChunking() - documents: List[SourceDocument] = [] - documents = document_loading.load(source_url, processor.loading) - documents = document_chunking.chunk(documents, processor.chunking) - keys = list(map(lambda x: x.id, documents)) - documents = [ - document.convert_to_langchain_document() for document in documents - ] - return vector_store.add_documents(documents=documents, keys=keys) - except Exception as e: - logger.error(f"Error adding embeddings for {source_url}: {e}") - raise e - else: - logger.warn("Advanced image processing is not supported yet") + def __embed(self, source_url: str, embedding_config: EmbeddingConfig): + documents_to_upload: List[SourceDocument] = [] + try: + if not embedding_config.use_advanced_image_processing: + documents: List[SourceDocument] = self.document_loading.load( + source_url, embedding_config.loading + ) + documents = self.document_chunking.chunk( + documents, embedding_config.chunking + ) + + for document in documents: + documents_to_upload.append( + self._convert_to_search_document(document) + ) + + response = ( + self.azure_search_helper.get_search_client().upload_documents( + documents_to_upload + ) + ) + if not all([r.succeeded for r in response]): + raise Exception(response) + + else: + logger.warning("Advanced image processing is not supported yet") + + except Exception as e: + logger.error(f"Error adding embeddings for {source_url}: {e}") + raise e + + def _convert_to_search_document(self, document: SourceDocument): + embedded_content = self.llm_helper.generate_embeddings(document.content) + metadata = { + "id": document.id, + "source": document.source, + "title": document.title, + "chunk": document.chunk, + "offset": document.offset, + "page_number": document.page_number, + "chunk_id": document.chunk_id, + } + return { + "id": document.id, + "content": document.content, + "content_vector": embedded_content, + "metadata": json.dumps(metadata), + "title": document.title, + "source": document.source, + "chunk": document.chunk, + "offset": document.offset, + } diff --git a/code/backend/batch/utilities/search/AzureSearchHandler.py b/code/backend/batch/utilities/search/AzureSearchHandler.py index 5b07169c4..c3129096f 100644 --- a/code/backend/batch/utilities/search/AzureSearchHandler.py +++ b/code/backend/batch/utilities/search/AzureSearchHandler.py @@ -1,13 +1,24 @@ +from typing import List from .SearchHandlerBase import SearchHandlerBase +from ..helpers.LLMHelper import LLMHelper from ..helpers.AzureSearchHelper import AzureSearchHelper from ..common.SourceDocument import SourceDocument import json +from azure.search.documents.models import VectorizedQuery +import tiktoken class AzureSearchHandler(SearchHandlerBase): + + _ENCODER_NAME = "cl100k_base" + _VECTOR_FIELD = "content_vector" + + def __init__(self, env_helper): + super().__init__(env_helper) + self.llm_helper = LLMHelper() + def create_search_client(self): - vector_store_helper = AzureSearchHelper() - return vector_store_helper.get_vector_store().client + return AzureSearchHelper().get_search_client() def perform_search(self, filename): return self.search_client.search( @@ -49,26 +60,61 @@ def delete_files(self, files): return ", ".join(files_to_delete) - def query_search(self, question): - vector_store = AzureSearchHelper().get_vector_store() - return vector_store.similarity_search( - query=question, - k=self.env_helper.AZURE_SEARCH_TOP_K, - filters=self.env_helper.AZURE_SEARCH_FILTER, + def query_search(self, question) -> List[SourceDocument]: + encoding = tiktoken.get_encoding(self._ENCODER_NAME) + tokenised_question = encoding.encode(question) + if self.env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH: + results = self._semantic_search(question, tokenised_question) + else: + results = self._hybrid_search(question, tokenised_question) + + return self._convert_to_source_documents(results) + + def _semantic_search(self, question: str, tokenised_question: list[int]): + return self.search_client.search( + search_text=question, + vector_queries=[ + VectorizedQuery( + vector=self.llm_helper.generate_embeddings(tokenised_question), + k_nearest_neighbors=self.env_helper.AZURE_SEARCH_TOP_K, + fields=self._VECTOR_FIELD, + ) + ], + filter=self.env_helper.AZURE_SEARCH_FILTER, + query_type="semantic", + semantic_configuration_name=self.env_helper.AZURE_SEARCH_SEMANTIC_CONFIG_NAME, + query_caption="extractive", + query_answer="extractive", + top=self.env_helper.AZURE_SEARCH_TOP_K, + ) + + def _hybrid_search(self, question: str, tokenised_question: list[int]): + return self.search_client.search( + search_text=question, + vector_queries=[ + VectorizedQuery( + vector=self.llm_helper.generate_embeddings(tokenised_question), + k_nearest_neighbors=self.env_helper.AZURE_SEARCH_TOP_K, + filter=self.env_helper.AZURE_SEARCH_FILTER, + fields=self._VECTOR_FIELD, + ) + ], + filter=self.env_helper.AZURE_SEARCH_FILTER, + top=self.env_helper.AZURE_SEARCH_TOP_K, ) - def return_answer_source_documents(self, search_results): + def _convert_to_source_documents(self, search_results) -> List[SourceDocument]: source_documents = [] for source in search_results: source_documents.append( SourceDocument( - id=source.metadata["id"], - content=source.page_content, - title=source.metadata["title"], - source=source.metadata["source"], - chunk=source.metadata["chunk"], - offset=source.metadata["offset"], - page_number=source.metadata["page_number"], + id=source.get("id"), + content=source.get("content"), + title=source.get("title"), + source=source.get("source"), + chunk=source.get("chunk"), + offset=source.get("offset"), + page_number=source.get("page_number"), ) ) return source_documents diff --git a/code/backend/batch/utilities/search/IntegratedVectorizationSearchHandler.py b/code/backend/batch/utilities/search/IntegratedVectorizationSearchHandler.py index 7963ecf5d..429db192d 100644 --- a/code/backend/batch/utilities/search/IntegratedVectorizationSearchHandler.py +++ b/code/backend/batch/utilities/search/IntegratedVectorizationSearchHandler.py @@ -1,3 +1,4 @@ +from typing import List from .SearchHandlerBase import SearchHandlerBase from azure.search.documents import SearchClient from azure.search.documents.models import VectorizableTextQuery @@ -62,7 +63,7 @@ def delete_files(self, files): return ", ".join(files_to_delete) - def query_search(self, question): + def query_search(self, question) -> List[SourceDocument]: vector_query = VectorizableTextQuery( text=question, k_nearest_neighbors=self.env_helper.AZURE_SEARCH_TOP_K, @@ -74,18 +75,27 @@ def query_search(self, question): vector_queries=[vector_query], top=self.env_helper.AZURE_SEARCH_TOP_K, ) - return search_results + return self._convert_to_source_documents(search_results) - def return_answer_source_documents(self, search_results): + def _convert_to_source_documents(self, search_results) -> List[SourceDocument]: source_documents = [] for source in search_results: source_documents.append( SourceDocument( - id=source.metadata["id"], - content=source.page_content, - title=source.metadata["title"], - source=source.metadata["source"], - chunk_id=source.metadata["chunk_id"], + id=source.get("id"), + content=source.get("content"), + title=source.get("title"), + source=self._extract_source_url(source.get("source")), + chunk_id=source.get("chunk_id"), ) ) return source_documents + + def _extract_source_url(self, original_source: str) -> str: + matches = list(re.finditer(r"https?://", original_source)) + if len(matches) > 1: + second_http_start = matches[1].start() + source_url = original_source[second_http_start:] + else: + source_url = original_source + "_SAS_TOKEN_PLACEHOLDER_" + return source_url diff --git a/code/backend/batch/utilities/search/Search.py b/code/backend/batch/utilities/search/Search.py index 76bfd4ef7..836bf146b 100644 --- a/code/backend/batch/utilities/search/Search.py +++ b/code/backend/batch/utilities/search/Search.py @@ -2,8 +2,7 @@ from ..search.IntegratedVectorizationSearchHandler import ( IntegratedVectorizationSearchHandler, ) -from langchain_core.documents import Document -import re +from ..common.SourceDocument import SourceDocument from ..helpers.EnvHelper import EnvHelper @@ -16,39 +15,5 @@ def get_search_handler(env_helper: EnvHelper): return AzureSearchHandler(env_helper) @staticmethod - def get_source_documents(search_handler, question): - if isinstance(search_handler, IntegratedVectorizationSearchHandler): - search_results = search_handler.query_search(question) - return Search.generate_source_documents(search_results) - else: - return search_handler.query_search(question) - - @staticmethod - def generate_source_documents(search_results): - sources = [] - for result in search_results: - source_url = Search._extract_source_url(result.get("source", "")) - - metadata_dict = { - "id": result.get("id", ""), - "title": result.get("title", ""), - "source": source_url, - "chunk_id": result.get("chunk_id", ""), - } - sources.append( - Document( - page_content=result["content"], - metadata=metadata_dict, - ) - ) - return sources - - @staticmethod - def _extract_source_url(original_source): - matches = list(re.finditer(r"https?://", original_source)) - if len(matches) > 1: - second_http_start = matches[1].start() - source_url = original_source[second_http_start:] - else: - source_url = original_source + "_SAS_TOKEN_PLACEHOLDER_" - return source_url + def get_source_documents(search_handler, question) -> list[SourceDocument]: + return search_handler.query_search(question) diff --git a/code/backend/batch/utilities/search/SearchHandlerBase.py b/code/backend/batch/utilities/search/SearchHandlerBase.py index 69d58441b..ad6d39dc3 100644 --- a/code/backend/batch/utilities/search/SearchHandlerBase.py +++ b/code/backend/batch/utilities/search/SearchHandlerBase.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod +from ..common.SourceDocument import SourceDocument + class SearchHandlerBase(ABC): def __init__(self, env_helper): @@ -31,9 +33,5 @@ def delete_files(self, files, id_field): pass @abstractmethod - def query_search(self, question): - pass - - @abstractmethod - def return_answer_source_documents(self, search_results): + def query_search(self, question) -> list[SourceDocument]: pass diff --git a/code/backend/batch/utilities/tools/QuestionAnswerTool.py b/code/backend/batch/utilities/tools/QuestionAnswerTool.py index 4e7334ad4..b29fd2f18 100644 --- a/code/backend/batch/utilities/tools/QuestionAnswerTool.py +++ b/code/backend/batch/utilities/tools/QuestionAnswerTool.py @@ -1,6 +1,7 @@ import json import logging -import warnings + +from ..common.SourceDocument import SourceDocument from ..search.Search import Search from .AnsweringToolBase import AnsweringToolBase @@ -14,7 +15,6 @@ PromptTemplate, ) from langchain_community.callbacks import get_openai_callback -from langchain_core.documents import Document from langchain_core.messages import SystemMessage from ..helpers.config.ConfigHelper import ConfigHelper @@ -45,14 +45,14 @@ def json_remove_whitespace(obj: str) -> str: except json.JSONDecodeError: return obj - def generate_llm_chain(self, question: str, sources: list[Document]): + def generate_llm_chain(self, question: str, sources: list[dict]): answering_prompt = PromptTemplate( template=self.config.prompts.answering_user_prompt, input_variables=["question", "sources"], ) sources_text = "\n\n".join( - [f"[doc{i+1}]: {source.page_content}" for i, source in enumerate(sources)] + [f"[doc{i+1}]: {source.content}" for i, source in enumerate(sources)] ) return answering_prompt, { @@ -64,7 +64,7 @@ def generate_on_your_data_llm_chain( self, question: str, chat_history: list[dict], - sources: list[Document], + sources: list[SourceDocument], ): examples = [] @@ -83,7 +83,7 @@ def generate_on_your_data_llm_chain( if all((few_shot_example.values())): examples.append(few_shot_example) else: - warnings.warn( + logger.warning( "Not all example fields are set in the config. Skipping few-shot example." ) @@ -116,7 +116,7 @@ def generate_on_your_data_llm_chain( documents = json.dumps( { "retrieved_documents": [ - {f"[doc{i+1}]": {"content": source.page_content}} + {f"[doc{i+1}]": {"content": source.content}} for i, source in enumerate(sources) ], }, @@ -129,18 +129,22 @@ def generate_on_your_data_llm_chain( "chat_history": chat_history, } - def answer_question(self, question: str, chat_history: list[dict], **kwargs): - sources = Search.get_source_documents(self.search_handler, question) + def answer_question( + self, question: str, chat_history: list[SourceDocument], **kwargs + ): + source_documents = Search.get_source_documents(self.search_handler, question) if self.config.prompts.use_on_your_data_format: answering_prompt, input = self.generate_on_your_data_llm_chain( - question, chat_history, sources + question, chat_history, source_documents ) else: - warnings.warn( + logger.warning( "Azure OpenAI On Your Data prompt format is recommended and should be enabled in the Admin app.", ) - answering_prompt, input = self.generate_llm_chain(question, sources) + answering_prompt, input = self.generate_llm_chain( + question, source_documents + ) llm_helper = LLMHelper() @@ -155,7 +159,6 @@ def answer_question(self, question: str, chat_history: list[dict], **kwargs): logger.debug(f"Answer: {answer}") # Generate Answer Object - source_documents = self.search_handler.return_answer_source_documents(sources) clean_answer = Answer( question=question, answer=answer, diff --git a/code/tests/common/test_SourceDocument.py b/code/tests/common/test_SourceDocument.py index 47a83ca83..f175e1953 100644 --- a/code/tests/common/test_SourceDocument.py +++ b/code/tests/common/test_SourceDocument.py @@ -8,35 +8,6 @@ ) -def test_convert_to_langchain_document(): - # Given - source_document = SourceDocument( - id="1", - content="Some content", - title="A title", - source="A source", - chunk="A chunk", - offset="An offset", - page_number="1", - chunk_id="abcd", - ) - - # When - langchain_document = source_document.convert_to_langchain_document() - - # Then - assert langchain_document.page_content == "Some content" - assert langchain_document.metadata == { - "id": "1", - "source": "A source", - "title": "A title", - "chunk": "A chunk", - "offset": "An offset", - "page_number": "1", - "chunk_id": "abcd", - } - - def test_get_filename(): # Given source_document = SourceDocument( diff --git a/code/tests/functional/backend_api/conftest.py b/code/tests/functional/backend_api/conftest.py index 0d2fa7d9e..2c7bc54e9 100644 --- a/code/tests/functional/backend_api/conftest.py +++ b/code/tests/functional/backend_api/conftest.py @@ -55,10 +55,15 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig): } ) + httpserver.expect_oneshot_request( + "/indexes", + method="GET", + ).respond_with_json({"value": []}) + httpserver.expect_request( - f"/indexes('{app_config.get('AZURE_SEARCH_INDEX')}')", + "/indexes", method="GET", - ).respond_with_json({}, status=404) + ).respond_with_json({"value": [{"name": app_config.get("AZURE_SEARCH_INDEX")}]}) httpserver.expect_request( "/indexes", diff --git a/code/tests/functional/backend_api/tests/default/test_conversation_custom.py b/code/tests/functional/backend_api/tests/default/test_conversation_custom.py index 48bbb670f..13b113081 100644 --- a/code/tests/functional/backend_api/tests/default/test_conversation_custom.py +++ b/code/tests/functional/backend_api/tests/default/test_conversation_custom.py @@ -137,12 +137,12 @@ def test_post_makes_correct_calls_to_openai_embeddings_to_get_vector_dimensions( "Api-Key": app_config.get("AZURE_OPENAI_API_KEY"), }, query_string="api-version=2024-02-01", - times=3, + times=1, ), ) -def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question( +def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question_to_search( app_url: str, app_config: AppConfig, httpserver: HTTPServer ): # when @@ -158,7 +158,7 @@ def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question( "input": [ [3923, 374, 279, 7438, 315, 2324, 30] ], # Embedding of "What is the meaning of life?" - "model": "text-embedding-ada-002", + "model": app_config.get("AZURE_OPENAI_EMBEDDING_MODEL"), "encoding_format": "base64", }, headers={ @@ -168,7 +168,38 @@ def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question( "Api-Key": app_config.get("AZURE_OPENAI_API_KEY"), }, query_string="api-version=2024-02-01", - times=2, # once to use when quering search and once when adding to the conversation log + times=1, + ), + ) + + +def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question_to_store_in_conversation_log( + app_url: str, app_config: AppConfig, httpserver: HTTPServer +): + # when + requests.post(f"{app_url}{path}", json=body) + + # then + verify_request_made( + mock_httpserver=httpserver, + request_matcher=RequestMatcher( + path=f"/openai/deployments/{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}/embeddings", + method="POST", + json={ + "input": [ + [3923, 374, 279, 7438, 315, 2324, 30] + ], # Embedding of "What is the meaning of life?" + "model": "text-embedding-ada-002", # this is harded in the langchain code base + "encoding_format": "base64", + }, + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + "Authorization": f"Bearer {app_config.get('AZURE_OPENAI_API_KEY')}", + "Api-Key": app_config.get("AZURE_OPENAI_API_KEY"), + }, + query_string="api-version=2024-02-01", + times=1, ), ) @@ -290,7 +321,7 @@ def test_post_makes_correct_call_to_openai_chat_completions_with_functions( ) -def test_post_makes_correct_call_to_get_documents_search_index( +def test_post_makes_correct_call_to_list_search_indexes( app_url: str, app_config: AppConfig, httpserver: HTTPServer ): # when @@ -300,14 +331,14 @@ def test_post_makes_correct_call_to_get_documents_search_index( verify_request_made( mock_httpserver=httpserver, request_matcher=RequestMatcher( - path=f"/indexes('{app_config.get('AZURE_SEARCH_INDEX')}')", + path="/indexes", method="GET", headers={ "Accept": "application/json;odata.metadata=minimal", "Api-Key": app_config.get("AZURE_SEARCH_KEY"), }, query_string="api-version=2023-10-01-Preview", - times=2, + times=3, ), ) @@ -449,7 +480,7 @@ def test_post_makes_correct_call_to_create_documents_search_index( ], }, }, - times=2, + times=1, ), ) @@ -475,7 +506,7 @@ def test_post_makes_correct_call_to_search_documents_search_index( "kind": "vector", "k": int(app_config.get("AZURE_SEARCH_TOP_K")), "fields": "content_vector", - "vector": [0.9320719838142395, -0.3622731566429138], + "vector": [0.018990106880664825, -0.0073809814639389515], } ], }, @@ -598,8 +629,8 @@ def test_post_returns_error_when_downstream_fails( httpserver.clear_all_handlers() # Clear default successful responses httpserver.expect_request( - f"/openai/deployments/{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}/embeddings", - method="POST", + "/indexes", + method="GET", ).respond_with_json({}, status=500) # when diff --git a/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_with_search_documents_tool.py b/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_with_search_documents_tool.py index f0f909c85..56ce52b81 100644 --- a/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_with_search_documents_tool.py +++ b/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_with_search_documents_tool.py @@ -150,7 +150,7 @@ def test_post_responds_successfully(app_url: str, app_config: AppConfig): assert response.headers["Content-Type"] == "application/json" -def test_post_makes_correct_call_to_get_search_index( +def test_post_makes_correct_call_to_list_search_index( app_url: str, app_config: AppConfig, httpserver: HTTPServer ): # when @@ -160,14 +160,14 @@ def test_post_makes_correct_call_to_get_search_index( verify_request_made( mock_httpserver=httpserver, request_matcher=RequestMatcher( - path=f"/indexes('{app_config.get('AZURE_SEARCH_INDEX')}')", + path="/indexes", method="GET", headers={ "Accept": "application/json;odata.metadata=minimal", "Api-Key": app_config.get("AZURE_SEARCH_KEY"), }, query_string="api-version=2023-10-01-Preview", - times=2, + times=3, ), ) @@ -193,7 +193,7 @@ def test_post_makes_correct_call_to_get_search_documents( "kind": "vector", "k": int(app_config.get("AZURE_SEARCH_TOP_K")), "fields": "content_vector", - "vector": [0.9320719838142395, -0.3622731566429138], + "vector": [0.018990106880664825, -0.0073809814639389515], } ], }, diff --git a/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_without_tool_call.py b/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_without_tool_call.py index ade675e9e..afcdb9bbe 100644 --- a/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_without_tool_call.py +++ b/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_without_tool_call.py @@ -260,8 +260,8 @@ def test_post_returns_error_when_downstream_fails( httpserver.clear_all_handlers() # Clear default successful responses httpserver.expect_request( - f"/openai/deployments/{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}/embeddings", - method="POST", + "/indexes", + method="GET", ).respond_with_json({}, status=500) # when diff --git a/code/tests/search_utilities/test_AzureSearchHandler.py b/code/tests/search_utilities/test_AzureSearchHandler.py index bad8eafe8..e6f4977db 100644 --- a/code/tests/search_utilities/test_AzureSearchHandler.py +++ b/code/tests/search_utilities/test_AzureSearchHandler.py @@ -1,53 +1,56 @@ import pytest -from unittest.mock import Mock, patch +from unittest.mock import MagicMock, Mock, patch from backend.batch.utilities.search.AzureSearchHandler import AzureSearchHandler -from langchain_core.documents import Document import json +from azure.search.documents.models import VectorizedQuery + +from backend.batch.utilities.common.SourceDocument import SourceDocument @pytest.fixture def env_helper_mock(): mock = Mock() - mock.AZURE_SEARCH_SERVICE = "https://example.search.windows.net" - mock.AZURE_SEARCH_INDEX = "example-index" - mock.AZURE_SEARCH_KEY = "example-key" - mock.is_auth_type_keys = Mock(return_value=True) + mock.AZURE_SEARCH_USE_SEMANTIC_SEARCH = False + mock.AZURE_SEARCH_TOP_K = 3 + mock.AZURE_SEARCH_FILTER = "some-search-filter" return mock @pytest.fixture -def mock_azure_search_helper(): +def mock_search_client(): with patch( "backend.batch.utilities.search.AzureSearchHandler.AzureSearchHelper" ) as mock: - vector_store = mock.return_value.get_vector_store.return_value.client - yield vector_store + search_client = mock.return_value.get_search_client.return_value + yield search_client @pytest.fixture -def mock_vector_store(): - with patch( - "backend.batch.utilities.search.AzureSearchHandler.AzureSearchHelper" - ) as mock: - vector_store = mock.return_value.get_vector_store.return_value - yield vector_store +def mock_llm_helper(): + with patch("backend.batch.utilities.search.AzureSearchHandler.LLMHelper") as mock: + mock_llm_helper = mock.return_value + yield mock_llm_helper @pytest.fixture -def handler(env_helper_mock, mock_azure_search_helper): +def handler(env_helper_mock, mock_search_client, mock_llm_helper): with patch( "backend.batch.utilities.search.AzureSearchHandler.AzureSearchHelper", - return_value=mock_azure_search_helper, + return_value=mock_search_client, ): - return AzureSearchHandler(env_helper_mock) + with patch( + "backend.batch.utilities.search.AzureSearchHandler.LLMHelper", + return_value=mock_llm_helper, + ): + return AzureSearchHandler(env_helper_mock) -def test_create_search_client(handler, mock_azure_search_helper): +def test_create_search_client(handler, mock_search_client): # when search_client = handler.create_search_client() # then - assert search_client == mock_azure_search_helper + assert search_client == mock_search_client def test_process_results(handler): @@ -112,40 +115,131 @@ def test_get_files(handler): ) -def test_query_search(handler, mock_vector_store): +@patch("backend.batch.utilities.search.AzureSearchHandler.tiktoken") +def test_query_search_uses_tiktoken_encoder(mock_tiktoken, handler, mock_llm_helper): # given question = "What is the answer?" + mock_encoder = MagicMock() + mock_tiktoken.get_encoding.return_value = mock_encoder + mock_encoder.encode.return_value = [1, 2, 3] + # when - result = handler.query_search(question) + handler.query_search(question) # then - mock_vector_store.similarity_search.assert_called_once_with( - query=question, - k=handler.env_helper.AZURE_SEARCH_TOP_K, - filters=handler.env_helper.AZURE_SEARCH_FILTER, + mock_tiktoken.get_encoding.assert_called_once_with("cl100k_base") + mock_encoder.encode.assert_called_once_with(question) + mock_llm_helper.generate_embeddings.assert_called_once_with([1, 2, 3]) + + +def test_query_search_performs_hybrid_search(handler, mock_llm_helper): + # given + question = "What is the answer?" + + mock_llm_helper.generate_embeddings.return_value = [1, 2, 3] + + # when + handler.query_search(question) + + # then + handler.search_client.search.assert_called_once_with( + search_text=question, + vector_queries=[ + VectorizedQuery( + vector=[1, 2, 3], + k_nearest_neighbors=handler.env_helper.AZURE_SEARCH_TOP_K, + filter=handler.env_helper.AZURE_SEARCH_FILTER, + fields="content_vector", + ) + ], + filter=handler.env_helper.AZURE_SEARCH_FILTER, + top=handler.env_helper.AZURE_SEARCH_TOP_K, ) - assert result == mock_vector_store.similarity_search.return_value -def test_return_answer_source_documents(handler): +def test_query_search_performs_semantic_search( + handler, mock_llm_helper, env_helper_mock +): # given - document = Document("mock content") - document.metadata = { - "id": "mock id", - "title": "mock title", - "source": "mock source", - "chunk": "mock chunk", - "offset": "mock offset", - "page_number": "mock page number", - } - documents = [document] + question = "What is the answer?" + + mock_llm_helper.generate_embeddings.return_value = [1, 2, 3] + env_helper_mock.AZURE_SEARCH_USE_SEMANTIC_SEARCH = True + env_helper_mock.AZURE_SEARCH_SEMANTIC_CONFIG_NAME = "some-semantic-config" + + # when + handler.query_search(question) + + # then + handler.search_client.search.assert_called_once_with( + search_text=question, + vector_queries=[ + VectorizedQuery( + vector=[1, 2, 3], + k_nearest_neighbors=handler.env_helper.AZURE_SEARCH_TOP_K, + fields="content_vector", + ) + ], + filter=handler.env_helper.AZURE_SEARCH_FILTER, + query_type="semantic", + semantic_configuration_name=handler.env_helper.AZURE_SEARCH_SEMANTIC_CONFIG_NAME, + query_caption="extractive", + query_answer="extractive", + top=handler.env_helper.AZURE_SEARCH_TOP_K, + ) + + +def test_query_search_converts_results_to_source_documents( + handler, +): + # given + question = "What is the answer?" + + handler.search_client.search.return_value = [ + { + "id": 1, + "content": "content1", + "title": "title1", + "source": "source1", + "chunk": "chunk1", + "offset": "offset1", + "page_number": "page_number1", + }, + { + "id": 2, + "content": "content2", + "title": "title2", + "source": "source2", + "chunk": "chunk2", + "offset": "offset2", + "page_number": "page_number2", + }, + ] + + expected_results = [ + SourceDocument( + id=1, + content="content1", + title="title1", + source="source1", + chunk="chunk1", + offset="offset1", + page_number="page_number1", + ), + SourceDocument( + id=2, + content="content2", + title="title2", + source="source2", + chunk="chunk2", + offset="offset2", + page_number="page_number2", + ), + ] + # when - source_documents = handler.return_answer_source_documents(documents) + actual_results = handler.query_search(question) # then - assert len(source_documents) == 1 - assert source_documents[0].id == "mock id" - assert source_documents[0].content == "mock content" - assert source_documents[0].title == "mock title" - assert source_documents[0].source == "mock source" + assert actual_results == expected_results diff --git a/code/tests/search_utilities/test_IntegratedVectorizationSearchHandler.py b/code/tests/search_utilities/test_IntegratedVectorizationSearchHandler.py index ce7c24e68..15f498dcd 100644 --- a/code/tests/search_utilities/test_IntegratedVectorizationSearchHandler.py +++ b/code/tests/search_utilities/test_IntegratedVectorizationSearchHandler.py @@ -4,7 +4,8 @@ IntegratedVectorizationSearchHandler, ) from azure.search.documents.models import VectorizableTextQuery -from langchain_core.documents import Document + +from backend.batch.utilities.common.SourceDocument import SourceDocument @pytest.fixture @@ -114,7 +115,7 @@ def test_get_files(handler, search_client_mock): ) -def test_query_search(handler, env_helper_mock): +def test_query_search_performs_search(handler, env_helper_mock): # given question = "test question" vector_query = VectorizableTextQuery( @@ -125,7 +126,7 @@ def test_query_search(handler, env_helper_mock): ) # when - result = handler.query_search(question) + handler.query_search(question) # then handler.search_client.search.assert_called_once_with( @@ -133,26 +134,48 @@ def test_query_search(handler, env_helper_mock): vector_queries=[vector_query], top=env_helper_mock.AZURE_SEARCH_TOP_K, ) - assert result == handler.search_client.search.return_value -def test_return_answer_source_documents(handler): +def test_query_search_converts_results_to_source_documents(handler): # given - document = Document("mock content") - document.metadata = { - "id": "mock id", - "title": "mock title", - "source": "mock source", - "chunk_id": "abcd_page_1", - } - documents = [document] + question = "test question" + + handler.search_client.search.return_value = [ + { + "id": 1, + "content": "content1", + "title": "title1", + "source": "https://example.com/http://example.com", + "chunk_id": "chunk_id1", + }, + { + "id": 2, + "content": "content2", + "title": "title2", + "source": "https://example.com", + "chunk_id": "chunk_id2", + }, + ] + + expected_results = [ + SourceDocument( + id=1, + content="content1", + title="title1", + source="http://example.com", + chunk_id="chunk_id1", + ), + SourceDocument( + id=2, + content="content2", + title="title2", + source="https://example.com_SAS_TOKEN_PLACEHOLDER_", + chunk_id="chunk_id2", + ), + ] + # when - source_documents = handler.return_answer_source_documents(documents) + actual_results = handler.query_search(question) # then - assert len(source_documents) == 1 - assert source_documents[0].id == "mock id" - assert source_documents[0].content == "mock content" - assert source_documents[0].title == "mock title" - assert source_documents[0].source == "mock source" - assert source_documents[0].chunk_id == "abcd_page_1" + assert actual_results == expected_results diff --git a/code/tests/search_utilities/test_Search.py b/code/tests/search_utilities/test_Search.py index 8b12a5565..f4cd08227 100644 --- a/code/tests/search_utilities/test_Search.py +++ b/code/tests/search_utilities/test_Search.py @@ -4,6 +4,7 @@ from backend.batch.utilities.search.IntegratedVectorizationSearchHandler import ( IntegratedVectorizationSearchHandler, ) +from backend.batch.utilities.common.SourceDocument import SourceDocument @pytest.fixture @@ -72,72 +73,30 @@ def test_get_source_documents_azure_search(search_handler_mock: MagicMock): # given question = "example question" - search_results = [ - { - "id": 1, - "title": "Example Title 1", - "source": "https://example.com/1", - "chunk_id": "chunk1", - }, - { - "id": 2, - "title": "Example Title 2", - "source": "https://example.com/2", - "chunk_id": "chunk2", - }, - ] - search_handler_mock.query_search.return_value = search_results - - # when - source_documents = Search.get_source_documents(search_handler_mock, question) - - # then - assert len(source_documents) == len(search_results) - - -def test_generate_source_documents(): - # given - search_results = [ - { - "id": 1, - "title": "Example Title 1", - "source": "https://example.com/1", - "chunk_id": "chunk1", - "content": "mock content 1", - }, - { - "id": 2, - "title": "Example Title 2", - "source": "https://example.com/2", - "chunk_id": "chunk2", - "content": "mock content 2", - }, + expected_source_documents = [ + SourceDocument( + id=1, + content="content1", + title="title1", + source="source1", + chunk="chunk1", + offset="offset1", + page_number="page_number1", + ), + SourceDocument( + id=2, + content="content2", + title="title2", + source="source2", + chunk="chunk2", + offset="offset2", + page_number="page_number2", + ), ] + search_handler_mock.query_search.return_value = expected_source_documents # when - source_documents = Search.generate_source_documents(search_results) - - # then - assert len(source_documents) == len(search_results) - - -def test__extract_source_url_multiple_http(): - # given - original_source = "https://example.com/http://example.com" - - # when - source_url = Search._extract_source_url(original_source) - - # then - assert source_url == "http://example.com" - - -def test__extract_source_url_single_http(): - # given - original_source = "https://example.com" - - # when - source_url = Search._extract_source_url(original_source) + actual_source_documents = Search.get_source_documents(search_handler_mock, question) # then - assert source_url == "https://example.com_SAS_TOKEN_PLACEHOLDER_" + assert len(actual_source_documents) == len(expected_source_documents) diff --git a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py index d7412d894..b424e5439 100644 --- a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py +++ b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py @@ -3,8 +3,13 @@ from backend.batch.utilities.helpers.embedders.IntegratedVectorizationEmbedder import ( IntegratedVectorizationEmbedder, ) +from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings +from backend.batch.utilities.document_loading import LoadingSettings +from backend.batch.utilities.document_loading.Strategies import LoadingStrategy AZURE_SEARCH_INDEXER_NAME = "mock-indexer-name" +CHUNKING_SETTINGS = ChunkingSettings({"strategy": "layout", "size": 1, "overlap": 0}) +LOADING_SETTINGS = LoadingSettings({"strategy": LoadingStrategy.LAYOUT}) @pytest.fixture(autouse=True) @@ -27,8 +32,16 @@ def llm_helper_mock(): llm_helper.get_embedding_model.return_value.embed_query.return_value = [ 0 ] * 1536 + llm_helper.generate_embeddings.return_value = [123] + yield mock + - yield llm_helper +@pytest.fixture(autouse=True) +def azure_search_helper_mock(): + with patch( + "backend.batch.utilities.helpers.DocumentProcessorHelper.AzureSearchHelper" + ) as mock: + yield mock @pytest.fixture(autouse=True) @@ -94,7 +107,7 @@ def test_process_using_integrated_vectorization( azure_search_iv_datasource_helper_mock.return_value.create_or_update_datasource.assert_called_once() azure_search_iv_index_helper_mock.assert_called_once_with( - env_helper_mock, llm_helper_mock + env_helper_mock, llm_helper_mock.return_value ) azure_search_iv_index_helper_mock.return_value.create_or_update_index.assert_called_once() diff --git a/code/tests/utilities/helpers/test_AzureSearchHelper.py b/code/tests/utilities/helpers/test_AzureSearchHelper.py index 84252e835..6c0b3b4a1 100644 --- a/code/tests/utilities/helpers/test_AzureSearchHelper.py +++ b/code/tests/utilities/helpers/test_AzureSearchHelper.py @@ -1,6 +1,25 @@ import pytest from unittest.mock import ANY, MagicMock, patch from backend.batch.utilities.helpers.AzureSearchHelper import AzureSearchHelper +from azure.search.documents.indexes.models import ( + ExhaustiveKnnAlgorithmConfiguration, + ExhaustiveKnnParameters, + HnswAlgorithmConfiguration, + HnswParameters, + SearchableField, + SearchField, + SearchFieldDataType, + SearchIndex, + SemanticConfiguration, + SemanticField, + SemanticPrioritizedFields, + SemanticSearch, + SimpleField, + VectorSearch, + VectorSearchAlgorithmKind, + VectorSearchAlgorithmMetric, + VectorSearchProfile, +) AZURE_AUTH_TYPE = "keys" AZURE_SEARCH_KEY = "mock-key" @@ -12,7 +31,7 @@ @pytest.fixture(autouse=True) -def AzureSearchMock(): +def azure_search_mock(): with patch("backend.batch.utilities.helpers.AzureSearchHelper.AzureSearch") as mock: yield mock @@ -44,58 +63,230 @@ def env_helper_mock(): AZURE_SEARCH_CONVERSATIONS_LOG_INDEX ) + env_helper.is_auth_type_keys.return_value = True + yield env_helper -def test_get_vector_store_keys(AzureSearchMock: MagicMock, llm_helper_mock: MagicMock): +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.AzureKeyCredential") +def test_creates_search_clients_with_keys( + azure_key_credential_mock: MagicMock, + search_index_client_mock: MagicMock, + search_client_mock: MagicMock, +): + # when + AzureSearchHelper() + + # then + azure_key_credential_mock.assert_called_once_with(AZURE_SEARCH_KEY) + search_client_mock.assert_called_once_with( + endpoint=AZURE_SEARCH_SERVICE, + index_name=AZURE_SEARCH_INDEX, + credential=azure_key_credential_mock.return_value, + ) + search_index_client_mock.assert_called_once_with( + endpoint=AZURE_SEARCH_SERVICE, credential=azure_key_credential_mock.return_value + ) + + +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.DefaultAzureCredential") +def test_creates_search_clients_with_rabc( + default_azure_credential_mock: MagicMock, + search_index_client_mock: MagicMock, + search_client_mock: MagicMock, + env_helper_mock: MagicMock, +): # given - azure_search_helper = AzureSearchHelper() + env_helper_mock.is_auth_type_keys.return_value = False # when - vector_store = azure_search_helper.get_vector_store() + AzureSearchHelper() # then - assert vector_store == AzureSearchMock.return_value - - AzureSearchMock.assert_called_once_with( - azure_search_endpoint=AZURE_SEARCH_SERVICE, - azure_search_key=AZURE_SEARCH_KEY, + default_azure_credential_mock.assert_called_once_with() + search_client_mock.assert_called_once_with( + endpoint=AZURE_SEARCH_SERVICE, index_name=AZURE_SEARCH_INDEX, - embedding_function=llm_helper_mock.get_embedding_model.return_value.embed_query, - fields=ANY, - search_type="hybrid", - semantic_configuration_name=AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG, - user_agent="langchain chatwithyourdata-sa", + credential=default_azure_credential_mock.return_value, + ) + search_index_client_mock.assert_called_once_with( + endpoint=AZURE_SEARCH_SERVICE, + credential=default_azure_credential_mock.return_value, ) -def test_get_vector_store_rbac( - AzureSearchMock: MagicMock, llm_helper_mock: MagicMock, env_helper_mock: MagicMock +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient") +def test_returns_search_client( + search_index_client_mock: MagicMock, search_client_mock: MagicMock ): # given - env_helper_mock.AZURE_AUTH_TYPE = "rbac" azure_search_helper = AzureSearchHelper() # when - vector_store = azure_search_helper.get_vector_store() + search_client = azure_search_helper.get_search_client() # then - assert vector_store == AzureSearchMock.return_value + assert search_client is search_client_mock.return_value - AzureSearchMock.assert_called_once_with( - azure_search_endpoint=AZURE_SEARCH_SERVICE, - azure_search_key=None, - index_name=AZURE_SEARCH_INDEX, - embedding_function=llm_helper_mock.get_embedding_model.return_value.embed_query, - fields=ANY, - search_type="hybrid", - semantic_configuration_name=AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG, - user_agent="langchain chatwithyourdata-sa", + +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient") +def test_creates_search_index_if_not_exists( + search_index_client_mock: MagicMock, search_client_mock: MagicMock +): + # given + search_index_client_mock.return_value.list_index_names.return_value = [ + "some-irrelevant-index" + ] + + fields = [ + SimpleField( + name="id", + type=SearchFieldDataType.String, + key=True, + filterable=True, + ), + SearchableField( + name="content", + type=SearchFieldDataType.String, + ), + SearchField( + name="content_vector", + type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + searchable=True, + vector_search_dimensions=1536, + vector_search_profile_name="myHnswProfile", + ), + SearchableField( + name="metadata", + type=SearchFieldDataType.String, + ), + SearchableField( + name="title", + type=SearchFieldDataType.String, + facetable=True, + filterable=True, + ), + SearchableField( + name="source", + type=SearchFieldDataType.String, + filterable=True, + ), + SimpleField( + name="chunk", + type=SearchFieldDataType.Int32, + filterable=True, + ), + SimpleField( + name="offset", + type=SearchFieldDataType.Int32, + filterable=True, + ), + ] + + expected_index = SearchIndex( + name=AZURE_SEARCH_INDEX, + fields=fields, + semantic_search=( + SemanticSearch( + configurations=[ + SemanticConfiguration( + name=AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG, + prioritized_fields=SemanticPrioritizedFields( + title_field=None, + content_fields=[SemanticField(field_name="content")], + ), + ) + ] + ) + ), + vector_search=VectorSearch( + algorithms=[ + HnswAlgorithmConfiguration( + name="default", + parameters=HnswParameters( + metric=VectorSearchAlgorithmMetric.COSINE + ), + kind=VectorSearchAlgorithmKind.HNSW, + ), + ExhaustiveKnnAlgorithmConfiguration( + name="default_exhaustive_knn", + kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN, + parameters=ExhaustiveKnnParameters( + metric=VectorSearchAlgorithmMetric.COSINE + ), + ), + ], + profiles=[ + VectorSearchProfile( + name="myHnswProfile", + algorithm_configuration_name="default", + ), + VectorSearchProfile( + name="myExhaustiveKnnProfile", + algorithm_configuration_name="default_exhaustive_knn", + ), + ], + ), ) + # when + AzureSearchHelper() + + # then + search_index_client_mock.return_value.create_index.assert_called_once_with( + expected_index + ) + + +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient") +def test_does_not_create_search_index_if_it_exists( + search_index_client_mock: MagicMock, + search_client_mock: MagicMock, +): + # given + search_index_client_mock.return_value.list_index_names.return_value = [ + AZURE_SEARCH_INDEX + ] + + # when + AzureSearchHelper() + + # then + search_index_client_mock.return_value.create_index.assert_not_called() + + +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient") +def test_propogates_exceptions_when_creating_search_index( + search_index_client_mock: MagicMock, + search_client_mock: MagicMock, +): + # given + expected_exception = Exception() + search_index_client_mock.return_value.create_index.side_effect = expected_exception + + # when + with pytest.raises(Exception) as exc_info: + AzureSearchHelper() + + # then + assert exc_info.value == expected_exception + +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient") def test_get_conversation_logger_keys( - AzureSearchMock: MagicMock, llm_helper_mock: MagicMock + search_index_client_mock: MagicMock, + search_client_mock: MagicMock, + azure_search_mock: MagicMock, + llm_helper_mock: MagicMock, ): # given azure_search_helper = AzureSearchHelper() @@ -104,9 +295,9 @@ def test_get_conversation_logger_keys( conversation_logger = azure_search_helper.get_conversation_logger() # then - assert conversation_logger == AzureSearchMock.return_value + assert conversation_logger == azure_search_mock.return_value - AzureSearchMock.assert_called_once_with( + azure_search_mock.assert_called_once_with( azure_search_endpoint=AZURE_SEARCH_SERVICE, azure_search_key=AZURE_SEARCH_KEY, index_name=AZURE_SEARCH_CONVERSATIONS_LOG_INDEX, @@ -116,20 +307,26 @@ def test_get_conversation_logger_keys( ) +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient") +@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient") def test_get_conversation_logger_rbac( - AzureSearchMock: MagicMock, llm_helper_mock: MagicMock, env_helper_mock: MagicMock + search_index_client_mock: MagicMock, + search_client_mock: MagicMock, + azure_search_mock: MagicMock, + llm_helper_mock: MagicMock, + env_helper_mock: MagicMock, ): # given - env_helper_mock.AZURE_AUTH_TYPE = "rbac" + env_helper_mock.is_auth_type_keys.return_value = False azure_search_helper = AzureSearchHelper() # when conversation_logger = azure_search_helper.get_conversation_logger() # then - assert conversation_logger == AzureSearchMock.return_value + assert conversation_logger == azure_search_mock.return_value - AzureSearchMock.assert_called_once_with( + azure_search_mock.assert_called_once_with( azure_search_endpoint=AZURE_SEARCH_SERVICE, azure_search_key=None, index_name=AZURE_SEARCH_CONVERSATIONS_LOG_INDEX, diff --git a/code/tests/utilities/helpers/test_LLMHelper.py b/code/tests/utilities/helpers/test_LLMHelper.py index 43ab357f7..db93fc169 100644 --- a/code/tests/utilities/helpers/test_LLMHelper.py +++ b/code/tests/utilities/helpers/test_LLMHelper.py @@ -3,6 +3,9 @@ import pytest from backend.batch.utilities.helpers.LLMHelper import LLMHelper from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion +from openai.types.create_embedding_response import CreateEmbeddingResponse +from openai.types.embedding import Embedding + AZURE_OPENAI_ENDPOINT = "https://mock-endpoint" AZURE_OPENAI_API_VERSION = "mock-api-version" @@ -27,6 +30,12 @@ def env_helper_mock(): yield env_helper +@pytest.fixture(autouse=True) +def azure_openai_mock(): + with patch("backend.batch.utilities.helpers.LLMHelper.AzureOpenAI") as mock: + yield mock + + @patch("backend.batch.utilities.helpers.LLMHelper.AzureChatCompletion") def test_get_sk_chat_completion_service_keys(AzureChatCompletionMock: MagicMock): # given @@ -83,3 +92,38 @@ def test_get_sk_service_settings(): assert settings.service_id == "mock-service-id" assert settings.temperature == 0 assert settings.max_tokens == int(AZURE_OPENAI_MAX_TOKENS) + + +def test_generate_embeddings_embeds_input(azure_openai_mock): + # given + llm_helper = LLMHelper() + + # when + llm_helper.generate_embeddings("some input") + + # then + azure_openai_mock.return_value.embeddings.create.assert_called_once_with( + input=["some input"], model=AZURE_OPENAI_EMBEDDING_MODEL + ) + + +def test_generate_embeddings_returns_embeddings(azure_openai_mock): + # given + llm_helper = LLMHelper() + expected_embeddings = [1, 2, 3] + azure_openai_mock.return_value.embeddings.create.return_value = ( + CreateEmbeddingResponse( + data=[ + Embedding(embedding=expected_embeddings, index=0, object="embedding") + ], + model="mock-model", + object="list", + usage={"prompt_tokens": 0, "total_tokens": 0}, + ) + ) + + # when + actual_embeddings = llm_helper.generate_embeddings("some input") + + # then + assert actual_embeddings == expected_embeddings diff --git a/code/tests/utilities/helpers/test_PushEmbedder.py b/code/tests/utilities/helpers/test_PushEmbedder.py index 561096edc..0cbe18088 100644 --- a/code/tests/utilities/helpers/test_PushEmbedder.py +++ b/code/tests/utilities/helpers/test_PushEmbedder.py @@ -1,15 +1,28 @@ +import json import pytest -from unittest.mock import MagicMock, patch -from backend.batch.utilities.helpers.embedders.PushEmbedder import ( - PushEmbedder, -) +from unittest.mock import MagicMock, call, patch +from backend.batch.utilities.helpers.embedders.PushEmbedder import PushEmbedder +from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings +from backend.batch.utilities.document_loading import LoadingSettings +from backend.batch.utilities.document_loading.Strategies import LoadingStrategy +from backend.batch.utilities.common.SourceDocument import SourceDocument from backend.batch.utilities.helpers.config.EmbeddingConfig import EmbeddingConfig -from backend.batch.utilities.helpers.DocumentLoadingHelper import DocumentLoading -from backend.batch.utilities.helpers.DocumentChunkingHelper import DocumentChunking -from backend.batch.utilities.common.SourceDocument import SourceDocument +CHUNKING_SETTINGS = ChunkingSettings({"strategy": "layout", "size": 1, "overlap": 0}) +LOADING_SETTINGS = LoadingSettings({"strategy": LoadingStrategy.LAYOUT}) -AZURE_SEARCH_INDEXER_NAME = "mock-indexer-name" + +@pytest.fixture(autouse=True) +def llm_helper_mock(): + with patch( + "backend.batch.utilities.helpers.embedders.PushEmbedder.LLMHelper" + ) as mock: + llm_helper = mock.return_value + llm_helper.get_embedding_model.return_value.embed_query.return_value = [ + 0 + ] * 1536 + llm_helper.generate_embeddings.return_value = [123] + yield mock @pytest.fixture(autouse=True) @@ -26,72 +39,208 @@ def mock_config_helper(): "backend.batch.utilities.helpers.embedders.PushEmbedder.ConfigHelper" ) as mock: config_helper = mock.get_active_config_or_default.return_value + config_helper.document_processors = [ + EmbeddingConfig( + "jpg", + CHUNKING_SETTINGS, + LOADING_SETTINGS, + use_advanced_image_processing=True, + ), + EmbeddingConfig( + "pdf", + CHUNKING_SETTINGS, + LOADING_SETTINGS, + use_advanced_image_processing=False, + ), + ] yield config_helper -def test_process_use_advanced_image_processing_skips_processing( - azure_search_helper_mock, mock_config_helper +@pytest.fixture(autouse=True) +def document_loading_mock(): + with patch( + "backend.batch.utilities.helpers.embedders.PushEmbedder.DocumentLoading" + ) as mock: + expected_documents = [ + SourceDocument(content="some content", source="some source") + ] + mock.return_value.load.return_value = expected_documents + yield mock + + +@pytest.fixture(autouse=True) +def document_chunking_mock(): + with patch( + "backend.batch.utilities.helpers.embedders.PushEmbedder.DocumentChunking" + ) as mock: + expected_chunked_documents = [ + SourceDocument( + content="some content", + source="some source", + id="some id", + title="some-title", + offset=1, + chunk=1, + page_number=1, + chunk_id="some chunk id", + ), + SourceDocument( + content="some other content", + source="some other source", + id="some other id", + title="some other-title", + offset=2, + chunk=2, + page_number=2, + chunk_id="some other chunk id", + ), + ] + mock.return_value.chunk.return_value = expected_chunked_documents + yield mock + + +def test_embed_file_use_advanced_image_processing_skips_processing( + azure_search_helper_mock, ): # given - vector_store_mock = MagicMock() - azure_search_helper_mock.return_value.get_vector_store.return_value = ( - vector_store_mock - ) - push_embedder = PushEmbedder(None) - processor = EmbeddingConfig("pdf", None, None, use_advanced_image_processing=True) + push_embedder = PushEmbedder(MagicMock()) # when - push_embedder._PushEmbedder__embed("some-url", processor) + push_embedder.embed_file("some-url", "some-file-name.jpg") # then - vector_store_mock.add_documents.assert_not_called() + azure_search_helper_mock.return_value.get_search_client.assert_not_called() -def test_process_with_non_advanced_image_processing_adds_documents_to_vector_store( - azure_search_helper_mock, mock_config_helper -): +def test_embed_file_loads_documents(document_loading_mock): # given - vector_store_mock = MagicMock() - azure_search_helper_mock.return_value.get_vector_store.return_value = ( - vector_store_mock - ) - push_embedder = PushEmbedder(None) - processor = EmbeddingConfig("pdf", None, None, use_advanced_image_processing=False) + push_embedder = PushEmbedder(MagicMock()) source_url = "some-url" - documents = [ - SourceDocument("1", "document1", "content1"), - SourceDocument("2", "document2", "content2"), - ] - with patch.object( - DocumentLoading, "load", return_value=documents - ) as load_mock, patch.object( - DocumentChunking, "chunk", return_value=documents - ) as chunk_mock: + # when + push_embedder.embed_file( + source_url, + "some-file-name.pdf", + ) + + # then + document_loading_mock.return_value.load.assert_called_once_with( + source_url, LOADING_SETTINGS + ) + + +def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mock): + # given + push_embedder = PushEmbedder(MagicMock()) + + # when + push_embedder.embed_file( + "some-url", + "some-file-name.pdf", + ) + + # then + document_chunking_mock.return_value.chunk.assert_called_once_with( + document_loading_mock.return_value.load.return_value, CHUNKING_SETTINGS + ) - # when - push_embedder._PushEmbedder__embed(source_url, processor) - # then - load_mock.assert_called_once_with(source_url, processor.loading) - chunk_mock.assert_called_once_with(documents, processor.chunking) +def test_embed_file_generates_embeddings_for_documents(llm_helper_mock): + # given + push_embedder = PushEmbedder(MagicMock()) + # when + push_embedder.embed_file( + "some-url", + "some-file-name.pdf", + ) -def test_process_file_with_non_url_extension_processes_and_adds_metadata( - azure_search_helper_mock, mock_config_helper + # then + llm_helper_mock.return_value.generate_embeddings.assert_has_calls( + [call("some content"), call("some other content")] + ) + + +def test_embed_file_stores_documents_in_search_index( + document_chunking_mock, + llm_helper_mock, + azure_search_helper_mock, ): # given - vector_store_mock = MagicMock() - azure_search_helper_mock.return_value.get_vector_store.return_value = ( - vector_store_mock + push_embedder = PushEmbedder(MagicMock()) + + # when + push_embedder.embed_file( + "some-url", + "some-file-name.pdf", ) - push_embedder = PushEmbedder(blob_client=MagicMock()) - source_url = "some-url" - file_name = "sample.pdf" - with patch.object(push_embedder, "_PushEmbedder__embed") as embed_mock: - # when - push_embedder.embed_file(source_url, file_name) + # then + expected_chunked_documents = document_chunking_mock.return_value.chunk.return_value + azure_search_helper_mock.return_value.get_search_client.return_value.upload_documents.assert_called_once_with( + [ + { + "id": expected_chunked_documents[0].id, + "content": expected_chunked_documents[0].content, + "content_vector": llm_helper_mock.return_value.generate_embeddings.return_value, + "metadata": json.dumps( + { + "id": expected_chunked_documents[0].id, + "source": expected_chunked_documents[0].source, + "title": expected_chunked_documents[0].title, + "chunk": expected_chunked_documents[0].chunk, + "offset": expected_chunked_documents[0].offset, + "page_number": expected_chunked_documents[0].page_number, + "chunk_id": expected_chunked_documents[0].chunk_id, + } + ), + "title": expected_chunked_documents[0].title, + "source": expected_chunked_documents[0].source, + "chunk": expected_chunked_documents[0].chunk, + "offset": expected_chunked_documents[0].offset, + }, + { + "id": expected_chunked_documents[1].id, + "content": expected_chunked_documents[1].content, + "content_vector": llm_helper_mock.return_value.generate_embeddings.return_value, + "metadata": json.dumps( + { + "id": expected_chunked_documents[1].id, + "source": expected_chunked_documents[1].source, + "title": expected_chunked_documents[1].title, + "chunk": expected_chunked_documents[1].chunk, + "offset": expected_chunked_documents[1].offset, + "page_number": expected_chunked_documents[1].page_number, + "chunk_id": expected_chunked_documents[1].chunk_id, + } + ), + "title": expected_chunked_documents[1].title, + "source": expected_chunked_documents[1].source, + "chunk": expected_chunked_documents[1].chunk, + "offset": expected_chunked_documents[1].offset, + }, + ] + ) + + +def test_embed_file_raises_exception_on_failure( + azure_search_helper_mock, +): + # given + push_embedder = PushEmbedder(MagicMock()) + + successful_indexing_result = MagicMock() + successful_indexing_result.succeeded = True + failed_indexing_result = MagicMock() + failed_indexing_result.succeeded = False + azure_search_helper_mock.return_value.get_search_client.return_value.upload_documents.return_value = [ + successful_indexing_result, + failed_indexing_result, + ] - # then - embed_mock.assert_called_once() + # when + then + with pytest.raises(Exception): + push_embedder.embed_file( + "some-url", + "some-file-name.pdf", + ) diff --git a/code/tests/utilities/test_QuestionAnswerTool.py b/code/tests/utilities/test_QuestionAnswerTool.py index ade574287..2fe8f5680 100644 --- a/code/tests/utilities/test_QuestionAnswerTool.py +++ b/code/tests/utilities/test_QuestionAnswerTool.py @@ -87,28 +87,31 @@ def get_source_documents_yield(): with patch( "backend.batch.utilities.tools.QuestionAnswerTool.Search.get_source_documents" ) as mock: - document = Document(page_content="mock content") - document.metadata = { - "id": "mock id", - "title": "mock title", - "source": "mock source", - "chunk": "mock chunk", - "offset": "mock offset", - "page_number": "mock page number", - } - documents = [document] + documents = [ + SourceDocument( + id="mock id", + content="mock content", + title="mock title", + source="mock source", + chunk=123, + offset=123, + page_number=123, + ), + SourceDocument( + id="mock id 2", + content="mock content 2", + title="mock title 2", + source="mock source 2", + chunk_id="mock chunk id 2", + ), + ] mock.return_value = documents yield mock -def test_answer_question_returns_source_documents( - get_search_handler_mock, get_source_documents_mock -): +def test_answer_question_returns_source_documents(): # given tool = QuestionAnswerTool() - create_document_and_source_documents( - get_source_documents_mock, get_search_handler_mock - ) # when answer = tool.answer_question("mock question", []) @@ -116,55 +119,22 @@ def test_answer_question_returns_source_documents( # then source_documents = answer.source_documents - assert len(source_documents) == 1 + assert len(source_documents) == 2 assert source_documents[0].id == "mock id" assert source_documents[0].title == "mock title" + assert source_documents[0].content == "mock content" assert source_documents[0].source == "mock source" - assert source_documents[0].chunk == "mock chunk" - assert source_documents[0].offset == "mock offset" - assert source_documents[0].page_number == "mock page number" - - -def test_answer_question_integrated_vectorization( - env_helper_mock: MagicMock, get_search_handler_mock, get_source_documents_mock -): - # given - env_helper_mock.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = True - document = Document("mock content") - document.metadata = { - "id": "mock id 1", - "title": "mock title 1", - "source": "https://example.com/doc1", - "chunk_id": "mock chunk id 1", - "content": "mock content 1", - } - get_source_documents_mock.return_value = document - documents = [] - documents.append( - SourceDocument( - id=document.metadata["id"], - content=document.page_content, - title=document.metadata["title"], - source=document.metadata["source"], - chunk_id=document.metadata["chunk_id"], - ) - ) - get_search_handler_mock.return_answer_source_documents.return_value = documents - tool = QuestionAnswerTool() - - # when - answer = tool.answer_question("mock question", []) - - # then - source_documents = answer.source_documents + assert source_documents[0].chunk == 123 + assert source_documents[0].offset == 123 + assert source_documents[0].page_number == 123 - assert len(source_documents) == 1 + assert source_documents[1].id == "mock id 2" + assert source_documents[1].title == "mock title 2" + assert source_documents[1].content == "mock content 2" - assert source_documents[0].chunk_id == "mock chunk id 1" - assert source_documents[0].title == "mock title 1" - assert source_documents[0].source == "https://example.com/doc1" - assert source_documents[0].content == "mock content" + assert source_documents[1].source == "mock source 2" + assert source_documents[1].chunk_id == "mock chunk id 2" def test_answer_question_returns_answer( @@ -211,8 +181,8 @@ def test_correct_prompt_with_few_shot_example( # then expected_input = { + "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}', "question": "mock question", - "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}', "chat_history": [], } @@ -230,7 +200,7 @@ def test_correct_prompt_with_few_shot_example( Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock example content"}}]}, Question: mock example user question AI: mock example answer System: mock azure openai system message -Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}, Question: mock question""" +Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}, Question: mock question""" ) @@ -251,8 +221,8 @@ def test_correct_prompt_without_few_shot_example( # then expected_input = { + "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}', "question": "mock question", - "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}', "chat_history": [], } @@ -265,7 +235,7 @@ def test_correct_prompt_without_few_shot_example( prompt_test == """System: mock answering system prompt System: mock azure openai system message -Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}, Question: mock question""" +Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}, Question: mock question""" ) @@ -286,7 +256,7 @@ def test_correct_prompt_with_few_shot_example_and_chat_history( # then expected_input = { "question": "mock question", - "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}', + "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}', "chat_history": chat_history, } @@ -303,7 +273,7 @@ def test_correct_prompt_with_few_shot_example_and_chat_history( System: mock azure openai system message Human: Hello AI: Hi, how can I help? -Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}, Question: mock question""" +Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}, Question: mock question""" ) @@ -326,7 +296,7 @@ def test_non_on_your_data_prompt_correct( # then expected_input = { - "sources": """[doc1]: mock content""", + "sources": """[doc1]: mock content\n\n[doc2]: mock content 2""", "question": "mock question", } @@ -335,7 +305,10 @@ def test_non_on_your_data_prompt_correct( prompt = LLMChainMock.call_args[1]["prompt"] prompt_test = prompt.format(**expected_input) - assert prompt_test == """Sources: [doc1]: mock content, Question: mock question""" + assert ( + prompt_test + == """Sources: [doc1]: mock content\n\n[doc2]: mock content 2, Question: mock question""" + ) @pytest.mark.parametrize( From ed97f2f3466c887e8bdc81fe2b31e765920c4566 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Tue, 7 May 2024 18:45:14 +0100 Subject: [PATCH 02/10] Fix after merge --- .../processors/test_IntegratedVectorizationEmbedder.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py index b424e5439..76d6cc6e6 100644 --- a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py +++ b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py @@ -36,14 +36,6 @@ def llm_helper_mock(): yield mock -@pytest.fixture(autouse=True) -def azure_search_helper_mock(): - with patch( - "backend.batch.utilities.helpers.DocumentProcessorHelper.AzureSearchHelper" - ) as mock: - yield mock - - @pytest.fixture(autouse=True) def azure_search_iv_index_helper_mock(): with patch( From d7d854620687f314979d3f65867a35bd858911be Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Wed, 8 May 2024 10:42:11 +0000 Subject: [PATCH 03/10] Fix import --- code/backend/batch/utilities/helpers/embedders/PushEmbedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py index 9f9463012..43e261045 100644 --- a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py +++ b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py @@ -2,7 +2,7 @@ import logging from typing import List -from ...helpers import LLMHelper +from ...helpers.LLMHelper import LLMHelper from ..AzureBlobStorageClient import AzureBlobStorageClient From 0522fb12aeb46ae7ddd921cb52a67490f17c82b7 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Wed, 8 May 2024 16:23:35 +0000 Subject: [PATCH 04/10] Stop catching and rethrowing exception --- .../batch/utilities/helpers/AzureSearchHelper.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/code/backend/batch/utilities/helpers/AzureSearchHelper.py b/code/backend/batch/utilities/helpers/AzureSearchHelper.py index c07f32ecf..bddebf548 100644 --- a/code/backend/batch/utilities/helpers/AzureSearchHelper.py +++ b/code/backend/batch/utilities/helpers/AzureSearchHelper.py @@ -169,14 +169,10 @@ def create_index(self): ) if self._index_not_exists(self.env_helper.AZURE_SEARCH_INDEX): - try: - logger.info( - f"Creating or updating index {self.env_helper.AZURE_SEARCH_INDEX}" - ) - self.search_index_client.create_index(index) - except Exception as e: - logger.exception("Error Creating index") - raise e + logger.info( + f"Creating or updating index {self.env_helper.AZURE_SEARCH_INDEX}" + ) + self.search_index_client.create_index(index) def _index_not_exists(self, index_name: str) -> bool: return index_name not in [ From a3d8eab5807d2572fe0e00fd6582c9029ed4423f Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Wed, 8 May 2024 16:24:04 +0000 Subject: [PATCH 05/10] Typo --- .../backend_api/tests/default/test_conversation_custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/tests/functional/backend_api/tests/default/test_conversation_custom.py b/code/tests/functional/backend_api/tests/default/test_conversation_custom.py index 13b113081..c644321ee 100644 --- a/code/tests/functional/backend_api/tests/default/test_conversation_custom.py +++ b/code/tests/functional/backend_api/tests/default/test_conversation_custom.py @@ -189,7 +189,7 @@ def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question_to_stor "input": [ [3923, 374, 279, 7438, 315, 2324, 30] ], # Embedding of "What is the meaning of life?" - "model": "text-embedding-ada-002", # this is harded in the langchain code base + "model": "text-embedding-ada-002", # this is hard coded in the langchain code base "encoding_format": "base64", }, headers={ From 365e5a2b26b30748fd6427a69e828f666fcafaeb Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Wed, 8 May 2024 16:31:25 +0000 Subject: [PATCH 06/10] Explain oneshot request --- code/tests/functional/backend_api/conftest.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/code/tests/functional/backend_api/conftest.py b/code/tests/functional/backend_api/conftest.py index 2c7bc54e9..69116f7c7 100644 --- a/code/tests/functional/backend_api/conftest.py +++ b/code/tests/functional/backend_api/conftest.py @@ -55,15 +55,7 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig): } ) - httpserver.expect_oneshot_request( - "/indexes", - method="GET", - ).respond_with_json({"value": []}) - - httpserver.expect_request( - "/indexes", - method="GET", - ).respond_with_json({"value": [{"name": app_config.get("AZURE_SEARCH_INDEX")}]}) + prime_search_to_trigger_creation_of_index(httpserver, app_config) httpserver.expect_request( "/indexes", @@ -155,3 +147,19 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig): yield httpserver.check() + + +def prime_search_to_trigger_creation_of_index( + httpserver: HTTPServer, app_config: AppConfig +): + # first request should return no indexes + httpserver.expect_oneshot_request( + "/indexes", + method="GET", + ).respond_with_json({"value": []}) + + # second request should return the index as it will have been "created" + httpserver.expect_request( + "/indexes", + method="GET", + ).respond_with_json({"value": [{"name": app_config.get("AZURE_SEARCH_INDEX")}]}) From 60a83659d5382152421987c014c575e4203b502d Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Wed, 8 May 2024 16:32:43 +0000 Subject: [PATCH 07/10] Yield llmhelper --- .../processors/test_IntegratedVectorizationEmbedder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py index 76d6cc6e6..dc1b8b0ec 100644 --- a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py +++ b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py @@ -33,7 +33,7 @@ def llm_helper_mock(): 0 ] * 1536 llm_helper.generate_embeddings.return_value = [123] - yield mock + yield llm_helper @pytest.fixture(autouse=True) @@ -99,7 +99,7 @@ def test_process_using_integrated_vectorization( azure_search_iv_datasource_helper_mock.return_value.create_or_update_datasource.assert_called_once() azure_search_iv_index_helper_mock.assert_called_once_with( - env_helper_mock, llm_helper_mock.return_value + env_helper_mock, llm_helper_mock ) azure_search_iv_index_helper_mock.return_value.create_or_update_index.assert_called_once() From db354b1bd556569b69ca527e066e700e143f2bcc Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Thu, 9 May 2024 08:42:50 +0000 Subject: [PATCH 08/10] Stop catching exceptions --- .../helpers/embedders/PushEmbedder.py | 41 ++++++++----------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py index 43e261045..730bc8b0d 100644 --- a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py +++ b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py @@ -42,34 +42,25 @@ def embed_file(self, source_url: str, file_name: str): def __embed(self, source_url: str, embedding_config: EmbeddingConfig): documents_to_upload: List[SourceDocument] = [] - try: - if not embedding_config.use_advanced_image_processing: - documents: List[SourceDocument] = self.document_loading.load( - source_url, embedding_config.loading - ) - documents = self.document_chunking.chunk( - documents, embedding_config.chunking - ) - - for document in documents: - documents_to_upload.append( - self._convert_to_search_document(document) - ) + if not embedding_config.use_advanced_image_processing: + documents: List[SourceDocument] = self.document_loading.load( + source_url, embedding_config.loading + ) + documents = self.document_chunking.chunk( + documents, embedding_config.chunking + ) - response = ( - self.azure_search_helper.get_search_client().upload_documents( - documents_to_upload - ) - ) - if not all([r.succeeded for r in response]): - raise Exception(response) + for document in documents: + documents_to_upload.append(self._convert_to_search_document(document)) - else: - logger.warning("Advanced image processing is not supported yet") + response = self.azure_search_helper.get_search_client().upload_documents( + documents_to_upload + ) + if not all([r.succeeded for r in response]): + raise Exception(response) - except Exception as e: - logger.error(f"Error adding embeddings for {source_url}: {e}") - raise e + else: + logger.warning("Advanced image processing is not supported yet") def _convert_to_search_document(self, document: SourceDocument): embedded_content = self.llm_helper.generate_embeddings(document.content) From dbfd07f65f62df79b72aad39c04d10771bb3e6ef Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Thu, 9 May 2024 08:46:12 +0000 Subject: [PATCH 09/10] Fix after merge --- .../helpers/processors/test_IntegratedVectorizationEmbedder.py | 2 +- code/tests/utilities/helpers/test_PushEmbedder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py index dc1b8b0ec..86e18f597 100644 --- a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py +++ b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py @@ -3,7 +3,7 @@ from backend.batch.utilities.helpers.embedders.IntegratedVectorizationEmbedder import ( IntegratedVectorizationEmbedder, ) -from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings +from backend.batch.utilities.document_chunking.ChunkingStrategy import ChunkingSettings from backend.batch.utilities.document_loading import LoadingSettings from backend.batch.utilities.document_loading.Strategies import LoadingStrategy diff --git a/code/tests/utilities/helpers/test_PushEmbedder.py b/code/tests/utilities/helpers/test_PushEmbedder.py index 0cbe18088..56bb0f21f 100644 --- a/code/tests/utilities/helpers/test_PushEmbedder.py +++ b/code/tests/utilities/helpers/test_PushEmbedder.py @@ -2,7 +2,7 @@ import pytest from unittest.mock import MagicMock, call, patch from backend.batch.utilities.helpers.embedders.PushEmbedder import PushEmbedder -from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings +from backend.batch.utilities.document_chunking.ChunkingStrategy import ChunkingSettings from backend.batch.utilities.document_loading import LoadingSettings from backend.batch.utilities.document_loading.Strategies import LoadingStrategy from backend.batch.utilities.common.SourceDocument import SourceDocument From 5f04de9db38f16ee1afe9e7d4a6f042911f5c1db Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Thu, 9 May 2024 10:49:35 +0000 Subject: [PATCH 10/10] Revert warnings change --- code/backend/batch/utilities/tools/QuestionAnswerTool.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/code/backend/batch/utilities/tools/QuestionAnswerTool.py b/code/backend/batch/utilities/tools/QuestionAnswerTool.py index b29fd2f18..58c62c767 100644 --- a/code/backend/batch/utilities/tools/QuestionAnswerTool.py +++ b/code/backend/batch/utilities/tools/QuestionAnswerTool.py @@ -1,5 +1,6 @@ import json import logging +import warnings from ..common.SourceDocument import SourceDocument from ..search.Search import Search @@ -83,7 +84,7 @@ def generate_on_your_data_llm_chain( if all((few_shot_example.values())): examples.append(few_shot_example) else: - logger.warning( + warnings.warn( "Not all example fields are set in the config. Skipping few-shot example." ) @@ -139,7 +140,7 @@ def answer_question( question, chat_history, source_documents ) else: - logger.warning( + warnings.warn( "Azure OpenAI On Your Data prompt format is recommended and should be enabled in the Admin app.", ) answering_prompt, input = self.generate_llm_chain(