From 40c98859afc24ee558cefa0e0dc7b603d9d36507 Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Fri, 3 May 2024 17:30:19 +0100
Subject: [PATCH 01/10] Remove langchain from index operations

- This is preparation for adding an additional image vector field for
  advanced image processing
- I've tried to keep changes to a minimum
- Still using langchain in the question and answer tool for now
- Increased unit test coverage

Required by https://github.com/Azure-Samples/chat-with-your-data-solution-accelerator/issues/748
---
 .../batch/utilities/common/SourceDocument.py  |  29 +-
 .../utilities/helpers/AzureSearchHelper.py    | 128 +++++++--
 .../batch/utilities/helpers/LLMHelper.py      |  11 +-
 .../helpers/embedders/PushEmbedder.py         |  87 ++++--
 .../utilities/search/AzureSearchHandler.py    |  78 +++--
 .../IntegratedVectorizationSearchHandler.py   |  26 +-
 code/backend/batch/utilities/search/Search.py |  41 +--
 .../utilities/search/SearchHandlerBase.py     |   8 +-
 .../utilities/tools/QuestionAnswerTool.py     |  29 +-
 code/tests/common/test_SourceDocument.py      |  29 --
 code/tests/functional/backend_api/conftest.py |   9 +-
 .../tests/default/test_conversation_custom.py |  53 +++-
 ...est_response_with_search_documents_tool.py |   8 +-
 .../test_response_without_tool_call.py        |   4 +-
 .../test_AzureSearchHandler.py                | 182 +++++++++---
 ...st_IntegratedVectorizationSearchHandler.py |  63 +++--
 code/tests/search_utilities/test_Search.py    |  87 ++----
 .../test_IntegratedVectorizationEmbedder.py   |  17 +-
 .../helpers/test_AzureSearchHelper.py         | 267 +++++++++++++++---
 .../tests/utilities/helpers/test_LLMHelper.py |  44 +++
 .../utilities/helpers/test_PushEmbedder.py    | 257 +++++++++++++----
 .../utilities/test_QuestionAnswerTool.py      | 107 +++----
 22 files changed, 1093 insertions(+), 471 deletions(-)

diff --git a/code/backend/batch/utilities/common/SourceDocument.py b/code/backend/batch/utilities/common/SourceDocument.py
index 296dc90eb..94a31319a 100644
--- a/code/backend/batch/utilities/common/SourceDocument.py
+++ b/code/backend/batch/utilities/common/SourceDocument.py
@@ -3,7 +3,6 @@
 import json
 from urllib.parse import urlparse, quote
 from ..helpers.AzureBlobStorageClient import AzureBlobStorageClient
-from langchain.docstore.document import Document
 
 
 class SourceDocument:
@@ -30,6 +29,20 @@ def __init__(
     def __str__(self):
         return f"SourceDocument(id={self.id}, title={self.title}, source={self.source}, chunk={self.chunk}, offset={self.offset}, page_number={self.page_number}, chunk_id={self.chunk_id})"
 
+    def __eq__(self, other):
+        if isinstance(self, other.__class__):
+            return (
+                self.id == other.id
+                and self.content == other.content
+                and self.source == other.source
+                and self.title == other.title
+                and self.chunk == other.chunk
+                and self.offset == other.offset
+                and self.page_number == other.page_number
+                and self.chunk_id == other.chunk_id
+            )
+        return False
+
     def to_json(self):
         return json.dumps(self, cls=SourceDocumentEncoder)
 
@@ -80,20 +93,6 @@ def from_metadata(
             chunk_id=metadata.get("chunk_id"),
         )
 
-    def convert_to_langchain_document(self):
-        return Document(
-            page_content=self.content,
-            metadata={
-                "id": self.id,
-                "source": self.source,
-                "title": self.title,
-                "chunk": self.chunk,
-                "offset": self.offset,
-                "page_number": self.page_number,
-                "chunk_id": self.chunk_id,
-            },
-        )
-
     def get_filename(self, include_path=False):
         filename = self.source.replace("_SAS_TOKEN_PLACEHOLDER_", "").replace(
             "http://", ""
diff --git a/code/backend/batch/utilities/helpers/AzureSearchHelper.py b/code/backend/batch/utilities/helpers/AzureSearchHelper.py
index 4d14d2ad6..c07f32ecf 100644
--- a/code/backend/batch/utilities/helpers/AzureSearchHelper.py
+++ b/code/backend/batch/utilities/helpers/AzureSearchHelper.py
@@ -1,13 +1,34 @@
+import logging
+from typing import Union
 from langchain.vectorstores.azuresearch import AzureSearch
+from azure.core.credentials import AzureKeyCredential
+from azure.identity import DefaultAzureCredential
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
 from azure.search.documents.indexes.models import (
+    ExhaustiveKnnAlgorithmConfiguration,
+    ExhaustiveKnnParameters,
+    HnswAlgorithmConfiguration,
+    HnswParameters,
     SearchableField,
     SearchField,
     SearchFieldDataType,
+    SearchIndex,
+    SemanticConfiguration,
+    SemanticField,
+    SemanticPrioritizedFields,
+    SemanticSearch,
     SimpleField,
+    VectorSearch,
+    VectorSearchAlgorithmKind,
+    VectorSearchAlgorithmMetric,
+    VectorSearchProfile,
 )
 from .LLMHelper import LLMHelper
 from .EnvHelper import EnvHelper
 
+logger = logging.getLogger(__name__)
+
 
 class AzureSearchHelper:
     _search_dimension: int | None = None
@@ -16,6 +37,37 @@ def __init__(self):
         self.llm_helper = LLMHelper()
         self.env_helper = EnvHelper()
 
+        search_credential = self._search_credential()
+        self.search_client = self._create_search_client(search_credential)
+        self.search_index_client = self._create_search_index_client(search_credential)
+        self.create_index()
+
+    def _search_credential(self):
+        if self.env_helper.is_auth_type_keys():
+            return AzureKeyCredential(self.env_helper.AZURE_SEARCH_KEY)
+        else:
+            return DefaultAzureCredential()
+
+    def _create_search_client(
+        self, search_credential: Union[AzureKeyCredential, DefaultAzureCredential]
+    ) -> SearchClient:
+        return SearchClient(
+            endpoint=self.env_helper.AZURE_SEARCH_SERVICE,
+            index_name=self.env_helper.AZURE_SEARCH_INDEX,
+            credential=search_credential,
+        )
+
+    def _create_search_index_client(
+        self, search_credential: Union[AzureKeyCredential, DefaultAzureCredential]
+    ):
+        return SearchIndexClient(
+            endpoint=self.env_helper.AZURE_SEARCH_SERVICE, credential=search_credential
+        )
+
+    def get_search_client(self) -> SearchClient:
+        self.create_index()
+        return self.search_client
+
     @property
     def search_dimensions(self) -> int:
         if AzureSearchHelper._search_dimension is None:
@@ -24,7 +76,7 @@ def search_dimensions(self) -> int:
             )
         return AzureSearchHelper._search_dimension
 
-    def get_vector_store(self):
+    def create_index(self):
         fields = [
             SimpleField(
                 name="id",
@@ -70,25 +122,67 @@ def get_vector_store(self):
             ),
         ]
 
-        return AzureSearch(
-            azure_search_endpoint=self.env_helper.AZURE_SEARCH_SERVICE,
-            azure_search_key=(
-                self.env_helper.AZURE_SEARCH_KEY
-                if self.env_helper.AZURE_AUTH_TYPE == "keys"
-                else None
-            ),
-            index_name=self.env_helper.AZURE_SEARCH_INDEX,
-            embedding_function=self.llm_helper.get_embedding_model().embed_query,
+        index = SearchIndex(
+            name=self.env_helper.AZURE_SEARCH_INDEX,
             fields=fields,
-            search_type=(
-                "semantic_hybrid"
-                if self.env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH
-                else "hybrid"
+            semantic_search=(
+                SemanticSearch(
+                    configurations=[
+                        SemanticConfiguration(
+                            name=self.env_helper.AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG,
+                            prioritized_fields=SemanticPrioritizedFields(
+                                title_field=None,
+                                content_fields=[SemanticField(field_name="content")],
+                            ),
+                        )
+                    ]
+                )
+            ),
+            vector_search=VectorSearch(
+                algorithms=[
+                    HnswAlgorithmConfiguration(
+                        name="default",
+                        parameters=HnswParameters(
+                            metric=VectorSearchAlgorithmMetric.COSINE
+                        ),
+                        kind=VectorSearchAlgorithmKind.HNSW,
+                    ),
+                    ExhaustiveKnnAlgorithmConfiguration(
+                        name="default_exhaustive_knn",
+                        kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
+                        parameters=ExhaustiveKnnParameters(
+                            metric=VectorSearchAlgorithmMetric.COSINE
+                        ),
+                    ),
+                ],
+                profiles=[
+                    VectorSearchProfile(
+                        name="myHnswProfile",
+                        algorithm_configuration_name="default",
+                    ),
+                    VectorSearchProfile(
+                        name="myExhaustiveKnnProfile",
+                        algorithm_configuration_name="default_exhaustive_knn",
+                    ),
+                ],
             ),
-            semantic_configuration_name=self.env_helper.AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG,
-            user_agent="langchain chatwithyourdata-sa",
         )
 
+        if self._index_not_exists(self.env_helper.AZURE_SEARCH_INDEX):
+            try:
+                logger.info(
+                    f"Creating or updating index {self.env_helper.AZURE_SEARCH_INDEX}"
+                )
+                self.search_index_client.create_index(index)
+            except Exception as e:
+                logger.exception("Error Creating index")
+                raise e
+
+    def _index_not_exists(self, index_name: str) -> bool:
+        return index_name not in [
+            name for name in self.search_index_client.list_index_names()
+        ]
+
     def get_conversation_logger(self):
         fields = [
             SimpleField(
@@ -152,7 +246,7 @@ def get_conversation_logger(self):
             azure_search_endpoint=self.env_helper.AZURE_SEARCH_SERVICE,
             azure_search_key=(
                 self.env_helper.AZURE_SEARCH_KEY
-                if self.env_helper.AZURE_AUTH_TYPE == "keys"
+                if self.env_helper.is_auth_type_keys()
                 else None
             ),
             index_name=self.env_helper.AZURE_SEARCH_CONVERSATIONS_LOG_INDEX,
diff --git a/code/backend/batch/utilities/helpers/LLMHelper.py b/code/backend/batch/utilities/helpers/LLMHelper.py
index 74d9652c7..e2133bcf5 100644
--- a/code/backend/batch/utilities/helpers/LLMHelper.py
+++ b/code/backend/batch/utilities/helpers/LLMHelper.py
@@ -1,5 +1,5 @@
 from openai import AzureOpenAI
-from typing import cast
+from typing import List, Union, cast
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
@@ -98,6 +98,15 @@ def get_embedding_model(self):
                 azure_ad_token_provider=self.token_provider,
             )
 
+    def generate_embeddings(self, input: Union[str, list[int]]) -> List[float]:
+        return (
+            self.openai_client.embeddings.create(
+                input=[input], model=self.embedding_model
+            )
+            .data[0]
+            .embedding
+        )
+
     def get_chat_completion_with_functions(
         self, messages: list[dict], functions: list[dict], function_call: str = "auto"
     ):
diff --git a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
index 3b2eb12ad..9f9463012 100644
--- a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
+++ b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
@@ -1,6 +1,9 @@
+import json
 import logging
 from typing import List
 
+from ...helpers import LLMHelper
+
 from ..AzureBlobStorageClient import AzureBlobStorageClient
 
 from ..config.EmbeddingConfig import EmbeddingConfig
@@ -17,39 +20,75 @@
 
 class PushEmbedder(EmbedderBase):
     def __init__(self, blob_client: AzureBlobStorageClient):
+        self.llm_helper = LLMHelper()
+        self.azure_search_helper = AzureSearchHelper()
+        self.document_loading = DocumentLoading()
+        self.document_chunking = DocumentChunking()
         self.blob_client = blob_client
         config = ConfigHelper.get_active_config_or_default()
-        self.processor_map = {}
+        self.embedding_configs = {}
         for processor in config.document_processors:
             ext = processor.document_type.lower()
-            self.processor_map[ext] = processor
+            self.embedding_configs[ext] = processor
 
     def embed_file(self, source_url: str, file_name: str):
         file_extension = file_name.split(".")[-1]
-        processor = self.processor_map.get(file_extension)
-        self.__embed(source_url=source_url, processor=processor)
+        embedding_config = self.embedding_configs.get(file_extension)
+        self.__embed(source_url=source_url, embedding_config=embedding_config)
         if file_extension != "url":
             self.blob_client.upsert_blob_metadata(
                 file_name, {"embeddings_added": "true"}
             )
 
-    def __embed(self, source_url: str, processor: EmbeddingConfig):
-        vector_store_helper = AzureSearchHelper()
-        vector_store = vector_store_helper.get_vector_store()
-        if not processor.use_advanced_image_processing:
-            try:
-                document_loading = DocumentLoading()
-                document_chunking = DocumentChunking()
-                documents: List[SourceDocument] = []
-                documents = document_loading.load(source_url, processor.loading)
-                documents = document_chunking.chunk(documents, processor.chunking)
-                keys = list(map(lambda x: x.id, documents))
-                documents = [
-                    document.convert_to_langchain_document() for document in documents
-                ]
-                return vector_store.add_documents(documents=documents, keys=keys)
-            except Exception as e:
-                logger.error(f"Error adding embeddings for {source_url}: {e}")
-                raise e
-        else:
-            logger.warn("Advanced image processing is not supported yet")
+    def __embed(self, source_url: str, embedding_config: EmbeddingConfig):
+        documents_to_upload: List[SourceDocument] = []
+        try:
+            if not embedding_config.use_advanced_image_processing:
+                documents: List[SourceDocument] = self.document_loading.load(
+                    source_url, embedding_config.loading
+                )
+                documents = self.document_chunking.chunk(
+                    documents, embedding_config.chunking
+                )
+
+                for document in documents:
+                    documents_to_upload.append(
+                        self._convert_to_search_document(document)
+                    )
+
+                response = (
+                    self.azure_search_helper.get_search_client().upload_documents(
+                        documents_to_upload
+                    )
+                )
+                if not all([r.succeeded for r in response]):
+                    raise Exception(response)
+
+            else:
+                logger.warning("Advanced image processing is not supported yet")
+
+        except Exception as e:
+            logger.error(f"Error adding embeddings for {source_url}: {e}")
+            raise e
+
+    def _convert_to_search_document(self, document: SourceDocument):
+        embedded_content = self.llm_helper.generate_embeddings(document.content)
+        metadata = {
+            "id": document.id,
+            "source": document.source,
+            "title": document.title,
+            "chunk": document.chunk,
+            "offset": document.offset,
+            "page_number": document.page_number,
+            "chunk_id": document.chunk_id,
+        }
+        return {
+            "id": document.id,
+            "content": document.content,
+            "content_vector": embedded_content,
+            "metadata": json.dumps(metadata),
+            "title": document.title,
+            "source": document.source,
+            "chunk": document.chunk,
+            "offset": document.offset,
+        }
diff --git a/code/backend/batch/utilities/search/AzureSearchHandler.py b/code/backend/batch/utilities/search/AzureSearchHandler.py
index 5b07169c4..c3129096f 100644
--- a/code/backend/batch/utilities/search/AzureSearchHandler.py
+++ b/code/backend/batch/utilities/search/AzureSearchHandler.py
@@ -1,13 +1,24 @@
+from typing import List
 from .SearchHandlerBase import SearchHandlerBase
+from ..helpers.LLMHelper import LLMHelper
 from ..helpers.AzureSearchHelper import AzureSearchHelper
 from ..common.SourceDocument import SourceDocument
 import json
+from azure.search.documents.models import VectorizedQuery
+import tiktoken
 
 
 class AzureSearchHandler(SearchHandlerBase):
+
+    _ENCODER_NAME = "cl100k_base"
+    _VECTOR_FIELD = "content_vector"
+
+    def __init__(self, env_helper):
+        super().__init__(env_helper)
+        self.llm_helper = LLMHelper()
+
     def create_search_client(self):
-        vector_store_helper = AzureSearchHelper()
-        return vector_store_helper.get_vector_store().client
+        return AzureSearchHelper().get_search_client()
 
     def perform_search(self, filename):
         return self.search_client.search(
@@ -49,26 +60,61 @@ def delete_files(self, files):
 
         return ", ".join(files_to_delete)
 
-    def query_search(self, question):
-        vector_store = AzureSearchHelper().get_vector_store()
-        return vector_store.similarity_search(
-            query=question,
-            k=self.env_helper.AZURE_SEARCH_TOP_K,
-            filters=self.env_helper.AZURE_SEARCH_FILTER,
+    def query_search(self, question) -> List[SourceDocument]:
+        encoding = tiktoken.get_encoding(self._ENCODER_NAME)
+        tokenised_question = encoding.encode(question)
+        if self.env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH:
+            results = self._semantic_search(question, tokenised_question)
+        else:
+            results = self._hybrid_search(question, tokenised_question)
+
+        return self._convert_to_source_documents(results)
+
+    def _semantic_search(self, question: str, tokenised_question: list[int]):
+        return self.search_client.search(
+            search_text=question,
+            vector_queries=[
+                VectorizedQuery(
+                    vector=self.llm_helper.generate_embeddings(tokenised_question),
+                    k_nearest_neighbors=self.env_helper.AZURE_SEARCH_TOP_K,
+                    fields=self._VECTOR_FIELD,
+                )
+            ],
+            filter=self.env_helper.AZURE_SEARCH_FILTER,
+            query_type="semantic",
+            semantic_configuration_name=self.env_helper.AZURE_SEARCH_SEMANTIC_CONFIG_NAME,
+            query_caption="extractive",
+            query_answer="extractive",
+            top=self.env_helper.AZURE_SEARCH_TOP_K,
+        )
+
+    def _hybrid_search(self, question: str, tokenised_question: list[int]):
+        return self.search_client.search(
+            search_text=question,
+            vector_queries=[
+                VectorizedQuery(
+                    vector=self.llm_helper.generate_embeddings(tokenised_question),
+                    k_nearest_neighbors=self.env_helper.AZURE_SEARCH_TOP_K,
+                    filter=self.env_helper.AZURE_SEARCH_FILTER,
+                    fields=self._VECTOR_FIELD,
+                )
+            ],
+            filter=self.env_helper.AZURE_SEARCH_FILTER,
+            top=self.env_helper.AZURE_SEARCH_TOP_K,
         )
 
-    def return_answer_source_documents(self, search_results):
+    def _convert_to_source_documents(self, search_results) -> List[SourceDocument]:
         source_documents = []
         for source in search_results:
             source_documents.append(
                 SourceDocument(
-                    id=source.metadata["id"],
-                    content=source.page_content,
-                    title=source.metadata["title"],
-                    source=source.metadata["source"],
-                    chunk=source.metadata["chunk"],
-                    offset=source.metadata["offset"],
-                    page_number=source.metadata["page_number"],
+                    id=source.get("id"),
+                    content=source.get("content"),
+                    title=source.get("title"),
+                    source=source.get("source"),
+                    chunk=source.get("chunk"),
+                    offset=source.get("offset"),
+                    page_number=source.get("page_number"),
                 )
             )
         return source_documents
diff --git a/code/backend/batch/utilities/search/IntegratedVectorizationSearchHandler.py b/code/backend/batch/utilities/search/IntegratedVectorizationSearchHandler.py
index 7963ecf5d..429db192d 100644
--- a/code/backend/batch/utilities/search/IntegratedVectorizationSearchHandler.py
+++ b/code/backend/batch/utilities/search/IntegratedVectorizationSearchHandler.py
@@ -1,3 +1,4 @@
+from typing import List
 from .SearchHandlerBase import SearchHandlerBase
 from azure.search.documents import SearchClient
 from azure.search.documents.models import VectorizableTextQuery
@@ -62,7 +63,7 @@ def delete_files(self, files):
 
         return ", ".join(files_to_delete)
 
-    def query_search(self, question):
+    def query_search(self, question) -> List[SourceDocument]:
         vector_query = VectorizableTextQuery(
             text=question,
             k_nearest_neighbors=self.env_helper.AZURE_SEARCH_TOP_K,
@@ -74,18 +75,27 @@ def query_search(self, question):
             vector_queries=[vector_query],
             top=self.env_helper.AZURE_SEARCH_TOP_K,
         )
-        return search_results
+        return self._convert_to_source_documents(search_results)
 
-    def return_answer_source_documents(self, search_results):
+    def _convert_to_source_documents(self, search_results) -> List[SourceDocument]:
         source_documents = []
         for source in search_results:
             source_documents.append(
                 SourceDocument(
-                    id=source.metadata["id"],
-                    content=source.page_content,
-                    title=source.metadata["title"],
-                    source=source.metadata["source"],
-                    chunk_id=source.metadata["chunk_id"],
+                    id=source.get("id"),
+                    content=source.get("content"),
+                    title=source.get("title"),
+                    source=self._extract_source_url(source.get("source")),
+                    chunk_id=source.get("chunk_id"),
                 )
             )
         return source_documents
+
+    def _extract_source_url(self, original_source: str) -> str:
+        matches = list(re.finditer(r"https?://", original_source))
+        if len(matches) > 1:
+            second_http_start = matches[1].start()
+            source_url = original_source[second_http_start:]
+        else:
+            source_url = original_source + "_SAS_TOKEN_PLACEHOLDER_"
+        return source_url
diff --git a/code/backend/batch/utilities/search/Search.py b/code/backend/batch/utilities/search/Search.py
index 76bfd4ef7..836bf146b 100644
--- a/code/backend/batch/utilities/search/Search.py
+++ b/code/backend/batch/utilities/search/Search.py
@@ -2,8 +2,7 @@
 from ..search.IntegratedVectorizationSearchHandler import (
     IntegratedVectorizationSearchHandler,
 )
-from langchain_core.documents import Document
-import re
+from ..common.SourceDocument import SourceDocument
 from ..helpers.EnvHelper import EnvHelper
 
 
@@ -16,39 +15,5 @@ def get_search_handler(env_helper: EnvHelper):
             return AzureSearchHandler(env_helper)
 
     @staticmethod
-    def get_source_documents(search_handler, question):
-        if isinstance(search_handler, IntegratedVectorizationSearchHandler):
-            search_results = search_handler.query_search(question)
-            return Search.generate_source_documents(search_results)
-        else:
-            return search_handler.query_search(question)
-
-    @staticmethod
-    def generate_source_documents(search_results):
-        sources = []
-        for result in search_results:
-            source_url = Search._extract_source_url(result.get("source", ""))
-
-            metadata_dict = {
-                "id": result.get("id", ""),
-                "title": result.get("title", ""),
-                "source": source_url,
-                "chunk_id": result.get("chunk_id", ""),
-            }
-            sources.append(
-                Document(
-                    page_content=result["content"],
-                    metadata=metadata_dict,
-                )
-            )
-        return sources
-
-    @staticmethod
-    def _extract_source_url(original_source):
-        matches = list(re.finditer(r"https?://", original_source))
-        if len(matches) > 1:
-            second_http_start = matches[1].start()
-            source_url = original_source[second_http_start:]
-        else:
-            source_url = original_source + "_SAS_TOKEN_PLACEHOLDER_"
-        return source_url
+    def get_source_documents(search_handler, question) -> list[SourceDocument]:
+        return search_handler.query_search(question)
diff --git a/code/backend/batch/utilities/search/SearchHandlerBase.py b/code/backend/batch/utilities/search/SearchHandlerBase.py
index 69d58441b..ad6d39dc3 100644
--- a/code/backend/batch/utilities/search/SearchHandlerBase.py
+++ b/code/backend/batch/utilities/search/SearchHandlerBase.py
@@ -1,5 +1,7 @@
 from abc import ABC, abstractmethod
 
+from ..common.SourceDocument import SourceDocument
+
 
 class SearchHandlerBase(ABC):
     def __init__(self, env_helper):
@@ -31,9 +33,5 @@ def delete_files(self, files, id_field):
         pass
 
     @abstractmethod
-    def query_search(self, question):
-        pass
-
-    @abstractmethod
-    def return_answer_source_documents(self, search_results):
+    def query_search(self, question) -> list[SourceDocument]:
         pass
diff --git a/code/backend/batch/utilities/tools/QuestionAnswerTool.py b/code/backend/batch/utilities/tools/QuestionAnswerTool.py
index 4e7334ad4..b29fd2f18 100644
--- a/code/backend/batch/utilities/tools/QuestionAnswerTool.py
+++ b/code/backend/batch/utilities/tools/QuestionAnswerTool.py
@@ -1,6 +1,7 @@
 import json
 import logging
-import warnings
+
+from ..common.SourceDocument import SourceDocument
 from ..search.Search import Search
 from .AnsweringToolBase import AnsweringToolBase
 
@@ -14,7 +15,6 @@
     PromptTemplate,
 )
 from langchain_community.callbacks import get_openai_callback
-from langchain_core.documents import Document
 from langchain_core.messages import SystemMessage
 
 from ..helpers.config.ConfigHelper import ConfigHelper
@@ -45,14 +45,14 @@ def json_remove_whitespace(obj: str) -> str:
         except json.JSONDecodeError:
             return obj
 
-    def generate_llm_chain(self, question: str, sources: list[Document]):
+    def generate_llm_chain(self, question: str, sources: list[dict]):
         answering_prompt = PromptTemplate(
             template=self.config.prompts.answering_user_prompt,
             input_variables=["question", "sources"],
         )
 
         sources_text = "\n\n".join(
-            [f"[doc{i+1}]: {source.page_content}" for i, source in enumerate(sources)]
+            [f"[doc{i+1}]: {source.content}" for i, source in enumerate(sources)]
         )
 
         return answering_prompt, {
@@ -64,7 +64,7 @@ def generate_on_your_data_llm_chain(
         self,
         question: str,
         chat_history: list[dict],
-        sources: list[Document],
+        sources: list[SourceDocument],
     ):
         examples = []
 
@@ -83,7 +83,7 @@ def generate_on_your_data_llm_chain(
             if all((few_shot_example.values())):
                 examples.append(few_shot_example)
             else:
-                warnings.warn(
+                logger.warning(
                     "Not all example fields are set in the config. Skipping few-shot example."
                 )
 
@@ -116,7 +116,7 @@ def generate_on_your_data_llm_chain(
         documents = json.dumps(
             {
                 "retrieved_documents": [
-                    {f"[doc{i+1}]": {"content": source.page_content}}
+                    {f"[doc{i+1}]": {"content": source.content}}
                     for i, source in enumerate(sources)
                 ],
             },
@@ -129,18 +129,22 @@ def generate_on_your_data_llm_chain(
             "chat_history": chat_history,
         }
 
-    def answer_question(self, question: str, chat_history: list[dict], **kwargs):
-        sources = Search.get_source_documents(self.search_handler, question)
+    def answer_question(
+        self, question: str, chat_history: list[SourceDocument], **kwargs
+    ):
+        source_documents = Search.get_source_documents(self.search_handler, question)
 
         if self.config.prompts.use_on_your_data_format:
             answering_prompt, input = self.generate_on_your_data_llm_chain(
-                question, chat_history, sources
+                question, chat_history, source_documents
             )
         else:
-            warnings.warn(
+            logger.warning(
                 "Azure OpenAI On Your Data prompt format is recommended and should be enabled in the Admin app.",
             )
-            answering_prompt, input = self.generate_llm_chain(question, sources)
+            answering_prompt, input = self.generate_llm_chain(
+                question, source_documents
+            )
 
         llm_helper = LLMHelper()
 
@@ -155,7 +159,6 @@ def answer_question(self, question: str, chat_history: list[dict], **kwargs):
         logger.debug(f"Answer: {answer}")
 
         # Generate Answer Object
-        source_documents = self.search_handler.return_answer_source_documents(sources)
         clean_answer = Answer(
             question=question,
             answer=answer,
diff --git a/code/tests/common/test_SourceDocument.py b/code/tests/common/test_SourceDocument.py
index 47a83ca83..f175e1953 100644
--- a/code/tests/common/test_SourceDocument.py
+++ b/code/tests/common/test_SourceDocument.py
@@ -8,35 +8,6 @@
 )
 
 
-def test_convert_to_langchain_document():
-    # Given
-    source_document = SourceDocument(
-        id="1",
-        content="Some content",
-        title="A title",
-        source="A source",
-        chunk="A chunk",
-        offset="An offset",
-        page_number="1",
-        chunk_id="abcd",
-    )
-
-    # When
-    langchain_document = source_document.convert_to_langchain_document()
-
-    # Then
-    assert langchain_document.page_content == "Some content"
-    assert langchain_document.metadata == {
-        "id": "1",
-        "source": "A source",
-        "title": "A title",
-        "chunk": "A chunk",
-        "offset": "An offset",
-        "page_number": "1",
-        "chunk_id": "abcd",
-    }
-
-
 def test_get_filename():
     # Given
     source_document = SourceDocument(
diff --git a/code/tests/functional/backend_api/conftest.py b/code/tests/functional/backend_api/conftest.py
index 0d2fa7d9e..2c7bc54e9 100644
--- a/code/tests/functional/backend_api/conftest.py
+++ b/code/tests/functional/backend_api/conftest.py
@@ -55,10 +55,15 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig):
         }
     )
 
+    httpserver.expect_oneshot_request(
+        "/indexes",
+        method="GET",
+    ).respond_with_json({"value": []})
+
     httpserver.expect_request(
-        f"/indexes('{app_config.get('AZURE_SEARCH_INDEX')}')",
+        "/indexes",
         method="GET",
-    ).respond_with_json({}, status=404)
+    ).respond_with_json({"value": [{"name": app_config.get("AZURE_SEARCH_INDEX")}]})
 
     httpserver.expect_request(
         "/indexes",
diff --git a/code/tests/functional/backend_api/tests/default/test_conversation_custom.py b/code/tests/functional/backend_api/tests/default/test_conversation_custom.py
index 48bbb670f..13b113081 100644
--- a/code/tests/functional/backend_api/tests/default/test_conversation_custom.py
+++ b/code/tests/functional/backend_api/tests/default/test_conversation_custom.py
@@ -137,12 +137,12 @@ def test_post_makes_correct_calls_to_openai_embeddings_to_get_vector_dimensions(
                 "Api-Key": app_config.get("AZURE_OPENAI_API_KEY"),
             },
             query_string="api-version=2024-02-01",
-            times=3,
+            times=1,
         ),
     )
 
 
-def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question(
+def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question_to_search(
     app_url: str, app_config: AppConfig, httpserver: HTTPServer
 ):
     # when
@@ -158,7 +158,7 @@ def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question(
                 "input": [
                     [3923, 374, 279, 7438, 315, 2324, 30]
                 ],  # Embedding of "What is the meaning of life?"
-                "model": "text-embedding-ada-002",
+                "model": app_config.get("AZURE_OPENAI_EMBEDDING_MODEL"),
                 "encoding_format": "base64",
             },
             headers={
@@ -168,7 +168,38 @@ def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question(
                 "Api-Key": app_config.get("AZURE_OPENAI_API_KEY"),
             },
             query_string="api-version=2024-02-01",
-            times=2,  # once to use when quering search and once when adding to the conversation log
+            times=1,
+        ),
+    )
+
+
+def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question_to_store_in_conversation_log(
+    app_url: str, app_config: AppConfig, httpserver: HTTPServer
+):
+    # when
+    requests.post(f"{app_url}{path}", json=body)
+
+    # then
+    verify_request_made(
+        mock_httpserver=httpserver,
+        request_matcher=RequestMatcher(
+            path=f"/openai/deployments/{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}/embeddings",
+            method="POST",
+            json={
+                "input": [
+                    [3923, 374, 279, 7438, 315, 2324, 30]
+                ],  # Embedding of "What is the meaning of life?"
+                "model": "text-embedding-ada-002",  # this is harded in the langchain code base
+                "encoding_format": "base64",
+            },
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {app_config.get('AZURE_OPENAI_API_KEY')}",
+                "Api-Key": app_config.get("AZURE_OPENAI_API_KEY"),
+            },
+            query_string="api-version=2024-02-01",
+            times=1,
         ),
     )
 
@@ -290,7 +321,7 @@ def test_post_makes_correct_call_to_openai_chat_completions_with_functions(
     )
 
 
-def test_post_makes_correct_call_to_get_documents_search_index(
+def test_post_makes_correct_call_to_list_search_indexes(
     app_url: str, app_config: AppConfig, httpserver: HTTPServer
 ):
     # when
@@ -300,14 +331,14 @@ def test_post_makes_correct_call_to_get_documents_search_index(
     verify_request_made(
         mock_httpserver=httpserver,
         request_matcher=RequestMatcher(
-            path=f"/indexes('{app_config.get('AZURE_SEARCH_INDEX')}')",
+            path="/indexes",
             method="GET",
             headers={
                 "Accept": "application/json;odata.metadata=minimal",
                 "Api-Key": app_config.get("AZURE_SEARCH_KEY"),
             },
             query_string="api-version=2023-10-01-Preview",
-            times=2,
+            times=3,
         ),
     )
 
@@ -449,7 +480,7 @@ def test_post_makes_correct_call_to_create_documents_search_index(
                     ],
                 },
             },
-            times=2,
+            times=1,
         ),
     )
 
@@ -475,7 +506,7 @@ def test_post_makes_correct_call_to_search_documents_search_index(
                         "kind": "vector",
                         "k": int(app_config.get("AZURE_SEARCH_TOP_K")),
                         "fields": "content_vector",
-                        "vector": [0.9320719838142395, -0.3622731566429138],
+                        "vector": [0.018990106880664825, -0.0073809814639389515],
                     }
                 ],
             },
@@ -598,8 +629,8 @@ def test_post_returns_error_when_downstream_fails(
     httpserver.clear_all_handlers()  # Clear default successful responses
 
     httpserver.expect_request(
-        f"/openai/deployments/{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}/embeddings",
-        method="POST",
+        "/indexes",
+        method="GET",
     ).respond_with_json({}, status=500)
 
     # when
diff --git a/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_with_search_documents_tool.py b/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_with_search_documents_tool.py
index f0f909c85..56ce52b81 100644
--- a/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_with_search_documents_tool.py
+++ b/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_with_search_documents_tool.py
@@ -150,7 +150,7 @@ def test_post_responds_successfully(app_url: str, app_config: AppConfig):
     assert response.headers["Content-Type"] == "application/json"
 
 
-def test_post_makes_correct_call_to_get_search_index(
+def test_post_makes_correct_call_to_list_search_index(
     app_url: str, app_config: AppConfig, httpserver: HTTPServer
 ):
     # when
@@ -160,14 +160,14 @@ def test_post_makes_correct_call_to_get_search_index(
     verify_request_made(
         mock_httpserver=httpserver,
         request_matcher=RequestMatcher(
-            path=f"/indexes('{app_config.get('AZURE_SEARCH_INDEX')}')",
+            path="/indexes",
             method="GET",
             headers={
                 "Accept": "application/json;odata.metadata=minimal",
                 "Api-Key": app_config.get("AZURE_SEARCH_KEY"),
             },
             query_string="api-version=2023-10-01-Preview",
-            times=2,
+            times=3,
         ),
     )
 
@@ -193,7 +193,7 @@ def test_post_makes_correct_call_to_get_search_documents(
                         "kind": "vector",
                         "k": int(app_config.get("AZURE_SEARCH_TOP_K")),
                         "fields": "content_vector",
-                        "vector": [0.9320719838142395, -0.3622731566429138],
+                        "vector": [0.018990106880664825, -0.0073809814639389515],
                     }
                 ],
             },
diff --git a/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_without_tool_call.py b/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_without_tool_call.py
index ade675e9e..afcdb9bbe 100644
--- a/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_without_tool_call.py
+++ b/code/tests/functional/backend_api/tests/sk_orchestrator/test_response_without_tool_call.py
@@ -260,8 +260,8 @@ def test_post_returns_error_when_downstream_fails(
     httpserver.clear_all_handlers()  # Clear default successful responses
 
     httpserver.expect_request(
-        f"/openai/deployments/{app_config.get('AZURE_OPENAI_EMBEDDING_MODEL')}/embeddings",
-        method="POST",
+        "/indexes",
+        method="GET",
     ).respond_with_json({}, status=500)
 
     # when
diff --git a/code/tests/search_utilities/test_AzureSearchHandler.py b/code/tests/search_utilities/test_AzureSearchHandler.py
index bad8eafe8..e6f4977db 100644
--- a/code/tests/search_utilities/test_AzureSearchHandler.py
+++ b/code/tests/search_utilities/test_AzureSearchHandler.py
@@ -1,53 +1,56 @@
 import pytest
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch
 from backend.batch.utilities.search.AzureSearchHandler import AzureSearchHandler
-from langchain_core.documents import Document
 import json
+from azure.search.documents.models import VectorizedQuery
+
+from backend.batch.utilities.common.SourceDocument import SourceDocument
 
 
 @pytest.fixture
 def env_helper_mock():
     mock = Mock()
-    mock.AZURE_SEARCH_SERVICE = "https://example.search.windows.net"
-    mock.AZURE_SEARCH_INDEX = "example-index"
-    mock.AZURE_SEARCH_KEY = "example-key"
-    mock.is_auth_type_keys = Mock(return_value=True)
+    mock.AZURE_SEARCH_USE_SEMANTIC_SEARCH = False
+    mock.AZURE_SEARCH_TOP_K = 3
+    mock.AZURE_SEARCH_FILTER = "some-search-filter"
     return mock
 
 
 @pytest.fixture
-def mock_azure_search_helper():
+def mock_search_client():
     with patch(
         "backend.batch.utilities.search.AzureSearchHandler.AzureSearchHelper"
     ) as mock:
-        vector_store = mock.return_value.get_vector_store.return_value.client
-        yield vector_store
+        search_client = mock.return_value.get_search_client.return_value
+        yield search_client
 
 
 @pytest.fixture
-def mock_vector_store():
-    with patch(
-        "backend.batch.utilities.search.AzureSearchHandler.AzureSearchHelper"
-    ) as mock:
-        vector_store = mock.return_value.get_vector_store.return_value
-        yield vector_store
+def mock_llm_helper():
+    with patch("backend.batch.utilities.search.AzureSearchHandler.LLMHelper") as mock:
+        mock_llm_helper = mock.return_value
+        yield mock_llm_helper
 
 
 @pytest.fixture
-def handler(env_helper_mock, mock_azure_search_helper):
+def handler(env_helper_mock, mock_search_client, mock_llm_helper):
     with patch(
         "backend.batch.utilities.search.AzureSearchHandler.AzureSearchHelper",
-        return_value=mock_azure_search_helper,
+        return_value=mock_search_client,
     ):
-        return AzureSearchHandler(env_helper_mock)
+        with patch(
+            "backend.batch.utilities.search.AzureSearchHandler.LLMHelper",
+            return_value=mock_llm_helper,
+        ):
+            return AzureSearchHandler(env_helper_mock)
 
 
-def test_create_search_client(handler, mock_azure_search_helper):
+def test_create_search_client(handler, mock_search_client):
     # when
     search_client = handler.create_search_client()
 
     # then
-    assert search_client == mock_azure_search_helper
+    assert search_client == mock_search_client
 
 
 def test_process_results(handler):
@@ -112,40 +115,131 @@ def test_get_files(handler):
     )
 
 
-def test_query_search(handler, mock_vector_store):
+@patch("backend.batch.utilities.search.AzureSearchHandler.tiktoken")
+def test_query_search_uses_tiktoken_encoder(mock_tiktoken, handler, mock_llm_helper):
     # given
     question = "What is the answer?"
 
+    mock_encoder = MagicMock()
+    mock_tiktoken.get_encoding.return_value = mock_encoder
+    mock_encoder.encode.return_value = [1, 2, 3]
+
     # when
-    result = handler.query_search(question)
+    handler.query_search(question)
 
     # then
-    mock_vector_store.similarity_search.assert_called_once_with(
-        query=question,
-        k=handler.env_helper.AZURE_SEARCH_TOP_K,
-        filters=handler.env_helper.AZURE_SEARCH_FILTER,
+    mock_tiktoken.get_encoding.assert_called_once_with("cl100k_base")
+    mock_encoder.encode.assert_called_once_with(question)
+    mock_llm_helper.generate_embeddings.assert_called_once_with([1, 2, 3])
+
+
+def test_query_search_performs_hybrid_search(handler, mock_llm_helper):
+    # given
+    question = "What is the answer?"
+
+    mock_llm_helper.generate_embeddings.return_value = [1, 2, 3]
+
+    # when
+    handler.query_search(question)
+
+    # then
+    handler.search_client.search.assert_called_once_with(
+        search_text=question,
+        vector_queries=[
+            VectorizedQuery(
+                vector=[1, 2, 3],
+                k_nearest_neighbors=handler.env_helper.AZURE_SEARCH_TOP_K,
+                filter=handler.env_helper.AZURE_SEARCH_FILTER,
+                fields="content_vector",
+            )
+        ],
+        filter=handler.env_helper.AZURE_SEARCH_FILTER,
+        top=handler.env_helper.AZURE_SEARCH_TOP_K,
     )
-    assert result == mock_vector_store.similarity_search.return_value
 
 
-def test_return_answer_source_documents(handler):
+def test_query_search_performs_semantic_search(
+    handler, mock_llm_helper, env_helper_mock
+):
     # given
-    document = Document("mock content")
-    document.metadata = {
-        "id": "mock id",
-        "title": "mock title",
-        "source": "mock source",
-        "chunk": "mock chunk",
-        "offset": "mock offset",
-        "page_number": "mock page number",
-    }
-    documents = [document]
+    question = "What is the answer?"
+
+    mock_llm_helper.generate_embeddings.return_value = [1, 2, 3]
+    env_helper_mock.AZURE_SEARCH_USE_SEMANTIC_SEARCH = True
+    env_helper_mock.AZURE_SEARCH_SEMANTIC_CONFIG_NAME = "some-semantic-config"
+
+    # when
+    handler.query_search(question)
+
+    # then
+    handler.search_client.search.assert_called_once_with(
+        search_text=question,
+        vector_queries=[
+            VectorizedQuery(
+                vector=[1, 2, 3],
+                k_nearest_neighbors=handler.env_helper.AZURE_SEARCH_TOP_K,
+                fields="content_vector",
+            )
+        ],
+        filter=handler.env_helper.AZURE_SEARCH_FILTER,
+        query_type="semantic",
+        semantic_configuration_name=handler.env_helper.AZURE_SEARCH_SEMANTIC_CONFIG_NAME,
+        query_caption="extractive",
+        query_answer="extractive",
+        top=handler.env_helper.AZURE_SEARCH_TOP_K,
+    )
+
+
+def test_query_search_converts_results_to_source_documents(
+    handler,
+):
+    # given
+    question = "What is the answer?"
+
+    handler.search_client.search.return_value = [
+        {
+            "id": 1,
+            "content": "content1",
+            "title": "title1",
+            "source": "source1",
+            "chunk": "chunk1",
+            "offset": "offset1",
+            "page_number": "page_number1",
+        },
+        {
+            "id": 2,
+            "content": "content2",
+            "title": "title2",
+            "source": "source2",
+            "chunk": "chunk2",
+            "offset": "offset2",
+            "page_number": "page_number2",
+        },
+    ]
+
+    expected_results = [
+        SourceDocument(
+            id=1,
+            content="content1",
+            title="title1",
+            source="source1",
+            chunk="chunk1",
+            offset="offset1",
+            page_number="page_number1",
+        ),
+        SourceDocument(
+            id=2,
+            content="content2",
+            title="title2",
+            source="source2",
+            chunk="chunk2",
+            offset="offset2",
+            page_number="page_number2",
+        ),
+    ]
+
     # when
-    source_documents = handler.return_answer_source_documents(documents)
+    actual_results = handler.query_search(question)
 
     # then
-    assert len(source_documents) == 1
-    assert source_documents[0].id == "mock id"
-    assert source_documents[0].content == "mock content"
-    assert source_documents[0].title == "mock title"
-    assert source_documents[0].source == "mock source"
+    assert actual_results == expected_results
diff --git a/code/tests/search_utilities/test_IntegratedVectorizationSearchHandler.py b/code/tests/search_utilities/test_IntegratedVectorizationSearchHandler.py
index ce7c24e68..15f498dcd 100644
--- a/code/tests/search_utilities/test_IntegratedVectorizationSearchHandler.py
+++ b/code/tests/search_utilities/test_IntegratedVectorizationSearchHandler.py
@@ -4,7 +4,8 @@
     IntegratedVectorizationSearchHandler,
 )
 from azure.search.documents.models import VectorizableTextQuery
-from langchain_core.documents import Document
+
+from backend.batch.utilities.common.SourceDocument import SourceDocument
 
 
 @pytest.fixture
@@ -114,7 +115,7 @@ def test_get_files(handler, search_client_mock):
     )
 
 
-def test_query_search(handler, env_helper_mock):
+def test_query_search_performs_search(handler, env_helper_mock):
     # given
     question = "test question"
     vector_query = VectorizableTextQuery(
@@ -125,7 +126,7 @@ def test_query_search(handler, env_helper_mock):
     )
 
     # when
-    result = handler.query_search(question)
+    handler.query_search(question)
 
     # then
     handler.search_client.search.assert_called_once_with(
@@ -133,26 +134,48 @@ def test_query_search(handler, env_helper_mock):
         vector_queries=[vector_query],
         top=env_helper_mock.AZURE_SEARCH_TOP_K,
     )
-    assert result == handler.search_client.search.return_value
 
 
-def test_return_answer_source_documents(handler):
+def test_query_search_converts_results_to_source_documents(handler):
     # given
-    document = Document("mock content")
-    document.metadata = {
-        "id": "mock id",
-        "title": "mock title",
-        "source": "mock source",
-        "chunk_id": "abcd_page_1",
-    }
-    documents = [document]
+    question = "test question"
+
+    handler.search_client.search.return_value = [
+        {
+            "id": 1,
+            "content": "content1",
+            "title": "title1",
+            "source": "https://example.com/http://example.com",
+            "chunk_id": "chunk_id1",
+        },
+        {
+            "id": 2,
+            "content": "content2",
+            "title": "title2",
+            "source": "https://example.com",
+            "chunk_id": "chunk_id2",
+        },
+    ]
+
+    expected_results = [
+        SourceDocument(
+            id=1,
+            content="content1",
+            title="title1",
+            source="http://example.com",
+            chunk_id="chunk_id1",
+        ),
+        SourceDocument(
+            id=2,
+            content="content2",
+            title="title2",
+            source="https://example.com_SAS_TOKEN_PLACEHOLDER_",
+            chunk_id="chunk_id2",
+        ),
+    ]
+
     # when
-    source_documents = handler.return_answer_source_documents(documents)
+    actual_results = handler.query_search(question)
 
     # then
-    assert len(source_documents) == 1
-    assert source_documents[0].id == "mock id"
-    assert source_documents[0].content == "mock content"
-    assert source_documents[0].title == "mock title"
-    assert source_documents[0].source == "mock source"
-    assert source_documents[0].chunk_id == "abcd_page_1"
+    assert actual_results == expected_results
diff --git a/code/tests/search_utilities/test_Search.py b/code/tests/search_utilities/test_Search.py
index 8b12a5565..f4cd08227 100644
--- a/code/tests/search_utilities/test_Search.py
+++ b/code/tests/search_utilities/test_Search.py
@@ -4,6 +4,7 @@
 from backend.batch.utilities.search.IntegratedVectorizationSearchHandler import (
     IntegratedVectorizationSearchHandler,
 )
+from backend.batch.utilities.common.SourceDocument import SourceDocument
 
 
 @pytest.fixture
@@ -72,72 +73,30 @@ def test_get_source_documents_azure_search(search_handler_mock: MagicMock):
     # given
     question = "example question"
 
-    search_results = [
-        {
-            "id": 1,
-            "title": "Example Title 1",
-            "source": "https://example.com/1",
-            "chunk_id": "chunk1",
-        },
-        {
-            "id": 2,
-            "title": "Example Title 2",
-            "source": "https://example.com/2",
-            "chunk_id": "chunk2",
-        },
-    ]
-    search_handler_mock.query_search.return_value = search_results
-
-    # when
-    source_documents = Search.get_source_documents(search_handler_mock, question)
-
-    # then
-    assert len(source_documents) == len(search_results)
-
-
-def test_generate_source_documents():
-    # given
-    search_results = [
-        {
-            "id": 1,
-            "title": "Example Title 1",
-            "source": "https://example.com/1",
-            "chunk_id": "chunk1",
-            "content": "mock content 1",
-        },
-        {
-            "id": 2,
-            "title": "Example Title 2",
-            "source": "https://example.com/2",
-            "chunk_id": "chunk2",
-            "content": "mock content 2",
-        },
+    expected_source_documents = [
+        SourceDocument(
+            id=1,
+            content="content1",
+            title="title1",
+            source="source1",
+            chunk="chunk1",
+            offset="offset1",
+            page_number="page_number1",
+        ),
+        SourceDocument(
+            id=2,
+            content="content2",
+            title="title2",
+            source="source2",
+            chunk="chunk2",
+            offset="offset2",
+            page_number="page_number2",
+        ),
     ]
+    search_handler_mock.query_search.return_value = expected_source_documents
 
     # when
-    source_documents = Search.generate_source_documents(search_results)
-
-    # then
-    assert len(source_documents) == len(search_results)
-
-
-def test__extract_source_url_multiple_http():
-    # given
-    original_source = "https://example.com/http://example.com"
-
-    # when
-    source_url = Search._extract_source_url(original_source)
-
-    # then
-    assert source_url == "http://example.com"
-
-
-def test__extract_source_url_single_http():
-    # given
-    original_source = "https://example.com"
-
-    # when
-    source_url = Search._extract_source_url(original_source)
+    actual_source_documents = Search.get_source_documents(search_handler_mock, question)
 
     # then
-    assert source_url == "https://example.com_SAS_TOKEN_PLACEHOLDER_"
+    assert len(actual_source_documents) == len(expected_source_documents)
diff --git a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
index d7412d894..b424e5439 100644
--- a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
+++ b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
@@ -3,8 +3,13 @@
 from backend.batch.utilities.helpers.embedders.IntegratedVectorizationEmbedder import (
     IntegratedVectorizationEmbedder,
 )
+from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings
+from backend.batch.utilities.document_loading import LoadingSettings
+from backend.batch.utilities.document_loading.Strategies import LoadingStrategy
 
 AZURE_SEARCH_INDEXER_NAME = "mock-indexer-name"
+CHUNKING_SETTINGS = ChunkingSettings({"strategy": "layout", "size": 1, "overlap": 0})
+LOADING_SETTINGS = LoadingSettings({"strategy": LoadingStrategy.LAYOUT})
 
 
 @pytest.fixture(autouse=True)
@@ -27,8 +32,16 @@ def llm_helper_mock():
         llm_helper.get_embedding_model.return_value.embed_query.return_value = [
             0
         ] * 1536
+        llm_helper.generate_embeddings.return_value = [123]
+        yield mock
+
 
-        yield llm_helper
+@pytest.fixture(autouse=True)
+def azure_search_helper_mock():
+    with patch(
+        "backend.batch.utilities.helpers.DocumentProcessorHelper.AzureSearchHelper"
+    ) as mock:
+        yield mock
 
 
 @pytest.fixture(autouse=True)
@@ -94,7 +107,7 @@ def test_process_using_integrated_vectorization(
     azure_search_iv_datasource_helper_mock.return_value.create_or_update_datasource.assert_called_once()
 
     azure_search_iv_index_helper_mock.assert_called_once_with(
-        env_helper_mock, llm_helper_mock
+        env_helper_mock, llm_helper_mock.return_value
     )
     azure_search_iv_index_helper_mock.return_value.create_or_update_index.assert_called_once()
 
diff --git a/code/tests/utilities/helpers/test_AzureSearchHelper.py b/code/tests/utilities/helpers/test_AzureSearchHelper.py
index 84252e835..6c0b3b4a1 100644
--- a/code/tests/utilities/helpers/test_AzureSearchHelper.py
+++ b/code/tests/utilities/helpers/test_AzureSearchHelper.py
@@ -1,6 +1,25 @@
 import pytest
 from unittest.mock import ANY, MagicMock, patch
 from backend.batch.utilities.helpers.AzureSearchHelper import AzureSearchHelper
+from azure.search.documents.indexes.models import (
+    ExhaustiveKnnAlgorithmConfiguration,
+    ExhaustiveKnnParameters,
+    HnswAlgorithmConfiguration,
+    HnswParameters,
+    SearchableField,
+    SearchField,
+    SearchFieldDataType,
+    SearchIndex,
+    SemanticConfiguration,
+    SemanticField,
+    SemanticPrioritizedFields,
+    SemanticSearch,
+    SimpleField,
+    VectorSearch,
+    VectorSearchAlgorithmKind,
+    VectorSearchAlgorithmMetric,
+    VectorSearchProfile,
+)
 
 AZURE_AUTH_TYPE = "keys"
 AZURE_SEARCH_KEY = "mock-key"
@@ -12,7 +31,7 @@
 
 
 @pytest.fixture(autouse=True)
-def AzureSearchMock():
+def azure_search_mock():
     with patch("backend.batch.utilities.helpers.AzureSearchHelper.AzureSearch") as mock:
         yield mock
 
@@ -44,58 +63,230 @@ def env_helper_mock():
             AZURE_SEARCH_CONVERSATIONS_LOG_INDEX
         )
 
+        env_helper.is_auth_type_keys.return_value = True
+
         yield env_helper
 
 
-def test_get_vector_store_keys(AzureSearchMock: MagicMock, llm_helper_mock: MagicMock):
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.AzureKeyCredential")
+def test_creates_search_clients_with_keys(
+    azure_key_credential_mock: MagicMock,
+    search_index_client_mock: MagicMock,
+    search_client_mock: MagicMock,
+):
+    # when
+    AzureSearchHelper()
+
+    # then
+    azure_key_credential_mock.assert_called_once_with(AZURE_SEARCH_KEY)
+    search_client_mock.assert_called_once_with(
+        endpoint=AZURE_SEARCH_SERVICE,
+        index_name=AZURE_SEARCH_INDEX,
+        credential=azure_key_credential_mock.return_value,
+    )
+    search_index_client_mock.assert_called_once_with(
+        endpoint=AZURE_SEARCH_SERVICE, credential=azure_key_credential_mock.return_value
+    )
+
+
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.DefaultAzureCredential")
+def test_creates_search_clients_with_rabc(
+    default_azure_credential_mock: MagicMock,
+    search_index_client_mock: MagicMock,
+    search_client_mock: MagicMock,
+    env_helper_mock: MagicMock,
+):
     # given
-    azure_search_helper = AzureSearchHelper()
+    env_helper_mock.is_auth_type_keys.return_value = False
 
     # when
-    vector_store = azure_search_helper.get_vector_store()
+    AzureSearchHelper()
 
     # then
-    assert vector_store == AzureSearchMock.return_value
-
-    AzureSearchMock.assert_called_once_with(
-        azure_search_endpoint=AZURE_SEARCH_SERVICE,
-        azure_search_key=AZURE_SEARCH_KEY,
+    default_azure_credential_mock.assert_called_once_with()
+    search_client_mock.assert_called_once_with(
+        endpoint=AZURE_SEARCH_SERVICE,
         index_name=AZURE_SEARCH_INDEX,
-        embedding_function=llm_helper_mock.get_embedding_model.return_value.embed_query,
-        fields=ANY,
-        search_type="hybrid",
-        semantic_configuration_name=AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG,
-        user_agent="langchain chatwithyourdata-sa",
+        credential=default_azure_credential_mock.return_value,
+    )
+    search_index_client_mock.assert_called_once_with(
+        endpoint=AZURE_SEARCH_SERVICE,
+        credential=default_azure_credential_mock.return_value,
     )
 
 
-def test_get_vector_store_rbac(
-    AzureSearchMock: MagicMock, llm_helper_mock: MagicMock, env_helper_mock: MagicMock
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient")
+def test_returns_search_client(
+    search_index_client_mock: MagicMock, search_client_mock: MagicMock
 ):
     # given
-    env_helper_mock.AZURE_AUTH_TYPE = "rbac"
     azure_search_helper = AzureSearchHelper()
 
     # when
-    vector_store = azure_search_helper.get_vector_store()
+    search_client = azure_search_helper.get_search_client()
 
     # then
-    assert vector_store == AzureSearchMock.return_value
+    assert search_client is search_client_mock.return_value
 
-    AzureSearchMock.assert_called_once_with(
-        azure_search_endpoint=AZURE_SEARCH_SERVICE,
-        azure_search_key=None,
-        index_name=AZURE_SEARCH_INDEX,
-        embedding_function=llm_helper_mock.get_embedding_model.return_value.embed_query,
-        fields=ANY,
-        search_type="hybrid",
-        semantic_configuration_name=AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG,
-        user_agent="langchain chatwithyourdata-sa",
+
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient")
+def test_creates_search_index_if_not_exists(
+    search_index_client_mock: MagicMock, search_client_mock: MagicMock
+):
+    # given
+    search_index_client_mock.return_value.list_index_names.return_value = [
+        "some-irrelevant-index"
+    ]
+
+    fields = [
+        SimpleField(
+            name="id",
+            type=SearchFieldDataType.String,
+            key=True,
+            filterable=True,
+        ),
+        SearchableField(
+            name="content",
+            type=SearchFieldDataType.String,
+        ),
+        SearchField(
+            name="content_vector",
+            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+            searchable=True,
+            vector_search_dimensions=1536,
+            vector_search_profile_name="myHnswProfile",
+        ),
+        SearchableField(
+            name="metadata",
+            type=SearchFieldDataType.String,
+        ),
+        SearchableField(
+            name="title",
+            type=SearchFieldDataType.String,
+            facetable=True,
+            filterable=True,
+        ),
+        SearchableField(
+            name="source",
+            type=SearchFieldDataType.String,
+            filterable=True,
+        ),
+        SimpleField(
+            name="chunk",
+            type=SearchFieldDataType.Int32,
+            filterable=True,
+        ),
+        SimpleField(
+            name="offset",
+            type=SearchFieldDataType.Int32,
+            filterable=True,
+        ),
+    ]
+
+    expected_index = SearchIndex(
+        name=AZURE_SEARCH_INDEX,
+        fields=fields,
+        semantic_search=(
+            SemanticSearch(
+                configurations=[
+                    SemanticConfiguration(
+                        name=AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG,
+                        prioritized_fields=SemanticPrioritizedFields(
+                            title_field=None,
+                            content_fields=[SemanticField(field_name="content")],
+                        ),
+                    )
+                ]
+            )
+        ),
+        vector_search=VectorSearch(
+            algorithms=[
+                HnswAlgorithmConfiguration(
+                    name="default",
+                    parameters=HnswParameters(
+                        metric=VectorSearchAlgorithmMetric.COSINE
+                    ),
+                    kind=VectorSearchAlgorithmKind.HNSW,
+                ),
+                ExhaustiveKnnAlgorithmConfiguration(
+                    name="default_exhaustive_knn",
+                    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
+                    parameters=ExhaustiveKnnParameters(
+                        metric=VectorSearchAlgorithmMetric.COSINE
+                    ),
+                ),
+            ],
+            profiles=[
+                VectorSearchProfile(
+                    name="myHnswProfile",
+                    algorithm_configuration_name="default",
+                ),
+                VectorSearchProfile(
+                    name="myExhaustiveKnnProfile",
+                    algorithm_configuration_name="default_exhaustive_knn",
+                ),
+            ],
+        ),
     )
 
+    # when
+    AzureSearchHelper()
+
+    # then
+    search_index_client_mock.return_value.create_index.assert_called_once_with(
+        expected_index
+    )
+
+
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient")
+def test_does_not_create_search_index_if_it_exists(
+    search_index_client_mock: MagicMock,
+    search_client_mock: MagicMock,
+):
+    # given
+    search_index_client_mock.return_value.list_index_names.return_value = [
+        AZURE_SEARCH_INDEX
+    ]
+
+    # when
+    AzureSearchHelper()
+
+    # then
+    search_index_client_mock.return_value.create_index.assert_not_called()
+
+
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient")
+def test_propogates_exceptions_when_creating_search_index(
+    search_index_client_mock: MagicMock,
+    search_client_mock: MagicMock,
+):
+    # given
+    expected_exception = Exception()
+    search_index_client_mock.return_value.create_index.side_effect = expected_exception
+
+    # when
+    with pytest.raises(Exception) as exc_info:
+        AzureSearchHelper()
+
+    # then
+    assert exc_info.value == expected_exception
+
 
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient")
 def test_get_conversation_logger_keys(
-    AzureSearchMock: MagicMock, llm_helper_mock: MagicMock
+    search_index_client_mock: MagicMock,
+    search_client_mock: MagicMock,
+    azure_search_mock: MagicMock,
+    llm_helper_mock: MagicMock,
 ):
     # given
     azure_search_helper = AzureSearchHelper()
@@ -104,9 +295,9 @@ def test_get_conversation_logger_keys(
     conversation_logger = azure_search_helper.get_conversation_logger()
 
     # then
-    assert conversation_logger == AzureSearchMock.return_value
+    assert conversation_logger == azure_search_mock.return_value
 
-    AzureSearchMock.assert_called_once_with(
+    azure_search_mock.assert_called_once_with(
         azure_search_endpoint=AZURE_SEARCH_SERVICE,
         azure_search_key=AZURE_SEARCH_KEY,
         index_name=AZURE_SEARCH_CONVERSATIONS_LOG_INDEX,
@@ -116,20 +307,26 @@ def test_get_conversation_logger_keys(
     )
 
 
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchClient")
+@patch("backend.batch.utilities.helpers.AzureSearchHelper.SearchIndexClient")
 def test_get_conversation_logger_rbac(
-    AzureSearchMock: MagicMock, llm_helper_mock: MagicMock, env_helper_mock: MagicMock
+    search_index_client_mock: MagicMock,
+    search_client_mock: MagicMock,
+    azure_search_mock: MagicMock,
+    llm_helper_mock: MagicMock,
+    env_helper_mock: MagicMock,
 ):
     # given
-    env_helper_mock.AZURE_AUTH_TYPE = "rbac"
+    env_helper_mock.is_auth_type_keys.return_value = False
     azure_search_helper = AzureSearchHelper()
 
     # when
     conversation_logger = azure_search_helper.get_conversation_logger()
 
     # then
-    assert conversation_logger == AzureSearchMock.return_value
+    assert conversation_logger == azure_search_mock.return_value
 
-    AzureSearchMock.assert_called_once_with(
+    azure_search_mock.assert_called_once_with(
         azure_search_endpoint=AZURE_SEARCH_SERVICE,
         azure_search_key=None,
         index_name=AZURE_SEARCH_CONVERSATIONS_LOG_INDEX,
diff --git a/code/tests/utilities/helpers/test_LLMHelper.py b/code/tests/utilities/helpers/test_LLMHelper.py
index 43ab357f7..db93fc169 100644
--- a/code/tests/utilities/helpers/test_LLMHelper.py
+++ b/code/tests/utilities/helpers/test_LLMHelper.py
@@ -3,6 +3,9 @@
 import pytest
 from backend.batch.utilities.helpers.LLMHelper import LLMHelper
 from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
+from openai.types.create_embedding_response import CreateEmbeddingResponse
+from openai.types.embedding import Embedding
+
 
 AZURE_OPENAI_ENDPOINT = "https://mock-endpoint"
 AZURE_OPENAI_API_VERSION = "mock-api-version"
@@ -27,6 +30,12 @@ def env_helper_mock():
         yield env_helper
 
 
+@pytest.fixture(autouse=True)
+def azure_openai_mock():
+    with patch("backend.batch.utilities.helpers.LLMHelper.AzureOpenAI") as mock:
+        yield mock
+
+
 @patch("backend.batch.utilities.helpers.LLMHelper.AzureChatCompletion")
 def test_get_sk_chat_completion_service_keys(AzureChatCompletionMock: MagicMock):
     # given
@@ -83,3 +92,38 @@ def test_get_sk_service_settings():
     assert settings.service_id == "mock-service-id"
     assert settings.temperature == 0
     assert settings.max_tokens == int(AZURE_OPENAI_MAX_TOKENS)
+
+
+def test_generate_embeddings_embeds_input(azure_openai_mock):
+    # given
+    llm_helper = LLMHelper()
+
+    # when
+    llm_helper.generate_embeddings("some input")
+
+    # then
+    azure_openai_mock.return_value.embeddings.create.assert_called_once_with(
+        input=["some input"], model=AZURE_OPENAI_EMBEDDING_MODEL
+    )
+
+
+def test_generate_embeddings_returns_embeddings(azure_openai_mock):
+    # given
+    llm_helper = LLMHelper()
+    expected_embeddings = [1, 2, 3]
+    azure_openai_mock.return_value.embeddings.create.return_value = (
+        CreateEmbeddingResponse(
+            data=[
+                Embedding(embedding=expected_embeddings, index=0, object="embedding")
+            ],
+            model="mock-model",
+            object="list",
+            usage={"prompt_tokens": 0, "total_tokens": 0},
+        )
+    )
+
+    # when
+    actual_embeddings = llm_helper.generate_embeddings("some input")
+
+    # then
+    assert actual_embeddings == expected_embeddings
diff --git a/code/tests/utilities/helpers/test_PushEmbedder.py b/code/tests/utilities/helpers/test_PushEmbedder.py
index 561096edc..0cbe18088 100644
--- a/code/tests/utilities/helpers/test_PushEmbedder.py
+++ b/code/tests/utilities/helpers/test_PushEmbedder.py
@@ -1,15 +1,28 @@
+import json
 import pytest
-from unittest.mock import MagicMock, patch
-from backend.batch.utilities.helpers.embedders.PushEmbedder import (
-    PushEmbedder,
-)
+from unittest.mock import MagicMock, call, patch
+from backend.batch.utilities.helpers.embedders.PushEmbedder import PushEmbedder
+from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings
+from backend.batch.utilities.document_loading import LoadingSettings
+from backend.batch.utilities.document_loading.Strategies import LoadingStrategy
+from backend.batch.utilities.common.SourceDocument import SourceDocument
 from backend.batch.utilities.helpers.config.EmbeddingConfig import EmbeddingConfig
 
-from backend.batch.utilities.helpers.DocumentLoadingHelper import DocumentLoading
-from backend.batch.utilities.helpers.DocumentChunkingHelper import DocumentChunking
-from backend.batch.utilities.common.SourceDocument import SourceDocument
+CHUNKING_SETTINGS = ChunkingSettings({"strategy": "layout", "size": 1, "overlap": 0})
+LOADING_SETTINGS = LoadingSettings({"strategy": LoadingStrategy.LAYOUT})
 
-AZURE_SEARCH_INDEXER_NAME = "mock-indexer-name"
+
+@pytest.fixture(autouse=True)
+def llm_helper_mock():
+    with patch(
+        "backend.batch.utilities.helpers.embedders.PushEmbedder.LLMHelper"
+    ) as mock:
+        llm_helper = mock.return_value
+        llm_helper.get_embedding_model.return_value.embed_query.return_value = [
+            0
+        ] * 1536
+        llm_helper.generate_embeddings.return_value = [123]
+        yield mock
 
 
 @pytest.fixture(autouse=True)
@@ -26,72 +39,208 @@ def mock_config_helper():
         "backend.batch.utilities.helpers.embedders.PushEmbedder.ConfigHelper"
     ) as mock:
         config_helper = mock.get_active_config_or_default.return_value
+        config_helper.document_processors = [
+            EmbeddingConfig(
+                "jpg",
+                CHUNKING_SETTINGS,
+                LOADING_SETTINGS,
+                use_advanced_image_processing=True,
+            ),
+            EmbeddingConfig(
+                "pdf",
+                CHUNKING_SETTINGS,
+                LOADING_SETTINGS,
+                use_advanced_image_processing=False,
+            ),
+        ]
         yield config_helper
 
 
-def test_process_use_advanced_image_processing_skips_processing(
-    azure_search_helper_mock, mock_config_helper
+@pytest.fixture(autouse=True)
+def document_loading_mock():
+    with patch(
+        "backend.batch.utilities.helpers.embedders.PushEmbedder.DocumentLoading"
+    ) as mock:
+        expected_documents = [
+            SourceDocument(content="some content", source="some source")
+        ]
+        mock.return_value.load.return_value = expected_documents
+        yield mock
+
+
+@pytest.fixture(autouse=True)
+def document_chunking_mock():
+    with patch(
+        "backend.batch.utilities.helpers.embedders.PushEmbedder.DocumentChunking"
+    ) as mock:
+        expected_chunked_documents = [
+            SourceDocument(
+                content="some content",
+                source="some source",
+                id="some id",
+                title="some-title",
+                offset=1,
+                chunk=1,
+                page_number=1,
+                chunk_id="some chunk id",
+            ),
+            SourceDocument(
+                content="some other content",
+                source="some other source",
+                id="some other id",
+                title="some other-title",
+                offset=2,
+                chunk=2,
+                page_number=2,
+                chunk_id="some other chunk id",
+            ),
+        ]
+        mock.return_value.chunk.return_value = expected_chunked_documents
+        yield mock
+
+
+def test_embed_file_use_advanced_image_processing_skips_processing(
+    azure_search_helper_mock,
 ):
     # given
-    vector_store_mock = MagicMock()
-    azure_search_helper_mock.return_value.get_vector_store.return_value = (
-        vector_store_mock
-    )
-    push_embedder = PushEmbedder(None)
-    processor = EmbeddingConfig("pdf", None, None, use_advanced_image_processing=True)
+    push_embedder = PushEmbedder(MagicMock())
 
     # when
-    push_embedder._PushEmbedder__embed("some-url", processor)
+    push_embedder.embed_file("some-url", "some-file-name.jpg")
 
     # then
-    vector_store_mock.add_documents.assert_not_called()
+    azure_search_helper_mock.return_value.get_search_client.assert_not_called()
 
 
-def test_process_with_non_advanced_image_processing_adds_documents_to_vector_store(
-    azure_search_helper_mock, mock_config_helper
-):
+def test_embed_file_loads_documents(document_loading_mock):
     # given
-    vector_store_mock = MagicMock()
-    azure_search_helper_mock.return_value.get_vector_store.return_value = (
-        vector_store_mock
-    )
-    push_embedder = PushEmbedder(None)
-    processor = EmbeddingConfig("pdf", None, None, use_advanced_image_processing=False)
+    push_embedder = PushEmbedder(MagicMock())
     source_url = "some-url"
-    documents = [
-        SourceDocument("1", "document1", "content1"),
-        SourceDocument("2", "document2", "content2"),
-    ]
 
-    with patch.object(
-        DocumentLoading, "load", return_value=documents
-    ) as load_mock, patch.object(
-        DocumentChunking, "chunk", return_value=documents
-    ) as chunk_mock:
+    # when
+    push_embedder.embed_file(
+        source_url,
+        "some-file-name.pdf",
+    )
+
+    # then
+    document_loading_mock.return_value.load.assert_called_once_with(
+        source_url, LOADING_SETTINGS
+    )
+
+
+def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mock):
+    # given
+    push_embedder = PushEmbedder(MagicMock())
+
+    # when
+    push_embedder.embed_file(
+        "some-url",
+        "some-file-name.pdf",
+    )
+
+    # then
+    document_chunking_mock.return_value.chunk.assert_called_once_with(
+        document_loading_mock.return_value.load.return_value, CHUNKING_SETTINGS
+    )
 
-        # when
-        push_embedder._PushEmbedder__embed(source_url, processor)
 
-        # then
-        load_mock.assert_called_once_with(source_url, processor.loading)
-        chunk_mock.assert_called_once_with(documents, processor.chunking)
+def test_embed_file_generates_embeddings_for_documents(llm_helper_mock):
+    # given
+    push_embedder = PushEmbedder(MagicMock())
 
+    # when
+    push_embedder.embed_file(
+        "some-url",
+        "some-file-name.pdf",
+    )
 
-def test_process_file_with_non_url_extension_processes_and_adds_metadata(
-    azure_search_helper_mock, mock_config_helper
+    # then
+    llm_helper_mock.return_value.generate_embeddings.assert_has_calls(
+        [call("some content"), call("some other content")]
+    )
+
+
+def test_embed_file_stores_documents_in_search_index(
+    document_chunking_mock,
+    llm_helper_mock,
+    azure_search_helper_mock,
 ):
     # given
-    vector_store_mock = MagicMock()
-    azure_search_helper_mock.return_value.get_vector_store.return_value = (
-        vector_store_mock
+    push_embedder = PushEmbedder(MagicMock())
+
+    # when
+    push_embedder.embed_file(
+        "some-url",
+        "some-file-name.pdf",
     )
-    push_embedder = PushEmbedder(blob_client=MagicMock())
-    source_url = "some-url"
-    file_name = "sample.pdf"
 
-    with patch.object(push_embedder, "_PushEmbedder__embed") as embed_mock:
-        # when
-        push_embedder.embed_file(source_url, file_name)
+    # then
+    expected_chunked_documents = document_chunking_mock.return_value.chunk.return_value
+    azure_search_helper_mock.return_value.get_search_client.return_value.upload_documents.assert_called_once_with(
+        [
+            {
+                "id": expected_chunked_documents[0].id,
+                "content": expected_chunked_documents[0].content,
+                "content_vector": llm_helper_mock.return_value.generate_embeddings.return_value,
+                "metadata": json.dumps(
+                    {
+                        "id": expected_chunked_documents[0].id,
+                        "source": expected_chunked_documents[0].source,
+                        "title": expected_chunked_documents[0].title,
+                        "chunk": expected_chunked_documents[0].chunk,
+                        "offset": expected_chunked_documents[0].offset,
+                        "page_number": expected_chunked_documents[0].page_number,
+                        "chunk_id": expected_chunked_documents[0].chunk_id,
+                    }
+                ),
+                "title": expected_chunked_documents[0].title,
+                "source": expected_chunked_documents[0].source,
+                "chunk": expected_chunked_documents[0].chunk,
+                "offset": expected_chunked_documents[0].offset,
+            },
+            {
+                "id": expected_chunked_documents[1].id,
+                "content": expected_chunked_documents[1].content,
+                "content_vector": llm_helper_mock.return_value.generate_embeddings.return_value,
+                "metadata": json.dumps(
+                    {
+                        "id": expected_chunked_documents[1].id,
+                        "source": expected_chunked_documents[1].source,
+                        "title": expected_chunked_documents[1].title,
+                        "chunk": expected_chunked_documents[1].chunk,
+                        "offset": expected_chunked_documents[1].offset,
+                        "page_number": expected_chunked_documents[1].page_number,
+                        "chunk_id": expected_chunked_documents[1].chunk_id,
+                    }
+                ),
+                "title": expected_chunked_documents[1].title,
+                "source": expected_chunked_documents[1].source,
+                "chunk": expected_chunked_documents[1].chunk,
+                "offset": expected_chunked_documents[1].offset,
+            },
+        ]
+    )
+
+
+def test_embed_file_raises_exception_on_failure(
+    azure_search_helper_mock,
+):
+    # given
+    push_embedder = PushEmbedder(MagicMock())
+
+    successful_indexing_result = MagicMock()
+    successful_indexing_result.succeeded = True
+    failed_indexing_result = MagicMock()
+    failed_indexing_result.succeeded = False
+    azure_search_helper_mock.return_value.get_search_client.return_value.upload_documents.return_value = [
+        successful_indexing_result,
+        failed_indexing_result,
+    ]
 
-        # then
-        embed_mock.assert_called_once()
+    # when + then
+    with pytest.raises(Exception):
+        push_embedder.embed_file(
+            "some-url",
+            "some-file-name.pdf",
+        )
diff --git a/code/tests/utilities/test_QuestionAnswerTool.py b/code/tests/utilities/test_QuestionAnswerTool.py
index ade574287..2fe8f5680 100644
--- a/code/tests/utilities/test_QuestionAnswerTool.py
+++ b/code/tests/utilities/test_QuestionAnswerTool.py
@@ -87,28 +87,31 @@ def get_source_documents_yield():
     with patch(
         "backend.batch.utilities.tools.QuestionAnswerTool.Search.get_source_documents"
     ) as mock:
-        document = Document(page_content="mock content")
-        document.metadata = {
-            "id": "mock id",
-            "title": "mock title",
-            "source": "mock source",
-            "chunk": "mock chunk",
-            "offset": "mock offset",
-            "page_number": "mock page number",
-        }
-        documents = [document]
+        documents = [
+            SourceDocument(
+                id="mock id",
+                content="mock content",
+                title="mock title",
+                source="mock source",
+                chunk=123,
+                offset=123,
+                page_number=123,
+            ),
+            SourceDocument(
+                id="mock id 2",
+                content="mock content 2",
+                title="mock title 2",
+                source="mock source 2",
+                chunk_id="mock chunk id 2",
+            ),
+        ]
         mock.return_value = documents
         yield mock
 
 
-def test_answer_question_returns_source_documents(
-    get_search_handler_mock, get_source_documents_mock
-):
+def test_answer_question_returns_source_documents():
     # given
     tool = QuestionAnswerTool()
-    create_document_and_source_documents(
-        get_source_documents_mock, get_search_handler_mock
-    )
 
     # when
     answer = tool.answer_question("mock question", [])
@@ -116,55 +119,22 @@ def test_answer_question_returns_source_documents(
     # then
     source_documents = answer.source_documents
 
-    assert len(source_documents) == 1
+    assert len(source_documents) == 2
 
     assert source_documents[0].id == "mock id"
     assert source_documents[0].title == "mock title"
+    assert source_documents[0].content == "mock content"
     assert source_documents[0].source == "mock source"
-    assert source_documents[0].chunk == "mock chunk"
-    assert source_documents[0].offset == "mock offset"
-    assert source_documents[0].page_number == "mock page number"
-
-
-def test_answer_question_integrated_vectorization(
-    env_helper_mock: MagicMock, get_search_handler_mock, get_source_documents_mock
-):
-    # given
-    env_helper_mock.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = True
-    document = Document("mock content")
-    document.metadata = {
-        "id": "mock id 1",
-        "title": "mock title 1",
-        "source": "https://example.com/doc1",
-        "chunk_id": "mock chunk id 1",
-        "content": "mock content 1",
-    }
-    get_source_documents_mock.return_value = document
-    documents = []
-    documents.append(
-        SourceDocument(
-            id=document.metadata["id"],
-            content=document.page_content,
-            title=document.metadata["title"],
-            source=document.metadata["source"],
-            chunk_id=document.metadata["chunk_id"],
-        )
-    )
-    get_search_handler_mock.return_answer_source_documents.return_value = documents
-    tool = QuestionAnswerTool()
-
-    # when
-    answer = tool.answer_question("mock question", [])
-
-    # then
-    source_documents = answer.source_documents
+    assert source_documents[0].chunk == 123
+    assert source_documents[0].offset == 123
+    assert source_documents[0].page_number == 123
 
-    assert len(source_documents) == 1
+    assert source_documents[1].id == "mock id 2"
+    assert source_documents[1].title == "mock title 2"
+    assert source_documents[1].content == "mock content 2"
 
-    assert source_documents[0].chunk_id == "mock chunk id 1"
-    assert source_documents[0].title == "mock title 1"
-    assert source_documents[0].source == "https://example.com/doc1"
-    assert source_documents[0].content == "mock content"
+    assert source_documents[1].source == "mock source 2"
+    assert source_documents[1].chunk_id == "mock chunk id 2"
 
 
 def test_answer_question_returns_answer(
@@ -211,8 +181,8 @@ def test_correct_prompt_with_few_shot_example(
 
     # then
     expected_input = {
+        "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}',
         "question": "mock question",
-        "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}',
         "chat_history": [],
     }
 
@@ -230,7 +200,7 @@ def test_correct_prompt_with_few_shot_example(
 Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock example content"}}]}, Question: mock example user question
 AI: mock example answer
 System: mock azure openai system message
-Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}, Question: mock question"""
+Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}, Question: mock question"""
     )
 
 
@@ -251,8 +221,8 @@ def test_correct_prompt_without_few_shot_example(
 
     # then
     expected_input = {
+        "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}',
         "question": "mock question",
-        "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}',
         "chat_history": [],
     }
 
@@ -265,7 +235,7 @@ def test_correct_prompt_without_few_shot_example(
         prompt_test
         == """System: mock answering system prompt
 System: mock azure openai system message
-Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}, Question: mock question"""
+Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}, Question: mock question"""
     )
 
 
@@ -286,7 +256,7 @@ def test_correct_prompt_with_few_shot_example_and_chat_history(
     # then
     expected_input = {
         "question": "mock question",
-        "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}',
+        "sources": '{"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}',
         "chat_history": chat_history,
     }
 
@@ -303,7 +273,7 @@ def test_correct_prompt_with_few_shot_example_and_chat_history(
 System: mock azure openai system message
 Human: Hello
 AI: Hi, how can I help?
-Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}}]}, Question: mock question"""
+Human: Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}}]}, Question: mock question"""
     )
 
 
@@ -326,7 +296,7 @@ def test_non_on_your_data_prompt_correct(
 
     # then
     expected_input = {
-        "sources": """[doc1]: mock content""",
+        "sources": """[doc1]: mock content\n\n[doc2]: mock content 2""",
         "question": "mock question",
     }
 
@@ -335,7 +305,10 @@ def test_non_on_your_data_prompt_correct(
     prompt = LLMChainMock.call_args[1]["prompt"]
     prompt_test = prompt.format(**expected_input)
 
-    assert prompt_test == """Sources: [doc1]: mock content, Question: mock question"""
+    assert (
+        prompt_test
+        == """Sources: [doc1]: mock content\n\n[doc2]: mock content 2, Question: mock question"""
+    )
 
 
 @pytest.mark.parametrize(

From ed97f2f3466c887e8bdc81fe2b31e765920c4566 Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Tue, 7 May 2024 18:45:14 +0100
Subject: [PATCH 02/10] Fix after merge

---
 .../processors/test_IntegratedVectorizationEmbedder.py    | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
index b424e5439..76d6cc6e6 100644
--- a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
+++ b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
@@ -36,14 +36,6 @@ def llm_helper_mock():
         yield mock
 
 
-@pytest.fixture(autouse=True)
-def azure_search_helper_mock():
-    with patch(
-        "backend.batch.utilities.helpers.DocumentProcessorHelper.AzureSearchHelper"
-    ) as mock:
-        yield mock
-
-
 @pytest.fixture(autouse=True)
 def azure_search_iv_index_helper_mock():
     with patch(

From d7d854620687f314979d3f65867a35bd858911be Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Wed, 8 May 2024 10:42:11 +0000
Subject: [PATCH 03/10] Fix import

---
 code/backend/batch/utilities/helpers/embedders/PushEmbedder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
index 9f9463012..43e261045 100644
--- a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
+++ b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
@@ -2,7 +2,7 @@
 import logging
 from typing import List
 
-from ...helpers import LLMHelper
+from ...helpers.LLMHelper import LLMHelper
 
 from ..AzureBlobStorageClient import AzureBlobStorageClient
 

From 0522fb12aeb46ae7ddd921cb52a67490f17c82b7 Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Wed, 8 May 2024 16:23:35 +0000
Subject: [PATCH 04/10] Stop catching and rethrowing exception

---
 .../batch/utilities/helpers/AzureSearchHelper.py     | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/code/backend/batch/utilities/helpers/AzureSearchHelper.py b/code/backend/batch/utilities/helpers/AzureSearchHelper.py
index c07f32ecf..bddebf548 100644
--- a/code/backend/batch/utilities/helpers/AzureSearchHelper.py
+++ b/code/backend/batch/utilities/helpers/AzureSearchHelper.py
@@ -169,14 +169,10 @@ def create_index(self):
         )
 
         if self._index_not_exists(self.env_helper.AZURE_SEARCH_INDEX):
-            try:
-                logger.info(
-                    f"Creating or updating index {self.env_helper.AZURE_SEARCH_INDEX}"
-                )
-                self.search_index_client.create_index(index)
-            except Exception as e:
-                logger.exception("Error Creating index")
-                raise e
+            logger.info(
+                f"Creating or updating index {self.env_helper.AZURE_SEARCH_INDEX}"
+            )
+            self.search_index_client.create_index(index)
 
     def _index_not_exists(self, index_name: str) -> bool:
         return index_name not in [

From a3d8eab5807d2572fe0e00fd6582c9029ed4423f Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Wed, 8 May 2024 16:24:04 +0000
Subject: [PATCH 05/10] Typo

---
 .../backend_api/tests/default/test_conversation_custom.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/tests/functional/backend_api/tests/default/test_conversation_custom.py b/code/tests/functional/backend_api/tests/default/test_conversation_custom.py
index 13b113081..c644321ee 100644
--- a/code/tests/functional/backend_api/tests/default/test_conversation_custom.py
+++ b/code/tests/functional/backend_api/tests/default/test_conversation_custom.py
@@ -189,7 +189,7 @@ def test_post_makes_correct_calls_to_openai_embeddings_to_embed_question_to_stor
                 "input": [
                     [3923, 374, 279, 7438, 315, 2324, 30]
                 ],  # Embedding of "What is the meaning of life?"
-                "model": "text-embedding-ada-002",  # this is harded in the langchain code base
+                "model": "text-embedding-ada-002",  # this is hard coded in the langchain code base
                 "encoding_format": "base64",
             },
             headers={

From 365e5a2b26b30748fd6427a69e828f666fcafaeb Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Wed, 8 May 2024 16:31:25 +0000
Subject: [PATCH 06/10] Explain oneshot request

---
 code/tests/functional/backend_api/conftest.py | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/code/tests/functional/backend_api/conftest.py b/code/tests/functional/backend_api/conftest.py
index 2c7bc54e9..69116f7c7 100644
--- a/code/tests/functional/backend_api/conftest.py
+++ b/code/tests/functional/backend_api/conftest.py
@@ -55,15 +55,7 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig):
         }
     )
 
-    httpserver.expect_oneshot_request(
-        "/indexes",
-        method="GET",
-    ).respond_with_json({"value": []})
-
-    httpserver.expect_request(
-        "/indexes",
-        method="GET",
-    ).respond_with_json({"value": [{"name": app_config.get("AZURE_SEARCH_INDEX")}]})
+    prime_search_to_trigger_creation_of_index(httpserver, app_config)
 
     httpserver.expect_request(
         "/indexes",
@@ -155,3 +147,19 @@ def setup_default_mocking(httpserver: HTTPServer, app_config: AppConfig):
     yield
 
     httpserver.check()
+
+
+def prime_search_to_trigger_creation_of_index(
+    httpserver: HTTPServer, app_config: AppConfig
+):
+    # first request should return no indexes
+    httpserver.expect_oneshot_request(
+        "/indexes",
+        method="GET",
+    ).respond_with_json({"value": []})
+
+    # second request should return the index as it will have been "created"
+    httpserver.expect_request(
+        "/indexes",
+        method="GET",
+    ).respond_with_json({"value": [{"name": app_config.get("AZURE_SEARCH_INDEX")}]})

From 60a83659d5382152421987c014c575e4203b502d Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Wed, 8 May 2024 16:32:43 +0000
Subject: [PATCH 07/10] Yield llmhelper

---
 .../processors/test_IntegratedVectorizationEmbedder.py        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
index 76d6cc6e6..dc1b8b0ec 100644
--- a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
+++ b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
@@ -33,7 +33,7 @@ def llm_helper_mock():
             0
         ] * 1536
         llm_helper.generate_embeddings.return_value = [123]
-        yield mock
+        yield llm_helper
 
 
 @pytest.fixture(autouse=True)
@@ -99,7 +99,7 @@ def test_process_using_integrated_vectorization(
     azure_search_iv_datasource_helper_mock.return_value.create_or_update_datasource.assert_called_once()
 
     azure_search_iv_index_helper_mock.assert_called_once_with(
-        env_helper_mock, llm_helper_mock.return_value
+        env_helper_mock, llm_helper_mock
     )
     azure_search_iv_index_helper_mock.return_value.create_or_update_index.assert_called_once()
 

From db354b1bd556569b69ca527e066e700e143f2bcc Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Thu, 9 May 2024 08:42:50 +0000
Subject: [PATCH 08/10] Stop catching exceptions

---
 .../helpers/embedders/PushEmbedder.py         | 41 ++++++++-----------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
index 43e261045..730bc8b0d 100644
--- a/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
+++ b/code/backend/batch/utilities/helpers/embedders/PushEmbedder.py
@@ -42,34 +42,25 @@ def embed_file(self, source_url: str, file_name: str):
 
     def __embed(self, source_url: str, embedding_config: EmbeddingConfig):
         documents_to_upload: List[SourceDocument] = []
-        try:
-            if not embedding_config.use_advanced_image_processing:
-                documents: List[SourceDocument] = self.document_loading.load(
-                    source_url, embedding_config.loading
-                )
-                documents = self.document_chunking.chunk(
-                    documents, embedding_config.chunking
-                )
-
-                for document in documents:
-                    documents_to_upload.append(
-                        self._convert_to_search_document(document)
-                    )
+        if not embedding_config.use_advanced_image_processing:
+            documents: List[SourceDocument] = self.document_loading.load(
+                source_url, embedding_config.loading
+            )
+            documents = self.document_chunking.chunk(
+                documents, embedding_config.chunking
+            )
 
-                response = (
-                    self.azure_search_helper.get_search_client().upload_documents(
-                        documents_to_upload
-                    )
-                )
-                if not all([r.succeeded for r in response]):
-                    raise Exception(response)
+            for document in documents:
+                documents_to_upload.append(self._convert_to_search_document(document))
 
-            else:
-                logger.warning("Advanced image processing is not supported yet")
+            response = self.azure_search_helper.get_search_client().upload_documents(
+                documents_to_upload
+            )
+            if not all([r.succeeded for r in response]):
+                raise Exception(response)
 
-        except Exception as e:
-            logger.error(f"Error adding embeddings for {source_url}: {e}")
-            raise e
+        else:
+            logger.warning("Advanced image processing is not supported yet")
 
     def _convert_to_search_document(self, document: SourceDocument):
         embedded_content = self.llm_helper.generate_embeddings(document.content)

From dbfd07f65f62df79b72aad39c04d10771bb3e6ef Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Thu, 9 May 2024 08:46:12 +0000
Subject: [PATCH 09/10] Fix after merge

---
 .../helpers/processors/test_IntegratedVectorizationEmbedder.py  | 2 +-
 code/tests/utilities/helpers/test_PushEmbedder.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
index dc1b8b0ec..86e18f597 100644
--- a/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
+++ b/code/tests/utilities/helpers/processors/test_IntegratedVectorizationEmbedder.py
@@ -3,7 +3,7 @@
 from backend.batch.utilities.helpers.embedders.IntegratedVectorizationEmbedder import (
     IntegratedVectorizationEmbedder,
 )
-from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings
+from backend.batch.utilities.document_chunking.ChunkingStrategy import ChunkingSettings
 from backend.batch.utilities.document_loading import LoadingSettings
 from backend.batch.utilities.document_loading.Strategies import LoadingStrategy
 
diff --git a/code/tests/utilities/helpers/test_PushEmbedder.py b/code/tests/utilities/helpers/test_PushEmbedder.py
index 0cbe18088..56bb0f21f 100644
--- a/code/tests/utilities/helpers/test_PushEmbedder.py
+++ b/code/tests/utilities/helpers/test_PushEmbedder.py
@@ -2,7 +2,7 @@
 import pytest
 from unittest.mock import MagicMock, call, patch
 from backend.batch.utilities.helpers.embedders.PushEmbedder import PushEmbedder
-from backend.batch.utilities.document_chunking.Strategies import ChunkingSettings
+from backend.batch.utilities.document_chunking.ChunkingStrategy import ChunkingSettings
 from backend.batch.utilities.document_loading import LoadingSettings
 from backend.batch.utilities.document_loading.Strategies import LoadingStrategy
 from backend.batch.utilities.common.SourceDocument import SourceDocument

From 5f04de9db38f16ee1afe9e7d4a6f042911f5c1db Mon Sep 17 00:00:00 2001
From: Adam Dougal <adamdougal@microsoft.com>
Date: Thu, 9 May 2024 10:49:35 +0000
Subject: [PATCH 10/10] Revert warnings change

---
 code/backend/batch/utilities/tools/QuestionAnswerTool.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/code/backend/batch/utilities/tools/QuestionAnswerTool.py b/code/backend/batch/utilities/tools/QuestionAnswerTool.py
index b29fd2f18..58c62c767 100644
--- a/code/backend/batch/utilities/tools/QuestionAnswerTool.py
+++ b/code/backend/batch/utilities/tools/QuestionAnswerTool.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import warnings
 
 from ..common.SourceDocument import SourceDocument
 from ..search.Search import Search
@@ -83,7 +84,7 @@ def generate_on_your_data_llm_chain(
             if all((few_shot_example.values())):
                 examples.append(few_shot_example)
             else:
-                logger.warning(
+                warnings.warn(
                     "Not all example fields are set in the config. Skipping few-shot example."
                 )
 
@@ -139,7 +140,7 @@ def answer_question(
                 question, chat_history, source_documents
             )
         else:
-            logger.warning(
+            warnings.warn(
                 "Azure OpenAI On Your Data prompt format is recommended and should be enabled in the Admin app.",
             )
             answering_prompt, input = self.generate_llm_chain(