From 6d718c17d356a943a1e443c3a5d7d910843791c5 Mon Sep 17 00:00:00 2001 From: Farzad Date: Sat, 4 Jan 2025 17:31:05 -0600 Subject: [PATCH 1/9] docling azure ai search --- docs/examples/rag_azuresearch.ipynb | 749 ++++++++++++++++++++++++++++ 1 file changed, 749 insertions(+) create mode 100644 docs/examples/rag_azuresearch.ipynb diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb new file mode 100644 index 00000000..3f441088 --- /dev/null +++ b/docs/examples/rag_azuresearch.ipynb @@ -0,0 +1,749 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ag9kcX2B_atc" + }, + "source": [ + "# RAG using Docling + Azure AI Search + Azure OpenAI\n", + "\n", + "This is a code recipe that uses [Azure AI Search](https://azure.microsoft.com/en-us/products/ai-services/ai-search/?msockid=0109678bea39665431e37323ebff6723) to perform RAG over PDF documents parsed by [Docling](https://ds4sd.github.io/docling/).\n", + "\n", + "# Description:\n", + "\n", + "1. Parse and chunk \"State of AI\" PPTX from Google Slides using Docling\n", + "2. Use Azure OpenAI embeddings for vector creation\n", + "3. Insert vector data into Azure AI Search\n", + "4. 
# --- GPU / MPS availability check -------------------------------------------
# Docling's PDF conversion benefits greatly from hardware acceleration; this
# cell fails fast if neither CUDA nor Apple MPS is available.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"CUDA GPU is enabled: {torch.cuda.get_device_name(0)}")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS GPU is enabled.")
else:
    raise EnvironmentError(
        "No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured."
    )

# --- Part 1: Imports & configuration -----------------------------------------
# FIX: the original cell imported several modules two or three times each
# (os, openai, rich, SearchClient, VectorizableTextQuery, DefaultAzureCredential,
# and the azure.search.documents.indexes.models list). Duplicates are removed
# and imports are grouped stdlib -> third-party, per convention.
import os

import openai
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
)
from azure.search.documents.models import VectorizableTextQuery
from dotenv import load_dotenv
from rich.console import Console
from rich.panel import Panel

# Load settings from a local .env file, if present.
load_dotenv()

# Azure AI Search + Azure OpenAI settings, supplied via environment variables.
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")
AZURE_SEARCH_INDEX_NAME = os.getenv("AZURE_SEARCH_INDEX_NAME")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_CHAT_MODEL = os.getenv("AZURE_OPENAI_CHAT_MODEL")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_EMBEDDINGS = os.getenv("AZURE_OPENAI_EMBEDDINGS")

# Shared rich console used by all later cells for pretty output.
console = Console()
# Part 2: Parse the "State of AI Report 2024" PDF with Docling and display a
# short Markdown preview to confirm the conversion succeeded.
# NOTE: `result` is reused by the chunking cell below — keep the name.
from docling.document_converter import DocumentConverter

source_url = "https://ignite2024demo.blob.core.windows.net/state-of-ai-2024/State of AI Report 2024.pdf"

doc_converter = DocumentConverter()
result = doc_converter.convert(source_url)

# Show only the first 500 characters of the exported Markdown as a sanity check.
preview_text = result.document.export_to_markdown()[:500] + "..."
console.print(Panel(preview_text, title="Docling Markdown Preview"))
# Part 3: Hierarchical chunking of the parsed document.
# Each chunk is paired with a synthetic id; `all_chunks` is consumed by the
# embedding/upload cell below.
from docling_core.transforms.chunker import HierarchicalChunker

chunker = HierarchicalChunker()

# Chunk the single `result.document` produced by the conversion cell.
doc_chunks = list(chunker.chunk(result.document))

# (chunk_id, chunk_text) pairs; optionally a doc/page title could be prefixed.
all_chunks = [(f"chunk_{idx}", chunk.text) for idx, chunk in enumerate(doc_chunks)]

console.print(f"Total chunks from PDF: {len(all_chunks)}")
VECTOR_DIM = 1536  # Must match the embedding model (1536 for text-embedding-3-small)

index_client = SearchIndexClient(
    AZURE_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_SEARCH_KEY)
)


def create_search_index(index_name: str) -> None:
    """Create (or recreate) an Azure AI Search index for the chunk embeddings.

    The index has three fields:
      - chunk_id       : string key
      - content        : searchable chunk text
      - content_vector : VECTOR_DIM-dimensional vector, HNSW-indexed, with an
                         integrated Azure OpenAI vectorizer for query-time
                         embedding of text queries.

    Args:
        index_name: Name of the index to create in the search service.
    """
    fields = [
        SimpleField(name="chunk_id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchField(
            name="content_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            filterable=False,
            sortable=False,
            facetable=False,
            vector_search_dimensions=VECTOR_DIM,
            vector_search_profile_name="default",
        ),
    ]

    vector_search = VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name="default")],
        profiles=[
            VectorSearchProfile(
                name="default",
                algorithm_configuration_name="default",
                vectorizer_name="default",
            )
        ],
        # Integrated vectorization: the service embeds query text itself via
        # the configured Azure OpenAI deployment (used by VectorizableTextQuery).
        vectorizers=[
            AzureOpenAIVectorizer(
                vectorizer_name="default",
                parameters=AzureOpenAIVectorizerParameters(
                    resource_url=AZURE_OPENAI_ENDPOINT,
                    deployment_name=AZURE_OPENAI_EMBEDDINGS,
                    model_name="text-embedding-3-small",
                    api_key=AZURE_OPENAI_API_KEY,
                ),
            )
        ],
    )

    new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)

    # Drop any existing index of the same name so the schema is fresh.
    # FIX: a bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # narrowed to Exception (the expected failure is "index does not exist").
    try:
        index_client.delete_index(index_name)
    except Exception:
        pass

    index_client.create_or_update_index(new_index)
    console.print(f"Index '{index_name}' created.")


create_search_index(AZURE_SEARCH_INDEX_NAME)
# Part 4B: Generate embeddings with Azure OpenAI and upsert chunks into the
# Azure AI Search index in batches.
# FIX: `import uuid` was buried mid-cell (moved up with the other imports),
# the unused `chunk_id` loop variable is now `_`, and the per-batch success
# check inspects every IndexingResult instead of only the first one.
import uuid

from openai import AzureOpenAI

search_client = SearchClient(
    AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY)
)
openai_client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)


def embed_text(text: str) -> list:
    """Embed a single text string with the configured Azure OpenAI deployment.

    Args:
        text: The text to embed.

    Returns:
        The embedding vector (list of floats) for `text`.
    """
    response = openai_client.embeddings.create(
        input=text, model=AZURE_OPENAI_EMBEDDINGS  # deployment name
    )
    return response.data[0].embedding


# Build the documents to upload; each gets a fresh UUID key.
upload_docs = []
for _, chunk_text in all_chunks:
    upload_docs.append(
        {
            "chunk_id": str(uuid.uuid4()),
            "content": chunk_text,
            "content_vector": embed_text(chunk_text),
        }
    )

# Upload in small batches so each request stays well under service limits.
BATCH_SIZE = 250
for i in range(0, len(upload_docs), BATCH_SIZE):
    subset = upload_docs[i : i + BATCH_SIZE]
    resp = search_client.upload_documents(documents=subset)
    batch_ok = all(r.succeeded for r in resp)
    console.print(
        f"Uploaded batch {i} -> {i+len(subset)}; success: {batch_ok}, status code: {resp[0].status_code}"
    )

console.print("All chunks uploaded to Azure Search.")
╭────────────────────────────────────────────────── RAG Prompt ───────────────────────────────────────────────────╮\n",
+       "│                                                                                                                 │\n",
+       "│ You are an AI assistant helping summarize the State of AI 2024 PDF.                                             │\n",
+       "│ Use ONLY the text below to answer the user's question.                                                          │\n",
+       "│ If the answer isn't in the text, say you don't know.                                                            │\n",
+       "│ Context:                                                                                                        │\n",
+       "│ -NVIDIA remains the most powerful company in the world, enjoying a stint in the $3T club, while regulators      │\n",
+       "│ probe the concentrations of power within GenAI.                                                                 │\n",
+       "│ -More established GenAI companies bring in billions of dollars in revenue, while start-ups begin to gain        │\n",
+       "│ traction in sectors like video and audio generation. Although companies begin to make the journey from model to │\n",
+       "│ product, long-term questions around pricing and sustainability remain unresolved.                               │\n",
+       "│ -Driven by a bull run in public markets, AI companies reach $9T in value, while investment levels grow          │\n",
+       "│ healthily in private companies.                                                                                 │\n",
+       "│ ---                                                                                                             │\n",
+       "│ Driven by GenAI megarounds like xAI and OpenAI's $6B fundraises, US private market continue to lead. Total      │\n",
+       "│ investment into AI companies reached close to $100B.                                                            │\n",
+       "│ ---                                                                                                             │\n",
+       "│ stateof.ai 2024 Sam Altman is reportedly raising huge sums of money to do this, while each of Google, Amazon,   │\n",
+       "│ Meta and Microsoft continue to build and improve their owned AI silicon.                                        │\n",
+       "│ ---                                                                                                             │\n",
+       "│ stateof.ai 2024                                                                                                 │\n",
+       "│ ---                                                                                                             │\n",
+       "│ stateof.ai 2024                                                                                                 │\n",
+       "│ ---                                                                                                             │\n",
+       "│ Of all venture-backed companies, the highest % of AI companies are found in robotics, enterprise software,      │\n",
+       "│ space and security categories.                                                                                  │\n",
+       "│ ---                                                                                                             │\n",
+       "│ Deep learning (DL): an approach to AI inspired by how neurons in the brain recognise complex patterns in data.  │\n",
+       "│ The \"deep\" refers to the many layers of neurons in today's models that help to learn rich representations of    │\n",
+       "│ data to achieve better performance gains.                                                                       │\n",
+       "│ ---                                                                                                             │\n",
+       "│ While private company valuations have continued to climb at a steady pace, a small handful of publicly traded   │\n",
+       "│ companies have held up the market like Atlas. Publics alone now enjoy a greater enterprise value than the       │\n",
+       "│ entire market in 2023.                                                                                          │\n",
+       "│ ---                                                                                                             │\n",
+       "│ Analysis of the 100 highest revenue grossing AI companies using Stripe reveals that, as a group, they are       │\n",
+       "│ generating revenue at a much faster pace than previous waves of equivalently well-performing SaaS companies.    │\n",
+       "│ Strikingly, the average AI company that has reached $30M+ annualised revenue took just 20 months to get there,  │\n",
+       "│ compared to 65 months for equally promising SaaS companies.                                                     │\n",
+       "│ ---                                                                                                             │\n",
+       "│ A generative AI media company is investigated for its misuse during in the 2024 US election circuit.            │\n",
+       "│                                                                                                                 │\n",
+       "│ Question: in 2024, AI companies reached how many $$$ in value?                                                  │\n",
+       "│ Answer:                                                                                                         │\n",
+       "│                                                                                                                 │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;31m╭─\u001b[0m\u001b[1;31m─────────────────────────────────────────────────\u001b[0m RAG Prompt \u001b[1;31m──────────────────────────────────────────────────\u001b[0m\u001b[1;31m─╮\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mYou are an AI assistant helping summarize the State of AI 2024 PDF.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mUse ONLY the text below to answer the user's question. \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mIf the answer isn't in the text, say you don't know.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mContext:\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m-NVIDIA remains the most powerful company in the world, enjoying a stint in the $3T club, while regulators \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mprobe the concentrations of power within GenAI.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m-More established GenAI companies bring in billions of dollars in revenue, while start-ups begin to gain \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mtraction in sectors like video and audio generation. 
Although companies begin to make the journey from model to\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mproduct, long-term questions around pricing and sustainability remain unresolved.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m-Driven by a bull run in public markets, AI companies reach $9T in value, while investment levels grow \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mhealthily in private companies.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mDriven by GenAI megarounds like xAI and OpenAI's $6B fundraises, US private market continue to lead. 
Total \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31minvestment into AI companies reached close to $100B.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mstateof.ai 2024 Sam Altman is reportedly raising huge sums of money to do this, while each of Google, Amazon, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mMeta and Microsoft continue to build and improve their owned AI silicon.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mstateof.ai 2024\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mstateof.ai 2024\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mOf all venture-backed companies, the highest % of AI companies are found in robotics, enterprise software, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mspace and security categories.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m 
\u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mDeep learning (DL): an approach to AI inspired by how neurons in the brain recognise complex patterns in data. \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mThe \"deep\" refers to the many layers of neurons in today's models that help to learn rich representations of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdata to achieve better performance gains.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mWhile private company valuations have continued to climb at a steady pace, a small handful of publicly traded \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcompanies have held up the market like Atlas. 
Publics alone now enjoy a greater enterprise value than the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mentire market in 2023.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mAnalysis of the 100 highest revenue grossing AI companies using Stripe reveals that, as a group, they are \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgenerating revenue at a much faster pace than previous waves of equivalently well-performing SaaS companies. \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mStrikingly, the average AI company that has reached $30M+ annualised revenue took just 20 months to get there, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcompared to 65 months for equally promising SaaS companies.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mA generative AI media company is investigated for its misuse during in the 2024 US election circuit.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mQuestion: in 2024, AI companies reached how many $$$ in 
value?\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mAnswer:\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n", + "\u001b[1;31m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────── RAG Response ──────────────────────────────────────────────────╮\n",
+       "│ AI companies reached $9T in value in 2024.                                                                      │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m╭─\u001b[0m\u001b[1;32m────────────────────────────────────────────────\u001b[0m RAG Response \u001b[1;32m─────────────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n", + "\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mAI companies reached $9T in value in 2024.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n", + "\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def generate_chat_response(prompt: str, system_message: str = None):\n", + " \"\"\"\n", + " Basic Chat request to Azure OpenAI.\n", + " In production, consider passing more parameters (temperature, presence_penalty, etc.).\n", + " \"\"\"\n", + " messages = []\n", + " if system_message:\n", + " messages.append({\"role\": \"system\", \"content\": system_message})\n", + " messages.append({\"role\": \"user\", \"content\": prompt})\n", + "\n", + " completion = openai_client.chat.completions.create(\n", + " model=AZURE_OPENAI_CHAT_MODEL, messages=messages, temperature=0.7\n", + " )\n", + " return completion.choices[0].message.content\n", + "\n", + "\n", + "# Example question\n", + "user_query = (\n", + " \"in 2024, AI companies reached how many $$$ in value?\"\n", + ")\n", + "user_embed = embed_text(user_query)\n", + "\n", + "# We'll use integrated vectorization to generate query embeddings in Azure AI Search\n", + "vector_query = VectorizableTextQuery(\n", + " text=user_query, k_nearest_neighbors=5, fields=\"content_vector\"\n", + ")\n", + "\n", + "search_results = search_client.search(\n", + " search_text=user_query, vector_queries=[vector_query], select=[\"content\"], top=10\n", + ")\n", + "\n", + "retrieved_chunks = []\n", + "for result in search_results:\n", + " snippet = result[\"content\"]\n", + " retrieved_chunks.append(snippet)\n", 
+ "\n", + "# Combine retrieved chunks\n", + "context_str = \"\\n---\\n\".join(retrieved_chunks)\n", + "\n", + "rag_prompt = f\"\"\"\n", + "You are an AI assistant helping summarize the State of AI 2024 PDF.\n", + "Use ONLY the text below to answer the user's question. \n", + "If the answer isn't in the text, say you don't know.\n", + "Context:\n", + "{context_str}\n", + "\n", + "Question: {user_query}\n", + "Answer:\n", + "\"\"\"\n", + "\n", + "final_answer = generate_chat_response(rag_prompt)\n", + "\n", + "console.print(Panel(rag_prompt, title=\"RAG Prompt\", style=\"bold red\"))\n", + "console.print(Panel(final_answer, title=\"RAG Response\", style=\"bold green\"))" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3.10 - SDK v2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From acdb32d302d4d0b96da7ebab6c3e86ca42ea524b Mon Sep 17 00:00:00 2001 From: Farzad Date: Sat, 4 Jan 2025 17:49:18 -0600 Subject: [PATCH 2/9] azure ai search updates --- docs/examples/rag_azuresearch.ipynb | 284 ++++++++++++---------------- 1 file changed, 118 insertions(+), 166 deletions(-) diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb index 3f441088..ec0b8352 100644 --- a/docs/examples/rag_azuresearch.ipynb +++ b/docs/examples/rag_azuresearch.ipynb @@ -6,16 +6,19 @@ "id": "Ag9kcX2B_atc" }, "source": [ - "# RAG using Docling + Azure AI Search + Azure OpenAI\n", - "\n", - "This is a code recipe that uses [Azure AI Search](https://azure.microsoft.com/en-us/products/ai-services/ai-search/?msockid=0109678bea39665431e37323ebff6723) to perform 
RAG over PDF documents parsed by [Docling](https://ds4sd.github.io/docling/).\n", - "\n", - "# Description:\n", - "\n", - "1. Parse and chunk \"State of AI\" PPTX from Google Slides using Docling\n", - "2. Use Azure OpenAI embeddings for vector creation\n", - "3. Insert vector data into Azure AI Search\n", - "4. Perform a RAG query using Azure AI Search and Azure OpenAI\n" + "# Building a RAG System with Docling and Azure AI Search\n", + "\n", + "This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using:\n", + "- [Docling](https://ds4sd.github.io/docling/) for document parsing and chunking\n", + "- [Azure AI Search](https://azure.microsoft.com/products/ai-services/ai-search/?msockid=0109678bea39665431e37323ebff6723) for vector indexing and retrieval\n", + "- [Azure OpenAI](https://azure.microsoft.com/products/ai-services/openai-service?msockid=0109678bea39665431e37323ebff6723) for embeddings and chat completion\n", + "\n", + "This sample demonstrates how to:\n", + "1. Parse a PDF with Docling.\n", + "2. Chunk the parsed text.\n", + "3. Use Azure OpenAI for embeddings.\n", + "4. Index and search in Azure AI Search.\n", + "5. 
Run a retrieval-augmented generation (RAG) query with Azure OpenAI GPT-4o.\n" ] }, { @@ -26,7 +29,6 @@ "source": [ "# If running in a new environment, uncomment and run these:\n", "%pip install docling~=\"2.7.0\"\n", - "%pip install 'docling-core[chunking]'\n", "%pip install azure-search-documents==11.5.2\n", "%pip install azure-identity\n", "%pip install openai\n", @@ -39,18 +41,10 @@ "metadata": {}, "source": [ "# Part 0: Prerequisites\n", - "Before running this notebook, you'll need:\n", - "\n", - "1) Azure AI Search resource\n", - " - If using Role-based authentication, enable \"Managed Identities\" or \"both\" in the portal\n", - " - If using API keys, supply them in environment variables or secrets\n", - "\n", - "2) Azure OpenAI resource\n", - " - Deployed an Embeddings model (e.g., text-embedding-3-small)\n", - " - Deployed a Chat model (e.g., gpt-4o)\n", - "\n", - "3) Docling installed\n", - "4) Python 3.8+ environment with the packages listed above" + " - Azure AI Search resource\n", + " - Azure OpenAI resource with deployed embeddings & chat models\n", + " - Docling installed (Python 3.8+ environment)\n", + " - GPU or MPS recommended" ] }, { @@ -91,50 +85,11 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "import os\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.search.documents import SearchClient\n", - "from azure.search.documents.models import VectorizableTextQuery\n", - "import openai\n", - "from rich.console import Console\n", - "from rich.panel import Panel\n", - "import os\n", - "import os\n", - "\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.search.documents import SearchClient\n", - "from azure.search.documents.models import VectorizableTextQuery\n", - "from azure.search.documents.indexes import SearchIndexClient\n", - "from azure.search.documents.indexes.models import (\n", - " SearchIndex,\n", - " SearchField,\n", - 
" SearchFieldDataType,\n", - " SimpleField,\n", - " SearchableField,\n", - " VectorSearch,\n", - " HnswAlgorithmConfiguration,\n", - " VectorSearchProfile\n", - ")\n", - "from azure.search.documents.indexes.models import (\n", - " SearchField,\n", - " SearchFieldDataType,\n", - " VectorSearch,\n", - " HnswAlgorithmConfiguration,\n", - " VectorSearchProfile,\n", - " AzureOpenAIVectorizer,\n", - " AzureOpenAIVectorizerParameters,\n", - ")\n", - "from azure.core.credentials import AzureKeyCredential\n", - "\n", - "import openai\n", - "from rich.console import Console\n", - "from rich.panel import Panel\n", - "from azure.search.documents import SearchClient\n", - "\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()\n", @@ -146,56 +101,29 @@ "AZURE_OPENAI_API_KEY = os.getenv(\"AZURE_OPENAI_API_KEY\")\n", "AZURE_OPENAI_CHAT_MODEL = os.getenv(\"AZURE_OPENAI_CHAT_MODEL\")\n", "AZURE_OPENAI_API_VERSION = os.getenv(\"AZURE_OPENAI_API_VERSION\")\n", - "AZURE_OPENAI_EMBEDDINGS = os.getenv(\"AZURE_OPENAI_EMBEDDINGS\")\n", - "\n", - "# # Provide environment variables or paste in your values:\n", - "# AZURE_SEARCH_ENDPOINT = (\n", - "# os.getenv(\"AZURE_SEARCH_ENDPOINT\") or \"PUT_AZURE_SEARCH_ENDPOINT_HERE\"\n", - "# )\n", - "# AZURE_SEARCH_INDEX_NAME = os.getenv(\"AZURE_SEARCH_INDEX_NAME\") or \"docling-rag-sample\"\n", - "# AZURE_SEARCH_ADMIN_KEY = (\n", - "# os.getenv(\"AZURE_SEARCH_KEY\") or \"YOUR_SEARCH_ADMIN_KEY_OR_DELETE_IF_RBAC\"\n", - "# )\n", - "# AZURE_OPENAI_ENDPOINT = (\n", - "# os.getenv(\"AZURE_OPENAI_ENDPOINT\") or \"PUT_AZURE_OPENAI_ENDPOINT_HERE\"\n", - "# )\n", - "# AZURE_OPENAI_API_VERSION = os.getenv(\"AZURE_OPENAI_API_VERSION\") or \"2024-06-01\"\n", - "# AZURE_OPENAI_EMBEDDINGS = (\n", - "# os.getenv(\"AZURE_OPENAI_EMBEDDINGS\") or \"text-embedding-3-large\"\n", - "# )\n", - "# AZURE_OPENAI_CHAT_MODEL = os.getenv(\"AZURE_OPENAI_CHAT_MODEL\") or \"gpt-4o\"\n", - "\n", - "# # If using Key-based auth for Azure OpenAI\n", - "# AZURE_OPENAI_KEY = 
os.getenv(\"AZURE_OPENAI_KEY\") or \"YOUR_OPENAI_KEY_HERE\"\n", - "\n", - "# # If using Role-based auth for Azure OpenAI, comment out openai.api_key below\n", - "# openai.api_key = AZURE_OPENAI_KEY\n", - "console = Console()" + "AZURE_OPENAI_EMBEDDINGS = os.getenv(\"AZURE_OPENAI_EMBEDDINGS\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Part 2: Parse with Docling \n", - "We'll parse the \"State of AI\" slides from a remote link. Feel free to use whatever document or source you want. \n", - "\n", - "Note: In real use, you might prefer doc_converter.convert_all() or a single convert() call.\n", - "We'll show a simple approach here.\n", + "# Part 2: Parse the PDF with Docling\n", + "Example: \"State of AI\" slides from a remote link.\n", "\n", - "On a A100 GPU, it took ~4 mins. Azure SKU: \"Standard_NC24ads_A100_v4 (24 cores, 220 GB RAM, 64 GB disk)\"" + "You can find the raw powerpoint here: https://docs.google.com/presentation/d/1GmZmoWOa2O92BPrncRcTKa15xvQGhq7g4I4hJSNlC0M/edit?usp=sharing" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 97541.95it/s]\n" + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 109734.70it/s]\n" ] }, { @@ -251,28 +179,32 @@ } ], "source": [ + "from rich.console import Console\n", + "from rich.panel import Panel\n", "from docling.document_converter import DocumentConverter\n", "\n", - "source_url = \"https://ignite2024demo.blob.core.windows.net/state-of-ai-2024/State of AI Report 2024.pdf\"\n", + "console = Console()\n", "\n", + "source_url = \"https://ignite2024demo.blob.core.windows.net/state-of-ai-2024/State of AI Report 2024.pdf\"\n", "converter = DocumentConverter()\n", "result = converter.convert(source_url)\n", "\n", - "# We'll just display the Markdown output to confirm parse success:\n", + "# Optional: preview the parsed Markdown\n", 
"md_preview = result.document.export_to_markdown()\n", - "console.print(Panel(md_preview[:500] + \"...\", title=\"Docling Markdown Preview\"))\n" + "console.print(Panel(md_preview[:500] + \"...\", title=\"Docling Markdown Preview\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Part 3: Hierarchical Chunking of the parsed text " + "# Part 3: Hierarchical Chunking\n", + " Convert the Document into smaller chunks for embedding & indexing" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -293,15 +225,11 @@ "from docling_core.transforms.chunker import HierarchicalChunker\n", "\n", "chunker = HierarchicalChunker()\n", - "\n", - "# We'll chunk the single result.document from above\n", "doc_chunks = list(chunker.chunk(result.document))\n", "\n", - "# For each chunk, create a simple \"content\" text. \n", - "# Optionally you can prefix with a doc/page title if relevant.\n", "all_chunks = []\n", "for idx, c in enumerate(doc_chunks):\n", - " chunk_text = c.text \n", + " chunk_text = c.text\n", " all_chunks.append((f\"chunk_{idx}\", chunk_text))\n", "\n", "console.print(f\"Total chunks from PDF: {len(all_chunks)}\")" @@ -311,24 +239,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Part 4: Create an Azure AI Search index and push chunk embeddings \n", - "We'll embed each chunk using Azure OpenAI, then upsert to a custom index\n", - "that has:\n", - "- a primary key: chunk_id\n", - "- a text field: content\n", - "- a vector field: content_vector (dimension 1536, if using text-embedding-3-small)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create the search index" + "# Part 4: Create Azure Search index and push chunk embeddings\n", + "We'll define a vector index and store chunk embeddings in Azure AI Search." 
] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -346,12 +263,25 @@ } ], "source": [ - "VECTOR_DIM = 1536 # Adjust as needed for your embedding model\n", - "\n", - "index_client = SearchIndexClient(\n", - " AZURE_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_SEARCH_KEY)\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.search.documents.indexes import SearchIndexClient\n", + "from azure.search.documents.indexes.models import (\n", + " SearchIndex,\n", + " SearchField,\n", + " SearchFieldDataType,\n", + " SimpleField,\n", + " SearchableField,\n", + " VectorSearch,\n", + " HnswAlgorithmConfiguration,\n", + " VectorSearchProfile,\n", + " AzureOpenAIVectorizer,\n", + " AzureOpenAIVectorizerParameters,\n", ")\n", + "from azure.core.credentials import AzureKeyCredential\n", + "\n", + "VECTOR_DIM = 1536 # Adjust based on your chosen embeddings model\n", "\n", + "index_client = SearchIndexClient(AZURE_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_SEARCH_KEY))\n", "\n", "def create_search_index(index_name: str):\n", " fields = [\n", @@ -391,7 +321,11 @@ " ],\n", " )\n", "\n", - " new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n", + " new_index = SearchIndex(\n", + " name=index_name,\n", + " fields=fields,\n", + " vector_search=vector_search\n", + " )\n", "\n", " try:\n", " index_client.delete_index(index_name)\n", @@ -401,20 +335,19 @@ " index_client.create_or_update_index(new_index)\n", " console.print(f\"Index '{index_name}' created.\")\n", "\n", - "\n", - "create_search_index(AZURE_SEARCH_INDEX_NAME)" + "create_search_index(AZURE_SEARCH_INDEX_NAME)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## B) Generate embeddings & upsert" + "Embed chunks and upsert them into Azure AI Search" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -485,27 +418,23 @@ ], "source": [ "from 
openai import AzureOpenAI\n", + "from azure.search.documents import SearchClient\n", + "import uuid\n", "\n", - "search_client = SearchClient(\n", - " AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY)\n", - ")\n", + "search_client = SearchClient(AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY))\n", "openai_client = AzureOpenAI(\n", " api_key=AZURE_OPENAI_API_KEY,\n", " api_version=AZURE_OPENAI_API_VERSION,\n", " azure_endpoint=AZURE_OPENAI_ENDPOINT,\n", ")\n", "\n", - "\n", "def embed_text(text: str):\n", - " # Basic call to Azure OpenAI Embeddings\n", " response = openai_client.embeddings.create(\n", - " input=text, model=AZURE_OPENAI_EMBEDDINGS # or deployment name\n", + " input=text,\n", + " model=AZURE_OPENAI_EMBEDDINGS\n", " )\n", " return response.data[0].embedding\n", "\n", - "\n", - "import uuid\n", - "\n", "upload_docs = []\n", "for chunk_id, chunk_text in all_chunks:\n", " embedding_vector = embed_text(chunk_text)\n", @@ -517,28 +446,28 @@ " }\n", " )\n", "\n", - "# Upload in small batches\n", "BATCH_SIZE = 250\n", "for i in range(0, len(upload_docs), BATCH_SIZE):\n", " subset = upload_docs[i : i + BATCH_SIZE]\n", " resp = search_client.upload_documents(documents=subset)\n", " console.print(\n", - " f\"Uploaded batch {i} -> {i+len(subset)}; success: {resp[0].succeeded}, status code: {resp[0].status_code}\"\n", - ")\n", + " f\"Uploaded batch {i} -> {i+len(subset)}; success: {resp[0].succeeded}, status code: {resp[0].status_code}\"\n", + " )\n", "\n", - "console.print(\"All chunks uploaded to Azure Search.\")" + "console.print(\"All chunks uploaded to Azure Search.\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Part 5: RAG Query with Azure OpenAI" + "# Part 5: RAG Query with Azure OpenAI\n", + "Combine retrieval from Azure Search with Chat Completions (aka. 
grounding your LLM)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -546,9 +475,10 @@ "text/html": [ "
╭────────────────────────────────────────────────── RAG Prompt ───────────────────────────────────────────────────╮\n",
        "│                                                                                                                 │\n",
-       "│ You are an AI assistant helping summarize the State of AI 2024 PDF.                                             │\n",
+       "│ You are an AI assistant helping answering questions about the State of AI 2024 Report.                          │\n",
        "│ Use ONLY the text below to answer the user's question.                                                          │\n",
        "│ If the answer isn't in the text, say you don't know.                                                            │\n",
+       "│                                                                                                                 │\n",
        "│ Context:                                                                                                        │\n",
        "│ -NVIDIA remains the most powerful company in the world, enjoying a stint in the $3T club, while regulators      │\n",
        "│ probe the concentrations of power within GenAI.                                                                 │\n",
@@ -584,7 +514,18 @@
        "│ Strikingly, the average AI company that has reached $30M+ annualised revenue took just 20 months to get there,  │\n",
        "│ compared to 65 months for equally promising SaaS companies.                                                     │\n",
        "│ ---                                                                                                             │\n",
-       "│ A generative AI media company is investigated for its misuse during in the 2024 US election circuit.            │\n",
+       "│ In last year's report, we covered how the culture wars appeared to be slowly coming for AI, with the Gemini     │\n",
+       "│ 'woke AI' blow up fuelling the fires. Could the US presidential election signal a change in direction?          │\n",
+       "│ ● The 2024 Republican platform commits to repealing the AI executive order (EO), claiming it \"hinders AI        │\n",
+       "│ Innovation, and imposes Radical Leftwing ideas on the development of this technology\", attracting the support   │\n",
+       "│ of some big names in the Valley. It, however, makes no mention of the future of the US AISI.                    │\n",
+       "│ ● JD Vance is the first member of a presidential ticket to have apparently developed views on these issues,     │\n",
+       "│ having previously accused big tech companies of using AI safety as a vehicle for regulatory capture.            │\n",
+       "│ ● Meanwhile, Kamala Harris has said less on the subject. However, her remarks when she visited the UK for the   │\n",
+       "│ Bletchley Summit were widely interpreted as an implicit critique of the focus on safety questions at the        │\n",
+       "│ expense of ethics, echoing many UK civil society groups.                                                        │\n",
+       "│ ● Regardless of the fate of the EO, at a Congressional level, safety remains a bipartisan issue, with both      │\n",
+       "│ parties signing up to an AI policy roadmap in May.                                                              │\n",
        "│                                                                                                                 │\n",
        "│ Question: in 2024, AI companies reached how many $$$ in value?                                                  │\n",
        "│ Answer:                                                                                                         │\n",
@@ -595,9 +536,10 @@
       "text/plain": [
        "\u001b[1;31m╭─\u001b[0m\u001b[1;31m─────────────────────────────────────────────────\u001b[0m RAG Prompt \u001b[1;31m──────────────────────────────────────────────────\u001b[0m\u001b[1;31m─╮\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m                                                                                                               \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
-       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mYou are an AI assistant helping summarize the State of AI 2024 PDF.\u001b[0m\u001b[1;31m                                            \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
-       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mUse ONLY the text below to answer the user's question. \u001b[0m\u001b[1;31m                                                        \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mYou are an AI assistant helping answering questions about the State of AI 2024 Report.\u001b[0m\u001b[1;31m                         \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mUse ONLY the text below to answer the user's question.\u001b[0m\u001b[1;31m                                                         \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mIf the answer isn't in the text, say you don't know.\u001b[0m\u001b[1;31m                                                           \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m                                                                                                               \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mContext:\u001b[0m\u001b[1;31m                                                                                                       \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m-NVIDIA remains the most powerful company in the world, enjoying a stint in the $3T club, while regulators \u001b[0m\u001b[1;31m    \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mprobe the concentrations of power within GenAI.\u001b[0m\u001b[1;31m                                                                \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
@@ -633,7 +575,18 @@
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mStrikingly, the average AI company that has reached $30M+ annualised revenue took just 20 months to get there, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcompared to 65 months for equally promising SaaS companies.\u001b[0m\u001b[1;31m                                                    \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m                                                                                                            \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
-       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mA generative AI media company is investigated for its misuse during in the 2024 US election circuit.\u001b[0m\u001b[1;31m           \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mIn last year's report, we covered how the culture wars appeared to be slowly coming for AI, with the Gemini \u001b[0m\u001b[1;31m   \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m'woke AI' blow up fuelling the fires. Could the US presidential election signal a change in direction?\u001b[0m\u001b[1;31m         \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m● The 2024 Republican platform commits to repealing the AI executive order (EO), claiming it \"hinders AI \u001b[0m\u001b[1;31m      \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mInnovation, and imposes Radical Leftwing ideas on the development of this technology\", attracting the support \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mof some big names in the Valley. It, however, makes no mention of the future of the US AISI.\u001b[0m\u001b[1;31m                   \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m● JD Vance is the first member of a presidential ticket to have apparently developed views on these issues, \u001b[0m\u001b[1;31m   \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mhaving previously accused big tech companies of using AI safety as a vehicle for regulatory capture.\u001b[0m\u001b[1;31m           \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m● Meanwhile, Kamala Harris has said less on the subject. However, her remarks when she visited the UK for the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mBletchley Summit were widely interpreted as an implicit critique of the focus on safety questions at the \u001b[0m\u001b[1;31m      \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mexpense of ethics, echoing many UK civil society groups.\u001b[0m\u001b[1;31m                                                       \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m● Regardless of the fate of the EO, at a Congressional level, safety remains a bipartisan issue, with both \u001b[0m\u001b[1;31m    \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
+       "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mparties signing up to an AI policy roadmap in May.\u001b[0m\u001b[1;31m                                                             \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m                                                                                                               \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mQuestion: in 2024, AI companies reached how many $$$ in value?\u001b[0m\u001b[1;31m                                                 \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
        "\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mAnswer:\u001b[0m\u001b[1;31m                                                                                                        \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
@@ -663,35 +616,35 @@
     }
    ],
    "source": [
+    "from azure.search.documents.models import VectorizableTextQuery\n",
+    "\n",
     "def generate_chat_response(prompt: str, system_message: str = None):\n",
-    "    \"\"\"\n",
-    "    Basic Chat request to Azure OpenAI.\n",
-    "    In production, consider passing more parameters (temperature, presence_penalty, etc.).\n",
-    "    \"\"\"\n",
     "    messages = []\n",
     "    if system_message:\n",
     "        messages.append({\"role\": \"system\", \"content\": system_message})\n",
     "    messages.append({\"role\": \"user\", \"content\": prompt})\n",
     "\n",
     "    completion = openai_client.chat.completions.create(\n",
-    "        model=AZURE_OPENAI_CHAT_MODEL, messages=messages, temperature=0.7\n",
+    "        model=AZURE_OPENAI_CHAT_MODEL,\n",
+    "        messages=messages,\n",
+    "        temperature=0.7\n",
     "    )\n",
     "    return completion.choices[0].message.content\n",
     "\n",
-    "\n",
-    "# Example question\n",
-    "user_query = (\n",
-    "    \"in 2024, AI companies reached how many $$$ in value?\"\n",
-    ")\n",
+    "user_query = \"in 2024, AI companies reached how many $$$ in value?\"\n",
     "user_embed = embed_text(user_query)\n",
     "\n",
-    "# We'll use integrated vectorization to generate query embeddings in Azure AI Search\n",
     "vector_query = VectorizableTextQuery(\n",
-    "    text=user_query, k_nearest_neighbors=5, fields=\"content_vector\"\n",
+    "    text=user_query, # passing in text for a hybrid search\n",
+    "    k_nearest_neighbors=5,\n",
+    "    fields=\"content_vector\"\n",
     ")\n",
     "\n",
     "search_results = search_client.search(\n",
-    "    search_text=user_query, vector_queries=[vector_query], select=[\"content\"], top=10\n",
+    "    search_text=user_query,\n",
+    "    vector_queries=[vector_query],\n",
+    "    select=[\"content\"],\n",
+    "    top=10\n",
     ")\n",
     "\n",
     "retrieved_chunks = []\n",
@@ -699,13 +652,12 @@
     "    snippet = result[\"content\"]\n",
     "    retrieved_chunks.append(snippet)\n",
     "\n",
-    "# Combine retrieved chunks\n",
     "context_str = \"\\n---\\n\".join(retrieved_chunks)\n",
-    "\n",
     "rag_prompt = f\"\"\"\n",
-    "You are an AI assistant helping summarize the State of AI 2024 PDF.\n",
-    "Use ONLY the text below to answer the user's question. \n",
+    "You are an AI assistant helping answer questions about the State of AI 2024 Report.\n",
+    "Use ONLY the text below to answer the user's question.\n",
     "If the answer isn't in the text, say you don't know.\n",
+    "\n",
     "Context:\n",
     "{context_str}\n",
     "\n",

From df6201a1c080eefd2de60b8104a9cc188459a9e5 Mon Sep 17 00:00:00 2001
From: Farzad 
Date: Sat, 4 Jan 2025 17:52:07 -0600
Subject: [PATCH 3/9] mkdocs

---
 mkdocs.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mkdocs.yml b/mkdocs.yml
index 0428693c..ca682926 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -79,6 +79,7 @@ nav:
     - Chunking:
       - "Hybrid chunking": examples/hybrid_chunking.ipynb
     - RAG / QA:
+      - "RAG with Azure AI Search": examples/rag_azuresearch.ipynb
       - "RAG with Haystack": examples/rag_haystack.ipynb
       - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
       - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb

From 86c1fd2dae8644fbcbf456e26d5878c413e594eb Mon Sep 17 00:00:00 2001
From: Farzad 
Date: Sat, 4 Jan 2025 17:59:58 -0600
Subject: [PATCH 4/9] colab check

---
 docs/examples/rag_azuresearch.ipynb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb
index ec0b8352..51af90f2 100644
--- a/docs/examples/rag_azuresearch.ipynb
+++ b/docs/examples/rag_azuresearch.ipynb
@@ -7,6 +7,7 @@
    },
    "source": [
     "# Building a RAG System with Docling and Azure AI Search\n",
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/farzad528/docling/blob/tree/main/docs/examples/rag_azuresearch.ipynb)\n",
     "\n",
     "This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using:\n",
     "- [Docling](https://ds4sd.github.io/docling/) for document parsing and chunking\n",

From e582b887e278e5ef32e25e7f2133cf297b8b8ae8 Mon Sep 17 00:00:00 2001
From: Farzad 
Date: Sat, 4 Jan 2025 18:06:21 -0600
Subject: [PATCH 5/9] colab link fix

---
 docs/examples/rag_azuresearch.ipynb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb
index 51af90f2..ebd1cebf 100644
--- a/docs/examples/rag_azuresearch.ipynb
+++ b/docs/examples/rag_azuresearch.ipynb
@@ -7,7 +7,8 @@
    },
    "source": [
     "# Building a RAG System with Docling and Azure AI Search\n",
-    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/farzad528/docling/blob/tree/main/docs/examples/rag_azuresearch.ipynb)\n",
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/farzad528/docling/blob/main/docs/examples/rag_azuresearch.ipynb)\n",
+    "\n",
     "\n",
     "This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using:\n",
     "- [Docling](https://ds4sd.github.io/docling/) for document parsing and chunking\n",

From 674539c8fed19570e20398ff443d4ecbf63737ee Mon Sep 17 00:00:00 2001
From: Farzad 
Date: Fri, 10 Jan 2025 16:36:16 -0600
Subject: [PATCH 6/9] pr comments

---
 docs/examples/rag_azuresearch.ipynb | 80 +++++++++++++++++++----------
 1 file changed, 54 insertions(+), 26 deletions(-)

diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb
index ebd1cebf..cc4a9614 100644
--- a/docs/examples/rag_azuresearch.ipynb
+++ b/docs/examples/rag_azuresearch.ipynb
@@ -44,9 +44,10 @@
    "source": [
     "# Part 0: Prerequisites\n",
     " - Azure AI Search resource\n",
-    " - Azure OpenAI resource with deployed embeddings & chat models\n",
+    " - Azure OpenAI resource with a deployed embedding & chat completion model\n",
     " - Docling installed (Python 3.8+ environment)\n",
-    " - GPU or MPS recommended"
+    "\n",
+    "GPU or MPS usage can speed up Docling’s parsing (especially for large PDFs or when OCR/table extraction is needed). However, if no GPU is detected, you can comment out the following checks and proceed with CPU, albeit with slower performance."
    ]
   },
   {
@@ -65,7 +66,6 @@
    "source": [
     "import torch\n",
     "\n",
-    "# Check if GPU or MPS is available\n",
     "if torch.cuda.is_available():\n",
     "    device = torch.device(\"cuda\")\n",
     "    print(f\"CUDA GPU is enabled: {torch.cuda.get_device_name(0)}\")\n",
@@ -73,8 +73,10 @@
     "    device = torch.device(\"mps\")\n",
     "    print(\"MPS GPU is enabled.\")\n",
     "else:\n",
+    "    # Comment out the error if you'd like to allow CPU fallback\n",
+    "    # But be aware parsing could be slower\n",
     "    raise EnvironmentError(\n",
-    "        \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
+    "        \"No GPU or MPS device found. Proceed with CPU only if you understand the performance implications.\"\n",
     "    )"
    ]
   },
@@ -100,10 +102,10 @@
     "AZURE_SEARCH_KEY = os.getenv(\"AZURE_SEARCH_KEY\")\n",
     "AZURE_SEARCH_INDEX_NAME = os.getenv(\"AZURE_SEARCH_INDEX_NAME\")\n",
     "AZURE_OPENAI_ENDPOINT = os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n",
-    "AZURE_OPENAI_API_KEY = os.getenv(\"AZURE_OPENAI_API_KEY\")\n",
-    "AZURE_OPENAI_CHAT_MODEL = os.getenv(\"AZURE_OPENAI_CHAT_MODEL\")\n",
-    "AZURE_OPENAI_API_VERSION = os.getenv(\"AZURE_OPENAI_API_VERSION\")\n",
-    "AZURE_OPENAI_EMBEDDINGS = os.getenv(\"AZURE_OPENAI_EMBEDDINGS\")"
+    "AZURE_OPENAI_API_KEY = os.getenv(\"AZURE_OPENAI_API_KEY\") # Ensure this is your Admin Key\n",
+    "AZURE_OPENAI_CHAT_MODEL = os.getenv(\"AZURE_OPENAI_CHAT_MODEL\") # Using a deployed model named \"gpt-4o\"\n",
+    "AZURE_OPENAI_API_VERSION = os.getenv(\"AZURE_OPENAI_API_VERSION\", \"2024-10-21\")\n",
+    "AZURE_OPENAI_EMBEDDINGS = os.getenv(\"AZURE_OPENAI_EMBEDDINGS\") # Using a deployed model named \"text-embedding-3-small\""
    ]
   },
   {
@@ -206,7 +208,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -316,13 +318,14 @@
     "                parameters=AzureOpenAIVectorizerParameters(\n",
     "                    resource_url=AZURE_OPENAI_ENDPOINT,\n",
     "                    deployment_name=AZURE_OPENAI_EMBEDDINGS,\n",
-    "                    model_name=\"text-embedding-3-small\",\n",
+    "                    model_name=\"text-embedding-3-small\", # same as the environment variable \n",
     "                    api_key=AZURE_OPENAI_API_KEY,\n",
     "                ),\n",
     "            )\n",
     "        ],\n",
     "    )\n",
-    "\n",
+    "    \n",
+    "    # Define the new index with the fields and vector search config above\n",
     "    new_index = SearchIndex(\n",
     "        name=index_name,\n",
     "        fields=fields,\n",
@@ -349,17 +352,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
-       "
Uploaded batch 0 -> 250; success: True, status code: 201\n",
+       "
Uploaded batch 0 -> 250; all_succeeded: True, first_doc_status_code: 201\n",
        "
\n" ], "text/plain": [ - "Uploaded batch \u001b[1;36m0\u001b[0m -> \u001b[1;36m250\u001b[0m; success: \u001b[3;92mTrue\u001b[0m, status code: \u001b[1;36m201\u001b[0m\n" + "Uploaded batch \u001b[1;36m0\u001b[0m -> \u001b[1;36m250\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n" ] }, "metadata": {}, @@ -368,11 +371,11 @@ { "data": { "text/html": [ - "
Uploaded batch 250 -> 500; success: True, status code: 201\n",
+       "
Uploaded batch 250 -> 500; all_succeeded: True, first_doc_status_code: 201\n",
        "
\n" ], "text/plain": [ - "Uploaded batch \u001b[1;36m250\u001b[0m -> \u001b[1;36m500\u001b[0m; success: \u001b[3;92mTrue\u001b[0m, status code: \u001b[1;36m201\u001b[0m\n" + "Uploaded batch \u001b[1;36m250\u001b[0m -> \u001b[1;36m500\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n" ] }, "metadata": {}, @@ -381,11 +384,11 @@ { "data": { "text/html": [ - "
Uploaded batch 500 -> 750; success: True, status code: 201\n",
+       "
Uploaded batch 500 -> 750; all_succeeded: True, first_doc_status_code: 201\n",
        "
\n" ], "text/plain": [ - "Uploaded batch \u001b[1;36m500\u001b[0m -> \u001b[1;36m750\u001b[0m; success: \u001b[3;92mTrue\u001b[0m, status code: \u001b[1;36m201\u001b[0m\n" + "Uploaded batch \u001b[1;36m500\u001b[0m -> \u001b[1;36m750\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n" ] }, "metadata": {}, @@ -394,11 +397,11 @@ { "data": { "text/html": [ - "
Uploaded batch 750 -> 966; success: True, status code: 201\n",
+       "
Uploaded batch 750 -> 966; all_succeeded: True, first_doc_status_code: 201\n",
        "
\n" ], "text/plain": [ - "Uploaded batch \u001b[1;36m750\u001b[0m -> \u001b[1;36m966\u001b[0m; success: \u001b[3;92mTrue\u001b[0m, status code: \u001b[1;36m201\u001b[0m\n" + "Uploaded batch \u001b[1;36m750\u001b[0m -> \u001b[1;36m966\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n" ] }, "metadata": {}, @@ -431,6 +434,9 @@ ")\n", "\n", "def embed_text(text: str):\n", + " \"\"\"\n", + " Helper to generate embeddings with Azure OpenAI.\n", + " \"\"\"\n", " response = openai_client.embeddings.create(\n", " input=text,\n", " model=AZURE_OPENAI_EMBEDDINGS\n", @@ -438,22 +444,26 @@ " return response.data[0].embedding\n", "\n", "upload_docs = []\n", - "for chunk_id, chunk_text in all_chunks:\n", + "for (chunk_id, chunk_text) in all_chunks:\n", " embedding_vector = embed_text(chunk_text)\n", " upload_docs.append(\n", " {\n", - " \"chunk_id\": str(uuid.uuid4()),\n", + " \"chunk_id\": chunk_id,\n", " \"content\": chunk_text,\n", " \"content_vector\": embedding_vector,\n", " }\n", " )\n", "\n", + "\n", "BATCH_SIZE = 250\n", "for i in range(0, len(upload_docs), BATCH_SIZE):\n", " subset = upload_docs[i : i + BATCH_SIZE]\n", " resp = search_client.upload_documents(documents=subset)\n", + "\n", + " all_succeeded = all(r.succeeded for r in resp)\n", " console.print(\n", - " f\"Uploaded batch {i} -> {i+len(subset)}; success: {resp[0].succeeded}, status code: {resp[0].status_code}\"\n", + " f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n", + " f\"first_doc_status_code: {resp[0].status_code}\"\n", " )\n", "\n", "console.print(\"All chunks uploaded to Azure Search.\")\n" @@ -621,6 +631,11 @@ "from azure.search.documents.models import VectorizableTextQuery\n", "\n", "def generate_chat_response(prompt: str, system_message: str = None):\n", + " \"\"\"\n", + " Generates a single-turn chat response using Azure OpenAI Chat.\n", + " If you need multi-turn conversation or follow-up queries, you'll have to\n", 
+ " maintain the messages list externally.\n", + " \"\"\"\n", " messages = []\n", " if system_message:\n", " messages.append({\"role\": \"system\", \"content\": system_message})\n", @@ -672,6 +687,19 @@ "console.print(Panel(rag_prompt, title=\"RAG Prompt\", style=\"bold red\"))\n", "console.print(Panel(final_answer, title=\"RAG Response\", style=\"bold green\"))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset Citation\n", + "\n", + "**State of AI Report 2024** \n", + "Benaich, N. & Air Street Capital. (2024). *State of AI Report 2024*. \n", + "Licensed under [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/). \n", + "Available at: [STATE OF AI REPORT 2024](https://www.stateof.ai/)\n", + "\n" + ] } ], "metadata": { @@ -681,9 +709,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3.10 - SDK v2", + "display_name": "Python 3", "language": "python", - "name": "python310-sdkv2" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -695,7 +723,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.11.9" } }, "nbformat": 4, From 43f87971ec4e2d2097aea9f2390451e16363a254 Mon Sep 17 00:00:00 2001 From: Farzad Date: Fri, 10 Jan 2025 16:45:18 -0600 Subject: [PATCH 7/9] title change --- docs/examples/rag_azuresearch.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb index cc4a9614..841fb6fa 100644 --- a/docs/examples/rag_azuresearch.ipynb +++ b/docs/examples/rag_azuresearch.ipynb @@ -6,7 +6,7 @@ "id": "Ag9kcX2B_atc" }, "source": [ - "# Building a RAG System with Docling and Azure AI Search\n", + "# RAG with Azure AI Search\n", "[![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/farzad528/docling/blob/main/docs/examples/rag_azuresearch.ipynb)\n", "\n", "\n", From 64b608f13f79c1b8b3334ea90ed18d1b28436903 Mon Sep 17 00:00:00 2001 From: Farzad Date: Fri, 10 Jan 2025 16:47:49 -0600 Subject: [PATCH 8/9] table --- docs/examples/rag_azuresearch.ipynb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb index 841fb6fa..f2a760c7 100644 --- a/docs/examples/rag_azuresearch.ipynb +++ b/docs/examples/rag_azuresearch.ipynb @@ -9,6 +9,12 @@ "# RAG with Azure AI Search\n", "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/farzad528/docling/blob/main/docs/examples/rag_azuresearch.ipynb)\n", "\n", + "| Step | Tech | Execution |\n", + "| ------------------ | ------------------ | --------- |\n", + "| Embedding | Azure OpenAI | 🌐 Remote |\n", + "| Vector Store | Azure AI Search | 🌐 Remote |\n", + "| Gen AI | Azure OpenAI GPT-4o | 🌐 Remote |\n", + "\n", "\n", "This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using:\n", "- [Docling](https://ds4sd.github.io/docling/) for document parsing and chunking\n", From c54ee398d16e2754d258e3ce2455f456999219f9 Mon Sep 17 00:00:00 2001 From: Farzad Date: Fri, 10 Jan 2025 16:53:24 -0600 Subject: [PATCH 9/9] rename to Azure OpenAI --- docs/examples/rag_azuresearch.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb index f2a760c7..f4408d9d 100644 --- a/docs/examples/rag_azuresearch.ipynb +++ b/docs/examples/rag_azuresearch.ipynb @@ -13,7 +13,7 @@ "| ------------------ | ------------------ | --------- |\n", "| Embedding | Azure OpenAI | 🌐 Remote |\n", "| Vector Store | Azure AI Search | 🌐 Remote |\n", - "| Gen AI | Azure OpenAI GPT-4o | 🌐 Remote |\n", 
+ "| Gen AI | Azure OpenAI | 🌐 Remote |\n", "\n", "\n", "This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using:\n",