diff --git a/examples/notebooks/intro/dpk_intro_1_python.ipynb b/examples/notebooks/intro/dpk_intro_1_python.ipynb
index f3659afcf..ab7cda854 100644
--- a/examples/notebooks/intro/dpk_intro_1_python.ipynb
+++ b/examples/notebooks/intro/dpk_intro_1_python.ipynb
@@ -13,7 +13,7 @@
     "\n",
     "Here is the workflow\n",
     "\n",
-    "![](https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n"
+    "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n"
    ]
   },
   {
@@ -27,7 +27,7 @@
     "\n",
     "Two options:\n",
     "\n",
-    "- **Option 1 - Google Colab:** easiest option.  no setup required.  Click this link to open this on google colab.  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n",
+    "- **Option 1 - Google Colab:** Easiest option.  No setup required.  Click this link to open this notebook in Google Colab.  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n",
     "- **Option 2 - Local python dev environment:**  Setup using this [guide](../../../README.md#-getting-started)\n",
     "\n",
     "The notebook will work as in both environments"
@@ -42,10 +42,10 @@
    "source": [
     "## Step-1: Inspect the Data\n",
     "\n",
-    "We will use simple PDFs about Solar system.  The files are [here](https://github.com/sujee/data-prep-kit/tree/intro-example1/examples/notebooks/intro/input/solar-system)\n",
+    "We will use simple PDFs about the Solar System.  The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n",
     "\n",
-    "- [earth.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf)\n",
-    "- [mars.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf)\n"
+    "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n",
+    "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n"
    ]
   },
   {
@@ -118,9 +118,9 @@
    "source": [
     "if RUNNING_IN_COLAB:\n",
     "    !mkdir -p 'input/solar-system'\n",
-    "    !wget -O 'input/solar-system/earth.pdf'  'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf'\n",
-    "    !wget -O 'input/solar-system/mars.pdf'  'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf'\n",
-    "    !wget -O 'my_utils.py'  'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/my_utils.py'"
+    "    !wget -O 'input/solar-system/earth.pdf'  'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n",
+    "    !wget -O 'input/solar-system/mars.pdf'  'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n",
+    "    !wget -O 'my_utils.py'  'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'"
    ]
   },
   {
diff --git a/examples/notebooks/intro/dpk_intro_1_ray.ipynb b/examples/notebooks/intro/dpk_intro_1_ray.ipynb
index da33a3499..b2feb9135 100644
--- a/examples/notebooks/intro/dpk_intro_1_ray.ipynb
+++ b/examples/notebooks/intro/dpk_intro_1_ray.ipynb
@@ -1,4358 +1,4359 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866",
-      "metadata": {
-        "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866"
-      },
-      "source": [
-        "# Data Prep Kit Demo 1 - Ray Version\n",
-        "\n",
-        "This notebook will introduce DPK and showcase some of it's capabilities.\n",
-        "\n",
-        "Here is the workflow\n",
-        "\n",
-        "![](https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "b15976e3",
-      "metadata": {
-        "id": "b15976e3"
-      },
-      "source": [
-        "## How to run this notebook\n",
-        "\n",
-        "Two options:\n",
-        "\n",
-        "- **Option 1 - Google Colab:** easiest option.  no setup required.  Click this link to open this on google colab.  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n",
-        "- **Option 2 - Local python dev environment:**  Setup using this [guide](../../../README.md#-getting-started)\n",
-        "\n",
-        "The notebook will work as in both environments"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "eb8b0d5c",
-      "metadata": {
-        "id": "eb8b0d5c"
-      },
-      "source": [
-        "## Step-1: Inspect the Data\n",
-        "\n",
-        "We will use simple PDFs about Solar system.  The files are [here](https://github.com/sujee/data-prep-kit/tree/intro-example1/examples/notebooks/intro/input/solar-system)\n",
-        "\n",
-        "- [earth.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf)\n",
-        "- [mars.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf)\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "39a0ab6e",
-      "metadata": {
-        "id": "39a0ab6e"
-      },
-      "source": [
-        "## Step-2: Figure out Runtime Environment\n",
-        "\n",
-        "### 2.1 - Determine runtime\n",
-        "\n",
-        "Determine if we are running on Google colab or local python environment"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "id": "1fe354b7",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "1fe354b7",
-        "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "NOT in Colab\n"
-          ]
-        }
-      ],
-      "source": [
-        "import os\n",
-        "\n",
-        "if os.getenv(\"COLAB_RELEASE_TAG\"):\n",
-        "   print(\"Running in Colab\")\n",
-        "   RUNNING_IN_COLAB = True\n",
-        "else:\n",
-        "   print(\"NOT in Colab\")\n",
-        "   RUNNING_IN_COLAB = False"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "8e7c104b",
-      "metadata": {
-        "id": "8e7c104b"
-      },
-      "source": [
-        "### 2.2 -Download Data if running on Google Colab"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 2,
-      "id": "3309799e",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "3309799e",
-        "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c"
-      },
-      "outputs": [],
-      "source": [
-        "if RUNNING_IN_COLAB:\n",
-        "    !mkdir -p 'input/solar-system'\n",
-        "    !wget -O 'input/solar-system/earth.pdf'  'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf'\n",
-        "    !wget -O 'input/solar-system/mars.pdf'  'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf'\n",
-        "    !wget -O 'my_utils.py'  'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/my_utils.py'"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "a5dc2b68",
-      "metadata": {
-        "id": "a5dc2b68"
-      },
-      "source": [
-        "### 2.3 - Install dependencies if running on Google Colab"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 3,
-      "id": "1fcec577",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 1000
-        },
-        "id": "1fcec577",
-        "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10"
-      },
-      "outputs": [],
-      "source": [
-        "if RUNNING_IN_COLAB:\n",
-        "    ! pip install  --default-timeout=100  \\\n",
-        "        data-prep-toolkit==0.2.1 \\\n",
-        "        data-prep-toolkit-transforms==0.2.1 \\\n",
-        "        data-prep-toolkit-transforms-ray==0.2.1 \\\n",
-        "        deepsearch-toolkit"
-      ]
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866",
+   "metadata": {
+    "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866"
+   },
+   "source": [
+    "# Data Prep Kit Demo 1 - Ray Version\n",
+    "\n",
+    "This notebook will introduce DPK and showcase some of its capabilities.\n",
+    "\n",
+    "Here is the workflow\n",
+    "\n",
+    "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b15976e3",
+   "metadata": {
+    "id": "b15976e3"
+   },
+   "source": [
+    "## How to run this notebook\n",
+    "\n",
+    "Two options:\n",
+    "\n",
+    "- **Option 1 - Google Colab:** Easiest option.  No setup required.  Click this link to open this notebook in Google Colab.  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n",
+    "- **Option 2 - Local python dev environment:**  Setup using this [guide](../../../README.md#-getting-started)\n",
+    "\n",
+    "The notebook will work as is in both environments"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb8b0d5c",
+   "metadata": {
+    "id": "eb8b0d5c"
+   },
+   "source": [
+    "## Step-1: Inspect the Data\n",
+    "\n",
+    "We will use simple PDFs about the Solar System.  The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n",
+    "\n",
+    "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n",
+    "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39a0ab6e",
+   "metadata": {
+    "id": "39a0ab6e"
+   },
+   "source": [
+    "## Step-2: Figure out Runtime Environment\n",
+    "\n",
+    "### 2.1 - Determine runtime\n",
+    "\n",
+    "Determine whether we are running on Google Colab or in a local Python environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1fe354b7",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "1fe354b7",
+    "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "243322b8",
-      "metadata": {
-        "id": "243322b8"
-      },
-      "source": [
-        "### 2.4 - Restart Runtime\n",
-        "\n",
-        "After installing dependencies, be sure <font color=\"red\">restart runtime</font>, so libraries will be loaded\n",
-        "\n",
-        "You do this by going to **`Runtime --> Restart Session`**\n",
-        "\n",
-        "Then you can continue to the next step (no need to re-run the notebook)"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "NOT in Colab\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "if os.getenv(\"COLAB_RELEASE_TAG\"):\n",
+    "   print(\"Running in Colab\")\n",
+    "   RUNNING_IN_COLAB = True\n",
+    "else:\n",
+    "   print(\"NOT in Colab\")\n",
+    "   RUNNING_IN_COLAB = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e7c104b",
+   "metadata": {
+    "id": "8e7c104b"
+   },
+   "source": [
+    "### 2.2 - Download Data if running on Google Colab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3309799e",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
-    {
-      "cell_type": "markdown",
-      "id": "e8b10be1",
-      "metadata": {
-        "id": "e8b10be1"
-      },
-      "source": [
-        "## Step-2: Configuration"
-      ]
+    "id": "3309799e",
+    "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c"
+   },
+   "outputs": [],
+   "source": [
+    "if RUNNING_IN_COLAB:\n",
+    "    !mkdir -p 'input/solar-system'\n",
+    "    !wget -O 'input/solar-system/earth.pdf'  'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n",
+    "    !wget -O 'input/solar-system/mars.pdf'  'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n",
+    "    !wget -O 'my_utils.py'  'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5dc2b68",
+   "metadata": {
+    "id": "a5dc2b68"
+   },
+   "source": [
+    "### 2.3 - Install dependencies if running on Google Colab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1fcec577",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 1000
     },
-    {
-      "cell_type": "markdown",
-      "id": "356c66f7",
-      "metadata": {
-        "id": "356c66f7"
-      },
-      "source": [
-        "### 2.1 - Basic Config"
-      ]
+    "id": "1fcec577",
+    "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10"
+   },
+   "outputs": [],
+   "source": [
+    "if RUNNING_IN_COLAB:\n",
+    "    ! pip install  --default-timeout=100  \\\n",
+    "        data-prep-toolkit==0.2.1 \\\n",
+    "        data-prep-toolkit-transforms==0.2.1 \\\n",
+    "        data-prep-toolkit-transforms-ray==0.2.1 \\\n",
+    "        deepsearch-toolkit"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "243322b8",
+   "metadata": {
+    "id": "243322b8"
+   },
+   "source": [
+    "### 2.4 - Restart Runtime\n",
+    "\n",
+    "After installing dependencies, be sure to <font color=\"red\">restart the runtime</font>, so the libraries will be loaded\n",
+    "\n",
+    "You do this by going to **`Runtime --> Restart Session`**\n",
+    "\n",
+    "Then you can continue to the next step (no need to re-run the notebook)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e8b10be1",
+   "metadata": {
+    "id": "e8b10be1"
+   },
+   "source": [
+    "## Step-2: Configuration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "356c66f7",
+   "metadata": {
+    "id": "356c66f7"
+   },
+   "source": [
+    "### 2.1 - Basic Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e4YMZrBuFycl",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "e4YMZrBuFycl",
+    "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 4,
-      "id": "e4YMZrBuFycl",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "e4YMZrBuFycl",
-        "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "NOT in Colab\n"
-          ]
-        }
-      ],
-      "source": [
-        "import os\n",
-        "\n",
-        "if os.getenv(\"COLAB_RELEASE_TAG\"):\n",
-        "   print(\"Running in Colab\")\n",
-        "   RUNNING_IN_COLAB = True\n",
-        "else:\n",
-        "   print(\"NOT in Colab\")\n",
-        "   RUNNING_IN_COLAB = False"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "NOT in Colab\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "if os.getenv(\"COLAB_RELEASE_TAG\"):\n",
+    "   print(\"Running in Colab\")\n",
+    "   RUNNING_IN_COLAB = True\n",
+    "else:\n",
+    "   print(\"NOT in Colab\")\n",
+    "   RUNNING_IN_COLAB = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "33345487",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "33345487",
+    "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 5,
-      "id": "33345487",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "33345487",
-        "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n",
-            "MY_CONFIG.RAY_NUM_CPUS: 0.8\n",
-            "MY_CONFIG.RAY_MEMORY_GB: 2\n"
-          ]
-        }
-      ],
-      "source": [
-        "import os\n",
-        "\n",
-        "## Configuration\n",
-        "class MyConfig:\n",
-        "    pass\n",
-        "\n",
-        "MY_CONFIG = MyConfig ()\n",
-        "\n",
-        "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n",
-        "\n",
-        "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n",
-        "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n",
-        "\n",
-        "## Embedding model\n",
-        "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n",
-        "\n",
-        "## RAY CONFIGURATION\n",
-        "### For local runs, we can use more parallelism\n",
-        "### For google colab, be conservative\n",
-        "\n",
-        "if RUNNING_IN_COLAB:\n",
-        "  MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n",
-        "  MY_CONFIG.RAY_NUM_CPUS =  0.3\n",
-        "  MY_CONFIG.RAY_MEMORY_GB = 2  # GB\n",
-        "else:  # local run\n",
-        "  num_cpus_available =  os.cpu_count()\n",
-        "  # print (num_cpus_available)\n",
-        "\n",
-        "  MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n",
-        "  MY_CONFIG.RAY_NUM_CPUS =  0.8\n",
-        "  MY_CONFIG.RAY_MEMORY_GB = 2  # GB\n",
-        "  # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n",
-        "\n",
-        "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n",
-        "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n",
-        "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n",
+      "MY_CONFIG.RAY_NUM_CPUS: 0.8\n",
+      "MY_CONFIG.RAY_MEMORY_GB: 2\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "## Configuration\n",
+    "class MyConfig:\n",
+    "    pass\n",
+    "\n",
+    "MY_CONFIG = MyConfig ()\n",
+    "\n",
+    "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n",
+    "\n",
+    "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n",
+    "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n",
+    "\n",
+    "## Embedding model\n",
+    "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n",
+    "\n",
+    "## RAY CONFIGURATION\n",
+    "### For local runs, we can use more parallelism\n",
+    "### For google colab, be conservative\n",
+    "\n",
+    "if RUNNING_IN_COLAB:\n",
+    "  MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n",
+    "  MY_CONFIG.RAY_NUM_CPUS =  0.3\n",
+    "  MY_CONFIG.RAY_MEMORY_GB = 2  # GB\n",
+    "else:  # local run\n",
+    "  num_cpus_available =  os.cpu_count()\n",
+    "  # print (num_cpus_available)\n",
+    "\n",
+    "  MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n",
+    "  MY_CONFIG.RAY_NUM_CPUS =  0.8\n",
+    "  MY_CONFIG.RAY_MEMORY_GB = 2  # GB\n",
+    "  # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n",
+    "\n",
+    "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n",
+    "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n",
+    "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b15e6827",
+   "metadata": {
+    "id": "b15e6827"
+   },
+   "outputs": [],
+   "source": [
+    "## Add parent dir to path\n",
+    "import os,sys\n",
+    "\n",
+    "this_dir = os.path.abspath('')\n",
+    "parent_dir = os.path.dirname(this_dir)\n",
+    "sys.path.append (os.path.abspath (parent_dir))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "72510ae6-48b0-4b88-9e13-a623281c3a63",
+   "metadata": {
+    "id": "72510ae6-48b0-4b88-9e13-a623281c3a63"
+   },
+   "source": [
+    "### 2.2 - Setup input/output directories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "60ac8bee-0960-4309-b225-d7a211b14262",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "60ac8bee-0960-4309-b225-d7a211b14262",
+    "outputId": "ec5beb05-027a-49eb-9a96-271471619d81"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 6,
-      "id": "b15e6827",
-      "metadata": {
-        "id": "b15e6827"
-      },
-      "outputs": [],
-      "source": [
-        "## Add parent dir to path\n",
-        "import os,sys\n",
-        "\n",
-        "this_dir = os.path.abspath('')\n",
-        "parent_dir = os.path.dirname(this_dir)\n",
-        "sys.path.append (os.path.abspath (parent_dir))"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Cleared output directory\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os, sys\n",
+    "import shutil\n",
+    "\n",
+    "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n",
+    "    raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n",
+    "\n",
+    "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n",
+    "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n",
+    "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n",
+    "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n",
+    "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n",
+    "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n",
+    "\n",
+    "## clear output folder\n",
+    "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n",
+    "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n",
+    "\n",
+    "print (\"✅ Cleared output directory\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb",
+   "metadata": {
+    "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb"
+   },
+   "source": [
+    "## Step-3: pdf2parquet -  Convert data from PDF to Parquet\n",
+    "\n",
+    "This step reads the input folder containing all PDF files and ingests them into a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n",
+    "The documents are converted into a JSON format, which makes them easy to chunk in the later steps.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a",
+   "metadata": {
+    "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a"
+   },
+   "source": [
+    "### 3.1 - Set Input/output Folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "482605b2-d814-456d-9195-49a2ec454ef0",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "482605b2-d814-456d-9195-49a2ec454ef0",
+    "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "72510ae6-48b0-4b88-9e13-a623281c3a63",
-      "metadata": {
-        "id": "72510ae6-48b0-4b88-9e13-a623281c3a63"
-      },
-      "source": [
-        "### 2.2 - Setup input/outpur directories"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🏃🏼 STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n"
+     ]
+    }
+   ],
+   "source": [
+    "STAGE = 1\n",
+    "\n",
+    "input_folder = MY_CONFIG.INPUT_DATA_DIR\n",
+    "output_folder =  output_parquet_dir\n",
+    "\n",
+    "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b",
+   "metadata": {
+    "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b"
+   },
+   "source": [
+    "### 3.2 - Execute"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26",
+    "outputId": "14a36e73-a186-4431-a755-f46ccb691130"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 7,
-      "id": "60ac8bee-0960-4309-b225-d7a211b14262",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "60ac8bee-0960-4309-b225-d7a211b14262",
-        "outputId": "ec5beb05-027a-49eb-9a96-271471619d81"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "✅ Cleared output directory\n"
-          ]
-        }
-      ],
-      "source": [
-        "import os, sys\n",
-        "import shutil\n",
-        "\n",
-        "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n",
-        "    raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n",
-        "\n",
-        "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n",
-        "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n",
-        "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n",
-        "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n",
-        "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n",
-        "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n",
-        "\n",
-        "## clear output folder\n",
-        "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n",
-        "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n",
-        "\n",
-        "print (\"✅ Cleared output directory\")"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': <pdf2parquet_contents_types.JSON: 'application/json'>, 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n",
+      "13:30:44 INFO - pipeline id pipeline_id\n",
+      "13:30:44 INFO - code location None\n",
+      "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n",
+      "13:30:44 INFO - actor creation delay 0\n",
+      "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n",
+      "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n",
+      "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n",
+      "13:30:44 INFO - Running locally\n",
+      "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
+      "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n",
+      "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n",
+      "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n",
+      "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n",
+      "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
+      "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n",
+      "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 110376.42it/s]\n",
+      "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n",
+      "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n",
+      "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n",
+      "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n",
+      "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 73713.60it/s]\n",
+      "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n",
+      "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb",
-      "metadata": {
-        "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb"
-      },
-      "source": [
-        "## Step-3: pdf2parquet -  Convert data from PDF to Parquet\n",
-        "\n",
-        "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n",
-        "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n",
-        "\n"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Stage:1 completed successfully\n",
+      "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n",
+      "Wall time: 31.1 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "import ast\n",
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "from pdf2parquet_transform import (\n",
+    "    pdf2parquet_contents_type_cli_param,\n",
+    "    pdf2parquet_contents_types,\n",
+    ")\n",
+    "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
+    "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n",
+    "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n",
+    "\n",
+    "from data_processing.utils import GB, ParamsUtils\n",
+    "\n",
+    "\n",
+    "# create parameters\n",
+    "local_conf = {\n",
+    "    \"input_folder\": input_folder,\n",
+    "    \"output_folder\": output_folder,\n",
+    "}\n",
+    "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": MY_CONFIG.RAY_MEMORY_GB * GB}\n",
+    "ingest_config = {\n",
+    "    pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n",
+    "}\n",
+    "\n",
+    "params = {\n",
+    "    # where to run\n",
+    "    \"run_locally\": True,\n",
+    "    # Data access. Only required parameters are specified\n",
+    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+    "    \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n",
+    "    # orchestrator\n",
+    "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
+    "    \"runtime_num_workers\": 1,  # so model download to cleanup works properly\n",
+    "\n",
+    "}\n",
+    "\n",
+    "\n",
+    "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n",
+    "# create launcher\n",
+    "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n",
+    "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n",
+    "# launch\n",
+    "return_code = launcher.launch()\n",
+    "\n",
+    "if return_code == 0:\n",
+    "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
+    "else:\n",
+    "    raise Exception (\"❌ Ray job failed\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5ca790e0",
+   "metadata": {
+    "id": "5ca790e0"
+   },
+   "source": [
+    "### 3.3 - Inspect Generated output\n",
+    "\n",
+    "Here we should see one entry per input file processed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "fe59563d",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 255
     },
+    "id": "fe59563d",
+    "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a",
-      "metadata": {
-        "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a"
-      },
-      "source": [
-        "### 3.1 - Set Input/output Folder"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Output dimensions (rows x columns)=  (2, 12)\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 8,
-      "id": "482605b2-d814-456d-9195-49a2ec454ef0",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "482605b2-d814-456d-9195-49a2ec454ef0",
-        "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "🏃🏼 STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>contents</th>\n",
+       "      <th>num_pages</th>\n",
+       "      <th>num_tables</th>\n",
+       "      <th>num_doc_elements</th>\n",
+       "      <th>document_id</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>hash</th>\n",
+       "      <th>size</th>\n",
+       "      <th>date_acquired</th>\n",
+       "      <th>pdf_convert_time</th>\n",
+       "      <th>source_filename</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>{\"_name\":\"\",\"type\":\"pdf-document\",\"description...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>{\"_name\":\"\",\"type\":\"pdf-document\",\"description...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "STAGE = 1\n",
-        "\n",
-        "input_folder = MY_CONFIG.INPUT_DATA_DIR\n",
-        "output_folder =  output_parquet_dir\n",
-        "\n",
-        "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+      "text/plain": [
+       "    filename                                           contents  num_pages  \\\n",
+       "0   mars.pdf  {\"_name\":\"\",\"type\":\"pdf-document\",\"description...          1   \n",
+       "1  earth.pdf  {\"_name\":\"\",\"type\":\"pdf-document\",\"description...          1   \n",
+       "\n",
+       "   num_tables  num_doc_elements                           document_id  ext  \\\n",
+       "0           0                11  62e5639f-f922-4ccc-a041-3cb02f1cfd83  pdf   \n",
+       "1           0                11  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1  pdf   \n",
+       "\n",
+       "                                                hash  size  \\\n",
+       "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "1  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "\n",
+       "                date_acquired  pdf_convert_time source_filename  \n",
+       "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf  \n",
+       "1  2024-10-18T13:30:59.494027          2.015123       earth.pdf  "
       ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from my_utils import read_parquet_files_as_df\n",
+    "\n",
+    "output_df = read_parquet_files_as_df(output_folder)\n",
+    "\n",
+    "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n",
+    "\n",
+    "output_df.head(5)\n",
+    "\n",
+    "## To display certain columns\n",
+    "#output_df[['column1', 'column2', 'column3']].head(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e5058a21",
+   "metadata": {
+    "id": "e5058a21"
+   },
+   "source": [
+    "\n",
+    "### 3.4 - Understand the output\n",
+    "\n",
+    "Here are some interesting attributes to note:\n",
+    "\n",
+    "- **filename** : original filename\n",
+    "- **contents** : extracted document content, stored as a JSON string\n",
+    "- **document_id**: unique id (UUID) assigned to this document\n",
+    "- **hash** : hash of document\n",
+    "- **pdf_convert_time** : time to convert this pdf in seconds\n",
+    "\n",
+    "Let's inspect the **contents** column.  See how the text is being divided up!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "f870e624",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "f870e624",
+    "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b",
-      "metadata": {
-        "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b"
-      },
-      "source": [
-        "### 3.2 - Execute"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'_name': '',\n",
+      " 'description': {'logs': []},\n",
+      " 'equations': [],\n",
+      " 'figures': [],\n",
+      " 'file-info': {'#-pages': 1,\n",
+      "               'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n",
+      "               'filename': 'mars.pdf',\n",
+      "               'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n",
+      "                                'model': 'default',\n",
+      "                                'page': 1}]},\n",
+      " 'footnotes': [],\n",
+      " 'main-text': [{'name': 'Section-header',\n",
+      "                'prov': [{'bbox': [133.35137939,\n",
+      "                                   654.45184326,\n",
+      "                                   169.88169861,\n",
+      "                                   667.98492432],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 4]}],\n",
+      "                'text': 'Mars',\n",
+      "                'type': 'subtitle-level-1'},\n",
+      "               {'name': 'Section-header',\n",
+      "                'prov': [{'bbox': [133.09541321,\n",
+      "                                   630.68127441,\n",
+      "                                   210.66503906,\n",
+      "                                   642.34405518],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 12]}],\n",
+      "                'text': 'Solar System',\n",
+      "                'type': 'subtitle-level-1'},\n",
+      "               {'name': 'Text',\n",
+      "                'prov': [{'bbox': [132.84518433,\n",
+      "                                   588.96014404,\n",
+      "                                   479.40917969,\n",
+      "                                   623.02520752],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 205]}],\n",
+      "                'text': 'Our solar system is a vast and fascinating expanse, '\n",
+      "                        'comprising eight planets, five dwarf planets, '\n",
+      "                        'numerous moons, asteroids, comets, and other '\n",
+      "                        'celestial bodies. At its center lies the star we call '\n",
+      "                        'the Sun.',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'Text',\n",
+      "                'prov': [{'bbox': [133.18510437,\n",
+      "                                   570.83258057,\n",
+      "                                   374.99838257,\n",
+      "                                   581.07043457],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 54]}],\n",
+      "                'text': 'For more details about the Solar system see Chapter '\n",
+      "                        '1.',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'Section-header',\n",
+      "                'prov': [{'bbox': [133.22866821,\n",
+      "                                   542.98168945,\n",
+      "                                   163.86282349,\n",
+      "                                   554.45288086],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 4]}],\n",
+      "                'text': 'Mars',\n",
+      "                'type': 'subtitle-level-1'},\n",
+      "               {'name': 'Text',\n",
+      "                'prov': [{'bbox': [132.87440491,\n",
+      "                                   500.84011841,\n",
+      "                                   477.48345947,\n",
+      "                                   534.55810547],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 196]}],\n",
+      "                'text': 'Mars, the fourth planet from the Sun, is a cold, '\n",
+      "                        'desert world with a thin atmosphere composed '\n",
+      "                        'primarily of carbon dioxide. Its reddish hue comes '\n",
+      "                        'from iron oxide, or rust, prevalent on its surface.',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'Section-header',\n",
+      "                'prov': [{'bbox': [133.2026062,\n",
+      "                                   482.90710449,\n",
+      "                                   237.04431152,\n",
+      "                                   493.07443237],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 23]}],\n",
+      "                'text': 'Basic facts about Mars:',\n",
+      "                'type': 'subtitle-level-1'},\n",
+      "               {'name': 'List-item',\n",
+      "                'prov': [{'bbox': [145.94500732,\n",
+      "                                   453.019104,\n",
+      "                                   477.48171997,\n",
+      "                                   474.9703064],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 78]}],\n",
+      "                'text': '· Distance from the Sun: Average of 228 million '\n",
+      "                        'kilometers (142 million miles)',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'List-item',\n",
+      "                'prov': [{'bbox': [145.94500732,\n",
+      "                                   440.79351807,\n",
+      "                                   431.73287964,\n",
+      "                                   451.2142334],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 64]}],\n",
+      "                'text': '· Rotation Period: 24.6 hours (one Martian day - '\n",
+      "                        'called a \"sol\")',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'List-item',\n",
+      "                'prov': [{'bbox': [145.94500732,\n",
+      "                                   429.10913086,\n",
+      "                                   365.9559021,\n",
+      "                                   438.83737183],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 44]}],\n",
+      "                'text': '· Moons: Two small moons, Phobos and Deimos.',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'Page-footer',\n",
+      "                'prov': [{'bbox': [303.13299561,\n",
+      "                                   87.20314026,\n",
+      "                                   308.11428833,\n",
+      "                                   96.51646423],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 1]}],\n",
+      "                'text': '1',\n",
+      "                'type': 'page-footer'}],\n",
+      " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n",
+      " 'page-footers': [],\n",
+      " 'page-headers': [],\n",
+      " 'tables': [],\n",
+      " 'type': 'pdf-document'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pprint\n",
+    "import json\n",
+    "\n",
+    "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n",
+    "# json.loads(output_df.iloc[0, ]['contents'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "e1a10c2d",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "e1a10c2d",
+    "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 9,
-      "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26",
-        "outputId": "14a36e73-a186-4431-a755-f46ccb691130"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': <pdf2parquet_contents_types.JSON: 'application/json'>, 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n",
-            "13:30:44 INFO - pipeline id pipeline_id\n",
-            "13:30:44 INFO - code location None\n",
-            "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n",
-            "13:30:44 INFO - actor creation delay 0\n",
-            "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n",
-            "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n",
-            "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n",
-            "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n",
-            "13:30:44 INFO - Running locally\n",
-            "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
-            "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n",
-            "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n",
-            "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n",
-            "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n",
-            "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
-            "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n",
-            "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 110376.42it/s]\n",
-            "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n",
-            "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n",
-            "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n",
-            "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n",
-            "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 73713.60it/s]\n",
-            "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n",
-            "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "✅ Stage:1 completed successfully\n",
-            "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n",
-            "Wall time: 31.1 s\n"
-          ]
-        }
-      ],
-      "source": [
-        "%%time\n",
-        "\n",
-        "import ast\n",
-        "import os\n",
-        "import sys\n",
-        "\n",
-        "from pdf2parquet_transform import (\n",
-        "    pdf2parquet_contents_type_cli_param,\n",
-        "    pdf2parquet_contents_types,\n",
-        ")\n",
-        "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
-        "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n",
-        "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n",
-        "\n",
-        "from data_processing.utils import GB, ParamsUtils\n",
-        "\n",
-        "\n",
-        "# create parameters\n",
-        "local_conf = {\n",
-        "    \"input_folder\": input_folder,\n",
-        "    \"output_folder\": output_folder,\n",
-        "}\n",
-        "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": MY_CONFIG.RAY_MEMORY_GB * GB}\n",
-        "ingest_config = {\n",
-        "    pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n",
-        "}\n",
-        "\n",
-        "params = {\n",
-        "    # where to run\n",
-        "    \"run_locally\": True,\n",
-        "    # Data access. Only required parameters are specified\n",
-        "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
-        "    \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n",
-        "    # orchestrator\n",
-        "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
-        "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
-        "}\n",
-        "\n",
-        "\n",
-        "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n",
-        "# create launcher\n",
-        "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n",
-        "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n",
-        "# launch\n",
-        "return_code = launcher.launch()\n",
-        "\n",
-        "if return_code == 0:\n",
-        "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
-        "else:\n",
-        "    raise Exception (\"❌ Ray job failed\")\n"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'_name': '',\n",
+      " 'description': {'logs': []},\n",
+      " 'equations': [],\n",
+      " 'figures': [],\n",
+      " 'file-info': {'#-pages': 1,\n",
+      "               'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n",
+      "               'filename': 'earth.pdf',\n",
+      "               'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n",
+      "                                'model': 'default',\n",
+      "                                'page': 1}]},\n",
+      " 'footnotes': [],\n",
+      " 'main-text': [{'name': 'Section-header',\n",
+      "                'prov': [{'bbox': [133.30961609,\n",
+      "                                   654.45184326,\n",
+      "                                   174.04208374,\n",
+      "                                   667.93347168],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 5]}],\n",
+      "                'text': 'Earth',\n",
+      "                'type': 'subtitle-level-1'},\n",
+      "               {'name': 'Section-header',\n",
+      "                'prov': [{'bbox': [133.12528992,\n",
+      "                                   630.69073486,\n",
+      "                                   210.66503906,\n",
+      "                                   642.27935791],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 12]}],\n",
+      "                'text': 'Solar System',\n",
+      "                'type': 'subtitle-level-1'},\n",
+      "               {'name': 'Text',\n",
+      "                'prov': [{'bbox': [132.87112427,\n",
+      "                                   588.96014404,\n",
+      "                                   479.40917969,\n",
+      "                                   623.04595947],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 205]}],\n",
+      "                'text': 'Our solar system is a vast and fascinating expanse, '\n",
+      "                        'comprising eight planets, five dwarf planets, '\n",
+      "                        'numerous moons, asteroids, comets, and other '\n",
+      "                        'celestial bodies. At its center lies the star we call '\n",
+      "                        'the Sun.',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'Text',\n",
+      "                'prov': [{'bbox': [133.20942688,\n",
+      "                                   570.81555176,\n",
+      "                                   375.57919312,\n",
+      "                                   581.08459473],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 54]}],\n",
+      "                'text': 'For more details about our Solar system see Chapter '\n",
+      "                        '1.',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'Section-header',\n",
+      "                'prov': [{'bbox': [133.15542603,\n",
+      "                                   542.98168945,\n",
+      "                                   167.32983398,\n",
+      "                                   554.36669922],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 5]}],\n",
+      "                'text': 'Earth',\n",
+      "                'type': 'subtitle-level-1'},\n",
+      "               {'name': 'Text',\n",
+      "                'prov': [{'bbox': [132.91053772,\n",
+      "                                   512.46295166,\n",
+      "                                   477.84887695,\n",
+      "                                   534.48431396],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 107]}],\n",
+      "                'text': \"Earth is the third planet from the Sun. It's our home \"\n",
+      "                        'planet. Earth is the only place we know of with life.',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'Text',\n",
+      "                'prov': [{'bbox': [133.30151367,\n",
+      "                                   494.86206055,\n",
+      "                                   240.17156982,\n",
+      "                                   505.07229614],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 24]}],\n",
+      "                'text': 'Basic facts about Earth:',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'List-item',\n",
+      "                'prov': [{'bbox': [145.94500732,\n",
+      "                                   464.97409058,\n",
+      "                                   477.47979736,\n",
+      "                                   487.02810669],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 79]}],\n",
+      "                'text': '· Distance from the Sun: Average of 149.6 million '\n",
+      "                        'kilometers (93 million miles)',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'List-item',\n",
+      "                'prov': [{'bbox': [145.94500732,\n",
+      "                                   452.86901855,\n",
+      "                                   317.90722656,\n",
+      "                                   463.24041748],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 37]}],\n",
+      "                'text': '· Rotation Period: 24 hours (one day)',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'List-item',\n",
+      "                'prov': [{'bbox': [145.94500732,\n",
+      "                                   440.71496582,\n",
+      "                                   396.66357422,\n",
+      "                                   451.19915771],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 52]}],\n",
+      "                'text': '· Moons: One moon, called Luna or simply \"the Moon\".',\n",
+      "                'type': 'paragraph'},\n",
+      "               {'name': 'Page-footer',\n",
+      "                'prov': [{'bbox': [303.13299561,\n",
+      "                                   87.20314026,\n",
+      "                                   308.11428833,\n",
+      "                                   96.53633118],\n",
+      "                          'page': 1,\n",
+      "                          'span': [0, 1]}],\n",
+      "                'text': '1',\n",
+      "                'type': 'page-footer'}],\n",
+      " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n",
+      " 'page-footers': [],\n",
+      " 'page-headers': [],\n",
+      " 'tables': [],\n",
+      " 'type': 'pdf-document'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "72274586",
+   "metadata": {
+    "id": "72274586"
+   },
+   "source": [
+    "##  Step-4: Doc chunks\n",
+    "\n",
+    "In the previous step, we extracted text from our PDFs.  But the content of each entire file is stored as 'one row' in our parquet output.\n",
+    "\n",
+    "In this step, we are going to split the documents into chunks, according to their layout segmentation.\n",
+    "\n",
+    "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n",
+    "to chunk according to the document layout segmentation, i.e. respecting the original document components such as paragraphs, tables, enumerations, etc.\n",
+    "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n",
+    "which provides the required JSON structure."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "96198fa6",
+   "metadata": {
+    "id": "96198fa6"
+   },
+   "source": [
+    "### 4.1 - Set Input/output Folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "305f00a3",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "305f00a3",
+    "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "5ca790e0",
-      "metadata": {
-        "id": "5ca790e0"
-      },
-      "source": [
-        "### 3.3 - Inspect Generated output\n",
-        "\n",
-        "Here we should see one entry per input file processed."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🏃🏼 STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n"
+     ]
+    }
+   ],
+   "source": [
+    "STAGE = 2\n",
+    "\n",
+    "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n",
+    "output_folder =  output_chunk_dir\n",
+    "\n",
+    "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
+    "\n",
+    "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "369f2cd1",
+   "metadata": {
+    "id": "369f2cd1"
+   },
+   "source": [
+    "### 4.2 - Execute"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "5b7b18d5",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "5b7b18d5",
+    "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 10,
-      "id": "fe59563d",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 255
-        },
-        "id": "fe59563d",
-        "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Output dimensions (rows x columns)=  (2, 12)\n"
-          ]
-        },
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>contents</th>\n",
-              "      <th>num_pages</th>\n",
-              "      <th>num_tables</th>\n",
-              "      <th>num_doc_elements</th>\n",
-              "      <th>document_id</th>\n",
-              "      <th>ext</th>\n",
-              "      <th>hash</th>\n",
-              "      <th>size</th>\n",
-              "      <th>date_acquired</th>\n",
-              "      <th>pdf_convert_time</th>\n",
-              "      <th>source_filename</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>{\"_name\":\"\",\"type\":\"pdf-document\",\"description...</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>{\"_name\":\"\",\"type\":\"pdf-document\",\"description...</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename                                           contents  num_pages  \\\n",
-              "0   mars.pdf  {\"_name\":\"\",\"type\":\"pdf-document\",\"description...          1   \n",
-              "1  earth.pdf  {\"_name\":\"\",\"type\":\"pdf-document\",\"description...          1   \n",
-              "\n",
-              "   num_tables  num_doc_elements                           document_id  ext  \\\n",
-              "0           0                11  62e5639f-f922-4ccc-a041-3cb02f1cfd83  pdf   \n",
-              "1           0                11  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1  pdf   \n",
-              "\n",
-              "                                                hash  size  \\\n",
-              "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "1  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "\n",
-              "                date_acquired  pdf_convert_time source_filename  \n",
-              "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf  \n",
-              "1  2024-10-18T13:30:59.494027          2.015123       earth.pdf  "
-            ]
-          },
-          "execution_count": 10,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from my_utils import read_parquet_files_as_df\n",
-        "\n",
-        "output_df = read_parquet_files_as_df(output_folder)\n",
-        "\n",
-        "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n",
-        "\n",
-        "output_df.head(5)\n",
-        "\n",
-        "## To display certain columns\n",
-        "#parquet_df[['column1', 'column2', 'column3']].head(5)"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': <chunking_types.DL_JSON: 'dl_json'>, 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n",
+      "13:31:12 INFO - pipeline id pipeline_id\n",
+      "13:31:12 INFO - code location None\n",
+      "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
+      "13:31:12 INFO - actor creation delay 0\n",
+      "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n",
+      "13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n",
+      "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+      "13:31:12 INFO - Running locally\n",
+      "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
+      "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n",
+      "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n",
+      "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n",
+      "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
+      "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
+      "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n",
+      "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n",
+      "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "id": "e5058a21",
-      "metadata": {
-        "id": "e5058a21"
-      },
-      "source": [
-        "\n",
-        "### 3.4 - Understand the output\n",
-        "\n",
-        "Here are some interesting attributes to note:\n",
-        "\n",
-        "- **filename** : original filename\n",
-        "- **contents** : text\n",
-        "- **document_id**: unique id (UUID) assignd to this document\n",
-        "- **hash** : hash of document\n",
-        "- **pdf_convert_time** : time to convert this pdf in seconds\n",
-        "\n",
-        "Let's inspect the **contents** column.  See how the text is being divided up!"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Stage:2 completed successfully\n",
+      "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n",
+      "Wall time: 18.9 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
+    "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n",
+    "\n",
+    "\n",
+    "# Prepare the commandline params\n",
+    "local_conf = {\n",
+    "    \"input_folder\": input_folder,\n",
+    "    \"output_folder\": output_folder,\n",
+    "}\n",
+    "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
+    "params = {\n",
+    "    # where to run\n",
+    "    \"run_locally\": True,\n",
+    "    # Data access. Only required parameters are specified\n",
+    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+    "    # orchestrator\n",
+    "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
+    "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
+    "    # doc_chunk arguments\n",
+    "    # ...\n",
+    "}\n",
+    "\n",
+    "# Pass the commandline params\n",
+    "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
+    "\n",
+    "# create launcher\n",
+    "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n",
+    "# launch\n",
+    "return_code = launcher.launch()\n",
+    "\n",
+    "if return_code == 0:\n",
+    "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
+    "else:\n",
+    "    raise Exception (\"❌ Ray job failed\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "213afdf6",
+   "metadata": {
+    "id": "213afdf6"
+   },
+   "source": [
+    "### 4.3 - Inspect Generated output\n",
+    "\n",
+    "We should see that the documents are split into many chunks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "d8138d43",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 897
     },
+    "id": "d8138d43",
+    "outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 11,
-      "id": "f870e624",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "f870e624",
-        "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "{'_name': '',\n",
-            " 'description': {'logs': []},\n",
-            " 'equations': [],\n",
-            " 'figures': [],\n",
-            " 'file-info': {'#-pages': 1,\n",
-            "               'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n",
-            "               'filename': 'mars.pdf',\n",
-            "               'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n",
-            "                                'model': 'default',\n",
-            "                                'page': 1}]},\n",
-            " 'footnotes': [],\n",
-            " 'main-text': [{'name': 'Section-header',\n",
-            "                'prov': [{'bbox': [133.35137939,\n",
-            "                                   654.45184326,\n",
-            "                                   169.88169861,\n",
-            "                                   667.98492432],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 4]}],\n",
-            "                'text': 'Mars',\n",
-            "                'type': 'subtitle-level-1'},\n",
-            "               {'name': 'Section-header',\n",
-            "                'prov': [{'bbox': [133.09541321,\n",
-            "                                   630.68127441,\n",
-            "                                   210.66503906,\n",
-            "                                   642.34405518],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 12]}],\n",
-            "                'text': 'Solar System',\n",
-            "                'type': 'subtitle-level-1'},\n",
-            "               {'name': 'Text',\n",
-            "                'prov': [{'bbox': [132.84518433,\n",
-            "                                   588.96014404,\n",
-            "                                   479.40917969,\n",
-            "                                   623.02520752],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 205]}],\n",
-            "                'text': 'Our solar system is a vast and fascinating expanse, '\n",
-            "                        'comprising eight planets, five dwarf planets, '\n",
-            "                        'numerous moons, asteroids, comets, and other '\n",
-            "                        'celestial bodies. At its center lies the star we call '\n",
-            "                        'the Sun.',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'Text',\n",
-            "                'prov': [{'bbox': [133.18510437,\n",
-            "                                   570.83258057,\n",
-            "                                   374.99838257,\n",
-            "                                   581.07043457],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 54]}],\n",
-            "                'text': 'For more details about the Solar system see Chapter '\n",
-            "                        '1.',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'Section-header',\n",
-            "                'prov': [{'bbox': [133.22866821,\n",
-            "                                   542.98168945,\n",
-            "                                   163.86282349,\n",
-            "                                   554.45288086],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 4]}],\n",
-            "                'text': 'Mars',\n",
-            "                'type': 'subtitle-level-1'},\n",
-            "               {'name': 'Text',\n",
-            "                'prov': [{'bbox': [132.87440491,\n",
-            "                                   500.84011841,\n",
-            "                                   477.48345947,\n",
-            "                                   534.55810547],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 196]}],\n",
-            "                'text': 'Mars, the fourth planet from the Sun, is a cold, '\n",
-            "                        'desert world with a thin atmosphere composed '\n",
-            "                        'primarily of carbon dioxide. Its reddish hue comes '\n",
-            "                        'from iron oxide, or rust, prevalent on its surface.',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'Section-header',\n",
-            "                'prov': [{'bbox': [133.2026062,\n",
-            "                                   482.90710449,\n",
-            "                                   237.04431152,\n",
-            "                                   493.07443237],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 23]}],\n",
-            "                'text': 'Basic facts about Mars:',\n",
-            "                'type': 'subtitle-level-1'},\n",
-            "               {'name': 'List-item',\n",
-            "                'prov': [{'bbox': [145.94500732,\n",
-            "                                   453.019104,\n",
-            "                                   477.48171997,\n",
-            "                                   474.9703064],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 78]}],\n",
-            "                'text': '· Distance from the Sun: Average of 228 million '\n",
-            "                        'kilometers (142 million miles)',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'List-item',\n",
-            "                'prov': [{'bbox': [145.94500732,\n",
-            "                                   440.79351807,\n",
-            "                                   431.73287964,\n",
-            "                                   451.2142334],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 64]}],\n",
-            "                'text': '· Rotation Period: 24.6 hours (one Martian day - '\n",
-            "                        'called a \"sol\")',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'List-item',\n",
-            "                'prov': [{'bbox': [145.94500732,\n",
-            "                                   429.10913086,\n",
-            "                                   365.9559021,\n",
-            "                                   438.83737183],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 44]}],\n",
-            "                'text': '· Moons: Two small moons, Phobos and Deimos.',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'Page-footer',\n",
-            "                'prov': [{'bbox': [303.13299561,\n",
-            "                                   87.20314026,\n",
-            "                                   308.11428833,\n",
-            "                                   96.51646423],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 1]}],\n",
-            "                'text': '1',\n",
-            "                'type': 'page-footer'}],\n",
-            " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n",
-            " 'page-footers': [],\n",
-            " 'page-headers': [],\n",
-            " 'tables': [],\n",
-            " 'type': 'pdf-document'}\n"
-          ]
-        }
-      ],
-      "source": [
-        "import pprint\n",
-        "import json\n",
-        "\n",
-        "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n",
-        "# json.loads(output_df.iloc[0, ]['contents'])"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Files processed : 2\n",
+      "Chunks created : 8\n",
+      "Input data dimensions (rows x columns)=  (2, 12)\n",
+      "Output data dimensions (rows x columns)=  (8, 16)\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 12,
-      "id": "e1a10c2d",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "e1a10c2d",
-        "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "{'_name': '',\n",
-            " 'description': {'logs': []},\n",
-            " 'equations': [],\n",
-            " 'figures': [],\n",
-            " 'file-info': {'#-pages': 1,\n",
-            "               'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n",
-            "               'filename': 'earth.pdf',\n",
-            "               'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n",
-            "                                'model': 'default',\n",
-            "                                'page': 1}]},\n",
-            " 'footnotes': [],\n",
-            " 'main-text': [{'name': 'Section-header',\n",
-            "                'prov': [{'bbox': [133.30961609,\n",
-            "                                   654.45184326,\n",
-            "                                   174.04208374,\n",
-            "                                   667.93347168],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 5]}],\n",
-            "                'text': 'Earth',\n",
-            "                'type': 'subtitle-level-1'},\n",
-            "               {'name': 'Section-header',\n",
-            "                'prov': [{'bbox': [133.12528992,\n",
-            "                                   630.69073486,\n",
-            "                                   210.66503906,\n",
-            "                                   642.27935791],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 12]}],\n",
-            "                'text': 'Solar System',\n",
-            "                'type': 'subtitle-level-1'},\n",
-            "               {'name': 'Text',\n",
-            "                'prov': [{'bbox': [132.87112427,\n",
-            "                                   588.96014404,\n",
-            "                                   479.40917969,\n",
-            "                                   623.04595947],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 205]}],\n",
-            "                'text': 'Our solar system is a vast and fascinating expanse, '\n",
-            "                        'comprising eight planets, five dwarf planets, '\n",
-            "                        'numerous moons, asteroids, comets, and other '\n",
-            "                        'celestial bodies. At its center lies the star we call '\n",
-            "                        'the Sun.',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'Text',\n",
-            "                'prov': [{'bbox': [133.20942688,\n",
-            "                                   570.81555176,\n",
-            "                                   375.57919312,\n",
-            "                                   581.08459473],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 54]}],\n",
-            "                'text': 'For more details about our Solar system see Chapter '\n",
-            "                        '1.',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'Section-header',\n",
-            "                'prov': [{'bbox': [133.15542603,\n",
-            "                                   542.98168945,\n",
-            "                                   167.32983398,\n",
-            "                                   554.36669922],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 5]}],\n",
-            "                'text': 'Earth',\n",
-            "                'type': 'subtitle-level-1'},\n",
-            "               {'name': 'Text',\n",
-            "                'prov': [{'bbox': [132.91053772,\n",
-            "                                   512.46295166,\n",
-            "                                   477.84887695,\n",
-            "                                   534.48431396],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 107]}],\n",
-            "                'text': \"Earth is the third planet from the Sun. It's our home \"\n",
-            "                        'planet. Earth is the only place we know of with life.',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'Text',\n",
-            "                'prov': [{'bbox': [133.30151367,\n",
-            "                                   494.86206055,\n",
-            "                                   240.17156982,\n",
-            "                                   505.07229614],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 24]}],\n",
-            "                'text': 'Basic facts about Earth:',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'List-item',\n",
-            "                'prov': [{'bbox': [145.94500732,\n",
-            "                                   464.97409058,\n",
-            "                                   477.47979736,\n",
-            "                                   487.02810669],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 79]}],\n",
-            "                'text': '· Distance from the Sun: Average of 149.6 million '\n",
-            "                        'kilometers (93 million miles)',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'List-item',\n",
-            "                'prov': [{'bbox': [145.94500732,\n",
-            "                                   452.86901855,\n",
-            "                                   317.90722656,\n",
-            "                                   463.24041748],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 37]}],\n",
-            "                'text': '· Rotation Period: 24 hours (one day)',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'List-item',\n",
-            "                'prov': [{'bbox': [145.94500732,\n",
-            "                                   440.71496582,\n",
-            "                                   396.66357422,\n",
-            "                                   451.19915771],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 52]}],\n",
-            "                'text': '· Moons: One moon, called Luna or simply \"the Moon\".',\n",
-            "                'type': 'paragraph'},\n",
-            "               {'name': 'Page-footer',\n",
-            "                'prov': [{'bbox': [303.13299561,\n",
-            "                                   87.20314026,\n",
-            "                                   308.11428833,\n",
-            "                                   96.53633118],\n",
-            "                          'page': 1,\n",
-            "                          'span': [0, 1]}],\n",
-            "                'text': '1',\n",
-            "                'type': 'page-footer'}],\n",
-            " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n",
-            " 'page-footers': [],\n",
-            " 'page-headers': [],\n",
-            " 'tables': [],\n",
-            " 'type': 'pdf-document'}\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>num_pages</th>\n",
+       "      <th>num_tables</th>\n",
+       "      <th>num_doc_elements</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>hash</th>\n",
+       "      <th>size</th>\n",
+       "      <th>date_acquired</th>\n",
+       "      <th>pdf_convert_time</th>\n",
+       "      <th>source_filename</th>\n",
+       "      <th>source_document_id</th>\n",
+       "      <th>contents</th>\n",
+       "      <th>doc_jsonpath</th>\n",
+       "      <th>page_number</th>\n",
+       "      <th>bbox</th>\n",
+       "      <th>document_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "      <td>$.main-text[2]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.84518433, 588.96014404, 479.40917969, 623...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Solar System\\nFor more details about the Solar...</td>\n",
+       "      <td>$.main-text[3]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.18510437, 570.83258057, 374.99838257, 581...</td>\n",
+       "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
+       "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
+       "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "      <td>$.main-text[2]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.87112427, 588.96014404, 479.40917969, 623...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Solar System\\nFor more details about our Solar...</td>\n",
+       "      <td>$.main-text[3]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
+       "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
+       "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
+       "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "72274586",
-      "metadata": {
-        "id": "72274586"
-      },
-      "source": [
-        "##  Step-4: Doc chunks\n",
-        "\n",
-        "In the previous step, we have extracted text from oru PDFs.  But we have the content of entire file as 'one row' in our parquet output.\n",
-        "\n",
-        "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n",
-        "\n",
-        "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n",
-        "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n",
-        "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n",
-        "which provides the required JSON structure."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "96198fa6",
-      "metadata": {
-        "id": "96198fa6"
-      },
-      "source": [
-        "### 4.1 - Set Input/output Folder"
+      "text/plain": [
+       "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
+       "0   mars.pdf          1           0                11  pdf   \n",
+       "1   mars.pdf          1           0                11  pdf   \n",
+       "2   mars.pdf          1           0                11  pdf   \n",
+       "3   mars.pdf          1           0                11  pdf   \n",
+       "4  earth.pdf          1           0                11  pdf   \n",
+       "5  earth.pdf          1           0                11  pdf   \n",
+       "6  earth.pdf          1           0                11  pdf   \n",
+       "7  earth.pdf          1           0                11  pdf   \n",
+       "\n",
+       "                                                hash  size  \\\n",
+       "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "3  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "6  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "7  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "\n",
+       "                date_acquired  pdf_convert_time source_filename  \\\n",
+       "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "3  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "6  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "7  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "\n",
+       "                     source_document_id  \\\n",
+       "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "3  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "6  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "7  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "\n",
+       "                                            contents    doc_jsonpath  \\\n",
+       "0  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
+       "1  Solar System\\nFor more details about the Solar...  $.main-text[3]   \n",
+       "2  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
+       "3  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
+       "4  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
+       "5  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
+       "6  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
+       "7  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
+       "\n",
+       "   page_number                                               bbox  \\\n",
+       "0            1  [132.84518433, 588.96014404, 479.40917969, 623...   \n",
+       "1            1  [133.18510437, 570.83258057, 374.99838257, 581...   \n",
+       "2            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
+       "3            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
+       "4            1  [132.87112427, 588.96014404, 479.40917969, 623...   \n",
+       "5            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
+       "6            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
+       "7            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
+       "\n",
+       "                                         document_id  \n",
+       "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...  \n",
+       "1  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...  \n",
+       "2  a31663e06fac41470ecc459f5a58658a3f9997d7801053...  \n",
+       "3  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...  \n",
+       "4  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...  \n",
+       "5  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...  \n",
+       "6  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...  \n",
+       "7  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...  "
       ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from my_utils import read_parquet_files_as_df\n",
+    "\n",
+    "output_df = read_parquet_files_as_df(output_folder)\n",
+    "\n",
+    "print (f\"Files processed : {input_df.shape[0]:,}\")\n",
+    "print (f\"Chunks created : {output_df.shape[0]:,}\")\n",
+    "\n",
+    "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
+    "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
+    "\n",
+    "output_df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9e9ca75c",
+   "metadata": {
+    "id": "9e9ca75c"
+   },
+   "source": [
+    "### 4.4 - Understanding the Output\n",
+    "\n",
+    "Here we see 2 PDF files are split into 8 chunks.  Basically we see the documents are being split along 'natural boundaries' - paragraphs and bullet points\n",
+    "\n",
+    "See how the original **document_id** is carried throughout as **source_document_id**.  This helps us identify original documents.\n",
+    "\n",
+    "Also note **contents** is now plain text (not JSON as before)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "3090c950",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 300
     },
+    "id": "3090c950",
+    "outputId": "3f542446-2cfa-404c-c642-3732f7b74568"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 13,
-      "id": "305f00a3",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "305f00a3",
-        "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "🏃🏼 STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>contents</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Solar System\\nFor more details about the Solar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Solar System\\nFor more details about our Solar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "STAGE = 2\n",
-        "\n",
-        "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n",
-        "output_folder =  output_chunk_dir\n",
-        "\n",
-        "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
-        "\n",
-        "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "369f2cd1",
-      "metadata": {
-        "id": "369f2cd1"
-      },
-      "source": [
-        "### 4.2 - Execute"
+      "text/plain": [
+       "    filename                                           contents\n",
+       "0   mars.pdf  Solar System\\nOur solar system is a vast and f...\n",
+       "1   mars.pdf  Solar System\\nFor more details about the Solar...\n",
+       "2   mars.pdf  Mars\\nMars, the fourth planet from the Sun, is...\n",
+       "3   mars.pdf  Basic facts about Mars:\\n· Distance from the S...\n",
+       "4  earth.pdf  Solar System\\nOur solar system is a vast and f...\n",
+       "5  earth.pdf  Solar System\\nFor more details about our Solar...\n",
+       "6  earth.pdf  Earth\\nEarth is the third planet from the Sun....\n",
+       "7  earth.pdf  Earth\\nBasic facts about Earth:\\n· Distance fr..."
       ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "output_df[['filename', 'contents']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "d5f151ae",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "d5f151ae",
+    "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 14,
-      "id": "5b7b18d5",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "5b7b18d5",
-        "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': <chunking_types.DL_JSON: 'dl_json'>, 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n",
-            "13:31:12 INFO - pipeline id pipeline_id\n",
-            "13:31:12 INFO - code location None\n",
-            "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
-            "13:31:12 INFO - actor creation delay 0\n",
-            "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n",
-            "13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n",
-            "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n",
-            "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-            "13:31:12 INFO - Running locally\n",
-            "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
-            "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n",
-            "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n",
-            "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n",
-            "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
-            "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
-            "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n",
-            "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n",
-            "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "✅ Stage:2 completed successfully\n",
-            "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n",
-            "Wall time: 18.9 s\n"
-          ]
-        }
-      ],
-      "source": [
-        "%%time\n",
-        "\n",
-        "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
-        "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n",
-        "\n",
-        "\n",
-        "# Prepare the commandline params\n",
-        "local_conf = {\n",
-        "    \"input_folder\": input_folder,\n",
-        "    \"output_folder\": output_folder,\n",
-        "}\n",
-        "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
-        "params = {\n",
-        "    # where to run\n",
-        "    \"run_locally\": True,\n",
-        "    # Data access. Only required parameters are specified\n",
-        "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
-        "    # orchestrator\n",
-        "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
-        "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
-        "    # doc_chunk arguments\n",
-        "    # ...\n",
-        "}\n",
-        "\n",
-        "# Pass the commandline params\n",
-        "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
-        "\n",
-        "# create launcher\n",
-        "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n",
-        "# launch\n",
-        "return_code = launcher.launch()\n",
-        "\n",
-        "if return_code == 0:\n",
-        "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
-        "else:\n",
-        "    raise Exception (\"❌ Ray job failed\")"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "========== mars.pdf ===========\n",
+      "-------Chunk 0------\n",
+      "Solar System\n",
+      "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
+      "-------\n",
+      "-------Chunk 1------\n",
+      "Solar System\n",
+      "For more details about the Solar system see Chapter 1.\n",
+      "-------\n",
+      "-------Chunk 2------\n",
+      "Mars\n",
+      "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n",
+      "-------\n",
+      "-------Chunk 3------\n",
+      "Basic facts about Mars:\n",
+      "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n",
+      "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n",
+      "· Moons: Two small moons, Phobos and Deimos.\n",
+      "-------\n",
+      "========== earth.pdf ===========\n",
+      "-------Chunk 0------\n",
+      "Solar System\n",
+      "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
+      "-------\n",
+      "-------Chunk 1------\n",
+      "Solar System\n",
+      "For more details about our Solar system see Chapter 1.\n",
+      "-------\n",
+      "-------Chunk 2------\n",
+      "Earth\n",
+      "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n",
+      "-------\n",
+      "-------Chunk 3------\n",
+      "Earth\n",
+      "Basic facts about Earth:\n",
+      "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n",
+      "· Rotation Period: 24 hours (one day)\n",
+      "· Moons: One moon, called Luna or simply \"the Moon\".\n",
+      "-------\n"
+     ]
+    }
+   ],
+   "source": [
+    "for f in output_df['filename'].unique():\n",
+    "    print ('==========' , f, '===========')\n",
+    "    chunks = output_df[output_df['filename'] == f]['contents']\n",
+    "    for idx , chunk in enumerate(chunks):\n",
+    "        print (f'-------Chunk {idx}------\\n{chunk}\\n-------')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20217298",
+   "metadata": {
+    "id": "20217298"
+   },
+   "source": [
+    "## Step-5:  DOC ID generation\n",
+    "\n",
+    "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n",
+    "\n",
+    " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n",
+    " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n",
+    "\n",
+    "**This is a pre-requisite for fuzzy dedup** in the pipeline."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66811f5b",
+   "metadata": {
+    "id": "66811f5b"
+   },
+   "source": [
+    "### 5.1 - Set Input/output Folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "1f747c0d",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "1f747c0d",
+    "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "213afdf6",
-      "metadata": {
-        "id": "213afdf6"
-      },
-      "source": [
-        "### 4.3 - Inspect Generated output\n",
-        "\n",
-        "We would see documents are split into many chunks"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "# Input for this stage is the output of the doc chunk stage (Step-4)\n",
+    "# output of this component makes it possible for fdedup component to run on data.\n",
+    "\n",
+    "STAGE  = 3\n",
+    "\n",
+    "input_folder = output_chunk_dir\n",
+    "output_folder =  output_docid_dir\n",
+    "\n",
+    "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
+    "\n",
+    "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "18aa0fe1",
+   "metadata": {
+    "id": "18aa0fe1"
+   },
+   "source": [
+    "### 5.2 - Execute"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "f6e9e145",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "f6e9e145",
+    "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 15,
-      "id": "d8138d43",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 897
-        },
-        "id": "d8138d43",
-        "outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Files processed : 2\n",
-            "Chunks created : 8\n",
-            "Input data dimensions (rows x columns)=  (2, 12)\n",
-            "Output data dimensions (rows x columns)=  (8, 16)\n"
-          ]
-        },
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>num_pages</th>\n",
-              "      <th>num_tables</th>\n",
-              "      <th>num_doc_elements</th>\n",
-              "      <th>ext</th>\n",
-              "      <th>hash</th>\n",
-              "      <th>size</th>\n",
-              "      <th>date_acquired</th>\n",
-              "      <th>pdf_convert_time</th>\n",
-              "      <th>source_filename</th>\n",
-              "      <th>source_document_id</th>\n",
-              "      <th>contents</th>\n",
-              "      <th>doc_jsonpath</th>\n",
-              "      <th>page_number</th>\n",
-              "      <th>bbox</th>\n",
-              "      <th>document_id</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "      <td>$.main-text[2]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.84518433, 588.96014404, 479.40917969, 623...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Solar System\\nFor more details about the Solar...</td>\n",
-              "      <td>$.main-text[3]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.18510437, 570.83258057, 374.99838257, 581...</td>\n",
-              "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
-              "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
-              "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "      <td>$.main-text[2]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.87112427, 588.96014404, 479.40917969, 623...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Solar System\\nFor more details about our Solar...</td>\n",
-              "      <td>$.main-text[3]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
-              "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>6</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
-              "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>7</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
-              "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
-              "0   mars.pdf          1           0                11  pdf   \n",
-              "1   mars.pdf          1           0                11  pdf   \n",
-              "2   mars.pdf          1           0                11  pdf   \n",
-              "3   mars.pdf          1           0                11  pdf   \n",
-              "4  earth.pdf          1           0                11  pdf   \n",
-              "5  earth.pdf          1           0                11  pdf   \n",
-              "6  earth.pdf          1           0                11  pdf   \n",
-              "7  earth.pdf          1           0                11  pdf   \n",
-              "\n",
-              "                                                hash  size  \\\n",
-              "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "3  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "6  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "7  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "\n",
-              "                date_acquired  pdf_convert_time source_filename  \\\n",
-              "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "3  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "6  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "7  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "\n",
-              "                     source_document_id  \\\n",
-              "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "3  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "6  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "7  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "\n",
-              "                                            contents    doc_jsonpath  \\\n",
-              "0  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
-              "1  Solar System\\nFor more details about the Solar...  $.main-text[3]   \n",
-              "2  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
-              "3  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
-              "4  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
-              "5  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
-              "6  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
-              "7  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
-              "\n",
-              "   page_number                                               bbox  \\\n",
-              "0            1  [132.84518433, 588.96014404, 479.40917969, 623...   \n",
-              "1            1  [133.18510437, 570.83258057, 374.99838257, 581...   \n",
-              "2            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
-              "3            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
-              "4            1  [132.87112427, 588.96014404, 479.40917969, 623...   \n",
-              "5            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
-              "6            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
-              "7            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
-              "\n",
-              "                                         document_id  \n",
-              "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...  \n",
-              "1  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...  \n",
-              "2  a31663e06fac41470ecc459f5a58658a3f9997d7801053...  \n",
-              "3  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...  \n",
-              "4  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...  \n",
-              "5  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...  \n",
-              "6  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...  \n",
-              "7  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...  "
-            ]
-          },
-          "execution_count": 15,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from my_utils import read_parquet_files_as_df\n",
-        "\n",
-        "output_df = read_parquet_files_as_df(output_folder)\n",
-        "\n",
-        "print (f\"Files processed : {input_df.shape[0]:,}\")\n",
-        "print (f\"Chunks created : {output_df.shape[0]:,}\")\n",
-        "\n",
-        "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
-        "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
-        "\n",
-        "output_df.head(10)"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n",
+      "13:31:29 INFO - pipeline id pipeline_id\n",
+      "13:31:29 INFO - code location None\n",
+      "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
+      "13:31:29 INFO - actor creation delay 0\n",
+      "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n",
+      "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n",
+      "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+      "13:31:29 INFO - Running locally\n",
+      "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
+      "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n",
+      "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n",
+      "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n",
+      "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
+      "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
+      "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n",
+      "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n",
+      "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "id": "9e9ca75c",
-      "metadata": {
-        "id": "9e9ca75c"
-      },
-      "source": [
-        "### 4.4 - Understanding the Output\n",
-        "\n",
-        "Here we see 2 PDF files are split into 6 chunks.  Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n",
-        "\n",
-        "See how **document_id** is carried throughout.  This helps us identify original documents.\n",
-        "\n",
-        "Also note **contents** is now plain text (not JSON as before)"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Stage:3 completed successfully\n",
+      "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n",
+      "Wall time: 15.2 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
+    "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n",
+    "\n",
+    "local_conf = {\n",
+    "    \"input_folder\": input_folder,\n",
+    "    \"output_folder\": output_folder,\n",
+    "}\n",
+    "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
+    "params = {\n",
+    "    # where to run\n",
+    "    \"run_locally\": True,\n",
+    "    # Data access. Only required parameters are specified\n",
+    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+    "    # orchestrator\n",
+    "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
+    "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
+    "    # doc id configuration\n",
+    "    \"doc_id_doc_column\": \"contents\",\n",
+    "    \"doc_id_hash_column\": \"chunk_hash\",\n",
+    "    \"doc_id_int_column\": \"chunk_id\",\n",
+    "}\n",
+    "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
+    "\n",
+    "# launch\n",
+    "\n",
+    "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n",
+    "\n",
+    "return_code = launcher.launch()\n",
+    "\n",
+    "if return_code == 0:\n",
+    "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
+    "else:\n",
+    "    raise Exception (\"❌ Ray job failed\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4954402f",
+   "metadata": {
+    "id": "4954402f"
+   },
+   "source": [
+    "### 5.3 - Inspect Generated output\n",
+    "\n",
+    "You will notice we have two extra columns\n",
+    "\n",
+    "- **chunk_hash** (the hash column)\n",
+    "- **chunk_id** (the integer id column)\n",
+    "\n",
+    "But still the same number of rows as before"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "1911179a",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 860
     },
+    "id": "1911179a",
+    "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 16,
-      "id": "3090c950",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 300
-        },
-        "id": "3090c950",
-        "outputId": "3f542446-2cfa-404c-c642-3732f7b74568"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>contents</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Solar System\\nFor more details about the Solar...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Solar System\\nFor more details about our Solar...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>6</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>7</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename                                           contents\n",
-              "0   mars.pdf  Solar System\\nOur solar system is a vast and f...\n",
-              "1   mars.pdf  Solar System\\nFor more details about the Solar...\n",
-              "2   mars.pdf  Mars\\nMars, the fourth planet from the Sun, is...\n",
-              "3   mars.pdf  Basic facts about Mars:\\n· Distance from the S...\n",
-              "4  earth.pdf  Solar System\\nOur solar system is a vast and f...\n",
-              "5  earth.pdf  Solar System\\nFor more details about our Solar...\n",
-              "6  earth.pdf  Earth\\nEarth is the third planet from the Sun....\n",
-              "7  earth.pdf  Earth\\nBasic facts about Earth:\\n· Distance fr..."
-            ]
-          },
-          "execution_count": 16,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "output_df[['filename', 'contents']]"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input data dimensions (rows x columns)=  (8, 16)\n",
+      "Output data dimensions (rows x columns)=  (8, 18)\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 17,
-      "id": "d5f151ae",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "d5f151ae",
-        "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "========== mars.pdf ===========\n",
-            "-------Chunk 0------\n",
-            "Solar System\n",
-            "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
-            "-------\n",
-            "-------Chunk 1------\n",
-            "Solar System\n",
-            "For more details about the Solar system see Chapter 1.\n",
-            "-------\n",
-            "-------Chunk 2------\n",
-            "Mars\n",
-            "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n",
-            "-------\n",
-            "-------Chunk 3------\n",
-            "Basic facts about Mars:\n",
-            "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n",
-            "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n",
-            "· Moons: Two small moons, Phobos and Deimos.\n",
-            "-------\n",
-            "========== earth.pdf ===========\n",
-            "-------Chunk 0------\n",
-            "Solar System\n",
-            "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
-            "-------\n",
-            "-------Chunk 1------\n",
-            "Solar System\n",
-            "For more details about our Solar system see Chapter 1.\n",
-            "-------\n",
-            "-------Chunk 2------\n",
-            "Earth\n",
-            "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n",
-            "-------\n",
-            "-------Chunk 3------\n",
-            "Earth\n",
-            "Basic facts about Earth:\n",
-            "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n",
-            "· Rotation Period: 24 hours (one day)\n",
-            "· Moons: One moon, called Luna or simply \"the Moon\".\n",
-            "-------\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>num_pages</th>\n",
+       "      <th>num_tables</th>\n",
+       "      <th>num_doc_elements</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>hash</th>\n",
+       "      <th>size</th>\n",
+       "      <th>date_acquired</th>\n",
+       "      <th>pdf_convert_time</th>\n",
+       "      <th>source_filename</th>\n",
+       "      <th>source_document_id</th>\n",
+       "      <th>contents</th>\n",
+       "      <th>doc_jsonpath</th>\n",
+       "      <th>page_number</th>\n",
+       "      <th>bbox</th>\n",
+       "      <th>document_id</th>\n",
+       "      <th>chunk_hash</th>\n",
+       "      <th>chunk_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "      <td>$.main-text[2]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.84518433, 588.96014404, 479.40917969, 623...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Solar System\\nFor more details about the Solar...</td>\n",
+       "      <td>$.main-text[3]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.18510437, 570.83258057, 374.99838257, 581...</td>\n",
+       "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
+       "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
+       "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
+       "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
+       "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
+       "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "      <td>$.main-text[2]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.87112427, 588.96014404, 479.40917969, 623...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Solar System\\nFor more details about our Solar...</td>\n",
+       "      <td>$.main-text[3]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
+       "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
+       "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
+       "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
+       "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
+       "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
+       "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "for f in output_df['filename'].unique():\n",
-        "    print ('==========' , f, '===========')\n",
-        "    chunks = output_df[output_df['filename'] == f]['contents']\n",
-        "    for idx , chunk in enumerate(chunks):\n",
-        "        print (f'-------Chunk {idx}------\\n{chunk}\\n-------')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "20217298",
-      "metadata": {
-        "id": "20217298"
-      },
-      "source": [
-        "## Step-5:  DOC ID generation\n",
-        "\n",
-        "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n",
-        "\n",
-        " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n",
-        " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n",
-        "\n",
-        "**This is a pre-requisite for fuzzy dedup** in the pipeline."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "66811f5b",
-      "metadata": {
-        "id": "66811f5b"
-      },
-      "source": [
-        "### 5.1 - Set Input/output Folder"
+      "text/plain": [
+       "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
+       "0   mars.pdf          1           0                11  pdf   \n",
+       "1   mars.pdf          1           0                11  pdf   \n",
+       "2   mars.pdf          1           0                11  pdf   \n",
+       "3   mars.pdf          1           0                11  pdf   \n",
+       "4  earth.pdf          1           0                11  pdf   \n",
+       "5  earth.pdf          1           0                11  pdf   \n",
+       "6  earth.pdf          1           0                11  pdf   \n",
+       "7  earth.pdf          1           0                11  pdf   \n",
+       "\n",
+       "                                                hash  size  \\\n",
+       "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "3  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "6  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "7  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "\n",
+       "                date_acquired  pdf_convert_time source_filename  \\\n",
+       "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "3  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "6  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "7  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "\n",
+       "                     source_document_id  \\\n",
+       "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "3  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "6  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "7  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "\n",
+       "                                            contents    doc_jsonpath  \\\n",
+       "0  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
+       "1  Solar System\\nFor more details about the Solar...  $.main-text[3]   \n",
+       "2  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
+       "3  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
+       "4  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
+       "5  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
+       "6  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
+       "7  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
+       "\n",
+       "   page_number                                               bbox  \\\n",
+       "0            1  [132.84518433, 588.96014404, 479.40917969, 623...   \n",
+       "1            1  [133.18510437, 570.83258057, 374.99838257, 581...   \n",
+       "2            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
+       "3            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
+       "4            1  [132.87112427, 588.96014404, 479.40917969, 623...   \n",
+       "5            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
+       "6            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
+       "7            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
+       "\n",
+       "                                         document_id  \\\n",
+       "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...   \n",
+       "1  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...   \n",
+       "2  a31663e06fac41470ecc459f5a58658a3f9997d7801053...   \n",
+       "3  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...   \n",
+       "4  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...   \n",
+       "5  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...   \n",
+       "6  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...   \n",
+       "7  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...   \n",
+       "\n",
+       "                                          chunk_hash  chunk_id  \n",
+       "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         4  \n",
+       "1  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...         5  \n",
+       "2  a31663e06fac41470ecc459f5a58658a3f9997d7801053...         6  \n",
+       "3  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...         7  \n",
+       "4  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         0  \n",
+       "5  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...         1  \n",
+       "6  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...         2  \n",
+       "7  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...         3  "
       ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from my_utils import read_parquet_files_as_df\n",
+    "\n",
+    "output_df = read_parquet_files_as_df(output_folder)\n",
+    "\n",
+    "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
+    "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
+    "\n",
+    "output_df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "852829dc",
+   "metadata": {
+    "id": "852829dc"
+   },
+   "source": [
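+    "## Step-6: Exact Dedup\n",
+    "\nExact dedup removes chunks whose `contents` text is identical to that of another chunk, keeping a single copy of each."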
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe",
+   "metadata": {
+    "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe"
+   },
+   "source": [
+    "### 6.1 - Set Input/output Folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "4c7a1b94",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "4c7a1b94",
+    "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 18,
-      "id": "1f747c0d",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "1f747c0d",
-        "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n"
-          ]
-        }
-      ],
-      "source": [
-        "\n",
-        "# Input for this stage is the output of exact dedeup component\n",
-        "# output of this component makes it possible for fdedup component to run on data.\n",
-        "\n",
-        "STAGE  = 3\n",
-        "\n",
-        "input_folder = output_chunk_dir\n",
-        "output_folder =  output_docid_dir\n",
-        "\n",
-        "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
-        "\n",
-        "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🏃🏼 STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n"
+     ]
+    }
+   ],
+   "source": [
+    "STAGE  = 4\n",
+    "\n",
+    "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n",
+    "output_folder =  output_exact_dedupe_dir\n",
+    "\n",
+    "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
+    "\n",
+    "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e",
+   "metadata": {
+    "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e"
+   },
+   "source": [
+    "### 6.2 - Execute"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "a624b2b2-faad-4325-ac7d-53a840f564ef",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "a624b2b2-faad-4325-ac7d-53a840f564ef",
+    "outputId": "bd0f3f94-8c48-4c6b-b911-858e389243f4"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "18aa0fe1",
-      "metadata": {
-        "id": "18aa0fe1"
-      },
-      "source": [
-        "### 5.2 - Execute"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "13:31:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n",
+      "13:31:45 INFO - pipeline id pipeline_id\n",
+      "13:31:45 INFO - code location None\n",
+      "13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
+      "13:31:45 INFO - actor creation delay 0\n",
+      "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n",
+      "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n",
+      "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+      "13:31:45 INFO - Running locally\n",
+      "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
+      "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n",
+      "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n",
+      "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n",
+      "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
+      "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
+      "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n",
+      "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n",
+      "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 19,
-      "id": "f6e9e145",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "f6e9e145",
-        "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n",
-            "13:31:29 INFO - pipeline id pipeline_id\n",
-            "13:31:29 INFO - code location None\n",
-            "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
-            "13:31:29 INFO - actor creation delay 0\n",
-            "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n",
-            "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n",
-            "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n",
-            "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-            "13:31:29 INFO - Running locally\n",
-            "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
-            "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n",
-            "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n",
-            "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n",
-            "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
-            "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
-            "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n",
-            "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n",
-            "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "✅ Stage:3 completed successfully\n",
-            "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n",
-            "Wall time: 15.2 s\n"
-          ]
-        }
-      ],
-      "source": [
-        "%%time\n",
-        "\n",
-        "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
-        "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n",
-        "\n",
-        "local_conf = {\n",
-        "    \"input_folder\": input_folder,\n",
-        "    \"output_folder\": output_folder,\n",
-        "}\n",
-        "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
-        "params = {\n",
-        "    # where to run\n",
-        "    \"run_locally\": True,\n",
-        "    # Data access. Only required parameters are specified\n",
-        "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
-        "    # orchestrator\n",
-        "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
-        "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
-        "    # doc id configuration\n",
-        "    \"doc_id_doc_column\": \"contents\",\n",
-        "    \"doc_id_hash_column\": \"chunk_hash\",\n",
-        "    \"doc_id_int_column\": \"chunk_id\",\n",
-        "}\n",
-        "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
-        "\n",
-        "# launch\n",
-        "\n",
-        "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n",
-        "\n",
-        "return_code = launcher.launch()\n",
-        "\n",
-        "if return_code == 0:\n",
-        "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
-        "else:\n",
-        "    raise Exception (\"❌ Ray job failed\")"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Stage:4 completed successfully\n",
+      "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n",
+      "Wall time: 15.2 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
+    "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n",
+    "\n",
+    "\n",
+    "# Prepare the commandline params\n",
+    "local_conf = {\n",
+    "    \"input_folder\": input_folder,\n",
+    "    \"output_folder\": output_folder,\n",
+    "}\n",
+    "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
+    "params = {\n",
+    "    # where to run\n",
+    "    \"run_locally\": True,\n",
+    "    # Data access. Only required parameters are specified\n",
+    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+    "    # orchestrator\n",
+    "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
+    "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
+    "    # ededup parameters\n",
+    "    \"ededup_hash_cpu\": 0.5,\n",
+    "    \"ededup_num_hashes\": 2,\n",
+    "    \"ededup_doc_column\": \"contents\",\n",
+    "    \"ededup_doc_id_column\": \"chunk_hash\",\n",
+    "}\n",
+    "\n",
+    "# Pass the commandline params\n",
+    "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
+    "\n",
+    "# create launcher\n",
+    "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n",
+    "# launch\n",
+    "return_code = launcher.launch()\n",
+    "\n",
+    "if return_code == 0:\n",
+    "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
+    "else:\n",
+    "    raise Exception (\"❌ Ray job failed\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eaf1c3c3",
+   "metadata": {
+    "id": "eaf1c3c3"
+   },
+   "source": [
+    "### 6.3 - Inspect Generated output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "d824ebf6",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 815
     },
+    "id": "d824ebf6",
+    "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "4954402f",
-      "metadata": {
-        "id": "4954402f"
-      },
-      "source": [
-        "### 5.3 - Inspect Generated output\n",
-        "\n",
-        "You will notice we have two extra columns\n",
-        "\n",
-        "- **hash_column**\n",
-        "- **int_id_column**\n",
-        "\n",
-        "But still the same number or rows as before"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input data dimensions (rows x columns)=  (8, 18)\n",
+      "Output data dimensions (rows x columns)=  (7, 19)\n",
+      "Input chunks before exact dedupe : 8\n",
+      "Output chunks after exact dedupe : 7\n",
+      "Duplicate chunks removed :   1\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 20,
-      "id": "1911179a",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 860
-        },
-        "id": "1911179a",
-        "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Input data dimensions (rows x columns)=  (8, 16)\n",
-            "Output data dimensions (rows x columns)=  (8, 18)\n"
-          ]
-        },
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>num_pages</th>\n",
-              "      <th>num_tables</th>\n",
-              "      <th>num_doc_elements</th>\n",
-              "      <th>ext</th>\n",
-              "      <th>hash</th>\n",
-              "      <th>size</th>\n",
-              "      <th>date_acquired</th>\n",
-              "      <th>pdf_convert_time</th>\n",
-              "      <th>source_filename</th>\n",
-              "      <th>source_document_id</th>\n",
-              "      <th>contents</th>\n",
-              "      <th>doc_jsonpath</th>\n",
-              "      <th>page_number</th>\n",
-              "      <th>bbox</th>\n",
-              "      <th>document_id</th>\n",
-              "      <th>chunk_hash</th>\n",
-              "      <th>chunk_id</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "      <td>$.main-text[2]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.84518433, 588.96014404, 479.40917969, 623...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "      <td>4</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Solar System\\nFor more details about the Solar...</td>\n",
-              "      <td>$.main-text[3]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.18510437, 570.83258057, 374.99838257, 581...</td>\n",
-              "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
-              "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
-              "      <td>5</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
-              "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
-              "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
-              "      <td>6</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
-              "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
-              "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
-              "      <td>7</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "      <td>$.main-text[2]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.87112427, 588.96014404, 479.40917969, 623...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "      <td>0</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Solar System\\nFor more details about our Solar...</td>\n",
-              "      <td>$.main-text[3]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
-              "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
-              "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
-              "      <td>1</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>6</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
-              "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
-              "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
-              "      <td>2</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>7</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
-              "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
-              "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
-              "      <td>3</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
-              "0   mars.pdf          1           0                11  pdf   \n",
-              "1   mars.pdf          1           0                11  pdf   \n",
-              "2   mars.pdf          1           0                11  pdf   \n",
-              "3   mars.pdf          1           0                11  pdf   \n",
-              "4  earth.pdf          1           0                11  pdf   \n",
-              "5  earth.pdf          1           0                11  pdf   \n",
-              "6  earth.pdf          1           0                11  pdf   \n",
-              "7  earth.pdf          1           0                11  pdf   \n",
-              "\n",
-              "                                                hash  size  \\\n",
-              "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "3  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "6  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "7  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "\n",
-              "                date_acquired  pdf_convert_time source_filename  \\\n",
-              "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "3  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "6  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "7  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "\n",
-              "                     source_document_id  \\\n",
-              "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "3  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "6  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "7  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "\n",
-              "                                            contents    doc_jsonpath  \\\n",
-              "0  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
-              "1  Solar System\\nFor more details about the Solar...  $.main-text[3]   \n",
-              "2  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
-              "3  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
-              "4  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
-              "5  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
-              "6  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
-              "7  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
-              "\n",
-              "   page_number                                               bbox  \\\n",
-              "0            1  [132.84518433, 588.96014404, 479.40917969, 623...   \n",
-              "1            1  [133.18510437, 570.83258057, 374.99838257, 581...   \n",
-              "2            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
-              "3            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
-              "4            1  [132.87112427, 588.96014404, 479.40917969, 623...   \n",
-              "5            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
-              "6            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
-              "7            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
-              "\n",
-              "                                         document_id  \\\n",
-              "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...   \n",
-              "1  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...   \n",
-              "2  a31663e06fac41470ecc459f5a58658a3f9997d7801053...   \n",
-              "3  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...   \n",
-              "4  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...   \n",
-              "5  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...   \n",
-              "6  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...   \n",
-              "7  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...   \n",
-              "\n",
-              "                                          chunk_hash  chunk_id  \n",
-              "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         4  \n",
-              "1  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...         5  \n",
-              "2  a31663e06fac41470ecc459f5a58658a3f9997d7801053...         6  \n",
-              "3  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...         7  \n",
-              "4  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         0  \n",
-              "5  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...         1  \n",
-              "6  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...         2  \n",
-              "7  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...         3  "
-            ]
-          },
-          "execution_count": 20,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>num_pages</th>\n",
+       "      <th>num_tables</th>\n",
+       "      <th>num_doc_elements</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>hash</th>\n",
+       "      <th>size</th>\n",
+       "      <th>date_acquired</th>\n",
+       "      <th>pdf_convert_time</th>\n",
+       "      <th>source_filename</th>\n",
+       "      <th>source_document_id</th>\n",
+       "      <th>contents</th>\n",
+       "      <th>doc_jsonpath</th>\n",
+       "      <th>page_number</th>\n",
+       "      <th>bbox</th>\n",
+       "      <th>document_id</th>\n",
+       "      <th>chunk_hash</th>\n",
+       "      <th>chunk_id</th>\n",
+       "      <th>removed</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Solar System\\nFor more details about the Solar...</td>\n",
+       "      <td>$.main-text[3]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.18510437, 570.83258057, 374.99838257, 581...</td>\n",
+       "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
+       "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
+       "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
+       "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
+       "      <td>6</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
+       "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
+       "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "      <td>$.main-text[2]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.87112427, 588.96014404, 479.40917969, 623...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Solar System\\nFor more details about our Solar...</td>\n",
+       "      <td>$.main-text[3]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
+       "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
+       "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
+       "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
+       "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
+       "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
+       "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>[]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "from my_utils import read_parquet_files_as_df\n",
-        "\n",
-        "output_df = read_parquet_files_as_df(output_folder)\n",
-        "\n",
-        "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
-        "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
-        "\n",
-        "output_df.head(10)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "852829dc",
-      "metadata": {
-        "id": "852829dc"
-      },
-      "source": [
-        "## Step-6: Exact Dedup\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe",
-      "metadata": {
-        "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe"
-      },
-      "source": [
-        "### 6.1 - Set Input/output Folder"
+      "text/plain": [
+       "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
+       "0   mars.pdf          1           0                11  pdf   \n",
+       "1   mars.pdf          1           0                11  pdf   \n",
+       "2   mars.pdf          1           0                11  pdf   \n",
+       "3  earth.pdf          1           0                11  pdf   \n",
+       "4  earth.pdf          1           0                11  pdf   \n",
+       "5  earth.pdf          1           0                11  pdf   \n",
+       "6  earth.pdf          1           0                11  pdf   \n",
+       "\n",
+       "                                                hash  size  \\\n",
+       "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "3  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "6  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "\n",
+       "                date_acquired  pdf_convert_time source_filename  \\\n",
+       "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "3  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "6  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "\n",
+       "                     source_document_id  \\\n",
+       "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "3  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "6  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "\n",
+       "                                            contents    doc_jsonpath  \\\n",
+       "0  Solar System\\nFor more details about the Solar...  $.main-text[3]   \n",
+       "1  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
+       "2  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
+       "3  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
+       "4  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
+       "5  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
+       "6  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
+       "\n",
+       "   page_number                                               bbox  \\\n",
+       "0            1  [133.18510437, 570.83258057, 374.99838257, 581...   \n",
+       "1            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
+       "2            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
+       "3            1  [132.87112427, 588.96014404, 479.40917969, 623...   \n",
+       "4            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
+       "5            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
+       "6            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
+       "\n",
+       "                                         document_id  \\\n",
+       "0  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...   \n",
+       "1  a31663e06fac41470ecc459f5a58658a3f9997d7801053...   \n",
+       "2  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...   \n",
+       "3  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...   \n",
+       "4  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...   \n",
+       "5  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...   \n",
+       "6  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...   \n",
+       "\n",
+       "                                          chunk_hash  chunk_id  \\\n",
+       "0  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...         5   \n",
+       "1  a31663e06fac41470ecc459f5a58658a3f9997d7801053...         6   \n",
+       "2  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...         7   \n",
+       "3  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         0   \n",
+       "4  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...         1   \n",
+       "5  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...         2   \n",
+       "6  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...         3   \n",
+       "\n",
+       "                                             removed  \n",
+       "0  [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...  \n",
+       "1                                                 []  \n",
+       "2                                                 []  \n",
+       "3                                                 []  \n",
+       "4                                                 []  \n",
+       "5                                                 []  \n",
+       "6                                                 []  "
       ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from my_utils import read_parquet_files_as_df\n",
+    "\n",
+    "output_df = read_parquet_files_as_df(output_folder)\n",
+    "\n",
+    "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
+    "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
+    "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n",
+    "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n",
+    "print (\"Duplicate chunks removed :  \", (input_df.shape[0] - output_df.shape[0]))\n",
+    "\n",
+    "output_df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "82cc9bb0",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 269
     },
+    "id": "82cc9bb0",
+    "outputId": "e043fa01-ceca-49ae-b764-8154219c7b6c"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 21,
-      "id": "4c7a1b94",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "4c7a1b94",
-        "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "🏃🏼 STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>contents</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Solar System\\nFor more details about the Solar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Solar System\\nFor more details about our Solar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "STAGE  = 4\n",
-        "\n",
-        "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n",
-        "output_folder =  output_exact_dedupe_dir\n",
-        "\n",
-        "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
-        "\n",
-        "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+      "text/plain": [
+       "    filename                                           contents\n",
+       "0   mars.pdf  Solar System\\nFor more details about the Solar...\n",
+       "1   mars.pdf  Mars\\nMars, the fourth planet from the Sun, is...\n",
+       "2   mars.pdf  Basic facts about Mars:\\n· Distance from the S...\n",
+       "3  earth.pdf  Solar System\\nOur solar system is a vast and f...\n",
+       "4  earth.pdf  Solar System\\nFor more details about our Solar...\n",
+       "5  earth.pdf  Earth\\nEarth is the third planet from the Sun....\n",
+       "6  earth.pdf  Earth\\nBasic facts about Earth:\\n· Distance fr..."
       ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "output_df[['filename', 'contents']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "cc61dffa",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "cc61dffa",
+    "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e",
-      "metadata": {
-        "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e"
-      },
-      "source": [
-        "### 6.2 - Execute"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "========== mars.pdf ===========\n",
+      "-------Chunk 0------\n",
+      "Solar System\n",
+      "For more details about the Solar system see Chapter 1.\n",
+      "-------\n",
+      "-------Chunk 1------\n",
+      "Mars\n",
+      "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n",
+      "-------\n",
+      "-------Chunk 2------\n",
+      "Basic facts about Mars:\n",
+      "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n",
+      "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n",
+      "· Moons: Two small moons, Phobos and Deimos.\n",
+      "-------\n",
+      "========== earth.pdf ===========\n",
+      "-------Chunk 0------\n",
+      "Solar System\n",
+      "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
+      "-------\n",
+      "-------Chunk 1------\n",
+      "Solar System\n",
+      "For more details about our Solar system see Chapter 1.\n",
+      "-------\n",
+      "-------Chunk 2------\n",
+      "Earth\n",
+      "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n",
+      "-------\n",
+      "-------Chunk 3------\n",
+      "Earth\n",
+      "Basic facts about Earth:\n",
+      "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n",
+      "· Rotation Period: 24 hours (one day)\n",
+      "· Moons: One moon, called Luna or simply \"the Moon\".\n",
+      "-------\n"
+     ]
+    }
+   ],
+   "source": [
+    "for f in output_df['filename'].unique():\n",
+    "    print ('==========' , f, '===========')\n",
+    "    chunks = output_df[output_df['filename'] == f]['contents']\n",
+    "    for idx , chunk in enumerate(chunks):\n",
+    "        print (f'-------Chunk {idx}------\\n{chunk}\\n-------')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "383f40ba",
+   "metadata": {
+    "id": "383f40ba"
+   },
+   "source": [
+    "### 6.4 - Understanding the output\n",
+    "\n",
+    "Remember we had 8 chunks initially.  Now we have 7!  One duplicate chunk is removed.\n",
+    "\n",
+    "If you look at the PDFs, the following paragraph, common to both `earth.pdf` and `mars.pdf`, is removed from one of the documents!  Pretty neat, eh!\n",
+    "\n",
+    "```text\n",
+    "## Solar System\n",
+    "\n",
+    "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "85309751-8556-41c6-ac32-84acc941bc8d",
+   "metadata": {
+    "id": "85309751-8556-41c6-ac32-84acc941bc8d"
+   },
+   "source": [
+    "## Step-7: Fuzzy Dedup\n",
+    "\n",
+    "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n",
+    "the data further.\n",
+    "\n",
+    "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addition of logging statements, etc."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fcf574a3-b287-419c-9c86-07b828b41ca6",
+   "metadata": {
+    "id": "fcf574a3-b287-419c-9c86-07b828b41ca6"
+   },
+   "source": [
+    "### 7.1 - Set Input/output Folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399",
+    "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 22,
-      "id": "a624b2b2-faad-4325-ac7d-53a840f564ef",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "a624b2b2-faad-4325-ac7d-53a840f564ef",
-        "outputId": "bd0f3f94-8c48-4c6b-b911-858e389243f4"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "13:31:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n",
-            "13:31:45 INFO - pipeline id pipeline_id\n",
-            "13:31:45 INFO - code location None\n",
-            "13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
-            "13:31:45 INFO - actor creation delay 0\n",
-            "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n",
-            "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n",
-            "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n",
-            "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-            "13:31:45 INFO - Running locally\n",
-            "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
-            "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n",
-            "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n",
-            "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n",
-            "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
-            "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
-            "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n",
-            "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n",
-            "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "✅ Stage:4 completed successfully\n",
-            "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n",
-            "Wall time: 15.2 s\n"
-          ]
-        }
-      ],
-      "source": [
-        "%%time\n",
-        "\n",
-        "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
-        "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n",
-        "\n",
-        "\n",
-        "# Prepare the commandline params\n",
-        "local_conf = {\n",
-        "    \"input_folder\": input_folder,\n",
-        "    \"output_folder\": output_folder,\n",
-        "}\n",
-        "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
-        "params = {\n",
-        "    # where to run\n",
-        "    \"run_locally\": True,\n",
-        "    # Data access. Only required parameters are specified\n",
-        "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
-        "    # orchestrator\n",
-        "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
-        "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
-        "    # ededup parameters\n",
-        "    \"ededup_hash_cpu\": 0.5,\n",
-        "    \"ededup_num_hashes\": 2,\n",
-        "    \"ededup_doc_column\": \"contents\",\n",
-        "    \"ededup_doc_id_column\": \"chunk_hash\",\n",
-        "}\n",
-        "\n",
-        "# Pass the commandline params\n",
-        "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
-        "\n",
-        "# create launcher\n",
-        "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n",
-        "# launch\n",
-        "return_code = launcher.launch()\n",
-        "\n",
-        "if return_code == 0:\n",
-        "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
-        "else:\n",
-        "    raise Exception (\"❌ Ray job failed\")"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🏃🏼 STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n"
+     ]
+    }
+   ],
+   "source": [
+    "## Input to this component is the output of doc_id generator component.\n",
+    "\n",
+    "STAGE  = 5\n",
+    "\n",
+    "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n",
+    "output_folder =  output_fuzzy_dedupe_dir\n",
+    "\n",
+    "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
+    "\n",
+    "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3",
+   "metadata": {
+    "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3"
+   },
+   "source": [
+    "### 7.2 - Execute"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f",
+    "outputId": "1e63d364-3944-465a-ff7c-6e1dc750b2de"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "eaf1c3c3",
-      "metadata": {
-        "id": "eaf1c3c3"
-      },
-      "source": [
-        "### 6.3 - Inspect Generated output"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n",
+      "13:32:00 INFO - pipeline id pipeline_id\n",
+      "13:32:00 INFO - code location None\n",
+      "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
+      "13:32:00 INFO - actor creation delay 0\n",
+      "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n",
+      "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n",
+      "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+      "13:32:00 INFO - Running locally\n",
+      "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%)  in 0.064 min. Waiting for completion\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n",
+      "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n",
+      "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n",
+      "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n",
+      "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n",
+      "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 23,
-      "id": "d824ebf6",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 815
-        },
-        "id": "d824ebf6",
-        "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Input data dimensions (rows x columns)=  (8, 18)\n",
-            "Output data dimensions (rows x columns)=  (7, 19)\n",
-            "Input chunks before exact dedupe : 8\n",
-            "Output chunks after exact dedupe : 7\n",
-            "Duplicate chunks removed :   1\n"
-          ]
-        },
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>num_pages</th>\n",
-              "      <th>num_tables</th>\n",
-              "      <th>num_doc_elements</th>\n",
-              "      <th>ext</th>\n",
-              "      <th>hash</th>\n",
-              "      <th>size</th>\n",
-              "      <th>date_acquired</th>\n",
-              "      <th>pdf_convert_time</th>\n",
-              "      <th>source_filename</th>\n",
-              "      <th>source_document_id</th>\n",
-              "      <th>contents</th>\n",
-              "      <th>doc_jsonpath</th>\n",
-              "      <th>page_number</th>\n",
-              "      <th>bbox</th>\n",
-              "      <th>document_id</th>\n",
-              "      <th>chunk_hash</th>\n",
-              "      <th>chunk_id</th>\n",
-              "      <th>removed</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Solar System\\nFor more details about the Solar...</td>\n",
-              "      <td>$.main-text[3]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.18510437, 570.83258057, 374.99838257, 581...</td>\n",
-              "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
-              "      <td>dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...</td>\n",
-              "      <td>5</td>\n",
-              "      <td>[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
-              "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
-              "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
-              "      <td>6</td>\n",
-              "      <td>[]</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
-              "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
-              "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
-              "      <td>7</td>\n",
-              "      <td>[]</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "      <td>$.main-text[2]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.87112427, 588.96014404, 479.40917969, 623...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "      <td>0</td>\n",
-              "      <td>[]</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Solar System\\nFor more details about our Solar...</td>\n",
-              "      <td>$.main-text[3]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
-              "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
-              "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[]</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
-              "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
-              "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
-              "      <td>2</td>\n",
-              "      <td>[]</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>6</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
-              "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
-              "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
-              "      <td>3</td>\n",
-              "      <td>[]</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
-              "0   mars.pdf          1           0                11  pdf   \n",
-              "1   mars.pdf          1           0                11  pdf   \n",
-              "2   mars.pdf          1           0                11  pdf   \n",
-              "3  earth.pdf          1           0                11  pdf   \n",
-              "4  earth.pdf          1           0                11  pdf   \n",
-              "5  earth.pdf          1           0                11  pdf   \n",
-              "6  earth.pdf          1           0                11  pdf   \n",
-              "\n",
-              "                                                hash  size  \\\n",
-              "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "3  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "6  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "\n",
-              "                date_acquired  pdf_convert_time source_filename  \\\n",
-              "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "3  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "6  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "\n",
-              "                     source_document_id  \\\n",
-              "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "3  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "6  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "\n",
-              "                                            contents    doc_jsonpath  \\\n",
-              "0  Solar System\\nFor more details about the Solar...  $.main-text[3]   \n",
-              "1  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
-              "2  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
-              "3  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
-              "4  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
-              "5  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
-              "6  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
-              "\n",
-              "   page_number                                               bbox  \\\n",
-              "0            1  [133.18510437, 570.83258057, 374.99838257, 581...   \n",
-              "1            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
-              "2            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
-              "3            1  [132.87112427, 588.96014404, 479.40917969, 623...   \n",
-              "4            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
-              "5            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
-              "6            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
-              "\n",
-              "                                         document_id  \\\n",
-              "0  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...   \n",
-              "1  a31663e06fac41470ecc459f5a58658a3f9997d7801053...   \n",
-              "2  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...   \n",
-              "3  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...   \n",
-              "4  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...   \n",
-              "5  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...   \n",
-              "6  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...   \n",
-              "\n",
-              "                                          chunk_hash  chunk_id  \\\n",
-              "0  dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...         5   \n",
-              "1  a31663e06fac41470ecc459f5a58658a3f9997d7801053...         6   \n",
-              "2  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...         7   \n",
-              "3  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         0   \n",
-              "4  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...         1   \n",
-              "5  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...         2   \n",
-              "6  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...         3   \n",
-              "\n",
-              "                                             removed  \n",
-              "0  [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...  \n",
-              "1                                                 []  \n",
-              "2                                                 []  \n",
-              "3                                                 []  \n",
-              "4                                                 []  \n",
-              "5                                                 []  \n",
-              "6                                                 []  "
-            ]
-          },
-          "execution_count": 23,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from my_utils import read_parquet_files_as_df\n",
-        "\n",
-        "output_df = read_parquet_files_as_df(output_folder)\n",
-        "\n",
-        "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
-        "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
-        "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n",
-        "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n",
-        "print (\"Duplicate chunks removed :  \", (input_df.shape[0] - output_df.shape[0]))\n",
-        "\n",
-        "output_df.head(10)"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Stage:5 completed successfully\n",
+      "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n",
+      "Wall time: 36.6 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "from data_processing.utils import ParamsUtils\n",
+    "from fdedup_transform_ray import FdedupRayTransformConfiguration\n",
+    "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
+    "\n",
+    "# create parameters\n",
+    "\n",
+    "local_conf = {\n",
+    "    \"input_folder\": input_folder,\n",
+    "    \"output_folder\": output_folder,\n",
+    "}\n",
+    "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
+    "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n",
+    "params = {\n",
+    "    # where to run\n",
+    "    \"run_locally\": True,\n",
+    "    # Data access. Only required parameters are specified\n",
+    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+    "    # Orchestration parameters\n",
+    "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
+    "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
+    "    # columns used\n",
+    "    \"fdedup_doc_column\": \"contents\",\n",
+    "    \"fdedup_id_column\": \"chunk_id\",\n",
+    "    \"fdedup_cluster_column\": \"chunk_hash\",\n",
+    "    # infrastructure\n",
+    "    \"fdedup_bucket_cpu\": 0.3,\n",
+    "    \"fdedup_doc_cpu\": 0.3,\n",
+    "    \"fdedup_mhash_cpu\": 0.3,\n",
+    "    \"fdedup_num_doc_actors\": 1,\n",
+    "    \"fdedup_num_bucket_actors\": 1,\n",
+    "    \"fdedup_num_minhash_actors\": 1,\n",
+    "    \"fdedup_num_preprocessors\": 1,\n",
+    "    # fuzzy parameters\n",
+    "    \"fdedup_num_permutations\": 64,\n",
+    "    \"fdedup_threshold\": 0.7, # (default 0.8)\n",
+    "    \"fdedup_shingles_size\": 5,\n",
+    "    \"fdedup_delimiters\": \" \"\n",
+    "}\n",
+    "\n",
+    "# Pass commandline params\n",
+    "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
+    "\n",
+    "# launch\n",
+    "\n",
+    "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n",
+    "\n",
+    "return_code = launcher.launch()\n",
+    "\n",
+    "if return_code == 0:\n",
+    "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
+    "else:\n",
+    "    raise Exception (\"❌ Ray job failed\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a6f8cd11",
+   "metadata": {
+    "id": "a6f8cd11"
+   },
+   "source": [
+    "### 7.3 - Inspect Generated output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "e899ad60",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 677
     },
+    "id": "e899ad60",
+    "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 24,
-      "id": "82cc9bb0",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 269
-        },
-        "id": "82cc9bb0",
-        "outputId": "e043fa01-ceca-49ae-b764-8154219c7b6c"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>contents</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Solar System\\nFor more details about the Solar...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Solar System\\nFor more details about our Solar...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>6</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename                                           contents\n",
-              "0   mars.pdf  Solar System\\nFor more details about the Solar...\n",
-              "1   mars.pdf  Mars\\nMars, the fourth planet from the Sun, is...\n",
-              "2   mars.pdf  Basic facts about Mars:\\n· Distance from the S...\n",
-              "3  earth.pdf  Solar System\\nOur solar system is a vast and f...\n",
-              "4  earth.pdf  Solar System\\nFor more details about our Solar...\n",
-              "5  earth.pdf  Earth\\nEarth is the third planet from the Sun....\n",
-              "6  earth.pdf  Earth\\nBasic facts about Earth:\\n· Distance fr..."
-            ]
-          },
-          "execution_count": 24,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "output_df[['filename', 'contents']]"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input data dimensions (rows x columns)=  (8, 18)\n",
+      "Output data dimensions (rows x columns)=  (6, 18)\n",
+      "Duplicate chunks removed  by fuzzy-dedupe:   2\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 25,
-      "id": "cc61dffa",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "cc61dffa",
-        "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "========== mars.pdf ===========\n",
-            "-------Chunk 0------\n",
-            "Solar System\n",
-            "For more details about the Solar system see Chapter 1.\n",
-            "-------\n",
-            "-------Chunk 1------\n",
-            "Mars\n",
-            "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n",
-            "-------\n",
-            "-------Chunk 2------\n",
-            "Basic facts about Mars:\n",
-            "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n",
-            "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n",
-            "· Moons: Two small moons, Phobos and Deimos.\n",
-            "-------\n",
-            "========== earth.pdf ===========\n",
-            "-------Chunk 0------\n",
-            "Solar System\n",
-            "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
-            "-------\n",
-            "-------Chunk 1------\n",
-            "Solar System\n",
-            "For more details about our Solar system see Chapter 1.\n",
-            "-------\n",
-            "-------Chunk 2------\n",
-            "Earth\n",
-            "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n",
-            "-------\n",
-            "-------Chunk 3------\n",
-            "Earth\n",
-            "Basic facts about Earth:\n",
-            "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n",
-            "· Rotation Period: 24 hours (one day)\n",
-            "· Moons: One moon, called Luna or simply \"the Moon\".\n",
-            "-------\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>num_pages</th>\n",
+       "      <th>num_tables</th>\n",
+       "      <th>num_doc_elements</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>hash</th>\n",
+       "      <th>size</th>\n",
+       "      <th>date_acquired</th>\n",
+       "      <th>pdf_convert_time</th>\n",
+       "      <th>source_filename</th>\n",
+       "      <th>source_document_id</th>\n",
+       "      <th>contents</th>\n",
+       "      <th>doc_jsonpath</th>\n",
+       "      <th>page_number</th>\n",
+       "      <th>bbox</th>\n",
+       "      <th>document_id</th>\n",
+       "      <th>chunk_id</th>\n",
+       "      <th>chunk_hash</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "      <td>$.main-text[2]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.84518433, 588.96014404, 479.40917969, 623...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "      <td>4</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
+       "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
+       "      <td>6</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
+       "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Solar System\\nFor more details about our Solar...</td>\n",
+       "      <td>$.main-text[3]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
+       "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
+       "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
+       "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "for f in output_df['filename'].unique():\n",
-        "    print ('==========' , f, '===========')\n",
-        "    chunks = output_df[output_df['filename'] == f]['contents']\n",
-        "    for idx , chunk in enumerate(chunks):\n",
-        "        print (f'-------Chunk {idx}------\\n{chunk}\\n-------')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "383f40ba",
-      "metadata": {
-        "id": "383f40ba"
-      },
-      "source": [
-        "### 6.4 - Understanding the output\n",
-        "\n",
-        "Remember we had 8 chunks initially.  Now we have 7!  One duplicate chunk is removed.\n",
-        "\n",
-        "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf`  is removed from one of the documents!  Pretty neat, eh!\n",
-        "\n",
-        "```text\n",
-        "## Solar System\n",
-        "\n",
-        "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
-        "```"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "85309751-8556-41c6-ac32-84acc941bc8d",
-      "metadata": {
-        "id": "85309751-8556-41c6-ac32-84acc941bc8d"
-      },
-      "source": [
-        "## Step-7: Fuzzy Dedup\n",
-        "\n",
-        "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n",
-        "the data further.\n",
-        "\n",
-        "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addittion of logging statements etc."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "fcf574a3-b287-419c-9c86-07b828b41ca6",
-      "metadata": {
-        "id": "fcf574a3-b287-419c-9c86-07b828b41ca6"
-      },
-      "source": [
-        "### 7.1 - Set Input/output Folder"
+      "text/plain": [
+       "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
+       "0   mars.pdf          1           0                11  pdf   \n",
+       "1   mars.pdf          1           0                11  pdf   \n",
+       "2   mars.pdf          1           0                11  pdf   \n",
+       "3  earth.pdf          1           0                11  pdf   \n",
+       "4  earth.pdf          1           0                11  pdf   \n",
+       "5  earth.pdf          1           0                11  pdf   \n",
+       "\n",
+       "                                                hash  size  \\\n",
+       "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "3  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "\n",
+       "                date_acquired  pdf_convert_time source_filename  \\\n",
+       "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "3  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "\n",
+       "                     source_document_id  \\\n",
+       "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "3  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "\n",
+       "                                            contents    doc_jsonpath  \\\n",
+       "0  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
+       "1  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
+       "2  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
+       "3  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
+       "4  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
+       "5  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
+       "\n",
+       "   page_number                                               bbox  \\\n",
+       "0            1  [132.84518433, 588.96014404, 479.40917969, 623...   \n",
+       "1            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
+       "2            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
+       "3            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
+       "4            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
+       "5            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
+       "\n",
+       "                                         document_id  chunk_id  chunk_hash  \n",
+       "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         4          -1  \n",
+       "1  a31663e06fac41470ecc459f5a58658a3f9997d7801053...         6          -1  \n",
+       "2  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...         7          -1  \n",
+       "3  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...         1           5  \n",
+       "4  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...         2          -1  \n",
+       "5  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...         3          -1  "
       ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from my_utils import read_parquet_files_as_df\n",
+    "\n",
+    "output_df = read_parquet_files_as_df(output_folder)\n",
+    "\n",
+    "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
+    "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
+    "print (\"Duplicate chunks removed  by fuzzy-dedupe:  \", (input_df.shape[0] - output_df.shape[0]))\n",
+    "\n",
+    "output_df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "ab7ea52b",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 238
     },
+    "id": "ab7ea52b",
+    "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 26,
-      "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399",
-        "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "🏃🏼 STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>contents</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Solar System\\nFor more details about our Solar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "## Input to this component is the output of doc_id generator component.\n",
-        "\n",
-        "STAGE  = 5\n",
-        "\n",
-        "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n",
-        "output_folder =  output_fuzzy_dedupe_dir\n",
-        "\n",
-        "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
-        "\n",
-        "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+      "text/plain": [
+       "    filename                                           contents\n",
+       "0   mars.pdf  Solar System\\nOur solar system is a vast and f...\n",
+       "1   mars.pdf  Mars\\nMars, the fourth planet from the Sun, is...\n",
+       "2   mars.pdf  Basic facts about Mars:\\n· Distance from the S...\n",
+       "3  earth.pdf  Solar System\\nFor more details about our Solar...\n",
+       "4  earth.pdf  Earth\\nEarth is the third planet from the Sun....\n",
+       "5  earth.pdf  Earth\\nBasic facts about Earth:\\n· Distance fr..."
       ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "output_df[['filename', 'contents']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "6bdd3515",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "6bdd3515",
+    "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3",
-      "metadata": {
-        "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3"
-      },
-      "source": [
-        "### 7.2 - Execute"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "========== mars.pdf ===========\n",
+      "-------Chunk 0------\n",
+      "Solar System\n",
+      "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
+      "-------\n",
+      "-------Chunk 1------\n",
+      "Mars\n",
+      "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n",
+      "-------\n",
+      "-------Chunk 2------\n",
+      "Basic facts about Mars:\n",
+      "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n",
+      "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n",
+      "· Moons: Two small moons, Phobos and Deimos.\n",
+      "-------\n",
+      "========== earth.pdf ===========\n",
+      "-------Chunk 0------\n",
+      "Solar System\n",
+      "For more details about our Solar system see Chapter 1.\n",
+      "-------\n",
+      "-------Chunk 1------\n",
+      "Earth\n",
+      "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n",
+      "-------\n",
+      "-------Chunk 2------\n",
+      "Earth\n",
+      "Basic facts about Earth:\n",
+      "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n",
+      "· Rotation Period: 24 hours (one day)\n",
+      "· Moons: One moon, called Luna or simply \"the Moon\".\n",
+      "-------\n"
+     ]
+    }
+   ],
+   "source": [
+    "for f in output_df['filename'].unique():\n",
+    "    print ('==========' , f, '===========')\n",
+    "    chunks = output_df[output_df['filename'] == f]['contents']\n",
+    "    for idx , chunk in enumerate(chunks):\n",
+    "        print (f'-------Chunk {idx}------\\n{chunk}\\n-------')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2b34d9c6",
+   "metadata": {
+    "id": "2b34d9c6"
+   },
+   "source": [
+    "### 7.4 - Understanding the output\n",
+    "\n",
+    "So we started with 8 rows and ended up with 6.  Along with the exact duplicate, fuzzy dedupe removed the following **very similar** chunk.\n",
+    "\n",
+    "The two chunks below are nearly identical; they differ only in the words 'the' and 'our'.\n",
+    "\n",
+    "**earth.pdf**\n",
+    "\n",
+    "`For more details about *our* Solar system see Chapter 1.`\n",
+    "\n",
+    "**mars.pdf**\n",
+    "\n",
+    "`For more details about *the* Solar system see Chapter 1.`\n",
+    "\n",
+    "Pretty neat, eh? 👏\n",
+    "\n",
+    "### Configuring Fuzzy de-dupe\n",
+    "\n",
+    "You can tune fuzzy dedupe by adjusting the following parameters:\n",
+    "\n",
+    "```python\n",
+    "# fuzzy parameters\n",
+    "    \"fdedup_num_permutations\": 64,\n",
+    "    \"fdedup_threshold\": 0.7, #  (default 0.8)\n",
+    "    \"fdedup_shingles_size\": 5,\n",
+    "    \"fdedup_delimiters\": \" \"\n",
+    "```\n",
+    "\n",
+    "In our case, we set the `fdedup_threshold` parameter to 0.7 (the default is 0.8).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5370950a-2a3a-4143-8218-f9b4808099ba",
+   "metadata": {
+    "id": "5370950a-2a3a-4143-8218-f9b4808099ba"
+   },
+   "source": [
+    "## Step-8:   Text encoding\n",
+    "\n",
+    "Encode text for the vector storage."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "85aba685",
+   "metadata": {
+    "id": "85aba685"
+   },
+   "source": [
+    "### 8.1 - Set Input/output Folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "20a153fa-fd56-401e-86be-4f7617affcc8",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "20a153fa-fd56-401e-86be-4f7617affcc8",
+    "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 27,
-      "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f",
-        "outputId": "1e63d364-3944-465a-ff7c-6e1dc750b2de"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n",
-            "13:32:00 INFO - pipeline id pipeline_id\n",
-            "13:32:00 INFO - code location None\n",
-            "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
-            "13:32:00 INFO - actor creation delay 0\n",
-            "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n",
-            "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n",
-            "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n",
-            "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-            "13:32:00 INFO - Running locally\n",
-            "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%)  in 0.064 min. Waiting for completion\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n",
-            "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n",
-            "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n",
-            "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n",
-            "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n",
-            "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "✅ Stage:5 completed successfully\n",
-            "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n",
-            "Wall time: 36.6 s\n"
-          ]
-        }
-      ],
-      "source": [
-        "%%time\n",
-        "\n",
-        "import os\n",
-        "import sys\n",
-        "\n",
-        "from data_processing.utils import ParamsUtils\n",
-        "from fdedup_transform_ray import FdedupRayTransformConfiguration\n",
-        "from data_processing_ray.runtime.ray import RayTransformLauncher\n",
-        "\n",
-        "# create parameters\n",
-        "\n",
-        "local_conf = {\n",
-        "    \"input_folder\": input_folder,\n",
-        "    \"output_folder\": output_folder,\n",
-        "}\n",
-        "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
-        "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n",
-        "params = {\n",
-        "    # where to run\n",
-        "    \"run_locally\": True,\n",
-        "    # Data access. Only required parameters are specified\n",
-        "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
-        "    # Orchestration parameters\n",
-        "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
-        "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
-        "    # columns used\n",
-        "    \"fdedup_doc_column\": \"contents\",\n",
-        "    \"fdedup_id_column\": \"chunk_id\",\n",
-        "    \"fdedup_cluster_column\": \"chunk_hash\",\n",
-        "    # infrastructure\n",
-        "    \"fdedup_bucket_cpu\": 0.3,\n",
-        "    \"fdedup_doc_cpu\": 0.3,\n",
-        "    \"fdedup_mhash_cpu\": 0.3,\n",
-        "    \"fdedup_num_doc_actors\": 1,\n",
-        "    \"fdedup_num_bucket_actors\": 1,\n",
-        "    \"fdedup_num_minhash_actors\": 1,\n",
-        "    \"fdedup_num_preprocessors\": 1,\n",
-        "    # fuzzy parameters\n",
-        "    \"fdedup_num_permutations\": 64,\n",
-        "    \"fdedup_threshold\": 0.7, # (default 0.8)\n",
-        "    \"fdedup_shingles_size\": 5,\n",
-        "    \"fdedup_delimiters\": \" \"\n",
-        "}\n",
-        "\n",
-        "# Pass commandline params\n",
-        "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
-        "\n",
-        "# launch\n",
-        "\n",
-        "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n",
-        "\n",
-        "return_code = launcher.launch()\n",
-        "\n",
-        "if return_code == 0:\n",
-        "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
-        "else:\n",
-        "    raise Exception (\"❌ Ray job failed\")"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🏃🏼 STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n"
+     ]
+    }
+   ],
+   "source": [
+    "STAGE  = 6\n",
+    "\n",
+    "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n",
+    "output_folder =  output_embeddings_dir\n",
+    "\n",
+    "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
+    "\n",
+    "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c97545f4",
+   "metadata": {
+    "id": "c97545f4"
+   },
+   "source": [
+    "### 8.2 - Execute"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "228df6b2-bc62-494b-9697-03ece98d7853",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 914,
+     "referenced_widgets": [
+      "8b7571c585df431eb901fcdebdf8177e",
+      "06107a2f48b3491f91bbe84e46e10ba0",
+      "bd74356eca18423aa0373c808d9097e3",
+      "7e13e8779a81400f996d4428c74acfaf",
+      "a75892696be546a3970962bae7bf732a",
+      "68997339f13240a4824a9e416096bee4",
+      "919b086abd314077bbff75687392bd91",
+      "b4c209371e7a403986991a786cfb296d",
+      "6c08de2dd9a2402c90b1a7a645db9b13",
+      "91fff81a1de8487c9009e872b751edb0",
+      "ada62d24cbcf4361acbb21808f334d33"
+     ]
     },
+    "id": "228df6b2-bc62-494b-9697-03ece98d7853",
+    "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "a6f8cd11",
-      "metadata": {
-        "id": "a6f8cd11"
-      },
-      "source": [
-        "### 7.3 - Inspect Generated output"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n",
+      "13:32:37 INFO - pipeline id pipeline_id\n",
+      "13:32:37 INFO - code location None\n",
+      "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
+      "13:32:37 INFO - actor creation delay 0\n",
+      "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n",
+      "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n",
+      "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+      "13:32:37 INFO - Running locally\n",
+      "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
+      "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n",
+      "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n",
+      "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n",
+      "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
+      "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
+      "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n",
+      "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n",
+      "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 28,
-      "id": "e899ad60",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 677
-        },
-        "id": "e899ad60",
-        "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Input data dimensions (rows x columns)=  (8, 18)\n",
-            "Output data dimensions (rows x columns)=  (6, 18)\n",
-            "Duplicate chunks removed  by fuzzy-dedupe:   2\n"
-          ]
-        },
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>num_pages</th>\n",
-              "      <th>num_tables</th>\n",
-              "      <th>num_doc_elements</th>\n",
-              "      <th>ext</th>\n",
-              "      <th>hash</th>\n",
-              "      <th>size</th>\n",
-              "      <th>date_acquired</th>\n",
-              "      <th>pdf_convert_time</th>\n",
-              "      <th>source_filename</th>\n",
-              "      <th>source_document_id</th>\n",
-              "      <th>contents</th>\n",
-              "      <th>doc_jsonpath</th>\n",
-              "      <th>page_number</th>\n",
-              "      <th>bbox</th>\n",
-              "      <th>document_id</th>\n",
-              "      <th>chunk_id</th>\n",
-              "      <th>chunk_hash</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "      <td>$.main-text[2]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.84518433, 588.96014404, 479.40917969, 623...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "      <td>4</td>\n",
-              "      <td>-1</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
-              "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
-              "      <td>6</td>\n",
-              "      <td>-1</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
-              "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
-              "      <td>7</td>\n",
-              "      <td>-1</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Solar System\\nFor more details about our Solar...</td>\n",
-              "      <td>$.main-text[3]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
-              "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
-              "      <td>1</td>\n",
-              "      <td>5</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
-              "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
-              "      <td>2</td>\n",
-              "      <td>-1</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
-              "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
-              "      <td>3</td>\n",
-              "      <td>-1</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
-              "0   mars.pdf          1           0                11  pdf   \n",
-              "1   mars.pdf          1           0                11  pdf   \n",
-              "2   mars.pdf          1           0                11  pdf   \n",
-              "3  earth.pdf          1           0                11  pdf   \n",
-              "4  earth.pdf          1           0                11  pdf   \n",
-              "5  earth.pdf          1           0                11  pdf   \n",
-              "\n",
-              "                                                hash  size  \\\n",
-              "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "3  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "\n",
-              "                date_acquired  pdf_convert_time source_filename  \\\n",
-              "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "3  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "\n",
-              "                     source_document_id  \\\n",
-              "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "3  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "\n",
-              "                                            contents    doc_jsonpath  \\\n",
-              "0  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
-              "1  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
-              "2  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
-              "3  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
-              "4  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
-              "5  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
-              "\n",
-              "   page_number                                               bbox  \\\n",
-              "0            1  [132.84518433, 588.96014404, 479.40917969, 623...   \n",
-              "1            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
-              "2            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
-              "3            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
-              "4            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
-              "5            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
-              "\n",
-              "                                         document_id  chunk_id  chunk_hash  \n",
-              "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         4          -1  \n",
-              "1  a31663e06fac41470ecc459f5a58658a3f9997d7801053...         6          -1  \n",
-              "2  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...         7          -1  \n",
-              "3  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...         1           5  \n",
-              "4  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...         2          -1  \n",
-              "5  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...         3          -1  "
-            ]
-          },
-          "execution_count": 28,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from my_utils import read_parquet_files_as_df\n",
-        "\n",
-        "output_df = read_parquet_files_as_df(output_folder)\n",
-        "\n",
-        "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
-        "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
-        "print (\"Duplicate chunks removed  by fuzzy-dedupe:  \", (input_df.shape[0] - output_df.shape[0]))\n",
-        "\n",
-        "output_df.head(10)"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Stage:6 completed successfully\n",
+      "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n",
+      "Wall time: 22.1 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "from text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n",
+    "\n",
+    "local_conf = {\n",
+    "    \"input_folder\": input_folder,\n",
+    "    \"output_folder\": output_folder,\n",
+    "}\n",
+    "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
+    "params = {\n",
+    "    # where to run\n",
+    "    \"run_locally\": True,\n",
+    "    # Data access. Only required parameters are specified\n",
+    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+    "    # orchestrator\n",
+    "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
+    "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
+    "    # text_encoder\n",
+    "    \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n",
+    "}\n",
+    "\n",
+    "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
+    "# create launcher\n",
+    "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n",
+    "# Launch the ray actor(s) to process the input\n",
+    "\n",
+    "return_code = launcher.launch()\n",
+    "\n",
+    "if return_code == 0:\n",
+    "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
+    "else:\n",
+    "    raise Exception (\"❌ Ray job failed\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b734852c",
+   "metadata": {
+    "id": "b734852c"
+   },
+   "source": [
+    "### 8.3 - Inspect Generated output\n",
+    "\n",
+    "You will see a column called `embeddings` added at the end.  This is the text content converted into vectors or embeddings.  We used the model `sentence-transformers/all-MiniLM-L6-v2`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "7b1c1d09",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 659
     },
+    "id": "7b1c1d09",
+    "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 29,
-      "id": "ab7ea52b",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 238
-        },
-        "id": "ab7ea52b",
-        "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>contents</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Solar System\\nFor more details about our Solar...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename                                           contents\n",
-              "0   mars.pdf  Solar System\\nOur solar system is a vast and f...\n",
-              "1   mars.pdf  Mars\\nMars, the fourth planet from the Sun, is...\n",
-              "2   mars.pdf  Basic facts about Mars:\\n· Distance from the S...\n",
-              "3  earth.pdf  Solar System\\nFor more details about our Solar...\n",
-              "4  earth.pdf  Earth\\nEarth is the third planet from the Sun....\n",
-              "5  earth.pdf  Earth\\nBasic facts about Earth:\\n· Distance fr..."
-            ]
-          },
-          "execution_count": 29,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "output_df[['filename', 'contents']]"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input data dimensions (rows x columns)=  (6, 18)\n",
+      "Output data dimensions (rows x columns)=  (6, 19)\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 30,
-      "id": "6bdd3515",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "6bdd3515",
-        "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "========== mars.pdf ===========\n",
-            "-------Chunk 0------\n",
-            "Solar System\n",
-            "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n",
-            "-------\n",
-            "-------Chunk 1------\n",
-            "Mars\n",
-            "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n",
-            "-------\n",
-            "-------Chunk 2------\n",
-            "Basic facts about Mars:\n",
-            "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n",
-            "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n",
-            "· Moons: Two small moons, Phobos and Deimos.\n",
-            "-------\n",
-            "========== earth.pdf ===========\n",
-            "-------Chunk 0------\n",
-            "Solar System\n",
-            "For more details about our Solar system see Chapter 1.\n",
-            "-------\n",
-            "-------Chunk 1------\n",
-            "Earth\n",
-            "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n",
-            "-------\n",
-            "-------Chunk 2------\n",
-            "Earth\n",
-            "Basic facts about Earth:\n",
-            "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n",
-            "· Rotation Period: 24 hours (one day)\n",
-            "· Moons: One moon, called Luna or simply \"the Moon\".\n",
-            "-------\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>num_pages</th>\n",
+       "      <th>num_tables</th>\n",
+       "      <th>num_doc_elements</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>hash</th>\n",
+       "      <th>size</th>\n",
+       "      <th>date_acquired</th>\n",
+       "      <th>pdf_convert_time</th>\n",
+       "      <th>source_filename</th>\n",
+       "      <th>source_document_id</th>\n",
+       "      <th>contents</th>\n",
+       "      <th>doc_jsonpath</th>\n",
+       "      <th>page_number</th>\n",
+       "      <th>bbox</th>\n",
+       "      <th>document_id</th>\n",
+       "      <th>chunk_id</th>\n",
+       "      <th>chunk_hash</th>\n",
+       "      <th>embeddings</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
+       "      <td>$.main-text[2]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.84518433, 588.96014404, 479.40917969, 623...</td>\n",
+       "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
+       "      <td>4</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>[0.0077404897, -0.020559434, 0.026426662, 0.01...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
+       "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
+       "      <td>6</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>[0.07728298, 0.024971062, -0.04318075, 0.05809...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
+       "      <td>2800</td>\n",
+       "      <td>2024-10-18T13:30:59.490007</td>\n",
+       "      <td>2.011138</td>\n",
+       "      <td>mars.pdf</td>\n",
+       "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
+       "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
+       "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>[0.1059802, 0.025460616, 0.02362733, 0.0390564...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Solar System\\nFor more details about our Solar...</td>\n",
+       "      <td>$.main-text[3]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
+       "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>[-0.062105577, -0.0053322953, 0.03127779, 0.04...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
+       "      <td>$.main-text[5]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
+       "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>[0.0724358, -0.058001805, -0.01977186, -0.0243...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>pdf</td>\n",
+       "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
+       "      <td>2686</td>\n",
+       "      <td>2024-10-18T13:30:59.494027</td>\n",
+       "      <td>2.015123</td>\n",
+       "      <td>earth.pdf</td>\n",
+       "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
+       "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
+       "      <td>$.main-text[6]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
+       "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>[0.091821924, 0.015197907, 0.07716932, 0.01711...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
-      "source": [
-        "for f in output_df['filename'].unique():\n",
-        "    print ('==========' , f, '===========')\n",
-        "    chunks = output_df[output_df['filename'] == f]['contents']\n",
-        "    for idx , chunk in enumerate(chunks):\n",
-        "        print (f'-------Chunk {idx}------\\n{chunk}\\n-------')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "2b34d9c6",
-      "metadata": {
-        "id": "2b34d9c6"
-      },
-      "source": [
-        "### 7.4- Understanding the output\n",
-        "\n",
-        "So we started with 7 rows and ended up with 6.  Fuzzy dedupe removed the following **very similar** chunk.\n",
-        "\n",
-        "These are pretty similar chunks except for the words 'the' and 'our'\n",
-        "\n",
-        "**earth.pdf**\n",
-        "\n",
-        "`For more details about *our* Solar system see Chapter 1.`\n",
-        "\n",
-        "**mars.pdf**\n",
-        "\n",
-        "`For more details about *the* Solar system see Chapter 1.`\n",
-        "\n",
-        "Pretty neat, eh? 👏\n",
-        "\n",
-        "### Configuring Fuzzy de-dupe\n",
-        "\n",
-        "You can tweak fuzzy dedupe by tweaking the following parameters\n",
-        "\n",
-        "```python\n",
-        "# fuzzy parameters\n",
-        "    \"fdedup_num_permutations\": 64,\n",
-        "    \"fdedup_threshold\": 0.7, #  (default 0.8)\n",
-        "    \"fdedup_shingles_size\": 5,\n",
-        "    \"fdedup_delimiters\": \" \"\n",
-        "```\n",
-        "\n",
-        "In our case, we set `fdedup_threshold` parameter to 0.7.  \n"
+      "text/plain": [
+       "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
+       "0   mars.pdf          1           0                11  pdf   \n",
+       "1   mars.pdf          1           0                11  pdf   \n",
+       "2   mars.pdf          1           0                11  pdf   \n",
+       "3  earth.pdf          1           0                11  pdf   \n",
+       "4  earth.pdf          1           0                11  pdf   \n",
+       "5  earth.pdf          1           0                11  pdf   \n",
+       "\n",
+       "                                                hash  size  \\\n",
+       "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
+       "3  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
+       "\n",
+       "                date_acquired  pdf_convert_time source_filename  \\\n",
+       "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
+       "3  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
+       "\n",
+       "                     source_document_id  \\\n",
+       "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
+       "3  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
+       "\n",
+       "                                            contents    doc_jsonpath  \\\n",
+       "0  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
+       "1  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
+       "2  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
+       "3  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
+       "4  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
+       "5  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
+       "\n",
+       "   page_number                                               bbox  \\\n",
+       "0            1  [132.84518433, 588.96014404, 479.40917969, 623...   \n",
+       "1            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
+       "2            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
+       "3            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
+       "4            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
+       "5            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
+       "\n",
+       "                                         document_id  chunk_id  chunk_hash  \\\n",
+       "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         4          -1   \n",
+       "1  a31663e06fac41470ecc459f5a58658a3f9997d7801053...         6          -1   \n",
+       "2  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...         7          -1   \n",
+       "3  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...         1           5   \n",
+       "4  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...         2          -1   \n",
+       "5  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...         3          -1   \n",
+       "\n",
+       "                                          embeddings  \n",
+       "0  [0.0077404897, -0.020559434, 0.026426662, 0.01...  \n",
+       "1  [0.07728298, 0.024971062, -0.04318075, 0.05809...  \n",
+       "2  [0.1059802, 0.025460616, 0.02362733, 0.0390564...  \n",
+       "3  [-0.062105577, -0.0053322953, 0.03127779, 0.04...  \n",
+       "4  [0.0724358, -0.058001805, -0.01977186, -0.0243...  \n",
+       "5  [0.091821924, 0.015197907, 0.07716932, 0.01711...  "
       ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from my_utils import read_parquet_files_as_df\n",
+    "\n",
+    "output_df = read_parquet_files_as_df(output_folder)\n",
+    "\n",
+    "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
+    "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
+    "\n",
+    "output_df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5e12630-be6b-4188-a925-77117155617b",
+   "metadata": {
+    "id": "f5e12630-be6b-4188-a925-77117155617b"
+   },
+   "source": [
+    "## Step-9: Copy output to final output dir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207",
+    "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "id": "5370950a-2a3a-4143-8218-f9b4808099ba",
-      "metadata": {
-        "id": "5370950a-2a3a-4143-8218-f9b4808099ba"
-      },
-      "source": [
-        "## Step-8:   Text encoding\n",
-        "\n",
-        "Encode text for the vector storage."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n"
+     ]
+    }
+   ],
+   "source": [
+    "import shutil\n",
+    "\n",
+    "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n",
+    "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n",
+    "\n",
+    "print (f\"✅ Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "dc0a6728",
+   "metadata": {
+    "id": "dc0a6728"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "dpk-3-basic-022dev1-py311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "06107a2f48b3491f91bbe84e46e10ba0": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "HTMLModel",
+     "state": {
+      "_dom_classes": [],
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "HTMLModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/controls",
+      "_view_module_version": "1.5.0",
+      "_view_name": "HTMLView",
+      "description": "",
+      "description_tooltip": null,
+      "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4",
+      "placeholder": "​",
+      "style": "IPY_MODEL_919b086abd314077bbff75687392bd91",
+      "value": ""
+     }
     },
-    {
-      "cell_type": "markdown",
-      "id": "85aba685",
-      "metadata": {
-        "id": "85aba685"
-      },
-      "source": [
-        "### 8.1 - Set Input/output Folder"
-      ]
+    "68997339f13240a4824a9e416096bee4": {
+     "model_module": "@jupyter-widgets/base",
+     "model_module_version": "1.2.0",
+     "model_name": "LayoutModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/base",
+      "_model_module_version": "1.2.0",
+      "_model_name": "LayoutModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "LayoutView",
+      "align_content": null,
+      "align_items": null,
+      "align_self": null,
+      "border": null,
+      "bottom": null,
+      "display": null,
+      "flex": null,
+      "flex_flow": null,
+      "grid_area": null,
+      "grid_auto_columns": null,
+      "grid_auto_flow": null,
+      "grid_auto_rows": null,
+      "grid_column": null,
+      "grid_gap": null,
+      "grid_row": null,
+      "grid_template_areas": null,
+      "grid_template_columns": null,
+      "grid_template_rows": null,
+      "height": null,
+      "justify_content": null,
+      "justify_items": null,
+      "left": null,
+      "margin": null,
+      "max_height": null,
+      "max_width": null,
+      "min_height": null,
+      "min_width": null,
+      "object_fit": null,
+      "object_position": null,
+      "order": null,
+      "overflow": null,
+      "overflow_x": null,
+      "overflow_y": null,
+      "padding": null,
+      "right": null,
+      "top": null,
+      "visibility": null,
+      "width": null
+     }
     },
-    {
-      "cell_type": "code",
-      "execution_count": 31,
-      "id": "20a153fa-fd56-401e-86be-4f7617affcc8",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "20a153fa-fd56-401e-86be-4f7617affcc8",
-        "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "🏃🏼 STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n"
-          ]
-        }
-      ],
-      "source": [
-        "STAGE  = 6\n",
-        "\n",
-        "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n",
-        "output_folder =  output_embeddings_dir\n",
-        "\n",
-        "input_df = read_parquet_files_as_df(input_folder)  ## for debug purposes\n",
-        "\n",
-        "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")"
-      ]
+    "6c08de2dd9a2402c90b1a7a645db9b13": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "ProgressStyleModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "ProgressStyleModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "StyleView",
+      "bar_color": null,
+      "description_width": ""
+     }
     },
-    {
-      "cell_type": "markdown",
-      "id": "c97545f4",
-      "metadata": {
-        "id": "c97545f4"
-      },
-      "source": [
-        "### 8.2 - Execute"
-      ]
+    "7e13e8779a81400f996d4428c74acfaf": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "HTMLModel",
+     "state": {
+      "_dom_classes": [],
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "HTMLModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/controls",
+      "_view_module_version": "1.5.0",
+      "_view_name": "HTMLView",
+      "description": "",
+      "description_tooltip": null,
+      "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0",
+      "placeholder": "​",
+      "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33",
+      "value": " 0/0 [00:00&lt;?, ?it/s]"
+     }
     },
-    {
-      "cell_type": "code",
-      "execution_count": 32,
-      "id": "228df6b2-bc62-494b-9697-03ece98d7853",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 914,
-          "referenced_widgets": [
-            "8b7571c585df431eb901fcdebdf8177e",
-            "06107a2f48b3491f91bbe84e46e10ba0",
-            "bd74356eca18423aa0373c808d9097e3",
-            "7e13e8779a81400f996d4428c74acfaf",
-            "a75892696be546a3970962bae7bf732a",
-            "68997339f13240a4824a9e416096bee4",
-            "919b086abd314077bbff75687392bd91",
-            "b4c209371e7a403986991a786cfb296d",
-            "6c08de2dd9a2402c90b1a7a645db9b13",
-            "91fff81a1de8487c9009e872b751edb0",
-            "ada62d24cbcf4361acbb21808f334d33"
-          ]
-        },
-        "id": "228df6b2-bc62-494b-9697-03ece98d7853",
-        "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n",
-            "13:32:37 INFO - pipeline id pipeline_id\n",
-            "13:32:37 INFO - code location None\n",
-            "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n",
-            "13:32:37 INFO - actor creation delay 0\n",
-            "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n",
-            "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n",
-            "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n",
-            "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-            "13:32:37 INFO - Running locally\n",
-            "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
-            "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n",
-            "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n",
-            "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n",
-            "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n",
-            "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%)  in 0.0 min. Waiting for completion\n",
-            "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n",
-            "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n",
-            "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "✅ Stage:6 completed successfully\n",
-            "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n",
-            "Wall time: 22.1 s\n"
-          ]
-        }
+    "8b7571c585df431eb901fcdebdf8177e": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "HBoxModel",
+     "state": {
+      "_dom_classes": [],
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "HBoxModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/controls",
+      "_view_module_version": "1.5.0",
+      "_view_name": "HBoxView",
+      "box_style": "",
+      "children": [
+       "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0",
+       "IPY_MODEL_bd74356eca18423aa0373c808d9097e3",
+       "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf"
       ],
-      "source": [
-        "%%time\n",
-        "\n",
-        "from text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n",
-        "\n",
-        "local_conf = {\n",
-        "    \"input_folder\": input_folder,\n",
-        "    \"output_folder\": output_folder,\n",
-        "}\n",
-        "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n",
-        "params = {\n",
-        "    # where to run\n",
-        "    \"run_locally\": True,\n",
-        "    # Data access. Only required parameters are specified\n",
-        "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
-        "    # orchestrator\n",
-        "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
-        "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
-        "    # text_encoder\n",
-        "    \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n",
-        "}\n",
-        "\n",
-        "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
-        "# create launcher\n",
-        "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n",
-        "# Launch the ray actor(s) to process the input\n",
-        "\n",
-        "return_code = launcher.launch()\n",
-        "\n",
-        "if return_code == 0:\n",
-        "    print (f\"✅ Stage:{STAGE} completed successfully\")\n",
-        "else:\n",
-        "    raise Exception (\"❌ Ray job failed\")"
-      ]
+      "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a"
+     }
     },
-    {
-      "cell_type": "markdown",
-      "id": "b734852c",
-      "metadata": {
-        "id": "b734852c"
-      },
-      "source": [
-        "### 8.3 - Inspect Generated output\n",
-        "\n",
-        "You will see a column called `embeddings` added at the end.  This the text content converted into vectors or embeddings.  We used the model `sentence-transformers/all-MiniLM-L6-v2`"
-      ]
+    "919b086abd314077bbff75687392bd91": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "DescriptionStyleModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "DescriptionStyleModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "StyleView",
+      "description_width": ""
+     }
     },
-    {
-      "cell_type": "code",
-      "execution_count": 33,
-      "id": "7b1c1d09",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 659
-        },
-        "id": "7b1c1d09",
-        "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Input data dimensions (rows x columns)=  (6, 18)\n",
-            "Output data dimensions (rows x columns)=  (6, 19)\n"
-          ]
-        },
-        {
-          "data": {
-            "text/html": [
-              "<div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>filename</th>\n",
-              "      <th>num_pages</th>\n",
-              "      <th>num_tables</th>\n",
-              "      <th>num_doc_elements</th>\n",
-              "      <th>ext</th>\n",
-              "      <th>hash</th>\n",
-              "      <th>size</th>\n",
-              "      <th>date_acquired</th>\n",
-              "      <th>pdf_convert_time</th>\n",
-              "      <th>source_filename</th>\n",
-              "      <th>source_document_id</th>\n",
-              "      <th>contents</th>\n",
-              "      <th>doc_jsonpath</th>\n",
-              "      <th>page_number</th>\n",
-              "      <th>bbox</th>\n",
-              "      <th>document_id</th>\n",
-              "      <th>chunk_id</th>\n",
-              "      <th>chunk_hash</th>\n",
-              "      <th>embeddings</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Solar System\\nOur solar system is a vast and f...</td>\n",
-              "      <td>$.main-text[2]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.84518433, 588.96014404, 479.40917969, 623...</td>\n",
-              "      <td>44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...</td>\n",
-              "      <td>4</td>\n",
-              "      <td>-1</td>\n",
-              "      <td>[0.0077404897, -0.020559434, 0.026426662, 0.01...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Mars\\nMars, the fourth planet from the Sun, is...</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.87440491, 500.84011841, 477.48345947, 534...</td>\n",
-              "      <td>a31663e06fac41470ecc459f5a58658a3f9997d7801053...</td>\n",
-              "      <td>6</td>\n",
-              "      <td>-1</td>\n",
-              "      <td>[0.07728298, 0.024971062, -0.04318075, 0.05809...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...</td>\n",
-              "      <td>2800</td>\n",
-              "      <td>2024-10-18T13:30:59.490007</td>\n",
-              "      <td>2.011138</td>\n",
-              "      <td>mars.pdf</td>\n",
-              "      <td>62e5639f-f922-4ccc-a041-3cb02f1cfd83</td>\n",
-              "      <td>Basic facts about Mars:\\n· Distance from the S...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.2026062, 482.90710449, 237.04431152, 493....</td>\n",
-              "      <td>7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...</td>\n",
-              "      <td>7</td>\n",
-              "      <td>-1</td>\n",
-              "      <td>[0.1059802, 0.025460616, 0.02362733, 0.0390564...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Solar System\\nFor more details about our Solar...</td>\n",
-              "      <td>$.main-text[3]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.20942688, 570.81555176, 375.57919312, 581...</td>\n",
-              "      <td>d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...</td>\n",
-              "      <td>1</td>\n",
-              "      <td>5</td>\n",
-              "      <td>[-0.062105577, -0.0053322953, 0.03127779, 0.04...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nEarth is the third planet from the Sun....</td>\n",
-              "      <td>$.main-text[5]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[132.91053772, 512.46295166, 477.84887695, 534...</td>\n",
-              "      <td>7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...</td>\n",
-              "      <td>2</td>\n",
-              "      <td>-1</td>\n",
-              "      <td>[0.0724358, -0.058001805, -0.01977186, -0.0243...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>5</th>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>1</td>\n",
-              "      <td>0</td>\n",
-              "      <td>11</td>\n",
-              "      <td>pdf</td>\n",
-              "      <td>18713f970989055625bef22209b6f4b6830b9ca22046bf...</td>\n",
-              "      <td>2686</td>\n",
-              "      <td>2024-10-18T13:30:59.494027</td>\n",
-              "      <td>2.015123</td>\n",
-              "      <td>earth.pdf</td>\n",
-              "      <td>f3c0ac2e-1de2-472b-8216-2043f3b3e9d1</td>\n",
-              "      <td>Earth\\nBasic facts about Earth:\\n· Distance fr...</td>\n",
-              "      <td>$.main-text[6]</td>\n",
-              "      <td>1</td>\n",
-              "      <td>[133.30151367, 494.86206055, 240.17156982, 505...</td>\n",
-              "      <td>189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...</td>\n",
-              "      <td>3</td>\n",
-              "      <td>-1</td>\n",
-              "      <td>[0.091821924, 0.015197907, 0.07716932, 0.01711...</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>"
-            ],
-            "text/plain": [
-              "    filename  num_pages  num_tables  num_doc_elements  ext  \\\n",
-              "0   mars.pdf          1           0                11  pdf   \n",
-              "1   mars.pdf          1           0                11  pdf   \n",
-              "2   mars.pdf          1           0                11  pdf   \n",
-              "3  earth.pdf          1           0                11  pdf   \n",
-              "4  earth.pdf          1           0                11  pdf   \n",
-              "5  earth.pdf          1           0                11  pdf   \n",
-              "\n",
-              "                                                hash  size  \\\n",
-              "0  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "1  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "2  8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...  2800   \n",
-              "3  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "4  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "5  18713f970989055625bef22209b6f4b6830b9ca22046bf...  2686   \n",
-              "\n",
-              "                date_acquired  pdf_convert_time source_filename  \\\n",
-              "0  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "1  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "2  2024-10-18T13:30:59.490007          2.011138        mars.pdf   \n",
-              "3  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "4  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "5  2024-10-18T13:30:59.494027          2.015123       earth.pdf   \n",
-              "\n",
-              "                     source_document_id  \\\n",
-              "0  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "1  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "2  62e5639f-f922-4ccc-a041-3cb02f1cfd83   \n",
-              "3  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "4  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "5  f3c0ac2e-1de2-472b-8216-2043f3b3e9d1   \n",
-              "\n",
-              "                                            contents    doc_jsonpath  \\\n",
-              "0  Solar System\\nOur solar system is a vast and f...  $.main-text[2]   \n",
-              "1  Mars\\nMars, the fourth planet from the Sun, is...  $.main-text[5]   \n",
-              "2  Basic facts about Mars:\\n· Distance from the S...  $.main-text[6]   \n",
-              "3  Solar System\\nFor more details about our Solar...  $.main-text[3]   \n",
-              "4  Earth\\nEarth is the third planet from the Sun....  $.main-text[5]   \n",
-              "5  Earth\\nBasic facts about Earth:\\n· Distance fr...  $.main-text[6]   \n",
-              "\n",
-              "   page_number                                               bbox  \\\n",
-              "0            1  [132.84518433, 588.96014404, 479.40917969, 623...   \n",
-              "1            1  [132.87440491, 500.84011841, 477.48345947, 534...   \n",
-              "2            1  [133.2026062, 482.90710449, 237.04431152, 493....   \n",
-              "3            1  [133.20942688, 570.81555176, 375.57919312, 581...   \n",
-              "4            1  [132.91053772, 512.46295166, 477.84887695, 534...   \n",
-              "5            1  [133.30151367, 494.86206055, 240.17156982, 505...   \n",
-              "\n",
-              "                                         document_id  chunk_id  chunk_hash  \\\n",
-              "0  44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...         4          -1   \n",
-              "1  a31663e06fac41470ecc459f5a58658a3f9997d7801053...         6          -1   \n",
-              "2  7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...         7          -1   \n",
-              "3  d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...         1           5   \n",
-              "4  7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...         2          -1   \n",
-              "5  189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...         3          -1   \n",
-              "\n",
-              "                                          embeddings  \n",
-              "0  [0.0077404897, -0.020559434, 0.026426662, 0.01...  \n",
-              "1  [0.07728298, 0.024971062, -0.04318075, 0.05809...  \n",
-              "2  [0.1059802, 0.025460616, 0.02362733, 0.0390564...  \n",
-              "3  [-0.062105577, -0.0053322953, 0.03127779, 0.04...  \n",
-              "4  [0.0724358, -0.058001805, -0.01977186, -0.0243...  \n",
-              "5  [0.091821924, 0.015197907, 0.07716932, 0.01711...  "
-            ]
-          },
-          "execution_count": 33,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from my_utils import read_parquet_files_as_df\n",
-        "\n",
-        "output_df = read_parquet_files_as_df(output_folder)\n",
-        "\n",
-        "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n",
-        "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n",
-        "\n",
-        "output_df.head(10)"
-      ]
+    "91fff81a1de8487c9009e872b751edb0": {
+     "model_module": "@jupyter-widgets/base",
+     "model_module_version": "1.2.0",
+     "model_name": "LayoutModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/base",
+      "_model_module_version": "1.2.0",
+      "_model_name": "LayoutModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "LayoutView",
+      "align_content": null,
+      "align_items": null,
+      "align_self": null,
+      "border": null,
+      "bottom": null,
+      "display": null,
+      "flex": null,
+      "flex_flow": null,
+      "grid_area": null,
+      "grid_auto_columns": null,
+      "grid_auto_flow": null,
+      "grid_auto_rows": null,
+      "grid_column": null,
+      "grid_gap": null,
+      "grid_row": null,
+      "grid_template_areas": null,
+      "grid_template_columns": null,
+      "grid_template_rows": null,
+      "height": null,
+      "justify_content": null,
+      "justify_items": null,
+      "left": null,
+      "margin": null,
+      "max_height": null,
+      "max_width": null,
+      "min_height": null,
+      "min_width": null,
+      "object_fit": null,
+      "object_position": null,
+      "order": null,
+      "overflow": null,
+      "overflow_x": null,
+      "overflow_y": null,
+      "padding": null,
+      "right": null,
+      "top": null,
+      "visibility": null,
+      "width": null
+     }
     },
-    {
-      "cell_type": "markdown",
-      "id": "f5e12630-be6b-4188-a925-77117155617b",
-      "metadata": {
-        "id": "f5e12630-be6b-4188-a925-77117155617b"
-      },
-      "source": [
-        "## Step-9: Copy output to final output dir"
-      ]
+    "a75892696be546a3970962bae7bf732a": {
+     "model_module": "@jupyter-widgets/base",
+     "model_module_version": "1.2.0",
+     "model_name": "LayoutModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/base",
+      "_model_module_version": "1.2.0",
+      "_model_name": "LayoutModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "LayoutView",
+      "align_content": null,
+      "align_items": null,
+      "align_self": null,
+      "border": null,
+      "bottom": null,
+      "display": null,
+      "flex": null,
+      "flex_flow": null,
+      "grid_area": null,
+      "grid_auto_columns": null,
+      "grid_auto_flow": null,
+      "grid_auto_rows": null,
+      "grid_column": null,
+      "grid_gap": null,
+      "grid_row": null,
+      "grid_template_areas": null,
+      "grid_template_columns": null,
+      "grid_template_rows": null,
+      "height": null,
+      "justify_content": null,
+      "justify_items": null,
+      "left": null,
+      "margin": null,
+      "max_height": null,
+      "max_width": null,
+      "min_height": null,
+      "min_width": null,
+      "object_fit": null,
+      "object_position": null,
+      "order": null,
+      "overflow": null,
+      "overflow_x": null,
+      "overflow_y": null,
+      "padding": null,
+      "right": null,
+      "top": null,
+      "visibility": null,
+      "width": null
+     }
     },
-    {
-      "cell_type": "code",
-      "execution_count": 34,
-      "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207",
-        "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "✅ Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n"
-          ]
-        }
-      ],
-      "source": [
-        "import shutil\n",
-        "\n",
-        "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n",
-        "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n",
-        "\n",
-        "print (f\"✅ Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 31,
-      "id": "dc0a6728",
-      "metadata": {
-        "id": "dc0a6728"
-      },
-      "outputs": [],
-      "source": []
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "provenance": []
+    "ada62d24cbcf4361acbb21808f334d33": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "DescriptionStyleModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "DescriptionStyleModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "StyleView",
+      "description_width": ""
+     }
     },
-    "kernelspec": {
-      "display_name": "dpk-2-basic-021-py311",
-      "language": "python",
-      "name": "python3"
+    "b4c209371e7a403986991a786cfb296d": {
+     "model_module": "@jupyter-widgets/base",
+     "model_module_version": "1.2.0",
+     "model_name": "LayoutModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/base",
+      "_model_module_version": "1.2.0",
+      "_model_name": "LayoutModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "LayoutView",
+      "align_content": null,
+      "align_items": null,
+      "align_self": null,
+      "border": null,
+      "bottom": null,
+      "display": null,
+      "flex": null,
+      "flex_flow": null,
+      "grid_area": null,
+      "grid_auto_columns": null,
+      "grid_auto_flow": null,
+      "grid_auto_rows": null,
+      "grid_column": null,
+      "grid_gap": null,
+      "grid_row": null,
+      "grid_template_areas": null,
+      "grid_template_columns": null,
+      "grid_template_rows": null,
+      "height": null,
+      "justify_content": null,
+      "justify_items": null,
+      "left": null,
+      "margin": null,
+      "max_height": null,
+      "max_width": null,
+      "min_height": null,
+      "min_width": null,
+      "object_fit": null,
+      "object_position": null,
+      "order": null,
+      "overflow": null,
+      "overflow_x": null,
+      "overflow_y": null,
+      "padding": null,
+      "right": null,
+      "top": null,
+      "visibility": null,
+      "width": "20px"
+     }
     },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.11.10"
-    },
-    "widgets": {
-      "application/vnd.jupyter.widget-state+json": {
-        "06107a2f48b3491f91bbe84e46e10ba0": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4",
-            "placeholder": "​",
-            "style": "IPY_MODEL_919b086abd314077bbff75687392bd91",
-            "value": ""
-          }
-        },
-        "68997339f13240a4824a9e416096bee4": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "6c08de2dd9a2402c90b1a7a645db9b13": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "7e13e8779a81400f996d4428c74acfaf": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0",
-            "placeholder": "​",
-            "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33",
-            "value": " 0/0 [00:00&lt;?, ?it/s]"
-          }
-        },
-        "8b7571c585df431eb901fcdebdf8177e": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0",
-              "IPY_MODEL_bd74356eca18423aa0373c808d9097e3",
-              "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf"
-            ],
-            "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a"
-          }
-        },
-        "919b086abd314077bbff75687392bd91": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "91fff81a1de8487c9009e872b751edb0": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "a75892696be546a3970962bae7bf732a": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "ada62d24cbcf4361acbb21808f334d33": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "b4c209371e7a403986991a786cfb296d": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": "20px"
-          }
-        },
-        "bd74356eca18423aa0373c808d9097e3": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d",
-            "max": 1,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13",
-            "value": 0
-          }
-        }
-      }
+    "bd74356eca18423aa0373c808d9097e3": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "FloatProgressModel",
+     "state": {
+      "_dom_classes": [],
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "FloatProgressModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/controls",
+      "_view_module_version": "1.5.0",
+      "_view_name": "ProgressView",
+      "bar_style": "success",
+      "description": "",
+      "description_tooltip": null,
+      "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d",
+      "max": 1,
+      "min": 0,
+      "orientation": "horizontal",
+      "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13",
+      "value": 0
+     }
     }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 5
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
 }
diff --git a/examples/notebooks/rag/README.md b/examples/notebooks/rag/README.md
index f4a3460a1..16ffdb15e 100644
--- a/examples/notebooks/rag/README.md
+++ b/examples/notebooks/rag/README.md
@@ -76,7 +76,7 @@ REPLICATE_API_TOKEN=your REPLICATE token goes here
 
 ### 5.2 - Run the query code
 
-Code: [rag_1D_query_llama_replicate.ipynb](rag_1D_query_llama_replicate.ipynb)
+Code: [rag_1D_query_replicate.ipynb](rag_1D_query_replicate.ipynb)
 
 
 
diff --git a/examples/notebooks/rag/my_config.py b/examples/notebooks/rag/my_config.py
index ba9ea89fd..66fc1ecf7 100644
--- a/examples/notebooks/rag/my_config.py
+++ b/examples/notebooks/rag/my_config.py
@@ -23,8 +23,10 @@ class MyConfig:
 MY_CONFIG.EMBEDDING_LENGTH = 384
 
 ## LLM Model
-MY_CONFIG.LLM_MODEL = "meta/meta-llama-3-8b-instruct"
-
+# MY_CONFIG.LLM_MODEL = "meta/meta-llama-3-8b-instruct"
+# MY_CONFIG.LLM_MODEL = "meta/meta-llama-3-70b-instruct"
+# MY_CONFIG.LLM_MODEL = "ibm-granite/granite-3.0-2b-instruct"
+MY_CONFIG.LLM_MODEL = "ibm-granite/granite-3.0-8b-instruct"
 
 
 ## RAY CONFIGURATION
diff --git a/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb b/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb
index 8a8942b1f..8bdea1ff6 100644
--- a/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb
+++ b/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb
@@ -222,7 +222,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26",
    "metadata": {},
    "outputs": [
@@ -303,7 +303,7 @@
     "    \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n",
     "    # orchestrator\n",
     "    \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n",
-    "    \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n",
+    "    \"runtime_num_workers\": 1,  # so model download to cleanup works properly\n",
     "    \"runtime_pipeline_id\": \"pipeline_id\",\n",
     "    \"runtime_job_id\": \"job_id\",\n",
     "    \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n",
@@ -2159,7 +2159,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "data-prep-kit-3-py312",
    "language": "python",
    "name": "python3"
   },
@@ -2173,7 +2173,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,
diff --git a/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb b/examples/notebooks/rag/rag_1D_query_replicate.ipynb
similarity index 88%
rename from examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb
rename to examples/notebooks/rag/rag_1D_query_replicate.ipynb
index 532b7ef4d..5e94ac0e8 100644
--- a/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb
+++ b/examples/notebooks/rag/rag_1D_query_replicate.ipynb
@@ -249,33 +249,45 @@
     "\n",
     "### LLM Choices at Replicate\n",
     "\n",
-    "- llama 3.1 : Latest\n",
-    "    - **meta/meta-llama-3.1-405b-instruct** : Meta's flagship 405 billion parameter language model, fine-tuned for chat completions\n",
-    "- Base version of llama-3 from meta\n",
-    "    - [meta/meta-llama-3-8b](https://replicate.com/meta/meta-llama-3-8b) : Base version of Llama 3, an 8 billion parameter language model from Meta.\n",
-    "    - **meta/meta-llama-3-70b** : 70 billion\n",
-    "- Instruct versions of llama-3 from meta, fine tuned for chat completions\n",
-    "    - **meta/meta-llama-3-8b-instruct** : An 8 billion parameter language model from Meta, \n",
-    "    - **meta/meta-llama-3-70b-instruct** : 70 billion\n",
+    "\n",
+    "| Model                               | Publisher | Params | Description                                          |\n",
+    "|-------------------------------------|-----------|--------|------------------------------------------------------|\n",
+    "| ibm-granite/granite-3.0-8b-instruct | IBM       | 8 B    | IBM's newest Granite Model v3.0  (default)           |\n",
+    "| ibm-granite/granite-3.0-2b-instruct | IBM       | 2 B    | IBM's newest Granite Model v3.0                      |\n",
+    "| meta/meta-llama-3.1-405b-instruct   | Meta      | 405 B  | Meta's flagship 405 billion parameter language model |\n",
+    "| meta/meta-llama-3-8b-instruct       | Meta      | 8 B    | Meta's 8 billion parameter language model            |\n",
+    "| meta/meta-llama-3-70b-instruct      | Meta      | 70 B   | Meta's 70 billion parameter language model           |\n",
     "\n",
     "References \n",
     "\n",
-    "- https://docs.llamaindex.ai/en/stable/examples/llm/llama_2/?h=replicate"
+    "- https://www.ibm.com/granite\n",
+    "- https://www.llama.com/\n",
+    "- https://replicate.com/  "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using model: ibm-granite/granite-3.0-8b-instruct\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
-    "os.environ[\"REPLICATE_API_TOKEN\"] = MY_CONFIG.REPLICATE_API_TOKEN"
+    "os.environ[\"REPLICATE_API_TOKEN\"] = MY_CONFIG.REPLICATE_API_TOKEN\n",
+    "\n",
+    "print ('Using model:', MY_CONFIG.LLM_MODEL)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -335,7 +347,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -351,11 +363,11 @@
       "Mayank Mishra ⋆ Matt Stallone ⋆ Gaoyuan Zhang ⋆ Yikang Shen Aditya Prasad Adriana Meza Soria Michele Merler Parameswaran Selvam Saptha Surendran Shivdeep Singh Manish Sethi Xuan-Hong Dang Pengyuan Li Kun-Lung Wu Syed Zawad Andrew Coleman Matthew White Mark Lewis Raju Pavuluri Yan Koyfman Boris Lublinsky Maximilien de Bayser Ibrahim Abdelaziz Kinjal Basu Mayank Agarwal Yi Zhou Chris Johnson Aanchal Goyal Hima Patel Yousaf Shah Petros Zerfos Heiko Ludwig Asim Munawar Maxwell Crouse Pavan Kapanipathi Shweta Salaria Bob Calio Sophia Wen Seetharami Seelam Brian Belgodere Carlos Fonseca Amith Singhee Nirmit Desai David D. Cox Ruchir Puri † Rameswar Panda †\n",
       "============ end  context ============\n",
       "============ here is the answer from LLM... STREAMING... =====\n",
-      "Based on the provided context, the training data used to train Granite models is not explicitly mentioned. However, it is mentioned that the 20B model was used after 1.6T tokens to start training of 34B model with the same code pretraining data without any changes to the training and inference framework. This implies that the same code pretraining data was used for both models, but the exact nature of this data is not specified.\n",
+      "The context does not provide specific details about the training data used to train the Granite models. It only mentions that the 20B model was trained after 1.6T tokens and then used to start training the 34B model with the same code pretraining data. However, it does not specify what this code pretraining data is.\n",
       "======  end LLM answer ======\n",
       "\n",
-      "CPU times: user 75.3 ms, sys: 37.8 ms, total: 113 ms\n",
-      "Wall time: 1.95 s\n"
+      "CPU times: user 63.6 ms, sys: 12 ms, total: 75.6 ms\n",
+      "Wall time: 1.43 s\n"
      ]
     }
    ],
@@ -369,7 +381,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -385,11 +397,11 @@
       "We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours.\n",
       "============ end  context ============\n",
       "============ here is the answer from LLM... STREAMING... =====\n",
-      "Based on the provided context, an attention mechanism can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum.\n",
+      "An attention mechanism is a method used in sequence modeling and transduction models to model dependencies between elements in input or output sequences, regardless of their distance. It maps a query and a set of key-value pairs to an output, which is computed as a weighted sum.\n",
       "======  end LLM answer ======\n",
       "\n",
-      "CPU times: user 41.1 ms, sys: 28.7 ms, total: 69.8 ms\n",
-      "Wall time: 1.58 s\n"
+      "CPU times: user 30.6 ms, sys: 17.3 ms, total: 47.9 ms\n",
+      "Wall time: 880 ms\n"
      ]
     }
    ],
@@ -403,7 +415,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -419,11 +431,11 @@
       "The Granite Code models achieve relatively high accuracy across all sizes (e.g., outperforming CodeGemma at 2B-3B scale, StarCoder2 at 7B-8B scale and CodeLlama models with half of the sizes). This shows that our Granite Code models are not only capable of generating good code but also of using libraries more accurately in real data science workflows.\n",
       "============ end  context ============\n",
       "============ here is the answer from LLM... STREAMING... =====\n",
-      "I apologize, but the provided context does not mention the moon landing. The context appears to be about code generation and evaluation benchmarks, specifically discussing the MBPP and MBPP+ benchmarks, and the performance of different code models. There is no mention of the moon landing. If you provide a different context or question, I'll be happy to help.\n",
+      "I'm sorry, the provided context does not contain information about the moon landing.\n",
       "======  end LLM answer ======\n",
       "\n",
-      "CPU times: user 41.5 ms, sys: 21 ms, total: 62.5 ms\n",
-      "Wall time: 2.13 s\n"
+      "CPU times: user 45 ms, sys: 3.19 ms, total: 48.2 ms\n",
+      "Wall time: 412 ms\n"
      ]
     }
    ],
@@ -445,7 +457,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "data-prep-kit-4-021",
    "language": "python",
    "name": "python3"
   },
diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md
index fbacf4ade..9abca2b79 100644
--- a/transforms/language/doc_chunk/python/README.md
+++ b/transforms/language/doc_chunk/python/README.md
@@ -32,7 +32,6 @@ The transform can be tuned with the following parameters.
 | `chunking_type`        | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. |
 | `content_column_name`        | `contents` | Name of the column containing the text to be chunked. |
 | `doc_id_column_name`         | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
-| `dl_min_chunk_len`           | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. |
 | `chunk_size_tokens`          | `128` | Size of the chunk in tokens for the token text chunker. |
 | `chunk_overlap_tokens`       | `30` | Number of tokens overlapping between chunks for the token text chunker. |
 | `output_chunk_column_name`   | `contents` | Column name to store the chunks in the output table. |
diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml
index 82d280eca..c9728712e 100644
--- a/transforms/language/doc_chunk/python/pyproject.toml
+++ b/transforms/language/doc_chunk/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_doc_chunk_transform_python"
-version = "0.2.2.dev2"
+version = "0.3.0"
 requires-python = ">=3.10,<3.13"
 description = "chunk documents Python Transform"
 license = {text = "Apache-2.0"}
diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt
index ee6171ff6..7213c4199 100644
--- a/transforms/language/doc_chunk/python/requirements.txt
+++ b/transforms/language/doc_chunk/python/requirements.txt
@@ -1,3 +1,3 @@
 data-prep-toolkit==0.2.2.dev2
-docling-core==1.7.2
+docling-core==2.3.0
 llama-index-core>=0.11.0,<0.12.0
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
index a8ba44f61..b55bd08ff 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
+++ b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
@@ -13,11 +13,11 @@
 from abc import ABCMeta, abstractmethod
 from typing import Iterator, Optional, Dict, List
 
-from docling_core.types import Document as DLDocument
+from docling_core.types.doc import DoclingDocument
 from llama_index.core.node_parser.text.token import TokenTextSplitter
 from llama_index.core import Document as LIDocument
 from llama_index.core.node_parser import MarkdownNodeParser
-from docling_core.transforms.chunker import HierarchicalChunker
+from docling_core.transforms.chunker import HierarchicalChunker, DocMeta
 
 
 class ChunkingExecutor(metaclass=ABCMeta):
@@ -29,7 +29,6 @@ def chunk(self, content: str) -> Iterator[dict]:
 class DLJsonChunker(ChunkingExecutor):
     def __init__(
         self,
-        min_chunk_len: Optional[int],
         output_chunk_column_name: str,
         output_jsonpath_column_name: str,
         output_pageno_column_name_key: str,
@@ -40,19 +39,19 @@ def __init__(
         self.output_pageno_column_name_key = output_pageno_column_name_key
         self.output_bbox_column_name_key = output_bbox_column_name_key
 
-        chunker_kwargs = dict(include_metadata=True)
-        if min_chunk_len is not None:
-            chunker_kwargs["min_chunk_len"] = min_chunk_len
-        self._chunker = HierarchicalChunker(**chunker_kwargs)
+        self._chunker = HierarchicalChunker()
 
     def chunk(self, content: str) -> Iterator[dict]:
-        doc = DLDocument.model_validate_json(content)
+        doc = DoclingDocument.model_validate_json(content)
         for chunk in self._chunker.chunk(doc):
+            meta = DocMeta.model_validate(chunk.meta)
+            doc_item = meta.doc_items[0]
+            prov = doc_item.prov[0]
             yield {
                 self.output_chunk_column_name: chunk.text,
-                self.output_jsonpath_column_name: chunk.path,
-                self.output_pageno_column_name_key: chunk.page,
-                self.output_bbox_column_name_key: chunk.bbox,
+                self.output_jsonpath_column_name: doc_item.self_ref,
+                self.output_pageno_column_name_key: prov.page_no,
+                self.output_bbox_column_name_key: prov.bbox.as_tuple(),
             }
 
 
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py
index e0fdfa871..0c830ee98 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py
+++ b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py
@@ -38,6 +38,7 @@
     "runtime_job_id": "job_id",
     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
     # doc_chunk params
+    # "doc_chunk_dl_min_chunk_len": 10,  # for testing the usage of the deprecated argument
     # "doc_chunk_chunking_type": "li_markdown",
     "doc_chunk_chunking_type": "dl_json",
     # "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT, 
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
index 7acdd3ef1..e64a7c1d1 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
+++ b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -26,7 +26,6 @@
 content_column_name_key = "content_column_name"
 doc_id_column_name_key = "doc_id_column_name"
 chunking_type_key = "chunking_type"
-dl_min_chunk_len_key = "dl_min_chunk_len"
 chunk_size_tokens_key = "chunk_size_tokens"
 chunk_overlap_tokens_key = "chunk_overlap_tokens"
 output_chunk_column_name_key = "output_chunk_column_name"
@@ -38,7 +37,6 @@
 content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
 doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
 chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
-dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
 output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
 output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
 output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
@@ -59,7 +57,6 @@ def __str__(self):
 default_content_column_name = "contents"
 default_doc_id_column_name = "document_id"
 default_chunking_type = chunking_types.DL_JSON
-default_dl_min_chunk_len = None
 default_output_chunk_column_name = "contents"
 default_output_chunk_column_id = "chunk_id"
 default_output_source_doc_id_column_name = "source_document_id"
@@ -95,7 +92,6 @@ def __init__(self, config: dict[str, Any]):
         self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)
 
         # Parameters for Docling JSON chunking
-        self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
         self.output_jsonpath_column_name = config.get(
             output_jsonpath_column_name_key, default_output_jsonpath_column_name
         )
@@ -113,7 +109,6 @@ def __init__(self, config: dict[str, Any]):
         self.chunker: ChunkingExecutor
         if self.chunking_type == chunking_types.DL_JSON:
             self.chunker = DLJsonChunker(
-                min_chunk_len=self.dl_min_chunk_len,
                 output_chunk_column_name=self.output_chunk_column_name,
                 output_jsonpath_column_name=self.output_jsonpath_column_name,
                 output_pageno_column_name_key=self.output_pageno_column_name_key,
@@ -202,11 +197,6 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=default_doc_id_column_name,
             help="Name of the column containing the doc_id to be propagated in the output",
         )
-        parser.add_argument(
-            f"--{dl_min_chunk_len_cli_param}",
-            default=default_dl_min_chunk_len,
-            help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
-        )
         parser.add_argument(
             f"--{output_chunk_column_name_cli_param}",
             default=default_output_chunk_column_name,
@@ -244,6 +234,11 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             type=int,
             help="Number of tokens overlapping between chunks for the fixed-sized chunker.",
         )
+        parser.add_argument(
+            f"--{cli_prefix}dl_min_chunk_len",
+            default=None,
+            help="Deprecated. This option is no longer considered.",
+        )
 
     def apply_input_params(self, args: Namespace) -> bool:
         """
@@ -254,5 +249,7 @@ def apply_input_params(self, args: Namespace) -> bool:
         captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
 
         self.params = self.params | captured
+        if self.params.get("dl_min_chunk_len") is not None:
+            self.logger.warning("The `dl_min_chunk_len` option is deprecated and will be ignored. Please stop using it, it will not accepted anymore in future versions.")
         self.logger.info(f"doc_chunk parameters are : {self.params}")
         return True
diff --git a/transforms/language/doc_chunk/python/test-data/expected/metadata.json b/transforms/language/doc_chunk/python/test-data/expected/metadata.json
index 7eeaaa279..e83a0375b 100644
--- a/transforms/language/doc_chunk/python/test-data/expected/metadata.json
+++ b/transforms/language/doc_chunk/python/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "doc_chunk",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 14:05:09",
-    "end_time": "2024-10-18 14:05:11",
+    "start_time": "2024-10-30 18:38:40",
+    "end_time": "2024-10-30 18:38:40",
     "status": "success"
   },
   "code": {
@@ -18,7 +18,6 @@
     "chunking_type": "dl_json",
     "content_column_name": "contents",
     "doc_id_column_name": "document_id",
-    "dl_min_chunk_len": null,
     "output_chunk_column_name": "contents",
     "output_source_doc_id_column_name": "source_document_id",
     "output_jsonpath_column_name": "doc_jsonpath",
@@ -35,22 +34,22 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 27.9,
+    "cpus": 19.5,
     "gpus": 0,
-    "memory": 25.75,
+    "memory": 27.48,
     "object_store": 0,
-    "execution time, min": 0.021
+    "execution time, min": 0.001
   },
   "job_output_stats": {
     "source_files": 1,
-    "source_size": 50276,
+    "source_size": 12073,
     "result_files": 1,
-    "result_size": 31223,
-    "processing_time": 1.266,
+    "result_size": 14363,
+    "processing_time": 0.043,
     "nfiles": 1,
-    "nrows": 88,
+    "nrows": 39,
     "source_doc_count": 1,
-    "result_doc_count": 88
+    "result_doc_count": 39
   },
   "source": {
     "name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
diff --git a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet
index 06089be78..46714dde7 100644
Binary files a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet and b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet differ
diff --git a/transforms/language/doc_chunk/python/test-data/input/test1.parquet b/transforms/language/doc_chunk/python/test-data/input/test1.parquet
index 4015fccb0..32905aa74 100644
Binary files a/transforms/language/doc_chunk/python/test-data/input/test1.parquet and b/transforms/language/doc_chunk/python/test-data/input/test1.parquet differ
diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml
index 033b8716b..29b594fac 100644
--- a/transforms/language/doc_chunk/ray/pyproject.toml
+++ b/transforms/language/doc_chunk/ray/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_doc_chunk_transform_ray"
-version = "0.2.2.dev2"
+version = "0.3.0"
 requires-python = ">=3.10,<3.13"
 description = "chunk documents Ray Transform"
 license = {text = "Apache-2.0"}
@@ -11,7 +11,7 @@ authors = [
     { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
 ]
 dependencies = [
-    "dpk-doc-chunk-transform-python==0.2.2.dev2",
+    "dpk-doc-chunk-transform-python==0.3.0",
     "data-prep-toolkit[ray]==0.2.2.dev2",
 ]
 
diff --git a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json
index 7eeaaa279..e83a0375b 100644
--- a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json
+++ b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "doc_chunk",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 14:05:09",
-    "end_time": "2024-10-18 14:05:11",
+    "start_time": "2024-10-30 18:38:40",
+    "end_time": "2024-10-30 18:38:40",
     "status": "success"
   },
   "code": {
@@ -18,7 +18,6 @@
     "chunking_type": "dl_json",
     "content_column_name": "contents",
     "doc_id_column_name": "document_id",
-    "dl_min_chunk_len": null,
     "output_chunk_column_name": "contents",
     "output_source_doc_id_column_name": "source_document_id",
     "output_jsonpath_column_name": "doc_jsonpath",
@@ -35,22 +34,22 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 27.9,
+    "cpus": 19.5,
     "gpus": 0,
-    "memory": 25.75,
+    "memory": 27.48,
     "object_store": 0,
-    "execution time, min": 0.021
+    "execution time, min": 0.001
   },
   "job_output_stats": {
     "source_files": 1,
-    "source_size": 50276,
+    "source_size": 12073,
     "result_files": 1,
-    "result_size": 31223,
-    "processing_time": 1.266,
+    "result_size": 14363,
+    "processing_time": 0.043,
     "nfiles": 1,
-    "nrows": 88,
+    "nrows": 39,
     "source_doc_count": 1,
-    "result_doc_count": 88
+    "result_doc_count": 39
   },
   "source": {
     "name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
diff --git a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet
index 06089be78..46714dde7 100644
Binary files a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet and b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet differ
diff --git a/transforms/language/doc_chunk/ray/test-data/input/test1.parquet b/transforms/language/doc_chunk/ray/test-data/input/test1.parquet
index 4015fccb0..32905aa74 100644
Binary files a/transforms/language/doc_chunk/ray/test-data/input/test1.parquet and b/transforms/language/doc_chunk/ray/test-data/input/test1.parquet differ
diff --git a/transforms/language/doc_chunk/transform.config b/transforms/language/doc_chunk/transform.config
index f433f360b..1df42f298 100644
--- a/transforms/language/doc_chunk/transform.config
+++ b/transforms/language/doc_chunk/transform.config
@@ -14,7 +14,7 @@ TRANSFORM_NAME=doc_chunk
 #
 # If you change the versions numbers, be sure to run "make set-versions" to 
 # update version numbers across the transform (e.g., pyproject.toml).
-DOC_CHUNK_PYTHON_VERSION=$(DPK_VERSION)
+DOC_CHUNK_PYTHON_VERSION=0.3.0
 DOC_CHUNK_RAY_VERSION=$(DOC_CHUNK_PYTHON_VERSION)
 DOC_CHUNK_SPARK_VERSION=$(DOC_CHUNK_PYTHON_VERSION)
 
diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py
index cfb443f16..8992f1145 100644
--- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py
+++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py
@@ -39,8 +39,11 @@ def compute_exec_params_func(
     runtime_pipeline_id: str,
     runtime_job_id: str,
     runtime_code_location: dict,
+    pdf2parquet_batch_size: int,
     pdf2parquet_do_table_structure: bool,
     pdf2parquet_do_ocr: bool,
+    pdf2parquet_ocr_engine: str,
+    pdf2parquet_bitmap_area_threshold: float,
 ) -> dict:
     from runtime_utils import KFPUtils
 
@@ -53,8 +56,11 @@ def compute_exec_params_func(
         "runtime_pipeline_id": runtime_pipeline_id,
         "runtime_job_id": runtime_job_id,
         "runtime_code_location": str(runtime_code_location),
+        "pdf2parquet_batch_size": pdf2parquet_batch_size,
         "pdf2parquet_do_table_structure": pdf2parquet_do_table_structure,
         "pdf2parquet_do_ocr": pdf2parquet_do_ocr,
+        "pdf2parquet_ocr_engine": pdf2parquet_ocr_engine,
+        "pdf2parquet_bitmap_area_threshold": pdf2parquet_bitmap_area_threshold,
     }
 
 
@@ -112,8 +118,11 @@ def pdf2parquet(
     runtime_pipeline_id: str = "pipeline_id",
     runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
     # pdf2parquet parameters
+    pdf2parquet_batch_size: int = -1,
     pdf2parquet_do_table_structure: bool = True,
     pdf2parquet_do_ocr: bool = False,
+    pdf2parquet_ocr_engine: str = "easyocr",
+    pdf2parquet_bitmap_area_threshold: float = 0.05,
     # additional parameters
     additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}',
 ):
@@ -150,8 +159,11 @@ def pdf2parquet(
     :param runtime_actor_options - actor options
     :param runtime_pipeline_id - pipeline id
     :param runtime_code_location - code location
+    :param pdf2parquet_batch_size - how many inputs to batch into one output table
     :param pdf2parquet_do_table_structure - run table structure model
     :param pdf2parquet_do_ocr - run ocr model
+    :param pdf2parquet_ocr_engine - which ocr engine
+    :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps
     :return: None
     """
     # create clean_up task
@@ -169,8 +181,11 @@ def pdf2parquet(
             runtime_pipeline_id=runtime_pipeline_id,
             runtime_job_id=run_id,
             runtime_code_location=runtime_code_location,
+            pdf2parquet_batch_size=pdf2parquet_batch_size,
             pdf2parquet_do_table_structure=pdf2parquet_do_table_structure,
             pdf2parquet_do_ocr=pdf2parquet_do_ocr,
+            pdf2parquet_ocr_engine=pdf2parquet_ocr_engine,
+            pdf2parquet_bitmap_area_threshold=pdf2parquet_bitmap_area_threshold,
         )
         ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2)
         # start Ray cluster
diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py
index 1905ee17c..c9cdbf652 100644
--- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py
+++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py
@@ -40,8 +40,11 @@ def compute_exec_params_func(
     runtime_pipeline_id: str,
     runtime_job_id: str,
     runtime_code_location: dict,
+    pdf2parquet_batch_size: int,
     pdf2parquet_do_table_structure: bool,
     pdf2parquet_do_ocr: bool,
+    pdf2parquet_ocr_engine: str,
+    pdf2parquet_bitmap_area_threshold: float,
 ) -> dict:
     from runtime_utils import KFPUtils
 
@@ -55,8 +58,11 @@ def compute_exec_params_func(
         "runtime_pipeline_id": runtime_pipeline_id,
         "runtime_job_id": runtime_job_id,
         "runtime_code_location": str(runtime_code_location),
+        "pdf2parquet_batch_size": pdf2parquet_batch_size,
         "pdf2parquet_do_table_structure": pdf2parquet_do_table_structure,
         "pdf2parquet_do_ocr": pdf2parquet_do_ocr,
+        "pdf2parquet_ocr_engine": pdf2parquet_ocr_engine,
+        "pdf2parquet_bitmap_area_threshold": pdf2parquet_bitmap_area_threshold,
     }
 
 
@@ -116,8 +122,11 @@ def pdf2parquet(
     runtime_pipeline_id: str = "pipeline_id",
     runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
     # pdf2parquet parameters
+    pdf2parquet_batch_size: int = -1,
     pdf2parquet_do_table_structure: bool = True,
     pdf2parquet_do_ocr: bool = False,
+    pdf2parquet_ocr_engine: str = "easyocr",
+    pdf2parquet_bitmap_area_threshold: float = 0.05,
     # additional parameters
     additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}',
 ):
@@ -154,8 +163,11 @@ def pdf2parquet(
     :param runtime_actor_options - actor options
     :param runtime_pipeline_id - pipeline id
     :param runtime_code_location - code location
+    :param pdf2parquet_batch_size - how many inputs to batch into one output table
     :param pdf2parquet_do_table_structure - run table structure model
     :param pdf2parquet_do_ocr - run ocr model
+    :param pdf2parquet_ocr_engine - which ocr engine
+    :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps
     :return: None
     """
     # create clean_up task
@@ -174,8 +186,11 @@ def pdf2parquet(
             runtime_pipeline_id=runtime_pipeline_id,
             runtime_job_id=run_id,
             runtime_code_location=runtime_code_location,
+            pdf2parquet_batch_size=pdf2parquet_batch_size,
             pdf2parquet_do_table_structure=pdf2parquet_do_table_structure,
             pdf2parquet_do_ocr=pdf2parquet_do_ocr,
+            pdf2parquet_ocr_engine=pdf2parquet_ocr_engine,
+            pdf2parquet_bitmap_area_threshold=pdf2parquet_bitmap_area_threshold,
         )
         ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2)
         # start Ray cluster
diff --git a/transforms/language/pdf2parquet/python/Dockerfile b/transforms/language/pdf2parquet/python/Dockerfile
index 948d126e1..7d6f80502 100644
--- a/transforms/language/pdf2parquet/python/Dockerfile
+++ b/transforms/language/pdf2parquet/python/Dockerfile
@@ -32,7 +32,7 @@ RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e .
 
 # Download models
 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
-RUN python -c 'from docling.document_converter import DocumentConverter; s=DocumentConverter.download_models_hf(); print(f"Models cached in {s}")'
+RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")'
 
 # copy the main() entry point to the image
 COPY --chown=dpk:root src/pdf2parquet_transform.py ./
diff --git a/transforms/language/pdf2parquet/python/README.md b/transforms/language/pdf2parquet/python/README.md
index 4ba825530..a4bd31e06 100644
--- a/transforms/language/pdf2parquet/python/README.md
+++ b/transforms/language/pdf2parquet/python/README.md
@@ -1,9 +1,21 @@
 # Ingest PDF to Parquet
 
-This tranforms iterate through PDF files or zip of PDF files and generates parquet files
-containing the converted document in Markdown format.
+This transform iterates through document files or ZIP archives of documents and generates parquet files
+containing the converted documents in Markdown, plain text or JSON format.
 
 The PDF conversion is using the [Docling package](https://github.com/DS4SD/docling).
+The Docling configuration in DPK is tuned for best results when running large batch ingestions.
+For more details on the multiple configuration options, please refer to the official [Docling documentation](https://ds4sd.github.io/docling/).
+
+This transform supports the following input formats:
+
+- PDF documents
+- DOCX documents
+- PPTX presentations
+- Image files (png, jpeg, etc)
+- HTML pages
+- Markdown documents
+- AsciiDoc documents
 
 
 ## Output format
@@ -17,6 +29,7 @@ with the addition of the following columns
     "filename": "string",         // the basename of the PDF file
     "contents": "string",         // the content of the PDF
     "document_id": "string",      // the document id, a random uuid4 
+    "document_hash": "string",    // the document hash of the input content 
     "ext": "string",              // the detected file extension
     "hash": "string",             // the hash of the `contents` column
     "size": "string",             // the size of `contents`
@@ -35,10 +48,14 @@ The transform can be initialized with the following parameters.
 
 | Parameter  | Default  | Description  |
 |------------|----------|--------------|
+| `batch_size`                 | -1 | Number of documents to be saved in the same result table. A value of -1 will generate one result file for each input file. |
 | `artifacts_path`             | <unset> | Path where to Docling models artifacts are located, if unset they will be downloaded and fetched from the [HF_HUB_CACHE](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache) folder. |
-| `contents_type`         | `text/markdown`        | The output type for the `contents` column. Valid types are `text/markdown` and `application/json`. |
+| `contents_type`         | `text/markdown`        | The output type for the `contents` column. Valid types are `text/markdown`, `text/plain` and `application/json`. |
 | `do_table_structure`         | `True`        | If true, detected tables will be processed with the table structure model. |
 | `do_ocr`                     | `True`        | If true, optical character recognition (OCR) will be used to read the content of bitmap parts of the document. |
+| `ocr_engine`                 | `easyocr`     | The OCR engine to use. Valid values are `easyocr`, `tesseract`, `tesseract_cli`. |
+| `bitmap_area_threshold`      | `0.05`        | Threshold for running OCR on bitmap figures embedded in document. The threshold is computed as the fraction of the area covered by the bitmap, compared to the whole page area. |
+| `pdf_backend`                | `dlparse_v2`  | The PDF backend to use. Valid values are `dlparse_v2`, `dlparse_v1`, `pypdfium2`. |
 | `double_precision`           | `8`           | If set, all floating points (e.g. bounding boxes) are rounded to this precision. For tests it is advised to use 0. |
 
 When invoking the CLI, the parameters must be set as `--pdf2parquet_<name>`, e.g. `--pdf2parquet_do_ocr=true`.
diff --git a/transforms/language/pdf2parquet/python/pyproject.toml b/transforms/language/pdf2parquet/python/pyproject.toml
index 56c1087fb..f19b8e049 100644
--- a/transforms/language/pdf2parquet/python/pyproject.toml
+++ b/transforms/language/pdf2parquet/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_pdf2parquet_transform_python"
-version = "0.2.2.dev2"
+version = "0.3.0"
 requires-python = ">=3.10,<3.13"
 description = "PDF2PARQUET Python Transform"
 license = {text = "Apache-2.0"}
diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt
index 7346d4fc3..2912af252 100644
--- a/transforms/language/pdf2parquet/python/requirements.txt
+++ b/transforms/language/pdf2parquet/python/requirements.txt
@@ -1,6 +1,6 @@
 data-prep-toolkit==0.2.2.dev2
-docling-core==1.7.2
-docling-ibm-models==2.0.0
-deepsearch-glm==0.22.0
-docling==1.20.0
+docling-core==2.3.0
+docling-ibm-models==2.0.3
+deepsearch-glm==0.26.1
+docling==2.3.1
 filetype >=1.2.0, <2.0.0
diff --git a/transforms/language/pdf2parquet/python/src/pdf2parquet_local_python.py b/transforms/language/pdf2parquet/python/src/pdf2parquet_local_python.py
index b6d450fc8..99b0b7f84 100644
--- a/transforms/language/pdf2parquet/python/src/pdf2parquet_local_python.py
+++ b/transforms/language/pdf2parquet/python/src/pdf2parquet_local_python.py
@@ -32,13 +32,14 @@
 params = {
     # Data access. Only required parameters are specified
     "data_local_config": ParamsUtils.convert_to_ast(local_conf),
-    "data_files_to_use": ast.literal_eval("['.pdf','.zip']"),
+    "data_files_to_use": ast.literal_eval("['.pdf','.docx','.pptx','.zip']"),
     # execution info
     "runtime_pipeline_id": "pipeline_id",
     "runtime_job_id": "job_id",
     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
     # pdf2parquet params
     "pdf2parquet_double_precision": 0,
+    # "pdf2parquet_batch_size": 10,
     # "pdf2parquet_do_table_structure": False,
     # "pdf2parquet_do_ocr": False,
     # "pdf2parquet_contents_type": "text/markdown",
diff --git a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py
index 1ca559d33..0f5de10c0 100644
--- a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py
+++ b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py
@@ -19,7 +19,7 @@
 from argparse import ArgumentParser, Namespace
 from datetime import datetime
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional
 
 import filetype
 import pandas as pd
@@ -27,42 +27,86 @@
 from data_processing.transform import AbstractBinaryTransform, TransformConfiguration
 from data_processing.utils import TransformUtils, get_logger, str2bool
 from data_processing.utils.cli_utils import CLIArgumentProvider
-from docling.datamodel.base_models import DocumentStream
-from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
-from docling.document_converter import DocumentConverter
-from docling.pipeline.standard_model_pipeline import PipelineOptions
+from data_processing.utils.multilock import MultiLock
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import DocumentStream, MimeTypeToFormat
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrOptions,
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, InputFormat, PdfFormatOption
+from docling.models.base_ocr_model import OcrOptions
 
 
 logger = get_logger(__name__)
+# logger = get_logger(__name__, level="DEBUG")
 
 shortname = "pdf2parquet"
 cli_prefix = f"{shortname}_"
+pdf2parquet_batch_size_key = f"batch_size"
 pdf2parquet_artifacts_path_key = f"artifacts_path"
 pdf2parquet_contents_type_key = f"contents_type"
 pdf2parquet_do_table_structure_key = f"do_table_structure"
 pdf2parquet_do_ocr_key = f"do_ocr"
+pdf2parquet_ocr_engine_key = f"ocr_engine"
+pdf2parquet_bitmap_area_threshold_key = f"bitmap_area_threshold"
+pdf2parquet_pdf_backend_key = f"pdf_backend"
 pdf2parquet_double_precision_key = f"double_precision"
 
 
 class pdf2parquet_contents_types(str, enum.Enum):
     MARKDOWN = "text/markdown"
+    TEXT = "text/plain"
     JSON = "application/json"
 
     def __str__(self):
         return str(self.value)
 
 
+class pdf2parquet_pdf_backend(str, enum.Enum):
+    """PDF parsing backends selectable through the `pdf_backend` parameter; str() of a member is its raw value."""
+    PYPDFIUM2 = "pypdfium2"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
+    def __str__(self):
+        return str(self.value)
+
+
+class pdf2parquet_ocr_engine(str, enum.Enum):
+    """OCR engines selectable through the `ocr_engine` parameter; str() of a member is its raw value."""
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+    def __str__(self):
+        return str(self.value)
+
+
+pdf2parquet_batch_size_default = -1
 pdf2parquet_contents_type_default = pdf2parquet_contents_types.MARKDOWN
 pdf2parquet_do_table_structure_default = True
 pdf2parquet_do_ocr_default = True
+pdf2parquet_bitmap_area_threshold_default = 0.05
+pdf2parquet_ocr_engine_default = pdf2parquet_ocr_engine.EASYOCR
+pdf2parquet_pdf_backend_default = pdf2parquet_pdf_backend.DLPARSE_V2
 pdf2parquet_double_precision_default = 8
 
+pdf2parquet_batch_size_cli_param = f"{cli_prefix}{pdf2parquet_batch_size_key}"
 pdf2parquet_artifacts_path_cli_param = f"{cli_prefix}{pdf2parquet_artifacts_path_key}"
 pdf2parquet_contents_type_cli_param = f"{cli_prefix}{pdf2parquet_contents_type_key}"
 pdf2parquet_do_table_structure_cli_param = (
     f"{cli_prefix}{pdf2parquet_do_table_structure_key}"
 )
 pdf2parquet_do_ocr_cli_param = f"{cli_prefix}{pdf2parquet_do_ocr_key}"
+pdf2parquet_bitmap_area_threshold__cli_param = (
+    f"{cli_prefix}{pdf2parquet_bitmap_area_threshold_key}"
+)
+pdf2parquet_ocr_engine_cli_param = f"{cli_prefix}{pdf2parquet_ocr_engine_key}"
+pdf2parquet_pdf_backend_cli_param = f"{cli_prefix}{pdf2parquet_pdf_backend_key}"
 pdf2parquet_double_precision_cli_param = (
     f"{cli_prefix}{pdf2parquet_double_precision_key}"
 )
@@ -81,6 +125,7 @@ def __init__(self, config: dict):
 
         super().__init__(config)
 
+        self.batch_size = config.get(pdf2parquet_batch_size_key, pdf2parquet_batch_size_default)
         self.artifacts_path = config.get(pdf2parquet_artifacts_path_key, None)
         if self.artifacts_path is not None:
             self.artifacts_path = Path(self.artifacts_path)
@@ -93,18 +138,75 @@ def __init__(self, config: dict):
             pdf2parquet_do_table_structure_key, pdf2parquet_do_table_structure_default
         )
         self.do_ocr = config.get(pdf2parquet_do_ocr_key, pdf2parquet_do_ocr_default)
+        self.ocr_engine_name = config.get(
+            pdf2parquet_ocr_engine_key, pdf2parquet_ocr_engine_default
+        )
+        if not isinstance(self.ocr_engine_name, pdf2parquet_ocr_engine):
+            self.ocr_engine_name = pdf2parquet_ocr_engine[self.ocr_engine_name]
+        self.bitmap_area_threshold = config.get(
+            pdf2parquet_bitmap_area_threshold_key,
+            pdf2parquet_bitmap_area_threshold_default,
+        )
+        self.pdf_backend_name = config.get(
+            pdf2parquet_pdf_backend_key, pdf2parquet_pdf_backend_default
+        )
+        if not isinstance(self.pdf_backend_name, pdf2parquet_pdf_backend):
+            self.pdf_backend_name = pdf2parquet_pdf_backend[self.pdf_backend_name]
         self.double_precision = config.get(
             pdf2parquet_double_precision_key, pdf2parquet_double_precision_default
         )
 
         logger.info("Initializing models")
-        pipeline_options = PipelineOptions(
+        pipeline_options = PdfPipelineOptions(
+            artifacts_path=self.artifacts_path,
             do_table_structure=self.do_table_structure,
             do_ocr=self.do_ocr,
+            ocr_options=self._get_ocr_engine(self.ocr_engine_name),
         )
-        self._converter = DocumentConverter(
-            artifacts_path=self.artifacts_path, pipeline_options=pipeline_options
-        )
+        pipeline_options.ocr_options.bitmap_area_threshold = self.bitmap_area_threshold
+
+        lock = MultiLock("dpk_pdf2parquet_init")
+        try:
+            logger.debug(
+                f"Going to acquire lock {lock.lock_filename} for synchronizing global filesystem operations."
+            )
+            locked = lock.acquire()
+            logger.debug(f"Lock {lock.lock_filename} acquired.")
+
+            self._converter = DocumentConverter(
+                format_options={
+                    InputFormat.PDF: PdfFormatOption(
+                        pipeline_options=pipeline_options,
+                        backend=self._get_pdf_backend(self.pdf_backend_name),
+                    )
+                }
+            )
+            self._converter.initialize_pipeline(InputFormat.PDF)
+        finally:
+            lock.release()
+            logger.debug(f"Lock {lock.lock_filename} released.")
+        
+        self.buffer = []
+
+    def _get_ocr_engine(self, engine_name: pdf2parquet_ocr_engine) -> OcrOptions:
+        """Return a default-constructed OCR options object for `engine_name`; raise RuntimeError if no engine matches."""
+        if engine_name == pdf2parquet_ocr_engine.EASYOCR:
+            return EasyOcrOptions()
+        elif engine_name == pdf2parquet_ocr_engine.TESSERACT_CLI:
+            return TesseractCliOcrOptions()
+        elif engine_name == pdf2parquet_ocr_engine.TESSERACT:
+            return TesseractOcrOptions()
+        raise RuntimeError(f"Unknown OCR engine `{engine_name}`")
+
+    def _get_pdf_backend(self, backend_name: pdf2parquet_pdf_backend):
+        """Return the backend class (not an instance) selected by `backend_name`; raise RuntimeError if none matches."""
+        if backend_name == pdf2parquet_pdf_backend.DLPARSE_V1:
+            return DoclingParseDocumentBackend
+        elif backend_name == pdf2parquet_pdf_backend.DLPARSE_V2:
+            return DoclingParseV2DocumentBackend
+        elif backend_name == pdf2parquet_pdf_backend.PYPDFIUM2:
+            return PyPdfiumDocumentBackend
+        raise RuntimeError(f"Unknown PDF backend `{backend_name}`")
 
     def _update_metrics(self, num_pages: int, elapse_time: float):
         # This is implemented in the ray version
@@ -116,26 +218,26 @@ def _convert_pdf2parquet(
         # Convert PDF to Markdown
         start_time = time.time()
         buf = io.BytesIO(content_bytes)
-        input_docs = DocumentStream(filename=doc_filename, stream=buf)
-        input = DocumentConversionInput.from_streams([input_docs])
+        input_doc = DocumentStream(name=doc_filename, stream=buf)
 
-        converted_docs = self._converter.convert(input)
-        doc: ConvertedDocument = next(converted_docs, None)
-        if doc is None or doc.output is None:
-            raise RuntimeError("Failed in converting.")
+        conv_res = self._converter.convert(input_doc)
+        doc = conv_res.document
         elapse_time = time.time() - start_time
 
         if self.contents_type == pdf2parquet_contents_types.MARKDOWN:
-            content_string = doc.render_as_markdown()
+            content_string = doc.export_to_markdown()
+        elif self.contents_type == pdf2parquet_contents_types.TEXT:
+            content_string = doc.export_to_text()
         elif self.contents_type == pdf2parquet_contents_types.JSON:
             content_string = pd.io.json.ujson_dumps(
-                doc.render_as_dict(), double_precision=self.double_precision
+                doc.export_to_dict(), double_precision=self.double_precision
             )
         else:
             raise RuntimeError(f"Uknown contents_type {self.contents_type}.")
         num_pages = len(doc.pages)
-        num_tables = len(doc.output.tables) if doc.output.tables is not None else 0
-        num_doc_elements = len(doc.output.main_text) if doc.output.main_text is not None else 0
+        num_tables = len(doc.tables)
+        num_doc_elements = len(doc.texts)
+        document_hash = doc.origin.binary_hash
 
         self._update_metrics(num_pages=num_pages, elapse_time=elapse_time)
 
@@ -146,6 +248,7 @@ def _convert_pdf2parquet(
             "num_tables": num_tables,
             "num_doc_elements": num_doc_elements,
             "document_id": str(uuid.uuid4()),
+            "document_hash": document_hash,
             "ext": ext,
             "hash": TransformUtils.str_to_hash(content_string),
             "size": len(content_string),
@@ -165,18 +268,20 @@ def transform_binary(
         for each PDF file detected in the archive.
         """
 
-        data = []
+        data = [*self.buffer]
         success_doc_id = []
         failed_doc_id = []
         skipped_doc_id = []
         number_of_rows = 0
 
         try:
+            # TODO: Docling has an inner-function with a stronger type checking.
+            # Once it is exposed as public, we can use it here as well.
             root_kind = filetype.guess(byte_array)
 
-            # Process single PDF documents
-            if root_kind is not None and root_kind.mime == "application/pdf":
-                logger.debug(f"Detected root file {file_name=} as PDF.")
+            # Process single documents
+            if root_kind is not None and root_kind.mime in MimeTypeToFormat:
+                logger.debug(f"Detected root file {file_name=} as {root_kind.mime}.")
 
                 try:
                     root_ext = root_kind.extension
@@ -195,10 +300,10 @@ def transform_binary(
                 except Exception as e:
                     failed_doc_id.append(file_name)
                     logger.warning(
-                        f"Exception {str(e)} processing file {archive_doc_filename}, skipping"
+                        f"Exception {str(e)} processing file {file_name}, skipping"
                     )
 
-            # Process ZIP archive of PDF documents
+            # Process ZIP archive of documents
             elif root_kind is not None and root_kind.mime == "application/zip":
                 logger.debug(
                     f"Detected root file {file_name=} as ZIP. Iterating through the archive content."
@@ -218,9 +323,9 @@ def transform_binary(
 
                                 # Detect file type
                                 kind = filetype.guess(content_bytes)
-                                if kind is None or kind.mime != "application/pdf":
+                                if kind is None or kind.mime not in MimeTypeToFormat:
                                     logger.info(
-                                        f"File {archive_doc_filename=} is not detected as PDF but {kind=}. Skipping."
+                                        f"File {archive_doc_filename=} is not detected as valid format {kind=}. Skipping."
                                     )
                                     skipped_doc_id.append(archive_doc_filename)
                                     continue
@@ -248,23 +353,53 @@ def transform_binary(
 
             else:
                 logger.warning(
-                    f"File {file_name=} is not detected as PDF nor as ZIP but {kind=}. Skipping."
+                    f"File {file_name=} is not detected as a supported type nor as ZIP but {root_kind=}. Skipping."
                 )
 
-            table = pa.Table.from_pylist(data)
+            
             metadata = {
-                "nrows": len(table),
+                "nrows": number_of_rows,
                 "nsuccess": len(success_doc_id),
                 "nfail": len(failed_doc_id),
                 "nskip": len(skipped_doc_id),
             }
-            return [
-                (TransformUtils.convert_arrow_to_binary(table=table), ".parquet")
-            ], metadata
+
+            batch_results = []
+            self.buffer = []
+            if self.batch_size <= 0:
+                # Batching disabled: everything collected for this input goes into one table.
+                table = pa.Table.from_pylist(data)
+                batch_results.append((TransformUtils.convert_arrow_to_binary(table=table), ".parquet"))
+            else:
+                # Emit one result file per full group of batch_size rows/documents.
+                num_left = len(data)
+                start_row = 0
+                while num_left >= self.batch_size:
+                    # The slice end must move with start_row; a fixed end empties every batch after the first.
+                    table = pa.Table.from_pylist(data[start_row : start_row + self.batch_size])
+                    batch_results.append((TransformUtils.convert_arrow_to_binary(table=table), ".parquet"))
+                    start_row += self.batch_size
+                    num_left -= self.batch_size
+                # Rows short of a full batch are kept for the next call or for flush_binary().
+                if num_left > 0:
+                    self.buffer = data[start_row:]
+
+            return batch_results, metadata
         except Exception as e:
             logger.error(f"Fatal error with file {file_name=}. No results produced.")
             raise
 
+    def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """Emit the rows still held in self.buffer as one final parquet table; the metadata dict is always empty."""
+        if len(self.buffer) == 0:
+            logger.debug(f"Empty buffer. nothing to flush.")
+            return [], {}
+        logger.debug(f"flushing buffered table with {len(self.buffer)} rows.")
+        table = pa.Table.from_pylist(self.buffer)
+        result = [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")]
+        self.buffer = []  # stay a list (not None): transform_binary unpacks it and a repeated flush takes len() of it
+        return result, {}
+
 
 class Pdf2ParquetTransformConfiguration(TransformConfiguration):
     """
@@ -286,6 +421,12 @@ def add_input_params(self, parser: ArgumentParser) -> None:
         By convention a common prefix should be used for all mutator-specific CLI args
         (e.g, noop_, pii_, etc.)
         """
+        parser.add_argument(
+            f"--{pdf2parquet_batch_size_cli_param}",
+            type=int,
+            help="Number of documents to be saved in the same result table. A value of -1 will generate one result file for each input file.",
+            default=pdf2parquet_batch_size_default,
+        )
         parser.add_argument(
             f"--{pdf2parquet_artifacts_path_cli_param}",
             type=str,
@@ -311,6 +452,26 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             help="If true, optical character recognition (OCR) will be used to read the PDF content.",
             default=pdf2parquet_do_ocr_default,
         )
+        parser.add_argument(
+            f"--{pdf2parquet_ocr_engine_cli_param}",
+            type=pdf2parquet_ocr_engine,
+            choices=list(pdf2parquet_ocr_engine),
+            help="The OCR engine to use.",
+            default=pdf2parquet_ocr_engine.EASYOCR,
+        )
+        parser.add_argument(
+            f"--{pdf2parquet_bitmap_area_threshold__cli_param}",
+            type=float,
+            help="Threshold for running OCR on bitmap figures embedded in document. The threshold is computed as the fraction of the area covered by the bitmap, compared to the whole page area.",
+            default=pdf2parquet_bitmap_area_threshold_default,
+        )
+        parser.add_argument(
+            f"--{pdf2parquet_pdf_backend_cli_param}",
+            type=pdf2parquet_pdf_backend,
+            choices=list(pdf2parquet_pdf_backend),
+            help="The PDF backend to use.",
+            default=pdf2parquet_pdf_backend.DLPARSE_V2,
+        )
         parser.add_argument(
             f"--{pdf2parquet_double_precision_cli_param}",
             type=int,
diff --git a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet
index 9975c3608..907fb3803 100644
Binary files a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet differ
diff --git a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json
index 704a86d8e..b9a535098 100644
--- a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json
+++ b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "pdf2parquet",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 06:02:44",
-    "end_time": "2024-10-18 06:03:04",
+    "start_time": "2024-10-29 14:17:59",
+    "end_time": "2024-10-29 14:18:05",
     "status": "success"
   },
   "code": {
@@ -19,6 +19,9 @@
     "contents_type": "text/markdown",
     "do_table_structure": true,
     "do_ocr": true,
+    "ocr_engine": "easyocr",
+    "bitmap_area_threshold": 0.05,
+    "pdf_backend": "dlparse_v2",
     "double_precision": 0,
     "checkpointing": false,
     "max_files": -1,
@@ -30,18 +33,18 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 29.2,
+    "cpus": 16.8,
     "gpus": 0,
-    "memory": 29.7,
+    "memory": 31.22,
     "object_store": 0,
-    "execution time, min": 0.329
+    "execution time, min": 0.108
   },
   "job_output_stats": {
     "source_files": 2,
     "source_size": 605137,
     "result_files": 2,
-    "result_size": 32086,
-    "processing_time": 5.981,
+    "result_size": 33044,
+    "processing_time": 6.478,
     "nrows": 3,
     "nsuccess": 3,
     "nfail": 0,
diff --git a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet
index f70a89278..39613b1d1 100644
Binary files a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet differ
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json
new file mode 100644
index 000000000..f8f9ad71a
--- /dev/null
+++ b/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json
@@ -0,0 +1,64 @@
+{
+  "pipeline": "pipeline_id",
+  "job details": {
+    "job category": "preprocessing",
+    "job name": "pdf2parquet",
+    "job type": "pure python",
+    "job id": "job_id",
+    "start_time": "2024-10-31 13:14:39",
+    "end_time": "2024-10-31 13:16:41",
+    "status": "success"
+  },
+  "code": {
+    "github": "github",
+    "commit_hash": "12345",
+    "path": "path"
+  },
+  "job_input_params": {
+    "batch_size": 10,
+    "artifacts_path": null,
+    "contents_type": "text/markdown",
+    "do_table_structure": true,
+    "do_ocr": true,
+    "ocr_engine": "easyocr",
+    "bitmap_area_threshold": 0.05,
+    "pdf_backend": "dlparse_v2",
+    "double_precision": 0,
+    "checkpointing": false,
+    "max_files": -1,
+    "random_samples": -1,
+    "files_to_use": [
+      ".pdf",
+      ".docx",
+      ".pptx",
+      ".zip"
+    ],
+    "num_processors": 0
+  },
+  "execution_stats": {
+    "cpus": 39.0,
+    "gpus": 0,
+    "memory": 29.87,
+    "object_store": 0,
+    "execution time, min": 2.029
+  },
+  "job_output_stats": {
+    "source_files": 2,
+    "source_size": 605137,
+    "result_files": 1,
+    "processing_time": 3.888,
+    "nrows": 3,
+    "nsuccess": 3,
+    "nfail": 0,
+    "nskip": 0,
+    "result_size": 27200
+  },
+  "source": {
+    "name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
+    "type": "path"
+  },
+  "target": {
+    "name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/output",
+    "type": "path"
+  }
+}
\ No newline at end of file
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet
new file mode 100644
index 000000000..3e9ba12c7
Binary files /dev/null and b/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet differ
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet
index 033452371..7f34e1ba8 100644
Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet differ
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json
index a38a938a3..04bec2b88 100644
--- a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json
+++ b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json
@@ -5,8 +5,8 @@
     "job name": "pdf2parquet",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 06:09:35",
-    "end_time": "2024-10-18 06:09:44",
+    "start_time": "2024-10-29 14:20:01",
+    "end_time": "2024-10-29 14:20:07",
     "status": "success"
   },
   "code": {
@@ -19,6 +19,9 @@
     "contents_type": "application/json",
     "do_table_structure": true,
     "do_ocr": true,
+    "ocr_engine": "easyocr",
+    "bitmap_area_threshold": 0.05,
+    "pdf_backend": "dlparse_v2",
     "double_precision": 0,
     "checkpointing": false,
     "max_files": -1,
@@ -30,18 +33,18 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 25.3,
+    "cpus": 18.0,
     "gpus": 0,
-    "memory": 29.52,
+    "memory": 30.77,
     "object_store": 0,
-    "execution time, min": 0.138
+    "execution time, min": 0.105
   },
   "job_output_stats": {
     "source_files": 2,
     "source_size": 605137,
     "result_files": 2,
-    "result_size": 33227,
-    "processing_time": 5.64,
+    "result_size": 22953,
+    "processing_time": 6.282,
     "nrows": 3,
     "nsuccess": 3,
     "nfail": 0,
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet
index 5032919c5..32905aa74 100644
Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet differ
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet
index 58bcfcf6f..9fec2cd2d 100644
Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet differ
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json
index c276aa899..bf5c9e12a 100644
--- a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json
+++ b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json
@@ -5,8 +5,8 @@
     "job name": "pdf2parquet",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 06:09:08",
-    "end_time": "2024-10-18 06:09:12",
+    "start_time": "2024-10-29 14:19:30",
+    "end_time": "2024-10-29 14:19:33",
     "status": "success"
   },
   "code": {
@@ -19,6 +19,9 @@
     "contents_type": "text/markdown",
     "do_table_structure": false,
     "do_ocr": false,
+    "ocr_engine": "easyocr",
+    "bitmap_area_threshold": 0.05,
+    "pdf_backend": "dlparse_v2",
     "double_precision": 0,
     "checkpointing": false,
     "max_files": -1,
@@ -30,18 +33,18 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 25.5,
+    "cpus": 17.3,
     "gpus": 0,
-    "memory": 27.42,
+    "memory": 28.85,
     "object_store": 0,
-    "execution time, min": 0.066
+    "execution time, min": 0.043
   },
   "job_output_stats": {
     "source_files": 2,
     "source_size": 605137,
     "result_files": 2,
-    "result_size": 27574,
-    "processing_time": 3.448,
+    "result_size": 29659,
+    "processing_time": 2.554,
     "nrows": 3,
     "nsuccess": 3,
     "nfail": 0,
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet
index 52b40288b..69bc4e421 100644
Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet differ
diff --git a/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py b/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py
index 1c5a694fc..132c978ed 100644
--- a/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py
+++ b/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py
@@ -19,8 +19,7 @@
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
-from docling_core.types import Document
-from docling_core.types.doc.base import BaseText
+from docling_core.types.doc import DocItem, DoclingDocument, TextItem
 from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration
 from pydantic import ValidationError
 
@@ -35,7 +34,7 @@ def get_test_transform_fixtures(self) -> list[tuple]:
         basedir = "../test-data"
         basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
         config = {
-            "data_files_to_use": ast.literal_eval("['.pdf','.zip']"),
+            "data_files_to_use": ast.literal_eval("['.pdf','.docx','.pptx','.zip']"),
             "pdf2parquet_double_precision": 0,
         }
 
@@ -58,6 +57,20 @@ def get_test_transform_fixtures(self) -> list[tuple]:
             )
         )
 
+        # All input in a single parquet
+        fixtures.append(
+            (
+                launcher,
+                {
+                    **config,
+                    "pdf2parquet_batch_size": 10,
+                },
+                basedir + "/input",
+                basedir + "/expected_batch",
+                ignore_columns,
+            )
+        )
+
         # No table model and no OCR
         fixtures.append(
             (
@@ -130,8 +143,10 @@ def validate_expected_row(
                     # Test for Document type
                     try:
 
-                        expected_doc = Document.model_validate_json(expected_value)
-                        test_doc = Document.model_validate_json(test_value)
+                        expected_doc = DoclingDocument.model_validate_json(
+                            expected_value
+                        )
+                        test_doc = DoclingDocument.model_validate_json(test_value)
                         cls.validate_documents(
                             row_index=row_index,
                             table_index=table_index,
@@ -164,27 +179,32 @@ def validate_documents(
         cls,
         row_index: int,
         table_index: int,
-        test_doc: Document,
-        expected_doc: Document,
+        test_doc: DoclingDocument,
+        expected_doc: DoclingDocument,
     ):
 
         msg = f"Row {row_index} of table {table_index} are not equal\n\t"
-        assert(test_doc.main_text is not None, msg + "Test document has no content")
-        assert len(test_doc.main_text) == len(expected_doc.main_text), (
+        assert len(test_doc.texts) == len(expected_doc.texts), (
             msg + f"Main Text lengths do not match."
         )
 
-        for i in range(len(expected_doc.main_text)):
-            expected_item = expected_doc.main_text[i]
-            test_item = test_doc.main_text[i]
+        for (expected_item, _expected_level), (test_item, _test_level) in zip(
+            expected_doc.iterate_items(), test_doc.iterate_items()
+        ):
+            if not isinstance(expected_item, DocItem):
+                continue
+            assert isinstance(test_item, DocItem), msg + "Test item is not a DocItem"
 
             # Validate type
-            assert expected_item.obj_type == test_item.obj_type, (
-                msg + f"Object type does not match."
+            assert expected_item.label == test_item.label, (
+                msg + f"Object label does not match."
             )
 
             # Validate text content
-            if isinstance(expected_item, BaseText):
+            if isinstance(expected_item, TextItem):
+                assert isinstance(test_item, TextItem), (
+                    msg + "Test item is not a TextItem as the expected one"
+                )
                 assert expected_item.text == test_item.text, (
                     msg + f"Text does not match."
                 )
diff --git a/transforms/language/pdf2parquet/ray/Dockerfile b/transforms/language/pdf2parquet/ray/Dockerfile
index 83fb9d1ea..8f738b7f4 100644
--- a/transforms/language/pdf2parquet/ray/Dockerfile
+++ b/transforms/language/pdf2parquet/ray/Dockerfile
@@ -33,7 +33,8 @@ RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e .
 # Download models
 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
 # RUN python -c 'from docling.document_converter import DocumentConverter; from pathlib import Path; DocumentConverter.download_models_hf(local_dir=Path("./artifacts/"));'
-RUN python -c 'from docling.document_converter import DocumentConverter; s=DocumentConverter.download_models_hf(); print(f"Models cached in {s}")'
+RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")'
+
 
 # copy the main() entry point to the image
 COPY --chown=ray:users src/pdf2parquet_transform_ray.py ./
diff --git a/transforms/language/pdf2parquet/ray/pyproject.toml b/transforms/language/pdf2parquet/ray/pyproject.toml
index b65fa26a5..5e192c8ac 100644
--- a/transforms/language/pdf2parquet/ray/pyproject.toml
+++ b/transforms/language/pdf2parquet/ray/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_pdf2parquet_transform_ray"
-version = "0.2.2.dev2"
+version = "0.3.0"
 requires-python = ">=3.10,<3.13"
 description = "PDF2PARQUET Ray Transform"
 license = {text = "Apache-2.0"}
diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt
index 39553ea56..2b414c59e 100644
--- a/transforms/language/pdf2parquet/ray/requirements.txt
+++ b/transforms/language/pdf2parquet/ray/requirements.txt
@@ -1,7 +1,7 @@
-dpk-pdf2parquet-transform-python==0.2.2.dev2
+dpk-pdf2parquet-transform-python==0.3.0
 data-prep-toolkit[ray]==0.2.2.dev2
-docling-core==1.7.2
-docling-ibm-models==2.0.0
-deepsearch-glm==0.22.0
-docling==1.20.0
+# docling-core==1.7.2
+# docling-ibm-models==2.0.0
+# deepsearch-glm==0.22.0
+# docling==1.20.0
 filetype >=1.2.0, <2.0.0
diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet
index 9975c3608..907fb3803 100644
Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet differ
diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json
index 704a86d8e..b9a535098 100644
--- a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json
+++ b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "pdf2parquet",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-10-18 06:02:44",
-    "end_time": "2024-10-18 06:03:04",
+    "start_time": "2024-10-29 14:17:59",
+    "end_time": "2024-10-29 14:18:05",
     "status": "success"
   },
   "code": {
@@ -19,6 +19,9 @@
     "contents_type": "text/markdown",
     "do_table_structure": true,
     "do_ocr": true,
+    "ocr_engine": "easyocr",
+    "bitmap_area_threshold": 0.05,
+    "pdf_backend": "dlparse_v2",
     "double_precision": 0,
     "checkpointing": false,
     "max_files": -1,
@@ -30,18 +33,18 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 29.2,
+    "cpus": 16.8,
     "gpus": 0,
-    "memory": 29.7,
+    "memory": 31.22,
     "object_store": 0,
-    "execution time, min": 0.329
+    "execution time, min": 0.108
   },
   "job_output_stats": {
     "source_files": 2,
     "source_size": 605137,
     "result_files": 2,
-    "result_size": 32086,
-    "processing_time": 5.981,
+    "result_size": 33044,
+    "processing_time": 6.478,
     "nrows": 3,
     "nsuccess": 3,
     "nfail": 0,
diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet
index f70a89278..39613b1d1 100644
Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet differ
diff --git a/transforms/transform.config b/transforms/transform.config
index afe747c21..91bacf9b4 100644
--- a/transforms/transform.config
+++ b/transforms/transform.config
@@ -14,4 +14,4 @@ TRANSFORM_NAME=data-prep-kit-transforms
 #
 # If you change the versions numbers, be sure to run "make set-versions" to 
 # update version numbers across the transform (e.g., pyproject.toml).
-TRANSFORMS_PKG_VERSION=$(DPK_VERSION)
+TRANSFORMS_PKG_VERSION=0.3.0