diff --git a/examples/notebooks/intro/dpk_intro_1_python.ipynb b/examples/notebooks/intro/dpk_intro_1_python.ipynb index f3659afcf..ab7cda854 100644 --- a/examples/notebooks/intro/dpk_intro_1_python.ipynb +++ b/examples/notebooks/intro/dpk_intro_1_python.ipynb @@ -13,7 +13,7 @@ "\n", "Here is the workflow\n", "\n", - "![](https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n" + "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n" ] }, { @@ -27,7 +27,7 @@ "\n", "Two options:\n", "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n", "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", "\n", "The notebook will work as in both environments" @@ -42,10 +42,10 @@ "source": [ "## Step-1: Inspect the Data\n", "\n", - "We will use simple PDFs about Solar system. The files are [here](https://github.com/sujee/data-prep-kit/tree/intro-example1/examples/notebooks/intro/input/solar-system)\n", + "We will use simple PDFs about Solar system. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n", "\n", - "- [earth.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf)\n", - "- [mars.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf)\n" + "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n", + "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n" ] }, { @@ -118,9 +118,9 @@ "source": [ "if RUNNING_IN_COLAB:\n", " !mkdir -p 'input/solar-system'\n", - " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf'\n", - " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf'\n", - " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/my_utils.py'" + " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n", + " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n", + " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'" ] }, { diff --git a/examples/notebooks/intro/dpk_intro_1_ray.ipynb b/examples/notebooks/intro/dpk_intro_1_ray.ipynb index da33a3499..b2feb9135 100644 --- a/examples/notebooks/intro/dpk_intro_1_ray.ipynb +++ b/examples/notebooks/intro/dpk_intro_1_ray.ipynb @@ -1,4358 +1,4359 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", - "metadata": { - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" - }, - "source": [ - "# Data Prep Kit Demo 1 - Ray Version\n", - "\n", - "This notebook will introduce DPK and showcase some of it's capabilities.\n", - "\n", - "Here is the workflow\n", - "\n", - "![](https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "b15976e3", - "metadata": { - "id": "b15976e3" - }, - "source": [ - "## How to run this notebook\n", - "\n", - "Two options:\n", - "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n", - "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", - "\n", - "The notebook will work as in both environments" - ] - }, - { - "cell_type": "markdown", - "id": "eb8b0d5c", - "metadata": { - "id": "eb8b0d5c" - }, - "source": [ - "## Step-1: Inspect the Data\n", - "\n", - "We will use simple PDFs about Solar system. The files are [here](https://github.com/sujee/data-prep-kit/tree/intro-example1/examples/notebooks/intro/input/solar-system)\n", - "\n", - "- [earth.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf)\n", - "- [mars.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf)\n" - ] - }, - { - "cell_type": "markdown", - "id": "39a0ab6e", - "metadata": { - "id": "39a0ab6e" - }, - "source": [ - "## Step-2: Figure out Runtime Environment\n", - "\n", - "### 2.1 - Determine runtime\n", - "\n", - "Determine if we are running on Google colab or local python environment" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1fe354b7", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1fe354b7", - "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "markdown", - "id": "8e7c104b", - "metadata": { - "id": "8e7c104b" - }, - "source": [ - "### 2.2 -Download Data if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3309799e", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3309799e", - "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " !mkdir -p 'input/solar-system'\n", - " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf'\n", - " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf'\n", - " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/my_utils.py'" - ] - }, - { - "cell_type": "markdown", - "id": "a5dc2b68", - "metadata": { - "id": "a5dc2b68" - }, - "source": [ - "### 2.3 - Install dependencies if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1fcec577", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1fcec577", - "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " ! pip install --default-timeout=100 \\\n", - " data-prep-toolkit==0.2.1 \\\n", - " data-prep-toolkit-transforms==0.2.1 \\\n", - " data-prep-toolkit-transforms-ray==0.2.1 \\\n", - " deepsearch-toolkit" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": { + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" + }, + "source": [ + "# Data Prep Kit Demo 1 - Ray Version\n", + "\n", + "This notebook will introduce DPK and showcase some of it's capabilities.\n", + "\n", + "Here is the workflow\n", + "\n", + "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "b15976e3", + "metadata": { + "id": "b15976e3" + }, + "source": [ + "## How to run this notebook\n", + "\n", + "Two options:\n", + "\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n", + "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", + "\n", + "The notebook will work as in both environments" + ] + }, + { + "cell_type": "markdown", + "id": "eb8b0d5c", + "metadata": { + "id": "eb8b0d5c" + }, + "source": [ + "## Step-1: Inspect the Data\n", + "\n", + "We will use simple PDFs about Solar system. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n", + "\n", + "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n", + "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n" + ] + }, + { + "cell_type": "markdown", + "id": "39a0ab6e", + "metadata": { + "id": "39a0ab6e" + }, + "source": [ + "## Step-2: Figure out Runtime Environment\n", + "\n", + "### 2.1 - Determine runtime\n", + "\n", + "Determine if we are running on Google colab or local python environment" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1fe354b7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "1fe354b7", + "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "243322b8", - "metadata": { - "id": "243322b8" - }, - "source": [ - "### 2.4 - Restart Runtime\n", - "\n", - "After installing dependencies, be sure restart runtime, so libraries will be loaded\n", - "\n", - "You do this by going to **`Runtime --> Restart Session`**\n", - "\n", - "Then you can continue to the next step (no need to re-run the notebook)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "8e7c104b", + "metadata": { + "id": "8e7c104b" + }, + "source": [ + "### 2.2 -Download Data if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3309799e", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "id": "e8b10be1", - "metadata": { - "id": "e8b10be1" - }, - "source": [ - "## Step-2: Configuration" - ] + "id": "3309799e", + "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c" + }, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " !mkdir -p 'input/solar-system'\n", + " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n", + " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n", + " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'" + ] + }, + { + "cell_type": "markdown", + "id": "a5dc2b68", + "metadata": { + "id": "a5dc2b68" + }, + "source": [ + "### 2.3 - Install dependencies if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1fcec577", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, - { - "cell_type": "markdown", - "id": "356c66f7", - "metadata": { - "id": "356c66f7" - }, - "source": [ - "### 2.1 - Basic Config" - ] + "id": "1fcec577", + "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10" + }, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " ! pip install --default-timeout=100 \\\n", + " data-prep-toolkit==0.2.1 \\\n", + " data-prep-toolkit-transforms==0.2.1 \\\n", + " data-prep-toolkit-transforms-ray==0.2.1 \\\n", + " deepsearch-toolkit" + ] + }, + { + "cell_type": "markdown", + "id": "243322b8", + "metadata": { + "id": "243322b8" + }, + "source": [ + "### 2.4 - Restart Runtime\n", + "\n", + "After installing dependencies, be sure restart runtime, so libraries will be loaded\n", + "\n", + "You do this by going to **`Runtime --> Restart Session`**\n", + "\n", + "Then you can continue to the next step (no need to re-run the notebook)" + ] + }, + { + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": { + "id": "e8b10be1" + }, + "source": [ + "## Step-2: Configuration" + ] + }, + { + "cell_type": "markdown", + "id": "356c66f7", + "metadata": { + "id": "356c66f7" + }, + "source": [ + "### 2.1 - Basic Config" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e4YMZrBuFycl", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "e4YMZrBuFycl", + "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "id": "e4YMZrBuFycl", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e4YMZrBuFycl", - "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "33345487", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "33345487", + "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "id": "33345487", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "33345487", - "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n", - "MY_CONFIG.RAY_NUM_CPUS: 0.8\n", - "MY_CONFIG.RAY_MEMORY_GB: 2\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "## Configuration\n", - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig ()\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", - "\n", - "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", - "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", - "\n", - "## Embedding model\n", - "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", - "\n", - "## RAY CONFIGURATION\n", - "### For local runs, we can use more parallelism\n", - "### For google colab, be conservative\n", - "\n", - "if RUNNING_IN_COLAB:\n", - " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", - " MY_CONFIG.RAY_NUM_CPUS = 0.3\n", - " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", - "else: # local run\n", - " num_cpus_available = os.cpu_count()\n", - " # print (num_cpus_available)\n", - "\n", - " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", - " MY_CONFIG.RAY_NUM_CPUS = 0.8\n", - " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", - " # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n", - "\n", - "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n", - "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n", - "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n", + "MY_CONFIG.RAY_NUM_CPUS: 0.8\n", + "MY_CONFIG.RAY_MEMORY_GB: 2\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "## Configuration\n", + "class MyConfig:\n", + " pass\n", + "\n", + "MY_CONFIG = MyConfig ()\n", + "\n", + "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", + "\n", + "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", + "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", + "\n", + "## Embedding model\n", + "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", + "\n", + "## RAY CONFIGURATION\n", + "### For local runs, we can use more parallelism\n", + "### For google colab, be conservative\n", + "\n", + "if RUNNING_IN_COLAB:\n", + " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", + " MY_CONFIG.RAY_NUM_CPUS = 0.3\n", + " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", + "else: # local run\n", + " num_cpus_available = os.cpu_count()\n", + " # print (num_cpus_available)\n", + "\n", + " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", + " MY_CONFIG.RAY_NUM_CPUS = 0.8\n", + " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", + " # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n", + "\n", + "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n", + "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n", + "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b15e6827", + "metadata": { + "id": "b15e6827" + }, + "outputs": [], + "source": [ + "## Add parent dir to path\n", + "import os,sys\n", + "\n", + "this_dir = os.path.abspath('')\n", + "parent_dir = os.path.dirname(this_dir)\n", + "sys.path.append (os.path.abspath (parent_dir))" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": { + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" + }, + "source": [ + "### 2.2 - Setup input/outpur directories" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "outputId": "ec5beb05-027a-49eb-9a96-271471619d81" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "id": "b15e6827", - "metadata": { - "id": "b15e6827" - }, - "outputs": [], - "source": [ - "## Add parent dir to path\n", - "import os,sys\n", - "\n", - "this_dir = os.path.abspath('')\n", - "parent_dir = os.path.dirname(this_dir)\n", - "sys.path.append (os.path.abspath (parent_dir))" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Cleared output directory\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", + " raise Exception (f\"โŒ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", + "\n", + "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", + "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", + "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", + "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", + "\n", + "print (\"โœ… Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + "metadata": { + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" + }, + "source": [ + "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", + "\n", + "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", + "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", + "metadata": { + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" + }, + "source": [ + "### 3.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", - "metadata": { - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" - }, - "source": [ - "### 2.2 - Setup input/outpur directories" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" + ] + } + ], + "source": [ + "STAGE = 1\n", + "\n", + "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", + "output_folder = output_parquet_dir\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": { + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" + }, + "source": [ + "### 3.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "outputId": "14a36e73-a186-4431-a755-f46ccb691130" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "outputId": "ec5beb05-027a-49eb-9a96-271471619d81" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Cleared output directory\n" - ] - } - ], - "source": [ - "import os, sys\n", - "import shutil\n", - "\n", - "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", - " raise Exception (f\"โŒ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", - "\n", - "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", - "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", - "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", - "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", - "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", - "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", - "\n", - "## clear output folder\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", - "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", - "\n", - "print (\"โœ… Cleared output directory\")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", + "13:30:44 INFO - pipeline id pipeline_id\n", + "13:30:44 INFO - code location None\n", + "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n", + "13:30:44 INFO - actor creation delay 0\n", + "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", + "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "13:30:44 INFO - Running locally\n", + "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n", + "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 110376.42it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n", + "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 73713.60it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n" + ] }, { - "cell_type": "markdown", - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", - "metadata": { - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" - }, - "source": [ - "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", - "\n", - "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", - "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", - "\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:1 completed successfully\n", + "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n", + "Wall time: 31.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from pdf2parquet_transform import (\n", + " pdf2parquet_contents_type_cli_param,\n", + " pdf2parquet_contents_types,\n", + ")\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", + "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", + "\n", + "from data_processing.utils import GB, ParamsUtils\n", + "\n", + "\n", + "# create parameters\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": MY_CONFIG.RAY_MEMORY_GB * GB}\n", + "ingest_config = {\n", + " pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n", + "}\n", + "\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": 1, # so model download to cleanup works properly\n", + "\n", + "}\n", + "\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", + "# create launcher\n", + "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", + "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "5ca790e0", + "metadata": { + "id": "5ca790e0" + }, + "source": [ + "### 3.3 - Inspect Generated output\n", + "\n", + "Here we should see one entry per input file processed." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fe59563d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 255 }, + "id": "fe59563d", + "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", - "metadata": { - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" - }, - "source": [ - "### 3.1 - Set Input/output Folder" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Output dimensions (rows x columns)= (2, 12)\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...101162e5639f-f922-4ccc-a041-3cb02f1cfd83pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011f3c0ac2e-1de2-472b-8216-2043f3b3e9d1pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdf
\n", + "
" ], - "source": [ - "STAGE = 1\n", - "\n", - "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", - "output_folder = output_parquet_dir\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + "text/plain": [ + " filename contents num_pages \\\n", + "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + "\n", + " num_tables num_doc_elements document_id ext \\\n", + "0 0 11 62e5639f-f922-4ccc-a041-3cb02f1cfd83 pdf \n", + "1 0 11 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.494027 2.015123 earth.pdf " ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(5)\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "e5058a21", + "metadata": { + "id": "e5058a21" + }, + "source": [ + "\n", + "### 3.4 - Understand the output\n", + "\n", + "Here are some interesting attributes to note:\n", + "\n", + "- **filename** : original filename\n", + "- **contents** : text\n", + "- **document_id**: unique id (UUID) assignd to this document\n", + "- **hash** : hash of document\n", + "- **pdf_convert_time** : time to convert this pdf in seconds\n", + "\n", + "Let's inspect the **contents** column. See how the text is being divided up!" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f870e624", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "f870e624", + "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", - "metadata": { - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" - }, - "source": [ - "### 3.2 - Execute" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_name': '',\n", + " 'description': {'logs': []},\n", + " 'equations': [],\n", + " 'figures': [],\n", + " 'file-info': {'#-pages': 1,\n", + " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", + " 'filename': 'mars.pdf',\n", + " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", + " 'model': 'default',\n", + " 'page': 1}]},\n", + " 'footnotes': [],\n", + " 'main-text': [{'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.35137939,\n", + " 654.45184326,\n", + " 169.88169861,\n", + " 667.98492432],\n", + " 'page': 1,\n", + " 'span': [0, 4]}],\n", + " 'text': 'Mars',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.09541321,\n", + " 630.68127441,\n", + " 210.66503906,\n", + " 642.34405518],\n", + " 'page': 1,\n", + " 'span': [0, 12]}],\n", + " 'text': 'Solar System',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.84518433,\n", + " 588.96014404,\n", + " 479.40917969,\n", + " 623.02520752],\n", + " 'page': 1,\n", + " 'span': [0, 205]}],\n", + " 'text': 'Our solar system is a vast and fascinating expanse, '\n", + " 'comprising eight planets, five dwarf planets, '\n", + " 'numerous moons, asteroids, comets, and other '\n", + " 'celestial bodies. At its center lies the star we call '\n", + " 'the Sun.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.18510437,\n", + " 570.83258057,\n", + " 374.99838257,\n", + " 581.07043457],\n", + " 'page': 1,\n", + " 'span': [0, 54]}],\n", + " 'text': 'For more details about the Solar system see Chapter '\n", + " '1.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.22866821,\n", + " 542.98168945,\n", + " 163.86282349,\n", + " 554.45288086],\n", + " 'page': 1,\n", + " 'span': [0, 4]}],\n", + " 'text': 'Mars',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.87440491,\n", + " 500.84011841,\n", + " 477.48345947,\n", + " 534.55810547],\n", + " 'page': 1,\n", + " 'span': [0, 196]}],\n", + " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", + " 'desert world with a thin atmosphere composed '\n", + " 'primarily of carbon dioxide. Its reddish hue comes '\n", + " 'from iron oxide, or rust, prevalent on its surface.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.2026062,\n", + " 482.90710449,\n", + " 237.04431152,\n", + " 493.07443237],\n", + " 'page': 1,\n", + " 'span': [0, 23]}],\n", + " 'text': 'Basic facts about Mars:',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 453.019104,\n", + " 477.48171997,\n", + " 474.9703064],\n", + " 'page': 1,\n", + " 'span': [0, 78]}],\n", + " 'text': 'ยท Distance from the Sun: Average of 228 million '\n", + " 'kilometers (142 million miles)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 440.79351807,\n", + " 431.73287964,\n", + " 451.2142334],\n", + " 'page': 1,\n", + " 'span': [0, 64]}],\n", + " 'text': 'ยท Rotation Period: 24.6 hours (one Martian day - '\n", + " 'called a \"sol\")',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 429.10913086,\n", + " 365.9559021,\n", + " 438.83737183],\n", + " 'page': 1,\n", + " 'span': [0, 44]}],\n", + " 'text': 'ยท Moons: Two small moons, Phobos and Deimos.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Page-footer',\n", + " 'prov': [{'bbox': [303.13299561,\n", + " 87.20314026,\n", + " 308.11428833,\n", + " 96.51646423],\n", + " 'page': 1,\n", + " 'span': [0, 1]}],\n", + " 'text': '1',\n", + " 'type': 'page-footer'}],\n", + " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", + " 'page-footers': [],\n", + " 'page-headers': [],\n", + " 'tables': [],\n", + " 'type': 'pdf-document'}\n" + ] + } + ], + "source": [ + "import pprint\n", + "import json\n", + "\n", + "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", + "# json.loads(output_df.iloc[0, ]['contents'])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e1a10c2d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "e1a10c2d", + "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "outputId": "14a36e73-a186-4431-a755-f46ccb691130" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", - "13:30:44 INFO - pipeline id pipeline_id\n", - "13:30:44 INFO - code location None\n", - "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n", - "13:30:44 INFO - actor creation delay 0\n", - "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", - "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "13:30:44 INFO - Running locally\n", - "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n", - "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 110376.42it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n", - "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 73713.60it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:1 completed successfully\n", - "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n", - "Wall time: 31.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from pdf2parquet_transform import (\n", - " pdf2parquet_contents_type_cli_param,\n", - " pdf2parquet_contents_types,\n", - ")\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", - "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", - "\n", - "from data_processing.utils import GB, ParamsUtils\n", - "\n", - "\n", - "# create parameters\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": MY_CONFIG.RAY_MEMORY_GB * GB}\n", - "ingest_config = {\n", - " pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n", - "}\n", - "\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - "}\n", - "\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", - "# create launcher\n", - "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", - "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_name': '',\n", + " 'description': {'logs': []},\n", + " 'equations': [],\n", + " 'figures': [],\n", + " 'file-info': {'#-pages': 1,\n", + " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", + " 'filename': 'earth.pdf',\n", + " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", + " 'model': 'default',\n", + " 'page': 1}]},\n", + " 'footnotes': [],\n", + " 'main-text': [{'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.30961609,\n", + " 654.45184326,\n", + " 174.04208374,\n", + " 667.93347168],\n", + " 'page': 1,\n", + " 'span': [0, 5]}],\n", + " 'text': 'Earth',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.12528992,\n", + " 630.69073486,\n", + " 210.66503906,\n", + " 642.27935791],\n", + " 'page': 1,\n", + " 'span': [0, 12]}],\n", + " 'text': 'Solar System',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.87112427,\n", + " 588.96014404,\n", + " 479.40917969,\n", + " 623.04595947],\n", + " 'page': 1,\n", + " 'span': [0, 205]}],\n", + " 'text': 'Our solar system is a vast and fascinating expanse, '\n", + " 'comprising eight planets, five dwarf planets, '\n", + " 'numerous moons, asteroids, comets, and other '\n", + " 'celestial bodies. At its center lies the star we call '\n", + " 'the Sun.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.20942688,\n", + " 570.81555176,\n", + " 375.57919312,\n", + " 581.08459473],\n", + " 'page': 1,\n", + " 'span': [0, 54]}],\n", + " 'text': 'For more details about our Solar system see Chapter '\n", + " '1.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.15542603,\n", + " 542.98168945,\n", + " 167.32983398,\n", + " 554.36669922],\n", + " 'page': 1,\n", + " 'span': [0, 5]}],\n", + " 'text': 'Earth',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.91053772,\n", + " 512.46295166,\n", + " 477.84887695,\n", + " 534.48431396],\n", + " 'page': 1,\n", + " 'span': [0, 107]}],\n", + " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", + " 'planet. Earth is the only place we know of with life.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.30151367,\n", + " 494.86206055,\n", + " 240.17156982,\n", + " 505.07229614],\n", + " 'page': 1,\n", + " 'span': [0, 24]}],\n", + " 'text': 'Basic facts about Earth:',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 464.97409058,\n", + " 477.47979736,\n", + " 487.02810669],\n", + " 'page': 1,\n", + " 'span': [0, 79]}],\n", + " 'text': 'ยท Distance from the Sun: Average of 149.6 million '\n", + " 'kilometers (93 million miles)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 452.86901855,\n", + " 317.90722656,\n", + " 463.24041748],\n", + " 'page': 1,\n", + " 'span': [0, 37]}],\n", + " 'text': 'ยท Rotation Period: 24 hours (one day)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 440.71496582,\n", + " 396.66357422,\n", + " 451.19915771],\n", + " 'page': 1,\n", + " 'span': [0, 52]}],\n", + " 'text': 'ยท Moons: One moon, called Luna or simply \"the Moon\".',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Page-footer',\n", + " 'prov': [{'bbox': [303.13299561,\n", + " 87.20314026,\n", + " 308.11428833,\n", + " 96.53633118],\n", + " 'page': 1,\n", + " 'span': [0, 1]}],\n", + " 'text': '1',\n", + " 'type': 'page-footer'}],\n", + " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", + " 'page-footers': [],\n", + " 'page-headers': [],\n", + " 'tables': [],\n", + " 'type': 'pdf-document'}\n" + ] + } + ], + "source": [ + "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" + ] + }, + { + "cell_type": "markdown", + "id": "72274586", + "metadata": { + "id": "72274586" + }, + "source": [ + "## Step-4: Doc chunks\n", + "\n", + "In the previous step, we have extracted text from oru PDFs. But we have the content of entire file as 'one row' in our parquet output.\n", + "\n", + "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", + "\n", + "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", + "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", + "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", + "which provides the required JSON structure." + ] + }, + { + "cell_type": "markdown", + "id": "96198fa6", + "metadata": { + "id": "96198fa6" + }, + "source": [ + "### 4.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "305f00a3", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "305f00a3", + "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "5ca790e0", - "metadata": { - "id": "5ca790e0" - }, - "source": [ - "### 3.3 - Inspect Generated output\n", - "\n", - "Here we should see one entry per input file processed." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" + ] + } + ], + "source": [ + "STAGE = 2\n", + "\n", + "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_chunk_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "369f2cd1", + "metadata": { + "id": "369f2cd1" + }, + "source": [ + "### 4.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5b7b18d5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "5b7b18d5", + "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "id": "fe59563d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 - }, - "id": "fe59563d", - "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Output dimensions (rows x columns)= (2, 12)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...101162e5639f-f922-4ccc-a041-3cb02f1cfd83pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011f3c0ac2e-1de2-472b-8216-2043f3b3e9d1pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdf
\n", - "
" - ], - "text/plain": [ - " filename contents num_pages \\\n", - "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "\n", - " num_tables num_doc_elements document_id ext \\\n", - "0 0 11 62e5639f-f922-4ccc-a041-3cb02f1cfd83 pdf \n", - "1 0 11 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.494027 2.015123 earth.pdf " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(5)\n", - "\n", - "## To display certain columns\n", - "#parquet_df[['column1', 'column2', 'column3']].head(5)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", + "13:31:12 INFO - pipeline id pipeline_id\n", + "13:31:12 INFO - code location None\n", + "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:12 INFO - actor creation delay 0\n", + "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", + "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:12 INFO - Running locally\n", + "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n", + "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n" + ] }, { - "cell_type": "markdown", - "id": "e5058a21", - "metadata": { - "id": "e5058a21" - }, - "source": [ - "\n", - "### 3.4 - Understand the output\n", - "\n", - "Here are some interesting attributes to note:\n", - "\n", - "- **filename** : original filename\n", - "- **contents** : text\n", - "- **document_id**: unique id (UUID) assignd to this document\n", - "- **hash** : hash of document\n", - "- **pdf_convert_time** : time to convert this pdf in seconds\n", - "\n", - "Let's inspect the **contents** column. See how the text is being divided up!" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:2 completed successfully\n", + "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n", + "Wall time: 18.9 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # doc_chunk arguments\n", + " # ...\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "213afdf6", + "metadata": { + "id": "213afdf6" + }, + "source": [ + "### 4.3 - Inspect Generated output\n", + "\n", + "We would see documents are split into many chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d8138d43", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 897 }, + "id": "d8138d43", + "outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "id": "f870e624", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f870e624", - "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", - " 'filename': 'mars.pdf',\n", - " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.35137939,\n", - " 654.45184326,\n", - " 169.88169861,\n", - " 667.98492432],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.09541321,\n", - " 630.68127441,\n", - " 210.66503906,\n", - " 642.34405518],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.84518433,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.02520752],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.18510437,\n", - " 570.83258057,\n", - " 374.99838257,\n", - " 581.07043457],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about the Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.22866821,\n", - " 542.98168945,\n", - " 163.86282349,\n", - " 554.45288086],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87440491,\n", - " 500.84011841,\n", - " 477.48345947,\n", - " 534.55810547],\n", - " 'page': 1,\n", - " 'span': [0, 196]}],\n", - " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", - " 'desert world with a thin atmosphere composed '\n", - " 'primarily of carbon dioxide. Its reddish hue comes '\n", - " 'from iron oxide, or rust, prevalent on its surface.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.2026062,\n", - " 482.90710449,\n", - " 237.04431152,\n", - " 493.07443237],\n", - " 'page': 1,\n", - " 'span': [0, 23]}],\n", - " 'text': 'Basic facts about Mars:',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 453.019104,\n", - " 477.48171997,\n", - " 474.9703064],\n", - " 'page': 1,\n", - " 'span': [0, 78]}],\n", - " 'text': 'ยท Distance from the Sun: Average of 228 million '\n", - " 'kilometers (142 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.79351807,\n", - " 431.73287964,\n", - " 451.2142334],\n", - " 'page': 1,\n", - " 'span': [0, 64]}],\n", - " 'text': 'ยท Rotation Period: 24.6 hours (one Martian day - '\n", - " 'called a \"sol\")',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 429.10913086,\n", - " 365.9559021,\n", - " 438.83737183],\n", - " 'page': 1,\n", - " 'span': [0, 44]}],\n", - " 'text': 'ยท Moons: Two small moons, Phobos and Deimos.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.51646423],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } - ], - "source": [ - "import pprint\n", - "import json\n", - "\n", - "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", - "# json.loads(output_df.iloc[0, ]['contents'])" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Files processed : 2\n", + "Chunks created : 8\n", + "Input data dimensions (rows x columns)= (2, 12)\n", + "Output data dimensions (rows x columns)= (8, 16)\n" + ] }, { - "cell_type": "code", - "execution_count": 12, - "id": "e1a10c2d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e1a10c2d", - "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", - " 'filename': 'earth.pdf',\n", - " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.30961609,\n", - " 654.45184326,\n", - " 174.04208374,\n", - " 667.93347168],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.12528992,\n", - " 630.69073486,\n", - " 210.66503906,\n", - " 642.27935791],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87112427,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.04595947],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.20942688,\n", - " 570.81555176,\n", - " 375.57919312,\n", - " 581.08459473],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about our Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.15542603,\n", - " 542.98168945,\n", - " 167.32983398,\n", - " 554.36669922],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.91053772,\n", - " 512.46295166,\n", - " 477.84887695,\n", - " 534.48431396],\n", - " 'page': 1,\n", - " 'span': [0, 107]}],\n", - " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", - " 'planet. Earth is the only place we know of with life.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.30151367,\n", - " 494.86206055,\n", - " 240.17156982,\n", - " 505.07229614],\n", - " 'page': 1,\n", - " 'span': [0, 24]}],\n", - " 'text': 'Basic facts about Earth:',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 464.97409058,\n", - " 477.47979736,\n", - " 487.02810669],\n", - " 'page': 1,\n", - " 'span': [0, 79]}],\n", - " 'text': 'ยท Distance from the Sun: Average of 149.6 million '\n", - " 'kilometers (93 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 452.86901855,\n", - " 317.90722656,\n", - " 463.24041748],\n", - " 'page': 1,\n", - " 'span': [0, 37]}],\n", - " 'text': 'ยท Rotation Period: 24 hours (one day)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.71496582,\n", - " 396.66357422,\n", - " 451.19915771],\n", - " 'page': 1,\n", - " 'span': [0, 52]}],\n", - " 'text': 'ยท Moons: One moon, called Luna or simply \"the Moon\".',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.53633118],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", + "
" ], - "source": [ - "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" - ] - }, - { - "cell_type": "markdown", - "id": "72274586", - "metadata": { - "id": "72274586" - }, - "source": [ - "## Step-4: Doc chunks\n", - "\n", - "In the previous step, we have extracted text from oru PDFs. But we have the content of entire file as 'one row' in our parquet output.\n", - "\n", - "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", - "\n", - "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", - "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", - "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", - "which provides the required JSON structure." - ] - }, - { - "cell_type": "markdown", - "id": "96198fa6", - "metadata": { - "id": "96198fa6" - }, - "source": [ - "### 4.1 - Set Input/output Folder" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 mars.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "7 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... " ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (f\"Files processed : {input_df.shape[0]:,}\")\n", + "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "9e9ca75c", + "metadata": { + "id": "9e9ca75c" + }, + "source": [ + "### 4.4 - Understanding the Output\n", + "\n", + "Here we see 2 PDF files are split into 6 chunks. Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n", + "\n", + "See how **document_id** is carried throughout. This helps us identify original documents.\n", + "\n", + "Also note **contents** is now plain text (not JSON as before)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3090c950", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 }, + "id": "3090c950", + "outputId": "3f542446-2cfa-404c-c642-3732f7b74568" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "id": "305f00a3", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "305f00a3", - "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\nยท Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" ], - "source": [ - "STAGE = 2\n", - "\n", - "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_chunk_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "369f2cd1", - "metadata": { - "id": "369f2cd1" - }, - "source": [ - "### 4.2 - Execute" + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", + "1 mars.pdf Solar System\\nFor more details about the Solar...\n", + "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "3 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", + "5 earth.pdf Solar System\\nFor more details about our Solar...\n", + "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "7 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d5f151ae", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "d5f151ae", + "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "id": "5b7b18d5", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5b7b18d5", - "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", - "13:31:12 INFO - pipeline id pipeline_id\n", - "13:31:12 INFO - code location None\n", - "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:12 INFO - actor creation delay 0\n", - "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", - "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:12 INFO - Running locally\n", - "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n", - "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:2 completed successfully\n", - "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n", - "Wall time: 18.9 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # doc_chunk arguments\n", - " # ...\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about the Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 3------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 3------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "20217298", + "metadata": { + "id": "20217298" + }, + "source": [ + "## Step-5: DOC ID generation\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "\n", + "**This is a pre-requisite for fuzzy dedup** in the pipeline." + ] + }, + { + "cell_type": "markdown", + "id": "66811f5b", + "metadata": { + "id": "66811f5b" + }, + "source": [ + "### 5.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1f747c0d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "1f747c0d", + "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "213afdf6", - "metadata": { - "id": "213afdf6" - }, - "source": [ - "### 4.3 - Inspect Generated output\n", - "\n", - "We would see documents are split into many chunks" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" + ] + } + ], + "source": [ + "\n", + "# Input for this stage is the output of exact dedeup component\n", + "# output of this component makes it possible for fdedup component to run on data.\n", + "\n", + "STAGE = 3\n", + "\n", + "input_folder = output_chunk_dir\n", + "output_folder = output_docid_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "18aa0fe1", + "metadata": { + "id": "18aa0fe1" + }, + "source": [ + "### 5.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "f6e9e145", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "f6e9e145", + "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "id": "d8138d43", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 897 - }, - "id": "d8138d43", - "outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Files processed : 2\n", - "Chunks created : 8\n", - "Input data dimensions (rows x columns)= (2, 12)\n", - "Output data dimensions (rows x columns)= (8, 16)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (f\"Files processed : {input_df.shape[0]:,}\")\n", - "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", + "13:31:29 INFO - pipeline id pipeline_id\n", + "13:31:29 INFO - code location None\n", + "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:29 INFO - actor creation delay 0\n", + "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", + "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:29 INFO - Running locally\n", + "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", + "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n" + ] }, { - "cell_type": "markdown", - "id": "9e9ca75c", - "metadata": { - "id": "9e9ca75c" - }, - "source": [ - "### 4.4 - Understanding the Output\n", - "\n", - "Here we see 2 PDF files are split into 6 chunks. Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n", - "\n", - "See how **document_id** is carried throughout. This helps us identify original documents.\n", - "\n", - "Also note **contents** is now plain text (not JSON as before)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:3 completed successfully\n", + "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n", + "Wall time: 15.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # doc id configuration\n", + " \"doc_id_doc_column\": \"contents\",\n", + " \"doc_id_hash_column\": \"chunk_hash\",\n", + " \"doc_id_int_column\": \"chunk_id\",\n", + "}\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "4954402f", + "metadata": { + "id": "4954402f" + }, + "source": [ + "### 5.3 - Inspect Generated output\n", + "\n", + "You will notice we have two extra columns\n", + "\n", + "- **hash_column**\n", + "- **int_id_column**\n", + "\n", + "But still the same number or rows as before" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1911179a", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 860 }, + "id": "1911179a", + "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "id": "3090c950", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - }, - "id": "3090c950", - "outputId": "3f542446-2cfa-404c-c642-3732f7b74568" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\nยท Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Solar System\\nFor more details about the Solar...\n", - "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "3 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "5 earth.pdf Solar System\\nFor more details about our Solar...\n", - "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "7 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 16)\n", + "Output data dimensions (rows x columns)= (8, 18)\n" + ] }, { - "cell_type": "code", - "execution_count": 17, - "id": "d5f151ae", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "d5f151ae", - "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 3------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", + "
" ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "20217298", - "metadata": { - "id": "20217298" - }, - "source": [ - "## Step-5: DOC ID generation\n", - "\n", - "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", - "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", - "\n", - "**This is a pre-requisite for fuzzy dedup** in the pipeline." - ] - }, - { - "cell_type": "markdown", - "id": "66811f5b", - "metadata": { - "id": "66811f5b" - }, - "source": [ - "### 5.1 - Set Input/output Folder" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 mars.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "7 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "852829dc", + "metadata": { + "id": "852829dc" + }, + "source": [ + "## Step-6: Exact Dedup\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", + "metadata": { + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" + }, + "source": [ + "### 6.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4c7a1b94", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "4c7a1b94", + "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 18, - "id": "1f747c0d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1f747c0d", - "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" - ] - } - ], - "source": [ - "\n", - "# Input for this stage is the output of exact dedeup component\n", - "# output of this component makes it possible for fdedup component to run on data.\n", - "\n", - "STAGE = 3\n", - "\n", - "input_folder = output_chunk_dir\n", - "output_folder = output_docid_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" + ] + } + ], + "source": [ + "STAGE = 4\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_exact_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", + "metadata": { + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" + }, + "source": [ + "### 6.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "outputId": "bd0f3f94-8c48-4c6b-b911-858e389243f4" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "18aa0fe1", - "metadata": { - "id": "18aa0fe1" - }, - "source": [ - "### 5.2 - Execute" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:31:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "13:31:45 INFO - pipeline id pipeline_id\n", + "13:31:45 INFO - code location None\n", + "13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:45 INFO - actor creation delay 0\n", + "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", + "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:45 INFO - Running locally\n", + "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n", + "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n" + ] }, { - "cell_type": "code", - "execution_count": 19, - "id": "f6e9e145", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f6e9e145", - "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", - "13:31:29 INFO - pipeline id pipeline_id\n", - "13:31:29 INFO - code location None\n", - "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:29 INFO - actor creation delay 0\n", - "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", - "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:29 INFO - Running locally\n", - "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", - "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:3 completed successfully\n", - "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n", - "Wall time: 15.2 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # doc id configuration\n", - " \"doc_id_doc_column\": \"contents\",\n", - " \"doc_id_hash_column\": \"chunk_hash\",\n", - " \"doc_id_int_column\": \"chunk_id\",\n", - "}\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:4 completed successfully\n", + "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n", + "Wall time: 15.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # ededup parameters\n", + " \"ededup_hash_cpu\": 0.5,\n", + " \"ededup_num_hashes\": 2,\n", + " \"ededup_doc_column\": \"contents\",\n", + " \"ededup_doc_id_column\": \"chunk_hash\",\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "eaf1c3c3", + "metadata": { + "id": "eaf1c3c3" + }, + "source": [ + "### 6.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d824ebf6", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 815 }, + "id": "d824ebf6", + "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "4954402f", - "metadata": { - "id": "4954402f" - }, - "source": [ - "### 5.3 - Inspect Generated output\n", - "\n", - "You will notice we have two extra columns\n", - "\n", - "- **hash_column**\n", - "- **int_id_column**\n", - "\n", - "But still the same number or rows as before" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 18)\n", + "Output data dimensions (rows x columns)= (7, 19)\n", + "Input chunks before exact dedupe : 8\n", + "Output chunks after exact dedupe : 7\n", + "Duplicate chunks removed : 1\n" + ] }, { - "cell_type": "code", - "execution_count": 20, - "id": "1911179a", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 860 - }, - "id": "1911179a", - "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 16)\n", - "Output data dimensions (rows x columns)= (8, 18)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", + "
" ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "852829dc", - "metadata": { - "id": "852829dc" - }, - "source": [ - "## Step-6: Exact Dedup\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", - "metadata": { - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" - }, - "source": [ - "### 6.1 - Set Input/output Folder" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", + "\n", + " removed \n", + "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... \n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + "5 [] \n", + "6 [] " ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "82cc9bb0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 }, + "id": "82cc9bb0", + "outputId": "e043fa01-ceca-49ae-b764-8154219c7b6c" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 21, - "id": "4c7a1b94", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4c7a1b94", - "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" ], - "source": [ - "STAGE = 4\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_exact_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nFor more details about the Solar...\n", + "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", + "4 earth.pdf Solar System\\nFor more details about our Solar...\n", + "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "6 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "cc61dffa", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "cc61dffa", + "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", - "metadata": { - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" - }, - "source": [ - "### 6.2 - Execute" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "For more details about the Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 1------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 2------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 3------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "383f40ba", + "metadata": { + "id": "383f40ba" + }, + "source": [ + "### 6.4 - Understanding the output\n", + "\n", + "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", + "\n", + "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", + "\n", + "```text\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "85309751-8556-41c6-ac32-84acc941bc8d", + "metadata": { + "id": "85309751-8556-41c6-ac32-84acc941bc8d" + }, + "source": [ + "## Step-7: Fuzzy Dedup\n", + "\n", + "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n", + "the data further.\n", + "\n", + "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addittion of logging statements etc." + ] + }, + { + "cell_type": "markdown", + "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", + "metadata": { + "id": "fcf574a3-b287-419c-9c86-07b828b41ca6" + }, + "source": [ + "### 7.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", + "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 22, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", - "outputId": "bd0f3f94-8c48-4c6b-b911-858e389243f4" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:31:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", - "13:31:45 INFO - pipeline id pipeline_id\n", - "13:31:45 INFO - code location None\n", - "13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:45 INFO - actor creation delay 0\n", - "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", - "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:45 INFO - Running locally\n", - "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n", - "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:4 completed successfully\n", - "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n", - "Wall time: 15.2 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # ededup parameters\n", - " \"ededup_hash_cpu\": 0.5,\n", - " \"ededup_num_hashes\": 2,\n", - " \"ededup_doc_column\": \"contents\",\n", - " \"ededup_doc_id_column\": \"chunk_hash\",\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n" + ] + } + ], + "source": [ + "## Input to this component is the output of doc_id generator component.\n", + "\n", + "STAGE = 5\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_fuzzy_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", + "metadata": { + "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3" + }, + "source": [ + "### 7.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", + "outputId": "1e63d364-3944-465a-ff7c-6e1dc750b2de" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "eaf1c3c3", - "metadata": { - "id": "eaf1c3c3" - }, - "source": [ - "### 6.3 - Inspect Generated output" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", + "13:32:00 INFO - pipeline id pipeline_id\n", + "13:32:00 INFO - code location None\n", + "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:00 INFO - actor creation delay 0\n", + "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n", + "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:00 INFO - Running locally\n", + "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%) in 0.064 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n", + "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n", + "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n", + "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n", + "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n" + ] }, { - "cell_type": "code", - "execution_count": 23, - "id": "d824ebf6", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 815 - }, - "id": "d824ebf6", - "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (7, 19)\n", - "Input chunks before exact dedupe : 8\n", - "Output chunks after exact dedupe : 7\n", - "Duplicate chunks removed : 1\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", - "\n", - " removed \n", - "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... \n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:5 completed successfully\n", + "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n", + "Wall time: 36.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "\n", + "# create parameters\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # Orchestration parameters\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # columns used\n", + " \"fdedup_doc_column\": \"contents\",\n", + " \"fdedup_id_column\": \"chunk_id\",\n", + " \"fdedup_cluster_column\": \"chunk_hash\",\n", + " # infrastructure\n", + " \"fdedup_bucket_cpu\": 0.3,\n", + " \"fdedup_doc_cpu\": 0.3,\n", + " \"fdedup_mhash_cpu\": 0.3,\n", + " \"fdedup_num_doc_actors\": 1,\n", + " \"fdedup_num_bucket_actors\": 1,\n", + " \"fdedup_num_minhash_actors\": 1,\n", + " \"fdedup_num_preprocessors\": 1,\n", + " # fuzzy parameters\n", + " \"fdedup_num_permutations\": 64,\n", + " \"fdedup_threshold\": 0.7, # (default 0.8)\n", + " \"fdedup_shingles_size\": 5,\n", + " \"fdedup_delimiters\": \" \"\n", + "}\n", + "\n", + "# Pass commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "a6f8cd11", + "metadata": { + "id": "a6f8cd11" + }, + "source": [ + "### 7.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "e899ad60", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 677 }, + "id": "e899ad60", + "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 24, - "id": "82cc9bb0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 269 - }, - "id": "82cc9bb0", - "outputId": "e043fa01-ceca-49ae-b764-8154219c7b6c" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nFor more details about the Solar...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "4 earth.pdf Solar System\\nFor more details about our Solar...\n", - "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "6 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 18)\n", + "Output data dimensions (rows x columns)= (6, 18)\n", + "Duplicate chunks removed by fuzzy-dedupe: 2\n" + ] }, { - "cell_type": "code", - "execution_count": 25, - "id": "cc61dffa", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cc61dffa", - "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hash
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1
\n", + "
" ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "383f40ba", - "metadata": { - "id": "383f40ba" - }, - "source": [ - "### 6.4 - Understanding the output\n", - "\n", - "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", - "\n", - "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", - "\n", - "```text\n", - "## Solar System\n", - "\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "85309751-8556-41c6-ac32-84acc941bc8d", - "metadata": { - "id": "85309751-8556-41c6-ac32-84acc941bc8d" - }, - "source": [ - "## Step-7: Fuzzy Dedup\n", - "\n", - "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n", - "the data further.\n", - "\n", - "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addittion of logging statements etc." - ] - }, - { - "cell_type": "markdown", - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", - "metadata": { - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6" - }, - "source": [ - "### 7.1 - Set Input/output Folder" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id chunk_id chunk_hash \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", + "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", + "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", + "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 " ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "ab7ea52b", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 }, + "id": "ab7ea52b", + "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 26, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", - "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nFor more details about our Solar...
4earth.pdfEarth\\nEarth is the third planet from the Sun....
5earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" ], - "source": [ - "## Input to this component is the output of doc_id generator component.\n", - "\n", - "STAGE = 5\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_fuzzy_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", + "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "3 earth.pdf Solar System\\nFor more details about our Solar...\n", + "4 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "5 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "6bdd3515", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "6bdd3515", + "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", - "metadata": { - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3" - }, - "source": [ - "### 7.2 - Execute" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 2------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 1------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "2b34d9c6", + "metadata": { + "id": "2b34d9c6" + }, + "source": [ + "### 7.4- Understanding the output\n", + "\n", + "So we started with 7 rows and ended up with 6. Fuzzy dedupe removed the following **very similar** chunk.\n", + "\n", + "These are pretty similar chunks except for the words 'the' and 'our'\n", + "\n", + "**earth.pdf**\n", + "\n", + "`For more details about *our* Solar system see Chapter 1.`\n", + "\n", + "**mars.pdf**\n", + "\n", + "`For more details about *the* Solar system see Chapter 1.`\n", + "\n", + "Pretty neat, eh? ๐Ÿ‘\n", + "\n", + "### Configuring Fuzzy de-dupe\n", + "\n", + "You can tweak fuzzy dedupe by tweaking the following parameters\n", + "\n", + "```python\n", + "# fuzzy parameters\n", + " \"fdedup_num_permutations\": 64,\n", + " \"fdedup_threshold\": 0.7, # (default 0.8)\n", + " \"fdedup_shingles_size\": 5,\n", + " \"fdedup_delimiters\": \" \"\n", + "```\n", + "\n", + "In our case, we set `fdedup_threshold` parameter to 0.7. \n" + ] + }, + { + "cell_type": "markdown", + "id": "5370950a-2a3a-4143-8218-f9b4808099ba", + "metadata": { + "id": "5370950a-2a3a-4143-8218-f9b4808099ba" + }, + "source": [ + "## Step-8: Text encoding\n", + "\n", + "Encode text for the vector storage." + ] + }, + { + "cell_type": "markdown", + "id": "85aba685", + "metadata": { + "id": "85aba685" + }, + "source": [ + "### 8.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 27, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", - "outputId": "1e63d364-3944-465a-ff7c-6e1dc750b2de" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", - "13:32:00 INFO - pipeline id pipeline_id\n", - "13:32:00 INFO - code location None\n", - "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:00 INFO - actor creation delay 0\n", - "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n", - "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:00 INFO - Running locally\n", - "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%) in 0.064 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n", - "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n", - "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n", - "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n", - "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:5 completed successfully\n", - "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n", - "Wall time: 36.6 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "\n", - "# create parameters\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # Orchestration parameters\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # columns used\n", - " \"fdedup_doc_column\": \"contents\",\n", - " \"fdedup_id_column\": \"chunk_id\",\n", - " \"fdedup_cluster_column\": \"chunk_hash\",\n", - " # infrastructure\n", - " \"fdedup_bucket_cpu\": 0.3,\n", - " \"fdedup_doc_cpu\": 0.3,\n", - " \"fdedup_mhash_cpu\": 0.3,\n", - " \"fdedup_num_doc_actors\": 1,\n", - " \"fdedup_num_bucket_actors\": 1,\n", - " \"fdedup_num_minhash_actors\": 1,\n", - " \"fdedup_num_preprocessors\": 1,\n", - " # fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.7, # (default 0.8)\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", - "}\n", - "\n", - "# Pass commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" + ] + } + ], + "source": [ + "STAGE = 6\n", + "\n", + "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_embeddings_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "c97545f4", + "metadata": { + "id": "c97545f4" + }, + "source": [ + "### 8.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 914, + "referenced_widgets": [ + "8b7571c585df431eb901fcdebdf8177e", + "06107a2f48b3491f91bbe84e46e10ba0", + "bd74356eca18423aa0373c808d9097e3", + "7e13e8779a81400f996d4428c74acfaf", + "a75892696be546a3970962bae7bf732a", + "68997339f13240a4824a9e416096bee4", + "919b086abd314077bbff75687392bd91", + "b4c209371e7a403986991a786cfb296d", + "6c08de2dd9a2402c90b1a7a645db9b13", + "91fff81a1de8487c9009e872b751edb0", + "ada62d24cbcf4361acbb21808f334d33" + ] }, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "a6f8cd11", - "metadata": { - "id": "a6f8cd11" - }, - "source": [ - "### 7.3 - Inspect Generated output" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", + "13:32:37 INFO - pipeline id pipeline_id\n", + "13:32:37 INFO - code location None\n", + "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:37 INFO - actor creation delay 0\n", + "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", + "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:37 INFO - Running locally\n", + "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n", + "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n" + ] }, { - "cell_type": "code", - "execution_count": 28, - "id": "e899ad60", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 677 - }, - "id": "e899ad60", - "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (6, 18)\n", - "Duplicate chunks removed by fuzzy-dedupe: 2\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hash
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id chunk_id chunk_hash \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", - "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", - "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", - "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 " - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:6 completed successfully\n", + "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n", + "Wall time: 22.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # text_encoder\n", + " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", + "}\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "# create launcher\n", + "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n", + "# Launch the ray actor(s) to process the input\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "b734852c", + "metadata": { + "id": "b734852c" + }, + "source": [ + "### 8.3 - Inspect Generated output\n", + "\n", + "You will see a column called `embeddings` added at the end. This the text content converted into vectors or embeddings. We used the model `sentence-transformers/all-MiniLM-L6-v2`" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "7b1c1d09", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 659 }, + "id": "7b1c1d09", + "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 29, - "id": "ab7ea52b", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 238 - }, - "id": "ab7ea52b", - "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nFor more details about our Solar...
4earth.pdfEarth\\nEarth is the third planet from the Sun....
5earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "3 earth.pdf Solar System\\nFor more details about our Solar...\n", - "4 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "5 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (6, 18)\n", + "Output data dimensions (rows x columns)= (6, 19)\n" + ] }, { - "cell_type": "code", - "execution_count": 30, - "id": "6bdd3515", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6bdd3515", - "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hashembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1[0.0077404897, -0.020559434, 0.026426662, 0.01...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1[0.07728298, 0.024971062, -0.04318075, 0.05809...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1[0.1059802, 0.025460616, 0.02362733, 0.0390564...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15[-0.062105577, -0.0053322953, 0.03127779, 0.04...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1[0.0724358, -0.058001805, -0.01977186, -0.0243...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1[0.091821924, 0.015197907, 0.07716932, 0.01711...
\n", + "
" ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "2b34d9c6", - "metadata": { - "id": "2b34d9c6" - }, - "source": [ - "### 7.4- Understanding the output\n", - "\n", - "So we started with 7 rows and ended up with 6. Fuzzy dedupe removed the following **very similar** chunk.\n", - "\n", - "These are pretty similar chunks except for the words 'the' and 'our'\n", - "\n", - "**earth.pdf**\n", - "\n", - "`For more details about *our* Solar system see Chapter 1.`\n", - "\n", - "**mars.pdf**\n", - "\n", - "`For more details about *the* Solar system see Chapter 1.`\n", - "\n", - "Pretty neat, eh? ๐Ÿ‘\n", - "\n", - "### Configuring Fuzzy de-dupe\n", - "\n", - "You can tweak fuzzy dedupe by tweaking the following parameters\n", - "\n", - "```python\n", - "# fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.7, # (default 0.8)\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", - "```\n", - "\n", - "In our case, we set `fdedup_threshold` parameter to 0.7. \n" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id chunk_id chunk_hash \\\n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", + "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", + "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", + "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 \n", + "\n", + " embeddings \n", + "0 [0.0077404897, -0.020559434, 0.026426662, 0.01... \n", + "1 [0.07728298, 0.024971062, -0.04318075, 0.05809... \n", + "2 [0.1059802, 0.025460616, 0.02362733, 0.0390564... \n", + "3 [-0.062105577, -0.0053322953, 0.03127779, 0.04... \n", + "4 [0.0724358, -0.058001805, -0.01977186, -0.0243... \n", + "5 [0.091821924, 0.015197907, 0.07716932, 0.01711... " ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": { + "id": "f5e12630-be6b-4188-a925-77117155617b" + }, + "source": [ + "## Step-9: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "5370950a-2a3a-4143-8218-f9b4808099ba", - "metadata": { - "id": "5370950a-2a3a-4143-8218-f9b4808099ba" - }, - "source": [ - "## Step-8: Text encoding\n", - "\n", - "Encode text for the vector storage." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" + ] + } + ], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", + "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", + "\n", + "print (f\"โœ… Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "dc0a6728", + "metadata": { + "id": "dc0a6728" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "dpk-3-basic-022dev1-py311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "06107a2f48b3491f91bbe84e46e10ba0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4", + "placeholder": "โ€‹", + "style": "IPY_MODEL_919b086abd314077bbff75687392bd91", + "value": "" + } }, - { - "cell_type": "markdown", - "id": "85aba685", - "metadata": { - "id": "85aba685" - }, - "source": [ - "### 8.1 - Set Input/output Folder" - ] + "68997339f13240a4824a9e416096bee4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 31, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" - ] - } - ], - "source": [ - "STAGE = 6\n", - "\n", - "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_embeddings_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] + "6c08de2dd9a2402c90b1a7a645db9b13": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "markdown", - "id": "c97545f4", - "metadata": { - "id": "c97545f4" - }, - "source": [ - "### 8.2 - Execute" - ] + "7e13e8779a81400f996d4428c74acfaf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0", + "placeholder": "โ€‹", + "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33", + "value": "โ€‡0/0โ€‡[00:00<?,โ€‡?it/s]" + } }, - { - "cell_type": "code", - "execution_count": 32, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 914, - "referenced_widgets": [ - "8b7571c585df431eb901fcdebdf8177e", - "06107a2f48b3491f91bbe84e46e10ba0", - "bd74356eca18423aa0373c808d9097e3", - "7e13e8779a81400f996d4428c74acfaf", - "a75892696be546a3970962bae7bf732a", - "68997339f13240a4824a9e416096bee4", - "919b086abd314077bbff75687392bd91", - "b4c209371e7a403986991a786cfb296d", - "6c08de2dd9a2402c90b1a7a645db9b13", - "91fff81a1de8487c9009e872b751edb0", - "ada62d24cbcf4361acbb21808f334d33" - ] - }, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", - "13:32:37 INFO - pipeline id pipeline_id\n", - "13:32:37 INFO - code location None\n", - "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:37 INFO - actor creation delay 0\n", - "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", - "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:37 INFO - Running locally\n", - "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n", - "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:6 completed successfully\n", - "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n", - "Wall time: 22.1 s\n" - ] - } + "8b7571c585df431eb901fcdebdf8177e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0", + "IPY_MODEL_bd74356eca18423aa0373c808d9097e3", + "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf" ], - "source": [ - "%%time\n", - "\n", - "from text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # text_encoder\n", - " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", - "}\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "# create launcher\n", - "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n", - "# Launch the ray actor(s) to process the input\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a" + } }, - { - "cell_type": "markdown", - "id": "b734852c", - "metadata": { - "id": "b734852c" - }, - "source": [ - "### 8.3 - Inspect Generated output\n", - "\n", - "You will see a column called `embeddings` added at the end. This the text content converted into vectors or embeddings. We used the model `sentence-transformers/all-MiniLM-L6-v2`" - ] + "919b086abd314077bbff75687392bd91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": 33, - "id": "7b1c1d09", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 659 - }, - "id": "7b1c1d09", - "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (6, 18)\n", - "Output data dimensions (rows x columns)= (6, 19)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hashembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1[0.0077404897, -0.020559434, 0.026426662, 0.01...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1[0.07728298, 0.024971062, -0.04318075, 0.05809...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1[0.1059802, 0.025460616, 0.02362733, 0.0390564...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15[-0.062105577, -0.0053322953, 0.03127779, 0.04...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1[0.0724358, -0.058001805, -0.01977186, -0.0243...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1[0.091821924, 0.015197907, 0.07716932, 0.01711...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id chunk_id chunk_hash \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", - "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", - "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", - "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 \n", - "\n", - " embeddings \n", - "0 [0.0077404897, -0.020559434, 0.026426662, 0.01... \n", - "1 [0.07728298, 0.024971062, -0.04318075, 0.05809... \n", - "2 [0.1059802, 0.025460616, 0.02362733, 0.0390564... \n", - "3 [-0.062105577, -0.0053322953, 0.03127779, 0.04... \n", - "4 [0.0724358, -0.058001805, -0.01977186, -0.0243... \n", - "5 [0.091821924, 0.015197907, 0.07716932, 0.01711... " - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] + "91fff81a1de8487c9009e872b751edb0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "id": "f5e12630-be6b-4188-a925-77117155617b", - "metadata": { - "id": "f5e12630-be6b-4188-a925-77117155617b" - }, - "source": [ - "## Step-9: Copy output to final output dir" - ] + "a75892696be546a3970962bae7bf732a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 34, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" - ] - } - ], - "source": [ - "import shutil\n", - "\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", - "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", - "\n", - "print (f\"โœ… Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "dc0a6728", - "metadata": { - "id": "dc0a6728" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] + "ada62d24cbcf4361acbb21808f334d33": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "kernelspec": { - "display_name": "dpk-2-basic-021-py311", - "language": "python", - "name": "python3" + "b4c209371e7a403986991a786cfb296d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "06107a2f48b3491f91bbe84e46e10ba0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4", - "placeholder": "โ€‹", - "style": "IPY_MODEL_919b086abd314077bbff75687392bd91", - "value": "" - } - }, - "68997339f13240a4824a9e416096bee4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6c08de2dd9a2402c90b1a7a645db9b13": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "7e13e8779a81400f996d4428c74acfaf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0", - "placeholder": "โ€‹", - "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33", - "value": "โ€‡0/0โ€‡[00:00<?,โ€‡?it/s]" - } - }, - "8b7571c585df431eb901fcdebdf8177e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0", - "IPY_MODEL_bd74356eca18423aa0373c808d9097e3", - "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf" - ], - "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a" - } - }, - "919b086abd314077bbff75687392bd91": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "91fff81a1de8487c9009e872b751edb0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a75892696be546a3970962bae7bf732a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ada62d24cbcf4361acbb21808f334d33": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b4c209371e7a403986991a786cfb296d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "bd74356eca18423aa0373c808d9097e3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13", - "value": 0 - } - } - } + "bd74356eca18423aa0373c808d9097e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13", + "value": 0 + } } - }, - "nbformat": 4, - "nbformat_minor": 5 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/notebooks/rag/README.md b/examples/notebooks/rag/README.md index f4a3460a1..16ffdb15e 100644 --- a/examples/notebooks/rag/README.md +++ b/examples/notebooks/rag/README.md @@ -76,7 +76,7 @@ REPLICATE_API_TOKEN=your REPLICATE token goes here ### 5.2 - Run the query code -Code: [rag_1D_query_llama_replicate.ipynb](rag_1D_query_llama_replicate.ipynb) +Code: [rag_1D_query_replicate.ipynb](rag_1D_query_replicate.ipynb) diff --git a/examples/notebooks/rag/my_config.py b/examples/notebooks/rag/my_config.py index ba9ea89fd..66fc1ecf7 100644 --- a/examples/notebooks/rag/my_config.py +++ b/examples/notebooks/rag/my_config.py @@ -23,8 +23,10 @@ class MyConfig: MY_CONFIG.EMBEDDING_LENGTH = 384 ## LLM Model -MY_CONFIG.LLM_MODEL = "meta/meta-llama-3-8b-instruct" - +# MY_CONFIG.LLM_MODEL = "meta/meta-llama-3-8b-instruct" +# MY_CONFIG.LLM_MODEL = "meta/meta-llama-3-70b-instruct" +# MY_CONFIG.LLM_MODEL = "ibm-granite/granite-3.0-2b-instruct" +MY_CONFIG.LLM_MODEL = "ibm-granite/granite-3.0-8b-instruct" ## RAY CONFIGURATION diff --git a/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb b/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb index 8a8942b1f..8bdea1ff6 100644 --- a/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb +++ b/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb @@ -222,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", "metadata": {}, "outputs": [ @@ -303,7 +303,7 @@ " \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n", " # orchestrator\n", " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " \"runtime_num_workers\": 1, # so model download to cleanup works properly\n", " \"runtime_pipeline_id\": \"pipeline_id\",\n", " \"runtime_job_id\": \"job_id\",\n", " \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n", @@ -2159,7 +2159,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "data-prep-kit-3-py312", "language": "python", "name": "python3" }, @@ -2173,7 +2173,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb b/examples/notebooks/rag/rag_1D_query_replicate.ipynb similarity index 88% rename from examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb rename to examples/notebooks/rag/rag_1D_query_replicate.ipynb index 532b7ef4d..5e94ac0e8 100644 --- a/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb +++ b/examples/notebooks/rag/rag_1D_query_replicate.ipynb @@ -249,33 +249,45 @@ "\n", "### LLM Choices at Replicate\n", "\n", - "- llama 3.1 : Latest\n", - " - **meta/meta-llama-3.1-405b-instruct** : Meta's flagship 405 billion parameter language model, fine-tuned for chat completions\n", - "- Base version of llama-3 from meta\n", - " - [meta/meta-llama-3-8b](https://replicate.com/meta/meta-llama-3-8b) : Base version of Llama 3, an 8 billion parameter language model from Meta.\n", - " - **meta/meta-llama-3-70b** : 70 billion\n", - "- Instruct versions of llama-3 from meta, fine tuned for chat completions\n", - " - **meta/meta-llama-3-8b-instruct** : An 8 billion parameter language model from Meta, \n", - " - **meta/meta-llama-3-70b-instruct** : 70 billion\n", + "\n", + "| Model | Publisher | Params | Description |\n", + "|-------------------------------------|-----------|--------|------------------------------------------------------|\n", + "| ibm-granite/granite-3.0-8b-instruct | IBM | 8 B | IBM's newest Granite Model v3.0 (default) |\n", + "| ibm-granite/granite-3.0-2b-instruct | IBM | 2 B | IBM's newest Granite Model v3.0 |\n", + "| meta/meta-llama-3.1-405b-instruct | Meta | 405 B | Meta's flagship 405 billion parameter language model |\n", + "| meta/meta-llama-3-8b-instruct | Meta | 8 B | Meta's 8 billion parameter language model |\n", + "| meta/meta-llama-3-70b-instruct | Meta | 70 B | Meta's 70 billion parameter language model |\n", "\n", "References \n", "\n", - "- https://docs.llamaindex.ai/en/stable/examples/llm/llama_2/?h=replicate" + "- https://www.ibm.com/granite\n", + "- https://www.llama.com/\n", + "- https://replicate.com/ " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using model: ibm-granite/granite-3.0-8b-instruct\n" + ] + } + ], "source": [ "import os\n", - "os.environ[\"REPLICATE_API_TOKEN\"] = MY_CONFIG.REPLICATE_API_TOKEN" + "os.environ[\"REPLICATE_API_TOKEN\"] = MY_CONFIG.REPLICATE_API_TOKEN\n", + "\n", + "print ('Using model:', MY_CONFIG.LLM_MODEL)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -335,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -351,11 +363,11 @@ "Mayank Mishra โ‹† Matt Stallone โ‹† Gaoyuan Zhang โ‹† Yikang Shen Aditya Prasad Adriana Meza Soria Michele Merler Parameswaran Selvam Saptha Surendran Shivdeep Singh Manish Sethi Xuan-Hong Dang Pengyuan Li Kun-Lung Wu Syed Zawad Andrew Coleman Matthew White Mark Lewis Raju Pavuluri Yan Koyfman Boris Lublinsky Maximilien de Bayser Ibrahim Abdelaziz Kinjal Basu Mayank Agarwal Yi Zhou Chris Johnson Aanchal Goyal Hima Patel Yousaf Shah Petros Zerfos Heiko Ludwig Asim Munawar Maxwell Crouse Pavan Kapanipathi Shweta Salaria Bob Calio Sophia Wen Seetharami Seelam Brian Belgodere Carlos Fonseca Amith Singhee Nirmit Desai David D. Cox Ruchir Puri โ€  Rameswar Panda โ€ \n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "Based on the provided context, the training data used to train Granite models is not explicitly mentioned. However, it is mentioned that the 20B model was used after 1.6T tokens to start training of 34B model with the same code pretraining data without any changes to the training and inference framework. This implies that the same code pretraining data was used for both models, but the exact nature of this data is not specified.\n", + "The context does not provide specific details about the training data used to train the Granite models. It only mentions that the 20B model was trained after 1.6T tokens and then used to start training the 34B model with the same code pretraining data. However, it does not specify what this code pretraining data is.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 75.3 ms, sys: 37.8 ms, total: 113 ms\n", - "Wall time: 1.95 s\n" + "CPU times: user 63.6 ms, sys: 12 ms, total: 75.6 ms\n", + "Wall time: 1.43 s\n" ] } ], @@ -369,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -385,11 +397,11 @@ "We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours.\n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "Based on the provided context, an attention mechanism can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum.\n", + "An attention mechanism is a method used in sequence modeling and transduction models to model dependencies between elements in input or output sequences, regardless of their distance. It maps a query and a set of key-value pairs to an output, which is computed as a weighted sum.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 41.1 ms, sys: 28.7 ms, total: 69.8 ms\n", - "Wall time: 1.58 s\n" + "CPU times: user 30.6 ms, sys: 17.3 ms, total: 47.9 ms\n", + "Wall time: 880 ms\n" ] } ], @@ -403,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -419,11 +431,11 @@ "The Granite Code models achieve relatively high accuracy across all sizes (e.g., outperforming CodeGemma at 2B-3B scale, StarCoder2 at 7B-8B scale and CodeLlama models with half of the sizes). This shows that our Granite Code models are not only capable of generating good code but also of using libraries more accurately in real data science workflows.\n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "I apologize, but the provided context does not mention the moon landing. The context appears to be about code generation and evaluation benchmarks, specifically discussing the MBPP and MBPP+ benchmarks, and the performance of different code models. There is no mention of the moon landing. If you provide a different context or question, I'll be happy to help.\n", + "I'm sorry, the provided context does not contain information about the moon landing.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 41.5 ms, sys: 21 ms, total: 62.5 ms\n", - "Wall time: 2.13 s\n" + "CPU times: user 45 ms, sys: 3.19 ms, total: 48.2 ms\n", + "Wall time: 412 ms\n" ] } ], @@ -445,7 +457,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "data-prep-kit-4-021", "language": "python", "name": "python3" }, diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md index fbacf4ade..9abca2b79 100644 --- a/transforms/language/doc_chunk/python/README.md +++ b/transforms/language/doc_chunk/python/README.md @@ -32,7 +32,6 @@ The transform can be tuned with the following parameters. | `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. | | `content_column_name` | `contents` | Name of the column containing the text to be chunked. | | `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. | -| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. | | `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. | | `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. | | `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. | diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml index 82d280eca..c9728712e 100644 --- a/transforms/language/doc_chunk/python/pyproject.toml +++ b/transforms/language/doc_chunk/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_chunk_transform_python" -version = "0.2.2.dev2" +version = "0.3.0" requires-python = ">=3.10,<3.13" description = "chunk documents Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index ee6171ff6..7213c4199 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,3 +1,3 @@ data-prep-toolkit==0.2.2.dev2 -docling-core==1.7.2 +docling-core==2.3.0 llama-index-core>=0.11.0,<0.12.0 diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py index a8ba44f61..b55bd08ff 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py @@ -13,11 +13,11 @@ from abc import ABCMeta, abstractmethod from typing import Iterator, Optional, Dict, List -from docling_core.types import Document as DLDocument +from docling_core.types.doc import DoclingDocument from llama_index.core.node_parser.text.token import TokenTextSplitter from llama_index.core import Document as LIDocument from llama_index.core.node_parser import MarkdownNodeParser -from docling_core.transforms.chunker import HierarchicalChunker +from docling_core.transforms.chunker import HierarchicalChunker, DocMeta class ChunkingExecutor(metaclass=ABCMeta): @@ -29,7 +29,6 @@ def chunk(self, content: str) -> Iterator[dict]: class DLJsonChunker(ChunkingExecutor): def __init__( self, - min_chunk_len: Optional[int], output_chunk_column_name: str, output_jsonpath_column_name: str, output_pageno_column_name_key: str, @@ -40,19 +39,19 @@ def __init__( self.output_pageno_column_name_key = output_pageno_column_name_key self.output_bbox_column_name_key = output_bbox_column_name_key - chunker_kwargs = dict(include_metadata=True) - if min_chunk_len is not None: - chunker_kwargs["min_chunk_len"] = min_chunk_len - self._chunker = HierarchicalChunker(**chunker_kwargs) + self._chunker = HierarchicalChunker() def chunk(self, content: str) -> Iterator[dict]: - doc = DLDocument.model_validate_json(content) + doc = DoclingDocument.model_validate_json(content) for chunk in self._chunker.chunk(doc): + meta = DocMeta.model_validate(chunk.meta) + doc_item = meta.doc_items[0] + prov = doc_item.prov[0] yield { self.output_chunk_column_name: chunk.text, - self.output_jsonpath_column_name: chunk.path, - self.output_pageno_column_name_key: chunk.page, - self.output_bbox_column_name_key: chunk.bbox, + self.output_jsonpath_column_name: doc_item.self_ref, + self.output_pageno_column_name_key: prov.page_no, + self.output_bbox_column_name_key: prov.bbox.as_tuple(), } diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py index e0fdfa871..0c830ee98 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py @@ -38,6 +38,7 @@ "runtime_job_id": "job_id", "runtime_code_location": ParamsUtils.convert_to_ast(code_location), # doc_chunk params + # "doc_chunk_dl_min_chunk_len": 10, # for testing the usage of the deprecated argument # "doc_chunk_chunking_type": "li_markdown", "doc_chunk_chunking_type": "dl_json", # "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT, diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py index 7acdd3ef1..e64a7c1d1 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py @@ -26,7 +26,6 @@ content_column_name_key = "content_column_name" doc_id_column_name_key = "doc_id_column_name" chunking_type_key = "chunking_type" -dl_min_chunk_len_key = "dl_min_chunk_len" chunk_size_tokens_key = "chunk_size_tokens" chunk_overlap_tokens_key = "chunk_overlap_tokens" output_chunk_column_name_key = "output_chunk_column_name" @@ -38,7 +37,6 @@ content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}" doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}" chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}" -dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}" output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}" output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}" output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}" @@ -59,7 +57,6 @@ def __str__(self): default_content_column_name = "contents" default_doc_id_column_name = "document_id" default_chunking_type = chunking_types.DL_JSON -default_dl_min_chunk_len = None default_output_chunk_column_name = "contents" default_output_chunk_column_id = "chunk_id" default_output_source_doc_id_column_name = "source_document_id" @@ -95,7 +92,6 @@ def __init__(self, config: dict[str, Any]): self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name) # Parameters for Docling JSON chunking - self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len) self.output_jsonpath_column_name = config.get( output_jsonpath_column_name_key, default_output_jsonpath_column_name ) @@ -113,7 +109,6 @@ def __init__(self, config: dict[str, Any]): self.chunker: ChunkingExecutor if self.chunking_type == chunking_types.DL_JSON: self.chunker = DLJsonChunker( - min_chunk_len=self.dl_min_chunk_len, output_chunk_column_name=self.output_chunk_column_name, output_jsonpath_column_name=self.output_jsonpath_column_name, output_pageno_column_name_key=self.output_pageno_column_name_key, @@ -202,11 +197,6 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=default_doc_id_column_name, help="Name of the column containing the doc_id to be propagated in the output", ) - parser.add_argument( - f"--{dl_min_chunk_len_cli_param}", - default=default_dl_min_chunk_len, - help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.", - ) parser.add_argument( f"--{output_chunk_column_name_cli_param}", default=default_output_chunk_column_name, @@ -244,6 +234,11 @@ def add_input_params(self, parser: ArgumentParser) -> None: type=int, help="Number of tokens overlapping between chunks for the fixed-sized chunker.", ) + parser.add_argument( + f"--{cli_prefix}dl_min_chunk_len", + default=None, + help="Deprecated. This option is no longer considered.", + ) def apply_input_params(self, args: Namespace) -> bool: """ @@ -254,5 +249,7 @@ def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) self.params = self.params | captured + if self.params.get("dl_min_chunk_len") is not None: + self.logger.warning("The `dl_min_chunk_len` option is deprecated and will be ignored. Please stop using it, it will not accepted anymore in future versions.") self.logger.info(f"doc_chunk parameters are : {self.params}") return True diff --git a/transforms/language/doc_chunk/python/test-data/expected/metadata.json b/transforms/language/doc_chunk/python/test-data/expected/metadata.json index 7eeaaa279..e83a0375b 100644 --- a/transforms/language/doc_chunk/python/test-data/expected/metadata.json +++ b/transforms/language/doc_chunk/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "doc_chunk", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 14:05:09", - "end_time": "2024-10-18 14:05:11", + "start_time": "2024-10-30 18:38:40", + "end_time": "2024-10-30 18:38:40", "status": "success" }, "code": { @@ -18,7 +18,6 @@ "chunking_type": "dl_json", "content_column_name": "contents", "doc_id_column_name": "document_id", - "dl_min_chunk_len": null, "output_chunk_column_name": "contents", "output_source_doc_id_column_name": "source_document_id", "output_jsonpath_column_name": "doc_jsonpath", @@ -35,22 +34,22 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 27.9, + "cpus": 19.5, "gpus": 0, - "memory": 25.75, + "memory": 27.48, "object_store": 0, - "execution time, min": 0.021 + "execution time, min": 0.001 }, "job_output_stats": { "source_files": 1, - "source_size": 50276, + "source_size": 12073, "result_files": 1, - "result_size": 31223, - "processing_time": 1.266, + "result_size": 14363, + "processing_time": 0.043, "nfiles": 1, - "nrows": 88, + "nrows": 39, "source_doc_count": 1, - "result_doc_count": 88 + "result_doc_count": 39 }, "source": { "name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input", diff --git a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet index 06089be78..46714dde7 100644 Binary files a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet and b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet differ diff --git a/transforms/language/doc_chunk/python/test-data/input/test1.parquet b/transforms/language/doc_chunk/python/test-data/input/test1.parquet index 4015fccb0..32905aa74 100644 Binary files a/transforms/language/doc_chunk/python/test-data/input/test1.parquet and b/transforms/language/doc_chunk/python/test-data/input/test1.parquet differ diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 033b8716b..29b594fac 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_chunk_transform_ray" -version = "0.2.2.dev2" +version = "0.3.0" requires-python = ">=3.10,<3.13" description = "chunk documents Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "dpk-doc-chunk-transform-python==0.2.2.dev2", + "dpk-doc-chunk-transform-python==0.3.0", "data-prep-toolkit[ray]==0.2.2.dev2", ] diff --git a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json index 7eeaaa279..e83a0375b 100644 --- a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json +++ b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "doc_chunk", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 14:05:09", - "end_time": "2024-10-18 14:05:11", + "start_time": "2024-10-30 18:38:40", + "end_time": "2024-10-30 18:38:40", "status": "success" }, "code": { @@ -18,7 +18,6 @@ "chunking_type": "dl_json", "content_column_name": "contents", "doc_id_column_name": "document_id", - "dl_min_chunk_len": null, "output_chunk_column_name": "contents", "output_source_doc_id_column_name": "source_document_id", "output_jsonpath_column_name": "doc_jsonpath", @@ -35,22 +34,22 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 27.9, + "cpus": 19.5, "gpus": 0, - "memory": 25.75, + "memory": 27.48, "object_store": 0, - "execution time, min": 0.021 + "execution time, min": 0.001 }, "job_output_stats": { "source_files": 1, - "source_size": 50276, + "source_size": 12073, "result_files": 1, - "result_size": 31223, - "processing_time": 1.266, + "result_size": 14363, + "processing_time": 0.043, "nfiles": 1, - "nrows": 88, + "nrows": 39, "source_doc_count": 1, - "result_doc_count": 88 + "result_doc_count": 39 }, "source": { "name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input", diff --git a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet index 06089be78..46714dde7 100644 Binary files a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet and b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet differ diff --git a/transforms/language/doc_chunk/ray/test-data/input/test1.parquet b/transforms/language/doc_chunk/ray/test-data/input/test1.parquet index 4015fccb0..32905aa74 100644 Binary files a/transforms/language/doc_chunk/ray/test-data/input/test1.parquet and b/transforms/language/doc_chunk/ray/test-data/input/test1.parquet differ diff --git a/transforms/language/doc_chunk/transform.config b/transforms/language/doc_chunk/transform.config index f433f360b..1df42f298 100644 --- a/transforms/language/doc_chunk/transform.config +++ b/transforms/language/doc_chunk/transform.config @@ -14,7 +14,7 @@ TRANSFORM_NAME=doc_chunk # # If you change the versions numbers, be sure to run "make set-versions" to # update version numbers across the transform (e.g., pyproject.toml). -DOC_CHUNK_PYTHON_VERSION=$(DPK_VERSION) +DOC_CHUNK_PYTHON_VERSION=0.3.0 DOC_CHUNK_RAY_VERSION=$(DOC_CHUNK_PYTHON_VERSION) DOC_CHUNK_SPARK_VERSION=$(DOC_CHUNK_PYTHON_VERSION) diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index cfb443f16..8992f1145 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -39,8 +39,11 @@ def compute_exec_params_func( runtime_pipeline_id: str, runtime_job_id: str, runtime_code_location: dict, + pdf2parquet_batch_size: int, pdf2parquet_do_table_structure: bool, pdf2parquet_do_ocr: bool, + pdf2parquet_ocr_engine: str, + pdf2parquet_bitmap_area_threshold: float, ) -> dict: from runtime_utils import KFPUtils @@ -53,8 +56,11 @@ def compute_exec_params_func( "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), + "pdf2parquet_batch_size": pdf2parquet_batch_size, "pdf2parquet_do_table_structure": pdf2parquet_do_table_structure, "pdf2parquet_do_ocr": pdf2parquet_do_ocr, + "pdf2parquet_ocr_engine": pdf2parquet_ocr_engine, + "pdf2parquet_bitmap_area_threshold": pdf2parquet_bitmap_area_threshold, } @@ -112,8 +118,11 @@ def pdf2parquet( runtime_pipeline_id: str = "pipeline_id", runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, # pdf2parquet parameters + pdf2parquet_batch_size: int = -1, pdf2parquet_do_table_structure: bool = True, pdf2parquet_do_ocr: bool = False, + pdf2parquet_ocr_engine: str = "easyocr", + pdf2parquet_bitmap_area_threshold: float = 0.05, # additional parameters additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', ): @@ -150,8 +159,11 @@ def pdf2parquet( :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location + :param pdf2parquet_batch_size - how many inputs to batch into one output table :param pdf2parquet_do_table_structure - run table structure model :param pdf2parquet_do_ocr - run ocr model + :param pdf2parquet_ocr_engine - which ocr engine + :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps :return: None """ # create clean_up task @@ -169,8 +181,11 @@ def pdf2parquet( runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, + pdf2parquet_batch_size=pdf2parquet_batch_size, pdf2parquet_do_table_structure=pdf2parquet_do_table_structure, pdf2parquet_do_ocr=pdf2parquet_do_ocr, + pdf2parquet_ocr_engine=pdf2parquet_ocr_engine, + pdf2parquet_bitmap_area_threshold=pdf2parquet_bitmap_area_threshold, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 1905ee17c..c9cdbf652 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -40,8 +40,11 @@ def compute_exec_params_func( runtime_pipeline_id: str, runtime_job_id: str, runtime_code_location: dict, + pdf2parquet_batch_size: int, pdf2parquet_do_table_structure: bool, pdf2parquet_do_ocr: bool, + pdf2parquet_ocr_engine: str, + pdf2parquet_bitmap_area_threshold: float, ) -> dict: from runtime_utils import KFPUtils @@ -55,8 +58,11 @@ def compute_exec_params_func( "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), + "pdf2parquet_batch_size": pdf2parquet_batch_size, "pdf2parquet_do_table_structure": pdf2parquet_do_table_structure, "pdf2parquet_do_ocr": pdf2parquet_do_ocr, + "pdf2parquet_ocr_engine": pdf2parquet_ocr_engine, + "pdf2parquet_bitmap_area_threshold": pdf2parquet_bitmap_area_threshold, } @@ -116,8 +122,11 @@ def pdf2parquet( runtime_pipeline_id: str = "pipeline_id", runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, # pdf2parquet parameters + pdf2parquet_batch_size: int = -1, pdf2parquet_do_table_structure: bool = True, pdf2parquet_do_ocr: bool = False, + pdf2parquet_ocr_engine: str = "easyocr", + pdf2parquet_bitmap_area_threshold: float = 0.05, # additional parameters additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', ): @@ -154,8 +163,11 @@ def pdf2parquet( :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location + :param pdf2parquet_batch_size - how many inputs to batch into one output table :param pdf2parquet_do_table_structure - run table structure model :param pdf2parquet_do_ocr - run ocr model + :param pdf2parquet_ocr_engine - which ocr engine + :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps :return: None """ # create clean_up task @@ -174,8 +186,11 @@ def pdf2parquet( runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, + pdf2parquet_batch_size=pdf2parquet_batch_size, pdf2parquet_do_table_structure=pdf2parquet_do_table_structure, pdf2parquet_do_ocr=pdf2parquet_do_ocr, + pdf2parquet_ocr_engine=pdf2parquet_ocr_engine, + pdf2parquet_bitmap_area_threshold=pdf2parquet_bitmap_area_threshold, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster diff --git a/transforms/language/pdf2parquet/python/Dockerfile b/transforms/language/pdf2parquet/python/Dockerfile index 948d126e1..7d6f80502 100644 --- a/transforms/language/pdf2parquet/python/Dockerfile +++ b/transforms/language/pdf2parquet/python/Dockerfile @@ -32,7 +32,7 @@ RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e . # Download models RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' -RUN python -c 'from docling.document_converter import DocumentConverter; s=DocumentConverter.download_models_hf(); print(f"Models cached in {s}")' +RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")' # copy the main() entry point to the image COPY --chown=dpk:root src/pdf2parquet_transform.py ./ diff --git a/transforms/language/pdf2parquet/python/README.md b/transforms/language/pdf2parquet/python/README.md index 4ba825530..a4bd31e06 100644 --- a/transforms/language/pdf2parquet/python/README.md +++ b/transforms/language/pdf2parquet/python/README.md @@ -1,9 +1,21 @@ # Ingest PDF to Parquet -This tranforms iterate through PDF files or zip of PDF files and generates parquet files -containing the converted document in Markdown format. +This tranforms iterate through document files or zip of files and generates parquet files +containing the converted document in Markdown or JSON format. The PDF conversion is using the [Docling package](https://github.com/DS4SD/docling). +The Docling configuration in DPK is tuned for best results when running large batch ingestions. +For more details on the multiple configuration options, please refer to the official [Docling documentation](https://ds4sd.github.io/docling/). + +This transform supports the following input formats: + +- PDF documents +- DOCX documents +- PPTX presentations +- Image files (png, jpeg, etc) +- HTML pages +- Markdown documents +- ASCII Docs documents ## Output format @@ -17,6 +29,7 @@ with the addition of the following columns "filename": "string", // the basename of the PDF file "contents": "string", // the content of the PDF "document_id": "string", // the document id, a random uuid4 + "document_hash": "string", // the document hash of the input content "ext": "string", // the detected file extension "hash": "string", // the hash of the `contents` column "size": "string", // the size of `contents` @@ -35,10 +48,14 @@ The transform can be initialized with the following parameters. | Parameter | Default | Description | |------------|----------|--------------| +| `batch_size` | -1 | Number of documents to be saved in the same result table. A value of -1 will generate one result file for each input file. | | `artifacts_path` | | Path where to Docling models artifacts are located, if unset they will be downloaded and fetched from the [HF_HUB_CACHE](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache) folder. | -| `contents_type` | `text/markdown` | The output type for the `contents` column. Valid types are `text/markdown` and `application/json`. | +| `contents_type` | `text/markdown` | The output type for the `contents` column. Valid types are `text/markdown`, `text/plain` and `application/json`. | | `do_table_structure` | `True` | If true, detected tables will be processed with the table structure model. | | `do_ocr` | `True` | If true, optical character recognition (OCR) will be used to read the content of bitmap parts of the document. | +| `ocr_engine` | `easyocr` | The OCR engine to use. Valid values are `easyocr`, `tesseract`, `tesseract_cli`. | +| `bitmap_area_threshold` | `0.05` | Threshold for running OCR on bitmap figures embedded in document. The threshold is computed as the fraction of the area covered by the bitmap, compared to the whole page area. | +| `pdf_backend` | `dlparse_v2` | The PDF backend to use. Valid values are `dlparse_v2`, `dlparse_v1`, `pypdfium2`. | | `double_precision` | `8` | If set, all floating points (e.g. bounding boxes) are rounded to this precision. For tests it is advised to use 0. | When invoking the CLI, the parameters must be set as `--pdf2parquet_`, e.g. `--pdf2parquet_do_ocr=true`. diff --git a/transforms/language/pdf2parquet/python/pyproject.toml b/transforms/language/pdf2parquet/python/pyproject.toml index 56c1087fb..f19b8e049 100644 --- a/transforms/language/pdf2parquet/python/pyproject.toml +++ b/transforms/language/pdf2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pdf2parquet_transform_python" -version = "0.2.2.dev2" +version = "0.3.0" requires-python = ">=3.10,<3.13" description = "PDF2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 7346d4fc3..2912af252 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,6 +1,6 @@ data-prep-toolkit==0.2.2.dev2 -docling-core==1.7.2 -docling-ibm-models==2.0.0 -deepsearch-glm==0.22.0 -docling==1.20.0 +docling-core==2.3.0 +docling-ibm-models==2.0.3 +deepsearch-glm==0.26.1 +docling==2.3.1 filetype >=1.2.0, <2.0.0 diff --git a/transforms/language/pdf2parquet/python/src/pdf2parquet_local_python.py b/transforms/language/pdf2parquet/python/src/pdf2parquet_local_python.py index b6d450fc8..99b0b7f84 100644 --- a/transforms/language/pdf2parquet/python/src/pdf2parquet_local_python.py +++ b/transforms/language/pdf2parquet/python/src/pdf2parquet_local_python.py @@ -32,13 +32,14 @@ params = { # Data access. Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), - "data_files_to_use": ast.literal_eval("['.pdf','.zip']"), + "data_files_to_use": ast.literal_eval("['.pdf','.docx','.pptx','.zip']"), # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_code_location": ParamsUtils.convert_to_ast(code_location), # pdf2parquet params "pdf2parquet_double_precision": 0, + # "pdf2parquet_batch_size": 10, # "pdf2parquet_do_table_structure": False, # "pdf2parquet_do_ocr": False, # "pdf2parquet_contents_type": "text/markdown", diff --git a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py index 1ca559d33..0f5de10c0 100644 --- a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py +++ b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py @@ -19,7 +19,7 @@ from argparse import ArgumentParser, Namespace from datetime import datetime from pathlib import Path -from typing import Any +from typing import Any, Optional import filetype import pandas as pd @@ -27,42 +27,86 @@ from data_processing.transform import AbstractBinaryTransform, TransformConfiguration from data_processing.utils import TransformUtils, get_logger, str2bool from data_processing.utils.cli_utils import CLIArgumentProvider -from docling.datamodel.base_models import DocumentStream -from docling.datamodel.document import ConvertedDocument, DocumentConversionInput -from docling.document_converter import DocumentConverter -from docling.pipeline.standard_model_pipeline import PipelineOptions +from data_processing.utils.multilock import MultiLock +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.datamodel.base_models import DocumentStream, MimeTypeToFormat +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + OcrOptions, + PdfPipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) +from docling.document_converter import DocumentConverter, InputFormat, PdfFormatOption +from docling.models.base_ocr_model import OcrOptions logger = get_logger(__name__) +# logger = get_logger(__name__, level="DEBUG") shortname = "pdf2parquet" cli_prefix = f"{shortname}_" +pdf2parquet_batch_size_key = f"batch_size" pdf2parquet_artifacts_path_key = f"artifacts_path" pdf2parquet_contents_type_key = f"contents_type" pdf2parquet_do_table_structure_key = f"do_table_structure" pdf2parquet_do_ocr_key = f"do_ocr" +pdf2parquet_ocr_engine_key = f"ocr_engine" +pdf2parquet_bitmap_area_threshold_key = f"bitmap_area_threshold" +pdf2parquet_pdf_backend_key = f"pdf_backend" pdf2parquet_double_precision_key = f"double_precision" class pdf2parquet_contents_types(str, enum.Enum): MARKDOWN = "text/markdown" + TEXT = "text/plain" JSON = "application/json" def __str__(self): return str(self.value) +class pdf2parquet_pdf_backend(str, enum.Enum): + PYPDFIUM2 = "pypdfium2" + DLPARSE_V1 = "dlparse_v1" + DLPARSE_V2 = "dlparse_v2" + + def __str__(self): + return str(self.value) + + +class pdf2parquet_ocr_engine(str, enum.Enum): + EASYOCR = "easyocr" + TESSERACT_CLI = "tesseract_cli" + TESSERACT = "tesseract" + + def __str__(self): + return str(self.value) + + +pdf2parquet_batch_size_default = -1 pdf2parquet_contents_type_default = pdf2parquet_contents_types.MARKDOWN pdf2parquet_do_table_structure_default = True pdf2parquet_do_ocr_default = True +pdf2parquet_bitmap_area_threshold_default = 0.05 +pdf2parquet_ocr_engine_default = pdf2parquet_ocr_engine.EASYOCR +pdf2parquet_pdf_backend_default = pdf2parquet_pdf_backend.DLPARSE_V2 pdf2parquet_double_precision_default = 8 +pdf2parquet_batch_size_cli_param = f"{cli_prefix}{pdf2parquet_batch_size_key}" pdf2parquet_artifacts_path_cli_param = f"{cli_prefix}{pdf2parquet_artifacts_path_key}" pdf2parquet_contents_type_cli_param = f"{cli_prefix}{pdf2parquet_contents_type_key}" pdf2parquet_do_table_structure_cli_param = ( f"{cli_prefix}{pdf2parquet_do_table_structure_key}" ) pdf2parquet_do_ocr_cli_param = f"{cli_prefix}{pdf2parquet_do_ocr_key}" +pdf2parquet_bitmap_area_threshold__cli_param = ( + f"{cli_prefix}{pdf2parquet_bitmap_area_threshold_key}" +) +pdf2parquet_ocr_engine_cli_param = f"{cli_prefix}{pdf2parquet_ocr_engine_key}" +pdf2parquet_pdf_backend_cli_param = f"{cli_prefix}{pdf2parquet_pdf_backend_key}" pdf2parquet_double_precision_cli_param = ( f"{cli_prefix}{pdf2parquet_double_precision_key}" ) @@ -81,6 +125,7 @@ def __init__(self, config: dict): super().__init__(config) + self.batch_size = config.get(pdf2parquet_batch_size_key, pdf2parquet_batch_size_default) self.artifacts_path = config.get(pdf2parquet_artifacts_path_key, None) if self.artifacts_path is not None: self.artifacts_path = Path(self.artifacts_path) @@ -93,18 +138,75 @@ def __init__(self, config: dict): pdf2parquet_do_table_structure_key, pdf2parquet_do_table_structure_default ) self.do_ocr = config.get(pdf2parquet_do_ocr_key, pdf2parquet_do_ocr_default) + self.ocr_engine_name = config.get( + pdf2parquet_ocr_engine_key, pdf2parquet_ocr_engine_default + ) + if not isinstance(self.ocr_engine_name, pdf2parquet_ocr_engine): + self.ocr_engine_name = pdf2parquet_ocr_engine[self.ocr_engine_name] + self.bitmap_area_threshold = config.get( + pdf2parquet_bitmap_area_threshold_key, + pdf2parquet_bitmap_area_threshold_default, + ) + self.pdf_backend_name = config.get( + pdf2parquet_pdf_backend_key, pdf2parquet_pdf_backend_default + ) + if not isinstance(self.pdf_backend_name, pdf2parquet_pdf_backend): + self.pdf_backend_name = pdf2parquet_pdf_backend[self.pdf_backend_name] self.double_precision = config.get( pdf2parquet_double_precision_key, pdf2parquet_double_precision_default ) logger.info("Initializing models") - pipeline_options = PipelineOptions( + pipeline_options = PdfPipelineOptions( + artifacts_path=self.artifacts_path, do_table_structure=self.do_table_structure, do_ocr=self.do_ocr, + ocr_options=self._get_ocr_engine(self.ocr_engine_name), ) - self._converter = DocumentConverter( - artifacts_path=self.artifacts_path, pipeline_options=pipeline_options - ) + pipeline_options.ocr_options.bitmap_area_threshold = self.bitmap_area_threshold + + lock = MultiLock("dpk_pdf2parquet_init") + try: + logger.debug( + f"Going to acquire lock {lock.lock_filename} for synchronizing global filesystem operations." + ) + locked = lock.acquire() + logger.debug(f"Lock {lock.lock_filename} acquired.") + + self._converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + backend=self._get_pdf_backend(self.pdf_backend_name), + ) + } + ) + self._converter.initialize_pipeline(InputFormat.PDF) + finally: + lock.release() + logger.debug(f"Lock {lock.lock_filename} released.") + + self.buffer = [] + + def _get_ocr_engine(self, engine_name: pdf2parquet_ocr_engine) -> OcrOptions: + if engine_name == pdf2parquet_ocr_engine.EASYOCR: + return EasyOcrOptions() + elif engine_name == pdf2parquet_ocr_engine.TESSERACT_CLI: + return TesseractCliOcrOptions() + elif engine_name == pdf2parquet_ocr_engine.TESSERACT: + return TesseractOcrOptions() + + raise RuntimeError(f"Unknown OCR engine `{engine_name}`") + + def _get_pdf_backend(self, backend_name: pdf2parquet_pdf_backend): + if backend_name == pdf2parquet_pdf_backend.DLPARSE_V1: + return DoclingParseDocumentBackend + elif backend_name == pdf2parquet_pdf_backend.DLPARSE_V2: + return DoclingParseV2DocumentBackend + elif backend_name == pdf2parquet_pdf_backend.PYPDFIUM2: + return PyPdfiumDocumentBackend + + raise RuntimeError(f"Unknown PDF backend `{backend_name}`") def _update_metrics(self, num_pages: int, elapse_time: float): # This is implemented in the ray version @@ -116,26 +218,26 @@ def _convert_pdf2parquet( # Convert PDF to Markdown start_time = time.time() buf = io.BytesIO(content_bytes) - input_docs = DocumentStream(filename=doc_filename, stream=buf) - input = DocumentConversionInput.from_streams([input_docs]) + input_doc = DocumentStream(name=doc_filename, stream=buf) - converted_docs = self._converter.convert(input) - doc: ConvertedDocument = next(converted_docs, None) - if doc is None or doc.output is None: - raise RuntimeError("Failed in converting.") + conv_res = self._converter.convert(input_doc) + doc = conv_res.document elapse_time = time.time() - start_time if self.contents_type == pdf2parquet_contents_types.MARKDOWN: - content_string = doc.render_as_markdown() + content_string = doc.export_to_markdown() + elif self.contents_type == pdf2parquet_contents_types.TEXT: + content_string = doc.export_to_text() elif self.contents_type == pdf2parquet_contents_types.JSON: content_string = pd.io.json.ujson_dumps( - doc.render_as_dict(), double_precision=self.double_precision + doc.export_to_dict(), double_precision=self.double_precision ) else: raise RuntimeError(f"Uknown contents_type {self.contents_type}.") num_pages = len(doc.pages) - num_tables = len(doc.output.tables) if doc.output.tables is not None else 0 - num_doc_elements = len(doc.output.main_text) if doc.output.main_text is not None else 0 + num_tables = len(doc.tables) + num_doc_elements = len(doc.texts) + document_hash = doc.origin.binary_hash self._update_metrics(num_pages=num_pages, elapse_time=elapse_time) @@ -146,6 +248,7 @@ def _convert_pdf2parquet( "num_tables": num_tables, "num_doc_elements": num_doc_elements, "document_id": str(uuid.uuid4()), + "document_hash": document_hash, "ext": ext, "hash": TransformUtils.str_to_hash(content_string), "size": len(content_string), @@ -165,18 +268,20 @@ def transform_binary( for each PDF file detected in the archive. """ - data = [] + data = [*self.buffer] success_doc_id = [] failed_doc_id = [] skipped_doc_id = [] number_of_rows = 0 try: + # TODO: Docling has an inner-function with a stronger type checking. + # Once it is exposed as public, we can use it here as well. root_kind = filetype.guess(byte_array) - # Process single PDF documents - if root_kind is not None and root_kind.mime == "application/pdf": - logger.debug(f"Detected root file {file_name=} as PDF.") + # Process single documents + if root_kind is not None and root_kind.mime in MimeTypeToFormat: + logger.debug(f"Detected root file {file_name=} as {root_kind.mime}.") try: root_ext = root_kind.extension @@ -195,10 +300,10 @@ def transform_binary( except Exception as e: failed_doc_id.append(file_name) logger.warning( - f"Exception {str(e)} processing file {archive_doc_filename}, skipping" + f"Exception {str(e)} processing file {file_name}, skipping" ) - # Process ZIP archive of PDF documents + # Process ZIP archive of documents elif root_kind is not None and root_kind.mime == "application/zip": logger.debug( f"Detected root file {file_name=} as ZIP. Iterating through the archive content." @@ -218,9 +323,9 @@ def transform_binary( # Detect file type kind = filetype.guess(content_bytes) - if kind is None or kind.mime != "application/pdf": + if kind is None or kind.mime not in MimeTypeToFormat: logger.info( - f"File {archive_doc_filename=} is not detected as PDF but {kind=}. Skipping." + f"File {archive_doc_filename=} is not detected as valid format {kind=}. Skipping." ) skipped_doc_id.append(archive_doc_filename) continue @@ -248,23 +353,53 @@ def transform_binary( else: logger.warning( - f"File {file_name=} is not detected as PDF nor as ZIP but {kind=}. Skipping." + f"File {file_name=} is not detected as a supported type nor as ZIP but {kind=}. Skipping." ) - table = pa.Table.from_pylist(data) + metadata = { - "nrows": len(table), + "nrows": number_of_rows, "nsuccess": len(success_doc_id), "nfail": len(failed_doc_id), "nskip": len(skipped_doc_id), } - return [ - (TransformUtils.convert_arrow_to_binary(table=table), ".parquet") - ], metadata + + batch_results = [] + self.buffer = [] + if self.batch_size <= 0: + # we do a single batch + table = pa.Table.from_pylist(data) + batch_results.append((TransformUtils.convert_arrow_to_binary(table=table), ".parquet")) + else: + # we create result files containing batch_size rows/documents + num_left = len(data) + start_row = 0 + while num_left >= self.batch_size: + table = pa.Table.from_pylist(data[start_row:self.batch_size]) + batch_results.append((TransformUtils.convert_arrow_to_binary(table=table), ".parquet")) + + start_row += self.batch_size + num_left = num_left - self.batch_size + + if num_left >= 0: + self.buffer = data[start_row:] + + return batch_results, metadata except Exception as e: logger.error(f"Fatal error with file {file_name=}. No results produced.") raise + def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + result = [] + if len(self.buffer) > 0: + logger.debug(f"flushing buffered table with {len(self.buffer)} rows.") + table = pa.Table.from_pylist(self.buffer) + result.append((TransformUtils.convert_arrow_to_binary(table=table), ".parquet")) + self.buffer = None + else: + logger.debug(f"Empty buffer. nothing to flush.") + return result, {} + class Pdf2ParquetTransformConfiguration(TransformConfiguration): """ @@ -286,6 +421,12 @@ def add_input_params(self, parser: ArgumentParser) -> None: By convention a common prefix should be used for all mutator-specific CLI args (e.g, noop_, pii_, etc.) """ + parser.add_argument( + f"--{pdf2parquet_batch_size_cli_param}", + type=int, + help="Number of documents to be saved in the same result table. A value of -1 will generate one result file for each input file.", + default=pdf2parquet_batch_size_default, + ) parser.add_argument( f"--{pdf2parquet_artifacts_path_cli_param}", type=str, @@ -311,6 +452,26 @@ def add_input_params(self, parser: ArgumentParser) -> None: help="If true, optical character recognition (OCR) will be used to read the PDF content.", default=pdf2parquet_do_ocr_default, ) + parser.add_argument( + f"--{pdf2parquet_ocr_engine_cli_param}", + type=pdf2parquet_ocr_engine, + choices=list(pdf2parquet_ocr_engine), + help="The OCR engine to use.", + default=pdf2parquet_ocr_engine.EASYOCR, + ) + parser.add_argument( + f"--{pdf2parquet_bitmap_area_threshold__cli_param}", + type=float, + help="Threshold for running OCR on bitmap figures embedded in document. The threshold is computed as the fraction of the area covered by the bitmap, compared to the whole page area.", + default=pdf2parquet_bitmap_area_threshold_default, + ) + parser.add_argument( + f"--{pdf2parquet_pdf_backend_cli_param}", + type=pdf2parquet_pdf_backend, + choices=list(pdf2parquet_pdf_backend), + help="The PDF backend to use.", + default=pdf2parquet_pdf_backend.DLPARSE_V2, + ) parser.add_argument( f"--{pdf2parquet_double_precision_cli_param}", type=int, diff --git a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet index 9975c3608..907fb3803 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json index 704a86d8e..b9a535098 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 06:02:44", - "end_time": "2024-10-18 06:03:04", + "start_time": "2024-10-29 14:17:59", + "end_time": "2024-10-29 14:18:05", "status": "success" }, "code": { @@ -19,6 +19,9 @@ "contents_type": "text/markdown", "do_table_structure": true, "do_ocr": true, + "ocr_engine": "easyocr", + "bitmap_area_threshold": 0.05, + "pdf_backend": "dlparse_v2", "double_precision": 0, "checkpointing": false, "max_files": -1, @@ -30,18 +33,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 29.2, + "cpus": 16.8, "gpus": 0, - "memory": 29.7, + "memory": 31.22, "object_store": 0, - "execution time, min": 0.329 + "execution time, min": 0.108 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 32086, - "processing_time": 5.981, + "result_size": 33044, + "processing_time": 6.478, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet index f70a89278..39613b1d1 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json new file mode 100644 index 000000000..f8f9ad71a --- /dev/null +++ b/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json @@ -0,0 +1,64 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "pdf2parquet", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-31 13:14:39", + "end_time": "2024-10-31 13:16:41", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "batch_size": 10, + "artifacts_path": null, + "contents_type": "text/markdown", + "do_table_structure": true, + "do_ocr": true, + "ocr_engine": "easyocr", + "bitmap_area_threshold": 0.05, + "pdf_backend": "dlparse_v2", + "double_precision": 0, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + ".pdf", + ".docx", + ".pptx", + ".zip" + ], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 39.0, + "gpus": 0, + "memory": 29.87, + "object_store": 0, + "execution time, min": 2.029 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 605137, + "result_files": 1, + "processing_time": 3.888, + "nrows": 3, + "nsuccess": 3, + "nfail": 0, + "nskip": 0, + "result_size": 27200 + }, + "source": { + "name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet new file mode 100644 index 000000000..3e9ba12c7 Binary files /dev/null and b/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet index 033452371..7f34e1ba8 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json index a38a938a3..04bec2b88 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 06:09:35", - "end_time": "2024-10-18 06:09:44", + "start_time": "2024-10-29 14:20:01", + "end_time": "2024-10-29 14:20:07", "status": "success" }, "code": { @@ -19,6 +19,9 @@ "contents_type": "application/json", "do_table_structure": true, "do_ocr": true, + "ocr_engine": "easyocr", + "bitmap_area_threshold": 0.05, + "pdf_backend": "dlparse_v2", "double_precision": 0, "checkpointing": false, "max_files": -1, @@ -30,18 +33,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 25.3, + "cpus": 18.0, "gpus": 0, - "memory": 29.52, + "memory": 30.77, "object_store": 0, - "execution time, min": 0.138 + "execution time, min": 0.105 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 33227, - "processing_time": 5.64, + "result_size": 22953, + "processing_time": 6.282, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet index 5032919c5..32905aa74 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet index 58bcfcf6f..9fec2cd2d 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json index c276aa899..bf5c9e12a 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 06:09:08", - "end_time": "2024-10-18 06:09:12", + "start_time": "2024-10-29 14:19:30", + "end_time": "2024-10-29 14:19:33", "status": "success" }, "code": { @@ -19,6 +19,9 @@ "contents_type": "text/markdown", "do_table_structure": false, "do_ocr": false, + "ocr_engine": "easyocr", + "bitmap_area_threshold": 0.05, + "pdf_backend": "dlparse_v2", "double_precision": 0, "checkpointing": false, "max_files": -1, @@ -30,18 +33,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 25.5, + "cpus": 17.3, "gpus": 0, - "memory": 27.42, + "memory": 28.85, "object_store": 0, - "execution time, min": 0.066 + "execution time, min": 0.043 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 27574, - "processing_time": 3.448, + "result_size": 29659, + "processing_time": 2.554, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet index 52b40288b..69bc4e421 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py b/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py index 1c5a694fc..132c978ed 100644 --- a/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py +++ b/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py @@ -19,8 +19,7 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from docling_core.types import Document -from docling_core.types.doc.base import BaseText +from docling_core.types.doc import DocItem, DoclingDocument, TextItem from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration from pydantic import ValidationError @@ -35,7 +34,7 @@ def get_test_transform_fixtures(self) -> list[tuple]: basedir = "../test-data" basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) config = { - "data_files_to_use": ast.literal_eval("['.pdf','.zip']"), + "data_files_to_use": ast.literal_eval("['.pdf','.docx','.pptx','.zip']"), "pdf2parquet_double_precision": 0, } @@ -58,6 +57,20 @@ def get_test_transform_fixtures(self) -> list[tuple]: ) ) + # All input in a single parquet + fixtures.append( + ( + launcher, + { + **config, + "pdf2parquet_batch_size": 10, + }, + basedir + "/input", + basedir + "/expected_batch", + ignore_columns, + ) + ) + # No table model and no OCR fixtures.append( ( @@ -130,8 +143,10 @@ def validate_expected_row( # Test for Document type try: - expected_doc = Document.model_validate_json(expected_value) - test_doc = Document.model_validate_json(test_value) + expected_doc = DoclingDocument.model_validate_json( + expected_value + ) + test_doc = DoclingDocument.model_validate_json(test_value) cls.validate_documents( row_index=row_index, table_index=table_index, @@ -164,27 +179,32 @@ def validate_documents( cls, row_index: int, table_index: int, - test_doc: Document, - expected_doc: Document, + test_doc: DoclingDocument, + expected_doc: DoclingDocument, ): msg = f"Row {row_index} of table {table_index} are not equal\n\t" - assert(test_doc.main_text is not None, msg + "Test document has no content") - assert len(test_doc.main_text) == len(expected_doc.main_text), ( + assert len(test_doc.texts) == len(expected_doc.texts), ( msg + f"Main Text lengths do not match." ) - for i in range(len(expected_doc.main_text)): - expected_item = expected_doc.main_text[i] - test_item = test_doc.main_text[i] + for (expected_item, _expected_level), (test_item, _test_level) in zip( + expected_doc.iterate_items(), test_doc.iterate_items() + ): + if not isinstance(expected_item, DocItem): + continue + assert isinstance(test_item, DocItem), msg + "Test item is not a DocItem" # Validate type - assert expected_item.obj_type == test_item.obj_type, ( - msg + f"Object type does not match." + assert expected_item.label == test_item.label, ( + msg + f"Object label does not match." ) # Validate text content - if isinstance(expected_item, BaseText): + if isinstance(expected_item, TextItem): + assert isinstance(test_item, TextItem), ( + msg + "Test item is not a TextItem as the expected one" + ) assert expected_item.text == test_item.text, ( msg + f"Text does not match." ) diff --git a/transforms/language/pdf2parquet/ray/Dockerfile b/transforms/language/pdf2parquet/ray/Dockerfile index 83fb9d1ea..8f738b7f4 100644 --- a/transforms/language/pdf2parquet/ray/Dockerfile +++ b/transforms/language/pdf2parquet/ray/Dockerfile @@ -33,7 +33,8 @@ RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e . # Download models RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' # RUN python -c 'from docling.document_converter import DocumentConverter; from pathlib import Path; DocumentConverter.download_models_hf(local_dir=Path("./artifacts/"));' -RUN python -c 'from docling.document_converter import DocumentConverter; s=DocumentConverter.download_models_hf(); print(f"Models cached in {s}")' +RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")' + # copy the main() entry point to the image COPY --chown=ray:users src/pdf2parquet_transform_ray.py ./ diff --git a/transforms/language/pdf2parquet/ray/pyproject.toml b/transforms/language/pdf2parquet/ray/pyproject.toml index b65fa26a5..5e192c8ac 100644 --- a/transforms/language/pdf2parquet/ray/pyproject.toml +++ b/transforms/language/pdf2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pdf2parquet_transform_ray" -version = "0.2.2.dev2" +version = "0.3.0" requires-python = ">=3.10,<3.13" description = "PDF2PARQUET Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 39553ea56..2b414c59e 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,7 +1,7 @@ -dpk-pdf2parquet-transform-python==0.2.2.dev2 +dpk-pdf2parquet-transform-python==0.3.0 data-prep-toolkit[ray]==0.2.2.dev2 -docling-core==1.7.2 -docling-ibm-models==2.0.0 -deepsearch-glm==0.22.0 -docling==1.20.0 +# docling-core==1.7.2 +# docling-ibm-models==2.0.0 +# deepsearch-glm==0.22.0 +# docling==1.20.0 filetype >=1.2.0, <2.0.0 diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet index 9975c3608..907fb3803 100644 Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json index 704a86d8e..b9a535098 100644 --- a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 06:02:44", - "end_time": "2024-10-18 06:03:04", + "start_time": "2024-10-29 14:17:59", + "end_time": "2024-10-29 14:18:05", "status": "success" }, "code": { @@ -19,6 +19,9 @@ "contents_type": "text/markdown", "do_table_structure": true, "do_ocr": true, + "ocr_engine": "easyocr", + "bitmap_area_threshold": 0.05, + "pdf_backend": "dlparse_v2", "double_precision": 0, "checkpointing": false, "max_files": -1, @@ -30,18 +33,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 29.2, + "cpus": 16.8, "gpus": 0, - "memory": 29.7, + "memory": 31.22, "object_store": 0, - "execution time, min": 0.329 + "execution time, min": 0.108 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 32086, - "processing_time": 5.981, + "result_size": 33044, + "processing_time": 6.478, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet index f70a89278..39613b1d1 100644 Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet differ diff --git a/transforms/transform.config b/transforms/transform.config index afe747c21..91bacf9b4 100644 --- a/transforms/transform.config +++ b/transforms/transform.config @@ -14,4 +14,4 @@ TRANSFORM_NAME=data-prep-kit-transforms # # If you change the versions numbers, be sure to run "make set-versions" to # update version numbers across the transform (e.g., pyproject.toml). -TRANSFORMS_PKG_VERSION=$(DPK_VERSION) +TRANSFORMS_PKG_VERSION=0.3.0