diff --git a/examples/notebooks/intro/dpk_intro_1_python.ipynb b/examples/notebooks/intro/dpk_intro_1_python.ipynb index f3659afcf..ab7cda854 100644 --- a/examples/notebooks/intro/dpk_intro_1_python.ipynb +++ b/examples/notebooks/intro/dpk_intro_1_python.ipynb @@ -13,7 +13,7 @@ "\n", "Here is the workflow\n", "\n", - "![](https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n" + "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n" ] }, { @@ -27,7 +27,7 @@ "\n", "Two options:\n", "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n", "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", "\n", "The notebook will work as in both environments" @@ -42,10 +42,10 @@ "source": [ "## Step-1: Inspect the Data\n", "\n", - "We will use simple PDFs about Solar system. The files are [here](https://github.com/sujee/data-prep-kit/tree/intro-example1/examples/notebooks/intro/input/solar-system)\n", + "We will use simple PDFs about Solar system. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n", "\n", - "- [earth.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf)\n", - "- [mars.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf)\n" + "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n", + "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n" ] }, { @@ -118,9 +118,9 @@ "source": [ "if RUNNING_IN_COLAB:\n", " !mkdir -p 'input/solar-system'\n", - " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf'\n", - " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf'\n", - " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/my_utils.py'" + " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n", + " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n", + " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'" ] }, { diff --git a/examples/notebooks/intro/dpk_intro_1_ray.ipynb b/examples/notebooks/intro/dpk_intro_1_ray.ipynb index da33a3499..b2feb9135 100644 --- a/examples/notebooks/intro/dpk_intro_1_ray.ipynb +++ b/examples/notebooks/intro/dpk_intro_1_ray.ipynb @@ -1,4358 +1,4359 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": 
"841e533d-ebb3-406d-9da7-b19e2c5f5866", - "metadata": { - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" - }, - "source": [ - "# Data Prep Kit Demo 1 - Ray Version\n", - "\n", - "This notebook will introduce DPK and showcase some of it's capabilities.\n", - "\n", - "Here is the workflow\n", - "\n", - "![](https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "b15976e3", - "metadata": { - "id": "b15976e3" - }, - "source": [ - "## How to run this notebook\n", - "\n", - "Two options:\n", - "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n", - "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", - "\n", - "The notebook will work as in both environments" - ] - }, - { - "cell_type": "markdown", - "id": "eb8b0d5c", - "metadata": { - "id": "eb8b0d5c" - }, - "source": [ - "## Step-1: Inspect the Data\n", - "\n", - "We will use simple PDFs about Solar system. 
The files are [here](https://github.com/sujee/data-prep-kit/tree/intro-example1/examples/notebooks/intro/input/solar-system)\n", - "\n", - "- [earth.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf)\n", - "- [mars.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf)\n" - ] - }, - { - "cell_type": "markdown", - "id": "39a0ab6e", - "metadata": { - "id": "39a0ab6e" - }, - "source": [ - "## Step-2: Figure out Runtime Environment\n", - "\n", - "### 2.1 - Determine runtime\n", - "\n", - "Determine if we are running on Google colab or local python environment" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1fe354b7", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1fe354b7", - "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "markdown", - "id": "8e7c104b", - "metadata": { - "id": "8e7c104b" - }, - "source": [ - "### 2.2 -Download Data if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3309799e", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3309799e", - "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " !mkdir -p 'input/solar-system'\n", - " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf'\n", - " !wget -O 'input/solar-system/mars.pdf' 
'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf'\n", - " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/my_utils.py'" - ] - }, - { - "cell_type": "markdown", - "id": "a5dc2b68", - "metadata": { - "id": "a5dc2b68" - }, - "source": [ - "### 2.3 - Install dependencies if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1fcec577", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1fcec577", - "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " ! pip install --default-timeout=100 \\\n", - " data-prep-toolkit==0.2.1 \\\n", - " data-prep-toolkit-transforms==0.2.1 \\\n", - " data-prep-toolkit-transforms-ray==0.2.1 \\\n", - " deepsearch-toolkit" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": { + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" + }, + "source": [ + "# Data Prep Kit Demo 1 - Ray Version\n", + "\n", + "This notebook will introduce DPK and showcase some of its capabilities.\n", + "\n", + "Here is the workflow\n", + "\n", + "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "b15976e3", + "metadata": { + "id": "b15976e3" + }, + "source": [ + "## How to run this notebook\n", + "\n", + "Two options:\n", + "\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n", + "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", + "\n", + "The notebook will work as in both environments" + ] + }, + { + "cell_type": "markdown", + "id": "eb8b0d5c", + "metadata": { + "id": "eb8b0d5c" + }, + "source": [ + "## Step-1: Inspect the Data\n", + "\n", + "We will use simple PDFs about Solar system. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n", + "\n", + "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n", + "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n" + ] + }, + { + "cell_type": "markdown", + "id": "39a0ab6e", + "metadata": { + "id": "39a0ab6e" + }, + "source": [ + "## Step-2: Figure out Runtime Environment\n", + "\n", + "### 2.1 - Determine runtime\n", + "\n", + "Determine if we are running on Google colab or local python environment" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1fe354b7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "1fe354b7", + "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "243322b8", - "metadata": { - "id": "243322b8" - }, - "source": [ - "### 2.4 - Restart Runtime\n", - "\n", - "After installing dependencies, be sure restart runtime, so libraries will be loaded\n", - "\n", - "You do this by going to **`Runtime --> Restart Session`**\n", - "\n", - "Then you can continue to the next step (no need to re-run the notebook)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + 
"\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "8e7c104b", + "metadata": { + "id": "8e7c104b" + }, + "source": [ + "### 2.2 -Download Data if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3309799e", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "id": "e8b10be1", - "metadata": { - "id": "e8b10be1" - }, - "source": [ - "## Step-2: Configuration" - ] + "id": "3309799e", + "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c" + }, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " !mkdir -p 'input/solar-system'\n", + " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n", + " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n", + " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'" + ] + }, + { + "cell_type": "markdown", + "id": "a5dc2b68", + "metadata": { + "id": "a5dc2b68" + }, + "source": [ + "### 2.3 - Install dependencies if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1fcec577", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, - { - "cell_type": "markdown", - "id": "356c66f7", - "metadata": { - "id": "356c66f7" - }, - "source": [ - "### 2.1 - Basic Config" - ] + "id": "1fcec577", + "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10" + }, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " ! 
pip install --default-timeout=100 \\\n", + " data-prep-toolkit==0.2.1 \\\n", + " data-prep-toolkit-transforms==0.2.1 \\\n", + " data-prep-toolkit-transforms-ray==0.2.1 \\\n", + " deepsearch-toolkit" + ] + }, + { + "cell_type": "markdown", + "id": "243322b8", + "metadata": { + "id": "243322b8" + }, + "source": [ + "### 2.4 - Restart Runtime\n", + "\n", + "After installing dependencies, be sure to restart the runtime so the libraries will be loaded\n", + "\n", + "You do this by going to **`Runtime --> Restart Session`**\n", + "\n", + "Then you can continue to the next step (no need to re-run the notebook)" + ] + }, + { + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": { + "id": "e8b10be1" + }, + "source": [ + "## Step-2: Configuration" + ] + }, + { + "cell_type": "markdown", + "id": "356c66f7", + "metadata": { + "id": "356c66f7" + }, + "source": [ + "### 2.1 - Basic Config" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e4YMZrBuFycl", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "e4YMZrBuFycl", + "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "id": "e4YMZrBuFycl", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e4YMZrBuFycl", - "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " 
RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "33345487", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "33345487", + "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "id": "33345487", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "33345487", - "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n", - "MY_CONFIG.RAY_NUM_CPUS: 0.8\n", - "MY_CONFIG.RAY_MEMORY_GB: 2\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "## Configuration\n", - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig ()\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", - "\n", - "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", - "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", - "\n", - "## Embedding model\n", - "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", - "\n", - "## RAY CONFIGURATION\n", - "### For local runs, we can use more parallelism\n", - "### For google colab, be conservative\n", - "\n", - "if RUNNING_IN_COLAB:\n", - " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", - " MY_CONFIG.RAY_NUM_CPUS = 0.3\n", - " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", - "else: # local run\n", - " num_cpus_available = os.cpu_count()\n", - " # print (num_cpus_available)\n", - "\n", - " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", - " MY_CONFIG.RAY_NUM_CPUS = 0.8\n", - " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", - " # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n", - "\n", - "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n", - "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n", - "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n" - ] + "name": "stdout", + 
"output_type": "stream", + "text": [ + "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n", + "MY_CONFIG.RAY_NUM_CPUS: 0.8\n", + "MY_CONFIG.RAY_MEMORY_GB: 2\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "## Configuration\n", + "class MyConfig:\n", + " pass\n", + "\n", + "MY_CONFIG = MyConfig ()\n", + "\n", + "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", + "\n", + "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", + "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", + "\n", + "## Embedding model\n", + "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", + "\n", + "## RAY CONFIGURATION\n", + "### For local runs, we can use more parallelism\n", + "### For google colab, be conservative\n", + "\n", + "if RUNNING_IN_COLAB:\n", + " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", + " MY_CONFIG.RAY_NUM_CPUS = 0.3\n", + " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", + "else: # local run\n", + " num_cpus_available = os.cpu_count()\n", + " # print (num_cpus_available)\n", + "\n", + " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", + " MY_CONFIG.RAY_NUM_CPUS = 0.8\n", + " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", + " # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n", + "\n", + "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n", + "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n", + "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b15e6827", + "metadata": { + "id": "b15e6827" + }, + "outputs": [], + "source": [ + "## Add parent dir to path\n", + "import os,sys\n", + "\n", + "this_dir = os.path.abspath('')\n", + "parent_dir = os.path.dirname(this_dir)\n", + "sys.path.append (os.path.abspath (parent_dir))" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": { + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" + }, + "source": [ + "### 2.2 - Set up input/output directories" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 7, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "outputId": "ec5beb05-027a-49eb-9a96-271471619d81" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "id": "b15e6827", - "metadata": { - "id": "b15e6827" - }, - "outputs": [], - "source": [ - "## Add parent dir to path\n", - "import os,sys\n", - "\n", - "this_dir = os.path.abspath('')\n", - "parent_dir = os.path.dirname(this_dir)\n", - "sys.path.append (os.path.abspath (parent_dir))" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Cleared output directory\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", + " raise Exception (f\"โŒ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", + "\n", + "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", + "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", + "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", + "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", + "\n", + "print (\"โœ… Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + "metadata": { + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" + }, + "source": [ + "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", + "\n", + "This step is 
reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", + "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", + "metadata": { + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" + }, + "source": [ + "### 3.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", - "metadata": { - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" - }, - "source": [ - "### 2.2 - Setup input/outpur directories" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" + ] + } + ], + "source": [ + "STAGE = 1\n", + "\n", + "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", + "output_folder = output_parquet_dir\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": { + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" + }, + "source": [ + "### 3.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "outputId": "14a36e73-a186-4431-a755-f46ccb691130" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "metadata": { 
- "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "outputId": "ec5beb05-027a-49eb-9a96-271471619d81" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Cleared output directory\n" - ] - } - ], - "source": [ - "import os, sys\n", - "import shutil\n", - "\n", - "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", - " raise Exception (f\"โŒ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", - "\n", - "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", - "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", - "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", - "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", - "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", - "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", - "\n", - "## clear output folder\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", - "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", - "\n", - "print (\"โœ… Cleared output directory\")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", + "13:30:44 INFO - pipeline id pipeline_id\n", + "13:30:44 INFO - code location None\n", + "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n", + "13:30:44 INFO - actor creation delay 0\n", + "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system 
output_folder - output/01_parquet_out\n", + "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "13:30:44 INFO - Running locally\n", + "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n", + "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 110376.42it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. 
Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n", + "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 73713.60it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n" + ] }, { - "cell_type": "markdown", - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", - "metadata": { - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" - }, - "source": [ - "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", - "\n", - "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", - "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", - "\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:1 completed successfully\n", + "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n", + "Wall time: 31.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from pdf2parquet_transform import (\n", + " pdf2parquet_contents_type_cli_param,\n", + " pdf2parquet_contents_types,\n", + ")\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", + "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", + "\n", + "from data_processing.utils import GB, ParamsUtils\n", + "\n", + "\n", + "# create parameters\n", + "local_conf = 
{\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": MY_CONFIG.RAY_MEMORY_GB * GB}\n", + "ingest_config = {\n", + " pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n", + "}\n", + "\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": 1, # so model download to cleanup works properly\n", + "\n", + "}\n", + "\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", + "# create launcher\n", + "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", + "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "5ca790e0", + "metadata": { + "id": "5ca790e0" + }, + "source": [ + "### 3.3 - Inspect Generated output\n", + "\n", + "Here we should see one entry per input file processed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fe59563d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 255 }, + "id": "fe59563d", + "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", - "metadata": { - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" - }, - "source": [ - "### 3.1 - Set Input/output Folder" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Output dimensions (rows x columns)= (2, 12)\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...101162e5639f-f922-4ccc-a041-3cb02f1cfd83pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011f3c0ac2e-1de2-472b-8216-2043f3b3e9d1pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdf
\n", + "
" ], - "source": [ - "STAGE = 1\n", - "\n", - "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", - "output_folder = output_parquet_dir\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + "text/plain": [ + " filename contents num_pages \\\n", + "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + "\n", + " num_tables num_doc_elements document_id ext \\\n", + "0 0 11 62e5639f-f922-4ccc-a041-3cb02f1cfd83 pdf \n", + "1 0 11 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.494027 2.015123 earth.pdf " ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(5)\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "e5058a21", + "metadata": { + "id": "e5058a21" + }, + "source": [ + "\n", + "### 3.4 - Understand the output\n", + "\n", + "Here are some interesting attributes to note:\n", + "\n", + "- **filename** : original filename\n", + "- **contents** : text\n", + "- **document_id**: unique id (UUID) assigned to this document\n", + "- **hash** : hash of document\n", + "- **pdf_convert_time** : time to convert this pdf in seconds\n", + "\n", + "Let's inspect the **contents** column. See how the text is being divided up!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f870e624", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "f870e624", + "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", - "metadata": { - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" - }, - "source": [ - "### 3.2 - Execute" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_name': '',\n", + " 'description': {'logs': []},\n", + " 'equations': [],\n", + " 'figures': [],\n", + " 'file-info': {'#-pages': 1,\n", + " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", + " 'filename': 'mars.pdf',\n", + " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", + " 'model': 'default',\n", + " 'page': 1}]},\n", + " 'footnotes': [],\n", + " 'main-text': [{'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.35137939,\n", + " 654.45184326,\n", + " 169.88169861,\n", + " 667.98492432],\n", + " 'page': 1,\n", + " 'span': [0, 4]}],\n", + " 'text': 'Mars',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.09541321,\n", + " 630.68127441,\n", + " 210.66503906,\n", + " 642.34405518],\n", + " 'page': 1,\n", + " 'span': [0, 12]}],\n", + " 'text': 'Solar System',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.84518433,\n", + " 588.96014404,\n", + " 479.40917969,\n", + " 623.02520752],\n", + " 'page': 1,\n", + " 'span': [0, 205]}],\n", + " 'text': 'Our solar system is a vast and fascinating expanse, '\n", + " 'comprising eight planets, five dwarf planets, '\n", + " 'numerous moons, asteroids, comets, and other '\n", + " 'celestial bodies. 
At its center lies the star we call '\n", + " 'the Sun.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.18510437,\n", + " 570.83258057,\n", + " 374.99838257,\n", + " 581.07043457],\n", + " 'page': 1,\n", + " 'span': [0, 54]}],\n", + " 'text': 'For more details about the Solar system see Chapter '\n", + " '1.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.22866821,\n", + " 542.98168945,\n", + " 163.86282349,\n", + " 554.45288086],\n", + " 'page': 1,\n", + " 'span': [0, 4]}],\n", + " 'text': 'Mars',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.87440491,\n", + " 500.84011841,\n", + " 477.48345947,\n", + " 534.55810547],\n", + " 'page': 1,\n", + " 'span': [0, 196]}],\n", + " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", + " 'desert world with a thin atmosphere composed '\n", + " 'primarily of carbon dioxide. Its reddish hue comes '\n", + " 'from iron oxide, or rust, prevalent on its surface.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.2026062,\n", + " 482.90710449,\n", + " 237.04431152,\n", + " 493.07443237],\n", + " 'page': 1,\n", + " 'span': [0, 23]}],\n", + " 'text': 'Basic facts about Mars:',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 453.019104,\n", + " 477.48171997,\n", + " 474.9703064],\n", + " 'page': 1,\n", + " 'span': [0, 78]}],\n", + " 'text': 'ยท Distance from the Sun: Average of 228 million '\n", + " 'kilometers (142 million miles)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 440.79351807,\n", + " 431.73287964,\n", + " 451.2142334],\n", + " 'page': 1,\n", + " 'span': [0, 64]}],\n", + " 'text': 'ยท Rotation Period: 24.6 hours (one Martian day - '\n", + " 'called a \"sol\")',\n", + " 'type': 'paragraph'},\n", + " 
{'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 429.10913086,\n", + " 365.9559021,\n", + " 438.83737183],\n", + " 'page': 1,\n", + " 'span': [0, 44]}],\n", + " 'text': 'ยท Moons: Two small moons, Phobos and Deimos.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Page-footer',\n", + " 'prov': [{'bbox': [303.13299561,\n", + " 87.20314026,\n", + " 308.11428833,\n", + " 96.51646423],\n", + " 'page': 1,\n", + " 'span': [0, 1]}],\n", + " 'text': '1',\n", + " 'type': 'page-footer'}],\n", + " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", + " 'page-footers': [],\n", + " 'page-headers': [],\n", + " 'tables': [],\n", + " 'type': 'pdf-document'}\n" + ] + } + ], + "source": [ + "import pprint\n", + "import json\n", + "\n", + "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", + "# json.loads(output_df.iloc[0, ]['contents'])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e1a10c2d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "e1a10c2d", + "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "outputId": "14a36e73-a186-4431-a755-f46ccb691130" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", - "13:30:44 INFO - pipeline id pipeline_id\n", - "13:30:44 INFO - code location None\n", - "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n", - "13:30:44 INFO - actor creation delay 0\n", - "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 
'job type': 'ray', 'job id': 'job_id'}\n", - "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", - "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "13:30:44 INFO - Running locally\n", - "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n", - "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 110376.42it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. 
Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n", - "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 73713.60it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:1 completed successfully\n", - "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n", - "Wall time: 31.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from pdf2parquet_transform import (\n", - " pdf2parquet_contents_type_cli_param,\n", - " pdf2parquet_contents_types,\n", - ")\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", - "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", - "\n", - "from data_processing.utils import GB, ParamsUtils\n", - "\n", - "\n", - "# create parameters\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": MY_CONFIG.RAY_MEMORY_GB * GB}\n", - "ingest_config = {\n", - " pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n", - "}\n", - "\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - "}\n", - "\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", - "# create launcher\n", - "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", - "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_name': '',\n", + " 'description': {'logs': []},\n", + " 'equations': [],\n", + " 'figures': [],\n", + " 'file-info': {'#-pages': 1,\n", + " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", + " 'filename': 'earth.pdf',\n", + " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", + " 'model': 'default',\n", + " 'page': 1}]},\n", + " 'footnotes': [],\n", + " 'main-text': [{'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.30961609,\n", + " 654.45184326,\n", + " 174.04208374,\n", + " 667.93347168],\n", + " 'page': 1,\n", + " 'span': [0, 5]}],\n", + " 'text': 'Earth',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.12528992,\n", + " 630.69073486,\n", + " 210.66503906,\n", + " 642.27935791],\n", + " 'page': 1,\n", + " 'span': [0, 12]}],\n", + " 'text': 'Solar System',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.87112427,\n", + " 588.96014404,\n", + " 479.40917969,\n", + " 
623.04595947],\n", + " 'page': 1,\n", + " 'span': [0, 205]}],\n", + " 'text': 'Our solar system is a vast and fascinating expanse, '\n", + " 'comprising eight planets, five dwarf planets, '\n", + " 'numerous moons, asteroids, comets, and other '\n", + " 'celestial bodies. At its center lies the star we call '\n", + " 'the Sun.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.20942688,\n", + " 570.81555176,\n", + " 375.57919312,\n", + " 581.08459473],\n", + " 'page': 1,\n", + " 'span': [0, 54]}],\n", + " 'text': 'For more details about our Solar system see Chapter '\n", + " '1.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.15542603,\n", + " 542.98168945,\n", + " 167.32983398,\n", + " 554.36669922],\n", + " 'page': 1,\n", + " 'span': [0, 5]}],\n", + " 'text': 'Earth',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.91053772,\n", + " 512.46295166,\n", + " 477.84887695,\n", + " 534.48431396],\n", + " 'page': 1,\n", + " 'span': [0, 107]}],\n", + " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", + " 'planet. 
Earth is the only place we know of with life.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.30151367,\n", + " 494.86206055,\n", + " 240.17156982,\n", + " 505.07229614],\n", + " 'page': 1,\n", + " 'span': [0, 24]}],\n", + " 'text': 'Basic facts about Earth:',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 464.97409058,\n", + " 477.47979736,\n", + " 487.02810669],\n", + " 'page': 1,\n", + " 'span': [0, 79]}],\n", + " 'text': 'ยท Distance from the Sun: Average of 149.6 million '\n", + " 'kilometers (93 million miles)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 452.86901855,\n", + " 317.90722656,\n", + " 463.24041748],\n", + " 'page': 1,\n", + " 'span': [0, 37]}],\n", + " 'text': 'ยท Rotation Period: 24 hours (one day)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 440.71496582,\n", + " 396.66357422,\n", + " 451.19915771],\n", + " 'page': 1,\n", + " 'span': [0, 52]}],\n", + " 'text': 'ยท Moons: One moon, called Luna or simply \"the Moon\".',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Page-footer',\n", + " 'prov': [{'bbox': [303.13299561,\n", + " 87.20314026,\n", + " 308.11428833,\n", + " 96.53633118],\n", + " 'page': 1,\n", + " 'span': [0, 1]}],\n", + " 'text': '1',\n", + " 'type': 'page-footer'}],\n", + " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", + " 'page-footers': [],\n", + " 'page-headers': [],\n", + " 'tables': [],\n", + " 'type': 'pdf-document'}\n" + ] + } + ], + "source": [ + "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" + ] + }, + { + "cell_type": "markdown", + "id": "72274586", + "metadata": { + "id": "72274586" + }, + "source": [ + "## Step-4: Doc chunks\n", + "\n", + "In the previous step, we have extracted text from oru PDFs. 
But we have the content of entire file as 'one row' in our parquet output.\n", + "\n", + "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", + "\n", + "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", + "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", + "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", + "which provides the required JSON structure." + ] + }, + { + "cell_type": "markdown", + "id": "96198fa6", + "metadata": { + "id": "96198fa6" + }, + "source": [ + "### 4.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "305f00a3", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "305f00a3", + "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "5ca790e0", - "metadata": { - "id": "5ca790e0" - }, - "source": [ - "### 3.3 - Inspect Generated output\n", - "\n", - "Here we should see one entry per input file processed." 
- ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" + ] + } + ], + "source": [ + "STAGE = 2\n", + "\n", + "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_chunk_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "369f2cd1", + "metadata": { + "id": "369f2cd1" + }, + "source": [ + "### 4.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5b7b18d5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "5b7b18d5", + "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "id": "fe59563d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 - }, - "id": "fe59563d", - "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Output dimensions (rows x columns)= (2, 12)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...101162e5639f-f922-4ccc-a041-3cb02f1cfd83pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011f3c0ac2e-1de2-472b-8216-2043f3b3e9d1pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdf
\n", - "
" - ], - "text/plain": [ - " filename contents num_pages \\\n", - "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "\n", - " num_tables num_doc_elements document_id ext \\\n", - "0 0 11 62e5639f-f922-4ccc-a041-3cb02f1cfd83 pdf \n", - "1 0 11 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.494027 2.015123 earth.pdf " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(5)\n", - "\n", - "## To display certain columns\n", - "#parquet_df[['column1', 'column2', 'column3']].head(5)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", + "13:31:12 INFO - pipeline id pipeline_id\n", + "13:31:12 INFO - code location None\n", + "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:12 INFO - actor creation delay 0\n", + "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", + 
"13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", + "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:12 INFO - Running locally\n", + "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n", + "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n" + ] }, { - "cell_type": "markdown", - "id": "e5058a21", - "metadata": { - "id": "e5058a21" - }, - "source": [ - "\n", - "### 3.4 - Understand the output\n", - "\n", - "Here are some interesting attributes to note:\n", - "\n", - "- **filename** : original filename\n", - "- **contents** : text\n", - "- **document_id**: unique id (UUID) assignd to this document\n", - "- **hash** : hash of document\n", - "- **pdf_convert_time** : time to convert this pdf in seconds\n", - "\n", - "Let's inspect the **contents** column. See how the text is being divided up!" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:2 completed successfully\n", + "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n", + "Wall time: 18.9 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # doc_chunk arguments\n", + " # ...\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "213afdf6", + "metadata": { + "id": "213afdf6" + }, + "source": [ + "### 4.3 - Inspect Generated output\n", + "\n", + "We would see documents are split into many chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d8138d43", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 897 }, + "id": "d8138d43", + "outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "id": "f870e624", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f870e624", - "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", - " 'filename': 'mars.pdf',\n", - " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': 
[{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.35137939,\n", - " 654.45184326,\n", - " 169.88169861,\n", - " 667.98492432],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.09541321,\n", - " 630.68127441,\n", - " 210.66503906,\n", - " 642.34405518],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.84518433,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.02520752],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.18510437,\n", - " 570.83258057,\n", - " 374.99838257,\n", - " 581.07043457],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about the Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.22866821,\n", - " 542.98168945,\n", - " 163.86282349,\n", - " 554.45288086],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87440491,\n", - " 500.84011841,\n", - " 477.48345947,\n", - " 534.55810547],\n", - " 'page': 1,\n", - " 'span': [0, 196]}],\n", - " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", - " 'desert world with a thin atmosphere composed '\n", - " 'primarily of carbon dioxide. 
Its reddish hue comes '\n", - " 'from iron oxide, or rust, prevalent on its surface.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.2026062,\n", - " 482.90710449,\n", - " 237.04431152,\n", - " 493.07443237],\n", - " 'page': 1,\n", - " 'span': [0, 23]}],\n", - " 'text': 'Basic facts about Mars:',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 453.019104,\n", - " 477.48171997,\n", - " 474.9703064],\n", - " 'page': 1,\n", - " 'span': [0, 78]}],\n", - " 'text': 'ยท Distance from the Sun: Average of 228 million '\n", - " 'kilometers (142 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.79351807,\n", - " 431.73287964,\n", - " 451.2142334],\n", - " 'page': 1,\n", - " 'span': [0, 64]}],\n", - " 'text': 'ยท Rotation Period: 24.6 hours (one Martian day - '\n", - " 'called a \"sol\")',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 429.10913086,\n", - " 365.9559021,\n", - " 438.83737183],\n", - " 'page': 1,\n", - " 'span': [0, 44]}],\n", - " 'text': 'ยท Moons: Two small moons, Phobos and Deimos.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.51646423],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } - ], - "source": [ - "import pprint\n", - "import json\n", - "\n", - "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", - "# json.loads(output_df.iloc[0, ]['contents'])" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Files processed : 
2\n", + "Chunks created : 8\n", + "Input data dimensions (rows x columns)= (2, 12)\n", + "Output data dimensions (rows x columns)= (8, 16)\n" + ] }, { - "cell_type": "code", - "execution_count": 12, - "id": "e1a10c2d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e1a10c2d", - "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", - " 'filename': 'earth.pdf',\n", - " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.30961609,\n", - " 654.45184326,\n", - " 174.04208374,\n", - " 667.93347168],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.12528992,\n", - " 630.69073486,\n", - " 210.66503906,\n", - " 642.27935791],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87112427,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.04595947],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. 
At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.20942688,\n", - " 570.81555176,\n", - " 375.57919312,\n", - " 581.08459473],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about our Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.15542603,\n", - " 542.98168945,\n", - " 167.32983398,\n", - " 554.36669922],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.91053772,\n", - " 512.46295166,\n", - " 477.84887695,\n", - " 534.48431396],\n", - " 'page': 1,\n", - " 'span': [0, 107]}],\n", - " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", - " 'planet. Earth is the only place we know of with life.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.30151367,\n", - " 494.86206055,\n", - " 240.17156982,\n", - " 505.07229614],\n", - " 'page': 1,\n", - " 'span': [0, 24]}],\n", - " 'text': 'Basic facts about Earth:',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 464.97409058,\n", - " 477.47979736,\n", - " 487.02810669],\n", - " 'page': 1,\n", - " 'span': [0, 79]}],\n", - " 'text': 'ยท Distance from the Sun: Average of 149.6 million '\n", - " 'kilometers (93 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 452.86901855,\n", - " 317.90722656,\n", - " 463.24041748],\n", - " 'page': 1,\n", - " 'span': [0, 37]}],\n", - " 'text': 'ยท Rotation Period: 24 hours (one day)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.71496582,\n", - " 396.66357422,\n", - " 451.19915771],\n", - " 'page': 1,\n", - " 
'span': [0, 52]}],\n", - " 'text': 'ยท Moons: One moon, called Luna or simply \"the Moon\".',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.53633118],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", + "
" ], - "source": [ - "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" - ] - }, - { - "cell_type": "markdown", - "id": "72274586", - "metadata": { - "id": "72274586" - }, - "source": [ - "## Step-4: Doc chunks\n", - "\n", - "In the previous step, we have extracted text from oru PDFs. But we have the content of entire file as 'one row' in our parquet output.\n", - "\n", - "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", - "\n", - "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", - "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", - "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", - "which provides the required JSON structure." - ] - }, - { - "cell_type": "markdown", - "id": "96198fa6", - "metadata": { - "id": "96198fa6" - }, - "source": [ - "### 4.1 - Set Input/output Folder" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 mars.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "7 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 
2686 \n", + "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "4 1 [132.87112427, 588.96014404, 479.40917969, 623... 
\n", + "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... " ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (f\"Files processed : {input_df.shape[0]:,}\")\n", + "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "9e9ca75c", + "metadata": { + "id": "9e9ca75c" + }, + "source": [ + "### 4.4 - Understanding the Output\n", + "\n", + "Here we see 2 PDF files are split into 8 chunks. Basically we see the documents are being split along 'natural boundaries' - paragraphs and bullet points\n", + "\n", + "See how **document_id** is carried throughout. 
This helps us identify original documents.\n", + "\n", + "Also note **contents** is now plain text (not JSON as before)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3090c950", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 }, + "id": "3090c950", + "outputId": "3f542446-2cfa-404c-c642-3732f7b74568" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "id": "305f00a3", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "305f00a3", - "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\nยท Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" ], - "source": [ - "STAGE = 2\n", - "\n", - "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_chunk_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "369f2cd1", - "metadata": { - "id": "369f2cd1" - }, - "source": [ - "### 4.2 - Execute" + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", + "1 mars.pdf Solar System\\nFor more details about the Solar...\n", + "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "3 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", + "5 earth.pdf Solar System\\nFor more details about our Solar...\n", + "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "7 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." 
] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d5f151ae", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "d5f151ae", + "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "id": "5b7b18d5", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5b7b18d5", - "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", - "13:31:12 INFO - pipeline id pipeline_id\n", - "13:31:12 INFO - code location None\n", - "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:12 INFO - actor creation delay 0\n", - "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", - "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:12 INFO - Running locally\n", - "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n", - "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:2 completed successfully\n", - "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n", - "Wall time: 18.9 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # doc_chunk arguments\n", - " # ...\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about the Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 3------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 3------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "20217298", + "metadata": { + "id": "20217298" + }, + "source": [ + "## Step-5: DOC ID generation\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. 
The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "\n", + "**This is a pre-requisite for fuzzy dedup** in the pipeline." + ] + }, + { + "cell_type": "markdown", + "id": "66811f5b", + "metadata": { + "id": "66811f5b" + }, + "source": [ + "### 5.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1f747c0d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "1f747c0d", + "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "213afdf6", - "metadata": { - "id": "213afdf6" - }, - "source": [ - "### 4.3 - Inspect Generated output\n", - "\n", - "We would see documents are split into many chunks" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" + ] + } + ], + "source": [ + "\n", + "# Input for this stage is the output of the doc chunking stage (Step-4)\n", + "# output of this component makes it possible for fdedup component to run on data.\n", + "\n", + "STAGE = 3\n", + "\n", + "input_folder = output_chunk_dir\n", + "output_folder = output_docid_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "18aa0fe1", + "metadata": { + "id": "18aa0fe1" + }, + "source": [ + "### 5.2 - Execute" + ] + }, + { + "cell_type": "code", + 
"execution_count": 19, + "id": "f6e9e145", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "f6e9e145", + "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "id": "d8138d43", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 897 - }, - "id": "d8138d43", - "outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Files processed : 2\n", - "Chunks created : 8\n", - "Input data dimensions (rows x columns)= (2, 12)\n", - "Output data dimensions (rows x columns)= (8, 16)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 
" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (f\"Files processed : {input_df.shape[0]:,}\")\n", - "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", + "13:31:29 INFO - pipeline id pipeline_id\n", + "13:31:29 INFO - code location None\n", + "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:29 INFO - actor creation delay 0\n", + "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", + "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:29 INFO - Running locally\n", + "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", + "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n" + ] }, { - "cell_type": "markdown", - "id": "9e9ca75c", - "metadata": { - "id": "9e9ca75c" - }, - "source": [ - "### 4.4 - Understanding the Output\n", - "\n", - "Here we see 2 PDF files are split into 6 chunks. Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n", - "\n", - "See how **document_id** is carried throughout. 
This helps us identify original documents.\n", - "\n", - "Also note **contents** is now plain text (not JSON as before)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:3 completed successfully\n", + "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n", + "Wall time: 15.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # doc id configuration\n", + " \"doc_id_doc_column\": \"contents\",\n", + " \"doc_id_hash_column\": \"chunk_hash\",\n", + " \"doc_id_int_column\": \"chunk_id\",\n", + "}\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "4954402f", + "metadata": { + "id": "4954402f" + }, + "source": [ + "### 5.3 - Inspect Generated output\n", + "\n", + "You will notice we have two extra columns\n", + "\n", + "- **hash_column** (stored here as `chunk_hash`)\n", + "- **int_id_column** (stored here as `chunk_id`)\n", + "\n", + "But still the same number of rows as before" + ] + }, + { + "cell_type": "code", + "execution_count": 20, 
+ "id": "1911179a", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 860 }, + "id": "1911179a", + "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "id": "3090c950", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - }, - "id": "3090c950", - "outputId": "3f542446-2cfa-404c-c642-3732f7b74568" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\nยท Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Solar System\\nFor more details about the Solar...\n", - "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "3 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "5 earth.pdf Solar System\\nFor more details about our Solar...\n", - "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "7 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 16)\n", + "Output data dimensions (rows x columns)= (8, 18)\n" + ] }, { - "cell_type": "code", - "execution_count": 17, - "id": "d5f151ae", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "d5f151ae", - "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 3------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", + "
" ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "20217298", - "metadata": { - "id": "20217298" - }, - "source": [ - "## Step-5: DOC ID generation\n", - "\n", - "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", - "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", - "\n", - "**This is a pre-requisite for fuzzy dedup** in the pipeline." - ] - }, - { - "cell_type": "markdown", - "id": "66811f5b", - "metadata": { - "id": "66811f5b" - }, - "source": [ - "### 5.1 - Set Input/output Folder" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 mars.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "7 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 
2800 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... 
\n", + "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 
3 " ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "852829dc", + "metadata": { + "id": "852829dc" + }, + "source": [ + "## Step-6: Exact Dedup\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", + "metadata": { + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" + }, + "source": [ + "### 6.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4c7a1b94", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "4c7a1b94", + "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 18, - "id": "1f747c0d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1f747c0d", - "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" - ] - } - ], - "source": [ - "\n", - "# Input for this stage is the output of exact dedeup component\n", - "# output of this component makes it possible for fdedup component to run on data.\n", - "\n", - "STAGE = 3\n", - "\n", - "input_folder = output_chunk_dir\n", - "output_folder = output_docid_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + 
"๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" + ] + } + ], + "source": [ + "STAGE = 4\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_exact_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", + "metadata": { + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" + }, + "source": [ + "### 6.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "outputId": "bd0f3f94-8c48-4c6b-b911-858e389243f4" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "18aa0fe1", - "metadata": { - "id": "18aa0fe1" - }, - "source": [ - "### 5.2 - Execute" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:31:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "13:31:45 INFO - pipeline id pipeline_id\n", + "13:31:45 INFO - code location None\n", + "13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:45 INFO - actor creation delay 0\n", + "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", + "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:45 INFO - data factory data_ Not 
using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:45 INFO - Running locally\n", + "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n", + "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n" + ] }, { - "cell_type": "code", - "execution_count": 19, - "id": "f6e9e145", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f6e9e145", - "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", - "13:31:29 INFO - pipeline id pipeline_id\n", - "13:31:29 INFO - code location None\n", - "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:29 INFO - actor creation delay 0\n", - "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", - "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:29 INFO - Running locally\n", - "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", - "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:3 completed successfully\n", - "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n", - "Wall time: 15.2 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # doc id configuration\n", - " \"doc_id_doc_column\": \"contents\",\n", - " \"doc_id_hash_column\": \"chunk_hash\",\n", - " \"doc_id_int_column\": \"chunk_id\",\n", - "}\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:4 completed successfully\n", + "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n", + "Wall time: 15.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # ededup parameters\n", + " \"ededup_hash_cpu\": 0.5,\n", + " \"ededup_num_hashes\": 2,\n", + " \"ededup_doc_column\": \"contents\",\n", + " \"ededup_doc_id_column\": \"chunk_hash\",\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "eaf1c3c3", + "metadata": { + "id": "eaf1c3c3" + }, + "source": [ + "### 6.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d824ebf6", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 815 }, + "id": "d824ebf6", + "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "4954402f", - "metadata": { - "id": "4954402f" - }, - "source": [ - "### 5.3 - Inspect Generated output\n", - "\n", - "You will notice we have two extra columns\n", - "\n", - "- **hash_column**\n", - "- **int_id_column**\n", - "\n", - "But still the same number or rows as before" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 18)\n", + "Output data dimensions (rows x columns)= (7, 19)\n", + "Input chunks before exact dedupe : 8\n", + "Output chunks after exact dedupe : 7\n", + "Duplicate chunks removed : 1\n" + ] }, { - "cell_type": "code", - "execution_count": 20, - "id": 
"1911179a", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 860 - }, - "id": "1911179a", - "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 16)\n", - "Output data dimensions (rows x columns)= (8, 18)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 
2 \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", + "
" ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "852829dc", - "metadata": { - "id": "852829dc" - }, - "source": [ - "## Step-6: Exact Dedup\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", - "metadata": { - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" - }, - "source": [ - "### 6.1 - Set Input/output Folder" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 
2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 
\n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", + "\n", + " removed \n", + "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... \n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + "5 [] \n", + "6 [] " ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "82cc9bb0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 }, + "id": "82cc9bb0", + "outputId": "e043fa01-ceca-49ae-b764-8154219c7b6c" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 21, - "id": "4c7a1b94", - 
"metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4c7a1b94", - "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" ], - "source": [ - "STAGE = 4\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_exact_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nFor more details about the Solar...\n", + "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", + "4 earth.pdf Solar System\\nFor more details about our Solar...\n", + "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "6 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "cc61dffa", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "cc61dffa", + "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", - "metadata": { - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" - }, - "source": [ - "### 6.2 - Execute" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "For more details about the Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 1------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 2------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 3------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "383f40ba", + "metadata": { + "id": "383f40ba" + }, + "source": [ + "### 6.4 - Understanding the output\n", + "\n", + "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", + "\n", + "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! 
Pretty neat, eh!\n", + "\n", + "```text\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "85309751-8556-41c6-ac32-84acc941bc8d", + "metadata": { + "id": "85309751-8556-41c6-ac32-84acc941bc8d" + }, + "source": [ + "## Step-7: Fuzzy Dedup\n", + "\n", + "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n", + "the data further.\n", + "\n", + "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addition of logging statements, etc." + ] + }, + { + "cell_type": "markdown", + "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", + "metadata": { + "id": "fcf574a3-b287-419c-9c86-07b828b41ca6" + }, + "source": [ + "### 7.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", + "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651" + }, + "outputs": [ {
"13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:45 INFO - actor creation delay 0\n", - "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", - "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:45 INFO - Running locally\n", - "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n", - "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:4 completed successfully\n", - "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n", - "Wall time: 15.2 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # ededup parameters\n", - " \"ededup_hash_cpu\": 0.5,\n", - " \"ededup_num_hashes\": 2,\n", - " \"ededup_doc_column\": \"contents\",\n", - " \"ededup_doc_id_column\": \"chunk_hash\",\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n" + ] + } + ], + "source": [ + "## Input to this component is the output of doc_id generator component.\n", + "\n", + "STAGE = 5\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_fuzzy_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", + "metadata": { + "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3" + }, + "source": [ + "### 7.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", + "outputId": 
"1e63d364-3944-465a-ff7c-6e1dc750b2de" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "eaf1c3c3", - "metadata": { - "id": "eaf1c3c3" - }, - "source": [ - "### 6.3 - Inspect Generated output" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", + "13:32:00 INFO - pipeline id pipeline_id\n", + "13:32:00 INFO - code location None\n", + "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:00 INFO - actor creation delay 0\n", + "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n", + "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:00 INFO - Running locally\n", + "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%) in 0.064 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n", + "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n", + "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n", + "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n", + "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n" + ] }, { - "cell_type": "code", - "execution_count": 23, - "id": "d824ebf6", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 815 - }, - "id": "d824ebf6", - "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (7, 19)\n", - "Input chunks before exact dedupe : 8\n", - "Output chunks after exact dedupe : 7\n", - "Duplicate chunks removed : 1\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", - "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", - "\n", - " removed \n", - "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... 
\n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:5 completed successfully\n", + "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n", + "Wall time: 36.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "\n", + "# create parameters\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # Orchestration parameters\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # columns used\n", + " \"fdedup_doc_column\": \"contents\",\n", + " \"fdedup_id_column\": \"chunk_id\",\n", + " \"fdedup_cluster_column\": \"chunk_hash\",\n", + " # infrastructure\n", + " \"fdedup_bucket_cpu\": 0.3,\n", + " \"fdedup_doc_cpu\": 0.3,\n", + " \"fdedup_mhash_cpu\": 0.3,\n", + " \"fdedup_num_doc_actors\": 1,\n", + " \"fdedup_num_bucket_actors\": 1,\n", + " \"fdedup_num_minhash_actors\": 1,\n", + " \"fdedup_num_preprocessors\": 1,\n", + " # fuzzy parameters\n", + " \"fdedup_num_permutations\": 64,\n", + " \"fdedup_threshold\": 0.7, # (default 0.8)\n", + " \"fdedup_shingles_size\": 5,\n", + " \"fdedup_delimiters\": \" \"\n", + "}\n", + "\n", + "# Pass commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "a6f8cd11", + "metadata": { + "id": "a6f8cd11" + }, + "source": [ + "### 7.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "e899ad60", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 677 }, + "id": "e899ad60", + "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 24, - "id": "82cc9bb0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 269 - }, - "id": "82cc9bb0", - "outputId": 
"e043fa01-ceca-49ae-b764-8154219c7b6c" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nFor more details about the Solar...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "4 earth.pdf Solar System\\nFor more details about our Solar...\n", - "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "6 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 18)\n", + "Output data dimensions (rows x columns)= (6, 18)\n", + "Duplicate chunks removed by fuzzy-dedupe: 2\n" + ] }, { - "cell_type": "code", - "execution_count": 25, - "id": "cc61dffa", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cc61dffa", - "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hash
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1
\n", + "
" ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "383f40ba", - "metadata": { - "id": "383f40ba" - }, - "source": [ - "### 6.4 - Understanding the output\n", - "\n", - "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", - "\n", - "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", - "\n", - "```text\n", - "## Solar System\n", - "\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "85309751-8556-41c6-ac32-84acc941bc8d", - "metadata": { - "id": "85309751-8556-41c6-ac32-84acc941bc8d" - }, - "source": [ - "## Step-7: Fuzzy Dedup\n", - "\n", - "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n", - "the data further.\n", - "\n", - "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addittion of logging statements etc." 
- ] - }, - { - "cell_type": "markdown", - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", - "metadata": { - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6" - }, - "source": [ - "### 7.1 - Set Input/output Folder" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nFor more details about our Solar... 
$.main-text[3] \n", + "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id chunk_id chunk_hash \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", + "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", + "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", + "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 
3 -1 " ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "ab7ea52b", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 }, + "id": "ab7ea52b", + "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 26, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", - "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nFor more details about our Solar...
4earth.pdfEarth\\nEarth is the third planet from the Sun....
5earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" ], - "source": [ - "## Input to this component is the output of doc_id generator component.\n", - "\n", - "STAGE = 5\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_fuzzy_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", + "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "3 earth.pdf Solar System\\nFor more details about our Solar...\n", + "4 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "5 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "6bdd3515", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "6bdd3515", + "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", - "metadata": { - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3" - }, - "source": [ - "### 7.2 - Execute" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 2------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 1------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "2b34d9c6", + "metadata": { + "id": "2b34d9c6" + }, + "source": [ + "### 7.4- Understanding the output\n", + "\n", + "So we started with 7 rows and ended up with 6. 
Fuzzy dedupe removed the following **very similar** chunk.\n", + "\n", + "These are pretty similar chunks except for the words 'the' and 'our'\n", + "\n", + "**earth.pdf**\n", + "\n", + "`For more details about *our* Solar system see Chapter 1.`\n", + "\n", + "**mars.pdf**\n", + "\n", + "`For more details about *the* Solar system see Chapter 1.`\n", + "\n", + "Pretty neat, eh? ๐Ÿ‘\n", + "\n", + "### Configuring Fuzzy de-dupe\n", + "\n", + "You can tweak fuzzy dedupe by tweaking the following parameters\n", + "\n", + "```python\n", + "# fuzzy parameters\n", + " \"fdedup_num_permutations\": 64,\n", + " \"fdedup_threshold\": 0.7, # (default 0.8)\n", + " \"fdedup_shingles_size\": 5,\n", + " \"fdedup_delimiters\": \" \"\n", + "```\n", + "\n", + "In our case, we set `fdedup_threshold` parameter to 0.7. \n" + ] + }, + { + "cell_type": "markdown", + "id": "5370950a-2a3a-4143-8218-f9b4808099ba", + "metadata": { + "id": "5370950a-2a3a-4143-8218-f9b4808099ba" + }, + "source": [ + "## Step-8: Text encoding\n", + "\n", + "Encode text for the vector storage." 
+ ] + }, + { + "cell_type": "markdown", + "id": "85aba685", + "metadata": { + "id": "85aba685" + }, + "source": [ + "### 8.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 27, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", - "outputId": "1e63d364-3944-465a-ff7c-6e1dc750b2de" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", - "13:32:00 INFO - pipeline id pipeline_id\n", - "13:32:00 INFO - code location None\n", - "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:00 INFO - actor creation delay 0\n", - "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n", - "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, 
files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:00 INFO - Running locally\n", - "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%) in 0.064 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n", - "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n", - "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n", - "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n", - "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:5 completed successfully\n", - "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n", - "Wall time: 36.6 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "\n", - "# create parameters\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # Orchestration parameters\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # columns used\n", - " \"fdedup_doc_column\": \"contents\",\n", - " \"fdedup_id_column\": \"chunk_id\",\n", - " \"fdedup_cluster_column\": \"chunk_hash\",\n", - " # infrastructure\n", - " \"fdedup_bucket_cpu\": 0.3,\n", - " \"fdedup_doc_cpu\": 0.3,\n", - " \"fdedup_mhash_cpu\": 0.3,\n", - " \"fdedup_num_doc_actors\": 1,\n", - " \"fdedup_num_bucket_actors\": 1,\n", - " \"fdedup_num_minhash_actors\": 1,\n", - " \"fdedup_num_preprocessors\": 1,\n", - " # fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.7, # (default 0.8)\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", - "}\n", - "\n", - "# Pass commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" + ] + } + ], + "source": [ + "STAGE = 6\n", + "\n", + "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_embeddings_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "c97545f4", + 
"metadata": { + "id": "c97545f4" + }, + "source": [ + "### 8.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 914, + "referenced_widgets": [ + "8b7571c585df431eb901fcdebdf8177e", + "06107a2f48b3491f91bbe84e46e10ba0", + "bd74356eca18423aa0373c808d9097e3", + "7e13e8779a81400f996d4428c74acfaf", + "a75892696be546a3970962bae7bf732a", + "68997339f13240a4824a9e416096bee4", + "919b086abd314077bbff75687392bd91", + "b4c209371e7a403986991a786cfb296d", + "6c08de2dd9a2402c90b1a7a645db9b13", + "91fff81a1de8487c9009e872b751edb0", + "ada62d24cbcf4361acbb21808f334d33" + ] }, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "a6f8cd11", - "metadata": { - "id": "a6f8cd11" - }, - "source": [ - "### 7.3 - Inspect Generated output" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", + "13:32:37 INFO - pipeline id pipeline_id\n", + "13:32:37 INFO - code location None\n", + "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:37 INFO - actor creation delay 0\n", + "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", + "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint 
['.parquet']\n", + "13:32:37 INFO - Running locally\n", + "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n", + "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n" + ] }, { - "cell_type": "code", - "execution_count": 28, - "id": "e899ad60", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 677 - }, - "id": "e899ad60", - "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (6, 18)\n", - "Duplicate chunks removed by fuzzy-dedupe: 2\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hash
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... 
$.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id chunk_id chunk_hash \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", - "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", - "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", - "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 " - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:6 completed successfully\n", + "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n", + "Wall time: 22.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # text_encoder\n", + " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", + "}\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "# create launcher\n", + "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n", + "# Launch the ray actor(s) to process the input\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "b734852c", + "metadata": { + "id": "b734852c" + }, + "source": [ + "### 8.3 - Inspect Generated output\n", + "\n", + "You will see a column called `embeddings` added at the end. This the text content converted into vectors or embeddings. We used the model `sentence-transformers/all-MiniLM-L6-v2`" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "7b1c1d09", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 659 }, + "id": "7b1c1d09", + "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 29, - "id": "ab7ea52b", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 238 - }, - "id": "ab7ea52b", - "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nFor more details about our Solar...
4earth.pdfEarth\\nEarth is the third planet from the Sun....
5earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "3 earth.pdf Solar System\\nFor more details about our Solar...\n", - "4 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "5 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (6, 18)\n", + "Output data dimensions (rows x columns)= (6, 19)\n" + ] }, { - "cell_type": "code", - "execution_count": 30, - "id": "6bdd3515", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6bdd3515", - "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hashembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1[0.0077404897, -0.020559434, 0.026426662, 0.01...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1[0.07728298, 0.024971062, -0.04318075, 0.05809...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1[0.1059802, 0.025460616, 0.02362733, 0.0390564...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15[-0.062105577, -0.0053322953, 0.03127779, 0.04...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1[0.0724358, -0.058001805, -0.01977186, -0.0243...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1[0.091821924, 0.015197907, 0.07716932, 0.01711...
\n", + "
" ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "2b34d9c6", - "metadata": { - "id": "2b34d9c6" - }, - "source": [ - "### 7.4- Understanding the output\n", - "\n", - "So we started with 7 rows and ended up with 6. Fuzzy dedupe removed the following **very similar** chunk.\n", - "\n", - "These are pretty similar chunks except for the words 'the' and 'our'\n", - "\n", - "**earth.pdf**\n", - "\n", - "`For more details about *our* Solar system see Chapter 1.`\n", - "\n", - "**mars.pdf**\n", - "\n", - "`For more details about *the* Solar system see Chapter 1.`\n", - "\n", - "Pretty neat, eh? ๐Ÿ‘\n", - "\n", - "### Configuring Fuzzy de-dupe\n", - "\n", - "You can tweak fuzzy dedupe by tweaking the following parameters\n", - "\n", - "```python\n", - "# fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.7, # (default 0.8)\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", - "```\n", - "\n", - "In our case, we set `fdedup_threshold` parameter to 0.7. \n" + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 
2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id chunk_id chunk_hash \\\n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", + "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 
1 5 \n", + "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", + "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 \n", + "\n", + " embeddings \n", + "0 [0.0077404897, -0.020559434, 0.026426662, 0.01... \n", + "1 [0.07728298, 0.024971062, -0.04318075, 0.05809... \n", + "2 [0.1059802, 0.025460616, 0.02362733, 0.0390564... \n", + "3 [-0.062105577, -0.0053322953, 0.03127779, 0.04... \n", + "4 [0.0724358, -0.058001805, -0.01977186, -0.0243... \n", + "5 [0.091821924, 0.015197907, 0.07716932, 0.01711... " ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": { + "id": "f5e12630-be6b-4188-a925-77117155617b" + }, + "source": [ + "## Step-9: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279" + }, + "outputs": [ { - "cell_type": "markdown", - "id": "5370950a-2a3a-4143-8218-f9b4808099ba", - "metadata": { - "id": "5370950a-2a3a-4143-8218-f9b4808099ba" - }, - "source": [ - "## Step-8: Text encoding\n", - "\n", - "Encode text for the vector storage." 
- ] + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" + ] + } + ], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", + "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", + "\n", + "print (f\"โœ… Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "dc0a6728", + "metadata": { + "id": "dc0a6728" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "dpk-3-basic-022dev1-py311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "06107a2f48b3491f91bbe84e46e10ba0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4", + "placeholder": "โ€‹", + "style": "IPY_MODEL_919b086abd314077bbff75687392bd91", + "value": "" + } }, - { - "cell_type": "markdown", - "id": "85aba685", - "metadata": { - "id": "85aba685" - }, - "source": [ - "### 8.1 - Set Input/output Folder" - ] + "68997339f13240a4824a9e416096bee4": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 31, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" - ] - } - ], - "source": [ - "STAGE = 6\n", - "\n", - "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_embeddings_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' 
--> output='{output_folder}'\")" - ] + "6c08de2dd9a2402c90b1a7a645db9b13": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "markdown", - "id": "c97545f4", - "metadata": { - "id": "c97545f4" - }, - "source": [ - "### 8.2 - Execute" - ] + "7e13e8779a81400f996d4428c74acfaf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0", + "placeholder": "โ€‹", + "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33", + "value": "โ€‡0/0โ€‡[00:00<?,โ€‡?it/s]" + } }, - { - "cell_type": "code", - "execution_count": 32, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 914, - "referenced_widgets": [ - "8b7571c585df431eb901fcdebdf8177e", - "06107a2f48b3491f91bbe84e46e10ba0", - "bd74356eca18423aa0373c808d9097e3", - "7e13e8779a81400f996d4428c74acfaf", - "a75892696be546a3970962bae7bf732a", - "68997339f13240a4824a9e416096bee4", - "919b086abd314077bbff75687392bd91", - "b4c209371e7a403986991a786cfb296d", - "6c08de2dd9a2402c90b1a7a645db9b13", - "91fff81a1de8487c9009e872b751edb0", - "ada62d24cbcf4361acbb21808f334d33" - ] - }, - "id": 
"228df6b2-bc62-494b-9697-03ece98d7853", - "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", - "13:32:37 INFO - pipeline id pipeline_id\n", - "13:32:37 INFO - code location None\n", - "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:37 INFO - actor creation delay 0\n", - "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", - "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:37 INFO - Running locally\n", - "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n", - "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:6 completed successfully\n", - "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n", - "Wall time: 22.1 s\n" - ] - } + "8b7571c585df431eb901fcdebdf8177e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0", + "IPY_MODEL_bd74356eca18423aa0373c808d9097e3", + "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf" ], - "source": [ - "%%time\n", - "\n", - "from 
text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # text_encoder\n", - " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", - "}\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "# create launcher\n", - "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n", - "# Launch the ray actor(s) to process the input\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] + "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a" + } }, - { - "cell_type": "markdown", - "id": "b734852c", - "metadata": { - "id": "b734852c" - }, - "source": [ - "### 8.3 - Inspect Generated output\n", - "\n", - "You will see a column called `embeddings` added at the end. This the text content converted into vectors or embeddings. 
We used the model `sentence-transformers/all-MiniLM-L6-v2`" - ] + "919b086abd314077bbff75687392bd91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": 33, - "id": "7b1c1d09", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 659 - }, - "id": "7b1c1d09", - "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (6, 18)\n", - "Output data dimensions (rows x columns)= (6, 19)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hashembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1[0.0077404897, -0.020559434, 0.026426662, 0.01...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1[0.07728298, 0.024971062, -0.04318075, 0.05809...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1[0.1059802, 0.025460616, 0.02362733, 0.0390564...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15[-0.062105577, -0.0053322953, 0.03127779, 0.04...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1[0.0724358, -0.058001805, -0.01977186, -0.0243...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1[0.091821924, 0.015197907, 0.07716932, 0.01711...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... 
$.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id chunk_id chunk_hash \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", - "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", - "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", - "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 \n", - "\n", - " embeddings \n", - "0 [0.0077404897, -0.020559434, 0.026426662, 0.01... \n", - "1 [0.07728298, 0.024971062, -0.04318075, 0.05809... \n", - "2 [0.1059802, 0.025460616, 0.02362733, 0.0390564... \n", - "3 [-0.062105577, -0.0053322953, 0.03127779, 0.04... \n", - "4 [0.0724358, -0.058001805, -0.01977186, -0.0243... \n", - "5 [0.091821924, 0.015197907, 0.07716932, 0.01711... 
" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] + "91fff81a1de8487c9009e872b751edb0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "id": "f5e12630-be6b-4188-a925-77117155617b", - "metadata": { - "id": "f5e12630-be6b-4188-a925-77117155617b" - }, - "source": [ - "## Step-9: Copy output to final output dir" - ] + "a75892696be546a3970962bae7bf732a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", 
+ "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 34, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" - ] - } - ], - "source": [ - "import shutil\n", - "\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", - "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", - "\n", - "print (f\"โœ… Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "dc0a6728", - "metadata": { - "id": "dc0a6728" - }, - 
"outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] + "ada62d24cbcf4361acbb21808f334d33": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "kernelspec": { - "display_name": "dpk-2-basic-021-py311", - "language": "python", - "name": "python3" + "b4c209371e7a403986991a786cfb296d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - 
"file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "06107a2f48b3491f91bbe84e46e10ba0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4", - "placeholder": "โ€‹", - "style": "IPY_MODEL_919b086abd314077bbff75687392bd91", - "value": "" - } - }, - "68997339f13240a4824a9e416096bee4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": 
null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6c08de2dd9a2402c90b1a7a645db9b13": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "7e13e8779a81400f996d4428c74acfaf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0", - "placeholder": "โ€‹", - "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33", - "value": "โ€‡0/0โ€‡[00:00<?,โ€‡?it/s]" - } - }, - "8b7571c585df431eb901fcdebdf8177e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0", - "IPY_MODEL_bd74356eca18423aa0373c808d9097e3", - "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf" - ], - "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a" - } - }, - 
"919b086abd314077bbff75687392bd91": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "91fff81a1de8487c9009e872b751edb0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a75892696be546a3970962bae7bf732a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": 
null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ada62d24cbcf4361acbb21808f334d33": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b4c209371e7a403986991a786cfb296d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - 
"grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "bd74356eca18423aa0373c808d9097e3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13", - "value": 0 - } - } - } + "bd74356eca18423aa0373c808d9097e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d", + 
"max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13", + "value": 0 + } } - }, - "nbformat": 4, - "nbformat_minor": 5 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 }