Skip to content

Commit

Permalink
Merge pull request #416 from dolfim-ibm/docling-pdf2md
Browse files Browse the repository at this point in the history
Add pdf2parquet transform
  • Loading branch information
daw3rd authored Jul 25, 2024
2 parents 2616f5d + 8e06993 commit 8360e18
Show file tree
Hide file tree
Showing 51 changed files with 2,131 additions and 1 deletion.
11 changes: 10 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,15 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
# The script deletes the listed tool/SDK directories and then removes all local
# docker images. `df -h` runs before and after the cleanup so the job log shows
# disk usage on both sides. Errors and output from `docker rmi` are discarded
# (`>/dev/null 2>&1 || true`), so that command cannot fail the step.
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test Universal Transforms
run: |
make -C transforms/universal DOCKER=docker venv test-src
Expand Down Expand Up @@ -269,7 +278,7 @@ jobs:
test-language-images:
needs: [check_if_push_images]
runs-on: ubuntu-22.04
timeout-minutes: 30
timeout-minutes: 120
env:
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
Expand Down
2 changes: 2 additions & 0 deletions .make.defaults
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ ABS_REPOROOT=$(shell (cd $(REPOROOT); pwd))
DOCKER_FILE?=Dockerfile
#DOCKER_IMAGE_NAME?=xyzzy # Must be defined by the including makefile
DOCKER?=docker
DOCKER_PLATFORM?=linux/amd64
DOCKER_HOSTNAME?=quay.io
DOCKER_NAMESPACE ?= dataprep1/data-prep-kit
DOCKER_REGISTRY_USER?=$(DPK_DOCKER_REGISTRY_USER)
Expand Down Expand Up @@ -214,6 +215,7 @@ __check_defined = \
touch pyproject.toml; \
fi
$(DOCKER) build -t $(DOCKER_LOCAL_IMAGE) \
--platform $(DOCKER_PLATFORM) \
--build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \
Expand Down
4 changes: 4 additions & 0 deletions .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
CODE2PARQUET_RAY_VERSION=$(DPK_VERSION)
INGEST_TO_PARQUET_VERSION=$(DPK_VERSION)
REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION)

PDF2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
PDF2PARQUET_RAY_VERSION=$(DPK_VERSION)

################## ################## ################## ################## ################## ##################
# Begin versions that the repo depends on.

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ The below matrix shows the the combination of modules and supported runtimes. Al
|Filter on annotations |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark: |
|Language identification |:white_check_mark:|:white_check_mark:| |:white_check_mark: |
|Document quality |:white_check_mark:|:white_check_mark:| |:white_check_mark: |
|PDF to Parquet (convert) |:white_check_mark:|:white_check_mark:| |:white_check_mark: |
|Code (from zip) to Parquet |:white_check_mark:|:white_check_mark:| |:white_check_mark: |
|Profiler | |:white_check_mark:| |:white_check_mark: |
|Tokenizer |:white_check_mark:|:white_check_mark:| |:white_check_mark: |
Expand Down
70 changes: 70 additions & 0 deletions transforms/language/pdf2parquet/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults

# Fan-out rules: each target below re-invokes make on the `.recurse` rule with
# RULE set to the requested target name ($@). `.recurse` is not defined in this
# file (presumably it comes from the included .make.defaults -- confirm there).
# The `@# Help:` line of each rule is a shell comment carrying the rule's help text.

setup::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

clean::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

build::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

venv::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

# set-versions is a single-colon rule, so unlike the prerequisite-less
# double-colon rules it would be skipped if a file with that name existed;
# declaring it phony guarantees the recipe always runs.
.PHONY: set-versions
set-versions:
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

publish::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-src::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

kind-load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

docker-load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

docker-save-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

# Workflow targets are implemented in the kfp_ray sub-directory; each target
# here forwards to the target of the same name there ($@ is the target name).
.PHONY: workflow-venv workflow-test workflow-upload workflow-build
workflow-venv workflow-test workflow-upload workflow-build:
	$(MAKE) -C kfp_ray $@
16 changes: 16 additions & 0 deletions transforms/language/pdf2parquet/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# PDF2PARQUET Transform


The PDF2PARQUET transforms iterate through PDF files, or zip archives of PDF files, and generate parquet files
containing the converted documents in Markdown format.

The PDF conversion uses the [Docling package](https://github.com/DS4SD/docling).

The following runtimes are available:

* [python](python/README.md) - provides the base python-based transformation
implementation.
* [ray](ray/README.md) - enables the running of the base python transformation
in a Ray runtime
* [kfp](kfp_ray/README.md) - enables running the ray docker image
in a kubernetes cluster using a generated `yaml` file.
51 changes: 51 additions & 0 deletions transforms/language/pdf2parquet/kfp_ray/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Repository root, expressed relative to this directory (four levels up).
REPOROOT=${CURDIR}/../../../../
# Activation script of the virtual environment under transforms/venv; it is a
# prerequisite of workflow-venv.
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
include $(REPOROOT)/transforms/.make.workflows

# Directory of this transform's Ray runtime; workflow-test passes it on as TRANSFORM_SRC.
SRC_DIR=${CURDIR}/../ray/

# All *_wf.py files found under this directory, and the matching .yaml file
# names (the files workflow-build asks make to build).
PYTHON_WF := $(shell find ./ -name '*_wf.py')
YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF})

# workflow-venv has no recipe of its own; it only pulls in its prerequisites.
# Declared phony like the other workflow-* targets so that a stray file named
# `workflow-venv` can never mask it.
.PHONY: workflow-venv
workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE}

.PHONY: clean
clean:
	@# Help: Clean up the virtual environment.
	rm -rf ${REPOROOT}/transforms/venv

# Intentionally empty double-colon rules: these targets have nothing to do in
# this directory.
# NOTE(review): presumably they exist so that the parent Makefile's recursive
# invocation of these targets succeeds here -- confirm against .recurse.
venv build setup test test-src test-image publish image \
kind-load-image docker-load-image docker-save-image::

# Build every pipeline file named in YAML_WF, once the workflow venv prerequisites are met.
.PHONY: workflow-build
workflow-build: workflow-venv
	$(MAKE) $(YAML_WF)

# Run the pipeline test rule against this transform's own compiled pipeline.
# The previous value, noop_wf.yaml, was a left-over from the noop transform.
# NOTE(review): assumes the pipeline definition in this directory is
# pdf2parquet_wf.py -- confirm the file name.
.PHONY: workflow-test
workflow-test: workflow-build
	$(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=pdf2parquet_wf.yaml

# Upload each compiled pipeline in turn. `|| exit 1` stops at the first failed
# upload; without it the loop's exit status would be that of the last
# iteration only, hiding earlier failures.
.PHONY: workflow-upload
workflow-upload: workflow-build
	@for file in $(YAML_WF); do \
		$(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file || exit 1; \
	done
28 changes: 28 additions & 0 deletions transforms/language/pdf2parquet/kfp_ray/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# PDF2PARQUET Ray-based KubeFlow Pipeline Transformation


## Summary
This project allows execution of the [pdf2parquet Ray transform](../ray) as a
[KubeFlow Pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/).

The pipeline is described in detail in the [Simplest Transform pipeline tutorial](../../../../kfp/doc/simple_transform_pipeline.md).

## Compilation

In order to compile pipeline definitions run
```shell
make workflow-build
```
from this directory. It creates a virtual environment (make workflow-venv) and then compiles the pipeline
definitions in the folder. The virtual environment is created once for all transforms.

Note: the pipeline definitions can be compiled and executed on both KFPv1 and KFPv2. KFPv1 is currently the default. If you
prefer KFPv2, please do the following:
```shell
make clean
export KFPv2=1
make workflow-build
```

The next steps are described in [Deploying a pipeline](../../../../kfp/doc/simple_transform_pipeline.md#deploying-a-pipeline-)
and [Executing pipeline and watching execution results](../../../../kfp/doc/simple_transform_pipeline.md#executing-pipeline-and-watching-execution-results-)
Loading

0 comments on commit 8360e18

Please sign in to comment.