Skip to content

Commit

Permalink
Merge pull request #694 from revit13/kfp-html2parquet
Browse files Browse the repository at this point in the history
Kfp workflow for html2parquet- Thanks!
  • Loading branch information
touma-I authored Oct 15, 2024
2 parents 97810af + 5419cd1 commit 876336a
Show file tree
Hide file tree
Showing 11 changed files with 449 additions and 7 deletions.
116 changes: 116 additions & 0 deletions .github/workflows/test-language-html2parquet-kfp.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
#
name: Test KFP - transforms/language/html2parquet

# Triggers: manual dispatch, pushes to dev/release branches or any tag, and
# pull requests targeting dev/release branches. The path filters restrict
# branch-triggered runs to changes in the top-level .make.* files, the shared
# transforms/.make.workflows file, or this transform's own directory.
on:
    workflow_dispatch:
    push:
        branches: ["dev", "releases/**"]
        tags: ["*"]
        paths:
            - ".make.*"
            - "transforms/.make.workflows"
            - "transforms/language/html2parquet/**"
            - "!kfp/**" # This is tested in separate workflow
            - "!data-processing-lib/**" # This is tested in separate workflow
            - "!**.md"
            - "!**/doc/**"
            - "!**/images/**"
            - "!**.gitignore"
    pull_request:
        branches: ["dev", "releases/**"]
        paths:
            - ".make.*"
            - "transforms/.make.workflows"
            - "transforms/language/html2parquet/**"
            - "!data-processing-lib/**" # This is tested in separate workflow
            - "!kfp/**" # This is tested in separate workflow
            - "!**.md"
            - "!**/doc/**"
            - "!**/images/**"
            - "!**.gitignore"

# One concurrency group per workflow and pull request number (or ref when the
# event has no pull request); a newer run cancels the one still in progress.
# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
concurrency:
    group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
    cancel-in-progress: true

jobs:
    # NOTE: this file is generated from test-transform.template (see the header
    # of this file); mirror any change made here in that template.
    #
    # Both jobs run the same steps; test-kfp-v2 additionally exports KFPv2=1.
    # The test step:
    #   * is skipped unless both the transform Makefile and its
    #     kfp_ray/Makefile exist;
    #   * only builds the workflow (build-workflow) when the transform name is
    #     listed by `check-workflows.sh -show-kfp-black-list`;
    #   * otherwise installs the tools and runs test-workflow.
    # The blacklist lookup matches the transform name as a whole, literal word
    # (grep -wF) so that a name contained in another entry is not treated as
    # blacklisted, and all expansions are quoted.
    test-kfp-v1:
        runs-on: ubuntu-22.04
        steps:
            - name: Checkout
              uses: actions/checkout@v4
            - name: Free up space in github runner
              # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
              run: |
                  df -h
                  sudo rm -rf "/usr/local/share/boost"
                  sudo rm -rf "$AGENT_TOOLSDIRECTORY"
                  sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
                  sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
                  df -h
            - name: Import environment variables
              run: |
                  cat scripts/k8s-setup/requirements.env >> "$GITHUB_ENV"
                  echo "K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup" >> "$GITHUB_ENV"
                  echo "REPOROOT=$PWD" >> "$GITHUB_ENV"
                  echo "PATH=$PATH:/tmp" >> "$GITHUB_ENV"
            - name: Test V1 KFP workflow for transforms/language/html2parquet
              timeout-minutes: 120
              run: |
                  KFP_BLACK_LIST=$(./scripts/check-workflows.sh -show-kfp-black-list)
                  if [ -e "transforms/language/html2parquet/Makefile" ] && [ -e "transforms/language/html2parquet/kfp_ray/Makefile" ]; then
                      transform=$(basename "transforms/language/html2parquet")
                      if ! echo "${KFP_BLACK_LIST}" | grep -qwF -- "${transform}"; then
                          "$PWD/scripts/workflow_helper.sh" install-tools
                          "$PWD/scripts/workflow_helper.sh" test-workflow transforms/language/html2parquet
                      else
                          "$PWD/scripts/workflow_helper.sh" build-workflow transforms/language/html2parquet
                      fi
                  else
                      echo "Skipping transforms/language/html2parquet kfp test for lack of Makefile and/or kfp_ray/Makefile"
                  fi
    test-kfp-v2:
        runs-on: ubuntu-22.04
        steps:
            - name: Checkout
              uses: actions/checkout@v4
            - name: Free up space in github runner
              # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
              run: |
                  df -h
                  sudo rm -rf "/usr/local/share/boost"
                  sudo rm -rf "$AGENT_TOOLSDIRECTORY"
                  sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
                  sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
                  df -h
            - name: Import environment variables
              run: |
                  cat scripts/k8s-setup/requirements.env >> "$GITHUB_ENV"
                  echo "K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup" >> "$GITHUB_ENV"
                  echo "REPOROOT=$PWD" >> "$GITHUB_ENV"
                  echo "PATH=$PATH:/tmp" >> "$GITHUB_ENV"
                  echo "KFPv2=1" >> "$GITHUB_ENV"
            - name: Test V2 KFP workflow for transforms/language/html2parquet
              timeout-minutes: 120
              run: |
                  KFP_BLACK_LIST=$(./scripts/check-workflows.sh -show-kfp-black-list)
                  if [ -e "transforms/language/html2parquet/Makefile" ] && [ -e "transforms/language/html2parquet/kfp_ray/Makefile" ]; then
                      transform=$(basename "transforms/language/html2parquet")
                      if ! echo "${KFP_BLACK_LIST}" | grep -qwF -- "${transform}"; then
                          "$PWD/scripts/workflow_helper.sh" install-tools
                          "$PWD/scripts/workflow_helper.sh" test-workflow transforms/language/html2parquet
                      else
                          "$PWD/scripts/workflow_helper.sh" build-workflow transforms/language/html2parquet
                      fi
                  else
                      echo "Skipping transforms/language/html2parquet kfp test for lack of Makefile and/or kfp_ray/Makefile"
                  fi
4 changes: 2 additions & 2 deletions .github/workflows/test-universal-hap-kfp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ concurrency:

jobs:
test-kfp-v1:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down Expand Up @@ -79,7 +79,7 @@ jobs:
fi
test-kfp-v2:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down
2 changes: 2 additions & 0 deletions kfp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
| Transform | KFP pipeline |
|-------------------------------------|:----------------------------------------------------------------------------------:|
| language/lang_id | [lang_id_wf.py](../transforms/language/lang_id/kfp_ray/lang_id_wf.py) |
| language/html2parquet | [html2parquet_wf.py](../transforms/language/html2parquet/kfp_ray/html2parquet_wf.py) |
| code/malware | [malware_wf.py](../transforms/code/malware/kfp_ray/malware_wf.py) |
| code/code2parquet | [code2parquet_wf.py](../transforms/code/code2parquet/kfp_ray/code2parquet_wf.py) |
| code/code_quality | [code_quality_wf.py](../transforms/code/code_quality/kfp_ray/code_quality_wf.py) |
Expand All @@ -17,6 +18,7 @@
| universal/noop | [noop_wf.py](../transforms/universal/noop/kfp_ray/noop_wf.py) |
| universal/profiler | [profiler_wf.py](../transforms/universal/profiler/kfp_ray/profiler_wf.py) |
| universal/tokenization | [tokenization_wf.py](../transforms/universal/tokenization/kfp_ray/tokenization_wf.py) |
| universal/hap | [hap_wf.py](../transforms/universal/hap/kfp_ray/hap_wf.py) |


## Set up and working steps
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@ pipeline_parameters:
description: "Pipeline for noop task"
script_name: "noop_transform.py"
prefix: ""
multi_s3: True
multi_s3: False
compute_func_name: ""
compute_func_import: ""
component_spec_path: ""

pipeline_common_input_parameters_values:
kfp_base_image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6"
transform_image: "quay.io/dataprep1/data-prep-kit/noop-ray:0.9.0.dev6"
kfp_base_image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
transform_image: "quay.io/dataprep1/data-prep-kit/noop-ray:latest"
s3_access_secret: "s3-secret"
image_pull_secret: "prod-all-icr-io"
image_pull_secret: ""
input_folder: "test/noop/input/"
output_folder: "test/noop/output/"

Expand Down
9 changes: 8 additions & 1 deletion kfp/pipeline_generator/single-pipeline/pipeline_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
environment = Environment(loader=FileSystemLoader(f"{script_dir}/templates/"))
template = environment.get_template(PIPELINE_TEMPLATE_FILE)

#pre_commit_config = f"{script_dir}/../pre-commit-config.yaml"
pre_commit_config = f"{script_dir}/../../../.pre-commit-config.yaml"
parser = argparse.ArgumentParser(description="Kubeflow pipeline generator for Foundation Models")
parser.add_argument("-c", "--config_file", type=str, default="")
parser.add_argument("-od", "--output_dir_file", type=str, default="")
Expand Down Expand Up @@ -57,3 +57,10 @@
message.write(content)
print(f"... wrote {output_file}")

# Format the generated pipeline file by running pre-commit on it with the
# configuration file referenced by `pre_commit_config`.
import sys

from pre_commit.main import main

# `--files` is the option name documented for `pre-commit run`.
# NOTE(review): the previous spelling `--file` presumably only worked through
# argparse's unambiguous-prefix matching — confirm against the installed version.
args = ["run", "--files", f"{output_file}", "-c", f"{pre_commit_config}"]
# Terminate the script with whatever status pre-commit's main() returns.
sys.exit(main(args))
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def compute_exec_params_func(
data_s3_config: str,
data_max_files: int,
data_num_samples: int,
data_checkpointing: bool,
runtime_pipeline_id: str,
runtime_job_id: str,
runtime_code_location: dict,
Expand All @@ -50,6 +51,7 @@ def compute_exec_params_func(
"data_s3_config": data_s3_config,
"data_max_files": data_max_files,
"data_num_samples": data_num_samples,
"data_checkpointing": data_checkpointing,
"runtime_num_workers": KFPUtils.default_compute_execution_params(str(worker_options), str(actor_options)),
"runtime_worker_options": str(actor_options),
"runtime_pipeline_id": runtime_pipeline_id,
Expand Down Expand Up @@ -103,8 +105,13 @@ def {{ pipeline_name }}(
# Ray cluster
ray_name: str = "{{ pipeline_name }}-kfp-ray", # name of Ray cluster
# Add image_pull_secret and image_pull_policy to ray workers if needed
{%- if image_pull_secret != "" %}
ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image},
ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image},
{%- else %}
ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
{%- endif %}
server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
# data access
{%- if multi_s3 == False %}
Expand Down Expand Up @@ -177,6 +184,7 @@ def {{ pipeline_name }}(
data_s3_config=data_s3_config,
data_max_files=data_max_files,
data_num_samples=data_num_samples,
data_checkpointing=data_checkpointing,
runtime_pipeline_id=runtime_pipeline_id,
runtime_job_id=run_id,
runtime_code_location=runtime_code_location,
Expand Down
1 change: 1 addition & 0 deletions scripts/k8s-setup/populate_minio.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ mc cp --recursive ${REPOROOT}/transforms/language/doc_quality/ray/test-data/inpu
mc cp --recursive ${REPOROOT}/transforms/language/pdf2parquet/ray/test-data/input/2206.01062.pdf kfp/test/pdf2parquet/input
mc cp --recursive ${REPOROOT}/transforms/language/text_encoder/ray/test-data/input/ kfp/test/text_encoder/input
mc cp --recursive ${REPOROOT}/transforms/language/doc_chunk/ray/test-data/input/ kfp/test/doc_chunk/input
mc cp --recursive ${REPOROOT}/transforms/language/html2parquet/ray/test-data/input/test1.html kfp/test/html2parquet/input
# universal
mc cp --recursive ${REPOROOT}/transforms/universal/doc_id/ray/test-data/input/ kfp/test/doc_id/input
mc cp --recursive ${REPOROOT}/transforms/universal/ededup/ray/test-data/input/ kfp/test/ededup/input
Expand Down
13 changes: 13 additions & 0 deletions transforms/add_new_kfp_workflow.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Adding new KFP workflows

This README outlines the steps to add a new KFP workflow for a new transform under the [transforms](./) directory.

1) Create a new `kfp_ray` directory in the transform directory, similar to [this directory](universal/noop/kfp_ray/).

2) Create the workflow and add it to the `kfp_ray` directory. It is recommended to use the [pipeline generator](../kfp/pipeline_generator/single-pipeline/) for that. If the workflow was generated using the [pipeline generator](../kfp/pipeline_generator/single-pipeline/), also include the `pipeline_definitions.yaml` file that was used to generate it in the `kfp_ray` directory.

3) Add a `Makefile` to the `kfp_ray` directory, similar to [this Makefile example](./universal/noop/kfp_ray/Makefile).

4) Add the path to the transform input directory in the [populate_minio script](../scripts/k8s-setup/populate_minio.sh). This path is used when testing the workflow.
5) Create a GitHub Action for the KFP workflow by running the `make` command in the [.github/workflows/](../.github/workflows/README.md) directory.
6) Update the workflows list in the [README.md](../kfp/README.md) file.
48 changes: 48 additions & 0 deletions transforms/language/html2parquet/kfp_ray/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Repository root: four levels above the directory make is running in (CURDIR).
REPOROOT=${CURDIR}/../../../../
# Activation script of the virtual environment under transforms/venv; it is a
# prerequisite of workflow-venv below.
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
# Shared workflow rules, included after the two variables above are assigned.
# NOTE(review): presumably .make.workflows reads REPOROOT and
# WORKFLOW_VENV_ACTIVATE — confirm before reordering these lines.
include $(REPOROOT)/transforms/.make.workflows

# Include the common configuration for this transform
include ../transform.config

# The sibling ../ray/ directory; passed as TRANSFORM_SRC by workflow-test.
SRC_DIR=${CURDIR}/../ray/

# Every *_wf.py file found below this directory (evaluated once, at parse
# time), and the .yaml file name derived from each of them.
PYTHON_WF := $(shell find ./ -name '*_wf.py')
YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF})

# Aggregate target for the workflow virtual environment. It has no recipe in
# this Makefile and only pulls in .check_python_version and the venv activation
# script. Declared phony because it never produces a file named
# "workflow-venv"; without the declaration a stray file of that name could make
# the target look up to date.
.PHONY: workflow-venv
workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE}

# Remove the virtual environment directory that lives under transforms/.
.PHONY: clean
clean:
	@# Help: Clean up the virtual environment.
	rm -rf $(REPOROOT)/transforms/venv

# The double-colon rules below declare no prerequisites and no recipe in this
# Makefile, so they contribute nothing to these targets here.
# NOTE(review): presumably they exist so that the generic targets invoked
# across the transform's sub-directories also resolve in kfp_ray — confirm
# against the parent Makefile.
venv::

build::

test::

test-src::

test-image::

publish::

image::

load-image::

# workflow-build: once the workflow-venv prerequisites are satisfied, re-invoke
# make with every derived .yaml pipeline file as a goal.
# workflow-test: after workflow-build, run the .workflows.test-pipeline target
# on html2parquet_wf.yaml, passing the ../ray/ directory as TRANSFORM_SRC.
.PHONY: workflow-build workflow-test

workflow-build: workflow-venv
	$(MAKE) $(YAML_WF)

workflow-test: workflow-build
	$(MAKE) .workflows.test-pipeline \
		TRANSFORM_SRC=$(SRC_DIR) \
		PIPELINE_FILE=html2parquet_wf.yaml

# Upload every pipeline file listed in YAML_WF by invoking the
# .workflows.upload-pipeline target once per file.
# The loop stops at the first failing upload and propagates its exit status;
# a plain shell `for` loop would otherwise report only the status of the last
# iteration and hide earlier failures.
# NOTE(review): unlike workflow-test, this target does not depend on
# workflow-build, so the .yaml files must already exist — confirm this is
# intended.
.PHONY: workflow-upload
workflow-upload:
	@for file in $(YAML_WF); do \
		$(MAKE) .workflows.upload-pipeline PIPELINE_FILE="$$file" || exit $$?; \
	done
Loading

0 comments on commit 876336a

Please sign in to comment.