Skip to content

Commit

Permalink
Merge pull request #685 from IBM/ian-cho-patch-1
Browse files Browse the repository at this point in the history
Added ray-based version of hap transform
  • Loading branch information
touma-I authored Oct 11, 2024
2 parents 2869eea + e2a6d28 commit c3b0996
Show file tree
Hide file tree
Showing 15 changed files with 477 additions and 0 deletions.
42 changes: 42 additions & 0 deletions transforms/universal/hap/ray/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Image for the Ray flavour of the HAP transform.
# The base image can be overridden at build time with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
FROM ${BASE_IMAGE}

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/
RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/
RUN cd data-processing-lib-ray && pip install --no-cache-dir -e .
# The pure-python HAP transform that this Ray wrapper builds on.
COPY --chown=ray:users python-transform/ python-transform/
RUN cd python-transform && pip install --no-cache-dir -e .

# NOTE(review): the explicit requirements install below is disabled and requirements.txt is
# not copied into the image. Confirm that the editable installs above provide every runtime
# dependency this transform needs before removing or re-enabling these lines.
#COPY requirements.txt requirements.txt
#RUN pip install --no-cache-dir -r requirements.txt

# Install this project (the Ray wrapper) in editable mode.
COPY --chown=ray:users src/ src/
COPY --chown=ray:users pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/hap_transform_ray.py .

# copy some of the samples in
COPY ./src/hap_local_ray.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
# Use the KEY=value form: the whitespace-separated "ENV KEY value" form is legacy syntax
# and triggers a LegacyKeyValueFormat build warning.
ENV PYTHONPATH=/home/ray

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
58 changes: 58 additions & 0 deletions transforms/universal/hap/ray/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Define the root of the local git clone for the common rules to be able
# know where they are running from.
REPOROOT=../../../..
# Include a library of common .transform.* targets which most
# transforms should be able to reuse.  However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=hap

BASE_IMAGE=${RAY_BASE_IMAGE}
HAP_PYTHON_VERSION= $(DPK_VERSION)

venv:: .transforms.ray-venv

# Install the pinned requirements. The command must be a (tab-indented) recipe line:
# written on the rule header it is parsed as a list of prerequisites
# ("pip", "install", "-r", "requirements.txt") and is never executed.
install::
	pip install -r requirements.txt

test:: .transforms.ray-test

clean:: .transforms.clean

image:: .transforms.ray-image

test-src:: .transforms.test-src

setup:: .transforms.setup

test-image:: .transforms.ray-test-image

build:: build-dist image

publish: publish-image

publish-image:: .transforms.publish-image-ray

# distribution versions is the same as image version.
# $(MAKE) (rather than a literal "make") is used for the recursive invocation.
set-versions:
	$(MAKE) TRANSFORM_PYTHON_VERSION=$(HAP_PYTHON_VERSION) TOML_VERSION=$(HAP_PYTHON_VERSION) .transforms.set-versions

build-dist:: set-versions .defaults.build-dist

publish-dist:: .defaults.publish-dist

run-cli-sample: .transforms.run-cli-ray-sample

run-local-sample: .transforms.run-local-ray-sample

run-s3-sample: .transforms.run-s3-ray-sample

minio-start:	.minio-start

kind-load-image:: .transforms.kind-load-image

docker-load-image: .defaults.docker-load-image

docker-save-image: .defaults.docker-save-image
20 changes: 20 additions & 0 deletions transforms/universal/hap/ray/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Hate, Abuse, and Profanity (HAP) Annotation
# HAP Transform for Ray
Please see the set of
[transform project conventions](../../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary
This project wraps the [hap transform](../python) with a Ray runtime.

## Configuration and command line Options

Configuration and command line options are the same as for the base python transform.

## Running

### Launcher Command Line Options
In addition to the options available to the transform, as defined [here](../python/README.md),
the set of
[ray launcher options](../../../../data-processing-lib/doc/ray-launcher-options.md) is available.
50 changes: 50 additions & 0 deletions transforms/universal/hap/ray/output/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "hap",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-03 21:38:20",
"end_time": "2024-10-03 21:38:29",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
"annotation_column": "hap_score",
"doc_text_column": "contents",
"inference_engine": "CPU",
"max_length": 512,
"batch_size": 128,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 2,
"source_size": 12124594,
"transform execution exception": 1,
"result_files": 1,
"result_size": 79822,
"processing_time": 6.932,
"source_doc_count": 50,
"result_doc_count": 50
},
"source": {
"name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/output",
"type": "path"
}
}
Binary file not shown.
48 changes: 48 additions & 0 deletions transforms/universal/hap/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Packaging metadata for the Ray flavour of the HAP transform.
[project]
name = "dpk_hap_transform_ray"
version = "0.2.2.dev0"
requires-python = ">=3.10"
description = "HAP Ray Transform"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Ian Cho", email = "[email protected]" },
]
# Runtime dependencies are not listed here; they are declared dynamic and read
# from requirements.txt (see [tool.setuptools.dynamic] below).
dynamic = ["dependencies"]


[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}

# Extra packages for development and testing (the "dev" extra).
[project.optional-dependencies]
dev = [
"twine",
"pytest>=7.3.2",
"pytest-dotenv>=0.5.2",
"pytest-env>=1.0.0",
"pre-commit>=3.3.2",
"pytest-cov>=4.1.0",
"pytest-mock>=3.10.0",
"moto==5.0.5",
"markupsafe==2.0.1",
]



# NOTE(review): [options] and [options.packages.find] look like setup.cfg section names.
# In pyproject.toml, setuptools package discovery is normally configured under
# [tool.setuptools.packages.find] - confirm whether these two tables have any effect.
[options]
package_dir = ["src","test"]

[options.packages.find]
where = ["src/"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

# Restrict coverage measurement to the sources under src/.
[tool.coverage.run]
include = ["src/*"]
6 changes: 6 additions & 0 deletions transforms/universal/hap/ray/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
data-prep-toolkit-ray==0.2.2.dev0
dpk-hap-transform-python==0.2.2.dev0
nltk==3.9.1
transformers==4.38.2
torch==2.4.1
pandas==2.2.2
Empty file.
60 changes: 60 additions & 0 deletions transforms/universal/hap/ray/src/hap_local_ray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################


import ast
import os
import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from hap_transform_ray import HAPRayTransformConfiguration


# Input and output folders of the sample, resolved relative to this file so the
# script does not depend on the current working directory.
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
local_conf = {"input_folder": input_folder, "output_folder": output_folder}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}

# Launcher / runtime parameters; dictionary values are passed in their
# ParamsUtils.convert_to_ast() form.
params = {
    # run on a local Ray instance
    "run_locally": True,
    # local data access configuration
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    # job identification recorded by the runtime
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
}

# Parameters specific to the HAP transform.
hap_params = {
    "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
    "annotation_column": "hap_score",
    "doc_text_column": "contents",
    "inference_engine": "CPU",
    "max_length": 512,
    "batch_size": 128,
}


if __name__ == "__main__":
    # Simulate the command line from the merged parameter sets
    # (on a key clash the HAP value wins, as with ``params | hap_params``).
    sys.argv = ParamsUtils.dict_to_req(d={**params, **hap_params})
    # Build the Ray launcher for the HAP transform and run it on the input.
    RayTransformLauncher(HAPRayTransformConfiguration()).launch()
64 changes: 64 additions & 0 deletions transforms/universal/hap/ray/src/hap_s3_ray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import os
import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from hap_transform_ray import HAPRayTransformConfiguration


# Sample that runs the HAP transform on Ray against an S3-compatible store.
# The endpoint and credentials below are the sample's local values
# (see "url"); adjust them for a different store.
s3_cred = {
    "access_key": "localminioaccesskey",
    "secret_key": "localminiosecretkey",
    "url": "http://localhost:9000",
}

# Input and output locations inside the store.
s3_conf = {
    "input_folder": "test/hap/input",
    "output_folder": "test/hap/output",
}
worker_options = {"num_cpus": 0.8}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
params = {
    # where to run
    "run_locally": True,
    # Data access. Only required parameters are specified
    "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred),
    "data_s3_config": ParamsUtils.convert_to_ast(s3_conf),
    # orchestrator
    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
    "runtime_num_workers": 3,
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_creation_delay": 0,
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
}


# Parameters specific to the HAP transform.
hap_params = {
    "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m',
    "annotation_column": "hap_score",
    "doc_text_column": "contents",
    "inference_engine": "CPU",
    "max_length": 512,
    "batch_size": 128,
}


# Only launch when executed as a script. Without this guard, merely importing the
# module would overwrite sys.argv and start a Ray job (hap_local_ray.py uses the
# same guard).
if __name__ == "__main__":
    # create launcher
    launcher = RayTransformLauncher(HAPRayTransformConfiguration())
    # Set the simulated command line args
    sys.argv = ParamsUtils.dict_to_req(d=params | hap_params)
    # launch
    launcher.launch()
39 changes: 39 additions & 0 deletions transforms/universal/hap/ray/src/hap_transform_ray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

from data_processing.utils import get_logger
from data_processing_ray.runtime.ray import RayTransformLauncher
from data_processing_ray.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
from hap_transform import HAPTransformConfiguration


# Module-level logger, obtained from the data_processing utilities under this module's name.
logger = get_logger(__name__)


class HAPRayTransformConfiguration(RayTransformRuntimeConfiguration):
    """
    Ray runtime configuration for the HAP transform, in the form required by
    the RayTransformLauncher.
    """

    def __init__(self):
        """
        Build the Ray runtime configuration around a newly created
        HAPTransformConfiguration. Takes no arguments.
        """
        hap_config = HAPTransformConfiguration()
        super().__init__(transform_config=hap_config)


if __name__ == "__main__":
    # Script entry point: wrap the HAP Ray configuration in a launcher,
    # log the start, then hand control to the launcher.
    runtime_config = HAPRayTransformConfiguration()
    launcher = RayTransformLauncher(runtime_config)
    logger.info("Launching hap transform")
    launcher.launch()

Loading

0 comments on commit c3b0996

Please sign in to comment.