diff --git a/transforms/universal/hap/ray/Dockerfile b/transforms/universal/hap/ray/Dockerfile
new file mode 100644
index 000000000..42005e9ba
--- /dev/null
+++ b/transforms/universal/hap/ray/Dockerfile
@@ -0,0 +1,42 @@
+ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
+FROM ${BASE_IMAGE}
+
+RUN pip install --upgrade --no-cache-dir pip
+
+# install pytest
+RUN pip install --no-cache-dir pytest
+
+# Copy and install data processing libraries
+# These are expected to be placed in the docker context before this is run (see the make image).
+COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/
+RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
+COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/
+RUN cd data-processing-lib-ray && pip install --no-cache-dir -e .
+COPY --chown=ray:users python-transform/ python-transform/
+RUN cd python-transform && pip install --no-cache-dir -e .
+
+# pyproject.toml declares its dependencies dynamically from requirements.txt,
+# so the file must be in the image before the package is installed.
+COPY --chown=ray:users requirements.txt requirements.txt
+COPY --chown=ray:users src/ src/
+COPY --chown=ray:users pyproject.toml pyproject.toml
+RUN pip install --no-cache-dir -e .
+
+# copy the main() entry point to the image
+COPY ./src/hap_transform_ray.py .
+
+# copy some of the samples in
+COPY ./src/hap_local_ray.py local/
+
+# copy test
+COPY test/ test/
+COPY test-data/ test-data/
+
+# Set environment
+ENV PYTHONPATH=/home/ray
+
+# Put these at the end since they seem to upset the docker cache.
+ARG BUILD_DATE
+ARG GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+LABEL git-commit=$GIT_COMMIT
diff --git a/transforms/universal/hap/ray/Makefile b/transforms/universal/hap/ray/Makefile
new file mode 100644
index 000000000..af5d50348
--- /dev/null
+++ b/transforms/universal/hap/ray/Makefile
@@ -0,0 +1,57 @@
+# Define the root of the local git clone for the common rules to be able
+# to know where they are running from.
+REPOROOT=../../../..
+# Include a library of common .transform.* targets which most
+# transforms should be able to reuse. However, feel free
+# to override/redefine the rules below.
+include $(REPOROOT)/transforms/.make.transforms
+
+TRANSFORM_NAME=hap
+
+BASE_IMAGE=${RAY_BASE_IMAGE}
+HAP_PYTHON_VERSION= $(DPK_VERSION)
+
+venv:: .transforms.ray-venv
+
+# The command must be a recipe line (tab-indented); on the target line its
+# words would be parsed as prerequisites.
+install::
+	pip install -r requirements.txt
+
+test:: .transforms.ray-test
+
+clean:: .transforms.clean
+
+image:: .transforms.ray-image
+
+test-src:: .transforms.test-src
+
+setup:: .transforms.setup
+
+test-image:: .transforms.ray-test-image
+
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-ray
+
+# The distribution version is the same as the image version.
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=$(HAP_PYTHON_VERSION) TOML_VERSION=$(HAP_PYTHON_VERSION) .transforms.set-versions
+
+build-dist:: set-versions .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+run-cli-sample: .transforms.run-cli-ray-sample
+
+run-local-sample: .transforms.run-local-ray-sample
+
+run-s3-sample: .transforms.run-s3-ray-sample
+
+minio-start: .minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/hap/ray/README.md b/transforms/universal/hap/ray/README.md
new file mode 100644
index 000000000..486ac903f
--- /dev/null
+++ b/transforms/universal/hap/ray/README.md
@@ -0,0 +1,20 @@
+# Hate, Abuse, and Profanity (HAP) Annotation
+# HAP Transform for Ray
+Please see the set of
+[transform project conventions](../../../README.md#transform-project-conventions)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+This project wraps the [hap transform](../python) with a Ray runtime.
+
+## Configuration and Command Line Options
+
+Configuration and command line options are the same as for the base python transform.
+
+## Running
+
+### Launcher Command Line Options
+In addition to the options available to the transform, as defined [here](../python/README.md),
+the set of
+[ray launcher options](../../../../data-processing-lib/doc/ray-launcher-options.md) is also available.
diff --git a/transforms/universal/hap/ray/output/metadata.json b/transforms/universal/hap/ray/output/metadata.json
new file mode 100644
index 000000000..062fee162
--- /dev/null
+++ b/transforms/universal/hap/ray/output/metadata.json
@@ -0,0 +1,50 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "hap",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-03 21:38:20",
+        "end_time": "2024-10-03 21:38:29",
+        "status": "success"
+    },
+    "code": {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path"
+    },
+    "job_input_params": {
+        "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
+        "annotation_column": "hap_score",
+        "doc_text_column": "contents",
+        "inference_engine": "CPU",
+        "max_length": 512,
+        "batch_size": 128,
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [
+            ".parquet"
+        ],
+        "num_processors": 0
+    },
+    "job_output_stats": {
+        "source_files": 2,
+        "source_size": 12124594,
+        "transform execution exception": 1,
+        "result_files": 1,
+        "result_size": 79822,
+        "processing_time": 6.932,
+        "source_doc_count": 50,
+        "result_doc_count": 50
+    },
+    "source": {
+        "name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/test-data/input",
+        "type": "path"
+    },
+    "target": {
+        "name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/output",
+        "type": "path"
+    }
+}
\ No newline at end of file
diff --git a/transforms/universal/hap/ray/output/test1.parquet b/transforms/universal/hap/ray/output/test1.parquet
new file mode 100644
index
000000000..c9483e34d Binary files /dev/null and b/transforms/universal/hap/ray/output/test1.parquet differ diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml new file mode 100644 index 000000000..ff3fc05f0 --- /dev/null +++ b/transforms/universal/hap/ray/pyproject.toml @@ -0,0 +1,48 @@ +[project] +name = "dpk_hap_transform_ray" +version = "0.2.2.dev0" +requires-python = ">=3.10" +description = "HAP Ray Transform" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ +{ name = "Ian Cho", email = "iancho.mr@gmail.com" }, +] +dynamic = ["dependencies"] + + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", + "moto==5.0.5", + "markupsafe==2.0.1", +] + + + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt new file mode 100644 index 000000000..36c2b81af --- /dev/null +++ b/transforms/universal/hap/ray/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit-ray==0.2.2.dev0 +dpk-hap-transform-python==0.2.2.dev0 +nltk==3.9.1 +transformers==4.38.2 +torch==2.4.1 +pandas==2.2.2 diff --git a/transforms/universal/hap/ray/src/__init__.py b/transforms/universal/hap/ray/src/__init__.py new file mode 100644 index 
000000000..e69de29bb
diff --git a/transforms/universal/hap/ray/src/hap_local_ray.py b/transforms/universal/hap/ray/src/hap_local_ray.py
new file mode 100644
index 000000000..e2f4d6e81
--- /dev/null
+++ b/transforms/universal/hap/ray/src/hap_local_ray.py
@@ -0,0 +1,59 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+"""Sample: run the HAP Ray transform locally on the files in ``../test-data/input``."""
+
+import ast
+import os
+import sys
+
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from hap_transform_ray import HAPRayTransformConfiguration
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+
+params = {
+    # where to run
+    "run_locally": True,
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+}
+
+
+hap_params = {
+    "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
+    "annotation_column": "hap_score",
+    "doc_text_column": "contents",
+    "inference_engine": "CPU",
+    "max_length": 512,
+    "batch_size": 128,
+}
+
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params | hap_params)
+    # create launcher
+    launcher = RayTransformLauncher(HAPRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/hap/ray/src/hap_s3_ray.py b/transforms/universal/hap/ray/src/hap_s3_ray.py
new file mode 100644
index 000000000..fceae0b7c
--- /dev/null
+++ b/transforms/universal/hap/ray/src/hap_s3_ray.py
@@ -0,0 +1,68 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+"""Sample: run the HAP Ray transform on S3 data served from ``http://localhost:9000``."""
+
+import os
+import sys
+
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from hap_transform_ray import HAPRayTransformConfiguration
+
+
+# create parameters
+s3_cred = {
+    "access_key": "localminioaccesskey",
+    "secret_key": "localminiosecretkey",
+    "url": "http://localhost:9000",
+}
+
+s3_conf = {
+    "input_folder": "test/hap/input",
+    "output_folder": "test/hap/output",
+}
+worker_options = {"num_cpus": 0.8}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # where to run
+    "run_locally": True,
+    # Data access. Only required parameters are specified
+    "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred),
+    "data_s3_config": ParamsUtils.convert_to_ast(s3_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 3,
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+}
+
+
+hap_params = {
+    "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
+    "annotation_column": "hap_score",
+    "doc_text_column": "contents",
+    "inference_engine": "CPU",
+    "max_length": 512,
+    "batch_size": 128,
+}
+
+
+if __name__ == "__main__":
+    # create launcher
+    launcher = RayTransformLauncher(HAPRayTransformConfiguration())
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params | hap_params)
+    # launch only when run as a script, so importing this module has no side effects
+    launcher.launch()
diff --git a/transforms/universal/hap/ray/src/hap_transform_ray.py b/transforms/universal/hap/ray/src/hap_transform_ray.py
new file mode 100644
index 000000000..f01fffef5
--- /dev/null
+++ b/transforms/universal/hap/ray/src/hap_transform_ray.py
@@ -0,0 +1,39 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.utils import get_logger
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from data_processing_ray.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
+from hap_transform import HAPTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class HAPRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Ray runtime configuration for the HAP transform, as required by the RayTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Wrap a freshly created HAPTransformConfiguration; this constructor takes no arguments.
+        """
+        base_configuration = HAPTransformConfiguration()
+        super().__init__(transform_config=base_configuration)
+
+
+if __name__ == "__main__":
+    runtime_config = HAPRayTransformConfiguration()
+    launcher = RayTransformLauncher(runtime_config)
+    logger.info("Launching hap transform")
+    launcher.launch()
diff --git a/transforms/universal/hap/ray/test-data/expected/metadata.json b/transforms/universal/hap/ray/test-data/expected/metadata.json
new file mode 100644
index 000000000..062fee162
--- /dev/null
+++ b/transforms/universal/hap/ray/test-data/expected/metadata.json
@@ -0,0 +1,50 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "hap",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-03 21:38:20",
+        "end_time": "2024-10-03 21:38:29",
+        "status": "success"
+    },
+    "code": {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path"
+    },
+    "job_input_params": {
+        "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
+        "annotation_column": "hap_score",
+        "doc_text_column": "contents",
+        "inference_engine": "CPU",
+        "max_length": 512,
+        "batch_size": 128,
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [
+            ".parquet"
+        ],
+        "num_processors": 0
+    },
+    "job_output_stats": {
+        "source_files": 2,
+        "source_size": 12124594,
+        "transform execution exception": 1,
"result_files": 1, + "result_size": 79822, + "processing_time": 6.932, + "source_doc_count": 50, + "result_doc_count": 50 + }, + "source": { + "name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/universal/hap/ray/test-data/expected/test1.parquet b/transforms/universal/hap/ray/test-data/expected/test1.parquet new file mode 100644 index 000000000..c9483e34d Binary files /dev/null and b/transforms/universal/hap/ray/test-data/expected/test1.parquet differ diff --git a/transforms/universal/hap/ray/test-data/input/test1.parquet b/transforms/universal/hap/ray/test-data/input/test1.parquet new file mode 100644 index 000000000..5e2f5fe9d Binary files /dev/null and b/transforms/universal/hap/ray/test-data/input/test1.parquet differ diff --git a/transforms/universal/hap/ray/test/test_hap_ray.py b/transforms/universal/hap/ray/test/test_hap_ray.py new file mode 100644 index 000000000..232e1f4ae --- /dev/null +++ b/transforms/universal/hap/ray/test/test_hap_ray.py @@ -0,0 +1,40 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+import os
+
+from data_processing.test_support.launch.transform_test import AbstractTransformLauncherTest
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from hap_transform_ray import HAPRayTransformConfiguration
+
+
+# Launcher flag plus the HAP transform parameters passed to the test fixture.
+hap_params = {
+    "run_locally": True,
+    "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
+    "annotation_column": "hap_score",
+    "doc_text_column": "contents",
+    "inference_engine": "CPU",
+    "max_length": 512,
+    "batch_size": 128,
+}
+
+
+class TestRayHAPTransform(AbstractTransformLauncherTest):
+    """
+    Supplies the HAP test data to the launcher tests defined in the super-class.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        test_data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        launcher = RayTransformLauncher(HAPRayTransformConfiguration())
+        return [(launcher, hap_params, test_data_dir + "/input", test_data_dir + "/expected")]