From 23ed7701f5dac241a77a20eba5e7d5e227c049ca Mon Sep 17 00:00:00 2001 From: Mohammad Nassar Date: Wed, 10 Jul 2024 07:16:07 -0500 Subject: [PATCH] Add licence transforms to code superpipeline. Signed-off-by: Mohammad Nassar --- .../kfp_v1/superworkflow_code_sample_wf.py | 66 +++++++++++++++---- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py b/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py index 7b83924e3..2937297d7 100644 --- a/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py +++ b/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py @@ -11,6 +11,8 @@ run_code_to_parquet_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_code_quality_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_malware_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") +run_license_check_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") +run_header_cleanser_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_proglang_select_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_doc_id_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_exact_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") @@ -21,6 +23,8 @@ proglang_select_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.2.1.dev0" code_quality_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.2.1.dev0" malware_image = "quay.io/dataprep1/data-prep-kit/malware-ray:0.2.1.dev0" +license_check_image = "quay.io/dataprep1/data-prep-kit/license_check-ray:0.4.0.dev6" +header_cleanser_image = "quay.io/dataprep1/data-prep-kit/header-cleanser-ray:0.2.1.dev0" 
doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.2.1.dev0" ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.2.1.dev0" fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.2.1.dev0" @@ -37,6 +41,8 @@ def sample_code_ray_orchestrator( p1_orch_code_to_parquet_name: str = "code_2_parquet_wf", p1_orch_code_quality_name: str = "code_quality_wf", p1_orch_malware_name: str = "malware_wf", + p1_orch_license_check_name: str = "license_check_wf", + p1_orch_header_cleanser_name: str = "header_cleanser_wf", p1_orch_proglang_select_name: str = "proglang_select_wf", p1_orch_doc_id_name: str = "doc_id_wf", p1_orch_exact_dedup_name: str = "ededup_wf", @@ -165,16 +171,41 @@ def sample_code_ray_orchestrator( + '"}, "ray_head_options": {"image": "' + malware_image + '"}}', - # tokenization parameters - p10_name: str = "tokenization", + # license check step parameters + p10_name: str = "license_check", p10_skip: bool = False, - p10_tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer", - p10_tkn_doc_id_column: str = "document_id", - p10_tkn_doc_content_column: str = "contents", - p10_tkn_text_lang: str = "en", - p10_tkn_tokenizer_args: str = "cache_dir=/tmp/hf", - p10_tkn_chunk_size: int = 0, + p10_lc_license_column_name: str = "license", + p10_lc_licenses_file: str = "test/license_check/sample_approved_licenses.json", + # orchestrator + # overriding parameters p10_overriding_params: str = '{"ray_worker_options": {"image": "' + + license_check_image + + '"}, "ray_head_options": {"image": "' + + license_check_image + + '"}}', + # header cleanser step parameters + p11_name: str = "header_cleanser", + p11_skip: bool = False, + p11_header_cleanser_contents_column_name: str = "contents", + p11_header_cleanser_license: bool = True, + p11_header_cleanser_copyright: bool = True, + # orchestrator + # overriding parameters + p11_overriding_params: str = '{"ray_worker_options": {"image": "' + + header_cleanser_image + + '"}, "ray_head_options": {"image": "' + 
+ header_cleanser_image + '"}}', # tokenization parameters p12_name: str = "tokenization", p12_skip: bool = False, p12_tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer", p12_tkn_doc_id_column: str = "document_id", p12_tkn_doc_content_column: str = "contents", p12_tkn_text_lang: str = "en", p12_tkn_tokenizer_args: str = "cache_dir=/tmp/hf", p12_tkn_chunk_size: int = 0, p12_overriding_params: str = '{"ray_worker_options": {"image": "' + tokenizer_image + '"}, "ray_head_options": {"image": "' + tokenizer_image @@ -251,11 +282,24 @@ def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = No name=p1_orch_malware_name, prefix="p9_", params=args, host=orch_host, input_folder=code_quality.output ) _set_component(malware, "malware", code_quality) - # malware + + # license check + license_check = run_license_check_op( + name=p1_orch_license_check_name, prefix="p10_", params=args, host=orch_host, input_folder=malware.output + ) + _set_component(license_check, "license_check", malware) + + # header cleanser + header_cleanser = run_header_cleanser_op( + name=p1_orch_header_cleanser_name, prefix="p11_", params=args, host=orch_host, input_folder=license_check.output + ) + _set_component(header_cleanser, "header_cleanser", license_check) + + # tokenization tokenization = run_tokenization_op( - name=p1_orch_tokenization_wf_name, prefix="p10_", params=args, host=orch_host, input_folder=malware.output + name=p1_orch_tokenization_wf_name, prefix="p12_", params=args, host=orch_host, input_folder=header_cleanser.output ) - _set_component(tokenization, "tokenization", malware) + _set_component(tokenization, "tokenization", header_cleanser) # Configure the pipeline level to one week (in seconds) dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC)