diff --git a/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py b/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py index 867c83198..d1479d794 100644 --- a/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py +++ b/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py @@ -11,6 +11,8 @@ run_code_to_parquet_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_code_quality_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_malware_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") +run_license_check_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") +run_header_cleanser_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_proglang_select_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_doc_id_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") run_exact_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml") @@ -21,6 +23,8 @@ proglang_select_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" code_quality_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest" malware_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest" +license_check_image = "quay.io/dataprep1/data-prep-kit/license_check-ray:latest" +header_cleanser_image = "quay.io/dataprep1/data-prep-kit/header-cleanser-ray:latest" doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest" ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:latest" fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest" @@ -37,6 +41,8 @@ def sample_code_ray_orchestrator( p1_orch_code_to_parquet_name: str = "code_2_parquet_wf", p1_orch_code_quality_name: str = "code_quality_wf", 
p1_orch_malware_name: str = "malware_wf", + p1_orch_license_check_name: str = "license_check_wf", + p1_orch_header_cleanser_name: str = "header_cleanser_wf", p1_orch_proglang_select_name: str = "proglang_select_wf", p1_orch_doc_id_name: str = "doc_id_wf", p1_orch_exact_dedup_name: str = "ededup_wf", @@ -167,16 +173,41 @@ def sample_code_ray_orchestrator( + '"}, "ray_head_options": {"image": "' + malware_image + '"}}', - # tokenization parameters - p10_name: str = "tokenization", + # license check step parameters + p10_name: str = "license_check", p10_skip: bool = False, - p10_tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer", - p10_tkn_doc_id_column: str = "document_id", - p10_tkn_doc_content_column: str = "contents", - p10_tkn_text_lang: str = "en", - p10_tkn_tokenizer_args: str = "cache_dir=/tmp/hf", - p10_tkn_chunk_size: int = 0, + p10_lc_license_column_name: str = "license", + p10_lc_licenses_file: str = "test/license_check/sample_approved_licenses.json", + # orchestrator + # overriding parameters p10_overriding_params: str = '{"ray_worker_options": {"image": "' + + license_check_image + + '"}, "ray_head_options": {"image": "' + + license_check_image + + '"}}', + # header cleanser step parameters + p11_name: str = "header_cleanser", + p11_skip: bool = False, + p11_header_cleanser_contents_column_name: str = "contents", + p11_header_cleanser_license: bool = True, + p11_header_cleanser_copyright: bool = True, + # orchestrator + # overriding parameters + p11_overriding_params: str = '{"ray_worker_options": {"image": "' + + header_cleanser_image + + '"}, "ray_head_options": {"image": "' + + header_cleanser_image + + '"}}', + # tokenization parameters + p12_name: str = "tokenization", + p12_skip: bool = False, + p12_tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer", + p12_tkn_doc_id_column: str = "document_id", + p12_tkn_doc_content_column: str = "contents", + p12_tkn_text_lang: str = "en", + p12_tkn_tokenizer_args: str = "cache_dir=/tmp/hf", + 
p12_tkn_chunk_size: int = 0, + p12_overriding_params: str = '{"ray_worker_options": {"image": "' + tokenizer_image + '"}, "ray_head_options": {"image": "' + tokenizer_image @@ -253,11 +284,24 @@ def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = No name=p1_orch_malware_name, prefix="p9_", params=args, host=orch_host, input_folder=code_quality.output ) _set_component(malware, "malware", code_quality) - # malware + + # license check + license_check = run_license_check_op( + name=p1_orch_license_check_name, prefix="p10_", params=args, host=orch_host, input_folder=malware.output + ) + _set_component(license_check, "license_check", malware) + + # header cleanser + header_cleanser = run_header_cleanser_op( + name=p1_orch_header_cleanser_name, prefix="p11_", params=args, host=orch_host, input_folder=license_check.output + ) + _set_component(header_cleanser, "header_cleanser", license_check) + + # tokenization tokenization = run_tokenization_op( - name=p1_orch_tokenization_wf_name, prefix="p10_", params=args, host=orch_host, input_folder=malware.output + name=p1_orch_tokenization_wf_name, prefix="p12_", params=args, host=orch_host, input_folder=header_cleanser.output ) - _set_component(tokenization, "tokenization", malware) + _set_component(tokenization, "tokenization", header_cleanser) # Configure the pipeline level to one week (in seconds) dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC)