Merge pull request #400 from IBM/licence-super
Add Licence transforms to Code Superpipeline.
roytman authored Oct 1, 2024
2 parents 83cec8f + 0fdaca1 commit afafbf1
Showing 1 changed file with 55 additions and 11 deletions.
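This commit wires two new transform steps, license_check and header_cleanser, into the code superpipeline: both are loaded from the same generic executeSubWorkflowComponent.yaml executor, get their own p10_/p11_ parameter blocks and Ray images, and are chained between the existing malware and tokenization steps, with the tokenization parameters moving to the p12_ prefix.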
kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py (66 changes: 55 additions & 11 deletions)
@@ -11,6 +11,8 @@
run_code_to_parquet_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_code_quality_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_malware_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_license_check_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_header_cleanser_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_proglang_select_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_doc_id_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
run_exact_dedup_op = comp.load_component_from_file(component_spec_path + "executeSubWorkflowComponent.yaml")
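For context (not part of the diff): every step op in this file is loaded from the same generic sub-workflow executor spec, so adding a transform only needs another load_component_from_file call plus its parameters further down. A minimal KFP v1 sketch, assuming component_spec_path is set earlier in the file to the directory holding the shared YAML:

import kfp.components as comp

# Assumption for illustration; the real value of component_spec_path is defined
# earlier in superworkflow_code_sample_wf.py, outside this hunk.
component_spec_path = "../kfp_ray_components/"

# Both new steps reuse the same generic sub-workflow executor component;
# only the call-time parameters (prefix, input folder, images) differ.
run_license_check_op = comp.load_component_from_file(
    component_spec_path + "executeSubWorkflowComponent.yaml"
)
run_header_cleanser_op = comp.load_component_from_file(
    component_spec_path + "executeSubWorkflowComponent.yaml"
)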
@@ -21,6 +23,8 @@
proglang_select_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest"
code_quality_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest"
malware_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest"
license_check_image = "quay.io/dataprep1/data-prep-kit/license_check-ray:latest"
header_cleanser_image = "quay.io/dataprep1/data-prep-kit/header-cleanser-ray:latest"
doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest"
ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:latest"
fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest"
@@ -37,6 +41,8 @@ def sample_code_ray_orchestrator(
p1_orch_code_to_parquet_name: str = "code_2_parquet_wf",
p1_orch_code_quality_name: str = "code_quality_wf",
p1_orch_malware_name: str = "malware_wf",
p1_orch_license_check_name: str = "license_check_wf",
p1_orch_header_cleanser_name: str = "header_cleanser_wf",
p1_orch_proglang_select_name: str = "proglang_select_wf",
p1_orch_doc_id_name: str = "doc_id_wf",
p1_orch_exact_dedup_name: str = "ededup_wf",
@@ -167,16 +173,41 @@ def sample_code_ray_orchestrator(
+ '"}, "ray_head_options": {"image": "'
+ malware_image
+ '"}}',
# tokenization parameters
p10_name: str = "tokenization",
# license check step parameters
p10_name: str = "license_check",
p10_skip: bool = False,
p10_tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer",
p10_tkn_doc_id_column: str = "document_id",
p10_tkn_doc_content_column: str = "contents",
p10_tkn_text_lang: str = "en",
p10_tkn_tokenizer_args: str = "cache_dir=/tmp/hf",
p10_tkn_chunk_size: int = 0,
p10_lc_license_column_name: str = "license",
p10_lc_licenses_file: str = "test/license_check/sample_approved_licenses.json",
# orchestrator
# overriding parameters
p10_overriding_params: str = '{"ray_worker_options": {"image": "'
+ license_check_image
+ '"}, "ray_head_options": {"image": "'
+ license_check_image
+ '"}}',
# header cleanser step parameters
p11_name: str = "header_cleanser",
p11_skip: bool = False,
p11_header_cleanser_contents_column_name: str = "contents",
p11_header_cleanser_license: bool = True,
p11_header_cleanser_copyright: bool = True,
# orchestrator
# overriding parameters
p11_overriding_params: str = '{"ray_worker_options": {"image": "'
+ header_cleanser_image
+ '"}, "ray_head_options": {"image": "'
+ header_cleanser_image
+ '"}}',
# tokenization parameters
p12_name: str = "tokenization",
p12_skip: bool = False,
p12_tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer",
p12_tkn_doc_id_column: str = "document_id",
p12_tkn_doc_content_column: str = "contents",
p12_tkn_text_lang: str = "en",
p12_tkn_tokenizer_args: str = "cache_dir=/tmp/hf",
p12_tkn_chunk_size: int = 0,
p12_overriding_params: str = '{"ray_worker_options": {"image": "'
+ tokenizer_image
+ '"}, "ray_head_options": {"image": "'
+ tokenizer_image
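The p10/p11/p12 *_overriding_params defaults above concatenate string fragments into a small JSON document that pins the Ray head and worker images for that step. A sketch of the equivalent value, shown with json.dumps purely for illustration (the file keeps the plain string concatenation so the default remains a literal expression in the function signature):

import json

license_check_image = "quay.io/dataprep1/data-prep-kit/license_check-ray:latest"

# Equivalent content to the concatenated default of p10_overriding_params:
p10_overriding_params = json.dumps(
    {
        "ray_worker_options": {"image": license_check_image},
        "ray_head_options": {"image": license_check_image},
    }
)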
@@ -253,11 +284,24 @@ def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = None
name=p1_orch_malware_name, prefix="p9_", params=args, host=orch_host, input_folder=code_quality.output
)
_set_component(malware, "malware", code_quality)
# malware

# license check
license_check = run_license_check_op(
name=p1_orch_license_check_name, prefix="p10_", params=args, host=orch_host, input_folder=malware.output
)
_set_component(license_check, "license_check", malware)

# header cleanser
header_cleanser = run_header_cleanser_op(
name=p1_orch_header_cleanser_name, prefix="p11_", params=args, host=orch_host, input_folder=license_check.output
)
_set_component(header_cleanser, "header_cleanser", license_check)

# tokenization
tokenization = run_tokenization_op(
name=p1_orch_tokenization_wf_name, prefix="p10_", params=args, host=orch_host, input_folder=malware.output
name=p1_orch_tokenization_wf_name, prefix="p12_", params=args, host=orch_host, input_folder=header_cleanser.output
)
_set_component(tokenization, "tokenization", malware)
_set_component(tokenization, "tokenization", header_cleanser)

# Configure the pipeline level to one week (in seconds)
dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC)
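Each new step follows the same chaining pattern: the op receives its parameter prefix and the previous step's output folder as input_folder, and _set_component records the display name and ordering. Below is a rough sketch of what such a helper typically does; only its signature is visible in the hunk header above, so the body is an assumption, not the repo's actual implementation:

from kfp import dsl

def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = None):
    # Assumption: give the step a readable name in the KFP UI.
    op.set_display_name(displaied_name)
    # Assumption: enforce sequential ordering on the previous step, if any.
    if prev_op is not None:
        op.after(prev_op)

With that in place, license_check consumes malware.output, header_cleanser consumes license_check.output, and tokenization now starts from header_cleanser.output instead of malware.output, as the hunk above shows.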