From d8b9be8531368657d2fb09973b8df486ed93c620 Mon Sep 17 00:00:00 2001 From: ian-cho <42691703+ian-cho@users.noreply.github.com> Date: Thu, 3 Oct 2024 22:16:51 +0900 Subject: [PATCH] changed doc_text into contents in related files The column name `doc_text` is changed into `contents` in all relevant scripts and parquet files --- .../universal/hap/python/output/metadata.json | 13 +++++++------ .../universal/hap/python/output/test1.parquet | Bin 79822 -> 79822 bytes .../universal/hap/python/src/hap_local.py | 2 +- .../hap/python/src/hap_local_python.py | 2 +- .../universal/hap/python/src/hap_transform.py | 12 ++++++------ .../python/test-data/expected/metadata.json | 13 +++++++------ .../python/test-data/expected/test1.parquet | Bin 79822 -> 79822 bytes .../hap/python/test-data/input/test1.parquet | Bin 109303 -> 109303 bytes .../universal/hap/python/test/test_hap.py | 2 +- 9 files changed, 23 insertions(+), 21 deletions(-) diff --git a/transforms/universal/hap/python/output/metadata.json b/transforms/universal/hap/python/output/metadata.json index 6627fabb9..062fee162 100644 --- a/transforms/universal/hap/python/output/metadata.json +++ b/transforms/universal/hap/python/output/metadata.json @@ -5,8 +5,8 @@ "job name": "hap", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-25 00:47:58", - "end_time": "2024-09-25 00:48:06", + "start_time": "2024-10-03 21:38:20", + "end_time": "2024-10-03 21:38:29", "status": "success" }, "code": { @@ -17,7 +17,7 @@ "job_input_params": { "model_name_or_path": "ibm-granite/granite-guardian-hap-38m", "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, @@ -30,11 +30,12 @@ "num_processors": 0 }, "job_output_stats": { - "source_files": 1, - "source_size": 109303, + "source_files": 2, + "source_size": 12124594, + "transform execution exception": 1, "result_files": 1, "result_size": 79822, - "processing_time": 6.543, + "processing_time": 6.932, "source_doc_count": 50, "result_doc_count": 50 }, diff --git a/transforms/universal/hap/python/output/test1.parquet b/transforms/universal/hap/python/output/test1.parquet index 8ac5be443d311740b8b74296fb4a02e15eb50ebc..c9483e34d47dd71af90b1a6694c55fb01ea95453 100644 GIT binary patch delta 171 zcmX^2p5@$omWC~i#}qk|^Ycnl^Gb@hGiotLux?jVXS8F23r^3`WbB3WKuTQLm0>!R zbQF}*GE;L>;`0)7QnlyMCslEU)oisp=K0maogvj6}9 delta 155 zcmX^2p5@$omWC~i#}qkI@{{9BQY%WfGiotLux?jVXS8F23r^3`WbB3WKuTQL;oRx> z3>d{&L3)*@XBsd%OqVxeWZk~jkkOKvJ3Gn9FD=DA%p}Em`bQH+37(*`NJC45sE7;$ a&&=rujTu9?XPGjtVMJ2sI9<`4aV-EPE;6|Q diff --git a/transforms/universal/hap/python/src/hap_local.py b/transforms/universal/hap/python/src/hap_local.py index 89140fd74..220eea19b 100644 --- a/transforms/universal/hap/python/src/hap_local.py +++ b/transforms/universal/hap/python/src/hap_local.py @@ -24,7 +24,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, diff --git a/transforms/universal/hap/python/src/hap_local_python.py b/transforms/universal/hap/python/src/hap_local_python.py index 9a268803e..8e79dc583 100644 --- a/transforms/universal/hap/python/src/hap_local_python.py +++ b/transforms/universal/hap/python/src/hap_local_python.py @@ -37,7 +37,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, diff --git a/transforms/universal/hap/python/src/hap_transform.py b/transforms/universal/hap/python/src/hap_transform.py index 71bad2acb..e6a48cf86 100644 --- a/transforms/universal/hap/python/src/hap_transform.py +++ b/transforms/universal/hap/python/src/hap_transform.py @@ -27,11 +27,11 @@ class HAPTransform(AbstractTableTransform): def __init__(self, config: dict[str, Any]): super().__init__(config) - self.model_name_or_path = config.get("model_name_or_path") - self.annotation_column = config.get("annotation_column") - self.doc_text_column = config.get("doc_text_column") - self.max_length = config.get("max_length") - self.batch_size = config.get("batch_size") + self.model_name_or_path = config.get("model_name_or_path", "ibm-granite/granite-guardian-hap-38m") + self.annotation_column = config.get("annotation_column", "hap_score") + self.doc_text_column = config.get("doc_text_column", "contents") + self.max_length = config.get("max_length", 512) + self.batch_size = config.get("batch_size", 128) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path) self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path) @@ -70,7 +70,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab :param table: Pyarrow table :return: a table with an additional hap_score column """ - # make sure that the table contains "doc_text" column + # make sure that the table contains "contents" column TransformUtils.validate_columns(table=table, required=[self.doc_text_column]) self.df = table.to_pandas() df_doc_list = [] diff --git a/transforms/universal/hap/python/test-data/expected/metadata.json b/transforms/universal/hap/python/test-data/expected/metadata.json index 1e5f710db..062fee162 100644 --- a/transforms/universal/hap/python/test-data/expected/metadata.json +++ b/transforms/universal/hap/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "hap", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-26 20:56:49", - "end_time": "2024-09-26 20:56:56", + "start_time": "2024-10-03 21:38:20", + "end_time": "2024-10-03 21:38:29", "status": "success" }, "code": { @@ -17,7 +17,7 @@ "job_input_params": { "model_name_or_path": "ibm-granite/granite-guardian-hap-38m", "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, @@ -30,11 +30,12 @@ "num_processors": 0 }, "job_output_stats": { - "source_files": 1, - "source_size": 109303, + "source_files": 2, + "source_size": 12124594, + "transform execution exception": 1, "result_files": 1, "result_size": 79822, - "processing_time": 6.501, + "processing_time": 6.932, "source_doc_count": 50, "result_doc_count": 50 }, diff --git a/transforms/universal/hap/python/test-data/expected/test1.parquet b/transforms/universal/hap/python/test-data/expected/test1.parquet index 8ac5be443d311740b8b74296fb4a02e15eb50ebc..c9483e34d47dd71af90b1a6694c55fb01ea95453 100644 GIT binary patch delta 171 zcmX^2p5@$omWC~i#}qk|^Ycnl^Gb@hGiotLux?jVXS8F23r^3`WbB3WKuTQLm0>!R zbQF}*GE;L>;`0)7QnlyMCslEU)oisp=K0maogvj6}9 delta 155 zcmX^2p5@$omWC~i#}qkI@{{9BQY%WfGiotLux?jVXS8F23r^3`WbB3WKuTQL;oRx> z3>d{&L3)*@XBsd%OqVxeWZk~jkkOKvJ3Gn9FD=DA%p}Em`bQH+37(*`NJC45sE7;$ a&&=rujTu9?XPGjtVMJ2sI9<`4aV-EPE;6|Q diff --git a/transforms/universal/hap/python/test-data/input/test1.parquet b/transforms/universal/hap/python/test-data/input/test1.parquet index 8a3468009e1f012d50dc9f0d9bf437926d048867..5e2f5fe9d5547448a8d2ff3ec3b5b5c51e575455 100644 GIT binary patch delta 177 zcmex-(j%98Sea)7vMyY@;(2}bTnBg@hh_ps6w58vs2s~A@T0IY^W A?f?J) delta 177 zcmexROj4XZi#?saaudVD zilf{tOC!S#b3C*1(>-&8$|4Od4Wc4447O{pWRzg!j&irmiZBk!N%aVv?zf6@B>-=i BLB0S0 diff --git a/transforms/universal/hap/python/test/test_hap.py b/transforms/universal/hap/python/test/test_hap.py index 3f2a25e53..82ac5dc06 100644 --- a/transforms/universal/hap/python/test/test_hap.py +++ b/transforms/universal/hap/python/test/test_hap.py @@ -19,7 +19,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128,