From d8b9be8531368657d2fb09973b8df486ed93c620 Mon Sep 17 00:00:00 2001
From: ian-cho <42691703+ian-cho@users.noreply.github.com>
Date: Thu, 3 Oct 2024 22:16:51 +0900
Subject: [PATCH] changed doc_text into contents in related files

Rename the `doc_text` column to `contents` in all relevant scripts and parquet files.
Also give the `config.get` lookups in `HAPTransform.__init__` explicit default values.
---
 .../universal/hap/python/output/metadata.json |  13 +++++++------
 .../universal/hap/python/output/test1.parquet | Bin 79822 -> 79822 bytes
 .../universal/hap/python/src/hap_local.py     |   2 +-
 .../hap/python/src/hap_local_python.py        |   2 +-
 .../universal/hap/python/src/hap_transform.py |  12 ++++++------
 .../python/test-data/expected/metadata.json   |  13 +++++++------
 .../python/test-data/expected/test1.parquet   | Bin 79822 -> 79822 bytes
 .../hap/python/test-data/input/test1.parquet  | Bin 109303 -> 109303 bytes
 .../universal/hap/python/test/test_hap.py     |   2 +-
 9 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/transforms/universal/hap/python/output/metadata.json b/transforms/universal/hap/python/output/metadata.json
index 6627fabb9..062fee162 100644
--- a/transforms/universal/hap/python/output/metadata.json
+++ b/transforms/universal/hap/python/output/metadata.json
@@ -5,8 +5,8 @@
     "job name": "hap",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-09-25 00:47:58",
-    "end_time": "2024-09-25 00:48:06",
+    "start_time": "2024-10-03 21:38:20",
+    "end_time": "2024-10-03 21:38:29",
     "status": "success"
   },
   "code": {
@@ -17,7 +17,7 @@
   "job_input_params": {
     "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
     "annotation_column": "hap_score",
-    "doc_text_column": "doc_text",
+    "doc_text_column": "contents",
     "inference_engine": "CPU",
     "max_length": 512,
     "batch_size": 128,
@@ -30,11 +30,12 @@
     "num_processors": 0
   },
   "job_output_stats": {
-    "source_files": 1,
-    "source_size": 109303,
+    "source_files": 2,
+    "source_size": 12124594,
+    "transform execution exception": 1,
     "result_files": 1,
     "result_size": 79822,
-    "processing_time": 6.543,
+    "processing_time": 6.932,
     "source_doc_count": 50,
     "result_doc_count": 50
   },
diff --git a/transforms/universal/hap/python/output/test1.parquet b/transforms/universal/hap/python/output/test1.parquet
index 8ac5be443d311740b8b74296fb4a02e15eb50ebc..c9483e34d47dd71af90b1a6694c55fb01ea95453 100644
GIT binary patch
delta 171
zcmX^2p5@$omWC~i#}qk|^Ycnl^Gb@hGiotLux?jVXS8F23r^3`WbB3WKuTQLm0>!R
zbQF}*GE;L>;`0)7Q<ba~l;AScGYuFWrpp^KvTk2%$Y{yTot0!{Y7iA}YLHwx{i6w^
k1dm@?QeIFF5LbC-PCsbO7`i>nlyMCslEU)oisp=K0maogvj6}9

delta 155
zcmX^2p5@$omWC~i#}qkI@{{9BQY%WfGiotLux?jVXS8F23r^3`WbB3WKuTQL;oRx>
z3>d{&L3)*@XBsd%OqVxeWZk~jkkOKvJ3Gn9FD=DA%p}Em`bQH+37(*`NJC45sE7;$
a&&=rujTu9?XPGjtVMJ2sI9<`4aV-EPE;6|Q

diff --git a/transforms/universal/hap/python/src/hap_local.py b/transforms/universal/hap/python/src/hap_local.py
index 89140fd74..220eea19b 100644
--- a/transforms/universal/hap/python/src/hap_local.py
+++ b/transforms/universal/hap/python/src/hap_local.py
@@ -24,7 +24,7 @@
 hap_params = {
     "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m',
     "annotation_column": "hap_score",
-    "doc_text_column": "doc_text",
+    "doc_text_column": "contents",
     "inference_engine": "CPU",
     "max_length": 512,
     "batch_size": 128,
diff --git a/transforms/universal/hap/python/src/hap_local_python.py b/transforms/universal/hap/python/src/hap_local_python.py
index 9a268803e..8e79dc583 100644
--- a/transforms/universal/hap/python/src/hap_local_python.py
+++ b/transforms/universal/hap/python/src/hap_local_python.py
@@ -37,7 +37,7 @@
 hap_params = {
     "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m',
     "annotation_column": "hap_score",
-    "doc_text_column": "doc_text",
+    "doc_text_column": "contents",
     "inference_engine": "CPU",
     "max_length": 512,
     "batch_size": 128,
diff --git a/transforms/universal/hap/python/src/hap_transform.py b/transforms/universal/hap/python/src/hap_transform.py
index 71bad2acb..e6a48cf86 100644
--- a/transforms/universal/hap/python/src/hap_transform.py
+++ b/transforms/universal/hap/python/src/hap_transform.py
@@ -27,11 +27,11 @@ class HAPTransform(AbstractTableTransform):
 
     def __init__(self, config: dict[str, Any]):
         super().__init__(config)
-        self.model_name_or_path = config.get("model_name_or_path")
-        self.annotation_column = config.get("annotation_column")
-        self.doc_text_column = config.get("doc_text_column")
-        self.max_length = config.get("max_length")
-        self.batch_size = config.get("batch_size")
+        self.model_name_or_path = config.get("model_name_or_path", "ibm-granite/granite-guardian-hap-38m")
+        self.annotation_column = config.get("annotation_column", "hap_score")
+        self.doc_text_column = config.get("doc_text_column", "contents")
+        self.max_length = config.get("max_length", 512)
+        self.batch_size = config.get("batch_size", 128)
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
         self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path)
 
@@ -70,7 +70,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
         :param table: Pyarrow table
         :return: a table with an additional hap_score column
         """
-        # make sure that the table contains "doc_text" column
+        # make sure that the table contains the configured text column ("contents" by default)
         TransformUtils.validate_columns(table=table, required=[self.doc_text_column])
         self.df = table.to_pandas()
         df_doc_list = []
diff --git a/transforms/universal/hap/python/test-data/expected/metadata.json b/transforms/universal/hap/python/test-data/expected/metadata.json
index 1e5f710db..062fee162 100644
--- a/transforms/universal/hap/python/test-data/expected/metadata.json
+++ b/transforms/universal/hap/python/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "hap",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-09-26 20:56:49",
-    "end_time": "2024-09-26 20:56:56",
+    "start_time": "2024-10-03 21:38:20",
+    "end_time": "2024-10-03 21:38:29",
     "status": "success"
   },
   "code": {
@@ -17,7 +17,7 @@
   "job_input_params": {
     "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
     "annotation_column": "hap_score",
-    "doc_text_column": "doc_text",
+    "doc_text_column": "contents",
     "inference_engine": "CPU",
     "max_length": 512,
     "batch_size": 128,
@@ -30,11 +30,12 @@
     "num_processors": 0
   },
   "job_output_stats": {
-    "source_files": 1,
-    "source_size": 109303,
+    "source_files": 2,
+    "source_size": 12124594,
+    "transform execution exception": 1,
     "result_files": 1,
     "result_size": 79822,
-    "processing_time": 6.501,
+    "processing_time": 6.932,
     "source_doc_count": 50,
     "result_doc_count": 50
   },
diff --git a/transforms/universal/hap/python/test-data/expected/test1.parquet b/transforms/universal/hap/python/test-data/expected/test1.parquet
index 8ac5be443d311740b8b74296fb4a02e15eb50ebc..c9483e34d47dd71af90b1a6694c55fb01ea95453 100644
GIT binary patch
delta 171
zcmX^2p5@$omWC~i#}qk|^Ycnl^Gb@hGiotLux?jVXS8F23r^3`WbB3WKuTQLm0>!R
zbQF}*GE;L>;`0)7Q<ba~l;AScGYuFWrpp^KvTk2%$Y{yTot0!{Y7iA}YLHwx{i6w^
k1dm@?QeIFF5LbC-PCsbO7`i>nlyMCslEU)oisp=K0maogvj6}9

delta 155
zcmX^2p5@$omWC~i#}qkI@{{9BQY%WfGiotLux?jVXS8F23r^3`WbB3WKuTQL;oRx>
z3>d{&L3)*@XBsd%OqVxeWZk~jkkOKvJ3Gn9FD=DA%p}Em`bQH+37(*`NJC45sE7;$
a&&=rujTu9?XPGjtVMJ2sI9<`4aV-EPE;6|Q

diff --git a/transforms/universal/hap/python/test-data/input/test1.parquet b/transforms/universal/hap/python/test-data/input/test1.parquet
index 8a3468009e1f012d50dc9f0d9bf437926d048867..5e2f5fe9d5547448a8d2ff3ec3b5b5c51e575455 100644
GIT binary patch
delta 177
zcmex<mF@dgwuUW?&u4HX=jWBA=9Lt0e?Nn<h8518UObPn7S01nMzC%VSi;!Cs0`Dg
zq@$pemYJH95}%ito2q1`pahqh{$mNF!*=s!jK`UDvyzNV4WhzL4U#K8i#?saaudVD
zilf{tOC!S#b3C*1(>-(j%98Sea)7vMyY@;(2}bTnBg@hh_ps6w58vs2s~A@T0IY^W
A?f?J)

delta 177
zcmex<mF@dgwuUW?&u4I?<R{0Mq*jz{e?Nn<h8518UObPn7S01nMzC%VSi;!Cs0`Dg
zq@$pemYJH95}%ito2q1`pahqh{$mNF!*=s!jK`UDvy+Vc(o)>ROj4XZi#?saaudVD
zilf{tOC!S#b3C*1(>-&8$|4Od4Wc4447O{pWRzg!j&irmiZBk!N%aVv?zf6@B>-=i
BLB0S0

diff --git a/transforms/universal/hap/python/test/test_hap.py b/transforms/universal/hap/python/test/test_hap.py
index 3f2a25e53..82ac5dc06 100644
--- a/transforms/universal/hap/python/test/test_hap.py
+++ b/transforms/universal/hap/python/test/test_hap.py
@@ -19,7 +19,7 @@
 hap_params = {
     "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m',
     "annotation_column": "hap_score",
-    "doc_text_column": "doc_text",
+    "doc_text_column": "contents",
     "inference_engine": "CPU",
     "max_length": 512,
     "batch_size": 128,