Skip to content

Commit

Permalink
fix notebook
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <[email protected]>
  • Loading branch information
touma-I committed Dec 10, 2024
1 parent dd3a63e commit 64781b9
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 20 deletions.
5 changes: 3 additions & 2 deletions transforms/universal/hap/dpk_hap/transform_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import sys
import time

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.runtime.pure_python.runtime_configuration import (
PythonTransformRuntimeConfiguration,
)
from data_processing.utils import get_logger
from data_processing.utils import ParamsUtils, get_logger
from dpk_hap.transform import HAPTransformConfiguration


Expand Down Expand Up @@ -52,7 +53,7 @@ def __init__(self, **kwargs):
def transform(self):
    """Launch the HAP transform with the parameters held in ``self.params``.

    The parameters are converted to an argument list with
    ``ParamsUtils.dict_to_req`` and installed as ``sys.argv`` before the
    launcher is created (presumably the launcher reads its configuration
    from the command line -- confirm against ``PythonTransformLauncher``).

    Returns:
        Whatever ``PythonTransformLauncher.launch()`` returns, passed
        through unchanged as the return code.
    """
    # NOTE: this replaces the process-wide sys.argv; the previous value is
    # not restored afterwards.
    sys.argv = ParamsUtils.dict_to_req(d=self.params)
    # Build the launcher from the HAP configuration. The stale
    # Pdf2Parquet configuration must not be used here: it belongs to a
    # different transform and is not a name available in this module.
    launcher = PythonTransformLauncher(HAPPythonTransformConfiguration())
    # Run the transform and hand the launcher's result back to the caller.
    return_code = launcher.launch()
    return return_code
Expand Down
156 changes: 138 additions & 18 deletions transforms/universal/hap/hap_python.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,37 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "38aebf49-9460-4951-bb04-7045dec28690",
"metadata": {},
"outputs": [],
"source": [
"#import ast\n",
"#import os\n",
"#import sys\n",
"\n",
"#from data_processing.runtime.pure_python import PythonTransformLauncher\n",
"#from data_processing.utils import ParamsUtils\n",
"#from dpk_hap.transform_python import HAPPythonTransformConfiguration"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "17306684-306b-48e8-a89a-4d0228e01291",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt_tab to /Users/ian/nltk_data...\n",
"[nltk_data] Downloading package punkt_tab to /Users/touma/nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
}
],
"source": [
"import ast\n",
"import os\n",
"import sys\n",
"\n",
"from data_processing.runtime.pure_python import PythonTransformLauncher\n",
"from data_processing.utils import ParamsUtils\n",
"from dpk_hap.transform_python import HAPPythonTransformConfiguration"
"from dpk_hap.transform_python import HAPRuntime"
]
},
{
Expand All @@ -83,14 +93,124 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "6a8ec5e4-1f52-4c61-9c9e-4618f9034b80",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"12:08:21 INFO - hap params are {'model_name_or_path': 'ibm-granite/granite-guardian-hap-38m', 'annotation_column': 'hap_score', 'doc_text_column': 'contents', 'inference_engine': 'CPU', 'max_length': 512, 'batch_size': 128} \n",
"12:08:21 INFO - pipeline id pipeline_id\n",
"12:08:21 INFO - code location None\n",
"12:08:21 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
"12:08:21 INFO - data factory data_ max_files -1, n_sample -1\n",
"12:08:21 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"12:08:21 INFO - orchestrator hap started at 2024-12-10 12:08:21\n",
"12:08:21 INFO - Number of files is 1, source profile {'max_file_size': 0.10423946380615234, 'min_file_size': 0.10423946380615234, 'total_file_size': 0.10423946380615234}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing batch: 0/11\n",
"Processing batch: 1/11\n",
"Processing batch: 2/11\n",
"Processing batch: 3/11\n",
"Processing batch: 4/11\n",
"Processing batch: 5/11\n",
"Processing batch: 6/11\n",
"Processing batch: 7/11\n",
"Processing batch: 8/11\n",
"Processing batch: 9/11\n",
"Processing batch: 10/11\n",
"Processing batch: 11/11\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"12:08:55 INFO - Completed 1 files (100.0%) in 0.467 min\n",
"12:08:55 INFO - Done processing 1 files, waiting for flush() completion.\n",
"12:08:55 INFO - done flushing in 0.0 sec\n",
"12:08:55 INFO - Completed execution in 0.568 min, execution result 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" doc_id contents hap_score\n",
"0 1 GSC is very much a little Swiss Army knife for... 0.002463\n",
"1 2 When you’ve got a particular somebody that you... 0.075292\n",
"2 3 Many believe a healthy diet is all that’s need... 0.005342\n",
"3 4 Clinton’s plan specifically targets drugs that... 0.000294\n",
"4 5 An aspiring actress was found hanged in a ward... 0.071551\n",
"5 6 By Evan Ackerman\\nInside this rather large box... 0.000441\n",
"6 7 I'm really bad at naming things. Like hopeless... 0.131478\n",
"7 8 Metrolink is successful because of the continu... 0.000337\n",
"8 9 Notre Dame political scientist Jeff Harden is ... 0.006692\n",
"9 10 Federal employees who qualify for the federal ... 0.014998\n",
"10 11 Girona will face Deportivo Alaves at Estadio M... 0.005883\n",
"11 12 Xana & Melody's Foot Worship Punishment\\nA sla... 0.019838\n",
"12 13 \"But the liberal deviseth liberal things; and ... 0.001782\n",
"13 14 Workers at Linden Hills Co-op won their electi... 0.002168\n",
"14 15 The principal features in PCOD are no ovulatio... 0.000567\n",
"15 16 Kite Kali Beh Ke gurpreet dhanoa 11 years ago.... 0.019736\n",
"16 17 Chrsyo X1 Bag\\nLittle background information, ... 0.063229\n",
"17 18 The season of spooks has arrived in Sea of Thi... 0.110339\n",
"18 19 Box Office Mojo reports that Casino Royale has... 0.000420\n",
"19 20 Here are only a few examples. And no, I'm not ... 0.989713\n",
"20 21 Free Fiction Monday: Cosmic Balances Inc.\\nGri... 0.092669\n",
"21 22 Motorcycle crash kills 61-year-old Hellam man ... 0.001743\n",
"22 23 You are invited to a family-friendly geocachin... 0.000516\n",
"23 24 Evaluation Forms are commonly used by employer... 0.001409\n",
"24 25 4. Pray for the Nation (the Multitudes):\\nWe r... 0.356732\n",
"25 26 Welcome to the PC Matic Process Library. We ma... 0.000680\n",
"26 27 By Manpreet Singh, 2009 MBA and President of S... 0.000980\n",
"27 28 Lower Dauphin School District\\nCourse Title: C... 0.000566\n",
"28 29 Learn this in-depth pet air journey informatio... 0.001524\n",
"29 30 Marc Jones, Chief Technology Officer, Alkami\\n... 0.001189\n",
"30 31 Marsh, H. W, Parker, P. D & Morin, AJ. (2016).... 0.001947\n",
"31 32 The King of The South T.I.P returns yet again ... 0.011320\n",
"32 33 No prices to compare at the moment.\\nWhat is A... 0.026912\n",
"33 34 Hanover Park police and firefighter/paramedics... 0.001274\n",
"34 35 Julie Buchanan - Your wedding celebrant\\nI am ... 0.001094\n",
"35 36 - Open Access\\nNorepinephrine enhances the LPS... 0.174337\n",
"36 37 Are you an Amazon Echo or Echo dot owner? Woul... 0.006586\n",
"37 38 Ninth day of my apprenticeship. Went dumpster ... 0.003054\n",
"38 39 Regret and rue and remorse are all from the pa... 0.102948\n",
"39 40 Can i we take Immune globulin intravenous rout... 0.004886\n",
"40 41 Whether you’re single or have a significant ot... 0.049372\n",
"41 42 Short-lived or infrequent episodes of stress p... 0.001169\n",
"42 43 Unknown > Shiny Poison\\nNumber in series: Tags... 0.044633\n",
"43 44 Al Jazira confirm Eric Gerets as new coach to ... 0.000965\n",
"44 45 Mortgage Refinance Rates. Compare current, cus... 0.176559\n",
"45 46 Being an independent structural division, the ... 0.003829\n",
"46 47 After getting the shaft from schedule makers t... 0.000514\n",
"47 48 gift of love, loyalty, and companionship\\npupp... 0.067827\n",
"48 49 PULL APART HEART\\nGold Coast indie rockers Eli... 0.002925\n",
"49 50 Food Technology school trips to Greece\\nStuden... 0.000471\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create parameters\n",
"HAPRuntime(input_folder=\"../test-data/input\",\n",
" output_folder=\"../output\",\n",
"HAPRuntime(input_folder=\"test-data/input\",\n",
" output_folder=\"output\",\n",
" model_name_or_path= 'ibm-granite/granite-guardian-hap-38m',\n",
" annotation_column= \"hap_score\",\n",
" doc_text_column= \"contents\",\n",
Expand All @@ -110,25 +230,25 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "f21d5d9b-562d-4530-8cea-2de5b63eb1dc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['../output/metadata.json', '../output/test1.parquet']"
"['output/metadata.json', 'output/test1.parquet']"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# the outputs will be located in the following folders\n",
"import glob\n",
"glob.glob(\"../output/*\")"
"glob.glob(\"output/*\")"
]
},
{
Expand Down Expand Up @@ -156,7 +276,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.11.10"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 64781b9

Please sign in to comment.