diff --git a/transforms/universal/web2parquet/dpk_web2parquet/config.py b/transforms/universal/web2parquet/dpk_web2parquet/config.py index b393de3b0..16584cb57 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/config.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/config.py @@ -10,12 +10,8 @@ # limitations under the License. ################################################################################ -import time -import sys from argparse import ArgumentParser, Namespace -from typing import Any -import pyarrow as pa from data_processing.transform import TransformConfiguration from data_processing.utils import CLIArgumentProvider from data_processing.utils import get_logger @@ -58,7 +54,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: help="maxumum number of downloaded URLs", ) parser.add_argument(f"--{folder_cli_param}", type=str, default=None, - help="Folder wher to store downloaded files", + help="Folder where to store downloaded files", ) parser.add_argument(f"--{urls_cli_param}", type=str, default=None, help="List of Seed URLs for the crawler", diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local.py b/transforms/universal/web2parquet/dpk_web2parquet/local.py index fea14b457..cc0b8956d 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/local.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/local.py @@ -10,7 +10,6 @@ # limitations under the License. 
################################################################################ -import os from dpk_web2parquet.transform import Web2Parquet diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local_python.py b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py index b6764015b..735f0eb02 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/local_python.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py @@ -20,7 +20,7 @@ # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..","test-data","input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py index 535a74ca4..6b2acdfc5 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import time from data_processing.runtime.pure_python import PythonTransformLauncher diff --git a/transforms/universal/web2parquet/dpk_web2parquet/transform.py b/transforms/universal/web2parquet/dpk_web2parquet/transform.py index 012460443..5cd402fa2 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/transform.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/transform.py @@ -17,7 +17,9 @@ from data_processing.data_access import DataAccessLocal from data_processing.transform import AbstractTableTransform from data_processing.utils import get_logger -from dpk_connector import crawl, shutdown +from dpk_connector import crawl +from dpk_web2parquet.utils import * + user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0" @@ -62,17 +64,8 @@ def on_download(self, url: str, body: bytes, headers: dict) -> None: Callback function called when a page has been downloaded. You have access to the request URL, response body and headers. 
""" - doc={} + doc=get_file_info(url, headers) doc['url'] = url -# doc['file_size'] = int(headers.get('Content-Length', 0)) # Default to 0 if not found - doc['content_type']=headers.get('Content-Type') - try: - filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"') - except: - url_split=url.split('/') - filename = url_split[-1] if not url.endswith('/') else url_split[-2] - filename = filename.replace('.','_')+"-"+doc['content_type'].split(';')[0].replace("/", ".") - doc['filename']=filename doc['contents'] = body logger.debug(f"url: {doc['url']}, filename: {doc['filename']}, content_type: {doc['content_type']}") @@ -99,12 +92,8 @@ def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[p allow_mime_types=self.allow_mime_types ) # blocking call - # Shutdown all crawls - # Check with @Matsubara-san as this is preventing us from calling the transfrom method a second time. - # shutdown() end_time = time.time() -# logger.debug(f"Way After: {self.docs}") table = pa.Table.from_pylist(self.docs) metadata = { "count": len(self.docs), diff --git a/transforms/universal/web2parquet/dpk_web2parquet/utils.py b/transforms/universal/web2parquet/dpk_web2parquet/utils.py index 5a7fc9cb9..8214cc817 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/utils.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/utils.py @@ -10,18 +10,29 @@ # limitations under the License. 
from urllib.parse import urlparse


def get_file_info(url: str, headers: dict = None):
    """Derive storage metadata for a downloaded page from its URL and headers.

    Args:
        url: The URL the content was downloaded from.
        headers: Response headers as a mapping; may be ``None`` or lack any
            of the keys read here.

    Returns:
        A dict with three keys:
        ``filename`` - the name taken from ``Content-Disposition`` when it
        carries a ``filename=`` part, otherwise the URL host (dots replaced
        by underscores) followed by the URL path joined with ``-``; in both
        cases the media type is appended as a pseudo extension
        (``text/html`` becomes ``_text.html``).
        ``content_type`` - the ``Content-Type`` header value, or
        ``'text/html'`` when the header is missing or empty.
        ``file_size`` - ``Content-Length`` as an int, or ``0`` when it is
        missing or not a valid integer.
    """
    # Normalise once so every lookup below can rely on ``.get``.
    headers = headers or {}

    try:
        file_size = int(headers.get('Content-Length'))
    except (TypeError, ValueError):
        # Header absent (None) or not numeric.
        file_size = 0

    # A dict without the key yields None rather than raising, so the default
    # has to be applied explicitly; otherwise the ``split`` below would fail.
    content_type = headers.get('Content-Type') or 'text/html'

    url_parse = urlparse(url)
    try:
        filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"')
    except (AttributeError, IndexError):
        # No Content-Disposition header, or one without a ``filename=`` part:
        # build a name from the URL path and prepend the host name so pages
        # from different sites do not collide.
        path_part = '-'.join(url_parse.path.strip('/').split('/'))
        filename = url_parse.netloc.replace('.', "_") + '_' + path_part

    # Append an extension derived from the media type, ignoring parameters
    # such as ``; charset=utf-8``.
    media_type = content_type.split(';')[0]
    filename = filename + "_" + media_type.replace("/", ".")

    return {'filename': filename, 'content_type': content_type, 'file_size': file_size}
time, min": 0.016 + "execution time, min": 0.01 }, "job_output_stats": { "source_files": 1, "source_size": 485, "result_files": 1, - "result_size": 32039, - "processing_time": 0.94, + "result_size": 32718, + "processing_time": 0.617, "count": 1, "requested_seeds": 1, "requested_depth": 1, @@ -52,7 +52,7 @@ "type": "path" }, "target": { - "name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/output", + "name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/test-data/output", "type": "path" } } \ No newline at end of file diff --git a/transforms/universal/web2parquet/test-data/expected/test.parquet b/transforms/universal/web2parquet/test-data/expected/test.parquet index 46dcd9815..49a48ae57 100644 Binary files a/transforms/universal/web2parquet/test-data/expected/test.parquet and b/transforms/universal/web2parquet/test-data/expected/test.parquet differ