fix filename issue

Signed-off-by: Maroun Touma <[email protected]>
IBM · Nov 14, 2024 · d2404f4 · d2404f4
1 parent 137d92c
commit d2404f4
Show file tree

Hide file tree

Showing 8 changed files with 46 additions and 39 deletions.
diff --git a/transforms/universal/web2parquet/dpk_web2parquet/config.py b/transforms/universal/web2parquet/dpk_web2parquet/config.py
@@ -10,12 +10,8 @@
 # limitations under the License.
 ################################################################################
 
-import time
-import sys
 from argparse import ArgumentParser, Namespace
-from typing import Any
 
-import pyarrow as pa
 from data_processing.transform import TransformConfiguration
 from data_processing.utils import CLIArgumentProvider
 from data_processing.utils import get_logger
@@ -58,7 +54,7 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             help="maxumum number of downloaded URLs",
         )
         parser.add_argument(f"--{folder_cli_param}", type=str, default=None,
-            help="Folder wher to store downloaded files",
+            help="Folder where to store downloaded files",
         )
         parser.add_argument(f"--{urls_cli_param}", type=str, default=None,
             help="List of Seed URLs for the crawler",

diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local.py b/transforms/universal/web2parquet/dpk_web2parquet/local.py
@@ -10,7 +10,6 @@
 # limitations under the License.
 ################################################################################
 
-import os
 
 from dpk_web2parquet.transform import Web2Parquet
 

diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local_python.py b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py
@@ -20,7 +20,7 @@
 
 # create parameters
 input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..","test-data","input"))
-output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "output"))
 local_conf = {
     "input_folder": input_folder,
     "output_folder": output_folder,

diff --git a/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py
@@ -1,3 +1,15 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
 import time
 
 from data_processing.runtime.pure_python import PythonTransformLauncher

diff --git a/transforms/universal/web2parquet/dpk_web2parquet/transform.py b/transforms/universal/web2parquet/dpk_web2parquet/transform.py
@@ -17,7 +17,9 @@
 from data_processing.data_access import DataAccessLocal
 from data_processing.transform import AbstractTableTransform
 from data_processing.utils import get_logger
-from dpk_connector import crawl, shutdown
+from dpk_connector import crawl
+from dpk_web2parquet.utils import *
+
 
 
 user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0"
@@ -62,17 +64,8 @@ def on_download(self, url: str, body: bytes, headers: dict) -> None:
         Callback function called when a page has been downloaded.
         You have access to the request URL, response body and headers.
         """
-        doc={}
+        doc=get_file_info(url, headers)
         doc['url'] = url
-#        doc['file_size'] = int(headers.get('Content-Length', 0))  # Default to 0 if not found  
-        doc['content_type']=headers.get('Content-Type')
-        try:
-            filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"')
-        except:
-            url_split=url.split('/')
-            filename = url_split[-1] if not url.endswith('/') else url_split[-2]
-            filename = filename.replace('.','_')+"-"+doc['content_type'].split(';')[0].replace("/", ".")
-        doc['filename']=filename
         doc['contents'] = body
 
         logger.debug(f"url: {doc['url']}, filename: {doc['filename']}, content_type: {doc['content_type']}")
@@ -99,12 +92,8 @@ def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[p
             allow_mime_types=self.allow_mime_types
         )  # blocking call
 
-        # Shutdown all crawls
-        # Check with @Matsubara-san as this is preventing us from calling the transfrom method a second time.
-    #    shutdown()
 
         end_time = time.time()      
-#        logger.debug(f"Way After: {self.docs}")
         table = pa.Table.from_pylist(self.docs)
         metadata = {
             "count": len(self.docs),

diff --git a/transforms/universal/web2parquet/dpk_web2parquet/utils.py b/transforms/universal/web2parquet/dpk_web2parquet/utils.py
@@ -10,18 +10,29 @@
 # limitations under the License.
 ################################################################################
 
-from datetime import datetime
 
-def get_file_info(headers, url):
-    # Extract file size
-    file_size = int(headers.get('Content-Length', 0))  # Default to 0 if not found
-    content_type = headers.get('Content-Type')
+from urllib.parse import urlparse
+
+def get_file_info(url: str, headers: dict = None) -> dict:
+    """Return a dict with the 'filename', 'content_type' and 'file_size' derived from url and headers."""
+    headers = headers or {}  # the default is None; normalise once so lookups below are safe
+    try:
+        file_size = int(headers['Content-Length'])
+    except (KeyError, TypeError, ValueError):
+        file_size = 0  # header missing or not an integer
+    # .get() returns None (it does not raise) when the header is absent, so the
+    # default must be applied explicitly or the suffix step below fails on None.
+    content_type = headers.get('Content-Type') or 'text/html'
+    url_parse=urlparse(url)
     try:
         filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"')
     except:
-        url_split=url.split('/')
-        filename = url_split[-1] if not url.endswith('/') else url_split[-2]
-        filename = filename.replace('.','_')+"-"+content_type.replace("/", ".")
+        filename='-'.join(url_parse.path.strip('/').split('/'))  # no usable header: flatten the URL path
+    # Prepend the host name, with dots replaced by underscores.
+    filename=url_parse.netloc.replace('.',"_")+'_'+filename
+
+    # Append a suffix from the media type: parameters after ';' dropped, '/' becomes '.'.
+    filename = filename+"_"+content_type.split(';')[0].replace("/", ".")
+    return {'filename':filename, 'content_type': content_type, 'file_size': file_size}
 
-    return filename, content_type, file_size
 
diff --git a/transforms/universal/web2parquet/test-data/expected/metadata.json b/transforms/universal/web2parquet/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
     "job name": "web2parquet",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-11-12 20:39:44",
-    "end_time": "2024-11-12 20:39:45",
+    "start_time": "2024-11-14 07:31:14",
+    "end_time": "2024-11-14 07:31:14",
     "status": "success"
   },
   "code": {
@@ -28,18 +28,18 @@
     "num_processors": 0
   },
   "execution_stats": {
-    "cpus": 13.7,
+    "cpus": 21.1,
     "gpus": 0,
-    "memory": 14.5,
+    "memory": 13.62,
     "object_store": 0,
-    "execution time, min": 0.016
+    "execution time, min": 0.01
   },
   "job_output_stats": {
     "source_files": 1,
     "source_size": 485,
     "result_files": 1,
-    "result_size": 32039,
-    "processing_time": 0.94,
+    "result_size": 32718,
+    "processing_time": 0.617,
     "count": 1,
     "requested_seeds": 1,
     "requested_depth": 1,
@@ -52,7 +52,7 @@
     "type": "path"
   },
   "target": {
-    "name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/output",
+    "name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/test-data/output",
     "type": "path"
   }
 }
diff --git a/transforms/universal/web2parquet/test-data/expected/test.parquet b/transforms/universal/web2parquet/test-data/expected/test.parquet