Skip to content

Commit

Permalink
fix filename issue
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <[email protected]>
  • Loading branch information
touma-I committed Nov 14, 2024
1 parent 137d92c commit d2404f4
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 39 deletions.
6 changes: 1 addition & 5 deletions transforms/universal/web2parquet/dpk_web2parquet/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,8 @@
# limitations under the License.
################################################################################

import time
import sys
from argparse import ArgumentParser, Namespace
from typing import Any

import pyarrow as pa
from data_processing.transform import TransformConfiguration
from data_processing.utils import CLIArgumentProvider
from data_processing.utils import get_logger
Expand Down Expand Up @@ -58,7 +54,7 @@ def add_input_params(self, parser: ArgumentParser) -> None:
help="maxumum number of downloaded URLs",
)
parser.add_argument(f"--{folder_cli_param}", type=str, default=None,
help="Folder wher to store downloaded files",
help="Folder where to store downloaded files",
)
parser.add_argument(f"--{urls_cli_param}", type=str, default=None,
help="List of Seed URLs for the crawler",
Expand Down
1 change: 0 additions & 1 deletion transforms/universal/web2parquet/dpk_web2parquet/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
# limitations under the License.
################################################################################

import os

from dpk_web2parquet.transform import Web2Parquet

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..","test-data","input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "output"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
Expand Down
12 changes: 12 additions & 0 deletions transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import time

from data_processing.runtime.pure_python import PythonTransformLauncher
Expand Down
19 changes: 4 additions & 15 deletions transforms/universal/web2parquet/dpk_web2parquet/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from data_processing.data_access import DataAccessLocal
from data_processing.transform import AbstractTableTransform
from data_processing.utils import get_logger
from dpk_connector import crawl, shutdown
from dpk_connector import crawl
from dpk_web2parquet.utils import *



user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0"
Expand Down Expand Up @@ -62,17 +64,8 @@ def on_download(self, url: str, body: bytes, headers: dict) -> None:
Callback function called when a page has been downloaded.
You have access to the request URL, response body and headers.
"""
doc={}
doc=get_file_info(url, headers)
doc['url'] = url
# doc['file_size'] = int(headers.get('Content-Length', 0)) # Default to 0 if not found
doc['content_type']=headers.get('Content-Type')
try:
filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"')
except:
url_split=url.split('/')
filename = url_split[-1] if not url.endswith('/') else url_split[-2]
filename = filename.replace('.','_')+"-"+doc['content_type'].split(';')[0].replace("/", ".")
doc['filename']=filename
doc['contents'] = body

logger.debug(f"url: {doc['url']}, filename: {doc['filename']}, content_type: {doc['content_type']}")
Expand All @@ -99,12 +92,8 @@ def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[p
allow_mime_types=self.allow_mime_types
) # blocking call

# Shutdown all crawls
# Check with @Matsubara-san as this is preventing us from calling the transform method a second time.
# shutdown()

end_time = time.time()
# logger.debug(f"Way After: {self.docs}")
table = pa.Table.from_pylist(self.docs)
metadata = {
"count": len(self.docs),
Expand Down
29 changes: 20 additions & 9 deletions transforms/universal/web2parquet/dpk_web2parquet/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,29 @@
# limitations under the License.
################################################################################

from datetime import datetime

def get_file_info(headers, url):
# Extract file size
file_size = int(headers.get('Content-Length', 0)) # Default to 0 if not found
content_type = headers.get('Content-Type')
from urllib.parse import urlparse

def get_file_info(url: str, headers: dict=None):
    """Derive a storage file name, content type and size for a downloaded URL.

    Args:
        url: The URL the content was downloaded from.
        headers: HTTP response headers; may be ``None`` or lack any of the
            headers consulted here.

    Returns:
        A dict with three keys:
            ``filename``: name taken from the ``Content-Disposition`` header
                when it carries a ``filename=`` value, otherwise built from
                the URL host and path; in both cases suffixed with the MIME
                type (``/`` replaced by ``.``).
            ``content_type``: the raw ``Content-Type`` header value, or
                ``'text/html'`` when the header is absent.
            ``file_size``: integer ``Content-Length``, or ``0`` when the
                header is absent or not a valid integer.
    """
    if headers is None:
        headers = {}

    try:
        file_size = int(headers['Content-Length'])
    except (KeyError, TypeError, ValueError):
        file_size = 0

    # dict.get returns None (it does not raise) when the header is missing,
    # so the default has to be applied explicitly.
    content_type = headers.get('Content-Type')
    if content_type is None:
        content_type = 'text/html'

    try:
        filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"')
    except (AttributeError, IndexError, TypeError):
        # No usable Content-Disposition: flatten the URL path into one name
        # and prepend the host so files from different sites do not collide.
        url_parse = urlparse(url)
        path_part = '-'.join(url_parse.path.strip('/').split('/'))
        filename = url_parse.netloc.replace('.', "_") + '_' + path_part

    # Append an extension derived from the MIME type, dropping parameters
    # such as "; charset=utf-8".
    filename = filename + "_" + content_type.split(';')[0].replace("/", ".")
    return {'filename': filename, 'content_type': content_type, 'file_size': file_size}

Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "web2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-12 20:39:44",
"end_time": "2024-11-12 20:39:45",
"start_time": "2024-11-14 07:31:14",
"end_time": "2024-11-14 07:31:14",
"status": "success"
},
"code": {
Expand All @@ -28,18 +28,18 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 13.7,
"cpus": 21.1,
"gpus": 0,
"memory": 14.5,
"memory": 13.62,
"object_store": 0,
"execution time, min": 0.016
"execution time, min": 0.01
},
"job_output_stats": {
"source_files": 1,
"source_size": 485,
"result_files": 1,
"result_size": 32039,
"processing_time": 0.94,
"result_size": 32718,
"processing_time": 0.617,
"count": 1,
"requested_seeds": 1,
"requested_depth": 1,
Expand All @@ -52,7 +52,7 @@
"type": "path"
},
"target": {
"name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/output",
"name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/test-data/output",
"type": "path"
}
}
Binary file modified transforms/universal/web2parquet/test-data/expected/test.parquet
Binary file not shown.

0 comments on commit d2404f4

Please sign in to comment.