From 7d2d251c64cbd0f60972e1f90a3ee57f0398107a Mon Sep 17 00:00:00 2001 From: ghiggi Date: Fri, 3 Nov 2023 13:32:41 +0100 Subject: [PATCH] Fix download utility and expand test --- disdrodb/data_transfer/download_data.py | 210 ++++++++++-------- disdrodb/data_transfer/upload_data.py | 4 +- .../test_data_transfer/test_download_data.py | 95 ++++++-- .../test_data_transfer/test_upload_data.py | 10 +- docs/source/contribute_data.rst | 2 +- 5 files changed, 211 insertions(+), 110 deletions(-) diff --git a/disdrodb/data_transfer/download_data.py b/disdrodb/data_transfer/download_data.py index 6799bd4c..ddff431f 100644 --- a/disdrodb/data_transfer/download_data.py +++ b/disdrodb/data_transfer/download_data.py @@ -19,6 +19,7 @@ """Routines to download data from the DISDRODB Decentralized Data Archive.""" import os +import shutil from typing import List, Optional, Union import click @@ -27,6 +28,7 @@ from disdrodb.api.metadata import get_list_metadata from disdrodb.configs import get_base_dir +from disdrodb.l0.io import _infer_disdrodb_tree_path from disdrodb.utils.compression import _unzip_file from disdrodb.utils.yaml import read_yaml @@ -86,92 +88,6 @@ def click_download_option(function: object): return function -def get_station_local_remote_locations(yaml_file_path: str) -> tuple: - """Return the station's local path and remote url. - - Parameters - ---------- - yaml_file_path : str - Path to the metadata YAML file. - - Returns - ------- - tuple - Tuple containing the local path and the url. 
- """ - - metadata_dict = read_yaml(yaml_file_path) - - # Check station name - expected_station_name = os.path.basename(yaml_file_path).replace(".yml", "") - - station_name = metadata_dict.get("station_name") - - if station_name and str(station_name) != str(expected_station_name): - return None, None, None - - # Get data url - station_remote_url = metadata_dict.get("data_url") - - # Get the local path - data_dir_path = os.path.dirname(yaml_file_path).replace("metadata", "data") - - return data_dir_path, station_name, station_remote_url - - -def _download_file_from_url(url: str, dir_path: str, force: bool = False) -> str: - """Download file. - - Parameters - ---------- - url : str - URL of the file to download. - dir_path : str - Dir path where to download the file. - force : bool, optional - Overwrite the raw data file if already existing, by default False. - - """ - - fname = os.path.basename(url) - file_path = os.path.join(dir_path, fname) - - if os.path.isfile(file_path): - if force: - os.remove(file_path) - else: - print(f"{file_path} already exists, skipping download.") - return file_path - - downloader = pooch.HTTPDownloader(progressbar=True) - pooch.retrieve(url=url, known_hash=None, path=dir_path, fname=fname, downloader=downloader, progressbar=tqdm) - - return file_path - - -def _download_station_data(metadata_fpath: str, force: bool = False) -> None: - """Download and unzip the station data . - - Parameters - ---------- - metadata_fpaths : str - Metadata file path. 
- force : bool, optional - force download, by default False - - """ - location_info = get_station_local_remote_locations(metadata_fpath) - - if None not in location_info: - data_dir_path, station_name, data_url = location_info - url_file_name, _ = os.path.splitext(os.path.basename(data_url)) - os.path.join(data_dir_path, url_file_name) - temp_zip_path = _download_file_from_url(data_url, data_dir_path, force) - _unzip_file(temp_zip_path, os.path.join(data_dir_path, str(station_name))) - if os.path.exists(temp_zip_path): - os.remove(temp_zip_path) - - def download_disdrodb_archives( data_sources: Optional[Union[str, List[str]]] = None, campaign_names: Optional[Union[str, List[str]]] = None, @@ -179,7 +95,7 @@ def download_disdrodb_archives( force: bool = False, base_dir: Optional[str] = None, ): - """Get all YAML files that contain the 'data_url' key + """Get all YAML files that contain the 'disdrodb_data_url' key and download the data locally. Parameters @@ -204,6 +120,7 @@ def download_disdrodb_archives( If None (the default), the disdrodb config variable 'dir' is used. """ + # Retrieve the requested metadata base_dir = get_base_dir(base_dir) metadata_fpaths = get_list_metadata( base_dir=base_dir, @@ -212,6 +129,121 @@ def download_disdrodb_archives( station_names=station_names, with_stations_data=False, ) - + # Try to download the data + # - It will download data only if the disdrodb_data_url is specified ! 
for metadata_fpath in metadata_fpaths: - _download_station_data(metadata_fpath, force) + try: + _download_station_data(metadata_fpath, force) + except Exception as e: + station_dir_path = _infer_disdrodb_tree_path(metadata_fpath).replace("metadata", "data").replace(".yml", "") + print(f"ERROR during downloading the station {station_dir_path}: {e}") + print(" ") + + +def _extract_station_files(zip_fpath, station_dir_path): + """Extract files from the station.zip file and remove the station.zip file.""" + _unzip_file(file_path=zip_fpath, dest_path=station_dir_path) + if os.path.exists(zip_fpath): + os.remove(zip_fpath) + + +def _download_station_data(metadata_fpath: str, force: bool = False) -> None: + """Download and unzip the station data. + + Parameters + ---------- + metadata_fpath : str + Metadata file path. + force : bool, optional + force download, by default False + + """ + disdrodb_data_url, station_dir_path = _get_station_url_and_dir_path(metadata_fpath) + if disdrodb_data_url is not None: + # Download file + zip_fpath, to_unzip = _download_file_from_url(disdrodb_data_url, dst_dir_path=station_dir_path, force=force) + # Extract the station files from the downloaded station.zip file + if to_unzip: + _extract_station_files(zip_fpath, station_dir_path=station_dir_path) + + +def _get_valid_station_name(metadata_fpath, metadata_dict): + """Check consistent station_name between YAML file name and metadata key.""" + # Check consistent station name + expected_station_name = os.path.basename(metadata_fpath).replace(".yml", "") + station_name = metadata_dict.get("station_name") + if station_name and str(station_name) != str(expected_station_name): + raise ValueError(f"Inconsistent station_name values in the {metadata_fpath} file. Download aborted.") + return station_name + + +def _get_station_url_and_dir_path(metadata_fpath: str) -> tuple: + """Return the station's remote url and the local destination directory path. 
+ + Parameters + ---------- + metadata_fpath : str + Path to the metadata YAML file. + + Returns + ------- + disdrodb_data_url, station_dir_path + Tuple containing the remote url and the DISDRODB station directory path. + """ + metadata_dict = read_yaml(metadata_fpath) + station_name = _get_valid_station_name(metadata_fpath, metadata_dict) + disdrodb_data_url = metadata_dict.get("disdrodb_data_url", None) + # Define the destination local filepath path + data_dir_path = os.path.dirname(metadata_fpath).replace("metadata", "data") + station_dir_path = os.path.join(data_dir_path, station_name) + return disdrodb_data_url, station_dir_path + + +def _is_empty_directory(dir_path): + """Check if a directory is empty.""" + if not os.path.exists(dir_path): + raise OSError(f"{dir_path} does not exist.") + if not os.path.isdir(dir_path): + raise OSError(f"{dir_path} is not a directory.") + list_files = os.listdir(dir_path) + if len(list_files) == 0: + return True + else: + return False + + +def _download_file_from_url(url: str, dst_dir_path: str, force: bool = False) -> str: + """Download station zip file into the DISDRODB station data directory. + + Parameters + ---------- + url : str + URL of the file to download. + dst_dir_path : str + Local filepath where to download the file (DISDRODB station data directory). + force : bool, optional + Overwrite the raw data file if already existing, by default False. + + Returns + ------- + dst_fpath + Path of the downloaded file. + to_unzip + Flag that specify if the download station zip file must be unzipped. + """ + fname = os.path.basename(url) + dst_fpath = os.path.join(dst_dir_path, fname) + os.makedirs(dst_dir_path, exist_ok=True) + if not _is_empty_directory(dst_dir_path): + if force: + shutil.rmtree(dst_dir_path) + os.makedirs(dst_dir_path) # station directory + else: + print(f"There are already files within {dst_dir_path}. 
Skipping the station data download.") + to_unzip = False + return dst_fpath, to_unzip + os.makedirs(dst_dir_path, exist_ok=True) + downloader = pooch.HTTPDownloader(progressbar=True) + pooch.retrieve(url=url, known_hash=None, path=dst_dir_path, fname=fname, downloader=downloader, progressbar=tqdm) + to_unzip = True + return dst_fpath, to_unzip diff --git a/disdrodb/data_transfer/upload_data.py b/disdrodb/data_transfer/upload_data.py index fb555b98..8b554125 100644 --- a/disdrodb/data_transfer/upload_data.py +++ b/disdrodb/data_transfer/upload_data.py @@ -98,7 +98,7 @@ def _filter_already_uploaded(metadata_fpaths: List[str]) -> List[str]: for metadata_fpath in metadata_fpaths: metadata_dict = read_yaml(metadata_fpath) - if metadata_dict.get("data_url"): + if metadata_dict.get("disdrodb_data_url"): print(f"{metadata_fpath} already has a remote url specified. Skipping.") continue filtered.append(metadata_fpath) @@ -210,7 +210,7 @@ def _update_metadata_with_zenodo_url( """ zenodo_host = "sandbox.zenodo.org" if sandbox else "zenodo.org" metadata_dict = read_yaml(metadata_fpath) - metadata_dict["data_url"] = f"https://{zenodo_host}/record/{deposition_id}/files/{remote_path}.zip" + metadata_dict["disdrodb_data_url"] = f"https://{zenodo_host}/record/{deposition_id}/files/{remote_path}.zip" write_yaml(metadata_dict, metadata_fpath) diff --git a/disdrodb/tests/test_data_transfer/test_download_data.py b/disdrodb/tests/test_data_transfer/test_download_data.py index ee1acf42..c8b1d259 100644 --- a/disdrodb/tests/test_data_transfer/test_download_data.py +++ b/disdrodb/tests/test_data_transfer/test_download_data.py @@ -23,25 +23,94 @@ import pytest import yaml -from disdrodb.data_transfer import download_data +from disdrodb.data_transfer.download_data import ( + _download_file_from_url, + _download_station_data, + _is_empty_directory, +) -def create_fake_metadata_file(temp_path, data_source, campaign_name, station_name, with_url: bool = True): - subfolder_path = temp_path / 
"DISDRODB" / "Raw" / data_source / campaign_name / "metadata" - subfolder_path.mkdir(parents=True) +def test_download_file_from_url(tmp_path): + # Test download case when empty directory + url = "https://raw.githubusercontent.com/ltelab/disdrodb/main/README.md" + _download_file_from_url(url, tmp_path, force=False) + filename = os.path.basename(url) # README.md + filepath = os.path.join(tmp_path, filename) + assert os.path.isfile(filepath) is True + + # Test download case when directory is not empty and force=False --> avoid download + url = "https://raw.githubusercontent.com/ltelab/disdrodb/main/CODE_OF_CONDUCT.md" + _download_file_from_url(url, tmp_path, force=False) + filename = os.path.basename(url) # CODE_OF_CONDUCT.md + filepath = os.path.join(tmp_path, filename) + assert not os.path.isfile(filepath) + + # Test download case when directory is not empty and force=True --> it downloads + url = "https://raw.githubusercontent.com/ltelab/disdrodb/main/CODE_OF_CONDUCT.md" + _download_file_from_url(url, tmp_path, force=True) + filename = os.path.basename(url) # CODE_OF_CONDUCT.md + filepath = os.path.join(tmp_path, filename) + assert os.path.isfile(filepath) + + +class TestIsEmptyDirectory: + def test_non_existent_directory(self): + with pytest.raises(OSError, match=r".* does not exist."): + _is_empty_directory("non_existent_directory") + + def test_non_directory_path(self, tmp_path): + # Create a temporary file + file_path = tmp_path / "test_file.txt" + file_path.write_text("This is a test file.") + with pytest.raises(OSError, match=r".* is not a directory."): + _is_empty_directory(str(file_path)) + + def test_empty_directory(self, tmp_path): + # `tmp_path` is a pytest fixture that provides a temporary directory unique to the test invocation + assert _is_empty_directory(tmp_path) is True + + def test_non_empty_directory(self, tmp_path): + # Create a temporary file inside the temporary directory + file_path = tmp_path / "test_file.txt" + file_path.write_text("This is a test file.") + 
assert _is_empty_directory(tmp_path) is False + + +def create_fake_metadata_file( + tmp_path, + data_source="data_source", + campaign_name="campaign_name", + station_name="station_name", + with_url: bool = True, +): + metadata_dir_path = tmp_path / "DISDRODB" / "Raw" / data_source / campaign_name / "metadata" + metadata_dir_path.mkdir(parents=True) + metadata_fpath = os.path.join(metadata_dir_path, f"{station_name}.yml") # create a fake yaml file in temp folder - with open(os.path.join(subfolder_path, f"{station_name}.yml"), "w") as f: + with open(metadata_fpath, "w") as f: yaml_dict = {} + yaml_dict["station_name"] = station_name if with_url: - yaml_dict["data_url"] = "https://www.example.com" - yaml_dict["station_name"] = "station_name" + disdro_repo_path = "https://raw.githubusercontent.com/ltelab/disdrodb/main/" + test_data_path = "disdrodb/tests/data/test_data_download/station_files.zip" + disdrodb_data_url = disdro_repo_path + test_data_path + yaml_dict["disdrodb_data_url"] = disdrodb_data_url yaml.dump(yaml_dict, f) - - assert os.path.exists(os.path.join(subfolder_path, f"{station_name}.yml")) + assert os.path.exists(metadata_fpath) + return metadata_fpath -@pytest.mark.parametrize("url", ["https://raw.githubusercontent.com/ltelab/disdrodb/main/README.md"]) -def test_download_file_from_url(url, tmp_path): - download_data._download_file_from_url(url, tmp_path) - assert os.path.isfile(os.path.join(tmp_path, os.path.basename(url))) is True +def test_download_station_data(tmp_path): + station_name = "station_name" + metadata_fpath = create_fake_metadata_file(tmp_path, station_name=station_name, with_url=True) + station_dir_path = metadata_fpath.replace("metadata", "data").replace(".yml", "") + _download_station_data(metadata_fpath=metadata_fpath, force=True) + # Assert files in the zip file have been unzipped + assert os.path.isfile(os.path.join(station_dir_path, "station_file1.txt")) + # Assert inner zipped files are not unzipped ! 
+ assert os.path.isfile(os.path.join(station_dir_path, "station_file2.zip")) + # Assert inner directories are there + assert os.path.isdir(os.path.join(station_dir_path, "2020")) + # Assert zip file has been removed + assert not os.path.exists(os.path.join(station_dir_path, "station_files.zip")) diff --git a/disdrodb/tests/test_data_transfer/test_upload_data.py b/disdrodb/tests/test_data_transfer/test_upload_data.py index acdb3dc1..ee70e2bf 100644 --- a/disdrodb/tests/test_data_transfer/test_upload_data.py +++ b/disdrodb/tests/test_data_transfer/test_upload_data.py @@ -30,15 +30,15 @@ from disdrodb.utils.zenodo import _create_zenodo_deposition -def create_fake_metadata_file(base_dir, data_source, campaign_name, station_name, data_url=""): +def create_fake_metadata_file(base_dir, data_source, campaign_name, station_name, disdrodb_data_url=""): metadata_dir = base_dir / "Raw" / data_source / campaign_name / "metadata" if not metadata_dir.exists(): metadata_dir.mkdir(parents=True) metadata_fpath = metadata_dir / f"{station_name}.yml" metadata_dict = {} - if data_url: - metadata_dict["data_url"] = data_url + if disdrodb_data_url: + metadata_dict["disdrodb_data_url"] = disdrodb_data_url write_yaml(metadata_dict, metadata_fpath) @@ -99,11 +99,11 @@ def test_upload_to_zenodo(tmp_path, requests_mock): # Check metadata files (1st one should not have changed) metadata_dict1 = get_metadata_dict(base_dir, data_source, campaign_name, station_name1) - new_station_url1 = metadata_dict1["data_url"] + new_station_url1 = metadata_dict1["disdrodb_data_url"] assert new_station_url1 == station_url1 metadata_dict2 = get_metadata_dict(base_dir, data_source, campaign_name, station_name2) - new_station_url2 = metadata_dict2["data_url"] + new_station_url2 = metadata_dict2["disdrodb_data_url"] list_new_station_url2 = new_station_url2.split(os.path.sep) list_new_station_url2 = re.split(r"[\\/]", new_station_url2) diff --git a/docs/source/contribute_data.rst b/docs/source/contribute_data.rst 
index cba0a836..b142eadb 100644 --- a/docs/source/contribute_data.rst +++ b/docs/source/contribute_data.rst @@ -20,7 +20,7 @@ Two types of data must be distinguished: - Stores a standard set of metadata and measurement issues of each disdrometer. - Central storage is provided in the ``disdro-data`` Git repository. - The ``/metadata`` folder contains a YAML metadata file called - ``.yml``. It has a ``data_url`` key that references to the remote/online repository where station's raw data are stored. At this URL, a single zip file provides all data available for a given station. + ``.yml``. It has a ``disdrodb_data_url`` key that refers to the remote/online repository where the station's raw data are stored. At this URL, a single zip file provides all data available for a given station. Data transfer upload and download schema: