Skip to content

Commit

Permalink
Strip catalog attributes - Optional reconstruction version. (#501)
Browse files Browse the repository at this point in the history
<!-- Please ensure the PR fulfills the following requirements! -->
<!-- If this is your first PR, make sure to add your details to the
AUTHORS.rst! -->
### Pull Request Checklist:
- [x] This PR addresses an already opened issue (for bug fixes /
features)
    - This PR fixes #499 and fixes #500.
- [x] (If applicable) Documentation has been added / updated (for bug
fixes / features).
- [x] (If applicable) Tests have been added.
- [x] This PR does not seem to break the templates.
- [x] CHANGELOG.rst has been updated (with summary of main changes).
- [x] Link to issue (:issue:`number`) and pull request (:pull:`number`)
has been added.

### What kind of change does this PR introduce?

* New `strip_cat_attrs` utility to remove those attributes added by the
catalog opening functions.
    + Use this by default in `save_to_zarr` and `save_to_netcdf`.
* In `build_path`, added possibility to have aggregated folder levels
with optional components.
    + Make `version` optionnal in the non-sim schemas.

### Does this PR introduce a breaking change?
* Yes. Saved files will have a few attributes removed. 
* Yes. `build_path` will not fail on a reconstruction dataset missing a
version.
  • Loading branch information
aulemahal authored Jan 17, 2025
2 parents 1d21e3e + 7f3f276 commit 9925181
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 14 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ New features and enhancements
* Conservative regridding now supports oblique mercator projections. (:pull:`467`).
* The automatic name for the weight file in ``regrid_dataset`` is now more explicit to avoid errors, but now requires `cat:id` and `cat:domain` arguments for both the source and target datasets. (:pull:`467`).

Breaking changes
^^^^^^^^^^^^^^^^
* Version facet is now optional in default filepath schemas for non-simulations a with "source_version" level. (:issue:`500`, :pull:`501`).
* Catalog attributes are removed by default in ``save_to_zarr`` and ``save_to_netcdf``. Catalog attributes are those added from the catalog columns by ``to_dataset``, ``to_dataset_dict`` and ``extract_dataset``, which have names prefixed with ``cat:``. (:issue:`499`, :pull:`501`).

Bug fixes
^^^^^^^^^
* Fixed bug with reusing weights. (:issue:`411`, :pull:`414`).
Expand Down
3 changes: 2 additions & 1 deletion src/xscen/catutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1051,7 +1051,8 @@ def _get_needed_fields(schema: dict):
needed.add(level)
elif isinstance(level, list):
for lvl in level:
needed.add(lvl)
if not (lvl.startswith("(") and lvl.endswith(")")):
needed.add(lvl)
elif not (isinstance(level, dict) and list(level.keys()) == ["text"]):
raise ValueError(
f"Invalid schema with unknown {level} of type {type(level)}."
Expand Down
10 changes: 5 additions & 5 deletions src/xscen/data/file_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# # There are four ways to specify a folder name to use:
# - < facet > # The value of the facet.
# - (< facet >) # Same, but if the facet is missing, this level is skipped, resulting in a tree of a different depth.
# - [< facet >, < facet >, ...]: # The folder name consists in more than one facet, concatenated with a "_" by default. They can't be optional.
# - [< facet >, < facet >, ...]: # The folder name consists in more than one facet, concatenated with a "_" by default. They can be optional.
# - text: < value > # A fixed string
# filename: # The file name schema, a list of facet names. If a facet is empty, it will be skipped. Elements will be separated by "_".
# # The special "DATES" facet will be replaced by the most concise way found to define the temporal range covered by the file.
Expand All @@ -33,7 +33,7 @@ original-non-sims:
- type
- domain
- institution
- [ source, version ]
- [ source, (version) ]
- (member)
- frequency
- variable
Expand Down Expand Up @@ -92,7 +92,7 @@ original-hydro-reconstruction:
- hydrology_source
- (hydrology_member)
- institution
- [ source, version ]
- [ source, (version) ]
- (member)
- frequency
- variable
Expand Down Expand Up @@ -199,7 +199,7 @@ derived-reconstruction:
folders:
- type
- institution
- [ source, version ]
- [ source, (version) ]
- (member)
- domain
- processing_level
Expand Down Expand Up @@ -261,7 +261,7 @@ derived-hydro-reconstruction:
- hydrology_source
- (hydrology_member)
- institution
- [ source, version ]
- [ source, (version) ]
- (member)
- domain
- processing_level
Expand Down
14 changes: 13 additions & 1 deletion src/xscen/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

from .config import parse_config
from .scripting import TimeoutException
from .utils import TRANSLATOR, season_sort_key, translate_time_chunk
from .utils import TRANSLATOR, season_sort_key, strip_cat_attrs, translate_time_chunk

logger = logging.getLogger(__name__)
KEEPBITS = defaultdict(lambda: 12)
Expand Down Expand Up @@ -374,6 +374,7 @@ def save_to_netcdf(
bitround: bool | int | dict = False,
compute: bool = True,
netcdf_kwargs: dict | None = None,
strip_cat_metadata: bool = True,
):
"""Save a Dataset to NetCDF, rechunking or compressing if requested.
Expand All @@ -399,6 +400,8 @@ def save_to_netcdf(
Whether to start the computation or return a delayed object.
netcdf_kwargs : dict, optional
Additional arguments to send to_netcdf()
strip_cat_metadata : bool
If True (default), strips all catalog-added attributes before saving the dataset.
Returns
-------
Expand All @@ -425,6 +428,9 @@ def save_to_netcdf(
# Remove original_shape from encoding, since it can cause issues with some engines.
ds[var].encoding.pop("original_shape", None)

if strip_cat_metadata:
ds = strip_cat_attrs(ds)

_coerce_attrs(ds.attrs)
for var in ds.variables.values():
_coerce_attrs(var.attrs)
Expand All @@ -445,6 +451,7 @@ def save_to_zarr( # noqa: C901
mode: str = "f",
itervar: bool = False,
timeout_cleanup: bool = True,
strip_cat_metadata: bool = True,
):
"""
Save a Dataset to Zarr format, rechunking and compressing if requested.
Expand Down Expand Up @@ -487,6 +494,8 @@ def save_to_zarr( # noqa: C901
If True (default) and a :py:class:`xscen.scripting.TimeoutException` is raised during the writing,
the variable being written is removed from the dataset as it is incomplete.
This does nothing if `compute` is False.
strip_cat_metadata : bool
If True (default), strips all catalog-added attributes before saving the dataset.
Returns
-------
Expand Down Expand Up @@ -561,6 +570,9 @@ def _skip(var):
if len(ds.data_vars) == 0:
return None

if strip_cat_metadata:
ds = strip_cat_attrs(ds)

_coerce_attrs(ds.attrs)
for var in ds.variables.values():
_coerce_attrs(var.attrs)
Expand Down
11 changes: 9 additions & 2 deletions src/xscen/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,15 @@ def get_cat_attrs(
return facets


def strip_cat_attrs(ds: xr.Dataset, prefix: str = "cat:"):
"""Remove attributes added from the catalog by `to_dataset` or `extract_dataset`."""
dsc = ds.copy()
for k in list(dsc.attrs):
if k.startswith(prefix):
del dsc.attrs[k]
return dsc


@parse_config
def maybe_unstack(
ds: xr.Dataset,
Expand Down Expand Up @@ -923,12 +932,10 @@ def clean_up( # noqa: C901
msg = f"Converting units: {variables_and_units}"
logger.info(msg)
ds = change_units(ds=ds, variables_and_units=variables_and_units)

# convert calendar
if convert_calendar_kwargs:
# create mask of grid point that should always be nan
ocean = ds.isnull().all("time")

# if missing_by_var exist make sure missing data are added to time axis
if missing_by_var:
if not all(k in missing_by_var.keys() for k in ds.data_vars):
Expand Down
18 changes: 13 additions & 5 deletions tests/test_catutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,20 +273,28 @@ def test_pattern_from_schema(samplecat):
assert any(res)


def test_build_path_ds():
@pytest.mark.parametrize("hasver", [True, False])
def test_build_path_ds(hasver):
ds = xr.tutorial.open_dataset("air_temperature")
ds = ds.assign(time=xr.cftime_range("0001-01-01", freq="6h", periods=ds.time.size))
ds.attrs.update(source="source", institution="institution")
if hasver:
ds.attrs["version"] = "v1"
new_path = cu.build_path(
ds,
schemas={
"folders": ["source", "institution", ["variable", "xrfreq"]],
"folders": [["source", "(version)"], "institution", ["variable", "xrfreq"]],
"filename": ["source", "institution", "variable", "frequency", "DATES"],
},
)
assert new_path == Path(
"source/institution/air_6h/source_institution_air_6hr_0001-0002"
)
if hasver:
assert new_path == Path(
"source_v1/institution/air_6h/source_institution_air_6hr_0001-0002"
)
else:
assert new_path == Path(
"source/institution/air_6h/source_institution_air_6hr_0001-0002"
)


def test_build_path_multivar(samplecat):
Expand Down
4 changes: 4 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,10 @@ def test_get_cat_attrs(self, ds, prefix, var_as_str):
elif prefix == "dog:":
assert out == {"source": "CanESM5"}

def test_strip_cat_attrs(self):
out = xs.utils.strip_cat_attrs(self.ds)
assert list(out.attrs.keys()) == ["dog:source"]


class TestStack:
def test_no_nan(self):
Expand Down

0 comments on commit 9925181

Please sign in to comment.