Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip catalog attributes - Optional reconstruction version. #501

Merged
merged 6 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ New features and enhancements
* Conservative regridding now supports oblique mercator projections. (:pull:`467`).
* The automatic name for the weight file in ``regrid_dataset`` is now more explicit to avoid errors, but now requires `cat:id` and `cat:domain` arguments for both the source and target datasets. (:pull:`467`).

Breaking changes
^^^^^^^^^^^^^^^^
* Version facet is now optional in default filepath schemas for non-simulations a with "source_version" level. (:issue:`500`, :pull:`501`).
* Catalog attributes are removed by default in ``save_to_zarr`` and ``save_to_netcdf``. Catalog attributes are those added from the catalog columns by ``to_dataset``, ``to_dataset_dict`` and ``extract_dataset``, which have names prefixed with ``cat:``. (:issue:`499`, :pull:`501`).

Bug fixes
^^^^^^^^^
* Fixed bug with reusing weights. (:issue:`411`, :pull:`414`).
Expand Down
3 changes: 2 additions & 1 deletion src/xscen/catutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1051,7 +1051,8 @@ def _get_needed_fields(schema: dict):
needed.add(level)
elif isinstance(level, list):
for lvl in level:
needed.add(lvl)
if not (lvl.startswith("(") and lvl.endswith(")")):
needed.add(lvl)
elif not (isinstance(level, dict) and list(level.keys()) == ["text"]):
raise ValueError(
f"Invalid schema with unknown {level} of type {type(level)}."
Expand Down
10 changes: 5 additions & 5 deletions src/xscen/data/file_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# # There are four ways to specify a folder name to use:
# - < facet > # The value of the facet.
# - (< facet >) # Same, but if the facet is missing, this level is skipped, resulting in a tree of a different depth.
# - [< facet >, < facet >, ...]: # The folder name consists in more than one facet, concatenated with a "_" by default. They can't be optional.
# - [< facet >, < facet >, ...]: # The folder name consists in more than one facet, concatenated with a "_" by default. They can be optional.
# - text: < value > # A fixed string
# filename: # The file name schema, a list of facet names. If a facet is empty, it will be skipped. Elements will be separated by "_".
# # The special "DATES" facet will be replaced by the most concise way found to define the temporal range covered by the file.
Expand All @@ -33,7 +33,7 @@ original-non-sims:
- type
- domain
- institution
- [ source, version ]
- [ source, (version) ]
- (member)
- frequency
- variable
Expand Down Expand Up @@ -92,7 +92,7 @@ original-hydro-reconstruction:
- hydrology_source
- (hydrology_member)
- institution
- [ source, version ]
- [ source, (version) ]
- (member)
- frequency
- variable
Expand Down Expand Up @@ -199,7 +199,7 @@ derived-reconstruction:
folders:
- type
- institution
- [ source, version ]
- [ source, (version) ]
- (member)
- domain
- processing_level
Expand Down Expand Up @@ -261,7 +261,7 @@ derived-hydro-reconstruction:
- hydrology_source
- (hydrology_member)
- institution
- [ source, version ]
- [ source, (version) ]
- (member)
- domain
- processing_level
Expand Down
14 changes: 13 additions & 1 deletion src/xscen/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

from .config import parse_config
from .scripting import TimeoutException
from .utils import TRANSLATOR, season_sort_key, translate_time_chunk
from .utils import TRANSLATOR, season_sort_key, strip_cat_attrs, translate_time_chunk

logger = logging.getLogger(__name__)
KEEPBITS = defaultdict(lambda: 12)
Expand Down Expand Up @@ -374,6 +374,7 @@ def save_to_netcdf(
bitround: bool | int | dict = False,
compute: bool = True,
netcdf_kwargs: dict | None = None,
strip_cat_metadata: bool = True,
):
"""Save a Dataset to NetCDF, rechunking or compressing if requested.

Expand All @@ -399,6 +400,8 @@ def save_to_netcdf(
Whether to start the computation or return a delayed object.
netcdf_kwargs : dict, optional
Additional arguments to send to_netcdf()
strip_cat_metadata : bool
If True (default), strips all catalog-added attributes before saving the dataset.

Returns
-------
Expand All @@ -425,6 +428,9 @@ def save_to_netcdf(
# Remove original_shape from encoding, since it can cause issues with some engines.
ds[var].encoding.pop("original_shape", None)

if strip_cat_metadata:
ds = strip_cat_attrs(ds)

_coerce_attrs(ds.attrs)
for var in ds.variables.values():
_coerce_attrs(var.attrs)
Expand All @@ -445,6 +451,7 @@ def save_to_zarr( # noqa: C901
mode: str = "f",
itervar: bool = False,
timeout_cleanup: bool = True,
strip_cat_metadata: bool = True,
):
"""
Save a Dataset to Zarr format, rechunking and compressing if requested.
Expand Down Expand Up @@ -487,6 +494,8 @@ def save_to_zarr( # noqa: C901
If True (default) and a :py:class:`xscen.scripting.TimeoutException` is raised during the writing,
the variable being written is removed from the dataset as it is incomplete.
This does nothing if `compute` is False.
strip_cat_metadata : bool
If True (default), strips all catalog-added attributes before saving the dataset.

Returns
-------
Expand Down Expand Up @@ -561,6 +570,9 @@ def _skip(var):
if len(ds.data_vars) == 0:
return None

if strip_cat_metadata:
ds = strip_cat_attrs(ds)

_coerce_attrs(ds.attrs)
for var in ds.variables.values():
_coerce_attrs(var.attrs)
Expand Down
11 changes: 9 additions & 2 deletions src/xscen/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,15 @@ def get_cat_attrs(
return facets


def strip_cat_attrs(ds: xr.Dataset, prefix: str = "cat:"):
"""Remove attributes added from the catalog by `to_dataset` or `extract_dataset`."""
dsc = ds.copy()
for k in list(dsc.attrs):
if k.startswith(prefix):
del dsc.attrs[k]
return dsc


@parse_config
def maybe_unstack(
ds: xr.Dataset,
Expand Down Expand Up @@ -923,12 +932,10 @@ def clean_up( # noqa: C901
msg = f"Converting units: {variables_and_units}"
logger.info(msg)
ds = change_units(ds=ds, variables_and_units=variables_and_units)

# convert calendar
if convert_calendar_kwargs:
# create mask of grid point that should always be nan
ocean = ds.isnull().all("time")

# if missing_by_var exist make sure missing data are added to time axis
if missing_by_var:
if not all(k in missing_by_var.keys() for k in ds.data_vars):
Expand Down
18 changes: 13 additions & 5 deletions tests/test_catutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,20 +273,28 @@ def test_pattern_from_schema(samplecat):
assert any(res)


def test_build_path_ds():
@pytest.mark.parametrize("hasver", [True, False])
def test_build_path_ds(hasver):
ds = xr.tutorial.open_dataset("air_temperature")
ds = ds.assign(time=xr.cftime_range("0001-01-01", freq="6h", periods=ds.time.size))
ds.attrs.update(source="source", institution="institution")
if hasver:
ds.attrs["version"] = "v1"
new_path = cu.build_path(
ds,
schemas={
"folders": ["source", "institution", ["variable", "xrfreq"]],
"folders": [["source", "(version)"], "institution", ["variable", "xrfreq"]],
"filename": ["source", "institution", "variable", "frequency", "DATES"],
},
)
assert new_path == Path(
"source/institution/air_6h/source_institution_air_6hr_0001-0002"
)
if hasver:
assert new_path == Path(
"source_v1/institution/air_6h/source_institution_air_6hr_0001-0002"
)
else:
assert new_path == Path(
"source/institution/air_6h/source_institution_air_6hr_0001-0002"
)


def test_build_path_multivar(samplecat):
Expand Down
4 changes: 4 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,10 @@ def test_get_cat_attrs(self, ds, prefix, var_as_str):
elif prefix == "dog:":
assert out == {"source": "CanESM5"}

def test_strip_cat_attrs(self):
out = xs.utils.strip_cat_attrs(self.ds)
assert list(out.attrs.keys()) == ["dog:source"]


class TestStack:
def test_no_nan(self):
Expand Down
Loading