From c46f3492d162bc850adcde9a91fd705abd521d89 Mon Sep 17 00:00:00 2001 From: Zhiyi Wu Date: Sat, 30 Dec 2023 12:17:47 +0000 Subject: [PATCH 1/3] update --- CHANGES | 3 ++ devtools/conda-envs/test_env.yaml | 2 +- src/alchemlyb/parsing/parquet.py | 39 +++++++++++++++++++-- src/alchemlyb/tests/conftest.py | 2 +- src/alchemlyb/tests/parsing/test_parquet.py | 35 +++++++++++++++++- src/alchemlyb/tests/test_preprocessing.py | 6 ++-- src/alchemlyb/tests/test_workflow_ABFE.py | 2 +- 7 files changed, 79 insertions(+), 10 deletions(-) diff --git a/CHANGES b/CHANGES index 7a8ed754..6c3b5f60 100644 --- a/CHANGES +++ b/CHANGES @@ -17,6 +17,9 @@ The rules for this file: */*/2023 hl2500 * 2.2.0 + +Changes + - For pandas>=2.1, metadata will be loaded from the parquet file (issue #331, PR #326). Enhancements - Add a TI estimator using gaussian quadrature to calculate the free energy. diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 87470b55..a8795f1c 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -4,7 +4,7 @@ channels: dependencies: - python - numpy -- pandas +- pandas>=2.1 - pymbar>=4 - scipy - scikit-learn diff --git a/src/alchemlyb/parsing/parquet.py b/src/alchemlyb/parsing/parquet.py index 180817ae..c06664c7 100644 --- a/src/alchemlyb/parsing/parquet.py +++ b/src/alchemlyb/parsing/parquet.py @@ -1,9 +1,42 @@ import pandas as pd +from loguru import logger from . import _init_attrs -@_init_attrs +def _check_metadata(path: str, T: float) -> pd.DataFrame: + """ + Check if the metadata is included in the Dataframe and has the correct + temperature. + + Parameters + ---------- + path : str + Path to parquet file to extract dataframe from. + T : float + Temperature in Kelvin of the simulations. + + Returns + ------- + DataFrame + """ + df = pd.read_parquet(path) + if "temperature" not in df.attrs: + logger.warning( + f"No temperature metadata found in {path}. " + f"Serialise the Dataframe with pandas>=2.1 to preserve the metadata." + ) + df.attrs["temperature"] = T + df.attrs["energy_unit"] = "kT" + else: + if df.attrs["temperature"] != T: + raise ValueError( + f"Temperature in the input ({T}) doesn't match the temperature " + f"in the dataframe ({df.attrs['temperature']})." + ) + return df + + def extract_u_nk(path, T): r"""Return reduced potentials `u_nk` (unit: kT) from a pandas parquet file. @@ -36,7 +69,7 @@ def extract_u_nk(path, T): .. versionadded:: 2.1.0 """ - u_nk = pd.read_parquet(path) + u_nk = _check_metadata(path, T) columns = list(u_nk.columns) if isinstance(columns[0], str) and columns[0][0] == "(": new_columns = [] @@ -81,4 +114,4 @@ def extract_dHdl(path, T): .. versionadded:: 2.1.0 """ - return pd.read_parquet(path) + return _check_metadata(path, T) diff --git a/src/alchemlyb/tests/conftest.py b/src/alchemlyb/tests/conftest.py index d7a3749d..ac216471 100644 --- a/src/alchemlyb/tests/conftest.py +++ b/src/alchemlyb/tests/conftest.py @@ -82,7 +82,7 @@ def gmx_ABFE(): @pytest.fixture -def gmx_ABFE_complex_n_uk(gmx_ABFE): +def gmx_ABFE_complex_u_nk(gmx_ABFE): return [gmx.extract_u_nk(file, T=300) for file in gmx_ABFE["complex"]] diff --git a/src/alchemlyb/tests/parsing/test_parquet.py b/src/alchemlyb/tests/parsing/test_parquet.py index a2d788f6..c42cccb6 100644 --- a/src/alchemlyb/tests/parsing/test_parquet.py +++ b/src/alchemlyb/tests/parsing/test_parquet.py @@ -12,12 +12,45 @@ def test_extract_dHdl(dHdl_list, request, tmp_path): new_dHdl = extract_dHdl(str(tmp_path / "dhdl.parquet"), T=300) assert (new_dHdl.columns == dHdl.columns).all() assert (new_dHdl.index == dHdl.index).all() + assert new_dHdl.attrs["temperature"] == 300 + assert new_dHdl.attrs["energy_unit"] == "kT" -@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_n_uk"]) +@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_u_nk"]) def test_extract_dHdl(u_nk_list, request, tmp_path): u_nk = request.getfixturevalue(u_nk_list)[0] u_nk.to_parquet(path=str(tmp_path / "u_nk.parquet"), index=True) new_u_nk = extract_u_nk(str(tmp_path / "u_nk.parquet"), T=300) assert (new_u_nk.columns == u_nk.columns).all() assert (new_u_nk.index == u_nk.index).all() + assert new_u_nk.attrs["temperature"] == 300 + assert new_u_nk.attrs["energy_unit"] == "kT" + + +@pytest.fixture() +def u_nk(gmx_ABFE_complex_u_nk): + return gmx_ABFE_complex_u_nk[0] + + +def test_no_T(u_nk, tmp_path, caplog): + u_nk.attrs = {} + u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True) + extract_u_nk(str(tmp_path / "temp.parquet"), 300) + assert ( + "Serialise the Dataframe with pandas>=2.1 to preserve the metadata." + in caplog.text + ) + + +def test_wrong_T(u_nk, tmp_path, caplog): + u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True) + with pytest.raises(ValueError, match="doesn't match the temperature"): + extract_u_nk(str(tmp_path / "temp.parquet"), 400) + + +def test_metadata_unchanged(u_nk, tmp_path): + u_nk.attrs = {"temperature": 400, "energy_unit": "kcal/mol"} + u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True) + new_u_nk = extract_u_nk(str(tmp_path / "temp.parquet"), 400) + assert new_u_nk.attrs["temperature"] == 400 + assert new_u_nk.attrs["energy_unit"] == "kcal/mol" diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 00e3c030..b59f66b4 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -41,8 +41,8 @@ def u_nk(gmx_benzene_Coulomb_u_nk): @pytest.fixture() -def multi_index_u_nk(gmx_ABFE_complex_n_uk): - return gmx_ABFE_complex_n_uk[0] +def multi_index_u_nk(gmx_ABFE_complex_u_nk): + return gmx_ABFE_complex_u_nk[0] @pytest.fixture() @@ -470,7 +470,7 @@ def test_decorrelate_dhdl_multiple_l(multi_index_dHdl): ) -def test_raise_non_uk(multi_index_dHdl): +def test_raise_nou_nk(multi_index_dHdl): with pytest.raises(ValueError): decorrelate_u_nk( multi_index_dHdl, diff --git a/src/alchemlyb/tests/test_workflow_ABFE.py b/src/alchemlyb/tests/test_workflow_ABFE.py index 82cc83ab..c2dc4b15 100644 --- a/src/alchemlyb/tests/test_workflow_ABFE.py +++ b/src/alchemlyb/tests/test_workflow_ABFE.py @@ -258,7 +258,7 @@ def test_single_estimator_ti(self, workflow, monkeypatch): summary = workflow.generate_result() assert np.isclose(summary["TI"]["Stages"]["TOTAL"], 21.51472826028906, 0.1) - def test_unprocessed_n_uk(self, workflow, monkeypatch): + def test_unprocessed_u_nk(self, workflow, monkeypatch): monkeypatch.setattr(workflow, "u_nk_sample_list", None) monkeypatch.setattr(workflow, "estimator", dict()) workflow.estimate() From 49736d65ed24bbb23a79a1b684ff601b6a976dcc Mon Sep 17 00:00:00 2001 From: Zhiyi Wu Date: Sat, 30 Dec 2023 12:18:41 +0000 Subject: [PATCH 2/3] update changelog --- CHANGES | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 6c3b5f60..e0e937a6 100644 --- a/CHANGES +++ b/CHANGES @@ -19,7 +19,7 @@ The rules for this file: * 2.2.0 Changes - - For pandas>=2.1, metadata will be loaded from the parquet file (issue #331, PR #326). + - For pandas>=2.1, metadata will be loaded from the parquet file (issue #331, PR #340). Enhancements - Add a TI estimator using gaussian quadrature to calculate the free energy. From 7f6d90e62541755d2a727caba1f55a0688d34698 Mon Sep 17 00:00:00 2001 From: "William (Zhiyi) Wu" Date: Wed, 3 Jan 2024 09:48:57 +0000 Subject: [PATCH 3/3] update function name --- environment.yml | 2 +- setup.py | 2 +- src/alchemlyb/parsing/parquet.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index 4d2d9bda..7699f270 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: - python - numpy -- pandas +- pandas>=2.1 - pymbar>=4 - scipy - scikit-learn diff --git a/setup.py b/setup.py index 882a3866..67df7d72 100755 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ tests_require=["pytest", "alchemtest"], install_requires=[ "numpy", - "pandas>=1.4", + "pandas>=2.1", "pymbar>=4", "scipy", "scikit-learn", diff --git a/src/alchemlyb/parsing/parquet.py b/src/alchemlyb/parsing/parquet.py index c06664c7..8f9fc408 100644 --- a/src/alchemlyb/parsing/parquet.py +++ b/src/alchemlyb/parsing/parquet.py @@ -4,7 +4,7 @@ from . import _init_attrs -def _check_metadata(path: str, T: float) -> pd.DataFrame: +def _read_parquet_with_metadata(path: str, T: float) -> pd.DataFrame: """ Check if the metadata is included in the Dataframe and has the correct temperature. @@ -69,7 +69,7 @@ def extract_u_nk(path, T): .. versionadded:: 2.1.0 """ - u_nk = _check_metadata(path, T) + u_nk = _read_parquet_with_metadata(path, T) columns = list(u_nk.columns) if isinstance(columns[0], str) and columns[0][0] == "(": new_columns = [] @@ -114,4 +114,4 @@ def extract_dHdl(path, T): .. versionadded:: 2.1.0 """ - return _check_metadata(path, T) + return _read_parquet_with_metadata(path, T)