Skip to content

Commit

Permalink
Merge branch 'master' into 71-joss-paper
Browse files Browse the repository at this point in the history
  • Loading branch information
xiki-tempula authored Jan 3, 2024
2 parents edb9429 + d8a705a commit 966e386
Show file tree
Hide file tree
Showing 12 changed files with 119 additions and 21 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,24 +29,24 @@ jobs:
fail-fast: false
matrix:
os: ["ubuntu-latest", "macOS-latest", "windows-latest"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
# Only test lowest and highest version on the expensive/slow
# macOS and windows runners (UPDATE when supported versions change):
exclude:
- os: macOS-latest
python-version: 3.10
- os: macOS-latest
python-version: 3.9
python-version: 3.11
- os: windows-latest
python-version: 3.10
- os: windows-latest
python-version: 3.9
python-version: 3.11

steps:
- uses: actions/checkout@v3

# More info on options: https://github.com/conda-incubator/setup-miniconda
- uses: mamba-org/provision-with-micromamba@main
- uses: mamba-org/setup-micromamba@main
with:
environment-file: devtools/conda-envs/test_env.yaml
environment-name: test
Expand Down
13 changes: 11 additions & 2 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,22 @@ The rules for this file:

------------------------------------------------------------------------------

*/*/2023 hl2500
*/*/2023 hl2500, xiki-tempula

* 2.2.0


Changes
- Require pandas >= 2.1 (PR #340)
- For pandas>=2.1, metadata will be loaded from the parquet file (issue #331, PR #340).
- Add support for Python 3.12, remove Python 3.8 support (issue #341, PR #304).

Enhancements
- Add a TI estimator using gaussian quadrature to calculate the free energy.
(issue #302, PR #304)
- Warning issued when the series is `None` for `statistical_inefficiency`
(issue #337, PR #338)
- ValueError issued when `df` and `series` for `statistical_inefficiency`
  do not have the same length (issue #337, PR #338)


22/06/2023 xiki-tempula
Expand Down
2 changes: 1 addition & 1 deletion devtools/conda-envs/test_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ channels:
dependencies:
- python
- numpy
- pandas
- pandas>=2.1
- pymbar>=4
- scipy
- scikit-learn
Expand Down
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ channels:
dependencies:
- python
- numpy
- pandas
- pandas>=2.1
- pymbar>=4
- scipy
- scikit-learn
- pyarrow
- matplotlib
- loguru
2 changes: 1 addition & 1 deletion readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ build:
python: "mambaforge-4.10"

conda:
environment: environment.yml
environment: devtools/conda-envs/test_env.yaml

python:
install:
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
"Operating System :: Microsoft :: Windows ",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Topic :: Scientific/Engineering :: Chemistry",
Expand All @@ -43,11 +43,11 @@
license="BSD",
long_description=open("README.rst").read(),
long_description_content_type="text/x-rst",
python_requires=">=3.8",
python_requires=">=3.9",
tests_require=["pytest", "alchemtest"],
install_requires=[
"numpy",
"pandas>=1.4",
"pandas>=2.1",
"pymbar>=4",
"scipy",
"scikit-learn",
Expand Down
39 changes: 36 additions & 3 deletions src/alchemlyb/parsing/parquet.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,42 @@
import pandas as pd
from loguru import logger

from . import _init_attrs


@_init_attrs
def _read_parquet_with_metadata(path: str, T: float) -> pd.DataFrame:
    """
    Read a parquet file and ensure temperature metadata is present and
    consistent.

    If the stored DataFrame carries no ``temperature`` attribute (metadata
    survives the parquet round-trip only with pandas>=2.1), a warning is
    logged and ``attrs`` is populated from *T* with the energy unit set to
    ``kT``. If metadata is present, the stored temperature must match *T*.

    Parameters
    ----------
    path : str
        Path to parquet file to extract dataframe from.
    T : float
        Temperature in Kelvin of the simulations.

    Returns
    -------
    DataFrame
        DataFrame with ``temperature`` and ``energy_unit`` set in ``attrs``.

    Raises
    ------
    ValueError
        If the temperature recorded in the file differs from *T*.
    """
    df = pd.read_parquet(path)
    if "temperature" not in df.attrs:
        # No metadata in the file (written with pandas<2.1, which drops
        # DataFrame.attrs): fall back to the caller-supplied temperature
        # and the default kT energy unit.
        logger.warning(
            f"No temperature metadata found in {path}. "
            f"Serialise the Dataframe with pandas>=2.1 to preserve the metadata."
        )
        df.attrs["temperature"] = T
        df.attrs["energy_unit"] = "kT"
    else:
        if df.attrs["temperature"] != T:
            raise ValueError(
                f"Temperature in the input ({T}) doesn't match the temperature "
                f"in the dataframe ({df.attrs['temperature']})."
            )
    return df


def extract_u_nk(path, T):
r"""Return reduced potentials `u_nk` (unit: kT) from a pandas parquet file.
Expand Down Expand Up @@ -36,7 +69,7 @@ def extract_u_nk(path, T):
.. versionadded:: 2.1.0
"""
u_nk = pd.read_parquet(path)
u_nk = _read_parquet_with_metadata(path, T)
columns = list(u_nk.columns)
if isinstance(columns[0], str) and columns[0][0] == "(":
new_columns = []
Expand Down Expand Up @@ -81,4 +114,4 @@ def extract_dHdl(path, T):
.. versionadded:: 2.1.0
"""
return pd.read_parquet(path)
return _read_parquet_with_metadata(path, T)
9 changes: 9 additions & 0 deletions src/alchemlyb/preprocessing/subsampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,15 @@ def _prepare_input(df, series, drop_duplicates, sort):
series : Series
Formatted Series.
"""
if series is None:
warnings.warn(
"The series input is `None`, would not subsample according to statistical inefficiency."
)

elif len(df) != len(series):
raise ValueError(
f"The length of df ({len(df)}) should be same as the length of series ({len(series)})."
)
if _check_multiple_times(df):
if drop_duplicates:
df, series = _drop_duplicates(df, series)
Expand Down
2 changes: 1 addition & 1 deletion src/alchemlyb/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def gmx_ABFE():


@pytest.fixture
def gmx_ABFE_complex_n_uk(gmx_ABFE):
def gmx_ABFE_complex_u_nk(gmx_ABFE):
return [gmx.extract_u_nk(file, T=300) for file in gmx_ABFE["complex"]]


Expand Down
35 changes: 34 additions & 1 deletion src/alchemlyb/tests/parsing/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,45 @@ def test_extract_dHdl(dHdl_list, request, tmp_path):
new_dHdl = extract_dHdl(str(tmp_path / "dhdl.parquet"), T=300)
assert (new_dHdl.columns == dHdl.columns).all()
assert (new_dHdl.index == dHdl.index).all()
assert new_dHdl.attrs["temperature"] == 300
assert new_dHdl.attrs["energy_unit"] == "kT"


@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_n_uk"])
@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_u_nk"])
def test_extract_u_nk(u_nk_list, request, tmp_path):
    """Round-trip u_nk through parquet and check data and metadata survive.

    Renamed from ``test_extract_dHdl``: the duplicate name shadowed the
    dHdl round-trip test defined above, so pytest silently collected only
    one of the two.
    """
    u_nk = request.getfixturevalue(u_nk_list)[0]
    u_nk.to_parquet(path=str(tmp_path / "u_nk.parquet"), index=True)
    new_u_nk = extract_u_nk(str(tmp_path / "u_nk.parquet"), T=300)
    # Columns and index must round-trip unchanged.
    assert (new_u_nk.columns == u_nk.columns).all()
    assert (new_u_nk.index == u_nk.index).all()
    # Metadata is restored from the parquet file (pandas>=2.1).
    assert new_u_nk.attrs["temperature"] == 300
    assert new_u_nk.attrs["energy_unit"] == "kT"


@pytest.fixture()
def u_nk(gmx_ABFE_complex_u_nk):
    # Single representative u_nk DataFrame (first leg of the ABFE complex
    # set) for the metadata tests below.
    return gmx_ABFE_complex_u_nk[0]


def test_no_T(u_nk, tmp_path, caplog):
    """A parquet file without temperature metadata triggers a warning."""
    # Simulate a file written by pandas<2.1, which drops DataFrame.attrs.
    u_nk.attrs = {}
    u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True)
    extract_u_nk(str(tmp_path / "temp.parquet"), 300)
    # NOTE(review): relies on loguru messages being propagated to the
    # standard logging captured by caplog — confirm a propagation fixture
    # is configured for the test session.
    assert (
        "Serialise the Dataframe with pandas>=2.1 to preserve the metadata."
        in caplog.text
    )


def test_wrong_T(u_nk, tmp_path):
    """Extracting with a temperature that disagrees with the stored
    metadata raises a ValueError.

    The unused ``caplog`` fixture parameter was removed — nothing in the
    test reads captured log output.
    """
    u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True)
    with pytest.raises(ValueError, match="doesn't match the temperature"):
        extract_u_nk(str(tmp_path / "temp.parquet"), 400)


def test_metadata_unchanged(u_nk, tmp_path):
    """Metadata stored in the parquet file is preserved on extraction
    when it matches the requested temperature."""
    u_nk.attrs = {"temperature": 400, "energy_unit": "kcal/mol"}
    u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True)
    new_u_nk = extract_u_nk(str(tmp_path / "temp.parquet"), 400)
    # Stored attrs win over defaults: energy_unit is not reset to kT.
    assert new_u_nk.attrs["temperature"] == 400
    assert new_u_nk.attrs["energy_unit"] == "kcal/mol"
19 changes: 16 additions & 3 deletions src/alchemlyb/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def u_nk(gmx_benzene_Coulomb_u_nk):


@pytest.fixture()
def multi_index_u_nk(gmx_ABFE_complex_n_uk):
return gmx_ABFE_complex_n_uk[0]
def multi_index_u_nk(gmx_ABFE_complex_u_nk):
return gmx_ABFE_complex_u_nk[0]


@pytest.fixture()
Expand Down Expand Up @@ -470,7 +470,7 @@ def test_decorrelate_dhdl_multiple_l(multi_index_dHdl):
)


def test_raise_non_uk(multi_index_dHdl):
def test_raise_nou_nk(multi_index_dHdl):
with pytest.raises(ValueError):
decorrelate_u_nk(
multi_index_dHdl,
Expand Down Expand Up @@ -544,3 +544,16 @@ def test_statistical_inefficiency(self, caplog, u_nk):
assert "Running statistical inefficiency analysis." in caplog.text
assert "Statistical inefficiency:" in caplog.text
assert "Number of uncorrelated samples:" in caplog.text


def test_unequil_input(dHdl):
    """Mismatched df/series lengths are rejected with a ValueError."""
    with pytest.raises(ValueError, match="should be same as the length of series"):
        statistical_inefficiency(dHdl, series=dHdl[:10])


def test_series_none(dHdl):
    """Passing series=None warns that no subsampling will be performed."""
    with pytest.warns(
        UserWarning,
        match="The series input is `None`, would not subsample according to statistical inefficiency.",
    ):
        statistical_inefficiency(dHdl, series=None)
2 changes: 1 addition & 1 deletion src/alchemlyb/tests/test_workflow_ABFE.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def test_single_estimator_ti(self, workflow, monkeypatch):
summary = workflow.generate_result()
assert np.isclose(summary["TI"]["Stages"]["TOTAL"], 21.51472826028906, 0.1)

def test_unprocessed_n_uk(self, workflow, monkeypatch):
def test_unprocessed_u_nk(self, workflow, monkeypatch):
monkeypatch.setattr(workflow, "u_nk_sample_list", None)
monkeypatch.setattr(workflow, "estimator", dict())
workflow.estimate()
Expand Down

0 comments on commit 966e386

Please sign in to comment.