From ffdb901a5cd3d571577990f2be4fd951ebcf42de Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 26 Apr 2023 13:05:40 -0500 Subject: [PATCH 01/69] PathLike wrapper/cache for ExternalStorage --- gufe/storage/pseudodirectory.py | 156 +++++++++++++++++++++ gufe/tests/storage/test_pseudodirectory.py | 107 ++++++++++++++ 2 files changed, 263 insertions(+) create mode 100644 gufe/storage/pseudodirectory.py create mode 100644 gufe/tests/storage/test_pseudodirectory.py diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/pseudodirectory.py new file mode 100644 index 00000000..b595dafa --- /dev/null +++ b/gufe/storage/pseudodirectory.py @@ -0,0 +1,156 @@ +from typing import Union, Optional +from pathlib import Path +from os import PathLike +from .externalresource import ExternalStorage + +import logging +_logger = logging.getLogger(__name__) + +class SharedRoot: + """ + Parameters + ---------- + scratch : os.PathLike + the scratch directory shared by all objects on this host + external : :class:`.ExternalStorage` + external storage resource where objects should eventualy go + prefix : str + label for this specific unit + holding : os.PathLike + name of the subdirectory of scratch where shared results are + temporarily stored; default is '.holding'. This must be the same for + all units within a DAG. + delete_holding : bool + whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX + directory when this object is deleted + read_only : bool + write to prevent NEW files from being written within this shared + directory. NOTE: This will not prevent overwrite of existing files, + in scratch space, but it will prevent changed files from uploading + to the external storage. 
+ """ + def __init__( + self, + scratch: PathLike, + external: ExternalStorage, + prefix: str, + *, + holding: PathLike =".holding", + delete_holding: bool = True, + read_only: bool = False, + ): + self.external = external + self.scratch = Path(scratch) + self.prefix = Path(prefix) + self.read_only = read_only + self.delete_holding = delete_holding + self.holding = holding + + self.registry = set() + # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of + # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: + # 1. This doesn't take any of the user's namespace from their + # $SCRATCH/$PREFIX directory. + # 2. This allows us to easily use an external FileStorage where the + # external storage is exactly the same as this local storage, + # meaning that copies to/from the external storage are no-ops. + # Use FileStorage(scratch / holding) for that. + self.shared_dir = self.scratch / holding / prefix + self.shared_dir.mkdir(exist_ok=True, parents=True) + + def get_other_shared_dir(self, prefix, delete_holding=None): + """Get a related unit's shared directory. 
+ """ + if delete_holding is None: + delete_holding = self.delete_holding + + return SharedRoot( + scratch=self.scratch, + external=self.external, + prefix=prefix, + holding=self.holding, + delete_holding=delete_holding, + read_only=True, + ) + + def transfer_holding_to_external(self): + """Transfer all objects in the registry to external storage""" + if self.read_only: + logging.debug("Read-only: Not transfering to external storage") + return # early exit + + for obj in self.registry: + path = Path(obj) + if not path.exists(): + logging.info(f"Found nonexistent path {path}, not " + "transfering to external storage") + elif path.is_dir(): + logging.debug(f"Found directory {path}, not " + "transfering to external storage") + else: + logging.info(f"Transfering {path} to external storage") + self.external.store_path(obj.label, path) + + def __del__(self): + # take everything in self.shared_dir and write to it shared; keeping + # our prefix + self.transfer_holding_to_external() + if self.delete_holding: + shutil.rmtree(self.shared_dir) + + def register_path(self, shared_path): + label_exists = self.external.exists(shared_path.label) + + if self.read_only and not label_exists: + raise IOError(f"Unable to create '{shared_path.label}'. This " + "shared path is read-only.") + + self.registry.add(shared_path) + + # if this is a file that exists, bring it into our subdir + # NB: this happens even if you're intending to overwrite the path, + # which is kind of wasteful + if label_exists: + scratch_path = self.shared_dir / shared_path.path + # TODO: switch this to using `get_filename` and `store_path` + with self.external.load_stream(shared_path.label) as f: + external_bytes = f.read() + if scratch_path.exists(): + ... # TODO: something to check that the bytes are the same? 
+ scratch_path.parent.mkdir(exist_ok=True, parents=True) + with open(scratch_path, mode='wb') as f: + f.write(external_bytes) + + def __truediv__(self, path: PathLike): + return SharedPath(root=self, path=path) + + def __fspath__(self): + return str(self.shared_dir) + + def __repr__(self): + return f"SharedRoot({self.scratch}, {self.external}, {self.prefix})" + + +class SharedPath: + def __init__(self, root: SharedRoot, path: PathLike): + self.root = root + self.path = Path(path) + self.root.register_path(self) + + def __truediv__(self, path): + return SharedPath(self.root, self.path / path) + + def __fspath__(self): + return str(self.root.shared_dir / self.path) + + @property + def label(self): + return str(self.root.prefix / self.path) + + def __repr__(self): + return f"SharedPath({self.__fspath__()})" + + # TODO: how much of the pathlib.Path interface do we want to wrap? + # although edge cases may be a pain, we can get most of it with, e.g.: + # def exists(self): return Path(self).exists() + # but also, can do pathlib.Path(shared_path) and get hte whole thing diff --git a/gufe/tests/storage/test_pseudodirectory.py b/gufe/tests/storage/test_pseudodirectory.py new file mode 100644 index 00000000..4763cdd2 --- /dev/null +++ b/gufe/tests/storage/test_pseudodirectory.py @@ -0,0 +1,107 @@ +import pytest + +import os +import pathlib + +from gufe.storage.externalresource import MemoryStorage +from gufe.storage.pseudodirectory import SharedRoot + + +@pytest.fixture +def root(tmp_path): + external = MemoryStorage() + external.store_bytes("old_unit/data.txt", b"foo") + root = SharedRoot( + scratch=tmp_path, + external=external, + prefix="new_unit", + delete_holding=False + ) + return root + +@pytest.fixture +def root_with_contents(root): + with open(root / "data.txt", mode='wb') as f: + f.write(b"bar") + + return root + +class TestSharedRoot: + @pytest.mark.parametrize('pathlist', [ + ['file.txt'], ['dir', 'file.txt'] + ]) + def test_path(self, root, pathlist): + path = 
root + for p in pathlist: + path = path / p + + inner_path = os.sep.join(pathlist) + actual_path = root.shared_dir / inner_path + + assert pathlib.Path(path) == actual_path + + def test_read_old(self, root): + # When the file doesn't exist locally, it should be pulled down the + # first time that we register the path. + + # initial conditions, without touching SharedRoot/SharedPath + label = "old_unit/data.txt" + on_filesystem = root.scratch / root.holding / label + assert not on_filesystem.exists() + assert root.external.exists(label) + + # when we create the specific SharedPath, it registers and + # "downloads" the file + old_shared = root.get_other_shared_dir("old_unit") + filepath = old_shared / "data.txt" + assert pathlib.Path(filepath) == on_filesystem + assert on_filesystem.exists() + + # let's just be sure we can read in the data as desired + with open(filepath, mode='rb') as f: + assert f.read() == b"foo" + + def test_write_new(self, root): + label = "new_unit/somefile.txt" + on_filesystem = root.scratch / root.holding / label + assert not on_filesystem.exists() + with open(root / "somefile.txt", mode='wb') as f: + f.write(b"testing") + + # this has been written to disk in scratch, but not yet saved to + # external storage + assert on_filesystem.exists() + assert not root.external.exists(label) + + def test_write_old_fail(self, root): + old_shared = root.get_other_shared_dir("old_unit") + with pytest.raises(IOError, match="read-only"): + old_shared / "foo.txt" + + def test_transfer_to_external(self, root_with_contents): + path = list(root_with_contents.registry)[0] # only 1 + assert not root_with_contents.external.exists(path.label) + + root_with_contents.transfer_holding_to_external() + assert root_with_contents.external.exists(path.label) + + with root_with_contents.external.load_stream(path.label) as f: + assert f.read() == b"bar" + + def test_transfer_to_external_no_file(self, root): + ... + + def test_tranfer_to_external_directory(self, root): + ... 
+ + def test_del(self): + ... + + def test_existing_local_and_external(self, root): + ... + + def test_existing_local_and_external_conflict(self, root): + ... + + def test_no_transfer_for_read_only(self, root): + ... From eb19e0f9ea880648cd285494b5ac379695264f1a Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 26 Apr 2023 14:07:44 -0500 Subject: [PATCH 02/69] mypy --- gufe/storage/pseudodirectory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/pseudodirectory.py index b595dafa..eb88bfcf 100644 --- a/gufe/storage/pseudodirectory.py +++ b/gufe/storage/pseudodirectory.py @@ -35,7 +35,7 @@ def __init__( external: ExternalStorage, prefix: str, *, - holding: PathLike =".holding", + holding: PathLike = Path(".holding"), delete_holding: bool = True, read_only: bool = False, ): @@ -46,7 +46,7 @@ def __init__( self.delete_holding = delete_holding self.holding = holding - self.registry = set() + self.registry : set[SharedPath] = set() # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: # 1. This doesn't take any of the user's namespace from their From 3be13c0c79bb7f63b3828af5ee8e0ab84ea38139 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Thu, 27 Apr 2023 09:24:15 -0500 Subject: [PATCH 03/69] docstrings --- gufe/storage/pseudodirectory.py | 49 +++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/pseudodirectory.py index eb88bfcf..141a06bd 100644 --- a/gufe/storage/pseudodirectory.py +++ b/gufe/storage/pseudodirectory.py @@ -7,7 +7,29 @@ _logger = logging.getLogger(__name__) class SharedRoot: - """ + """PathLike local representation of an :class:`.ExternalStorage`. + + This connects objects on a local filesystem to the key-value store of a + (possibly remote) :class:`.ExternalStorage`. 
It presents a FileLike + interface to users, but internally (via the :class:`.SharedPath` objects + it contains in its registry) maps local filenames to the keys (labels) + for the key-value store. + + 1. If a local path is requested that corresponds to an existing label in + the :class:`.ExternalStorage`, this object will "download" the + contents of that key to that local path. + + 2. When requested, or when this object ??? (TODO: __exit__ or __del__), + it transfers any newly created files to the + :class:`.ExternalStorage`. + + 3. Optionally, this can delete the local cache of files when requested + or when this object ??? (TODO: __exit__ or __del__) + + This can be opened in "read-only" mode, which prevents new files from + being created, but does not prevent changes to existing versions of + local files. + Parameters ---------- scratch : os.PathLike @@ -25,7 +47,7 @@ class SharedRoot: directory when this object is deleted read_only : bool write to prevent NEW files from being written within this shared - directory. NOTE: This will not prevent overwrite of existing files, + directory. NOTE: This will not prevent overwrite of existing files in scratch space, but it will prevent changed files from uploading to the external storage. """ @@ -99,6 +121,23 @@ def __del__(self): shutil.rmtree(self.shared_dir) def register_path(self, shared_path): + """Register a :class:`.SharedPath` with this :class:`.SharedRoot`. + + This marks a given path as something for this object to manage, by + loading it into the ``registry``. This way it is tracked such that + its contents can be transfered to the :class:`.ExternalStorage` and + such that the local copy can be deleted when it is no longer needed. + + If this objects's :class:`.ExternalStorage` already has data for the + label associated with the provided :class:`.Sharedpath`, then the + contents of that should copied to the local path so that it can be + read by the user. 
+ + Parameters + ---------- + shared_path: :class:`.SharedPath` + the path to track + """ label_exists = self.external.exists(shared_path.label) if self.read_only and not label_exists: @@ -132,6 +171,11 @@ def __repr__(self): class SharedPath: + """PathLike object linking local path with label for external storage. + + On creation, this registers with a :class:`.SharedRoot` that will manage + the local path and transferring data with its :class:`.ExternalStorage`. + """ def __init__(self, root: SharedRoot, path: PathLike): self.root = root self.path = Path(path) @@ -145,6 +189,7 @@ def __fspath__(self): @property def label(self): + """Label used in :class:`.ExternalStorage` for this path""" return str(self.root.prefix / self.path) def __repr__(self): From 472151eddb3bab87e82b5a26401ae5927a1efd1b Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 30 May 2023 15:13:23 -0500 Subject: [PATCH 04/69] Add StorageManager code --- gufe/storage/pseudodirectory.py | 70 +++++++++++----- gufe/storage/storagemanager.py | 136 ++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+), 20 deletions(-) create mode 100644 gufe/storage/storagemanager.py diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/pseudodirectory.py index 141a06bd..550f3b45 100644 --- a/gufe/storage/pseudodirectory.py +++ b/gufe/storage/pseudodirectory.py @@ -6,6 +6,28 @@ import logging _logger = logging.getLogger(__name__) + +def _delete_empty_dirs(root, delete_root=True): + """Delete all empty directories. + + Repeats so that directories that only contained empty directories also + get deleted. 
+ """ + root = Path(root) + + def find_empty_dirs(directory): + if not (paths := directory.iterdir()): + return [directory] + directories = [p for p in paths if p.is_dir()] + return sum([find_empty_dirs(d) for d in directories], []) + + while empties := find_empty_dirs(root): + if empties == [root] and not delete_root: + return + for directory in empties: + os.rmdir(directory) + + class SharedRoot: """PathLike local representation of an :class:`.ExternalStorage`. @@ -19,12 +41,10 @@ class SharedRoot: the :class:`.ExternalStorage`, this object will "download" the contents of that key to that local path. - 2. When requested, or when this object ??? (TODO: __exit__ or __del__), - it transfers any newly created files to the + 2. When requested, it transfers any newly created files to the :class:`.ExternalStorage`. - 3. Optionally, this can delete the local cache of files when requested - or when this object ??? (TODO: __exit__ or __del__) + 3. It can delete all of the files it manages This can be opened in "read-only" mode, which prevents new files from being created, but does not prevent changes to existing versions of @@ -95,6 +115,24 @@ def get_other_shared_dir(self, prefix, delete_holding=None): read_only=True, ) + def transfer_single_file_to_external(self, held_file): + """Transfer a given file from holding into external storage + """ + if self.read_only: + logging.debug("Read-only: Not transfering to external storage") + return # early exit + + path = Path(held_file) + if not path.exists(): + logging.info(f"Found nonexistent path {path}, not " + "transfering to external storage") + elif path.is_dir(): + logging.debug(f"Found directory {path}, not " + "transfering to external storage") + else: + logging.info(f"Transfering {path} to external storage") + self.external.store_path(held_file.label, path) + def transfer_holding_to_external(self): """Transfer all objects in the registry to external storage""" if self.read_only: @@ -102,23 +140,15 @@ def 
transfer_holding_to_external(self): return # early exit for obj in self.registry: - path = Path(obj) - if not path.exists(): - logging.info(f"Found nonexistent path {path}, not " - "transfering to external storage") - elif path.is_dir(): - logging.debug(f"Found directory {path}, not " - "transfering to external storage") - else: - logging.info(f"Transfering {path} to external storage") - self.external.store_path(obj.label, path) - - def __del__(self): - # take everything in self.shared_dir and write to it shared; keeping - # our prefix - self.transfer_holding_to_external() + self.transfer_single_file_to_external(obj) + + def cleanup(self): + """Perform end-of-lifecycle cleanup. + """ if self.delete_holding: - shutil.rmtree(self.shared_dir) + for file in self.registry: + os.delete(file) + _delete_empty_dirs(self.shared_dir) def register_path(self, shared_path): """Register a :class:`.SharedPath` with this :class:`.SharedRoot`. diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py new file mode 100644 index 00000000..ac3c6831 --- /dev/null +++ b/gufe/storage/storagemanager.py @@ -0,0 +1,136 @@ +import os +from pathlib import Path +from contextlib import contextmanager + +def _storage_path_conflict(external, path): + # this is a little brittle; would be nice to directly check the + # filenames without needing to load the files? maybe we have a way + # to do that? + if isinstance(pseudodir.external, FileStorage): + root = Path(external.root_dir) + + ... + + +class _DAGStorageManager: + """Context manager to handle details of storage lifecycle. + + Making this a separate class ensures that ``running_unit`` is always + called within the context of a given DAG. This is usually not created + directly; instead, it is created (and used) with its ``running_dag`` + classmethod, typically from within a ``StorageManager``. 
+ """ + def __init__(self, storage_manager, dag_label): + self.manager = storage_manager + self.dag_label = dag_label + self.permanents = [] + + @classmethod # NB: classmethod must be on top + @contextmanager + def running_dag(cls, storage_manager, dag_label): + """DAG level of the storage lifecycle + + When the DAG is completed, transfer everything to the permanent + storage, and delete the holding area for permanent (if we are + supposed to). + + This is not usually called by users; instead it is called from + within the ``StorageManager``. + """ + dag_manager = cls(storage_manager, dag_label) + try: + yield dag_manager + finally: + for permanent in dag_manager.permanents: + permanent.transfer_holding_to_external() + + if not dag_manager.manager.keep_holding: + for d in dag_manager.permanents + dag_manager.shareds: + d.cleanup() + + @contextmanager + def running_unit(self, unit): + """Unit level of the storage lifecycle. + + This provides the holding directories used for scratch, shared, and + permanent. At the end of the unit, it transfers anything from shared + to the real shared external storage, cleans up the scratch + directory and the shared holding directory. + """ + unit_label = unit.key + scratch = self.manager.get_scratch(self.dag_label, unit_label) + shared = self.manager.get_shared(self.dag_label, unit_label) + permanent = self.manager.get_permanent(self.dag_label, unit_label) + try: + yield scratch, shared, permanent + finally: + self.permanents.append(permanent) + shared.transfer_holding_to_external() + + # TODO: check whether shared external is the same as scratch, + # put this in can_delete + can_delete = True + if not self.manager.keep_scratch and can_delete: + shutil.rmtree(scratch) + + if not self.manager.keep_holding: + shared.cleanup() + + +class StorageManager: + """Tool to manage the storage lifecycle during a DAG. + + This object primarily contains the logic for getting the holding + directories. 
A separate class, in the ``_DAGContextClass`` variable, + handles the logic for the context managers. + """ + _DAGContextClass = _DAGStorageManager + def __init__( + self, + scratch_root: os.Pathlike, + shared_root: ExternalStorage, + permanent_root: ExternalStorage, + *, + keep_scratch: bool = False, + keep_holding: bool = False, + holding: os.PathLike = Path(".holding") + ): + ... + + def get_scratch(self, dag_label: str , unit_label: str) -> Path: + """Get the path for this unit's scratch directory""" + scratch = self.scratch_root / dag_label / "scratch" / unit_label + scratch.mkdir(parents=True, exist_ok=True) + return scratch + + def get_permanent(self, dag_label, unit_label): + """Get the object for this unit's permanent holding directory""" + return SharedRoot( + scratch=self.scratch_root / dag_label, + external=self.permanent_root, + prefix=unit_label, + ) + + def get_shared(self, dag_label, unit_label): + """Get the object for this unit's shared holding directory""" + return SharedRoot( + scratch=self.scratch_root / dag_label, + external=self.shared_root, + prefix=unit_label + ) + + def running_dag(self, dag_label: str): + """Return a context manager that handles storage. + + For simple use cases, this is the only method a user needs to call. + Usage is something like: + + .. code:: + + with manager.running_dag(dag_label) as dag_ctx: + for unit in dag_ordered_units: + with dag_ctx.running_unit(unit) as dirs: + scratch, shared, permanent = dirs + # run the unit + """ + return self._DAGContextClass.running_dag(self, dag_label) From b692c2f7669da2ac22d8075038b4e57a9c7b3962 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Tue, 30 May 2023 16:32:07 -0500 Subject: [PATCH 05/69] rename to Stagine --- ...pseudodirectory.py => stagingdirectory.py} | 79 ++++++++++--------- ...odirectory.py => test_stagingdirectory.py} | 0 2 files changed, 43 insertions(+), 36 deletions(-) rename gufe/storage/{pseudodirectory.py => stagingdirectory.py} (76%) rename gufe/tests/storage/{test_pseudodirectory.py => test_stagingdirectory.py} (100%) diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/stagingdirectory.py similarity index 76% rename from gufe/storage/pseudodirectory.py rename to gufe/storage/stagingdirectory.py index 550f3b45..08e6285e 100644 --- a/gufe/storage/pseudodirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -1,6 +1,6 @@ from typing import Union, Optional from pathlib import Path -from os import PathLike +from os import PathLike, rmdir from .externalresource import ExternalStorage import logging @@ -16,24 +16,26 @@ def _delete_empty_dirs(root, delete_root=True): root = Path(root) def find_empty_dirs(directory): - if not (paths := directory.iterdir()): + if not (paths := list(directory.iterdir())): return [directory] directories = [p for p in paths if p.is_dir()] return sum([find_empty_dirs(d) for d in directories], []) - while empties := find_empty_dirs(root): + + while root.exists() and (empties := find_empty_dirs(root)): if empties == [root] and not delete_root: return for directory in empties: - os.rmdir(directory) + _logger.debug(f"Removing '{directory}'") + rmdir(directory) -class SharedRoot: +class StagingDirectory: """PathLike local representation of an :class:`.ExternalStorage`. This connects objects on a local filesystem to the key-value store of a (possibly remote) :class:`.ExternalStorage`. 
It presents a FileLike - interface to users, but internally (via the :class:`.SharedPath` objects + interface to users, but internally (via the :class:`.StagingPath` objects it contains in its registry) maps local filenames to the keys (labels) for the key-value store. @@ -59,14 +61,14 @@ class SharedRoot: prefix : str label for this specific unit holding : os.PathLike - name of the subdirectory of scratch where shared results are + name of the subdirectory of scratch where staged results are temporarily stored; default is '.holding'. This must be the same for all units within a DAG. delete_holding : bool whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX directory when this object is deleted read_only : bool - write to prevent NEW files from being written within this shared + write to prevent NEW files from being written within this staging directory. NOTE: This will not prevent overwrite of existing files in scratch space, but it will prevent changed files from uploading to the external storage. @@ -88,7 +90,7 @@ def __init__( self.delete_holding = delete_holding self.holding = holding - self.registry : set[SharedPath] = set() + self.registry : set[StagingPath] = set() # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: # 1. This doesn't take any of the user's namespace from their @@ -97,16 +99,16 @@ def __init__( # external storage is exactly the same as this local storage, # meaning that copies to/from the external storage are no-ops. # Use FileStorage(scratch / holding) for that. - self.shared_dir = self.scratch / holding / prefix - self.shared_dir.mkdir(exist_ok=True, parents=True) + self.staging_dir = self.scratch / holding / prefix + self.staging_dir.mkdir(exist_ok=True, parents=True) - def get_other_shared_dir(self, prefix, delete_holding=None): - """Get a related unit's shared directory. 
+ def get_other_staging_dir(self, prefix, delete_holding=None): + """Get a related unit's staging directory. """ if delete_holding is None: delete_holding = self.delete_holding - return SharedRoot( + return StagingDirectory( scratch=self.scratch, external=self.external, prefix=prefix, @@ -148,10 +150,11 @@ def cleanup(self): if self.delete_holding: for file in self.registry: os.delete(file) - _delete_empty_dirs(self.shared_dir) + _delete_empty_dirs(self.staging_dir) - def register_path(self, shared_path): - """Register a :class:`.SharedPath` with this :class:`.SharedRoot`. + def register_path(self, staging_path): + """ + Register a :class:`.StagingPath` with this :class:`.StagingDirectory`. This marks a given path as something for this object to manage, by loading it into the ``registry``. This way it is tracked such that @@ -159,30 +162,30 @@ def register_path(self, shared_path): such that the local copy can be deleted when it is no longer needed. If this objects's :class:`.ExternalStorage` already has data for the - label associated with the provided :class:`.Sharedpath`, then the + label associated with the provided :class:`.Stagingpath`, then the contents of that should copied to the local path so that it can be read by the user. Parameters ---------- - shared_path: :class:`.SharedPath` + staging_path: :class:`.StagingPath` the path to track """ - label_exists = self.external.exists(shared_path.label) + label_exists = self.external.exists(staging_path.label) if self.read_only and not label_exists: - raise IOError(f"Unable to create '{shared_path.label}'. This " - "shared path is read-only.") + raise IOError(f"Unable to create '{staging_path.label}'. 
This " + "staging path is read-only.") - self.registry.add(shared_path) + self.registry.add(staging_path) # if this is a file that exists, bring it into our subdir # NB: this happens even if you're intending to overwrite the path, # which is kind of wasteful if label_exists: - scratch_path = self.shared_dir / shared_path.path + scratch_path = self.staging_dir / staging_path.path # TODO: switch this to using `get_filename` and `store_path` - with self.external.load_stream(shared_path.label) as f: + with self.external.load_stream(staging_path.label) as f: external_bytes = f.read() if scratch_path.exists(): ... # TODO: something to check that the bytes are the same? @@ -191,31 +194,35 @@ def register_path(self, shared_path): f.write(external_bytes) def __truediv__(self, path: PathLike): - return SharedPath(root=self, path=path) + return StagingPath(root=self, path=path) def __fspath__(self): - return str(self.shared_dir) + return str(self.staging_dir) def __repr__(self): - return f"SharedRoot({self.scratch}, {self.external}, {self.prefix})" + return ( + f"StagingDirectory({self.scratch}, {self.external}, " + f"{self.prefix})" + ) -class SharedPath: +class StagingPath: """PathLike object linking local path with label for external storage. - On creation, this registers with a :class:`.SharedRoot` that will manage - the local path and transferring data with its :class:`.ExternalStorage`. + On creation, this registers with a :class:`.StagingDirectory` that will + manage the local path and transferring data with its + :class:`.ExternalStorage`. 
""" - def __init__(self, root: SharedRoot, path: PathLike): + def __init__(self, root: StagingDirectory, path: PathLike): self.root = root self.path = Path(path) self.root.register_path(self) def __truediv__(self, path): - return SharedPath(self.root, self.path / path) + return StagingPath(self.root, self.path / path) def __fspath__(self): - return str(self.root.shared_dir / self.path) + return str(self.root.staging_dir / self.path) @property def label(self): @@ -223,9 +230,9 @@ def label(self): return str(self.root.prefix / self.path) def __repr__(self): - return f"SharedPath({self.__fspath__()})" + return f"StagingPath({self.__fspath__()})" # TODO: how much of the pathlib.Path interface do we want to wrap? # although edge cases may be a pain, we can get most of it with, e.g.: # def exists(self): return Path(self).exists() - # but also, can do pathlib.Path(shared_path) and get hte whole thing + # but also, can do pathlib.Path(staging_path) and get hte whole thing diff --git a/gufe/tests/storage/test_pseudodirectory.py b/gufe/tests/storage/test_stagingdirectory.py similarity index 100% rename from gufe/tests/storage/test_pseudodirectory.py rename to gufe/tests/storage/test_stagingdirectory.py From 7432ff4dc8a14dabf19990abfdb45cc51c82891c Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Tue, 30 May 2023 16:32:18 -0500 Subject: [PATCH 06/69] Add tests for _delete_empty_dirs --- gufe/storage/storagemanager.py | 4 ++ gufe/tests/storage/test_stagingdirectory.py | 73 +++++++++++++++++---- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index ac3c6831..c3ab34f9 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -67,6 +67,10 @@ def running_unit(self, unit): self.permanents.append(permanent) shared.transfer_holding_to_external() + # everything in permanent must also be available in shared + for file in permanent.registry: + shared.transfer_single_file_to_external(file) + # TODO: check whether shared external is the same as scratch, # put this in can_delete can_delete = True diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 4763cdd2..46c48819 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -4,14 +4,15 @@ import pathlib from gufe.storage.externalresource import MemoryStorage -from gufe.storage.pseudodirectory import SharedRoot - +from gufe.storage.stagingdirectory import ( + StagingDirectory, _delete_empty_dirs +) @pytest.fixture def root(tmp_path): external = MemoryStorage() external.store_bytes("old_unit/data.txt", b"foo") - root = SharedRoot( + root = StagingDirectory( scratch=tmp_path, external=external, prefix="new_unit", @@ -26,7 +27,54 @@ def root_with_contents(root): return root -class TestSharedRoot: +def test_delete_empty_dirs(tmp_path): + base = tmp_path / "tmp" + paths = [ + base / "foo" / "qux" / "qux.txt", + + ] + dirs = [ + base / "foo" / "bar" / "baz", + base / "quux", + ] + for directory in dirs: + directory.mkdir(parents=True, exist_ok=True) + + for path in paths: + path.parent.mkdir(parents=True, exist_ok=True) + path.touch() + + _delete_empty_dirs(base) + for path in paths: + assert 
path.exists() + + for directory in dirs: + assert not directory.exists() + + assert not (base / "foo" / "bar").exists() + +@pytest.mark.parametrize('delete_root', [True, False]) +def test_delete_empty_dirs_delete_root(tmp_path, delete_root): + base = tmp_path / "tmp" + dirs = [ + base / "foo" / "bar" / "baz", + base / "quux", + ] + for directory in dirs: + directory.mkdir(parents=True, exist_ok=True) + + _delete_empty_dirs(base, delete_root=delete_root) + + for directory in dirs: + assert not directory.exists() + + assert not (base / "foo" / "bar").exists() + assert base.exists() is not delete_root + + + + +class TestStagingDirectory: @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] ]) @@ -36,7 +84,7 @@ def test_path(self, root, pathlist): path = path / p inner_path = os.sep.join(pathlist) - actual_path = root.shared_dir / inner_path + actual_path = root.staging_dir / inner_path assert pathlib.Path(path) == actual_path @@ -44,16 +92,16 @@ def test_read_old(self, root): # When the file doesn't exist locally, it should be pulled down the # first time that we register the path. 
- # initial conditions, without touching SharedRoot/SharedPath + # initial conditions, without touching StagingDirectory/StagingPath label = "old_unit/data.txt" on_filesystem = root.scratch / root.holding / label assert not on_filesystem.exists() assert root.external.exists(label) - # when we create the specific SharedPath, it registers and + # when we create the specific StagingPath, it registers and # "downloads" the file - old_shared = root.get_other_shared_dir("old_unit") - filepath = old_shared / "data.txt" + old_staging = root.get_other_staging_dir("old_unit") + filepath = old_staging / "data.txt" assert pathlib.Path(filepath) == on_filesystem assert on_filesystem.exists() @@ -74,9 +122,9 @@ def test_write_new(self, root): assert not root.external.exists(label) def test_write_old_fail(self, root): - old_shared = root.get_other_shared_dir("old_unit") + old_staging = root.get_other_staging_dir("old_unit") with pytest.raises(IOError, match="read-only"): - old_shared / "foo.txt" + old_staging / "foo.txt" def test_transfer_to_external(self, root_with_contents): path = list(root_with_contents.registry)[0] # only 1 @@ -94,9 +142,6 @@ def test_transfer_to_external_no_file(self, root): def test_tranfer_to_external_directory(self, root): ... - def test_del(self): - ... - def test_existing_local_and_external(self, root): ... From 319d0d000ac1d67896e2ca7f0a0f646569a6d386 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 07:33:05 -0500 Subject: [PATCH 07/69] Storage docs --- docs/guide/storage.rst | 64 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 39413208..caac6399 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -1,5 +1,67 @@ The GUFE Storage System ======================= +Storage lifetimes +----------------- -Storage docs. +The storage system in GUFE is heavily tied to the GUFE protocol system. 
The +key concept here is that the different levels of the GUFE protocol system; +campaign, DAG, and unit; inherently imply different lifetimes for the data +that is created. Those different lifetimes define the stages of the GUFE +storage system. + +In an abstract sense, as used by protocol developers, these three levels +correspond to three lifetimes of data: + +* ``scratch``: This is temporary data that is only needed for the lifetime + of a :class:`.ProtocolUnit`. This data is not guaranteed to be available + beyond the single :class:`.ProtocolUnit` where it is created, but may be + reused within that :class:`.ProtocolUnit`. +* ``shared``: This is data that is shared between different units in a + :class:`.ProtocolDAG`. For example, a single equilibration stage might be + shared between multiple production runs. The output snapshot of the + equilibration would be suitable as something to put in ``shared`` + data. This data is guaranteed to be present from when it is created until + the end of the :class:`.ProtocolDAG`, but is not guaranteed to exist after + the :class:`.ProtocolDAG` terminates. +* ``permanent``: This is the data that will be needed beyond the scope of a + single rough estimate of the calculation. This could include anything that + an extension of the simulation would require, or things that require + network-scale analysis. Anything stored here will be usable after the + calculation has completed. + +The ``scratch`` area is always a local directory. However, ``shared`` and +``permanent`` can be external (remote) resources, using the +:class:`.ExternalResource` API. + +As a practical matter, the GUFE storage system can be handled with a +:class:`.StorageManager`. This automates some aspects of the transfer +between stages of the GUFE storage system, and simplifies the API for +protocol authors. In detail, this provides protocol authors with +``PathLike`` objects for ``scratch``, ``shared``, and ``permanent``. 
All +three of these objects actually point to special subdirectories of the +scratch space for a specific unit, but are managed by context managers at +the executor level, which handle the process of moving objects from local +directories to the actual ``shared`` and ``permanent`` locations, which can +be external resources. + + +External resource utilities +--------------------------- + +For flexible data storage, GUFE defines the :class:`.ExternalResource` API, +which allows data to be stored/loaded in a way that is agnostic to the +underlying data store, as long as the store can be represented as a +key-value store. Within GUFE, we provide several examples, including +:class:`.FileStorage` and :class:`.MemoryStorage` (primarily useful for +testing.) The specific ``shared`` and ``permanent`` resources, as provided +to the executor, can be instances of an :class:`.ExternalResource`. + +.. note:: + + The ``shared`` space must be a resource where an uploaded object is + instantaneously available, otherwise later protocol units may fail if the + shared result is unavailable. This means that files or approaches based + on ``scp`` or ``sftp`` are fine, but things like cloud storage, where the + existence of a new document may take time to propagate through the + network, are not recommended for ``shared``. From 5650609f9fa525e578ed0ab0fd5325cad2f25972 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Wed, 31 May 2023 07:53:52 -0500 Subject: [PATCH 08/69] outline of storage manager tests --- gufe/storage/storagemanager.py | 47 ++++++++++++-------- gufe/tests/storage/test_storagemanager.py | 54 +++++++++++++++++++++++ 2 files changed, 82 insertions(+), 19 deletions(-) create mode 100644 gufe/tests/storage/test_storagemanager.py diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index c3ab34f9..0aacda35 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -1,15 +1,17 @@ -import os +from os import PathLike from pathlib import Path from contextlib import contextmanager +from .externalstorage import ExternalStorage, FileStorage + def _storage_path_conflict(external, path): - # this is a little brittle; would be nice to directly check the - # filenames without needing to load the files? maybe we have a way - # to do that? - if isinstance(pseudodir.external, FileStorage): + # this is a little brittle; I don't like hard-coding the class here + if isinstance(external, FileStorage): root = Path(external.root_dir) + else: + return False - ... + return False class _DAGStorageManager: @@ -45,7 +47,7 @@ def running_dag(cls, storage_manager, dag_label): permanent.transfer_holding_to_external() if not dag_manager.manager.keep_holding: - for d in dag_manager.permanents + dag_manager.shareds: + for d in dag_manager.permanents: d.cleanup() @contextmanager @@ -64,19 +66,20 @@ def running_unit(self, unit): try: yield scratch, shared, permanent finally: + # TODO: should some of this be in an else clause instead? 
self.permanents.append(permanent) shared.transfer_holding_to_external() - # everything in permanent must also be available in shared for file in permanent.registry: shared.transfer_single_file_to_external(file) - - # TODO: check whether shared external is the same as scratch, - # put this in can_delete - can_delete = True - if not self.manager.keep_scratch and can_delete: + scratch_conflict = _storage_path_conflict(shared.external, + scratch) + if not self.manager.keep_scratch and not scratch_conflict: shutil.rmtree(scratch) + shared_conflict = _storage_path_conflict(shared.external, + shared) + if not self.manager.keep_holding: shared.cleanup() @@ -85,21 +88,27 @@ class StorageManager: """Tool to manage the storage lifecycle during a DAG. This object primarily contains the logic for getting the holding - directories. A separate class, in the ``_DAGContextClass`` variable, + directories. A separate class, in the ``DAGContextClass`` variable, handles the logic for the context managers. """ - _DAGContextClass = _DAGStorageManager def __init__( self, - scratch_root: os.Pathlike, + scratch_root: PathLike, shared_root: ExternalStorage, permanent_root: ExternalStorage, *, keep_scratch: bool = False, keep_holding: bool = False, - holding: os.PathLike = Path(".holding") + holding: PathLike = Path(".holding"), + DAGContextClass: type = _DAGStorageManager, ): - ... 
+ self.scratch_root = scratch_root + self.shared_root = shared_root + self.permanent_root = permanent_root + self.keep_scratch = keep_scratch + self.keep_holding = keep_holding + self.holding = holding + self.DAGContextClass = DAGContextClass def get_scratch(self, dag_label: str , unit_label: str) -> Path: """Get the path for this unit's scratch directory""" @@ -137,4 +146,4 @@ def running_dag(self, dag_label: str): scratch, shared, permanent = dirs # run the unit """ - return self._DAGContextClass.running_dag(self, dag_label) + return self.DAGContextClass.running_dag(self, dag_label) diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py new file mode 100644 index 00000000..bb9402b3 --- /dev/null +++ b/gufe/tests/storage/test_storagemanager.py @@ -0,0 +1,54 @@ +import pytest +from gufe.storage.storagemanager import ( + StorageManager, _storage_path_conflict +) +from gufe.storage.externalresource import MemoryStorage + +@pytest.fixture +def storage_manager_std(tmp_path): + return StorageManager( + scratch_root=tmp_path / "scratch", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage() + ) + +@pytest.fixture +def storage_manager_holding_overlaps_shared(tmp_path): + ... + +@pytest.fixture +def storage_manager_holding_overlaps_permanent(tmp_path): + ... 
+ +@pytest.mark.parametrize('manager', ['std']) +def test_lifecycle(request, manager, dag_units): + # heavy integration test to ensure that the whole process works + # this is the primary test of _DAGStorageManager + storage_manager = request.getfixture(f"storage_manager_{manager}") + with storage_manager.running_dag("dag_label") as dag_ctx: + for unit in dag_units: + with dag_ctx.running_unit(unit) as (scratch, shared, permanent): + results.append(unit.run(scratch, shareed, permanent)) + # TODO: asserts at this level + # all files exist; are where we expect them + # TODO: asserts at this level + # TODO: asserts at this level + +def test_lifecycle_keep_scratch_and_holding(): + ... + +def test_storage_path_conflict_ok(): + ... + +def test_storage_path_conflict_problem(): + ... + +class TestStorageManager: + def test_get_scratch(): + ... + + def test_get_permanent(): + ... + + def test_get_shared(): + ... From ddbbd19cbba45d94e14140898b222c1dd1fa432f Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Wed, 31 May 2023 12:21:02 -0500 Subject: [PATCH 09/69] minor improvements on staging directory --- gufe/storage/stagingdirectory.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 08e6285e..a1237999 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -1,7 +1,8 @@ from typing import Union, Optional from pathlib import Path -from os import PathLike, rmdir +from os import PathLike, rmdir, remove from .externalresource import ExternalStorage +from contextlib import contextmanager import logging _logger = logging.getLogger(__name__) @@ -54,13 +55,13 @@ class StagingDirectory: Parameters ---------- - scratch : os.PathLike + scratch : PathLike the scratch directory shared by all objects on this host external : :class:`.ExternalStorage` external storage resource where objects should eventualy go prefix : str label for this specific unit - holding : os.PathLike + holding : PathLike name of the subdirectory of scratch where staged results are temporarily stored; default is '.holding'. This must be the same for all units within a DAG. 
@@ -117,6 +118,13 @@ def get_other_staging_dir(self, prefix, delete_holding=None): read_only=True, ) + @contextmanager + def other_shared(self, prefix, delete_holding=None): + other = self.get_other_staging_dir(prefix, delete_holding) + yield other + other.cleanup() + + def transfer_single_file_to_external(self, held_file): """Transfer a given file from holding into external storage """ @@ -149,7 +157,7 @@ def cleanup(self): """ if self.delete_holding: for file in self.registry: - os.delete(file) + remove(file) _delete_empty_dirs(self.staging_dir) def register_path(self, staging_path): @@ -174,7 +182,8 @@ def register_path(self, staging_path): label_exists = self.external.exists(staging_path.label) if self.read_only and not label_exists: - raise IOError(f"Unable to create '{staging_path.label}'. This " + raise IOError(f"Unable to create '{staging_path.label}'. File " + "does not exist in external storage, and This " "staging path is read-only.") self.registry.add(staging_path) From c5ce48a50f3d746a1599a232ab4dad8943f161b7 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 12:44:13 -0500 Subject: [PATCH 10/69] first storage lifecycle test works --- gufe/storage/stagingdirectory.py | 4 +- gufe/storage/storagemanager.py | 20 ++- gufe/tests/storage/test_storagemanager.py | 148 +++++++++++++++++++--- 3 files changed, 148 insertions(+), 24 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index a1237999..d7411e6e 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -92,6 +92,7 @@ def __init__( self.holding = holding self.registry : set[StagingPath] = set() + self.preexisting : set[StagingPath] = set() # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: # 1. This doesn't take any of the user's namespace from their @@ -156,7 +157,7 @@ def cleanup(self): """Perform end-of-lifecycle cleanup. 
""" if self.delete_holding: - for file in self.registry: + for file in self.registry - self.preexisting: remove(file) _delete_empty_dirs(self.staging_dir) @@ -197,6 +198,7 @@ def register_path(self, staging_path): with self.external.load_stream(staging_path.label) as f: external_bytes = f.read() if scratch_path.exists(): + self.preexisting.add(staging_path) ... # TODO: something to check that the bytes are the same? scratch_path.parent.mkdir(exist_ok=True, parents=True) with open(scratch_path, mode='wb') as f: diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 0aacda35..f914d32d 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -1,17 +1,26 @@ from os import PathLike from pathlib import Path from contextlib import contextmanager +import shutil -from .externalstorage import ExternalStorage, FileStorage +from .externalresource import ExternalStorage, FileStorage +from .stagingdirectory import StagingDirectory def _storage_path_conflict(external, path): + """Check if deleting ``path`` could delete externally stored data + """ # this is a little brittle; I don't like hard-coding the class here if isinstance(external, FileStorage): root = Path(external.root_dir) else: return False - return False + try: + _ = root.relative_to(Path(path)) + except ValueError: + return False + else: + return True class _DAGStorageManager: @@ -79,8 +88,7 @@ def running_unit(self, unit): shared_conflict = _storage_path_conflict(shared.external, shared) - - if not self.manager.keep_holding: + if not self.manager.keep_holding and not shared_conflict: shared.cleanup() @@ -118,7 +126,7 @@ def get_scratch(self, dag_label: str , unit_label: str) -> Path: def get_permanent(self, dag_label, unit_label): """Get the object for this unit's permanent holding directory""" - return SharedRoot( + return StagingDirectory( scratch=self.scratch_root / dag_label, external=self.permanent_root, prefix=unit_label, @@ -126,7 +134,7 @@ def 
get_permanent(self, dag_label, unit_label): def get_shared(self, dag_label, unit_label): """Get the object for this unit's shared holding directory""" - return SharedRoot( + return StagingDirectory( scratch=self.scratch_root / dag_label, external=self.shared_root, prefix=unit_label diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index bb9402b3..95fdf255 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -2,7 +2,9 @@ from gufe.storage.storagemanager import ( StorageManager, _storage_path_conflict ) -from gufe.storage.externalresource import MemoryStorage +from gufe.storage.stagingdirectory import StagingDirectory +from gufe.storage.externalresource import MemoryStorage, FileStorage +from pathlib import Path @pytest.fixture def storage_manager_std(tmp_path): @@ -20,35 +22,147 @@ def storage_manager_holding_overlaps_shared(tmp_path): def storage_manager_holding_overlaps_permanent(tmp_path): ... 
+@pytest.fixture +def dag_units(): + class Unit1: + @property + def key(self): + return "unit1" + + def run(self, scratch, shared, permanent): + (scratch / "foo.txt").touch() + with open(shared / "bar.txt", mode='w') as f: + f.write("bar was written") + with open(permanent / "baz.txt", mode='w') as f: + f.write("baz was written") + + return "done 1" + + class Unit2: + @property + def key(self): + return "unit2" + + def run(self, scratch, shared, permanent): + (scratch / "foo2.txt").touch() + with shared.other_shared("unit1") as prev_shared: + with open(prev_shared / "bar.txt", mode='r') as f: + bar = f.read() + + # note that you can open a file from permanent as if it was + # from shared -- everything in permanent is in shared + with open(prev_shared / "baz.txt", mode='r') as f: + baz = f.read() + + return {"bar": bar, "baz": baz} + + return [Unit1(), Unit2()] + + @pytest.mark.parametrize('manager', ['std']) def test_lifecycle(request, manager, dag_units): # heavy integration test to ensure that the whole process works # this is the primary test of _DAGStorageManager - storage_manager = request.getfixture(f"storage_manager_{manager}") + storage_manager = request.getfixturevalue(f"storage_manager_{manager}") + permanent_root = storage_manager.permanent_root + shared_root = storage_manager.shared_root + results = [] + unit1_dir = Path(storage_manager.get_shared("dag_label", "unit1")) + scratch1 = Path(storage_manager.get_scratch("dag_label", "unit1")) + scratch2 = Path(storage_manager.get_scratch("dag_label", "unit2")) + barfile = unit1_dir / "bar.txt" + bazfile = unit1_dir / "baz.txt" + foofile = scratch1 / "foo.txt" + foo2file = scratch2 / "foo2.txt" + + all_files = {barfile, bazfile, foofile, foo2file} with storage_manager.running_dag("dag_label") as dag_ctx: for unit in dag_units: with dag_ctx.running_unit(unit) as (scratch, shared, permanent): - results.append(unit.run(scratch, shareed, permanent)) - # TODO: asserts at this level - # all files exist; are where 
we expect them - # TODO: asserts at this level - # TODO: asserts at this level + results.append(unit.run(scratch, shared, permanent)) + + # check that the expected files are found in staging + exists = { + "unit1": {barfile, bazfile, foofile}, + "unit2": {foo2file, bazfile} + }[unit.key] + + for file in exists: + assert file.exists() + + for file in all_files - exists: + assert not file.exists() + + # check that shared store is as expected + expected_in_shared = { + "unit1": set(), + "unit2": {"unit1/bar.txt", "unit1/baz.txt"} + }[unit.key] + assert set(shared_root.iter_contents()) == expected_in_shared + # check that permanent store is empty + assert list(permanent_root.iter_contents()) == [] + # AFTER THE RUNNING_UNIT CONTEXT + # Same for both units because unit2 doesn't add anything to + # shared/permanent + # Files staged for shared should be transferred to shared and + # removed from the staging directories; files staged for + # permanent should remain + for_permanent = {bazfile} + for file in for_permanent: + assert file.exists() + + for file in all_files - for_permanent: + assert not file.exists() + + # check that we have things in shared + expected_in_shared = {"unit1/bar.txt", "unit1/baz.txt"} + assert set(shared_root.iter_contents()) == expected_in_shared + # ensure that we haven't written to permanent yet + assert list(permanent_root.iter_contents()) == [] + # AFTER THE RUNNING_DAG CONTEXT + # all staged files should be deleted + for file in all_files: + assert not file.exists() + # shared still contains everything it had; but this isn't something we + # guarantee, so we don't actually test for it + # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", + # "unit1/baz.txt"} + assert list(permanent_root.iter_contents()) == ["unit1/baz.txt"] def test_lifecycle_keep_scratch_and_holding(): ... -def test_storage_path_conflict_ok(): - ... 
+def test_storage_path_conflict_ok(tmp_path): + # if the filestorage root is not in the given path, no conflict + external = FileStorage(tmp_path / "foo" / "bar") + path = tmp_path / "foo" / "baz" + assert _storage_path_conflict(external, path) is False + +def test_storage_path_conflict_not_filestorage(tmp_path): + # if the external resource isn't a FileStorage, no conflict + external = MemoryStorage() + path = tmp_path / "foo" / "baz" + assert _storage_path_conflict(external, path) is False + +def test_storage_path_conflict_problem(tmp_path): + # if the filestorage root is in the given path, we have a conflict + external = FileStorage(tmp_path / "foo" / "bar") + path = tmp_path / "foo" + assert _storage_path_conflict(external, path) is True -def test_storage_path_conflict_problem(): - ... class TestStorageManager: - def test_get_scratch(): - ... + def test_get_scratch(self, storage_manager_std): + scratch = storage_manager_std.get_scratch("dag_label", "unit_label") + assert str(scratch).endswith("dag_label/scratch/unit_label") + assert isinstance(scratch, Path) - def test_get_permanent(): - ... + def test_get_permanent(self, storage_manager_std): + perm = storage_manager_std.get_permanent("dag_label", "unit_label") + assert perm.__fspath__().endswith("dag_label/.holding/unit_label") + assert isinstance(perm, StagingDirectory) - def test_get_shared(): - ... + def test_get_shared(self, storage_manager_std): + shared = storage_manager_std.get_shared("dag_label", "unit_label") + assert shared.__fspath__().endswith("dag_label/.holding/unit_label") + assert isinstance(shared, StagingDirectory) From b805aac64c88eb9052233744e5adc560c05d0e76 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Wed, 31 May 2023 14:11:15 -0500 Subject: [PATCH 11/69] cleanup mypy --- gufe/storage/stagingdirectory.py | 131 ++++++++++++++------ gufe/storage/storagemanager.py | 28 ++++- gufe/tests/storage/test_stagingdirectory.py | 10 +- gufe/tests/storage/test_storagemanager.py | 13 +- 4 files changed, 128 insertions(+), 54 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index d7411e6e..e19b7882 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -82,12 +82,10 @@ def __init__( *, holding: PathLike = Path(".holding"), delete_holding: bool = True, - read_only: bool = False, ): self.external = external self.scratch = Path(scratch) self.prefix = Path(prefix) - self.read_only = read_only self.delete_holding = delete_holding self.holding = holding @@ -104,34 +102,9 @@ def __init__( self.staging_dir = self.scratch / holding / prefix self.staging_dir.mkdir(exist_ok=True, parents=True) - def get_other_staging_dir(self, prefix, delete_holding=None): - """Get a related unit's staging directory. 
- """ - if delete_holding is None: - delete_holding = self.delete_holding - - return StagingDirectory( - scratch=self.scratch, - external=self.external, - prefix=prefix, - holding=self.holding, - delete_holding=delete_holding, - read_only=True, - ) - - @contextmanager - def other_shared(self, prefix, delete_holding=None): - other = self.get_other_staging_dir(prefix, delete_holding) - yield other - other.cleanup() - - def transfer_single_file_to_external(self, held_file): """Transfer a given file from holding into external storage """ - if self.read_only: - logging.debug("Read-only: Not transfering to external storage") - return # early exit path = Path(held_file) if not path.exists(): @@ -146,10 +119,6 @@ def transfer_single_file_to_external(self, held_file): def transfer_holding_to_external(self): """Transfer all objects in the registry to external storage""" - if self.read_only: - logging.debug("Read-only: Not transfering to external storage") - return # early exit - for obj in self.registry: self.transfer_single_file_to_external(obj) @@ -182,20 +151,18 @@ def register_path(self, staging_path): """ label_exists = self.external.exists(staging_path.label) - if self.read_only and not label_exists: - raise IOError(f"Unable to create '{staging_path.label}'. 
File " - "does not exist in external storage, and This " - "staging path is read-only.") - self.registry.add(staging_path) # if this is a file that exists, bring it into our subdir # NB: this happens even if you're intending to overwrite the path, # which is kind of wasteful if label_exists: + self._load_file_from_external(self.external, staging_path) + + def _load_file_from_external(self, external, staging_path): scratch_path = self.staging_dir / staging_path.path # TODO: switch this to using `get_filename` and `store_path` - with self.external.load_stream(staging_path.label) as f: + with external.load_stream(staging_path.label) as f: external_bytes = f.read() if scratch_path.exists(): self.preexisting.add(staging_path) @@ -217,6 +184,96 @@ def __repr__(self): ) +class SharedStaging(StagingDirectory): + def __init__( + self, + scratch: PathLike, + external: ExternalStorage, + prefix: str, + *, + holding: PathLike = Path(".holding"), + delete_holding: bool = True, + read_only: bool = False, + ): + super().__init__(scratch, external, prefix, holding=holding, + delete_holding=delete_holding) + self.read_only = read_only + + def get_other_shared(self, prefix, delete_holding=None): + """Get a related unit's staging directory. + """ + if delete_holding is None: + delete_holding = self.delete_holding + + return SharedStaging( + scratch=self.scratch, + external=self.external, + prefix=prefix, + holding=self.holding, + delete_holding=delete_holding, + read_only=True, + ) + + @contextmanager + def other_shared(self, prefix, delete_holding=None): + """Context manager approach for getting a related unit's directory. + + This is usually the recommended way to get a previous unit's shared + data. 
+ """ + other = self.get_other_shared(prefix, delete_holding) + yield other + other.cleanup() + + def transfer_single_file_to_external(self, held_file): + if self.read_only: + logging.debug("Read-only: Not transfering to external storage") + return # early exit + + super().transfer_single_file_to_external(held_file) + + def transfer_holding_to_external(self): + if self.read_only: + logging.debug("Read-only: Not transfering to external storage") + return # early exit + + super().transfer_holding_to_external() + + def register_path(self, staging_path): + label_exists = self.external.exists(staging_path.label) + + if self.read_only and not label_exists: + raise IOError(f"Unable to create '{staging_path.label}'. File " + "does not exist in external storage, and This " + "staging path is read-only.") + + super().register_path(staging_path) + + +class PermanentStaging(StagingDirectory): + def __init__( + self, + scratch: PathLike, + external: ExternalStorage, + shared: ExternalStorage, + prefix: str, + *, + holding: PathLike = Path(".holding"), + delete_holding: bool = True, + ): + super().__init__(scratch, external, prefix, holding=holding, + delete_holding=delete_holding) + self.shared = shared + + def transfer_single_file_to_external(self, held_file): + # for this one, if we can't fin + path = Path(held_file) + if not path.exists(): + self._load_file_from_external(self.shared, held_file) + + super().transfer_single_file_to_external(held_file) + + class StagingPath: """PathLike object linking local path with label for external storage. 
diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index f914d32d..aa4fc4ef 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -3,8 +3,10 @@ from contextlib import contextmanager import shutil +from typing import Type + from .externalresource import ExternalStorage, FileStorage -from .stagingdirectory import StagingDirectory +from .stagingdirectory import SharedStaging, PermanentStaging def _storage_path_conflict(external, path): """Check if deleting ``path`` could delete externally stored data @@ -22,8 +24,20 @@ def _storage_path_conflict(external, path): else: return True +class _AbstractDAGContextManager: + @classmethod + @contextmanager + def running_dag(cls, storage_manager, dag_label): + raise NotImplementedError() + + @contextmanager + def running_unit(cls, unit): + raise NotImplementedError() + +DAGContextManager = Type[_DAGStorageManager] -class _DAGStorageManager: + +class _DAGStorageManager(_AbstractDAGContextManager): """Context manager to handle details of storage lifecycle. 
Making this a separate class ensures that ``running_unit`` is always @@ -108,9 +122,9 @@ def __init__( keep_scratch: bool = False, keep_holding: bool = False, holding: PathLike = Path(".holding"), - DAGContextClass: type = _DAGStorageManager, + DAGContextClass: DAGContextManager = _DAGStorageManager, ): - self.scratch_root = scratch_root + self.scratch_root = Path(scratch_root) self.shared_root = shared_root self.permanent_root = permanent_root self.keep_scratch = keep_scratch @@ -120,21 +134,23 @@ def __init__( def get_scratch(self, dag_label: str , unit_label: str) -> Path: """Get the path for this unit's scratch directory""" + scratch = self.scratch_root / dag_label / "scratch" / unit_label scratch.mkdir(parents=True, exist_ok=True) return scratch def get_permanent(self, dag_label, unit_label): """Get the object for this unit's permanent holding directory""" - return StagingDirectory( + return PermanentStaging( scratch=self.scratch_root / dag_label, external=self.permanent_root, + shared=self.shared_root, prefix=unit_label, ) def get_shared(self, dag_label, unit_label): """Get the object for this unit's shared holding directory""" - return StagingDirectory( + return SharedStaging( scratch=self.scratch_root / dag_label, external=self.shared_root, prefix=unit_label diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 46c48819..09f4fc2d 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -5,14 +5,14 @@ from gufe.storage.externalresource import MemoryStorage from gufe.storage.stagingdirectory import ( - StagingDirectory, _delete_empty_dirs + SharedStaging, PermanentStaging, _delete_empty_dirs ) @pytest.fixture def root(tmp_path): external = MemoryStorage() external.store_bytes("old_unit/data.txt", b"foo") - root = StagingDirectory( + root = SharedStaging( scratch=tmp_path, external=external, prefix="new_unit", @@ -74,7 +74,7 @@ def 
test_delete_empty_dirs_delete_root(tmp_path, delete_root): -class TestStagingDirectory: +class TestSharedStaging: @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] ]) @@ -100,7 +100,7 @@ def test_read_old(self, root): # when we create the specific StagingPath, it registers and # "downloads" the file - old_staging = root.get_other_staging_dir("old_unit") + old_staging = root.get_other_shared("old_unit") filepath = old_staging / "data.txt" assert pathlib.Path(filepath) == on_filesystem assert on_filesystem.exists() @@ -122,7 +122,7 @@ def test_write_new(self, root): assert not root.external.exists(label) def test_write_old_fail(self, root): - old_staging = root.get_other_staging_dir("old_unit") + old_staging = root.get_other_shared("old_unit") with pytest.raises(IOError, match="read-only"): old_staging / "foo.txt" diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 95fdf255..c4d60e56 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -25,9 +25,7 @@ def storage_manager_holding_overlaps_permanent(tmp_path): @pytest.fixture def dag_units(): class Unit1: - @property - def key(self): - return "unit1" + key = "unit1" def run(self, scratch, shared, permanent): (scratch / "foo.txt").touch() @@ -39,9 +37,7 @@ def run(self, scratch, shared, permanent): return "done 1" class Unit2: - @property - def key(self): - return "unit2" + key = "unit2" def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() @@ -128,6 +124,11 @@ def test_lifecycle(request, manager, dag_units): # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", # "unit1/baz.txt"} assert list(permanent_root.iter_contents()) == ["unit1/baz.txt"] + # check the results + assert results == [ + "done 1", + {"bar": "bar was written", "baz": "baz was written"} + ] def test_lifecycle_keep_scratch_and_holding(): ... 
From ed5e83c23e3ee03534f8cbe5af753cc4ac9baeae Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 14:18:29 -0500 Subject: [PATCH 12/69] change to unit taking in the label --- gufe/storage/storagemanager.py | 9 ++++----- gufe/tests/storage/test_storagemanager.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index aa4fc4ef..6a7a46a9 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -27,14 +27,14 @@ def _storage_path_conflict(external, path): class _AbstractDAGContextManager: @classmethod @contextmanager - def running_dag(cls, storage_manager, dag_label): + def running_dag(cls, storage_manager, dag_label: str): raise NotImplementedError() @contextmanager - def running_unit(cls, unit): + def running_unit(cls, unit_label: str): raise NotImplementedError() -DAGContextManager = Type[_DAGStorageManager] +DAGContextManager = Type[_AbstractDAGContextManager] class _DAGStorageManager(_AbstractDAGContextManager): @@ -74,7 +74,7 @@ def running_dag(cls, storage_manager, dag_label): d.cleanup() @contextmanager - def running_unit(self, unit): + def running_unit(self, unit_label: str): """Unit level of the storage lifecycle. This provides the holding directories used for scratch, shared, and @@ -82,7 +82,6 @@ def running_unit(self, unit): to the real shared external storage, cleans up the scratch directory and the shared holding directory. 
""" - unit_label = unit.key scratch = self.manager.get_scratch(self.dag_label, unit_label) shared = self.manager.get_shared(self.dag_label, unit_label) permanent = self.manager.get_permanent(self.dag_label, unit_label) diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index c4d60e56..c2497170 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -74,7 +74,7 @@ def test_lifecycle(request, manager, dag_units): all_files = {barfile, bazfile, foofile, foo2file} with storage_manager.running_dag("dag_label") as dag_ctx: for unit in dag_units: - with dag_ctx.running_unit(unit) as (scratch, shared, permanent): + with dag_ctx.running_unit(unit.key) as (scratch, shared, permanent): results.append(unit.run(scratch, shared, permanent)) # check that the expected files are found in staging From 61810398c60ec9038f2fb67494770ebfad79db3f Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Mon, 5 Jun 2023 17:59:30 -0500 Subject: [PATCH 13/69] lots of updates; switched to harness for tests --- docs/guide/storage.rst | 22 ++ gufe/storage/stagingdirectory.py | 10 +- gufe/storage/storagemanager.py | 29 ++- gufe/tests/storage/test_stagingdirectory.py | 5 +- gufe/tests/storage/test_storagemanager.py | 218 ++++++++++++-------- 5 files changed, 179 insertions(+), 105 deletions(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index caac6399..76ae59c7 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -65,3 +65,25 @@ to the executor, can be instances of an :class:`.ExternalResource`. on ``scp`` or ``sftp`` are fine, but things like cloud storage, where the existence of a new document may take time to propagate through the network, are not recommended for ``shared``. 
+ + +Details: Management of the Storage Lifetime +-------------------------------------------- + +The concepts of the storage lifetimes are important for protocol authors, +but details of implementation are left to the specific executor. In order to +facilitate ??? + +* :class:`.StorageManager`: This is the overall façade interface for + interacting with the rest of the storage lifecycle tools. +* ``DAGContextManager``: +* :class:`.StagingDirectory`: +* :class:`.StagingPath`: + +In practice, the executor uses the :class:`.StorageManager` to create a +:class:`.DAGContextManager` at the level of a DAG, and then uses the +:class:`.DAGContextManager` to create a context to run a unit. That context +creates a :class:`.SharedStaging` and a :class:`.PermanentStaging` +associated with the specific unit. Those staging directories, with the +scratch directory, are provided to the :class:`.ProtocolDAGUnit`, so that +these are the only objects protocol authors need to interact with. diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index e19b7882..b70a1858 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -60,7 +60,11 @@ class StagingDirectory: external : :class:`.ExternalStorage` external storage resource where objects should eventualy go prefix : str - label for this specific unit + label for this specific unit; this should be a slash-separated + description of where this unit fits in the hierarchy. For example, + it might be ``$DAG_LABEL/$UNIT_LABEL`` or + ``$DAG_LABEL/$UNIT_LABEL/$UNIT_REPEAT``. It must be a unique + identifier for this unit within the permanent storage. holding : PathLike name of the subdirectory of scratch where staged results are temporarily stored; default is '.holding'. This must be the same for @@ -99,7 +103,7 @@ def __init__( # external storage is exactly the same as this local storage, # meaning that copies to/from the external storage are no-ops.
# Use FileStorage(scratch / holding) for that. - self.staging_dir = self.scratch / holding / prefix + self.staging_dir = self.scratch / prefix / holding self.staging_dir.mkdir(exist_ok=True, parents=True) def transfer_single_file_to_external(self, held_file): @@ -266,7 +270,7 @@ def __init__( self.shared = shared def transfer_single_file_to_external(self, held_file): - # for this one, if we can't fin + # if we can't find it locally, we load it from shared storage path = Path(held_file) if not path.exists(): self._load_file_from_external(self.shared, held_file) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 6a7a46a9..d88b9e86 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -28,12 +28,21 @@ class _AbstractDAGContextManager: @classmethod @contextmanager def running_dag(cls, storage_manager, dag_label: str): + """Return a context manager for when a DAG is started. + + This context manager handles the DAG scale of the lifecycle. + """ raise NotImplementedError() @contextmanager - def running_unit(cls, unit_label: str): + def running_unit(self, unit_label: str): + """Return a context manager for when unit is started. + + This context manager handles the unit scale of the lifecycle. + """ raise NotImplementedError() + DAGContextManager = Type[_AbstractDAGContextManager] @@ -82,9 +91,9 @@ def running_unit(self, unit_label: str): to the real shared external storage, cleans up the scratch directory and the shared holding directory. 
""" - scratch = self.manager.get_scratch(self.dag_label, unit_label) - shared = self.manager.get_shared(self.dag_label, unit_label) - permanent = self.manager.get_permanent(self.dag_label, unit_label) + scratch = self.manager.get_scratch(unit_label) + shared = self.manager.get_shared(unit_label) + permanent = self.manager.get_permanent(unit_label) try: yield scratch, shared, permanent finally: @@ -131,26 +140,26 @@ def __init__( self.holding = holding self.DAGContextClass = DAGContextClass - def get_scratch(self, dag_label: str , unit_label: str) -> Path: + def get_scratch(self, unit_label: str) -> Path: """Get the path for this unit's scratch directory""" - scratch = self.scratch_root / dag_label / "scratch" / unit_label + scratch = self.scratch_root / unit_label / "scratch" scratch.mkdir(parents=True, exist_ok=True) return scratch - def get_permanent(self, dag_label, unit_label): + def get_permanent(self, unit_label): """Get the object for this unit's permanent holding directory""" return PermanentStaging( - scratch=self.scratch_root / dag_label, + scratch=self.scratch_root, external=self.permanent_root, shared=self.shared_root, prefix=unit_label, ) - def get_shared(self, dag_label, unit_label): + def get_shared(self, unit_label): """Get the object for this unit's shared holding directory""" return SharedStaging( - scratch=self.scratch_root / dag_label, + scratch=self.scratch_root, external=self.shared_root, prefix=unit_label ) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 09f4fc2d..e825d28b 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -94,7 +94,7 @@ def test_read_old(self, root): # initial conditions, without touching StagingDirectory/StagingPath label = "old_unit/data.txt" - on_filesystem = root.scratch / root.holding / label + on_filesystem = root.scratch / "old_unit" / root.holding / "data.txt" assert not on_filesystem.exists() 
assert root.external.exists(label) @@ -111,7 +111,8 @@ def test_read_old(self, root): def test_write_new(self, root): label = "new_unit/somefile.txt" - on_filesystem = root.scratch / root.holding / label + on_filesystem = (root.scratch / "new_unit" / root.holding + / "somefile.txt") assert not on_filesystem.exists() with open(root / "somefile.txt", mode='wb') as f: f.write(b"testing") diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index c2497170..9c36e391 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -9,19 +9,11 @@ @pytest.fixture def storage_manager_std(tmp_path): return StorageManager( - scratch_root=tmp_path / "scratch", + scratch_root=tmp_path / "working", shared_root=MemoryStorage(), permanent_root=MemoryStorage() ) -@pytest.fixture -def storage_manager_holding_overlaps_shared(tmp_path): - ... - -@pytest.fixture -def storage_manager_holding_overlaps_permanent(tmp_path): - ... 
- @pytest.fixture def dag_units(): class Unit1: @@ -41,7 +33,9 @@ class Unit2: def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() - with shared.other_shared("unit1") as prev_shared: + # TODO: this will change; the inputs should include a way to get + # the previous shared unit label + with shared.other_shared("dag/unit1") as prev_shared: with open(prev_shared / "bar.txt", mode='r') as f: bar = f.read() @@ -54,85 +48,129 @@ def run(self, scratch, shared, permanent): return [Unit1(), Unit2()] +class LifecycleHarness: + @pytest.fixture + def storage_manager(self, tmp_path): + raise NotImplementedError() + + @staticmethod + def get_files_dict(storage_manager): + root = storage_manager.scratch_root + holding = storage_manager.holding + return { + "foo": root / "dag/unit1/scratch/foo.txt", + "foo2": root / "dag/unit2/scratch/foo2.txt", + "bar": root / "dag/unit1" / holding / "bar.txt", + "baz": root / "dag/unit1" / holding / "baz.txt", + } + + def test_lifecycle(self, storage_manager, dag_units, tmp_path): + results = [] + dag_label = "dag" + with storage_manager.running_dag(dag_label) as dag_ctx: + for unit in dag_units: + label = f"{dag_ctx.dag_label}/{unit.key}" + with dag_ctx.running_unit(label) as (scratch, shared, perm): + results.append(unit.run(scratch, shared, perm)) + self.in_unit_asserts(storage_manager, label) + self.after_unit_asserts(storage_manager, label) + self.after_dag_asserts(storage_manager) + assert results == [ + "done 1", + {"bar": "bar was written", "baz": "baz was written"} + ] + + def _in_unit_existing_files(self, unit_label): + raise NotImplementedError() + + def _after_unit_existing_files(self, unit_label): + raise NotImplementedError() + + def _after_dag_existing_files(self): + raise NotImplementedError() + + @staticmethod + def assert_existing_files(files_dict, existing): + for file in existing: + assert files_dict[file].exists() + + for file in set(files_dict) - existing: + assert not files_dict[file].exists() + + 
def in_unit_asserts(self, storage_manager, unit_label): + # check that shared and permanent are correct + shared_root = storage_manager.shared_root + permanent_root = storage_manager.permanent_root + expected_in_shared = { + "dag/unit1": set(), + "dag/unit2": {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + }[unit_label] + assert set(shared_root.iter_contents()) == expected_in_shared + + assert list(permanent_root.iter_contents()) == [] + + # manager-specific check for files + files_dict = self.get_files_dict(storage_manager) + existing = self._in_unit_existing_files(unit_label) + self.assert_existing_files(files_dict, existing) + + def after_unit_asserts(self, storage_manager, unit_label): + shared_root = storage_manager.shared_root + permanent_root = storage_manager.permanent_root + # these are independent of unit label + expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + assert set(shared_root.iter_contents()) == expected_in_shared + assert list(permanent_root.iter_contents()) == [] + + files_dict = self.get_files_dict(storage_manager) + existing = self._after_unit_existing_files(unit_label) + self.assert_existing_files(files_dict, existing) + + def after_dag_asserts(self, storage_manager): + permanent_root = storage_manager.permanent_root + # shared still contains everything it had; but this isn't something + # we guarantee, so we don't actually test for it: + # shared_root = storage_manager.shared_root + # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", + # "unit1/baz.txt"} + assert list(permanent_root.iter_contents()) == ["dag/unit1/baz.txt"] + + files_dict = self.get_files_dict(storage_manager) + existing = self._after_dag_existing_files() + self.assert_existing_files(files_dict, existing) + + +class TestStandardStorageManager(LifecycleHarness): + @pytest.fixture + def storage_manager(self, storage_manager_std): + return storage_manager_std + + def _in_unit_existing_files(self, unit_label): + return { + "dag/unit1": {'bar', 'baz', 
'foo'}, + "dag/unit2": {'foo2', 'baz'} + }[unit_label] + + def _after_unit_existing_files(self, unit_label): + # Same for both units because unit2 doesn't add anything to + # shared/permanent; in this one, only files staged for permanent + # should remain + return {'baz'} + + def _after_dag_existing_files(self): + return set() -@pytest.mark.parametrize('manager', ['std']) -def test_lifecycle(request, manager, dag_units): - # heavy integration test to ensure that the whole process works - # this is the primary test of _DAGStorageManager - storage_manager = request.getfixturevalue(f"storage_manager_{manager}") - permanent_root = storage_manager.permanent_root - shared_root = storage_manager.shared_root - results = [] - unit1_dir = Path(storage_manager.get_shared("dag_label", "unit1")) - scratch1 = Path(storage_manager.get_scratch("dag_label", "unit1")) - scratch2 = Path(storage_manager.get_scratch("dag_label", "unit2")) - barfile = unit1_dir / "bar.txt" - bazfile = unit1_dir / "baz.txt" - foofile = scratch1 / "foo.txt" - foo2file = scratch2 / "foo2.txt" - - all_files = {barfile, bazfile, foofile, foo2file} - with storage_manager.running_dag("dag_label") as dag_ctx: - for unit in dag_units: - with dag_ctx.running_unit(unit.key) as (scratch, shared, permanent): - results.append(unit.run(scratch, shared, permanent)) - - # check that the expected files are found in staging - exists = { - "unit1": {barfile, bazfile, foofile}, - "unit2": {foo2file, bazfile} - }[unit.key] - - for file in exists: - assert file.exists() - - for file in all_files - exists: - assert not file.exists() - - # check that shared store is as expected - expected_in_shared = { - "unit1": set(), - "unit2": {"unit1/bar.txt", "unit1/baz.txt"} - }[unit.key] - assert set(shared_root.iter_contents()) == expected_in_shared - # check that permanent store is empty - assert list(permanent_root.iter_contents()) == [] - # AFTER THE RUNNING_UNIT CONTEXT - # Same for both units because unit2 doesn't add anything to 
- # shared/permanent - # Files staged for shared should be transferred to shared and - # removed from the staging directories; files staged for - # permanent should remain - for_permanent = {bazfile} - for file in for_permanent: - assert file.exists() - - for file in all_files - for_permanent: - assert not file.exists() - - # check that we have things in shared - expected_in_shared = {"unit1/bar.txt", "unit1/baz.txt"} - assert set(shared_root.iter_contents()) == expected_in_shared - # ensure that we haven't written to permanent yet - assert list(permanent_root.iter_contents()) == [] - # AFTER THE RUNNING_DAG CONTEXT - # all staged files should be deleted - for file in all_files: - assert not file.exists() - # shared still contains everything it had; but this isn't something we - # guarantee, so we don't actually test for it - # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", - # "unit1/baz.txt"} - assert list(permanent_root.iter_contents()) == ["unit1/baz.txt"] - # check the results - assert results == [ - "done 1", - {"bar": "bar was written", "baz": "baz was written"} - ] def test_lifecycle_keep_scratch_and_holding(): ... +def test_lifecycle_holding_overlaps_shared(tmp_path): + ... + +def test_lifecycle_holding_overlaps_permanent(tmp_path): + ... 
+ + def test_storage_path_conflict_ok(tmp_path): # if the filestorage root is not in the given path, no conflict external = FileStorage(tmp_path / "foo" / "bar") @@ -154,16 +192,16 @@ def test_storage_path_conflict_problem(tmp_path): class TestStorageManager: def test_get_scratch(self, storage_manager_std): - scratch = storage_manager_std.get_scratch("dag_label", "unit_label") - assert str(scratch).endswith("dag_label/scratch/unit_label") + scratch = storage_manager_std.get_scratch("dag_label/unit_label") + assert str(scratch).endswith("dag_label/unit_label/scratch") assert isinstance(scratch, Path) def test_get_permanent(self, storage_manager_std): - perm = storage_manager_std.get_permanent("dag_label", "unit_label") - assert perm.__fspath__().endswith("dag_label/.holding/unit_label") + perm = storage_manager_std.get_permanent("dag_label/unit_label") + assert perm.__fspath__().endswith("dag_label/unit_label/.holding") assert isinstance(perm, StagingDirectory) def test_get_shared(self, storage_manager_std): - shared = storage_manager_std.get_shared("dag_label", "unit_label") - assert shared.__fspath__().endswith("dag_label/.holding/unit_label") + shared = storage_manager_std.get_shared("dag_label/unit_label") + assert shared.__fspath__().endswith("dag_label/unit_label/.holding") assert isinstance(shared, StagingDirectory) From 1880f730219e2171b59baec261ef19e32f3055f9 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Tue, 6 Jun 2023 11:03:07 -0500 Subject: [PATCH 14/69] Big reorg for shared overlapping staging --- docs/guide/storage.rst | 12 +- gufe/storage/stagingdirectory.py | 53 +++++-- gufe/storage/storagemanager.py | 27 +++- gufe/tests/storage/test_stagingdirectory.py | 5 +- gufe/tests/storage/test_storagemanager.py | 161 +++++++++++++++----- 5 files changed, 198 insertions(+), 60 deletions(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 76ae59c7..671c0b4a 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -72,7 +72,9 @@ Details: Manangement of the Storage Lifetime The concepts of the storage lifetimes are important for protocol authors, but details of implementation are left to the specific executor. In order to -facilitate ??? +facilitate correct treatment of the storage lifecycle, GUFE provides a few +helpers. The information in this section is mostly of interest to authors of +executors. The helpers are: * :class:`.StorageManager`: This is the overall façade interface for interacting with the rest of the storage lifecycle tools. @@ -87,3 +89,11 @@ creates a :class:`.SharedStaging` and a :class:`.PermanentStaging` associated with the specific unit. Those staging directories, with the scratch directory, are provided to the :class:`.ProtocolDAGUnit`, so that these are the only objects protocol authors need to interact with. + +In using these, we assume that the label for data from a given unit is of +the form ``$DAG/$UNIT_INFO/$FILEPATH``. The details of ``$DAG`` and +``$UNIT_INFO`` are up to the executor; in particular, there may be a more +deeply nested hierarchy for the ``$UNIT_INFO`` (e.g., to account for retries +of a given unit). An executor that wants to use a data label that doesn't +match this format should not use :class:`.StorageManager` and the related +tools.
diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index b70a1858..94dc3c24 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -1,12 +1,36 @@ from typing import Union, Optional from pathlib import Path from os import PathLike, rmdir, remove -from .externalresource import ExternalStorage +from .externalresource import ExternalStorage, FileStorage from contextlib import contextmanager import logging _logger = logging.getLogger(__name__) +def _safe_to_delete_holding(external, path, prefix): + """Check if deleting ``path`` could delete externally stored data. + + If external storage is a FileStorage, then it will store files for + this unit or dag in the directory ``external.root_dir / prefix``, where + ``prefix`` is either the unit label or the dag label. If ``path`` is + inside that directory, then deleting it may delete information from the + external storage. In that case, this returns False, indicating a + conflict. Otherwise, this returns True. + """ + # this is a little brittle; I don't like hard-coding the class here + if isinstance(external, FileStorage): + root = Path(external.root_dir) / prefix + else: + return True + + p = Path(path) + try: + _ = p.relative_to(root) + except ValueError: + return True + else: + return False + def _delete_empty_dirs(root, delete_root=True): """Delete all empty directories. @@ -95,17 +119,16 @@ def __init__( self.registry : set[StagingPath] = set() self.preexisting : set[StagingPath] = set() - # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of - # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: - # 1. This doesn't take any of the user's namespace from their - # $SCRATCH/$PREFIX directory. - # 2. This allows us to easily use an external FileStorage where the - # external storage is exactly the same as this local storage, - # meaning that copies to/from the external storage are no-ops. - # Use FileStorage(scratch / holding) for that.
- self.staging_dir = self.scratch / prefix / holding + self.staging_dir = self.scratch / holding / prefix self.staging_dir.mkdir(exist_ok=True, parents=True) + def _delete_holding_safe(self): + return _safe_to_delete_holding( + external=self.external, + path=self.staging_dir, + prefix=self.prefix, + ) + def transfer_single_file_to_external(self, held_file): """Transfer a given file from holding into external storage """ @@ -129,7 +152,7 @@ def transfer_holding_to_external(self): def cleanup(self): """Perform end-of-lifecycle cleanup. """ - if self.delete_holding: + if self.delete_holding and self._delete_holding_safe(): for file in self.registry - self.preexisting: remove(file) _delete_empty_dirs(self.staging_dir) @@ -269,6 +292,14 @@ def __init__( delete_holding=delete_holding) self.shared = shared + def _delete_holding_safe(self): + shared_safe = _safe_to_delete_holding( + external=self.shared, + path=self.staging_dir, + prefix=self.prefix + ) + return shared_safe and super()._delete_holding_safe() + def transfer_single_file_to_external(self, held_file): # if we can't find it locally, we load it from shared storage path = Path(held_file) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index d88b9e86..6a1f0838 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -8,17 +8,25 @@ from .externalresource import ExternalStorage, FileStorage from .stagingdirectory import SharedStaging, PermanentStaging -def _storage_path_conflict(external, path): - """Check if deleting ``path`` could delete externally stored data +def _storage_path_conflict(external, path, label): + """Check if deleting ``path`` could delete externally stored data. + + If external storage is a FileStorage, then it will storage files for + this unit or dag in the directory ``external.root_dir / label``, where + ``label`` is either the unit label or the dag label. 
If ``path`` is + inside that directory, then deleting it may delete information from the + external storage. In that case, this returns True, indicating a + conflict. Otherwise, this returns False. """ # this is a little brittle; I don't like hard-coding the class here if isinstance(external, FileStorage): - root = Path(external.root_dir) + root = Path(external.root_dir) / label else: return False + p = Path(path) try: - _ = root.relative_to(Path(path)) + _ = p.relative_to(root) except ValueError: return False else: @@ -80,6 +88,7 @@ def running_dag(cls, storage_manager, dag_label): if not dag_manager.manager.keep_holding: for d in dag_manager.permanents: + # import pdb; pdb.set_trace() d.cleanup() @contextmanager @@ -104,12 +113,12 @@ def running_unit(self, unit_label: str): for file in permanent.registry: shared.transfer_single_file_to_external(file) scratch_conflict = _storage_path_conflict(shared.external, - scratch) + scratch, unit_label) if not self.manager.keep_scratch and not scratch_conflict: shutil.rmtree(scratch) shared_conflict = _storage_path_conflict(shared.external, - shared) + shared, unit_label) if not self.manager.keep_holding and not shared_conflict: shared.cleanup() @@ -143,7 +152,7 @@ def __init__( def get_scratch(self, unit_label: str) -> Path: """Get the path for this unit's scratch directory""" - scratch = self.scratch_root / unit_label / "scratch" + scratch = self.scratch_root / "scratch" / unit_label scratch.mkdir(parents=True, exist_ok=True) return scratch @@ -154,6 +163,7 @@ def get_permanent(self, unit_label): external=self.permanent_root, shared=self.shared_root, prefix=unit_label, + holding=self.holding, ) def get_shared(self, unit_label): @@ -161,7 +171,8 @@ def get_shared(self, unit_label): return SharedStaging( scratch=self.scratch_root, external=self.shared_root, - prefix=unit_label + prefix=unit_label, + holding=self.holding, ) def running_dag(self, dag_label: str): diff --git a/gufe/tests/storage/test_stagingdirectory.py 
b/gufe/tests/storage/test_stagingdirectory.py index e825d28b..42893358 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -94,7 +94,7 @@ def test_read_old(self, root): # initial conditions, without touching StagingDirectory/StagingPath label = "old_unit/data.txt" - on_filesystem = root.scratch / "old_unit" / root.holding / "data.txt" + on_filesystem = root.scratch / root.holding / "old_unit/data.txt" assert not on_filesystem.exists() assert root.external.exists(label) @@ -111,8 +111,7 @@ def test_read_old(self, root): def test_write_new(self, root): label = "new_unit/somefile.txt" - on_filesystem = (root.scratch / "new_unit" / root.holding - / "somefile.txt") + on_filesystem = root.scratch / root.holding / "new_unit/somefile.txt" assert not on_filesystem.exists() with open(root / "somefile.txt", mode='wb') as f: f.write(b"testing") diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 9c36e391..41092fc5 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -11,7 +11,7 @@ def storage_manager_std(tmp_path): return StorageManager( scratch_root=tmp_path / "working", shared_root=MemoryStorage(), - permanent_root=MemoryStorage() + permanent_root=MemoryStorage(), ) @pytest.fixture @@ -58,10 +58,10 @@ def get_files_dict(storage_manager): root = storage_manager.scratch_root holding = storage_manager.holding return { - "foo": root / "dag/unit1/scratch/foo.txt", - "foo2": root / "dag/unit2/scratch/foo2.txt", - "bar": root / "dag/unit1" / holding / "bar.txt", - "baz": root / "dag/unit1" / holding / "baz.txt", + "foo": root / "scratch/dag/unit1/foo.txt", + "foo2": root / "scratch/dag/unit2/foo2.txt", + "bar": root / holding / "dag/unit1/bar.txt", + "baz": root / holding / "dag/unit1/baz.txt", } def test_lifecycle(self, storage_manager, dag_units, tmp_path): @@ -97,6 +97,20 @@ def assert_existing_files(files_dict, 
existing): for file in set(files_dict) - existing: assert not files_dict[file].exists() + def _in_staging_shared(self, unit_label, in_after): + """ + This is to include things when a shared staging directory reports + that files exist in it. + """ + return set() + + def _in_staging_permanent(self, unit_label, in_after): + """ + This is to include things when a permanent staging directory reports + that files exist in it. + """ + return set() + def in_unit_asserts(self, storage_manager, unit_label): # check that shared and permanent are correct shared_root = storage_manager.shared_root @@ -104,10 +118,11 @@ def in_unit_asserts(self, storage_manager, unit_label): expected_in_shared = { "dag/unit1": set(), "dag/unit2": {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} - }[unit_label] + }[unit_label] | self._in_staging_shared(unit_label, "in") assert set(shared_root.iter_contents()) == expected_in_shared - assert list(permanent_root.iter_contents()) == [] + expected_in_permanent = self._in_staging_permanent(unit_label, "in") + assert set(permanent_root.iter_contents()) == expected_in_permanent # manager-specific check for files files_dict = self.get_files_dict(storage_manager) @@ -117,24 +132,33 @@ def in_unit_asserts(self, storage_manager, unit_label): def after_unit_asserts(self, storage_manager, unit_label): shared_root = storage_manager.shared_root permanent_root = storage_manager.permanent_root - # these are independent of unit label + shared_extras = self._in_staging_shared(unit_label, "after") + permanent_extras = self._in_staging_permanent(unit_label, "after") expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + expected_in_shared |= shared_extras assert set(shared_root.iter_contents()) == expected_in_shared - assert list(permanent_root.iter_contents()) == [] + assert set(permanent_root.iter_contents()) == permanent_extras + # manager-specific check for files files_dict = self.get_files_dict(storage_manager) existing = 
self._after_unit_existing_files(unit_label) self.assert_existing_files(files_dict, existing) def after_dag_asserts(self, storage_manager): permanent_root = storage_manager.permanent_root + permanent_extras = self._in_staging_permanent('dag/unit2', "after") # shared still contains everything it had; but this isn't something - # we guarantee, so we don't actually test for it: + # we guarantee, so we don't actually test for it, but we could with + # this: # shared_root = storage_manager.shared_root - # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", - # "unit1/baz.txt"} - assert list(permanent_root.iter_contents()) == ["dag/unit1/baz.txt"] + # shared_extras = self._in_staging_shared('dag/unit2', "after") + # expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + # expected_in_shared |= shared_extras + # assert set(shared_root.iter_contents()) == expected_in_shared + expected_in_permanent = {"dag/unit1/baz.txt"} | permanent_extras + assert set(permanent_root.iter_contents()) == expected_in_permanent + # manager-specific check for files files_dict = self.get_files_dict(storage_manager) existing = self._after_dag_existing_files() self.assert_existing_files(files_dict, existing) @@ -161,47 +185,110 @@ def _after_dag_existing_files(self): return set() -def test_lifecycle_keep_scratch_and_holding(): - ... 
+class TestKeepScratchAndHoldingStorageManager(LifecycleHarness): + @pytest.fixture + def storage_manager(self, tmp_path): + return StorageManager( + scratch_root=tmp_path / "working", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage(), + keep_scratch=True, + keep_holding=True + ) + + @staticmethod + def files_after_unit(unit_label): + unit1 = {'bar', 'baz', 'foo'} + unit2 = {'foo2', 'baz'} + return { + 'dag/unit1': unit1, + 'dag/unit2': unit1 | unit2 + }[unit_label] + + def _in_unit_existing_files(self, unit_label): + return self.files_after_unit(unit_label) + + def _after_unit_existing_files(self, unit_label): + return self.files_after_unit(unit_label) + + def _after_dag_existing_files(self): + return self.files_after_unit('dag/unit2') + + +class TestHoldingOverlapsSharedStorageManager(LifecycleHarness): + @pytest.fixture + def storage_manager(self, tmp_path): + root = tmp_path / "working" + return StorageManager( + scratch_root=root, + shared_root=FileStorage(root), + permanent_root=MemoryStorage(), + holding="", + ) + + def _in_staging_shared(self, unit_label, in_after): + bar = "dag/unit1/bar.txt" + baz = "dag/unit1/baz.txt" + foo = "scratch/dag/unit1/foo.txt" + foo2 = "scratch/dag/unit2/foo2.txt" + return { + ("dag/unit1", "in"): {bar, baz, foo}, + ("dag/unit1", "after"): {bar, baz}, + ("dag/unit2", "in"): {bar, baz, foo2}, + ("dag/unit2", "after"): {baz} + }[unit_label, in_after] -def test_lifecycle_holding_overlaps_shared(tmp_path): - ... + def _in_unit_existing_files(self, unit_label): + return { + "dag/unit1": {'foo', 'bar', 'baz'}, + "dag/unit2": {'foo2', 'bar', 'baz'}, + }[unit_label] + + def _after_unit_existing_files(self, unit_label): + # same for both; all files come from unit 1 + return {"bar", "baz"} + + def _after_dag_existing_files(self): + # NOTE: currently we don't delete bar at the end of a cycle, but we + # don't guarantee that we would not. So it exists, but changing that + # isn't API-breaking. 
+ return {"bar", "baz"} -def test_lifecycle_holding_overlaps_permanent(tmp_path): - ... +# class TestHoldingOverlapsPermanentStorageManager(LifecycleHarness): +# ... -def test_storage_path_conflict_ok(tmp_path): - # if the filestorage root is not in the given path, no conflict - external = FileStorage(tmp_path / "foo" / "bar") - path = tmp_path / "foo" / "baz" - assert _storage_path_conflict(external, path) is False +# def test_storage_path_conflict_ok(tmp_path): +# # if the filestorage root is not in the given path, no conflict +# external = FileStorage(tmp_path / "foo" / "bar") +# path = tmp_path / "foo" / "baz" +# assert _storage_path_conflict(external, path) is False -def test_storage_path_conflict_not_filestorage(tmp_path): - # if the external resource isn't a FileStorage, no conflict - external = MemoryStorage() - path = tmp_path / "foo" / "baz" - assert _storage_path_conflict(external, path) is False +# def test_storage_path_conflict_not_filestorage(tmp_path): +# # if the external resource isn't a FileStorage, no conflict +# external = MemoryStorage() +# path = tmp_path / "foo" / "baz" +# assert _storage_path_conflict(external, path) is False -def test_storage_path_conflict_problem(tmp_path): - # if the filestorage root is in the given path, we have a conflict - external = FileStorage(tmp_path / "foo" / "bar") - path = tmp_path / "foo" - assert _storage_path_conflict(external, path) is True +# def test_storage_path_conflict_problem(tmp_path): +# # if the filestorage root is in the given path, we have a conflict +# external = FileStorage(tmp_path / "foo" / "bar") +# path = tmp_path / "foo" +# assert _storage_path_conflict(external, path) is True class TestStorageManager: def test_get_scratch(self, storage_manager_std): scratch = storage_manager_std.get_scratch("dag_label/unit_label") - assert str(scratch).endswith("dag_label/unit_label/scratch") + assert str(scratch).endswith("scratch/dag_label/unit_label") assert isinstance(scratch, Path) def 
test_get_permanent(self, storage_manager_std): perm = storage_manager_std.get_permanent("dag_label/unit_label") - assert perm.__fspath__().endswith("dag_label/unit_label/.holding") + assert perm.__fspath__().endswith(".holding/dag_label/unit_label") assert isinstance(perm, StagingDirectory) def test_get_shared(self, storage_manager_std): shared = storage_manager_std.get_shared("dag_label/unit_label") - assert shared.__fspath__().endswith("dag_label/unit_label/.holding") + assert shared.__fspath__().endswith(".holding/dag_label/unit_label") assert isinstance(shared, StagingDirectory) From 1e4ca2c217ca79d7e5409bcfa4f6d5c64c29f05a Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 12:08:09 -0500 Subject: [PATCH 15/69] remove _storage_path_conflict Makes much more sense as _safe_to_delete_holding --- gufe/storage/stagingdirectory.py | 5 ++++ gufe/storage/storagemanager.py | 32 ++------------------- gufe/tests/storage/test_stagingdirectory.py | 23 +++++++++++++-- gufe/tests/storage/test_storagemanager.py | 21 +------------- 4 files changed, 30 insertions(+), 51 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 94dc3c24..9a06a3ca 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -278,6 +278,11 @@ def register_path(self, staging_path): class PermanentStaging(StagingDirectory): + """Staging directory for the permanent storage. + + This allows files to be downloaded from a shared + :class:`.ExternalStorage`. 
+ """ def __init__( self, scratch: PathLike, diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 6a1f0838..4d3cefd2 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -8,29 +8,6 @@ from .externalresource import ExternalStorage, FileStorage from .stagingdirectory import SharedStaging, PermanentStaging -def _storage_path_conflict(external, path, label): - """Check if deleting ``path`` could delete externally stored data. - - If external storage is a FileStorage, then it will storage files for - this unit or dag in the directory ``external.root_dir / label``, where - ``label`` is either the unit label or the dag label. If ``path`` is - inside that directory, then deleting it may delete information from the - external storage. In that case, this returns True, indicating a - conflict. Otherwise, this returns False. - """ - # this is a little brittle; I don't like hard-coding the class here - if isinstance(external, FileStorage): - root = Path(external.root_dir) / label - else: - return False - - p = Path(path) - try: - _ = p.relative_to(root) - except ValueError: - return False - else: - return True class _AbstractDAGContextManager: @classmethod @@ -112,14 +89,11 @@ def running_unit(self, unit_label: str): # everything in permanent must also be available in shared for file in permanent.registry: shared.transfer_single_file_to_external(file) - scratch_conflict = _storage_path_conflict(shared.external, - scratch, unit_label) - if not self.manager.keep_scratch and not scratch_conflict: + + if not self.manager.keep_scratch: shutil.rmtree(scratch) - shared_conflict = _storage_path_conflict(shared.external, - shared, unit_label) - if not self.manager.keep_holding and not shared_conflict: + if not self.manager.keep_holding: shared.cleanup() diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 42893358..2def33af 100644 --- 
a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -3,9 +3,10 @@ import os import pathlib -from gufe.storage.externalresource import MemoryStorage +from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.stagingdirectory import ( - SharedStaging, PermanentStaging, _delete_empty_dirs + SharedStaging, PermanentStaging, _delete_empty_dirs, + _safe_to_delete_holding ) @pytest.fixture @@ -27,6 +28,24 @@ def root_with_contents(root): return root +def test_safe_to_delete_holding_ok(tmp_path): + external = FileStorage(tmp_path / "foo") + prefix = "bar" + holding = tmp_path / "foo" / "baz" + assert _safe_to_delete_holding(external, holding, prefix) + +def test_safe_to_delete_holding_danger(tmp_path): + external = FileStorage(tmp_path / "foo") + prefix = "bar" + holding = tmp_path / "foo" / "bar" / "baz" + assert not _safe_to_delete_holding(external, holding, prefix) + +def test_safe_to_delete_holding_not_filestorage(tmp_path): + external = MemoryStorage() + prefix = "bar" + holding = tmp_path / "bar" + assert _safe_to_delete_holding(external, holding, prefix) + def test_delete_empty_dirs(tmp_path): base = tmp_path / "tmp" paths = [ diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 41092fc5..2d7fb292 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -1,6 +1,6 @@ import pytest from gufe.storage.storagemanager import ( - StorageManager, _storage_path_conflict + StorageManager ) from gufe.storage.stagingdirectory import StagingDirectory from gufe.storage.externalresource import MemoryStorage, FileStorage @@ -258,25 +258,6 @@ def _after_dag_existing_files(self): # ... 
-# def test_storage_path_conflict_ok(tmp_path): -# # if the filestorage root is not in the given path, no conflict -# external = FileStorage(tmp_path / "foo" / "bar") -# path = tmp_path / "foo" / "baz" -# assert _storage_path_conflict(external, path) is False - -# def test_storage_path_conflict_not_filestorage(tmp_path): -# # if the external resource isn't a FileStorage, no conflict -# external = MemoryStorage() -# path = tmp_path / "foo" / "baz" -# assert _storage_path_conflict(external, path) is False - -# def test_storage_path_conflict_problem(tmp_path): -# # if the filestorage root is in the given path, we have a conflict -# external = FileStorage(tmp_path / "foo" / "bar") -# path = tmp_path / "foo" -# assert _storage_path_conflict(external, path) is True - - class TestStorageManager: def test_get_scratch(self, storage_manager_std): scratch = storage_manager_std.get_scratch("dag_label/unit_label") From a6d26f3022ba7f2deb936d946702ce80c41d1251 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 12:45:47 -0500 Subject: [PATCH 16/69] docs & types --- docs/guide/storage.rst | 8 ----- gufe/storage/stagingdirectory.py | 51 ++++++++++++++++++++++---------- gufe/storage/storagemanager.py | 2 +- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 671c0b4a..0153bf89 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -89,11 +89,3 @@ creates a :class:`.SharedStaging` and a :class:`.PermanentStaging` associated with the specific unit. Those staging directories, with the scratch directory, are provided to the :class:`.ProtocolDAGUnit`, so that these are the only objects protocol authors need to interact with. - -In using these, we assume that the label for data from a given unit is of -the form ``$DAG/$UNIT_INFO/$FILEPATH``. 
The details of ``$DAG`` and -``$UNIT_INFO`` are up the executor; in particular, there may be a more -deeply nested hierarchy for the ``$UNIT_INFO`` (e.g., to account for retries -of a given unit). An executor that wants to use a data label that doesnt' -match this format should not use :class:`.StorageManager` and the related -tools. diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 9a06a3ca..2f7ee313 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Union, Optional from pathlib import Path from os import PathLike, rmdir, remove @@ -7,7 +9,10 @@ import logging _logger = logging.getLogger(__name__) -def _safe_to_delete_holding(external, path, prefix): +# TODO: holding -> staging + +def _safe_to_delete_holding(external: ExternalStorage, path: PathLike, + prefix: Union[PathLike, str]) -> bool: """Check if deleting ``path`` could delete externally stored data. If external storage is a FileStorage, then it will storage files for @@ -32,7 +37,7 @@ def _safe_to_delete_holding(external, path, prefix): return False -def _delete_empty_dirs(root, delete_root=True): +def _delete_empty_dirs(root: PathLike, delete_root: bool = True): """Delete all empty directories. Repeats so that directories that only contained empty directories also @@ -46,7 +51,6 @@ def find_empty_dirs(directory): directories = [p for p in paths if p.is_dir()] return sum([find_empty_dirs(d) for d in directories], []) - while root.exists() and (empties := find_empty_dirs(root)): if empties == [root] and not delete_root: return @@ -123,16 +127,17 @@ def __init__( self.staging_dir.mkdir(exist_ok=True, parents=True) def _delete_holding_safe(self): + """Check if deleting staging will remove data from external. 
+ """ return _safe_to_delete_holding( external=self.external, path=self.staging_dir, prefix=self.prefix, ) - def transfer_single_file_to_external(self, held_file): + def transfer_single_file_to_external(self, held_file: StagingPath): """Transfer a given file from holding into external storage """ - path = Path(held_file) if not path.exists(): logging.info(f"Found nonexistent path {path}, not " @@ -157,7 +162,7 @@ def cleanup(self): remove(file) _delete_empty_dirs(self.staging_dir) - def register_path(self, staging_path): + def register_path(self, staging_path: StagingPath): """ Register a :class:`.StagingPath` with this :class:`.StagingDirectory`. @@ -186,7 +191,8 @@ def register_path(self, staging_path): if label_exists: self._load_file_from_external(self.external, staging_path) - def _load_file_from_external(self, external, staging_path): + def _load_file_from_external(self, external: ExternalStorage, + staging_path: StagingPath): scratch_path = self.staging_dir / staging_path.path # TODO: switch this to using `get_filename` and `store_path` with external.load_stream(staging_path.label) as f: @@ -198,7 +204,7 @@ def _load_file_from_external(self, external, staging_path): with open(scratch_path, mode='wb') as f: f.write(external_bytes) - def __truediv__(self, path: PathLike): + def __truediv__(self, path: Union[PathLike, str, bytes]): return StagingPath(root=self, path=path) def __fspath__(self): @@ -210,8 +216,18 @@ def __repr__(self): f"{self.prefix})" ) + def __del__(self): # -no-cov- + # in case someone doesn't use this within a context manager + if self.staging_dir.exists(): + self.cleanup() + + class SharedStaging(StagingDirectory): + """Staging for shared external storage. + + This enables read-only versions to be loaded from other units. 
+ """ def __init__( self, scratch: PathLike, @@ -226,7 +242,8 @@ def __init__( delete_holding=delete_holding) self.read_only = read_only - def get_other_shared(self, prefix, delete_holding=None): + def get_other_shared(self, prefix: Union[str, PathLike], + delete_holding: Optional[bool] = None): """Get a related unit's staging directory. """ if delete_holding is None: @@ -242,7 +259,8 @@ def get_other_shared(self, prefix, delete_holding=None): ) @contextmanager - def other_shared(self, prefix, delete_holding=None): + def other_shared(self, prefix: Union[str, PathLike], + delete_holding: Optional[bool] = None): """Context manager approach for getting a related unit's directory. This is usually the recommended way to get a previous unit's shared @@ -252,7 +270,7 @@ def other_shared(self, prefix, delete_holding=None): yield other other.cleanup() - def transfer_single_file_to_external(self, held_file): + def transfer_single_file_to_external(self, held_file: StagingPath): if self.read_only: logging.debug("Read-only: Not transfering to external storage") return # early exit @@ -266,7 +284,7 @@ def transfer_holding_to_external(self): super().transfer_holding_to_external() - def register_path(self, staging_path): + def register_path(self, staging_path: StagingPath): label_exists = self.external.exists(staging_path.label) if self.read_only and not label_exists: @@ -305,7 +323,7 @@ def _delete_holding_safe(self): ) return shared_safe and super()._delete_holding_safe() - def transfer_single_file_to_external(self, held_file): + def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage path = Path(held_file) if not path.exists(): @@ -321,19 +339,20 @@ class StagingPath: manage the local path and transferring data with its :class:`.ExternalStorage`. 
""" - def __init__(self, root: StagingDirectory, path: PathLike): + def __init__(self, root: StagingDirectory, + path: Union[PathLike, str, bytes]): self.root = root self.path = Path(path) self.root.register_path(self) - def __truediv__(self, path): + def __truediv__(self, path: Union[PathLike, str, bytes]): return StagingPath(self.root, self.path / path) def __fspath__(self): return str(self.root.staging_dir / self.path) @property - def label(self): + def label(self) -> str: """Label used in :class:`.ExternalStorage` for this path""" return str(self.root.prefix / self.path) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 4d3cefd2..51fe5cd8 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -30,7 +30,7 @@ def running_unit(self, unit_label: str): DAGContextManager = Type[_AbstractDAGContextManager] - +# TODO: rename class _DAGStorageManager(_AbstractDAGContextManager): """Context manager to handle details of storage lifecycle. From aabbc33054ddf1818a75f2944f83c08d7bae5e52 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 13:09:42 -0500 Subject: [PATCH 17/69] docs, types, logging --- gufe/storage/stagingdirectory.py | 26 +++++++++++--------------- gufe/storage/storagemanager.py | 17 +++++++++++------ 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 2f7ee313..5d43769c 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -77,10 +77,6 @@ class StagingDirectory: 3. It can delete all of the files it manages - This can be opened in "read-only" mode, which prevents new files from - being created, but does not prevent changes to existing versions of - local files. 
- Parameters ---------- scratch : PathLike @@ -100,11 +96,6 @@ class StagingDirectory: delete_holding : bool whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX directory when this object is deleted - read_only : bool - write to prevent NEW files from being written within this staging - directory. NOTE: This will not prevent overwrite of existing files - in scratch space, but it will prevent changed files from uploading - to the external storage. """ def __init__( self, @@ -140,13 +131,13 @@ def transfer_single_file_to_external(self, held_file: StagingPath): """ path = Path(held_file) if not path.exists(): - logging.info(f"Found nonexistent path {path}, not " + _logger.info(f"Found nonexistent path {path}, not " "transfering to external storage") elif path.is_dir(): - logging.debug(f"Found directory {path}, not " + _logger.debug(f"Found directory {path}, not " "transfering to external storage") else: - logging.info(f"Transfering {path} to external storage") + _logger.info(f"Transfering {path} to external storage") self.external.store_path(held_file.label, path) def transfer_holding_to_external(self): @@ -159,7 +150,12 @@ def cleanup(self): """ if self.delete_holding and self._delete_holding_safe(): for file in self.registry - self.preexisting: - remove(file) + if Path(file).exists(): + _logger.debug(f"Removing file {file}") + remove(file) + else: + _logger.warning("Request to remove missing file " + f"{file}") _delete_empty_dirs(self.staging_dir) def register_path(self, staging_path: StagingPath): @@ -272,14 +268,14 @@ def other_shared(self, prefix: Union[str, PathLike], def transfer_single_file_to_external(self, held_file: StagingPath): if self.read_only: - logging.debug("Read-only: Not transfering to external storage") + _logger.debug("Read-only: Not transfering to external storage") return # early exit super().transfer_single_file_to_external(held_file) def transfer_holding_to_external(self): if self.read_only: - logging.debug("Read-only: Not 
transfering to external storage") + _logger.debug("Read-only: Not transfering to external storage") return # early exit super().transfer_holding_to_external() diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 51fe5cd8..11dbae0b 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -1,3 +1,4 @@ +from __future__ import annotations from os import PathLike from pathlib import Path from contextlib import contextmanager @@ -12,7 +13,7 @@ class _AbstractDAGContextManager: @classmethod @contextmanager - def running_dag(cls, storage_manager, dag_label: str): + def running_dag(cls, storage_manager: StorageManager, dag_label: str): """Return a context manager for when a DAG is started. This context manager handles the DAG scale of the lifecycle. @@ -39,14 +40,14 @@ class _DAGStorageManager(_AbstractDAGContextManager): directly; instead, it is created (and used) with its ``running_dag`` classmethod, typically from within a ``StorageManager``. """ - def __init__(self, storage_manager, dag_label): + def __init__(self, storage_manager: StorageLabel, dag_label: str): self.manager = storage_manager self.dag_label = dag_label self.permanents = [] @classmethod # NB: classmethod must be on top @contextmanager - def running_dag(cls, storage_manager, dag_label): + def running_dag(cls, storage_manager: StorageManager, dag_label: str): """DAG level of the storage lifecycle When the DAG is completed, transfer everything to the permanent @@ -76,6 +77,9 @@ def running_unit(self, unit_label: str): permanent. At the end of the unit, it transfers anything from shared to the real shared external storage, cleans up the scratch directory and the shared holding directory. + + Note that the unit label here is the *entire* label; that is, it + would also include information identifying the DAG. 
""" scratch = self.manager.get_scratch(unit_label) shared = self.manager.get_shared(unit_label) @@ -130,7 +134,7 @@ def get_scratch(self, unit_label: str) -> Path: scratch.mkdir(parents=True, exist_ok=True) return scratch - def get_permanent(self, unit_label): + def get_permanent(self, unit_label) -> PermanentStaging: """Get the object for this unit's permanent holding directory""" return PermanentStaging( scratch=self.scratch_root, @@ -140,7 +144,7 @@ def get_permanent(self, unit_label): holding=self.holding, ) - def get_shared(self, unit_label): + def get_shared(self, unit_label) -> SharedStaging: """Get the object for this unit's shared holding directory""" return SharedStaging( scratch=self.scratch_root, @@ -159,7 +163,8 @@ def running_dag(self, dag_label: str): with manager.running_dag(dag_label) as dag_ctx: for unit in dag_ordered_units: - with dag_ctx.running_unit(unit) as dirs: + label = f"{dag_ctx.dag_label}/{unit.key}" + with dag_ctx.running_unit(label) as dirs: scratch, shared, permanent = dirs # run the unit """ From b4d73b39d62ef814aea5718269aa5eae904c3375 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Tue, 6 Jun 2023 17:17:20 -0500 Subject: [PATCH 18/69] finish TestHoldingOverlapsPermanentStorageManager --- gufe/storage/stagingdirectory.py | 21 ++++---- gufe/tests/storage/test_storagemanager.py | 58 ++++++++++++++++++----- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 5d43769c..49f16a89 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -189,16 +189,16 @@ def register_path(self, staging_path: StagingPath): def _load_file_from_external(self, external: ExternalStorage, staging_path: StagingPath): - scratch_path = self.staging_dir / staging_path.path - # TODO: switch this to using `get_filename` and `store_path` - with external.load_stream(staging_path.label) as f: - external_bytes = f.read() - if scratch_path.exists(): - self.preexisting.add(staging_path) - ... # TODO: something to check that the bytes are the same? - scratch_path.parent.mkdir(exist_ok=True, parents=True) - with open(scratch_path, mode='wb') as f: - f.write(external_bytes) + scratch_path = self.staging_dir / staging_path.path + # TODO: switch this to using `get_filename` and `store_path` + with external.load_stream(staging_path.label) as f: + external_bytes = f.read() + if scratch_path.exists(): + self.preexisting.add(staging_path) + ... # TODO: something to check that the bytes are the same? + scratch_path.parent.mkdir(exist_ok=True, parents=True) + with open(scratch_path, mode='wb') as f: + f.write(external_bytes) def __truediv__(self, path: Union[PathLike, str, bytes]): return StagingPath(root=self, path=path) @@ -218,7 +218,6 @@ def __del__(self): # -no-cov- self.cleanup() - class SharedStaging(StagingDirectory): """Staging for shared external storage. 
diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 2d7fb292..c37451ad 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -215,6 +215,7 @@ def _after_dag_existing_files(self): return self.files_after_unit('dag/unit2') + class TestHoldingOverlapsSharedStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): @@ -226,6 +227,23 @@ def storage_manager(self, tmp_path): holding="", ) + def _in_unit_existing_files(self, unit_label): + return { + "dag/unit1": {'foo', 'bar', 'baz'}, + "dag/unit2": {'foo2', 'bar', 'baz'}, + }[unit_label] + + + def _after_unit_existing_files(self, unit_label): + # same for both; all files come from unit 1 + return {"bar", "baz"} + + def _after_dag_existing_files(self): + # NOTE: currently we don't delete bar at the end of a cycle, but we + # don't guarantee that we would not. So it exists, but changing that + # isn't API-breaking. 
+ return {"bar", "baz"} + def _in_staging_shared(self, unit_label, in_after): bar = "dag/unit1/bar.txt" baz = "dag/unit1/baz.txt" @@ -238,24 +256,42 @@ def _in_staging_shared(self, unit_label, in_after): ("dag/unit2", "after"): {baz} }[unit_label, in_after] + +class TestHoldingOverlapsPermanentStorageManager(LifecycleHarness): + @pytest.fixture + def storage_manager(self, tmp_path): + root = tmp_path / "working" + return StorageManager( + scratch_root=root, + permanent_root=FileStorage(root), + shared_root=MemoryStorage(), + holding="", + ) + def _in_unit_existing_files(self, unit_label): return { "dag/unit1": {'foo', 'bar', 'baz'}, - "dag/unit2": {'foo2', 'bar', 'baz'}, + "dag/unit2": {"foo2", "baz"}, # no bar because it was temporary }[unit_label] - def _after_unit_existing_files(self, unit_label): - # same for both; all files come from unit 1 - return {"bar", "baz"} - def _after_dag_existing_files(self): - # NOTE: currently we don't delete bar at the end of a cycle, but we - # don't guarantee that we would not. So it exists, but changing that - # isn't API-breaking. - return {"bar", "baz"} + return {"baz"} -# class TestHoldingOverlapsPermanentStorageManager(LifecycleHarness): -# ... + def _in_staging_permanent(self, unit_label, in_after): + bar = "dag/unit1/bar.txt" + baz = "dag/unit1/baz.txt" + foo = "scratch/dag/unit1/foo.txt" + foo2 = "scratch/dag/unit2/foo2.txt" + return { + ("dag/unit1", "in"): {bar, baz, foo}, + ("dag/unit1", "after"): {baz}, + ("dag/unit2", "in"): {baz, foo2}, + ("dag/unit2", "after"): {baz} + }[unit_label, in_after] + + def _after_unit_existing_files(self, unit_label): + # same for both; all files come from unit 1 + return {"baz"} class TestStorageManager: From 7af006e29dee3f349256cb06cff7264600103487 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Tue, 6 Jun 2023 18:00:21 -0500 Subject: [PATCH 19/69] mypy --- gufe/storage/stagingdirectory.py | 10 +++++----- gufe/storage/storagemanager.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 49f16a89..7c44f4aa 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -200,7 +200,7 @@ def _load_file_from_external(self, external: ExternalStorage, with open(scratch_path, mode='wb') as f: f.write(external_bytes) - def __truediv__(self, path: Union[PathLike, str, bytes]): + def __truediv__(self, path: Union[PathLike, str]): return StagingPath(root=self, path=path) def __fspath__(self): @@ -237,7 +237,7 @@ def __init__( delete_holding=delete_holding) self.read_only = read_only - def get_other_shared(self, prefix: Union[str, PathLike], + def get_other_shared(self, prefix: str, delete_holding: Optional[bool] = None): """Get a related unit's staging directory. """ @@ -254,7 +254,7 @@ def get_other_shared(self, prefix: Union[str, PathLike], ) @contextmanager - def other_shared(self, prefix: Union[str, PathLike], + def other_shared(self, prefix: str, delete_holding: Optional[bool] = None): """Context manager approach for getting a related unit's directory. @@ -335,12 +335,12 @@ class StagingPath: :class:`.ExternalStorage`. 
""" def __init__(self, root: StagingDirectory, - path: Union[PathLike, str, bytes]): + path: Union[PathLike, str]): self.root = root self.path = Path(path) self.root.register_path(self) - def __truediv__(self, path: Union[PathLike, str, bytes]): + def __truediv__(self, path: Union[PathLike, str]): return StagingPath(self.root, self.path / path) def __fspath__(self): diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 11dbae0b..bfb3bfc3 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -40,10 +40,10 @@ class _DAGStorageManager(_AbstractDAGContextManager): directly; instead, it is created (and used) with its ``running_dag`` classmethod, typically from within a ``StorageManager``. """ - def __init__(self, storage_manager: StorageLabel, dag_label: str): + def __init__(self, storage_manager: StorageManager, dag_label: str): self.manager = storage_manager self.dag_label = dag_label - self.permanents = [] + self.permanents: list[PermanentStaging] = [] @classmethod # NB: classmethod must be on top @contextmanager From 58a58bc94d13e8dafa4e1ea58df827f72cec5c2d Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Tue, 6 Jun 2023 18:48:09 -0500 Subject: [PATCH 20/69] test_repr --- gufe/tests/storage/test_stagingdirectory.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 2def33af..e9f4287f 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -94,6 +94,12 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): class TestSharedStaging: + def test_repr(self, root): + r = repr(root) + assert r.startswith("StagingDirectory") + assert "MemoryStorage" in r + assert r.endswith(", new_unit)") + @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] ]) From 8e429f5d9248131dadf82c1cb033a67078463723 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 18:58:54 -0500 Subject: [PATCH 21/69] renaming around DAGContextManager --- gufe/storage/storagemanager.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index bfb3bfc3..4a08dc75 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -10,7 +10,7 @@ from .stagingdirectory import SharedStaging, PermanentStaging -class _AbstractDAGContextManager: +class DAGContextManager: @classmethod @contextmanager def running_dag(cls, storage_manager: StorageManager, dag_label: str): @@ -28,11 +28,9 @@ def running_unit(self, unit_label: str): """ raise NotImplementedError() +_DCMType = Type[DAGContextManager] # to shorten some lines -DAGContextManager = Type[_AbstractDAGContextManager] - -# TODO: rename -class _DAGStorageManager(_AbstractDAGContextManager): +class SingleProcDAGContextManager(DAGContextManager): """Context manager to handle details of storage lifecycle. 
Making this a separate class ensures that ``running_unit`` is always @@ -117,7 +115,7 @@ def __init__( keep_scratch: bool = False, keep_holding: bool = False, holding: PathLike = Path(".holding"), - DAGContextClass: DAGContextManager = _DAGStorageManager, + DAGContextClass: _DCMType = SingleProcDAGContextManager, ): self.scratch_root = Path(scratch_root) self.shared_root = shared_root From b70df482c0cbebe9fd0a5d34be15bb13c723eedc Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 19:09:17 -0500 Subject: [PATCH 22/69] holding => staging --- gufe/storage/stagingdirectory.py | 68 ++++++++++----------- gufe/storage/storagemanager.py | 32 +++++----- gufe/tests/storage/test_stagingdirectory.py | 28 ++++----- gufe/tests/storage/test_storagemanager.py | 22 +++---- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 7c44f4aa..02626365 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -11,7 +11,7 @@ # TODO: holding -> staging -def _safe_to_delete_holding(external: ExternalStorage, path: PathLike, +def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, prefix: Union[PathLike, str]) -> bool: """Check if deleting ``path`` could delete externally stored data. @@ -89,11 +89,11 @@ class StagingDirectory: it might be ``$DAG_LABEL/$UNIT_LABEL`` or ``$DAG_LABEL/$UNIT_LABEL/$UNIT_REPEAT``. It must be a unique identifier for this unit within the permanent storage. - holding : PathLike + staging : PathLike name of the subdirectory of scratch where staged results are - temporarily stored; default is '.holding'. This must be the same for + temporarily stored; default is '.staging'. This must be the same for all units within a DAG. 
- delete_holding : bool + delete_staging : bool whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX directory when this object is deleted """ @@ -103,31 +103,31 @@ def __init__( external: ExternalStorage, prefix: str, *, - holding: PathLike = Path(".holding"), - delete_holding: bool = True, + staging: PathLike = Path(".staging"), + delete_staging: bool = True, ): self.external = external self.scratch = Path(scratch) self.prefix = Path(prefix) - self.delete_holding = delete_holding - self.holding = holding + self.delete_staging = delete_staging + self.staging = staging self.registry : set[StagingPath] = set() self.preexisting : set[StagingPath] = set() - self.staging_dir = self.scratch / holding / prefix + self.staging_dir = self.scratch / staging / prefix self.staging_dir.mkdir(exist_ok=True, parents=True) - def _delete_holding_safe(self): + def _delete_staging_safe(self): """Check if deleting staging will remove data from external. """ - return _safe_to_delete_holding( + return _safe_to_delete_staging( external=self.external, path=self.staging_dir, prefix=self.prefix, ) def transfer_single_file_to_external(self, held_file: StagingPath): - """Transfer a given file from holding into external storage + """Transfer a given file from staging into external storage """ path = Path(held_file) if not path.exists(): @@ -140,7 +140,7 @@ def transfer_single_file_to_external(self, held_file: StagingPath): _logger.info(f"Transfering {path} to external storage") self.external.store_path(held_file.label, path) - def transfer_holding_to_external(self): + def transfer_staging_to_external(self): """Transfer all objects in the registry to external storage""" for obj in self.registry: self.transfer_single_file_to_external(obj) @@ -148,7 +148,7 @@ def transfer_holding_to_external(self): def cleanup(self): """Perform end-of-lifecycle cleanup. 
""" - if self.delete_holding and self._delete_holding_safe(): + if self.delete_staging and self._delete_staging_safe(): for file in self.registry - self.preexisting: if Path(file).exists(): _logger.debug(f"Removing file {file}") @@ -229,39 +229,39 @@ def __init__( external: ExternalStorage, prefix: str, *, - holding: PathLike = Path(".holding"), - delete_holding: bool = True, + staging: PathLike = Path(".staging"), + delete_staging: bool = True, read_only: bool = False, ): - super().__init__(scratch, external, prefix, holding=holding, - delete_holding=delete_holding) + super().__init__(scratch, external, prefix, staging=staging, + delete_staging=delete_staging) self.read_only = read_only def get_other_shared(self, prefix: str, - delete_holding: Optional[bool] = None): + delete_staging: Optional[bool] = None): """Get a related unit's staging directory. """ - if delete_holding is None: - delete_holding = self.delete_holding + if delete_staging is None: + delete_staging = self.delete_staging return SharedStaging( scratch=self.scratch, external=self.external, prefix=prefix, - holding=self.holding, - delete_holding=delete_holding, + staging=self.staging, + delete_staging=delete_staging, read_only=True, ) @contextmanager def other_shared(self, prefix: str, - delete_holding: Optional[bool] = None): + delete_staging: Optional[bool] = None): """Context manager approach for getting a related unit's directory. This is usually the recommended way to get a previous unit's shared data. 
""" - other = self.get_other_shared(prefix, delete_holding) + other = self.get_other_shared(prefix, delete_staging) yield other other.cleanup() @@ -272,12 +272,12 @@ def transfer_single_file_to_external(self, held_file: StagingPath): super().transfer_single_file_to_external(held_file) - def transfer_holding_to_external(self): + def transfer_staging_to_external(self): if self.read_only: _logger.debug("Read-only: Not transfering to external storage") return # early exit - super().transfer_holding_to_external() + super().transfer_staging_to_external() def register_path(self, staging_path: StagingPath): label_exists = self.external.exists(staging_path.label) @@ -303,20 +303,20 @@ def __init__( shared: ExternalStorage, prefix: str, *, - holding: PathLike = Path(".holding"), - delete_holding: bool = True, + staging: PathLike = Path(".staging"), + delete_staging: bool = True, ): - super().__init__(scratch, external, prefix, holding=holding, - delete_holding=delete_holding) + super().__init__(scratch, external, prefix, staging=staging, + delete_staging=delete_staging) self.shared = shared - def _delete_holding_safe(self): - shared_safe = _safe_to_delete_holding( + def _delete_staging_safe(self): + shared_safe = _safe_to_delete_staging( external=self.shared, path=self.staging_dir, prefix=self.prefix ) - return shared_safe and super()._delete_holding_safe() + return shared_safe and super()._delete_staging_safe() def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 4a08dc75..dbba86ea 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -49,7 +49,7 @@ def running_dag(cls, storage_manager: StorageManager, dag_label: str): """DAG level of the storage lifecycle When the DAG is completed, transfer everything to the permanent - storage, and delete the holding area for permanent (if we are + 
storage, and delete the staging area for permanent (if we are supposed to). This is not usually called by users; instead it is called from @@ -60,9 +60,9 @@ def running_dag(cls, storage_manager: StorageManager, dag_label: str): yield dag_manager finally: for permanent in dag_manager.permanents: - permanent.transfer_holding_to_external() + permanent.transfer_staging_to_external() - if not dag_manager.manager.keep_holding: + if not dag_manager.manager.keep_staging: for d in dag_manager.permanents: # import pdb; pdb.set_trace() d.cleanup() @@ -71,10 +71,10 @@ def running_dag(cls, storage_manager: StorageManager, dag_label: str): def running_unit(self, unit_label: str): """Unit level of the storage lifecycle. - This provides the holding directories used for scratch, shared, and + This provides the staging directories used for scratch, shared, and permanent. At the end of the unit, it transfers anything from shared to the real shared external storage, cleans up the scratch - directory and the shared holding directory. + directory and the shared staging directory. Note that the unit label here is the *entire* label; that is, it would also include information identifying the DAG. @@ -87,7 +87,7 @@ def running_unit(self, unit_label: str): finally: # TODO: should some of this be in an else clause instead? self.permanents.append(permanent) - shared.transfer_holding_to_external() + shared.transfer_staging_to_external() # everything in permanent must also be available in shared for file in permanent.registry: shared.transfer_single_file_to_external(file) @@ -95,14 +95,14 @@ def running_unit(self, unit_label: str): if not self.manager.keep_scratch: shutil.rmtree(scratch) - if not self.manager.keep_holding: + if not self.manager.keep_staging: shared.cleanup() class StorageManager: """Tool to manage the storage lifecycle during a DAG. - This object primarily contains the logic for getting the holding + This object primarily contains the logic for getting the staging directories. 
A separate class, in the ``DAGContextClass`` variable, handles the logic for the context managers. """ @@ -113,16 +113,16 @@ def __init__( permanent_root: ExternalStorage, *, keep_scratch: bool = False, - keep_holding: bool = False, - holding: PathLike = Path(".holding"), + keep_staging: bool = False, + staging: PathLike = Path(".staging"), DAGContextClass: _DCMType = SingleProcDAGContextManager, ): self.scratch_root = Path(scratch_root) self.shared_root = shared_root self.permanent_root = permanent_root self.keep_scratch = keep_scratch - self.keep_holding = keep_holding - self.holding = holding + self.keep_staging = keep_staging + self.staging = staging self.DAGContextClass = DAGContextClass def get_scratch(self, unit_label: str) -> Path: @@ -133,22 +133,22 @@ def get_scratch(self, unit_label: str) -> Path: return scratch def get_permanent(self, unit_label) -> PermanentStaging: - """Get the object for this unit's permanent holding directory""" + """Get the object for this unit's permanent staging directory""" return PermanentStaging( scratch=self.scratch_root, external=self.permanent_root, shared=self.shared_root, prefix=unit_label, - holding=self.holding, + staging=self.staging, ) def get_shared(self, unit_label) -> SharedStaging: - """Get the object for this unit's shared holding directory""" + """Get the object for this unit's shared staging directory""" return SharedStaging( scratch=self.scratch_root, external=self.shared_root, prefix=unit_label, - holding=self.holding, + staging=self.staging, ) def running_dag(self, dag_label: str): diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index e9f4287f..69c3bea0 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -6,7 +6,7 @@ from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.stagingdirectory import ( SharedStaging, PermanentStaging, _delete_empty_dirs, - 
_safe_to_delete_holding + _safe_to_delete_staging ) @pytest.fixture @@ -17,7 +17,7 @@ def root(tmp_path): scratch=tmp_path, external=external, prefix="new_unit", - delete_holding=False + delete_staging=False ) return root @@ -28,23 +28,23 @@ def root_with_contents(root): return root -def test_safe_to_delete_holding_ok(tmp_path): +def test_safe_to_delete_staging_ok(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" - holding = tmp_path / "foo" / "baz" - assert _safe_to_delete_holding(external, holding, prefix) + staging = tmp_path / "foo" / "baz" + assert _safe_to_delete_staging(external, staging, prefix) -def test_safe_to_delete_holding_danger(tmp_path): +def test_safe_to_delete_staging_danger(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" - holding = tmp_path / "foo" / "bar" / "baz" - assert not _safe_to_delete_holding(external, holding, prefix) + staging = tmp_path / "foo" / "bar" / "baz" + assert not _safe_to_delete_staging(external, staging, prefix) -def test_safe_to_delete_holding_not_filestorage(tmp_path): +def test_safe_to_delete_staging_not_filestorage(tmp_path): external = MemoryStorage() prefix = "bar" - holding = tmp_path / "bar" - assert _safe_to_delete_holding(external, holding, prefix) + staging = tmp_path / "bar" + assert _safe_to_delete_staging(external, staging, prefix) def test_delete_empty_dirs(tmp_path): base = tmp_path / "tmp" @@ -119,7 +119,7 @@ def test_read_old(self, root): # initial conditions, without touching StagingDirectory/StagingPath label = "old_unit/data.txt" - on_filesystem = root.scratch / root.holding / "old_unit/data.txt" + on_filesystem = root.scratch / root.staging / "old_unit/data.txt" assert not on_filesystem.exists() assert root.external.exists(label) @@ -136,7 +136,7 @@ def test_read_old(self, root): def test_write_new(self, root): label = "new_unit/somefile.txt" - on_filesystem = root.scratch / root.holding / "new_unit/somefile.txt" + on_filesystem = root.scratch / root.staging / 
"new_unit/somefile.txt" assert not on_filesystem.exists() with open(root / "somefile.txt", mode='wb') as f: f.write(b"testing") @@ -155,7 +155,7 @@ def test_transfer_to_external(self, root_with_contents): path = list(root_with_contents.registry)[0] # only 1 assert not root_with_contents.external.exists(path.label) - root_with_contents.transfer_holding_to_external() + root_with_contents.transfer_staging_to_external() assert root_with_contents.external.exists(path.label) with root_with_contents.external.load_stream(path.label) as f: diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index c37451ad..fb584be1 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -56,12 +56,12 @@ def storage_manager(self, tmp_path): @staticmethod def get_files_dict(storage_manager): root = storage_manager.scratch_root - holding = storage_manager.holding + staging = storage_manager.staging return { "foo": root / "scratch/dag/unit1/foo.txt", "foo2": root / "scratch/dag/unit2/foo2.txt", - "bar": root / holding / "dag/unit1/bar.txt", - "baz": root / holding / "dag/unit1/baz.txt", + "bar": root / staging / "dag/unit1/bar.txt", + "baz": root / staging / "dag/unit1/baz.txt", } def test_lifecycle(self, storage_manager, dag_units, tmp_path): @@ -185,7 +185,7 @@ def _after_dag_existing_files(self): return set() -class TestKeepScratchAndHoldingStorageManager(LifecycleHarness): +class TestKeepScratchAndStagingStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): return StorageManager( @@ -193,7 +193,7 @@ def storage_manager(self, tmp_path): shared_root=MemoryStorage(), permanent_root=MemoryStorage(), keep_scratch=True, - keep_holding=True + keep_staging=True ) @staticmethod @@ -216,7 +216,7 @@ def _after_dag_existing_files(self): -class TestHoldingOverlapsSharedStorageManager(LifecycleHarness): +class TestStagingOverlapsSharedStorageManager(LifecycleHarness): 
@pytest.fixture def storage_manager(self, tmp_path): root = tmp_path / "working" @@ -224,7 +224,7 @@ def storage_manager(self, tmp_path): scratch_root=root, shared_root=FileStorage(root), permanent_root=MemoryStorage(), - holding="", + staging="", ) def _in_unit_existing_files(self, unit_label): @@ -257,7 +257,7 @@ def _in_staging_shared(self, unit_label, in_after): }[unit_label, in_after] -class TestHoldingOverlapsPermanentStorageManager(LifecycleHarness): +class TestStagingOverlapsPermanentStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): root = tmp_path / "working" @@ -265,7 +265,7 @@ def storage_manager(self, tmp_path): scratch_root=root, permanent_root=FileStorage(root), shared_root=MemoryStorage(), - holding="", + staging="", ) def _in_unit_existing_files(self, unit_label): @@ -302,10 +302,10 @@ def test_get_scratch(self, storage_manager_std): def test_get_permanent(self, storage_manager_std): perm = storage_manager_std.get_permanent("dag_label/unit_label") - assert perm.__fspath__().endswith(".holding/dag_label/unit_label") + assert perm.__fspath__().endswith(".staging/dag_label/unit_label") assert isinstance(perm, StagingDirectory) def test_get_shared(self, storage_manager_std): shared = storage_manager_std.get_shared("dag_label/unit_label") - assert shared.__fspath__().endswith(".holding/dag_label/unit_label") + assert shared.__fspath__().endswith(".staging/dag_label/unit_label") assert isinstance(shared, StagingDirectory) From 08e3ac2cd7702a0c01ec08e33af563c72887a7ab Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 19:29:12 -0500 Subject: [PATCH 23/69] finish docs (I think?) --- docs/guide/storage.rst | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 0153bf89..d740525e 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -42,8 +42,8 @@ protocol authors. 
In detail, this provides protocol authors with three of these objects actually point to special subdirectories of the scratch space for a specific unit, but are managed by context manangers at the executor level, which handle the process of moving objects from local -directories to the actual ``shared`` and ``permanent`` locations, which can -be external resources. +staging directories to the actual ``shared`` and ``permanent`` locations, +which can be external resources. External resource utilities @@ -78,9 +78,24 @@ executors. The helpers are: * :class:`.StorageManager`: This is the overall façade interface for interacting with the rest of the storage lifecycle tools. -* ``DAGContextManager``: -* :class:`.StagingDirectory`: -* :class:`.StagingPath`: +* :class:`.DAGContextManager`: This provides context managers at the DAG and + unit level to handle the transfer of storage. GUFE provides a + :class:`.SingleProcDAGContextManager` to handle the simple case that an + entire DAG is run within a single process. If individual units are run on + different remote resources, a more complicated :class:`.DAGContextManager` + would be needed. +* :class:`.StagingDirectory`: This represents the root directory for staging + the results of a given :class:`.ProtocolUnit`. This is an abstract + representation of a local directory. Paths within it register with it, and + it handles deletion of the temporary local files when not needed, as well + as the download of remote files when necessary for reading. There are two + important subclasses of this: :class:`.SharedStaging` for a ``shared`` + resource, and :class:`.PermanentStaging` for a ``permanent`` resource. +* :class:`.StagingPath`: This represents a file within the + :class:`.StagingDirectory`. It contains both the key (label) used in the + key-value store, as well as the actual local path to the file. On + creation, it registers itself with its :class:`.StagingDirectory`, which + handles managing it over its lifecycle. 
In practice, the executor uses the :class:`.StorageManager` to create a :class:`.DAGContextManager` at the level of a DAG, and then uses the From ca7871b00edd7fd7bf8b614437b4b9549c39c773 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 19:39:20 -0500 Subject: [PATCH 24/69] remove completed TODO --- gufe/storage/stagingdirectory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 02626365..f529b327 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -9,7 +9,6 @@ import logging _logger = logging.getLogger(__name__) -# TODO: holding -> staging def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, prefix: Union[PathLike, str]) -> bool: From 2aa06169e6a629897828180edc1a32ebac251e05 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 9 Jun 2023 10:01:32 -0500 Subject: [PATCH 25/69] start to testing edge case logging right now, we seem to not be getting anything; see if this is because someone has added a handler on the root logger or because we can't caplog from a parent logger (may need specific __name__) --- gufe/tests/storage/test_stagingdirectory.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 69c3bea0..36ad1228 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -1,4 +1,6 @@ import pytest +from unittest import mock +import logging import os import pathlib @@ -161,7 +163,18 @@ def test_transfer_to_external(self, root_with_contents): with root_with_contents.external.load_stream(path.label) as f: assert f.read() == b"bar" - def test_transfer_to_external_no_file(self, root): + @mock.patch.object(SharedStaging, 'register_path') + def test_transfer_to_external_no_file(self, root, caplog): + nonfile = root / 
"does_not_exist.txt" + # ensure that we've set this up correctly + assert nonfile not in root.registry + caplog.set_level(logging.INFO, logger="gufe.storage") + root.transfer_single_file_to_external(nonfile) + assert len(caplog.records) == 1 + + + + ... def test_tranfer_to_external_directory(self, root): From 7c03dcd76e10b88f30443ea279028694705e2a0d Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Mon, 12 Jun 2023 08:51:50 +0100 Subject: [PATCH 26/69] Update stagingdirectory.py --- gufe/storage/stagingdirectory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index f529b327..1f55bfc9 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -14,7 +14,7 @@ def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, prefix: Union[PathLike, str]) -> bool: """Check if deleting ``path`` could delete externally stored data. - If external storage is a FileStorage, then it will storage files for + If external storage is a FileStorage, then it will store files for this unit or dag in the directory ``external.root_dir / prefix``, where ``prefix`` is either the unit label or the dag label. If ``path`` is inside that directory, then deleting it may delete information from the From 383075e8288614c42a3eb7459ef6271bd8922339 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Mon, 12 Jun 2023 15:52:29 -0500 Subject: [PATCH 27/69] tests for single_file_transfer logging --- gufe/storage/stagingdirectory.py | 3 +++ gufe/tests/storage/test_stagingdirectory.py | 26 ++++++++++++++------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index f529b327..8dc86b47 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -177,6 +177,9 @@ def register_path(self, staging_path: StagingPath): the path to track """ label_exists = self.external.exists(staging_path.label) + fspath = Path(staging_path.__fspath__()) + if not fspath.parent.exists(): + fspath.parent.mkdir(parents=True, exist_ok=True) self.registry.add(staging_path) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 36ad1228..2f6301f8 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -163,22 +163,30 @@ def test_transfer_to_external(self, root_with_contents): with root_with_contents.external.load_stream(path.label) as f: assert f.read() == b"bar" - @mock.patch.object(SharedStaging, 'register_path') def test_transfer_to_external_no_file(self, root, caplog): - nonfile = root / "does_not_exist.txt" + with mock.patch.object(root, 'register_path'): + nonfile = root / "does_not_exist.txt" # ensure that we've set this up correctly assert nonfile not in root.registry - caplog.set_level(logging.INFO, logger="gufe.storage") + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.INFO, logger=logger_name) root.transfer_single_file_to_external(nonfile) assert len(caplog.records) == 1 + record = caplog.records[0] + assert "nonexistent" in record.msg + def test_transfer_to_external_directory(self, root, caplog): + directory = root / "directory" + with open(directory / "file.txt", mode='w') as f: + f.write("foo") - - - ... 
- - def test_tranfer_to_external_directory(self, root): - ... + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.DEBUG, logger=logger_name) + root.transfer_single_file_to_external(directory) + assert len(caplog.records) == 1 + record = caplog.records[0] + assert "Found directory" in record.msg + assert "not transfering" in record.msg def test_existing_local_and_external(self, root): ... From 63653983e1cd723d7182087d725b91fc79909110 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 16 Jun 2023 14:57:03 -0500 Subject: [PATCH 28/69] tests for read-only transfers --- gufe/storage/stagingdirectory.py | 4 +- gufe/tests/storage/test_stagingdirectory.py | 81 +++++++++++++++++++-- 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 8dc86b47..e6cc34e0 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -210,7 +210,7 @@ def __fspath__(self): def __repr__(self): return ( - f"StagingDirectory({self.scratch}, {self.external}, " + f"{self.__class__.__name__}({self.scratch}, {self.external}, " f"{self.prefix})" ) @@ -286,7 +286,7 @@ def register_path(self, staging_path: StagingPath): if self.read_only and not label_exists: raise IOError(f"Unable to create '{staging_path.label}'. 
File " - "does not exist in external storage, and This " + "does not exist in external storage, and this " "staging path is read-only.") super().register_path(staging_path) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 2f6301f8..c4281fd8 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -30,6 +30,26 @@ def root_with_contents(root): return root +@pytest.fixture +def read_only_with_overwritten(root_with_contents): + read_only = SharedStaging( + scratch=root_with_contents.scratch, + external=root_with_contents.external, + prefix="old_unit", + staging=root_with_contents.staging, + delete_staging=root_with_contents.delete_staging, + read_only=True + ) + filename = pathlib.Path(read_only) / "data.txt" + assert not filename.exists() + staged = read_only / "data.txt" + assert filename.exists() + with open(staged, mode='w') as f: + f.write("changed") + + return read_only, staged + + def test_safe_to_delete_staging_ok(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" @@ -188,11 +208,62 @@ def test_transfer_to_external_directory(self, root, caplog): assert "Found directory" in record.msg assert "not transfering" in record.msg - def test_existing_local_and_external(self, root): - ... + def test_single_file_transfer_read_only(self, + read_only_with_overwritten, + caplog): + read_only, staged = read_only_with_overwritten + with read_only.external.load_stream("old_unit/data.txt") as f: + old_contents = f.read() - def test_existing_local_and_external_conflict(self, root): - ... 
+ assert old_contents == b"foo" + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.DEBUG, logger=logger_name) + read_only.transfer_single_file_to_external(staged) + assert len(caplog.records) == 1 + record = caplog.records[0] + assert "Read-only:" in record.msg + with read_only.external.load_stream("old_unit/data.txt") as f: + new_contents = f.read() + assert old_contents == new_contents + + def test_transfer_read_only(self, read_only_with_overwritten, caplog): + read_only, staged = read_only_with_overwritten + with read_only.external.load_stream("old_unit/data.txt") as f: + old_contents = f.read() - def test_no_transfer_for_read_only(self, root): + assert old_contents == b"foo" + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.DEBUG, logger=logger_name) + read_only.transfer_staging_to_external() + assert len(caplog.records) == 1 + record = caplog.records[0] + assert "Read-only:" in record.msg + with read_only.external.load_stream("old_unit/data.txt") as f: + new_contents = f.read() + assert old_contents == new_contents + + def test_cleanup(self, root_with_contents): + path = pathlib.Path(root_with_contents.__fspath__()) / "data.txt" + assert path.exists() + root_with_contents.cleanup() + assert not path.exists() + + def test_register_cleanup_preexisting_file(self, root): + filename = pathlib.Path(root.__fspath__()) / "foo.txt" + filename.touch() + root.external.store_bytes("new_unit/foo.txt", b"") + assert len(root.registry) == 0 + assert len(root.preexisting) == 0 + staging = root / "foo.txt" + assert staging.label == "new_unit/foo.txt" + assert len(root.registry) == 1 + assert len(root.preexisting) == 1 + + assert filename.exists() + root.cleanup() + assert filename.exists() + + +class TestPermanentStage: + def test_delete_staging_safe(self): ... From d35bd60150cb81a27ff3781ebdbeeab75cbde01e Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 16 Jun 2023 15:21:26 -0500 Subject: [PATCH 29/69] fix repr and cleanup tests --- gufe/tests/storage/test_stagingdirectory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index c4281fd8..3a7e3ef4 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -118,7 +118,7 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): class TestSharedStaging: def test_repr(self, root): r = repr(root) - assert r.startswith("StagingDirectory") + assert r.startswith("SharedStaging") assert "MemoryStorage" in r assert r.endswith(", new_unit)") @@ -243,6 +243,7 @@ def test_transfer_read_only(self, read_only_with_overwritten, caplog): assert old_contents == new_contents def test_cleanup(self, root_with_contents): + root_with_contents.delete_staging = True # slightly naughty path = pathlib.Path(root_with_contents.__fspath__()) / "data.txt" assert path.exists() root_with_contents.cleanup() From 7cc10f9b692503e6eeff24769c43d498bd2ee284 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 16 Jun 2023 16:26:55 -0500 Subject: [PATCH 30/69] test for permanent transfer to external --- gufe/tests/storage/test_stagingdirectory.py | 25 ++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 3a7e3ef4..2dfc1ffd 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -49,6 +49,18 @@ def read_only_with_overwritten(root_with_contents): return read_only, staged +@pytest.fixture +def permanent(tmp_path): + shared = MemoryStorage() + shared.store_bytes("final/old_unit/data.txt", b"foo") + perm = PermanentStaging( + scratch=tmp_path, + external=MemoryStorage(), + shared=shared, + prefix="final", + delete_staging=True + ) + return perm def test_safe_to_delete_staging_ok(tmp_path): external = FileStorage(tmp_path / "foo") @@ -266,5 +278,16 @@ def test_register_cleanup_preexisting_file(self, root): class TestPermanentStage: - def test_delete_staging_safe(self): + def test_delete_staging_safe(self, permanent): + ... + + def test_load_missing_for_transfer(self, permanent): + fname = pathlib.Path(permanent) / "old_unit/data.txt" + assert not fname.exists() + staging = permanent / "old_unit/data.txt" + assert not fname.exists() + assert permanent.external._data == {} + permanent.transfer_staging_to_external() + assert fname.exists() + assert permanent.external._data == {"final/old_unit/data.txt": b"foo"} ... From ab025f13e29064065f31dc38bf7675adbc35b7b5 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Sat, 17 Jun 2023 11:44:07 -0500 Subject: [PATCH 31/69] test for Permanent delete staging --- gufe/tests/storage/test_stagingdirectory.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 2dfc1ffd..98e4befd 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -125,8 +125,6 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): assert base.exists() is not delete_root - - class TestSharedStaging: def test_repr(self, root): r = repr(root) @@ -278,8 +276,18 @@ def test_register_cleanup_preexisting_file(self, root): class TestPermanentStage: - def test_delete_staging_safe(self, permanent): - ... + @pytest.mark.parametrize('is_safe', [True, False]) + def test_delete_staging_safe(self, tmp_path, is_safe): + staging = ".staging" if is_safe else "" + permanent = PermanentStaging( + scratch=tmp_path, + external=MemoryStorage(), + shared=FileStorage(tmp_path), + prefix="final", + staging=staging, + delete_staging=True + ) + assert permanent._delete_staging_safe() is is_safe def test_load_missing_for_transfer(self, permanent): fname = pathlib.Path(permanent) / "old_unit/data.txt" @@ -290,4 +298,3 @@ def test_load_missing_for_transfer(self, permanent): permanent.transfer_staging_to_external() assert fname.exists() assert permanent.external._data == {"final/old_unit/data.txt": b"foo"} - ... From cd70ab22ae9714c5f34fbc1b211e8b1d6b906523 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Sat, 17 Jun 2023 12:29:47 -0500 Subject: [PATCH 32/69] Add test for missing file on cleanup --- gufe/storage/stagingdirectory.py | 5 +++-- gufe/tests/storage/test_stagingdirectory.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index e6cc34e0..fbe43cd0 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -153,8 +153,9 @@ def cleanup(self): _logger.debug(f"Removing file {file}") remove(file) else: - _logger.warning("Request to remove missing file " - f"{file}") + _logger.warning("During staging cleanup, file " + f"{file} was marked for deletion, but " + "can not be found on disk.") _delete_empty_dirs(self.staging_dir) def register_path(self, staging_path: StagingPath): diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 98e4befd..0c1879f5 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -259,6 +259,18 @@ def test_cleanup(self, root_with_contents): root_with_contents.cleanup() assert not path.exists() + def test_cleanup_missing(self, root, caplog): + root.delete_staging = True + file = root / "foo.txt" + assert file in root.registry + assert not pathlib.Path(file).exists() + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.WARNING, logger=logger_name) + root.cleanup() + assert len(caplog.records) == 1 + record = caplog.records[0] + assert "can not be found on disk" in record.msg + def test_register_cleanup_preexisting_file(self, root): filename = pathlib.Path(root.__fspath__()) / "foo.txt" filename.touch() From 90f2597d1af1cc4e5b2b8c70a57618d1feb889fb Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Thu, 22 Jun 2023 11:56:53 -0500 Subject: [PATCH 33/69] get_other_shared to private --- gufe/storage/stagingdirectory.py | 4 ++-- gufe/tests/storage/test_stagingdirectory.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index e9fd86f5..25ac49f1 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -240,7 +240,7 @@ def __init__( delete_staging=delete_staging) self.read_only = read_only - def get_other_shared(self, prefix: str, + def _get_other_shared(self, prefix: str, delete_staging: Optional[bool] = None): """Get a related unit's staging directory. """ @@ -264,7 +264,7 @@ def other_shared(self, prefix: str, This is usually the recommended way to get a previous unit's shared data. """ - other = self.get_other_shared(prefix, delete_staging) + other = self._get_other_shared(prefix, delete_staging) yield other other.cleanup() diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 0c1879f5..bed95378 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -157,7 +157,7 @@ def test_read_old(self, root): # when we create the specific StagingPath, it registers and # "downloads" the file - old_staging = root.get_other_shared("old_unit") + old_staging = root._get_other_shared("old_unit") filepath = old_staging / "data.txt" assert pathlib.Path(filepath) == on_filesystem assert on_filesystem.exists() @@ -179,7 +179,7 @@ def test_write_new(self, root): assert not root.external.exists(label) def test_write_old_fail(self, root): - old_staging = root.get_other_shared("old_unit") + old_staging = root._get_other_shared("old_unit") with pytest.raises(IOError, match="read-only"): old_staging / "foo.txt" From dd1b6dc19819c2822acd8d159dc36d6431b7c84d Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 10:41:17 -0600 Subject: [PATCH 34/69] updates from other branch --- gufe/storage/stagingdirectory.py | 120 ++++++++---- gufe/storage/storagemanager.py | 206 ++++++++------------ gufe/tests/storage/test_stagingdirectory.py | 18 +- gufe/tests/storage/test_storagemanager.py | 70 +++---- gufe/utils.py | 28 +++ 5 files changed, 230 insertions(+), 212 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 25ac49f1..356a5d34 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -6,6 +6,8 @@ from .externalresource import ExternalStorage, FileStorage from contextlib import contextmanager +from gufe.utils import delete_empty_dirs + import logging _logger = logging.getLogger(__name__) @@ -36,27 +38,6 @@ def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, return False -def _delete_empty_dirs(root: PathLike, delete_root: bool = True): - """Delete all empty directories. - - Repeats so that directories that only contained empty directories also - get deleted. - """ - root = Path(root) - - def find_empty_dirs(directory): - if not (paths := list(directory.iterdir())): - return [directory] - directories = [p for p in paths if p.is_dir()] - return sum([find_empty_dirs(d) for d in directories], []) - - while root.exists() and (empties := find_empty_dirs(root)): - if empties == [root] and not delete_root: - return - for directory in empties: - _logger.debug(f"Removing '{directory}'") - rmdir(directory) - class StagingDirectory: """PathLike local representation of an :class:`.ExternalStorage`. @@ -74,7 +55,7 @@ class StagingDirectory: 2. When requested, it transfers any newly created files to the :class:`.ExternalStorage`. - 3. It can delete all of the files it manages + 3. It can delete all of the files it manages. 
Parameters ---------- @@ -104,11 +85,13 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, + delete_empty_dirs: bool = True, ): self.external = external self.scratch = Path(scratch) self.prefix = Path(prefix) self.delete_staging = delete_staging + self.delete_empty_dirs = delete_empty_dirs self.staging = staging self.registry : set[StagingPath] = set() @@ -128,7 +111,7 @@ def _delete_staging_safe(self): def transfer_single_file_to_external(self, held_file: StagingPath): """Transfer a given file from staging into external storage """ - path = Path(held_file) + path = Path(held_file.fspath) if not path.exists(): _logger.info(f"Found nonexistent path {path}, not " "transfering to external storage") @@ -138,25 +121,39 @@ def transfer_single_file_to_external(self, held_file: StagingPath): else: _logger.info(f"Transfering {path} to external storage") self.external.store_path(held_file.label, path) + return held_file + + return None # no transfer + def transfer_staging_to_external(self): - """Transfer all objects in the registry to external storage""" - for obj in self.registry: - self.transfer_single_file_to_external(obj) + """Transfer all objects in the registry to external storage + + """ + return [ + transferred + for file in self.registry + if (transferred := self.transfer_single_file_to_external(file)) + ] def cleanup(self): """Perform end-of-lifecycle cleanup. """ if self.delete_staging and self._delete_staging_safe(): for file in self.registry - self.preexisting: - if Path(file).exists(): + path = Path(file.fspath) + if path.exists(): _logger.debug(f"Removing file {file}") - remove(file) + # TODO: handle special case of directory? 
+ path.unlink() + self.registry.remove(file) else: _logger.warning("During staging cleanup, file " f"{file} was marked for deletion, but " "can not be found on disk.") - _delete_empty_dirs(self.staging_dir) + + if self.delete_empty_dirs: + delete_empty_dirs(self.staging_dir) def register_path(self, staging_path: StagingPath): """ @@ -178,7 +175,11 @@ def register_path(self, staging_path: StagingPath): the path to track """ label_exists = self.external.exists(staging_path.label) - fspath = Path(staging_path.__fspath__()) + fspath = Path(staging_path.fspath) + + # TODO: what if the staging path is a directory? not sure that we + # have a way to know that; but not sure that adding it to the + # registry is right either if not fspath.parent.exists(): fspath.parent.mkdir(parents=True, exist_ok=True) @@ -192,13 +193,16 @@ def register_path(self, staging_path: StagingPath): def _load_file_from_external(self, external: ExternalStorage, staging_path: StagingPath): + # import pdb; pdb.set_trace() scratch_path = self.staging_dir / staging_path.path # TODO: switch this to using `get_filename` and `store_path` - with external.load_stream(staging_path.label) as f: - external_bytes = f.read() if scratch_path.exists(): self.preexisting.add(staging_path) - ... # TODO: something to check that the bytes are the same? + + with external.load_stream(staging_path.label) as f: + external_bytes = f.read() + ... # TODO: check that the bytes are the same if preexisting? 
+ scratch_path.parent.mkdir(exist_ok=True, parents=True) with open(scratch_path, mode='wb') as f: f.write(external_bytes) @@ -211,7 +215,7 @@ def __fspath__(self): def __repr__(self): return ( - f"{self.__class__.__name__}({self.scratch}, {self.external}, " + f"{self.__class__.__name__}('{self.scratch}', {self.external}, " f"{self.prefix})" ) @@ -234,10 +238,12 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, + delete_empty_dirs: bool = True, read_only: bool = False, ): super().__init__(scratch, external, prefix, staging=staging, - delete_staging=delete_staging) + delete_staging=delete_staging, + delete_empty_dirs=delete_empty_dirs) self.read_only = read_only def _get_other_shared(self, prefix: str, @@ -273,14 +279,14 @@ def transfer_single_file_to_external(self, held_file: StagingPath): _logger.debug("Read-only: Not transfering to external storage") return # early exit - super().transfer_single_file_to_external(held_file) + return super().transfer_single_file_to_external(held_file) def transfer_staging_to_external(self): if self.read_only: _logger.debug("Read-only: Not transfering to external storage") return # early exit - super().transfer_staging_to_external() + return super().transfer_staging_to_external() def register_path(self, staging_path: StagingPath): label_exists = self.external.exists(staging_path.label) @@ -308,9 +314,11 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, + delete_empty_dirs: bool = True, ): super().__init__(scratch, external, prefix, staging=staging, - delete_staging=delete_staging) + delete_staging=delete_staging, + delete_empty_dirs=delete_empty_dirs) self.shared = shared def _delete_staging_safe(self): @@ -323,7 +331,7 @@ def _delete_staging_safe(self): def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage - path = Path(held_file) + path = Path(held_file.fspath) if not path.exists(): 
self._load_file_from_external(self.shared, held_file) @@ -336,28 +344,58 @@ class StagingPath: On creation, this registers with a :class:`.StagingDirectory` that will manage the local path and transferring data with its :class:`.ExternalStorage`. + + This object can always be used as a FileLike (using, e.g., the standard + ``open`` builtin). This requires that a staged path that exists on an + external resource be downloaded into a local file when it is referenced. + + For a representation of a file that does not require the download (for + example, when deserializing results that point to files) instead use + :class:`.ExternalFile`. """ def __init__(self, root: StagingDirectory, path: Union[PathLike, str]): self.root = root self.path = Path(path) + + def register(self): + """Register this path with its StagingDirectory. + + If a file associated with this path exists in an external storage, + it will be downloaded to the staging area as part of registration. + """ self.root.register_path(self) def __truediv__(self, path: Union[PathLike, str]): return StagingPath(self.root, self.path / path) - def __fspath__(self): + def __eq__(self, other): + return (isinstance(other, StagingPath) + and self.root == other.root + and self.path == other.path) + + def __hash__(self): + return hash((self.root, self.path)) + + @property + def fspath(self): return str(self.root.staging_dir / self.path) + def __fspath__(self): + self.register() + return self.fspath + @property def label(self) -> str: """Label used in :class:`.ExternalStorage` for this path""" return str(self.root.prefix / self.path) def __repr__(self): - return f"StagingPath({self.__fspath__()})" + return f"StagingPath('{self.fspath}')" # TODO: how much of the pathlib.Path interface do we want to wrap? 
# although edge cases may be a pain, we can get most of it with, e.g.: # def exists(self): return Path(self).exists() # but also, can do pathlib.Path(staging_path) and get hte whole thing + + diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index dbba86ea..9061a874 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -4,108 +4,15 @@ from contextlib import contextmanager import shutil +from gufe.utils import delete_empty_dirs + from typing import Type from .externalresource import ExternalStorage, FileStorage from .stagingdirectory import SharedStaging, PermanentStaging -class DAGContextManager: - @classmethod - @contextmanager - def running_dag(cls, storage_manager: StorageManager, dag_label: str): - """Return a context manager for when a DAG is started. - - This context manager handles the DAG scale of the lifecycle. - """ - raise NotImplementedError() - - @contextmanager - def running_unit(self, unit_label: str): - """Return a context manager for when unit is started. - - This context manager handles the unit scale of the lifecycle. - """ - raise NotImplementedError() - -_DCMType = Type[DAGContextManager] # to shorten some lines - -class SingleProcDAGContextManager(DAGContextManager): - """Context manager to handle details of storage lifecycle. - - Making this a separate class ensures that ``running_unit`` is always - called within the context of a given DAG. This is usually not created - directly; instead, it is created (and used) with its ``running_dag`` - classmethod, typically from within a ``StorageManager``. 
- """ - def __init__(self, storage_manager: StorageManager, dag_label: str): - self.manager = storage_manager - self.dag_label = dag_label - self.permanents: list[PermanentStaging] = [] - - @classmethod # NB: classmethod must be on top - @contextmanager - def running_dag(cls, storage_manager: StorageManager, dag_label: str): - """DAG level of the storage lifecycle - - When the DAG is completed, transfer everything to the permanent - storage, and delete the staging area for permanent (if we are - supposed to). - - This is not usually called by users; instead it is called from - within the ``StorageManager``. - """ - dag_manager = cls(storage_manager, dag_label) - try: - yield dag_manager - finally: - for permanent in dag_manager.permanents: - permanent.transfer_staging_to_external() - - if not dag_manager.manager.keep_staging: - for d in dag_manager.permanents: - # import pdb; pdb.set_trace() - d.cleanup() - - @contextmanager - def running_unit(self, unit_label: str): - """Unit level of the storage lifecycle. - - This provides the staging directories used for scratch, shared, and - permanent. At the end of the unit, it transfers anything from shared - to the real shared external storage, cleans up the scratch - directory and the shared staging directory. - - Note that the unit label here is the *entire* label; that is, it - would also include information identifying the DAG. - """ - scratch = self.manager.get_scratch(unit_label) - shared = self.manager.get_shared(unit_label) - permanent = self.manager.get_permanent(unit_label) - try: - yield scratch, shared, permanent - finally: - # TODO: should some of this be in an else clause instead? 
- self.permanents.append(permanent) - shared.transfer_staging_to_external() - # everything in permanent must also be available in shared - for file in permanent.registry: - shared.transfer_single_file_to_external(file) - - if not self.manager.keep_scratch: - shutil.rmtree(scratch) - - if not self.manager.keep_staging: - shared.cleanup() - - class StorageManager: - """Tool to manage the storage lifecycle during a DAG. - - This object primarily contains the logic for getting the staging - directories. A separate class, in the ``DAGContextClass`` variable, - handles the logic for the context managers. - """ def __init__( self, scratch_root: PathLike, @@ -114,56 +21,107 @@ def __init__( *, keep_scratch: bool = False, keep_staging: bool = False, + keep_shared: bool = False, staging: PathLike = Path(".staging"), - DAGContextClass: _DCMType = SingleProcDAGContextManager, + delete_empty_dirs: bool = True, ): self.scratch_root = Path(scratch_root) self.shared_root = shared_root self.permanent_root = permanent_root self.keep_scratch = keep_scratch self.keep_staging = keep_staging + self.keep_shared = keep_shared self.staging = staging - self.DAGContextClass = DAGContextClass + self.delete_empty_dirs = delete_empty_dirs - def get_scratch(self, unit_label: str) -> Path: - """Get the path for this unit's scratch directory""" + # these are used to track what files can be deleted from shared if + # keep_shared is False + self.shared_xfer = set() + self.permanent_xfer = set() - scratch = self.scratch_root / "scratch" / unit_label - scratch.mkdir(parents=True, exist_ok=True) - return scratch - - def get_permanent(self, unit_label) -> PermanentStaging: - """Get the object for this unit's permanent staging directory""" - return PermanentStaging( + self.permanent_staging = PermanentStaging( scratch=self.scratch_root, external=self.permanent_root, shared=self.shared_root, - prefix=unit_label, staging=self.staging, + delete_empty_dirs=delete_empty_dirs, + prefix="" ) - def 
get_shared(self, unit_label) -> SharedStaging: - """Get the object for this unit's shared staging directory""" - return SharedStaging( + self.shared_staging = SharedStaging( scratch=self.scratch_root, external=self.shared_root, - prefix=unit_label, staging=self.staging, + delete_empty_dirs=delete_empty_dirs, + prefix="" # TODO: remove prefix ) - def running_dag(self, dag_label: str): - """Return a context manager that handles storage. + def make_label(self, dag_label, unit_label, attempt, **kwargs): + """ + + The specific executor may change this by making a very simple + adapter subclass and overriding this method, which can take + arbitrary additional kwargs that may tie it to a specific executor. + """ + return f"{dag_label}/{unit_label}_attempt_{attempt}" - For simple use cases, this is the only method a user needs to call. - Usage is something like: + @property + def _scratch_base(self): + return self.scratch_root / "scratch" - .. code:: + def _scratch_loc(self, dag_label, unit_label, attempt, **kwargs): + label = self.make_label(dag_label, unit_label, attempt) + return self._scratch_base / label - with manager.running_dag(dag_label) as dag_ctx: - for unit in dag_ordered_units: - label = f"{dag_ctx.dag_label}/{unit.key}" - with dag_ctx.running_unit(label) as dirs: - scratch, shared, permanent = dirs - # run the unit - """ - return self.DAGContextClass.running_dag(self, dag_label) + @contextmanager + def running_dag(self, dag_label): + # TODO: remove (or use) dag_label + try: + yield self + finally: + # import pdb; pdb.set_trace() + # clean up after DAG completes + self.permanent_staging.transfer_staging_to_external() + + if not self.keep_staging: + self.permanent_staging.cleanup() + + if not self.keep_shared: + for file in self.shared_xfer: + self.shared_root.delete(file.label) + + for file in self.permanent_xfer: + if self.shared_root != self.permanent_root: + self.shared_root.delete(file.label) + + if self.delete_empty_dirs: + 
delete_empty_dirs(self._scratch_base, delete_root=False) + + @contextmanager + def running_unit(self, dag_label, unit_label, **kwargs): + scratch = self._scratch_loc(dag_label, unit_label, **kwargs) + label = self.make_label(dag_label, unit_label, **kwargs) + scratch.mkdir(parents=True, exist_ok=True) + shared = self.shared_staging / label + permanent = self.permanent_staging / label + try: + yield scratch, shared, permanent + finally: + # import pdb; pdb.set_trace() + # clean up after unit + + # track the files that were in shared so that we can delete them + # at the end of the DAG if requires + shared_xfers = self.shared_staging.transfer_staging_to_external() + self.shared_xfer.update(set(shared_xfers)) + + # everything in permanent should also be in shared + for file in self.permanent_staging.registry: + self.shared_staging.transfer_single_file_to_external(file) + self.permanent_xfer.add(file) + + if not self.keep_scratch: + shutil.rmtree(scratch) + + if not self.keep_staging: + self.shared_staging.cleanup() diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index bed95378..f94a1888 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -7,8 +7,8 @@ from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.stagingdirectory import ( - SharedStaging, PermanentStaging, _delete_empty_dirs, - _safe_to_delete_staging + SharedStaging, PermanentStaging, _safe_to_delete_staging, + delete_empty_dirs, # TODO: move to appropriate place ) @pytest.fixture @@ -43,6 +43,8 @@ def read_only_with_overwritten(root_with_contents): filename = pathlib.Path(read_only) / "data.txt" assert not filename.exists() staged = read_only / "data.txt" + assert not filename.exists() + staged.__fspath__() assert filename.exists() with open(staged, mode='w') as f: f.write("changed") @@ -97,7 +99,7 @@ def test_delete_empty_dirs(tmp_path): 
path.parent.mkdir(parents=True, exist_ok=True) path.touch() - _delete_empty_dirs(base) + delete_empty_dirs(base) for path in paths: assert path.exists() @@ -116,7 +118,7 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): for directory in dirs: directory.mkdir(parents=True, exist_ok=True) - _delete_empty_dirs(base, delete_root=delete_root) + delete_empty_dirs(base, delete_root=delete_root) for directory in dirs: assert not directory.exists() @@ -180,8 +182,9 @@ def test_write_new(self, root): def test_write_old_fail(self, root): old_staging = root._get_other_shared("old_unit") + staged = old_staging / "foo.txt" with pytest.raises(IOError, match="read-only"): - old_staging / "foo.txt" + staged.__fspath__() def test_transfer_to_external(self, root_with_contents): path = list(root_with_contents.registry)[0] # only 1 @@ -262,6 +265,7 @@ def test_cleanup(self, root_with_contents): def test_cleanup_missing(self, root, caplog): root.delete_staging = True file = root / "foo.txt" + file.__fspath__() assert file in root.registry assert not pathlib.Path(file).exists() logger_name = "gufe.storage.stagingdirectory" @@ -279,6 +283,9 @@ def test_register_cleanup_preexisting_file(self, root): assert len(root.preexisting) == 0 staging = root / "foo.txt" assert staging.label == "new_unit/foo.txt" + assert len(root.registry) == 0 + assert len(root.preexisting) == 0 + staging.__fspath__() assert len(root.registry) == 1 assert len(root.preexisting) == 1 @@ -305,6 +312,7 @@ def test_load_missing_for_transfer(self, permanent): fname = pathlib.Path(permanent) / "old_unit/data.txt" assert not fname.exists() staging = permanent / "old_unit/data.txt" + staging.__fspath__() assert not fname.exists() assert permanent.external._data == {} permanent.transfer_staging_to_external() diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index fb584be1..80af74c1 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ 
b/gufe/tests/storage/test_storagemanager.py @@ -1,7 +1,5 @@ import pytest -from gufe.storage.storagemanager import ( - StorageManager -) +from gufe.storage.storagemanager import StorageManager from gufe.storage.stagingdirectory import StagingDirectory from gufe.storage.externalresource import MemoryStorage, FileStorage from pathlib import Path @@ -35,7 +33,7 @@ def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() # TODO: this will change; the inputs should include a way to get # the previous shared unit label - with shared.other_shared("dag/unit1") as prev_shared: + with shared.root.other_shared("dag/unit1_attempt_0") as prev_shared: with open(prev_shared / "bar.txt", mode='r') as f: bar = f.read() @@ -48,6 +46,7 @@ def run(self, scratch, shared, permanent): return [Unit1(), Unit2()] + class LifecycleHarness: @pytest.fixture def storage_manager(self, tmp_path): @@ -58,10 +57,10 @@ def get_files_dict(storage_manager): root = storage_manager.scratch_root staging = storage_manager.staging return { - "foo": root / "scratch/dag/unit1/foo.txt", - "foo2": root / "scratch/dag/unit2/foo2.txt", - "bar": root / staging / "dag/unit1/bar.txt", - "baz": root / staging / "dag/unit1/baz.txt", + "foo": root / "scratch/dag/unit1_attempt_0/foo.txt", + "foo2": root / "scratch/dag/unit2_attempt_0/foo2.txt", + "bar": root / staging / "dag/unit1_attempt_0/bar.txt", + "baz": root / staging / "dag/unit1_attempt_0/baz.txt", } def test_lifecycle(self, storage_manager, dag_units, tmp_path): @@ -69,9 +68,12 @@ def test_lifecycle(self, storage_manager, dag_units, tmp_path): dag_label = "dag" with storage_manager.running_dag(dag_label) as dag_ctx: for unit in dag_units: - label = f"{dag_ctx.dag_label}/{unit.key}" - with dag_ctx.running_unit(label) as (scratch, shared, perm): + label = f"{dag_label}/{unit.key}" + with dag_ctx.running_unit(dag_label, unit.key, attempt=0) as ( + scratch, shared, perm + ): results.append(unit.run(scratch, shared, perm)) + # import pdb; 
pdb.set_trace() self.in_unit_asserts(storage_manager, label) self.after_unit_asserts(storage_manager, label) self.after_dag_asserts(storage_manager) @@ -117,7 +119,8 @@ def in_unit_asserts(self, storage_manager, unit_label): permanent_root = storage_manager.permanent_root expected_in_shared = { "dag/unit1": set(), - "dag/unit2": {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + "dag/unit2": {"dag/unit1_attempt_0/bar.txt", + "dag/unit1_attempt_0/baz.txt"} }[unit_label] | self._in_staging_shared(unit_label, "in") assert set(shared_root.iter_contents()) == expected_in_shared @@ -134,7 +137,8 @@ def after_unit_asserts(self, storage_manager, unit_label): permanent_root = storage_manager.permanent_root shared_extras = self._in_staging_shared(unit_label, "after") permanent_extras = self._in_staging_permanent(unit_label, "after") - expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + expected_in_shared = {"dag/unit1_attempt_0/bar.txt", + "dag/unit1_attempt_0/baz.txt"} expected_in_shared |= shared_extras assert set(shared_root.iter_contents()) == expected_in_shared assert set(permanent_root.iter_contents()) == permanent_extras @@ -155,7 +159,8 @@ def after_dag_asserts(self, storage_manager): # expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} # expected_in_shared |= shared_extras # assert set(shared_root.iter_contents()) == expected_in_shared - expected_in_permanent = {"dag/unit1/baz.txt"} | permanent_extras + expected_in_permanent = ({"dag/unit1_attempt_0/baz.txt"} + | permanent_extras) assert set(permanent_root.iter_contents()) == expected_in_permanent # manager-specific check for files @@ -239,16 +244,14 @@ def _after_unit_existing_files(self, unit_label): return {"bar", "baz"} def _after_dag_existing_files(self): - # NOTE: currently we don't delete bar at the end of a cycle, but we - # don't guarantee that we would not. So it exists, but changing that - # isn't API-breaking. 
- return {"bar", "baz"} + # these get deleted because we don't keep shared here + return set() def _in_staging_shared(self, unit_label, in_after): - bar = "dag/unit1/bar.txt" - baz = "dag/unit1/baz.txt" - foo = "scratch/dag/unit1/foo.txt" - foo2 = "scratch/dag/unit2/foo2.txt" + bar = "dag/unit1_attempt_0/bar.txt" + baz = "dag/unit1_attempt_0/baz.txt" + foo = "scratch/dag/unit1_attempt_0/foo.txt" + foo2 = "scratch/dag/unit2_attempt_0/foo2.txt" return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {bar, baz}, @@ -278,10 +281,10 @@ def _after_dag_existing_files(self): return {"baz"} def _in_staging_permanent(self, unit_label, in_after): - bar = "dag/unit1/bar.txt" - baz = "dag/unit1/baz.txt" - foo = "scratch/dag/unit1/foo.txt" - foo2 = "scratch/dag/unit2/foo2.txt" + bar = "dag/unit1_attempt_0/bar.txt" + baz = "dag/unit1_attempt_0/baz.txt" + foo = "scratch/dag/unit1_attempt_0/foo.txt" + foo2 = "scratch/dag/unit2_attempt_0/foo2.txt" return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {baz}, @@ -292,20 +295,3 @@ def _in_staging_permanent(self, unit_label, in_after): def _after_unit_existing_files(self, unit_label): # same for both; all files come from unit 1 return {"baz"} - - -class TestStorageManager: - def test_get_scratch(self, storage_manager_std): - scratch = storage_manager_std.get_scratch("dag_label/unit_label") - assert str(scratch).endswith("scratch/dag_label/unit_label") - assert isinstance(scratch, Path) - - def test_get_permanent(self, storage_manager_std): - perm = storage_manager_std.get_permanent("dag_label/unit_label") - assert perm.__fspath__().endswith(".staging/dag_label/unit_label") - assert isinstance(perm, StagingDirectory) - - def test_get_shared(self, storage_manager_std): - shared = storage_manager_std.get_shared("dag_label/unit_label") - assert shared.__fspath__().endswith(".staging/dag_label/unit_label") - assert isinstance(shared, StagingDirectory) diff --git a/gufe/utils.py b/gufe/utils.py index 
f9d3b0ff..4382a801 100644 --- a/gufe/utils.py +++ b/gufe/utils.py @@ -4,6 +4,12 @@ import io import warnings +from os import PathLike, rmdir +import pathlib + +import logging +_logger = logging.getLogger(__name__) + class ensure_filelike: """Context manager to convert pathlike or filelike to filelike. @@ -52,3 +58,25 @@ def __exit__(self, type, value, traceback): if self.do_close: self.context.close() + + +def delete_empty_dirs(root: PathLike, delete_root: bool = True): + """Delete all empty directories. + + Repeats so that directories that only contained empty directories also + get deleted. + """ + root = pathlib.Path(root) + + def find_empty_dirs(directory): + if not (paths := list(directory.iterdir())): + return [directory] + directories = [p for p in paths if p.is_dir()] + return sum([find_empty_dirs(d) for d in directories], []) + + while root.exists() and (empties := find_empty_dirs(root)): + if empties == [root] and not delete_root: + return + for directory in empties: + _logger.debug(f"Removing '{directory}'") + rmdir(directory) From a575dd3db002b65cc0744534accf1e7d5b722af0 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 11:08:28 -0600 Subject: [PATCH 35/69] make mypy happy --- gufe/storage/storagemanager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 9061a874..61665b7f 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -10,6 +10,7 @@ from .externalresource import ExternalStorage, FileStorage from .stagingdirectory import SharedStaging, PermanentStaging +from .stagingdirectory import StagingPath # typing class StorageManager: @@ -36,8 +37,8 @@ def __init__( # these are used to track what files can be deleted from shared if # keep_shared is False - self.shared_xfer = set() - self.permanent_xfer = set() + self.shared_xfer: set[StagingPath] = set() + self.permanent_xfer: set[StagingPath] = set() self.permanent_staging = PermanentStaging( scratch=self.scratch_root, From 24d382015e84f20f5de96972680dab1355adf14c Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 11:46:28 -0600 Subject: [PATCH 36/69] Use StorageManager/StagingPath in execute_DAG --- gufe/protocols/protocoldag.py | 128 ++++++--- gufe/protocols/protocolunit.py | 5 +- gufe/tests/storage/test_storage_demo.py | 364 ++++++++++++++++++++++++ 3 files changed, 455 insertions(+), 42 deletions(-) create mode 100644 gufe/tests/storage/test_storage_demo.py diff --git a/gufe/protocols/protocoldag.py b/gufe/protocols/protocoldag.py index 85978136..7f044457 100644 --- a/gufe/protocols/protocoldag.py +++ b/gufe/protocols/protocoldag.py @@ -17,6 +17,13 @@ ProtocolUnit, ProtocolUnitResult, ProtocolUnitFailure, Context ) +from ..storage.storagemanager import StorageManager +from ..storage.externalresource.filestorage import FileStorage +from ..storage.externalresource.base import ExternalStorage + +import logging +_logger = logging.getLogger(__name__) + class DAGMixin: _protocol_units: list[ProtocolUnit] @@ -345,9 +352,23 @@ def _from_dict(cls, dct: dict): return cls(**dct) +class ReproduceOldBehaviorStorageManager(StorageManager): + # Default behavior has scratch at {dag_label}/scratch/{unit_label} and + # shared at {dag_label}/{unit_label}. This little class makes changes + # that get us back to the original behavior of this class: scratch at + # {dag_label}/scratch_{unit_label} and shared at + # {dag_label}/shared_{unit_label}. 
+ def _scratch_loc(self, dag_label, unit_label, attempt): + return self.scratch_root / f"scratch_{unit_label}_attempt_{attempt}" + + def make_label(self, dag_label, unit_label, attempt): + return f"{dag_label}/shared_{unit_label}_attempt_{attempt}" + + def execute_DAG(protocoldag: ProtocolDAG, *, - shared_basedir: Path, - scratch_basedir: Path, + shared_basedir: Optional[PathLike], + scratch_basedir: PathLike, + shared: Optional[ExternalStorage] = None, keep_shared: bool = False, keep_scratch: bool = False, raise_error: bool = True, @@ -385,52 +406,77 @@ def execute_DAG(protocoldag: ProtocolDAG, *, The result of executing the `ProtocolDAG`. """ + # the directory given as shared_root is actually the directory for this + # DAG; the "shared_root" for the storage manager is the parent. We'll + # force permanent to be the same. + if shared is None: + shared = FileStorage(shared_basedir.parent) + dag_label = shared_basedir.name + storage_manager = ReproduceOldBehaviorStorageManager( + scratch_root=scratch_basedir, + shared_root=shared, + permanent_root=shared, + keep_scratch=keep_scratch, + keep_shared=keep_shared, + keep_staging=True, + delete_empty_dirs=False, + #staging=Path(""), # use the actual directories as the staging + ) + return new_execute_DAG(protocoldag, dag_label, storage_manager, + raise_error, n_retries) + + +def new_execute_DAG( + protocoldag, + dag_label, + storage_manager, + raise_error=False, + n_retries=0 +): + # this simplifies setup of execute_DAG by allowing you to directly + # provide the storage_manager; the extra option in the old one just + # configure the storage_manager if n_retries < 0: raise ValueError("Must give positive number of retries") # iterate in DAG order results: dict[GufeKey, ProtocolUnitResult] = {} all_results = [] # successes AND failures - shared_paths = [] - for unit in protocoldag.protocol_units: - # translate each `ProtocolUnit` in input into corresponding - # `ProtocolUnitResult` - inputs = _pu_to_pur(unit.inputs, 
results) - - attempt = 0 - while attempt <= n_retries: - shared = shared_basedir / f'shared_{str(unit.key)}_attempt_{attempt}' - shared_paths.append(shared) - shared.mkdir() - - scratch = scratch_basedir / f'scratch_{str(unit.key)}_attempt_{attempt}' - scratch.mkdir() - - context = Context(shared=shared, - scratch=scratch) - - # execute - result = unit.execute( - context=context, - raise_error=raise_error, - **inputs) - all_results.append(result) - - if not keep_scratch: - shutil.rmtree(scratch) - - if result.ok(): - # attach result to this `ProtocolUnit` - results[unit.key] = result - break - attempt += 1 - if not result.ok(): - break - - if not keep_shared: - for shared_path in shared_paths: - shutil.rmtree(shared_path) + with storage_manager.running_dag(dag_label) as dag_ctx: + for unit in protocoldag.protocol_units: + # import pdb; pdb.set_trace() + attempt = 0 + while attempt <= n_retries: + # translate each `ProtocolUnit` in input into corresponding + # `ProtocolUnitResult` + inputs = _pu_to_pur(unit.inputs, results) + + label = storage_manager.make_label(dag_label, unit.key, + attempt=attempt) + with dag_ctx.running_unit(dag_label, unit.key, attempt=attempt) as ( + scratch, shared, perm + ): + # TODO: context manager should return context + context = Context(shared=shared, + scratch=scratch, + permanent=perm) + _logger.info("Starting unit {label}") + _logger.info(context) + result = unit.execute( + context=context, + raise_error=raise_error, + **inputs) + all_results.append(result) + + if result.ok(): + # attach result to this `ProtocolUnit` + results[unit.key] = result + break + attempt += 1 + + if not result.ok(): + break return ProtocolDAGResult( name=protocoldag.name, diff --git a/gufe/protocols/protocolunit.py b/gufe/protocols/protocolunit.py index 73728bdf..77e1f913 100644 --- a/gufe/protocols/protocolunit.py +++ b/gufe/protocols/protocolunit.py @@ -23,6 +23,8 @@ GufeTokenizable, GufeKey, TOKENIZABLE_REGISTRY ) +from ..storage.stagingdirectory import 
StagingDirectory + @dataclass class Context: @@ -31,7 +33,8 @@ class Context: """ scratch: PathLike - shared: PathLike + shared: StagingDirectory + permanent: StagingDirectory def _list_dependencies(inputs, cls): diff --git a/gufe/tests/storage/test_storage_demo.py b/gufe/tests/storage/test_storage_demo.py new file mode 100644 index 00000000..49011784 --- /dev/null +++ b/gufe/tests/storage/test_storage_demo.py @@ -0,0 +1,364 @@ +import pytest + +import pathlib + +import gufe +from gufe.storage.externalresource import MemoryStorage, FileStorage +from gufe.storage.storagemanager import StorageManager +from gufe.storage.stagingdirectory import StagingPath +from gufe.protocols.protocoldag import new_execute_DAG + +""" +This module contains complete integration tests for the storage lifecycle, +using an actual protocol as an example. + +These tests are largely redundant from the perspective of unit testing, but +the :class:`.StoragedDemoProtocol` is useful as an example for +implementation. Furthermore, as integration tests, they ensure that the +whole setup works together. 
+""" + + +class Unit1(gufe.ProtocolUnit): + def _execute(self, ctx): + share_file = ctx.shared / "shared.txt" + with open(share_file, mode='w') as f: + f.write("I can be shared") + + perm_file = ctx.permanent / "permanent.txt" + with open(perm_file, mode='w') as f: + f.write("I'm permanent (but I can be shared)") + + scratch_file = ctx.scratch / "scratch.txt" + with open(scratch_file, mode='w') as f: + f.write("This is scratch -- can't be shared") + + return {'share_file': share_file, + 'perm_file': perm_file, + 'scratch_file': scratch_file} + + +class Unit2(gufe.ProtocolUnit): + def _execute(self, ctx, unit1_result): + u1_outputs = unit1_result.outputs + + outputs = {} + for file_label, file in unit1_result.outputs.items(): + # import pdb; pdb.set_trace() + # labels are, e.g., share_file; file is StagingPath + key = f"{file_label}_contents" + try: + with open(file, mode='r') as f: + outputs[key] = f.read() + except FileNotFoundError: + outputs[key] = "File not found" + + return outputs + + +class StorageDemoProtocol(gufe.Protocol): + @classmethod + def _default_settings(cls): + return {} + + @classmethod + def _defaults(cls): + return {} + + def _create(self, stateA, stateB, mapping, extends): + u1 = Unit1() + u2 = Unit2(unit1_result=u1) + return [u1, u2] + + def _gather(self, protocol_dag_results): + return {} + +@pytest.fixture +def demo_dag(solvated_ligand, solvated_complex): + transformation = gufe.Transformation( + solvated_ligand, + solvated_complex, + protocol=StorageDemoProtocol(StorageDemoProtocol.default_settings()), + mapping=None + ) + dag = transformation.create() + return dag + + +class ExecutionStorageDemoTest: + """ + Template method pattern ABC for tests of StorageDemoProtocol execution. + + Using template method here because it ensures that all aspects get + tested for all implementations, even though individual aspects may + differ between different setups. 
+ """ + def get_shared_and_permanent(self): + raise NotImplementedError() + + @staticmethod + def _parse_keep(keep): + return ( + 'scratch' in keep, + 'staging' in keep, + 'shared' in keep, + 'empties' in keep + ) + + def assert_dag_result(self, result, demo_dag, storage_manager): + """Test that the ProtocolDAGResult has the expected contents. + + This should be preserved across all execution methods. + """ + u1_label = self.u1_label(demo_dag) + keep_scratch = storage_manager.keep_scratch + + assert result.ok + assert len(result.protocol_unit_results) == 2 + res1, res2 = result.protocol_unit_results + assert set(res1.outputs) == {'share_file', 'perm_file', 'scratch_file'} + assert isinstance(res1.outputs['scratch_file'], pathlib.Path) + assert isinstance(res1.outputs['share_file'], StagingPath) + assert isinstance(res1.outputs['perm_file'], StagingPath) + + if keep_scratch: + scratch_res2 = "This is scratch -- can't be shared" + else: + scratch_res2 = "File not found" + + assert res2.outputs == { + 'share_file_contents': "I can be shared", + 'perm_file_contents': "I'm permanent (but I can be shared)", + 'scratch_file_contents': scratch_res2 + } + + def assert_shared_and_permanent(self, storage_manager, dag): + """Check the final status of the shared and permanent containers. + + The can depend on the relation between the shared and permanent + external storage containers. For example, if they are the same + object, the final contents of permament will also include the final + contents of shared (and vice versa). + + Default behavior here is for the case of distinct backends. 
+ """ + shared = storage_manager.shared_root + permanent = storage_manager.permanent_root + u1_label = self.u1_label(dag) + keep_shared = storage_manager.keep_shared + + perm_file = f"{u1_label}/permanent.txt" + shared_file = f"{u1_label}/shared.txt" + + assert list(permanent.iter_contents()) == [perm_file] + with permanent.load_stream(perm_file) as f: + assert f.read() == b"I'm permanent (but I can be shared)" + + if keep_shared: + assert list(shared.iter_contents()) == [shared_file, perm_file] + with shared.load_stream(shared_file) as f: + assert f.read() == b"I can be shared" + with shared.load_stream(perm_file) as f: + assert f.read() == b"I'm permanent (but I can be shared)" + else: + assert list(shared.iter_contents()) == [] + + def assert_scratch(self, storage_manager): + """Check the final status of the scratch directory. + + This will change if the scratch is within the staging root directory + (for cases where we want to keep one of staging/scratch and not the + other; empty directories might get deleted in one case). + """ + scratch = storage_manager.scratch_root + keep_scratch = storage_manager.keep_scratch + del_empty_dirs = storage_manager.delete_empty_dirs + assert scratch.is_dir() + + if keep_scratch: + n_expected = 1 if del_empty_dirs else 2 + dag_dir = scratch / "scratch/dag" + assert len(list(dag_dir.iterdir())) == n_expected + else: + assert 'scratch' not in list(scratch.iterdir()) + + def assert_staging(self, storage_manager, dag): + """Check the final status of the staging directory. + + Behavior here will change if staging overlaps with a FileStorage for + either shared or permanent. 
+ """ + keep_staging = storage_manager.keep_staging + u1_label = self.u1_label(dag) + scratch_root = storage_manager.scratch_root + u1_staging = scratch_root / ".staging" / u1_label + + if keep_staging: + assert (u1_staging / "shared.txt").exists() + assert (u1_staging / "permanent.txt").exists() + else: + assert ".staging" not in list(scratch_root.iterdir()) + + @staticmethod + def u1_label(dag): + """Unit 1 label""" + return f"dag/{dag.protocol_units[0].key}_attempt_0" + + @staticmethod + def u2_label(dag): + """Unit 2 label""" + return f"dag/{dag.protocol_units[1].key}_attempt_0" + + def get_storage_manager(self, keep, tmp_path): + keep_scr, keep_sta, keep_sha, empties = self._parse_keep(keep) + del_empty_dirs = not empties + shared, permanent = self.get_shared_and_permanent() + + storage_manager = StorageManager( + scratch_root=tmp_path, + shared_root=shared, + permanent_root=permanent, + keep_scratch=keep_scr, + keep_staging=keep_sta, + keep_shared=keep_sha, + delete_empty_dirs=del_empty_dirs, + ) + return storage_manager + + @pytest.mark.parametrize('keep', [ + 'nothing', 'scratch', 'staging', 'shared', 'scratch,staging', + 'scratch,shared', 'staging,shared', 'scratch,staging,shared', + 'scratch,empties', 'scratch,shared,empties', + ]) + def test_execute_dag(self, demo_dag, keep, tmp_path): + storage_manager = self.get_storage_manager(keep, tmp_path) + + dag_label = "dag" + result = new_execute_DAG(demo_dag, dag_label, storage_manager, + raise_error=True, n_retries=2) + + self.assert_dag_result(result, demo_dag, storage_manager) + self.assert_shared_and_permanent(storage_manager, demo_dag) + self.assert_scratch(storage_manager) + self.assert_staging(storage_manager, demo_dag) + + +class TestExecuteStorageDemoDiffBackends(ExecutionStorageDemoTest): + """ + Test execution when permanent and shared are different MemoryStorages. 
+ + This is considered the standard base case; this should be easiest to + pass, as there should be no special case code that needs to be invoked. + """ + def get_shared_and_permanent(self): + return MemoryStorage(), MemoryStorage() + + +class TestExecuteStorageDemoSameBackend(ExecutionStorageDemoTest): + """ + Test execution when permanent and shared are the same MemoryStorage. + """ + def get_shared_and_permanent(self): + backend = MemoryStorage() + return backend, backend + + def assert_shared_and_permanent(self, storage_manager, dag): + shared = storage_manager.shared_root + permanent = storage_manager.permanent_root + u1_label = self.u1_label(dag) + keep_shared = storage_manager.keep_shared + + perm_file = f"{u1_label}/permanent.txt" + shared_file = f"{u1_label}/shared.txt" + + assert shared is permanent + # we'll test everything in permanent, because shared is identical + + if keep_shared: + expected = {perm_file, shared_file} + else: + expected = {perm_file} + + assert set(permanent.iter_contents()) == expected + with permanent.load_stream(perm_file) as f: + assert f.read() == b"I'm permanent (but I can be shared)" + + if keep_shared: + with permanent.load_stream(shared_file) as f: + assert f.read() == b"I can be shared" + + +class TestExecuteStorageDemoStagingOverlap(TestExecuteStorageDemoSameBackend): + """ + Test execution when permanent and shared overlap with staging. + + This represents the approach we will probably actually use. In this + case, we use identical FileStorage for shared and permanent, and those + overlap with the staging directory. The result is that file locations + don't actually change. + """ + def get_shared_and_permanent(self): + ... 
# override the need for this; not the prettiest, but it works + + def get_storage_manager(self, keep, tmp_path): + keep_scr, keep_sta, keep_sha, empties = self._parse_keep(keep) + del_empty_dirs = not empties + backend = FileStorage(tmp_path) + storage_manager = StorageManager( + scratch_root=tmp_path, + shared_root=backend, + permanent_root=backend, + keep_scratch=keep_scr, + keep_staging=keep_sta, + keep_shared=keep_sha, + delete_empty_dirs=del_empty_dirs, + staging="", + ) + return storage_manager + + def assert_shared_and_permanent(self, storage_manager, dag): + shared = storage_manager.shared_root + permanent = storage_manager.permanent_root + u1_label = self.u1_label(dag) + keep_shared = storage_manager.keep_shared + keep_scratch = storage_manager.keep_scratch + + perm_file = f"{u1_label}/permanent.txt" + shared_file = f"{u1_label}/shared.txt" + scratch_file = f"scratch/{u1_label}/scratch.txt" + + assert shared is permanent + # we'll test everything in permanent, because shared is identical + + expected = {perm_file} + + if keep_shared: + expected.add(shared_file) + + if keep_scratch: + expected.add(scratch_file) + + assert set(permanent.iter_contents()) == expected + with permanent.load_stream(perm_file) as f: + assert f.read() == b"I'm permanent (but I can be shared)" + + if keep_shared: + with permanent.load_stream(shared_file) as f: + assert f.read() == b"I can be shared" + + if keep_scratch: + with permanent.load_stream(scratch_file) as f: + assert f.read() == b"This is scratch -- can't be shared" + + def assert_staging(self, storage_manager, dag): + # in this case, keep_staging is ignored in favor of the behavior of + # keep_shared + keep_shared = storage_manager.keep_shared + u1_label = self.u1_label(dag) + scratch_root = storage_manager.scratch_root + u1_staging = scratch_root / u1_label + + assert (u1_staging / "permanent.txt").exists() + + if keep_shared: + assert (u1_staging / "shared.txt").exists() From 0c973b7a6ea7ac3388fc77c196d9db69eaad9fe1 Mon 
Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 1 Dec 2023 11:49:33 -0600 Subject: [PATCH 37/69] fix up protocol tests --- gufe/tests/test_protocol.py | 3 ++- gufe/tests/test_protocoldag.py | 16 ++++++++-------- gufe/tests/test_protocolunit.py | 24 ++++++++++++++++++------ 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/gufe/tests/test_protocol.py b/gufe/tests/test_protocol.py index f700fa45..6a468bfd 100644 --- a/gufe/tests/test_protocol.py +++ b/gufe/tests/test_protocol.py @@ -696,7 +696,8 @@ def test_execute_DAG_retries(solvated_ligand, vacuum_ligand, tmpdir): # we did 3 retries, so 4 total failures assert len(r.protocol_unit_results) == 5 assert len(r.protocol_unit_failures) == 4 - assert len(list(shared.iterdir())) == 5 + assert len(list(shared.iterdir())) == 0 + # assert len(list(shared.iterdir())) == 5 def test_execute_DAG_bad_nretries(solvated_ligand, vacuum_ligand, tmpdir): diff --git a/gufe/tests/test_protocoldag.py b/gufe/tests/test_protocoldag.py index a4a08a7b..d248ec30 100644 --- a/gufe/tests/test_protocoldag.py +++ b/gufe/tests/test_protocoldag.py @@ -14,9 +14,9 @@ class WriterUnit(gufe.ProtocolUnit): def _execute(ctx, **inputs): my_id = inputs['identity'] - with open(os.path.join(ctx.shared, f'unit_{my_id}_shared.txt'), 'w') as out: + with open(ctx.shared / f'unit_{my_id}_shared.txt', 'w') as out: out.write(f'unit {my_id} existed!\n') - with open(os.path.join(ctx.scratch, f'unit_{my_id}_scratch.txt'), 'w') as out: + with open(ctx.scratch / f'unit_{my_id}_scratch.txt', 'w') as out: out.write(f'unit {my_id} was here\n') return { @@ -94,12 +94,12 @@ def test_execute_dag(tmpdir, keep_shared, keep_scratch, writefile_dag): # will have produced 4 files in scratch and shared directory for pu in writefile_dag.protocol_units: identity = pu.inputs['identity'] - shared_file = os.path.join(shared, - f'shared_{str(pu.key)}_attempt_0', - f'unit_{identity}_shared.txt') - scratch_file = os.path.join(scratch, - 
f'scratch_{str(pu.key)}_attempt_0', - f'unit_{identity}_scratch.txt') + shared_file = (shared + / f'shared_{str(pu.key)}_attempt_0' + / f'unit_{identity}_shared.txt') + scratch_file = (scratch + / f'scratch_{str(pu.key)}_attempt_0' + / f'unit_{identity}_scratch.txt') if keep_shared: assert os.path.exists(shared_file) else: diff --git a/gufe/tests/test_protocolunit.py b/gufe/tests/test_protocolunit.py index 9896f856..2daaa687 100644 --- a/gufe/tests/test_protocolunit.py +++ b/gufe/tests/test_protocolunit.py @@ -57,8 +57,12 @@ def test_execute(self, tmpdir): scratch = Path('scratch') / str(unit.key) scratch.mkdir(parents=True) - ctx = Context(shared=shared, scratch=scratch) - + permanent = Path('permanent') / str(unit.key) + permanent.mkdir(parents=True) + + ctx = Context(shared=shared, scratch=scratch, + permanent=permanent) + u: ProtocolUnitFailure = unit.execute(context=ctx, an_input=3) assert u.exception[0] == "ValueError" @@ -70,8 +74,12 @@ def test_execute(self, tmpdir): scratch = Path('scratch') / str(unit.key) scratch.mkdir(parents=True) - ctx = Context(shared=shared, scratch=scratch) - + permanent = Path('permanent') / str(unit.key) + permanent.mkdir(parents=True) + + ctx = Context(shared=shared, scratch=scratch, + permanent=permanent) + # now try actually letting the error raise on execute with pytest.raises(ValueError, match="should always be 2"): unit.execute(context=ctx, raise_error=True, an_input=3) @@ -87,8 +95,12 @@ def test_execute_KeyboardInterrupt(self, tmpdir): scratch = Path('scratch') / str(unit.key) scratch.mkdir(parents=True) - ctx = Context(shared=shared, scratch=scratch) - + permanent = Path('permanent') / str(unit.key) + permanent.mkdir(parents=True) + + ctx = Context(shared=shared, scratch=scratch, + permanent=permanent) + with pytest.raises(KeyboardInterrupt): unit.execute(context=ctx, an_input=3) From 9260a2da75446d3ff81d3e1304580e7a23c3fe44 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 11:53:20 -0600 Subject: [PATCH 38/69] Add __eq__ for ext resources Also fixes file copying when copying to a nested directory --- gufe/storage/externalresource/filestorage.py | 11 +++++++++-- gufe/storage/externalresource/memorystorage.py | 3 +++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/gufe/storage/externalresource/filestorage.py b/gufe/storage/externalresource/filestorage.py index 164538e3..5330f88b 100644 --- a/gufe/storage/externalresource/filestorage.py +++ b/gufe/storage/externalresource/filestorage.py @@ -20,6 +20,12 @@ def __init__(self, root_dir: Union[pathlib.Path, str]): def _exists(self, location): return self._as_path(location).exists() + def __eq__(self, other): + return ( + isinstance(other, FileStorage) + and self.root_dir == other.root_dir + ) + def _store_bytes(self, location, byte_data): path = self._as_path(location) directory = path.parent @@ -30,8 +36,9 @@ def _store_bytes(self, location, byte_data): f.write(byte_data) def _store_path(self, location, path): - my_path = self._as_path(location) - if path.resolve() != my_path.resolve(): + my_path = self._as_path(location).resolve() + if path.resolve() != my_path: + my_path.parent.mkdir(parents=True, exist_ok=True) shutil.copyfile(path, my_path) def _iter_contents(self, prefix): diff --git a/gufe/storage/externalresource/memorystorage.py b/gufe/storage/externalresource/memorystorage.py index da0931bd..40c05087 100644 --- a/gufe/storage/externalresource/memorystorage.py +++ b/gufe/storage/externalresource/memorystorage.py @@ -26,6 +26,9 @@ def _delete(self, location): f"Unable to delete '{location}': key does not exist" ) + def __eq__(self, other): + return self is other + def _store_bytes(self, location, byte_data): self._data[location] = byte_data return location, self.get_metadata(location) From d97fca43b7222441b82453cebb1c992a4c56edc6 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 11:56:55 -0600 Subject: [PATCH 39/69] Test: FileStorage.store_path for nested target --- gufe/tests/storage/test_externalresource.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gufe/tests/storage/test_externalresource.py b/gufe/tests/storage/test_externalresource.py index 2ce3a9ff..868407f1 100644 --- a/gufe/tests/storage/test_externalresource.py +++ b/gufe/tests/storage/test_externalresource.py @@ -52,14 +52,16 @@ def test_store_bytes(self, file_storage): with open(fileloc, 'rb') as f: assert as_bytes == f.read() - def test_store_path(self, file_storage): + @pytest.mark.parametrize('nested', [True, False]) + def test_store_path(self, file_storage, nested): orig_file = file_storage.root_dir / ".hidden" / "bar.txt" orig_file.parent.mkdir() as_bytes = "This is bar".encode('utf-8') with open(orig_file, 'wb') as f: f.write(as_bytes) - fileloc = file_storage.root_dir / "bar.txt" + nested_dir = "nested" if nested else "" + fileloc = file_storage.root_dir / nested_dir / "bar.txt" assert not fileloc.exists() file_storage.store_path(fileloc, orig_file) From a01feaf14a111668889ccaf527954fda8959a4ad Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 12:21:53 -0600 Subject: [PATCH 40/69] add tests for ext resource __eq__ --- gufe/tests/storage/test_externalresource.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/gufe/tests/storage/test_externalresource.py b/gufe/tests/storage/test_externalresource.py index 868407f1..43736991 100644 --- a/gufe/tests/storage/test_externalresource.py +++ b/gufe/tests/storage/test_externalresource.py @@ -70,6 +70,12 @@ def test_store_path(self, file_storage, nested): with open(fileloc, 'rb') as f: assert as_bytes == f.read() + def test_eq(self, tmp_path): + reference = FileStorage(tmp_path) + assert reference == FileStorage(tmp_path) + assert reference != FileStorage(tmp_path / "foo") + assert reference != MemoryStorage() + def test_delete(self, file_storage): path = file_storage.root_dir / "foo.txt" assert path.exists() @@ -163,6 +169,11 @@ def test_store_path(self, tmp_path): assert storage._data == self.contents + def test_eq(self): + reference = MemoryStorage() + assert reference == reference + assert reference != MemoryStorage() + @pytest.mark.parametrize('prefix,expected', [ ("", {'foo.txt', 'foo_dir/a.txt', 'foo_dir/b.txt'}), ("foo", {'foo.txt', 'foo_dir/a.txt', 'foo_dir/b.txt'}), From 11810eb847f06a1ad69c44d8cde4b627d3d60e3e Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 12:40:38 -0600 Subject: [PATCH 41/69] make mypy happy --- gufe/protocols/protocoldag.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gufe/protocols/protocoldag.py b/gufe/protocols/protocoldag.py index 7f044457..b4ef9c32 100644 --- a/gufe/protocols/protocoldag.py +++ b/gufe/protocols/protocoldag.py @@ -366,9 +366,8 @@ def make_label(self, dag_label, unit_label, attempt): def execute_DAG(protocoldag: ProtocolDAG, *, - shared_basedir: Optional[PathLike], + shared_basedir: PathLike, scratch_basedir: PathLike, - shared: Optional[ExternalStorage] = None, keep_shared: bool = False, keep_scratch: bool = False, raise_error: bool = True, @@ -409,8 +408,8 @@ def execute_DAG(protocoldag: ProtocolDAG, *, # the directory given as shared_root is actually the directory for this # DAG; the "shared_root" for the storage manager is the parent. We'll # force permanent to be the same. - if shared is None: - shared = FileStorage(shared_basedir.parent) + shared_basedir = Path(shared_basedir) + shared = FileStorage(shared_basedir.parent) dag_label = shared_basedir.name storage_manager = ReproduceOldBehaviorStorageManager( scratch_root=scratch_basedir, From 908329ca6d5f1d0cf54f18022693c3d80432a109 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 12:47:40 -0600 Subject: [PATCH 42/69] some docstrings --- gufe/protocols/protocoldag.py | 41 ++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/gufe/protocols/protocoldag.py b/gufe/protocols/protocoldag.py index b4ef9c32..0ff38ff8 100644 --- a/gufe/protocols/protocoldag.py +++ b/gufe/protocols/protocoldag.py @@ -425,15 +425,40 @@ def execute_DAG(protocoldag: ProtocolDAG, *, raise_error, n_retries) -def new_execute_DAG( - protocoldag, - dag_label, - storage_manager, - raise_error=False, - n_retries=0 -): +def new_execute_DAG( # TODO: this is a terrible name + protocoldag: ProtocolDAG, + dag_label: str, + storage_manager: StorageManager, + raise_error: bool = False, + n_retries: int = 0 +) -> ProtocolDAGResult: + """ + Locally execute a full :class:`ProtocolDAG` in serial and in-process. + + Alternate input signature to generalize execute_DAG + + Parameters + ---------- + protocoldag : ProtocolDAG + The :class:``ProtocolDAG`` to execute. + dag_label : str + Label to use for the DAG + storage_manager : StorageManager + The :class:`.StorageManager` to handle storing files. + raise_error : bool + If True, raise an exception if a ProtocolUnit fails, default True + if False, any exceptions will be stored as `ProtocolUnitFailure` + objects inside the returned `ProtocolDAGResult` + n_retries : int + the number of times to attempt, default 0, i.e. try once and only once + + Returns + ------- + ProtocolDAGResult + The result of executing the `ProtocolDAG`. + """ # this simplifies setup of execute_DAG by allowing you to directly - # provide the storage_manager; the extra option in the old one just + # provide the storage_manager; the extra options in the old one just # configure the storage_manager if n_retries < 0: raise ValueError("Must give positive number of retries") From a659afa6d054a4cd81df9857772fc4e68dbef440 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 13:13:07 -0600 Subject: [PATCH 43/69] staging serialization --- gufe/storage/stagingserialization.py | 94 ++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 gufe/storage/stagingserialization.py diff --git a/gufe/storage/stagingserialization.py b/gufe/storage/stagingserialization.py new file mode 100644 index 00000000..b50817fe --- /dev/null +++ b/gufe/storage/stagingserialization.py @@ -0,0 +1,94 @@ +from gufe.tokenization import JSON_HANDLER +from gufe.custom_json import JSONCodec, JSONSerializerDeserializer +from .stagingdirectory import StagingPath + +class StagingPathSerialization: + # TODO: where should this go? I think maybe on the storage manager + + # Serializing staging paths + # ------------------------- + # + # Some important user stories to consider: + # + # 1. I am loading my results object, and I will want to use the + # associated files. This should be transparent, regardless of where + # the permanent storage is located. + # 2. I am loading my results object, but I do not need the large stored + # files. I do not want to download them when they aren't needed. + # 3. My permanent storage was a directory on my file system, but I have + # moved that directory (with use cases of (a) I moved the absolute + # path; (b) it is at a different relative path with respect to my + # pwd. + # 4. I'm working with files from two different permanent storages. I + # need to be able to load from both in the same Python process. + # + # Outputs from a protocol may contain a :class:`.StagingPath`. Note that + # a :class:`.StagingPath` is inherently not a complete description of + # how to obtain the associated data: in almost every situation, there is + # some additional context required. This can include the credentials to + # an external server, or a base path that the file can be found relative + # to (which may have changed if the user moves it.) 
Because of this, we + # need to inject that context in to the deserialization. + # + # This object injects the relevant context, provided by the + # :class:`.StorageManager`. It creates a JSONSerializerDeserializer + # based on the one being used by gufe in this process, using all the + # installed codecs plus an additional codec specific to this context. + # + # User stories 1 and 2 are handled by the nature of the + # :class:`.StagingPath` object. The external file downloads as part of + # the ``__fspath__`` method. This means that when using the ``open`` + # builtin, you will automatically download the file to a local staging + # directory. However, the reference to the file can exist in the results + # object without downloading the file. + # + # User stories 3 and 4 are handled by this + # :class:`.StagingPathSerialization` class. Story 3 is handled by + # allowing the appropriate context (in the form of a + # :class:`.StorageManager`) to be injected into the deserialization + # process. Story 4 can be handled by using more than one + # :class:`.StagingPathSerialization` context (associated with different + # :class:`.StorageManager` objects. + + def __init__(self, manager): + self.manager = manager + self.codec = JSONCodec( + cls=StagingPath, + to_dict=self.to_dict, + from_dict=self.from_dict, + ) + self.refresh_handler() + + def refresh_handler(self): + codecs = JSON_HANDLER.codecs + [self.codec] + self.json_handler = JSONSerializerDeserializer(codecs) + + @property + def encoder(self): + return self.json_handler.encoder + + @property + def decoder(self): + return self.json_handler.decoder + + def to_dict(self, path): + # scratch, shared, permanent may form nested with progressively + # smaller contexts, so the last of those it is in is where it should + # be labelled. 
TODO: opportunity for performance improvement if + # needed + loc = None + if path.label in self.manager.scratch_root.iterdir(): + loc = "scratch" + if path.label in self.manager.shared_root.iter_contents(): + loc = "shared" + if path.label in self.manager.permanent_root.iter_contents(): + loc = "permanent" + + return { + ':container:': loc, + ':label:': path.label, + } + + def from_dict(self, dct): + staging = getattr(self.manager, f"{dct[':container:']}_staging") + return staging / dct[':label:'] From 32b1fe4ce68a1776bc854f39a94f378da336ee7a Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 1 Dec 2023 13:13:33 -0600 Subject: [PATCH 44/69] tests for staging serialization --- .../storage/test_stagingserialization.py | 247 ++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 gufe/tests/storage/test_stagingserialization.py diff --git a/gufe/tests/storage/test_stagingserialization.py b/gufe/tests/storage/test_stagingserialization.py new file mode 100644 index 00000000..a0d1a16d --- /dev/null +++ b/gufe/tests/storage/test_stagingserialization.py @@ -0,0 +1,247 @@ +import pytest +from gufe.storage.stagingserialization import StagingPathSerialization + +from gufe.storage.stagingdirectory import StagingPath +from gufe.storage.storagemanager import StorageManager +from gufe.storage.externalresource import MemoryStorage, FileStorage + +import json +import pathlib +import shutil + +@pytest.fixture +def storage_manager(tmp_path): + return StorageManager( + scratch_root=tmp_path / "working", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage(), + ) + +@pytest.fixture +def shared_path(storage_manager): + label = storage_manager.make_label("dag", "unit", attempt=0) + path = storage_manager.shared_staging / label / "file.txt" + with open(path, mode='w') as f: + f.write("contents here") + + storage_manager.shared_staging.transfer_staging_to_external() + return path + +@pytest.fixture +def permanent_path(storage_manager): + label = 
storage_manager.make_label("dag", "unit", attempt=0) + path = storage_manager.permanent_staging / label / "file.txt" + with open(path, mode='w') as f: + f.write("contents here") + + storage_manager.permanent_staging.transfer_staging_to_external() + return path + +@pytest.fixture +def scratch_path(storage_manager): + scratch_dir = storage_manager._scratch_loc("dag", "unit", attempt=0) + path = scratch_dir / "file.txt" + return path + +@pytest.fixture +def serialization_handler(storage_manager): + return StagingPathSerialization(storage_manager) + +class TestStagingPathSerialization: + @pytest.mark.parametrize('pathtype', ['scratch', 'shared', 'permanent']) + def test_round_trip(self, serialization_handler, pathtype, request): + # NB: scratch is a pathlib.Path, not a StagingPath. It is tested + # here to ensure round-trips as part of the overall user story for + # this, but it doesn't invoke the machinery of the + # StagingPathSerialization object + path = request.getfixturevalue(f"{pathtype}_path") + as_json = json.dumps( + path, + cls=serialization_handler.json_handler.encoder + ) + reloaded = json.loads( + as_json, + cls=serialization_handler.json_handler.decoder + ) + + assert path == reloaded + + @pytest.mark.parametrize('pathtype', ['shared', 'permanent']) + def test_to_dict(self, serialization_handler, pathtype, request): + path = request.getfixturevalue(f"{pathtype}_path") + dct = serialization_handler.to_dict(path) + assert dct == { + ':container:': pathtype, + ':label:': "dag/unit_attempt_0/file.txt", + } + + # tests for specific user stories + @pytest.mark.parametrize('pathtype', ['shared', 'permanent']) + def test_reload_file_contents(self, pathtype, request): + # USER STORY: I am loading my results object, and I will want to use + # the associated files. This should be transparent, regardless of + # where the storage is located (e.g., not local storage). 
(This is + # actually a test of the staging tools, but we include it here for + # completeness of the user stories.) + path = request.getfixturevalue(f"{pathtype}_path") + + # remove the file (remains in the MemoryStorage) + p = pathlib.Path(path.fspath) + assert p.exists() + p.unlink() + assert not p.exists() + + # reload the file (NB: nothing special done here; download is + # transparent to user) + with open(path, mode='r') as f: + contents = f.read() + + assert p.exists() + assert contents == "contents here" + + @pytest.mark.parametrize('pathtype', ['shared', 'permanent']) + def test_load_results_object_file_not_downloaded(self, + serialization_handler, + pathtype, request): + # USER STORY: I am loading my results object, but I do not need the + # large stored files. I do not want to download them when they + # aren't needed. + path = request.getfixturevalue(f"{pathtype}_path") + # serialize the path object + json_str = json.dumps(path, cls=serialization_handler.encoder) + + # delete the path from the directory + p = pathlib.Path(path) + assert p.exists() + p.unlink() + assert not p.exists() + + # reload the serialized form of the object + reloaded = json.loads(json_str, cls=serialization_handler.decoder) + + # check that the deserialized version has the path, but that the + # path does not exist on the filesystem + assert isinstance(reloaded, StagingPath) + assert reloaded.label == path.label + assert reloaded.path == path.path + assert not p.exists() + # NOTE: as soon as you call `__fspath__`, the file will download + + @pytest.mark.parametrize('move', ['relative', 'absolute']) + def test_permanent_storage_moved(self, move, tmp_path, monkeypatch): + # USER STORY: My permanent storage was a directory on my file + # system, but I have moved that directory (with use cases of (a) I + # moved the absolute path; (b) it is at a different relative path + # with respect to my pwd). 
+ monkeypatch.chdir(tmp_path) + old_manager = StorageManager( + scratch_root="old/scratch", + shared_root=FileStorage("old/shared"), + permanent_root=FileStorage("old/permanent") + ) + old_handler = StagingPathSerialization(old_manager) + old_path = old_manager.permanent_staging / "dag/unit/result.txt" + with open(old_path, mode='w') as f: + f.write("contents here") + + old_manager.permanent_staging.transfer_staging_to_external() + perm_p = pathlib.Path(tmp_path / "old/permanent/dag/unit/result.txt") + assert perm_p.exists() + + # serialize the path object + json_str = json.dumps(old_path, cls=old_handler.encoder) + + # move the storage subdirectory; create a new, associated storage + # manager/serialization handler + if move == "relative": + # change to within t + monkeypatch.chdir(tmp_path / "old") + new_manager = StorageManager( + scratch_root="scratch", + shared_root=FileStorage("shared"), + permanent_root=FileStorage("permanent") + ) + expected_path = tmp_path / "old/permanent/dag/unit/result.txt" + elif move == "absolute": + shutil.move(tmp_path / "old", tmp_path / "new") + new_manager = StorageManager( + scratch_root="new/scratch", + shared_root=FileStorage("new/shared"), + permanent_root=FileStorage("new/permanent") + ) + expected_path = tmp_path / "new/permanent/dag/unit/result.txt" + else: # -no-cov- + raise RuntimeWarning(f"Bad test parameter '{move}': should be " + "'relative' or 'absolute'") + + new_handler = StagingPathSerialization(new_manager) + + # deserialize the path using the new serialization handler + reloaded = json.loads(json_str, cls=new_handler.decoder) + + # ensure that the path exists and that the data can be reloaded + assert isinstance(reloaded, StagingPath) + assert reloaded.label == old_path.label + assert pathlib.Path(expected_path).exists() + + with open(reloaded, mode='r') as f: + contents = f.read() + + assert contents == "contents here" + + def test_two_different_permanent_storages(self, tmp_path): + # I'm working with files 
from two different permanent storages. I + # need to be able to load from both in the same Python process. + # (NOTE: this user story is primarily to prevent us from changing to + # a solution based on global/class vars to set context.) + manager1 = StorageManager( + scratch_root=tmp_path / "working1", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage(), + ) + manager2 = StorageManager( + scratch_root=tmp_path / "working2", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage(), + ) + handler1 = StagingPathSerialization(manager1) + handler2 = StagingPathSerialization(manager2) + + path1 = manager1.permanent_staging / "file1.txt" + with open(path1, mode='w') as f: + f.write("contents 1") + manager1.permanent_staging.transfer_staging_to_external() + + path2 = manager2.permanent_staging / "file2.txt" + with open(path2, mode='w') as f: + f.write("contents 2") + manager2.permanent_staging.transfer_staging_to_external() + + # serialize the paths + json_str1 = json.dumps(path1, cls=handler1.encoder) + json_str2 = json.dumps(path2, cls=handler2.encoder) + + # delete all staged files + assert pathlib.Path(path1.fspath).exists() + manager1.permanent_staging.cleanup() + assert not pathlib.Path(path1.fspath).exists() + + assert pathlib.Path(path2.fspath).exists() + manager2.permanent_staging.cleanup() + assert not pathlib.Path(path2.fspath).exists() + + # reload and check contents of both permanent files + reloaded1 = json.loads(json_str1, cls=handler1.decoder) + reloaded2 = json.loads(json_str2, cls=handler2.decoder) + + assert isinstance(reloaded1, StagingPath) + assert reloaded1.label == path1.label + assert not pathlib.Path(reloaded1.fspath).exists() + with open(reloaded1, mode='r') as f: + assert f.read() == "contents 1" + + assert isinstance(reloaded2, StagingPath) + assert reloaded2.label == path2.label + assert not pathlib.Path(reloaded2.fspath).exists() + with open(reloaded2, mode='r') as f: + assert f.read() == "contents 2" From 
5d0df5ff28fce4381c103357662aebe13d5a2a7f Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 1 Dec 2023 14:12:49 -0600 Subject: [PATCH 45/69] pep8 --- gufe/storage/stagingdirectory.py | 12 ++++-------- gufe/tests/storage/test_stagingdirectory.py | 9 +++++++++ gufe/tests/storage/test_storagemanager.py | 8 +++++--- gufe/utils.py | 1 - 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 356a5d34..8c27b416 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -38,7 +38,6 @@ def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, return False - class StagingDirectory: """PathLike local representation of an :class:`.ExternalStorage`. @@ -94,8 +93,8 @@ def __init__( self.delete_empty_dirs = delete_empty_dirs self.staging = staging - self.registry : set[StagingPath] = set() - self.preexisting : set[StagingPath] = set() + self.registry: set[StagingPath] = set() + self.preexisting: set[StagingPath] = set() self.staging_dir = self.scratch / staging / prefix self.staging_dir.mkdir(exist_ok=True, parents=True) @@ -125,7 +124,6 @@ def transfer_single_file_to_external(self, held_file: StagingPath): return None # no transfer - def transfer_staging_to_external(self): """Transfer all objects in the registry to external storage @@ -201,7 +199,7 @@ def _load_file_from_external(self, external: ExternalStorage, with external.load_stream(staging_path.label) as f: external_bytes = f.read() - ... # TODO: check that the bytes are the same if preexisting? + ... # TODO: check that the bytes are the same if preexisting? scratch_path.parent.mkdir(exist_ok=True, parents=True) with open(scratch_path, mode='wb') as f: @@ -247,7 +245,7 @@ def __init__( self.read_only = read_only def _get_other_shared(self, prefix: str, - delete_staging: Optional[bool] = None): + delete_staging: Optional[bool] = None): """Get a related unit's staging directory. 
""" if delete_staging is None: @@ -397,5 +395,3 @@ def __repr__(self): # although edge cases may be a pain, we can get most of it with, e.g.: # def exists(self): return Path(self).exists() # but also, can do pathlib.Path(staging_path) and get hte whole thing - - diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index f94a1888..08bed6e6 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -11,6 +11,7 @@ delete_empty_dirs, # TODO: move to appropriate place ) + @pytest.fixture def root(tmp_path): external = MemoryStorage() @@ -23,6 +24,7 @@ def root(tmp_path): ) return root + @pytest.fixture def root_with_contents(root): with open(root / "data.txt", mode='wb') as f: @@ -30,6 +32,7 @@ def root_with_contents(root): return root + @pytest.fixture def read_only_with_overwritten(root_with_contents): read_only = SharedStaging( @@ -51,6 +54,7 @@ def read_only_with_overwritten(root_with_contents): return read_only, staged + @pytest.fixture def permanent(tmp_path): shared = MemoryStorage() @@ -64,24 +68,28 @@ def permanent(tmp_path): ) return perm + def test_safe_to_delete_staging_ok(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" staging = tmp_path / "foo" / "baz" assert _safe_to_delete_staging(external, staging, prefix) + def test_safe_to_delete_staging_danger(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" staging = tmp_path / "foo" / "bar" / "baz" assert not _safe_to_delete_staging(external, staging, prefix) + def test_safe_to_delete_staging_not_filestorage(tmp_path): external = MemoryStorage() prefix = "bar" staging = tmp_path / "bar" assert _safe_to_delete_staging(external, staging, prefix) + def test_delete_empty_dirs(tmp_path): base = tmp_path / "tmp" paths = [ @@ -108,6 +116,7 @@ def test_delete_empty_dirs(tmp_path): assert not (base / "foo" / "bar").exists() + @pytest.mark.parametrize('delete_root', [True, False]) def 
test_delete_empty_dirs_delete_root(tmp_path, delete_root): base = tmp_path / "tmp" diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 80af74c1..c98a52d9 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -4,6 +4,7 @@ from gufe.storage.externalresource import MemoryStorage, FileStorage from pathlib import Path + @pytest.fixture def storage_manager_std(tmp_path): return StorageManager( @@ -12,6 +13,7 @@ def storage_manager_std(tmp_path): permanent_root=MemoryStorage(), ) + @pytest.fixture def dag_units(): class Unit1: @@ -33,7 +35,9 @@ def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() # TODO: this will change; the inputs should include a way to get # the previous shared unit label - with shared.root.other_shared("dag/unit1_attempt_0") as prev_shared: + with ( + shared.root.other_shared("dag/unit1_attempt_0") as prev_shared + ): with open(prev_shared / "bar.txt", mode='r') as f: bar = f.read() @@ -220,7 +224,6 @@ def _after_dag_existing_files(self): return self.files_after_unit('dag/unit2') - class TestStagingOverlapsSharedStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): @@ -238,7 +241,6 @@ def _in_unit_existing_files(self, unit_label): "dag/unit2": {'foo2', 'bar', 'baz'}, }[unit_label] - def _after_unit_existing_files(self, unit_label): # same for both; all files come from unit 1 return {"bar", "baz"} diff --git a/gufe/utils.py b/gufe/utils.py index 4382a801..519835d5 100644 --- a/gufe/utils.py +++ b/gufe/utils.py @@ -59,7 +59,6 @@ def __exit__(self, type, value, traceback): self.context.close() - def delete_empty_dirs(root: PathLike, delete_root: bool = True): """Delete all empty directories. From e0573323108e1708ddc54ee2a758675c1c764db2 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 14:13:51 -0600 Subject: [PATCH 46/69] pep8 --- gufe/tests/storage/test_storagemanager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index c98a52d9..70852bf0 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -254,7 +254,7 @@ def _in_staging_shared(self, unit_label, in_after): baz = "dag/unit1_attempt_0/baz.txt" foo = "scratch/dag/unit1_attempt_0/foo.txt" foo2 = "scratch/dag/unit2_attempt_0/foo2.txt" - return { + return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {bar, baz}, ("dag/unit2", "in"): {bar, baz, foo2}, @@ -287,7 +287,7 @@ def _in_staging_permanent(self, unit_label, in_after): baz = "dag/unit1_attempt_0/baz.txt" foo = "scratch/dag/unit1_attempt_0/foo.txt" foo2 = "scratch/dag/unit2_attempt_0/foo2.txt" - return { + return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {baz}, ("dag/unit2", "in"): {baz, foo2}, From bdb37bb59239c3cd0fdc00ca540132df036049da Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 14:16:24 -0600 Subject: [PATCH 47/69] pep8 --- gufe/protocols/protocoldag.py | 8 ++++---- gufe/tests/storage/test_storage_demo.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gufe/protocols/protocoldag.py b/gufe/protocols/protocoldag.py index 0ff38ff8..0ba68a56 100644 --- a/gufe/protocols/protocoldag.py +++ b/gufe/protocols/protocoldag.py @@ -419,7 +419,7 @@ def execute_DAG(protocoldag: ProtocolDAG, *, keep_shared=keep_shared, keep_staging=True, delete_empty_dirs=False, - #staging=Path(""), # use the actual directories as the staging + # staging=Path(""), # use the actual directories as the staging ) return new_execute_DAG(protocoldag, dag_label, storage_manager, raise_error, n_retries) @@ -478,9 +478,9 @@ def new_execute_DAG( # TODO: this is a terrible name label = storage_manager.make_label(dag_label, unit.key, attempt=attempt) - with dag_ctx.running_unit(dag_label, unit.key, attempt=attempt) as ( - scratch, shared, perm - ): + with dag_ctx.running_unit( + dag_label, unit.key, attempt=attempt + ) as (scratch, shared, perm): # TODO: context manager should return context context = Context(shared=shared, scratch=scratch, diff --git a/gufe/tests/storage/test_storage_demo.py b/gufe/tests/storage/test_storage_demo.py index 49011784..f87444b8 100644 --- a/gufe/tests/storage/test_storage_demo.py +++ b/gufe/tests/storage/test_storage_demo.py @@ -73,6 +73,7 @@ def _create(self, stateA, stateB, mapping, extends): def _gather(self, protocol_dag_results): return {} + @pytest.fixture def demo_dag(solvated_ligand, solvated_complex): transformation = gufe.Transformation( From e39a79026c35b0b9a8643e4f8c1001bd81b74cc3 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 1 Dec 2023 14:19:27 -0600 Subject: [PATCH 48/69] pep8 --- gufe/storage/stagingserialization.py | 1 + gufe/tests/storage/test_stagingserialization.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/gufe/storage/stagingserialization.py b/gufe/storage/stagingserialization.py index b50817fe..72a6fc48 100644 --- a/gufe/storage/stagingserialization.py +++ b/gufe/storage/stagingserialization.py @@ -2,6 +2,7 @@ from gufe.custom_json import JSONCodec, JSONSerializerDeserializer from .stagingdirectory import StagingPath + class StagingPathSerialization: # TODO: where should this go? I think maybe on the storage manager diff --git a/gufe/tests/storage/test_stagingserialization.py b/gufe/tests/storage/test_stagingserialization.py index a0d1a16d..47cc3ed1 100644 --- a/gufe/tests/storage/test_stagingserialization.py +++ b/gufe/tests/storage/test_stagingserialization.py @@ -9,6 +9,7 @@ import pathlib import shutil + @pytest.fixture def storage_manager(tmp_path): return StorageManager( @@ -17,6 +18,7 @@ def storage_manager(tmp_path): permanent_root=MemoryStorage(), ) + @pytest.fixture def shared_path(storage_manager): label = storage_manager.make_label("dag", "unit", attempt=0) @@ -27,6 +29,7 @@ def shared_path(storage_manager): storage_manager.shared_staging.transfer_staging_to_external() return path + @pytest.fixture def permanent_path(storage_manager): label = storage_manager.make_label("dag", "unit", attempt=0) @@ -37,16 +40,19 @@ def permanent_path(storage_manager): storage_manager.permanent_staging.transfer_staging_to_external() return path + @pytest.fixture def scratch_path(storage_manager): scratch_dir = storage_manager._scratch_loc("dag", "unit", attempt=0) path = scratch_dir / "file.txt" return path + @pytest.fixture def serialization_handler(storage_manager): return StagingPathSerialization(storage_manager) + class TestStagingPathSerialization: @pytest.mark.parametrize('pathtype', ['scratch', 'shared', 'permanent']) def test_round_trip(self, 
serialization_handler, pathtype, request): From 1cfe9108ea43542036daf79bec86072515e61815 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Mon, 4 Dec 2023 12:28:39 -0600 Subject: [PATCH 49/69] StagingDirectory -> StagingRegistry also, some docs cleanup/updates --- docs/guide/storage.rst | 40 +++++++++---------- ...stagingdirectory.py => stagingregistry.py} | 16 ++++---- gufe/storage/storagemanager.py | 4 +- ...ngdirectory.py => test_stagingregistry.py} | 14 +++---- gufe/tests/storage/test_storagemanager.py | 1 - 5 files changed, 37 insertions(+), 38 deletions(-) rename gufe/storage/{stagingdirectory.py => stagingregistry.py} (97%) rename gufe/tests/storage/{test_stagingdirectory.py => test_stagingregistry.py} (96%) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index d740525e..165b9640 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -15,13 +15,13 @@ correspond to three lifetimes of data: * ``scratch``: This is temporary data that is only needed for the lifetime of a :class:`.ProtocolUnit`. This data is not guaranteed to be available - beyown the single :class:`.ProtocolUnit` where it is created, but may be + beyond the single :class:`.ProtocolUnit` where it is created, but may be reused within that :class:`.ProtocolUnit`. * ``shared``: This is data that is shared between different units in a :class:`.ProtocolDAG`. For example, a single equilibration stage might be shared between multiple production runs. The output snapshot of the equilibration would be suitable for as something to put in ``shared`` - data. This data is guarateed to be present from when it is created until + data. This data is guaranteed to be present from when it is created until the end of the :class:`.ProtocolDAG`, but is not guaranteed to exist after the :class:`.ProtocolDAG` terminates. 
* ``permanent``: This is the data that will be needed beyond the scope of a @@ -40,10 +40,10 @@ between stages of the GUFE storage system, and simplifies the API for protocol authors. In detail, this provides protocol authors with ``PathLike`` objects for ``scratch``, ``shared``, and ``permanent``. All three of these objects actually point to special subdirectories of the -scratch space for a specific unit, but are managed by context manangers at -the executor level, which handle the process of moving objects from local -staging directories to the actual ``shared`` and ``permanent`` locations, -which can be external resources. +local scratch space for a specific unit, but are managed by context +managers at the executor level, which handle the process of moving objects +from local staging directories to the actual ``shared`` and ``permanent`` +locations, which can be external resources. External resource utilities @@ -77,30 +77,30 @@ helpers. The information in this section is mostly of interest to authors of executors. The helpers are: * :class:`.StorageManager`: This is the overall façade interface for - interacting with the rest of the storage lifecycle tools. -* :class:`.DAGContextManager`: This provides context managers at the DAG and - unit level to handle the transfer of storage. GUFE provides a - :class:`.SingleProcDAGContextManager` to handle the simple case that an - entire DAG is run within a single process. If individual units are run on - different remote resources, a more complicated :class:`.DAGContextManager` - would be needed. -* :class:`.StagingDirectory`: This represents the root directory for staging - the results of a given :class:`.ProtocolUnit`. This is an abstract + interacting with the rest of the storage lifecycle tools. It provides two + methods to generate context managers; one for the :class:`.ProtocolDAG` + level of the lifecycle, and one for the :class:`.ProtocolUnit` level of the + lifecycle.
This class is designed for the use case that the entire DAG is + run in serial within a single process. Subclasses of this can be created + for other execution architectures, where the main logic changes would be + in the methods that return those context managers. +* :class:`.StagingRegistry`: This handles the logic around staging paths + within a :class:`.ProtocolUnit`. Think of this as an abstract representation of a local directory. Paths within it register with it, and it handles deletion of the temporary local files when not needed, as well as the download of remote files when necessary for reading. There are two important subclasses of this: :class:`.SharedStaging` for a ``shared`` resource, and :class:`.PermanentStaging` for a ``permanent`` resource. * :class:`.StagingPath`: This represents a file within the - :class:`.StagingDirectory`. It contains both the key (label) used in the - key-value store, as well as the actual local path to the file. On - creation, it registers itself with its :class:`.StagingDirectory`, which - handles managing it over its lifecycle. + :class:`.StagingRegistry`. It contains both the key (label) used in the + key-value store, as well as the actual local path to the file. When its + ``__fspath__`` method is called, it registers itself with its + :class:`.StagingRegistry`, which handles managing it over its lifecycle. In practice, the executor uses the :class:`.StorageManager` to create a :class:`.DAGContextManager` at the level of a DAG, and then uses the :class:`.DAGContextManager` to create a context to run a unit. That context creates a :class:`.SharedStaging` and a :class:`.PermanentStaging` associated with the specific unit. Those staging directories, with the -scratch directory, are provided to the :class:`.ProtocolDAGUnit`, so that +scratch directory, are provided to the :class:`.ProtocolUnit`, so that these are the only objects protocol authors need to interact with. 
diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingregistry.py similarity index 97% rename from gufe/storage/stagingdirectory.py rename to gufe/storage/stagingregistry.py index 8c27b416..74278830 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingregistry.py @@ -38,7 +38,7 @@ def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, return False -class StagingDirectory: +class StagingRegistry: """PathLike local representation of an :class:`.ExternalStorage`. This connects objects on a local filesystem to the key-value store of a @@ -73,7 +73,7 @@ class StagingDirectory: temporarily stored; default is '.staging'. This must be the same for all units within a DAG. delete_staging : bool - whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX + whether to delete the contents of the $SCRATCH/$STAGING/$PREFIX directory when this object is deleted """ def __init__( @@ -155,7 +155,7 @@ def cleanup(self): def register_path(self, staging_path: StagingPath): """ - Register a :class:`.StagingPath` with this :class:`.StagingDirectory`. + Register a :class:`.StagingPath` with this :class:`.StagingRegistry`. This marks a given path as something for this object to manage, by loading it into the ``registry``. This way it is tracked such that @@ -223,7 +223,7 @@ def __del__(self): # -no-cov- self.cleanup() -class SharedStaging(StagingDirectory): +class SharedStaging(StagingRegistry): """Staging for shared external storage. This enables read-only versions to be loaded from other units. @@ -297,7 +297,7 @@ def register_path(self, staging_path: StagingPath): super().register_path(staging_path) -class PermanentStaging(StagingDirectory): +class PermanentStaging(StagingRegistry): """Staging directory for the permanent storage. 
This allows files to be downloaded from a shared @@ -339,7 +339,7 @@ def transfer_single_file_to_external(self, held_file: StagingPath): class StagingPath: """PathLike object linking local path with label for external storage. - On creation, this registers with a :class:`.StagingDirectory` that will + On creation, this registers with a :class:`.StagingRegistry` that will manage the local path and transferring data with its :class:`.ExternalStorage`. @@ -351,13 +351,13 @@ class StagingPath: example, when deserializing results that point to files) instead use :class:`.ExternalFile`. """ - def __init__(self, root: StagingDirectory, + def __init__(self, root: StagingRegistry, path: Union[PathLike, str]): self.root = root self.path = Path(path) def register(self): - """Register this path with its StagingDirectory. + """Register this path with its StagingRegistry. If a file associated with this path exists in an external storage, it will be downloaded to the staging area as part of registration. 
diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 61665b7f..d5a80d92 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -9,8 +9,8 @@ from typing import Type from .externalresource import ExternalStorage, FileStorage -from .stagingdirectory import SharedStaging, PermanentStaging -from .stagingdirectory import StagingPath # typing +from .stagingregistry import SharedStaging, PermanentStaging +from .stagingregistry import StagingPath # typing class StorageManager: diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingregistry.py similarity index 96% rename from gufe/tests/storage/test_stagingdirectory.py rename to gufe/tests/storage/test_stagingregistry.py index 08bed6e6..65806f67 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -6,7 +6,7 @@ import pathlib from gufe.storage.externalresource import MemoryStorage, FileStorage -from gufe.storage.stagingdirectory import ( +from gufe.storage.stagingregistry import ( SharedStaging, PermanentStaging, _safe_to_delete_staging, delete_empty_dirs, # TODO: move to appropriate place ) @@ -160,7 +160,7 @@ def test_read_old(self, root): # When the file doesn't exist locally, it should be pulled down the # first time that we register the path. 
- # initial conditions, without touching StagingDirectory/StagingPath + # initial conditions, without touching StagingRegistry/StagingPath label = "old_unit/data.txt" on_filesystem = root.scratch / root.staging / "old_unit/data.txt" assert not on_filesystem.exists() @@ -210,7 +210,7 @@ def test_transfer_to_external_no_file(self, root, caplog): nonfile = root / "does_not_exist.txt" # ensure that we've set this up correctly assert nonfile not in root.registry - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.INFO, logger=logger_name) root.transfer_single_file_to_external(nonfile) assert len(caplog.records) == 1 @@ -222,7 +222,7 @@ def test_transfer_to_external_directory(self, root, caplog): with open(directory / "file.txt", mode='w') as f: f.write("foo") - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.DEBUG, logger=logger_name) root.transfer_single_file_to_external(directory) assert len(caplog.records) == 1 @@ -238,7 +238,7 @@ def test_single_file_transfer_read_only(self, old_contents = f.read() assert old_contents == b"foo" - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.DEBUG, logger=logger_name) read_only.transfer_single_file_to_external(staged) assert len(caplog.records) == 1 @@ -254,7 +254,7 @@ def test_transfer_read_only(self, read_only_with_overwritten, caplog): old_contents = f.read() assert old_contents == b"foo" - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.DEBUG, logger=logger_name) read_only.transfer_staging_to_external() assert len(caplog.records) == 1 @@ -277,7 +277,7 @@ def test_cleanup_missing(self, root, caplog): file.__fspath__() assert file in root.registry assert not pathlib.Path(file).exists() - logger_name = "gufe.storage.stagingdirectory" + logger_name = 
"gufe.storage.stagingregistry" caplog.set_level(logging.WARNING, logger=logger_name) root.cleanup() assert len(caplog.records) == 1 diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 70852bf0..69b1a6dc 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -1,6 +1,5 @@ import pytest from gufe.storage.storagemanager import StorageManager -from gufe.storage.stagingdirectory import StagingDirectory from gufe.storage.externalresource import MemoryStorage, FileStorage from pathlib import Path From 78e003b602ba179db5c1b65e6b0fd0d61fbea111 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Thu, 7 Dec 2023 17:07:44 -0600 Subject: [PATCH 50/69] remove prefix; remove get_other_shared This required a pretty significant rewrite of the code. --- gufe/storage/stagingregistry.py | 123 +++++++-------------- gufe/storage/storagemanager.py | 13 ++- gufe/tests/storage/test_stagingregistry.py | 99 ++++++++++------- gufe/tests/storage/test_storagemanager.py | 25 ++--- 4 files changed, 118 insertions(+), 142 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 74278830..2d6f5c99 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -11,31 +11,23 @@ import logging _logger = logging.getLogger(__name__) - -def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, - prefix: Union[PathLike, str]) -> bool: - """Check if deleting ``path`` could delete externally stored data. - - If external storage is a FileStorage, then it will store files for - this unit or dag in the directory ``external.root_dir / prefix``, where - ``prefix`` is either the unit label or the dag label. If ``path`` is - inside that directory, then deleting it may delete information from the - external storage. In that case, this returns False, indicating a - conflict. Otherwise, this returns True. 
- """ - # this is a little brittle; I don't like hard-coding the class here +def _safe_to_delete_file( + external: ExternalStorage, + path: PathLike +) -> bool: + """Check that deleting this file will not remove it from external""" + # kind of brittle: deals with internals of FileStorage if isinstance(external, FileStorage): - root = Path(external.root_dir) / prefix + root = external.root_dir else: return True p = Path(path) try: - _ = p.relative_to(root) + label = str(p.relative_to(root)) except ValueError: return True - else: - return False + return not external.exists(label) class StagingRegistry: @@ -62,25 +54,18 @@ class StagingRegistry: the scratch directory shared by all objects on this host external : :class:`.ExternalStorage` external storage resource where objects should eventualy go - prefix : str - label for this specific unit; this should be a slash-separated - description of where this unit fits in the hierarchy. For example, - it might be ``$DAG_LABEL/$UNIT_LABEL`` or - ``$DAG_LABEL/$UNIT_LABEL/$UNIT_REPEAT``. It must be a unique - identifier for this unit within the permanent storage. staging : PathLike name of the subdirectory of scratch where staged results are temporarily stored; default is '.staging'. This must be the same for all units within a DAG. 
delete_staging : bool - whether to delete the contents of the $SCRATCH/$STAGING/$PREFIX + whether to delete the contents of the $SCRATCH/$STAGING directory when this object is deleted """ def __init__( self, scratch: PathLike, external: ExternalStorage, - prefix: str, *, staging: PathLike = Path(".staging"), delete_staging: bool = True, @@ -88,23 +73,20 @@ def __init__( ): self.external = external self.scratch = Path(scratch) - self.prefix = Path(prefix) self.delete_staging = delete_staging self.delete_empty_dirs = delete_empty_dirs self.staging = staging self.registry: set[StagingPath] = set() self.preexisting: set[StagingPath] = set() - self.staging_dir = self.scratch / staging / prefix + self.staging_dir = self.scratch / staging self.staging_dir.mkdir(exist_ok=True, parents=True) - def _delete_staging_safe(self): - """Check if deleting staging will remove data from external. - """ - return _safe_to_delete_staging( + def _delete_file_safe(self, file): + """Check if deleting this file will remove it from external.""" + return _safe_to_delete_file( external=self.external, - path=self.staging_dir, - prefix=self.prefix, + path=file ) def transfer_single_file_to_external(self, held_file: StagingPath): @@ -134,21 +116,26 @@ def transfer_staging_to_external(self): if (transferred := self.transfer_single_file_to_external(file)) ] + def _delete_file(self, file: StagingPath): + path = Path(file.fspath) + if path.exists(): + _logger.debug(f"Removing file '{file}'") + # TODO: handle special case of directory? + path.unlink() + self.registry.remove(file) + else: + _logger.warning( + f"During staging cleanup, file '{file}' was marked for " + "deletion, but can not be found on disk." + ) + def cleanup(self): """Perform end-of-lifecycle cleanup. 
""" - if self.delete_staging and self._delete_staging_safe(): + if self.delete_staging: for file in self.registry - self.preexisting: - path = Path(file.fspath) - if path.exists(): - _logger.debug(f"Removing file {file}") - # TODO: handle special case of directory? - path.unlink() - self.registry.remove(file) - else: - _logger.warning("During staging cleanup, file " - f"{file} was marked for deletion, but " - "can not be found on disk.") + if self._delete_file_safe(file): + self._delete_file(file) if self.delete_empty_dirs: delete_empty_dirs(self.staging_dir) @@ -213,8 +200,7 @@ def __fspath__(self): def __repr__(self): return ( - f"{self.__class__.__name__}('{self.scratch}', {self.external}, " - f"{self.prefix})" + f"{self.__class__.__name__}('{self.scratch}', {self.external})" ) def __del__(self): # -no-cov- @@ -232,46 +218,17 @@ def __init__( self, scratch: PathLike, external: ExternalStorage, - prefix: str, *, staging: PathLike = Path(".staging"), delete_staging: bool = True, delete_empty_dirs: bool = True, read_only: bool = False, ): - super().__init__(scratch, external, prefix, staging=staging, + super().__init__(scratch, external, staging=staging, delete_staging=delete_staging, delete_empty_dirs=delete_empty_dirs) self.read_only = read_only - def _get_other_shared(self, prefix: str, - delete_staging: Optional[bool] = None): - """Get a related unit's staging directory. - """ - if delete_staging is None: - delete_staging = self.delete_staging - - return SharedStaging( - scratch=self.scratch, - external=self.external, - prefix=prefix, - staging=self.staging, - delete_staging=delete_staging, - read_only=True, - ) - - @contextmanager - def other_shared(self, prefix: str, - delete_staging: Optional[bool] = None): - """Context manager approach for getting a related unit's directory. - - This is usually the recommended way to get a previous unit's shared - data. 
- """ - other = self._get_other_shared(prefix, delete_staging) - yield other - other.cleanup() - def transfer_single_file_to_external(self, held_file: StagingPath): if self.read_only: _logger.debug("Read-only: Not transfering to external storage") @@ -308,24 +265,22 @@ def __init__( scratch: PathLike, external: ExternalStorage, shared: ExternalStorage, - prefix: str, *, staging: PathLike = Path(".staging"), delete_staging: bool = True, delete_empty_dirs: bool = True, ): - super().__init__(scratch, external, prefix, staging=staging, + super().__init__(scratch, external, staging=staging, delete_staging=delete_staging, delete_empty_dirs=delete_empty_dirs) self.shared = shared - def _delete_staging_safe(self): - shared_safe = _safe_to_delete_staging( + def _delete_file_safe(self, file): + shared_safe = _safe_to_delete_file( external=self.shared, - path=self.staging_dir, - prefix=self.prefix + path=file ) - return shared_safe and super()._delete_staging_safe() + return shared_safe and super()._delete_file_safe(file) def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage @@ -386,7 +341,7 @@ def __fspath__(self): @property def label(self) -> str: """Label used in :class:`.ExternalStorage` for this path""" - return str(self.root.prefix / self.path) + return str(self.path) def __repr__(self): return f"StagingPath('{self.fspath}')" diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index d5a80d92..21cc5f9f 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -46,7 +46,6 @@ def __init__( shared=self.shared_root, staging=self.staging, delete_empty_dirs=delete_empty_dirs, - prefix="" ) self.shared_staging = SharedStaging( @@ -54,7 +53,6 @@ def __init__( external=self.shared_root, staging=self.staging, delete_empty_dirs=delete_empty_dirs, - prefix="" # TODO: remove prefix ) def make_label(self, dag_label, unit_label, attempt, **kwargs): @@ -87,8 
+85,17 @@ def running_dag(self, dag_label): if not self.keep_staging: self.permanent_staging.cleanup() + if not self.keep_shared: - for file in self.shared_xfer: + # we'd like to do something like loop over + # self.shared_xfer - self.permanent_xfer; however, + # StagedPaths have different staging registries. This gives + # the set of paths we do want to delete + perm_xfer_paths = {p.fspath for p in self.permanent_xfer} + shared_xfer_to_delete = {p for p in self.shared_xfer + if p.fspath not in perm_xfer_paths} + + for file in shared_xfer_to_delete: self.shared_root.delete(file.label) for file in self.permanent_xfer: diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index 65806f67..c0f21fc5 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -7,7 +7,7 @@ from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.stagingregistry import ( - SharedStaging, PermanentStaging, _safe_to_delete_staging, + SharedStaging, PermanentStaging, _safe_to_delete_file, delete_empty_dirs, # TODO: move to appropriate place ) @@ -19,7 +19,6 @@ def root(tmp_path): root = SharedStaging( scratch=tmp_path, external=external, - prefix="new_unit", delete_staging=False ) return root @@ -27,7 +26,8 @@ def root(tmp_path): @pytest.fixture def root_with_contents(root): - with open(root / "data.txt", mode='wb') as f: + # file staged but not yet shipped to external + with open(root / "new_unit/data.txt", mode='wb') as f: f.write(b"bar") return root @@ -38,14 +38,13 @@ def read_only_with_overwritten(root_with_contents): read_only = SharedStaging( scratch=root_with_contents.scratch, external=root_with_contents.external, - prefix="old_unit", staging=root_with_contents.staging, delete_staging=root_with_contents.delete_staging, read_only=True ) - filename = pathlib.Path(read_only) / "data.txt" + filename = pathlib.Path(read_only) / "old_unit/data.txt" assert not 
filename.exists() - staged = read_only / "data.txt" + staged = read_only / "old_unit/data.txt" assert not filename.exists() staged.__fspath__() assert filename.exists() @@ -58,36 +57,35 @@ def read_only_with_overwritten(root_with_contents): @pytest.fixture def permanent(tmp_path): shared = MemoryStorage() - shared.store_bytes("final/old_unit/data.txt", b"foo") + shared.store_bytes("old_unit/data.txt", b"foo") perm = PermanentStaging( - scratch=tmp_path, + scratch=tmp_path / "final", external=MemoryStorage(), shared=shared, - prefix="final", delete_staging=True ) return perm -def test_safe_to_delete_staging_ok(tmp_path): +@pytest.mark.parametrize('rel_path', [ + ("bar"), ("baz"), ("../bar") +]) +def test_safe_to_delete_file(tmp_path, rel_path): external = FileStorage(tmp_path / "foo") - prefix = "bar" - staging = tmp_path / "foo" / "baz" - assert _safe_to_delete_staging(external, staging, prefix) - + external.store_bytes("bar", b"") + ext_loc = tmp_path / "foo" / "bar" + assert ext_loc.exists() -def test_safe_to_delete_staging_danger(tmp_path): - external = FileStorage(tmp_path / "foo") - prefix = "bar" - staging = tmp_path / "foo" / "bar" / "baz" - assert not _safe_to_delete_staging(external, staging, prefix) + staged = external.root_dir / rel_path + is_safe = (rel_path != "bar") + assert _safe_to_delete_file(external, staged) is is_safe -def test_safe_to_delete_staging_not_filestorage(tmp_path): +def test_safe_to_delete_file_not_filestorage(tmp_path): external = MemoryStorage() - prefix = "bar" + external.store_bytes("bar", b"") staging = tmp_path / "bar" - assert _safe_to_delete_staging(external, staging, prefix) + assert _safe_to_delete_file(external, staging) def test_delete_empty_dirs(tmp_path): @@ -141,7 +139,6 @@ def test_repr(self, root): r = repr(root) assert r.startswith("SharedStaging") assert "MemoryStorage" in r - assert r.endswith(", new_unit)") @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] @@ -168,9 +165,11 @@ def 
test_read_old(self, root): # when we create the specific StagingPath, it registers and # "downloads" the file - old_staging = root._get_other_shared("old_unit") - filepath = old_staging / "data.txt" - assert pathlib.Path(filepath) == on_filesystem + filepath = root / "old_unit/data.txt" + assert pathlib.Path(filepath.fspath) == on_filesystem + + assert not on_filesystem.exists() + filepath.register() assert on_filesystem.exists() # let's just be sure we can read in the data as desired @@ -181,7 +180,7 @@ def test_write_new(self, root): label = "new_unit/somefile.txt" on_filesystem = root.scratch / root.staging / "new_unit/somefile.txt" assert not on_filesystem.exists() - with open(root / "somefile.txt", mode='wb') as f: + with open(root / "new_unit/somefile.txt", mode='wb') as f: f.write(b"testing") # this has been written to disk in scratch, but not yet saved to @@ -189,9 +188,10 @@ def test_write_new(self, root): assert on_filesystem.exists() assert not root.external.exists(label) + @pytest.mark.xfail # Need test that read-only errors on new files def test_write_old_fail(self, root): old_staging = root._get_other_shared("old_unit") - staged = old_staging / "foo.txt" + staged = old_,tstaging / "foo.txt" with pytest.raises(IOError, match="read-only"): staged.__fspath__() @@ -207,7 +207,7 @@ def test_transfer_to_external(self, root_with_contents): def test_transfer_to_external_no_file(self, root, caplog): with mock.patch.object(root, 'register_path'): - nonfile = root / "does_not_exist.txt" + nonfile = root / "old_unit/does_not_exist.txt" # ensure that we've set this up correctly assert nonfile not in root.registry logger_name = "gufe.storage.stagingregistry" @@ -218,7 +218,7 @@ def test_transfer_to_external_no_file(self, root, caplog): assert "nonexistent" in record.msg def test_transfer_to_external_directory(self, root, caplog): - directory = root / "directory" + directory = root / "old_unit/directory" with open(directory / "file.txt", mode='w') as f: 
f.write("foo") @@ -266,15 +266,16 @@ def test_transfer_read_only(self, read_only_with_overwritten, caplog): def test_cleanup(self, root_with_contents): root_with_contents.delete_staging = True # slightly naughty - path = pathlib.Path(root_with_contents.__fspath__()) / "data.txt" + root_path = pathlib.Path(root_with_contents.__fspath__()) + path = root_path / "new_unit/data.txt" assert path.exists() root_with_contents.cleanup() assert not path.exists() def test_cleanup_missing(self, root, caplog): root.delete_staging = True - file = root / "foo.txt" - file.__fspath__() + file = root / "old_unit/foo.txt" + file.register() assert file in root.registry assert not pathlib.Path(file).exists() logger_name = "gufe.storage.stagingregistry" @@ -285,12 +286,13 @@ def test_cleanup_missing(self, root, caplog): assert "can not be found on disk" in record.msg def test_register_cleanup_preexisting_file(self, root): - filename = pathlib.Path(root.__fspath__()) / "foo.txt" + filename = pathlib.Path(root.__fspath__()) / "new_unit/foo.txt" + filename.parent.mkdir(parents=True, exist_ok=True) filename.touch() root.external.store_bytes("new_unit/foo.txt", b"") assert len(root.registry) == 0 assert len(root.preexisting) == 0 - staging = root / "foo.txt" + staging = root / "new_unit/foo.txt" assert staging.label == "new_unit/foo.txt" assert len(root.registry) == 0 assert len(root.preexisting) == 0 @@ -303,19 +305,32 @@ def test_register_cleanup_preexisting_file(self, root): assert filename.exists() -class TestPermanentStage: +class TestPermanentStaging: @pytest.mark.parametrize('is_safe', [True, False]) - def test_delete_staging_safe(self, tmp_path, is_safe): + def test_delete_file_safe(self, tmp_path, is_safe): staging = ".staging" if is_safe else "" + scratch_root_dir = tmp_path / "final" + + # create a file in the external storage + external = FileStorage(scratch_root_dir) + external.store_bytes("foo.txt", b"foo") + external_file_loc = external.root_dir / "foo.txt" + assert 
external_file_loc.exists() + permanent = PermanentStaging( - scratch=tmp_path, + scratch=scratch_root_dir, external=MemoryStorage(), - shared=FileStorage(tmp_path), - prefix="final", + shared=external, staging=staging, delete_staging=True ) - assert permanent._delete_staging_safe() is is_safe + my_file = permanent / "foo.txt" + + # double check that we set things up correctly + assert (str(external_file_loc) != my_file.fspath) is is_safe + + # test the code + assert permanent._delete_file_safe(my_file) is is_safe def test_load_missing_for_transfer(self, permanent): fname = pathlib.Path(permanent) / "old_unit/data.txt" @@ -326,4 +341,4 @@ def test_load_missing_for_transfer(self, permanent): assert permanent.external._data == {} permanent.transfer_staging_to_external() assert fname.exists() - assert permanent.external._data == {"final/old_unit/data.txt": b"foo"} + assert permanent.external._data == {"old_unit/data.txt": b"foo"} diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 69b1a6dc..779d5552 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -34,16 +34,14 @@ def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() # TODO: this will change; the inputs should include a way to get # the previous shared unit label - with ( - shared.root.other_shared("dag/unit1_attempt_0") as prev_shared - ): - with open(prev_shared / "bar.txt", mode='r') as f: - bar = f.read() + prev_shared = shared.root / "dag/unit1_attempt_0" + with open(prev_shared / "bar.txt", mode='r') as f: + bar = f.read() - # note that you can open a file from permanent as if it was - # from shared -- everything in permanent is in shared - with open(prev_shared / "baz.txt", mode='r') as f: - baz = f.read() + # note that you can open a file from permanent as if it was + # from shared -- everything in permanent is in shared + with open(prev_shared / "baz.txt", mode='r') as f: + baz = 
f.read() return {"bar": bar, "baz": baz} @@ -75,8 +73,8 @@ def test_lifecycle(self, storage_manager, dag_units, tmp_path): with dag_ctx.running_unit(dag_label, unit.key, attempt=0) as ( scratch, shared, perm ): - results.append(unit.run(scratch, shared, perm)) # import pdb; pdb.set_trace() + results.append(unit.run(scratch, shared, perm)) self.in_unit_asserts(storage_manager, label) self.after_unit_asserts(storage_manager, label) self.after_dag_asserts(storage_manager) @@ -180,7 +178,8 @@ def storage_manager(self, storage_manager_std): def _in_unit_existing_files(self, unit_label): return { "dag/unit1": {'bar', 'baz', 'foo'}, - "dag/unit2": {'foo2', 'baz'} + "dag/unit2": {'foo2', 'baz', 'bar'}, + # bar was deleted, but gets brought back in unit2 }[unit_label] def _after_unit_existing_files(self, unit_label): @@ -275,7 +274,7 @@ def storage_manager(self, tmp_path): def _in_unit_existing_files(self, unit_label): return { "dag/unit1": {'foo', 'bar', 'baz'}, - "dag/unit2": {"foo2", "baz"}, # no bar because it was temporary + "dag/unit2": {"foo2", "baz", "bar"}, # bar is resurrected }[unit_label] def _after_dag_existing_files(self): @@ -289,7 +288,7 @@ def _in_staging_permanent(self, unit_label, in_after): return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {baz}, - ("dag/unit2", "in"): {baz, foo2}, + ("dag/unit2", "in"): {baz, foo2, bar}, # bar is resurrected ("dag/unit2", "after"): {baz} }[unit_label, in_after] From 265e786bb0f9fe6866cd57d07fd93541e4d64304 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Thu, 7 Dec 2023 17:22:59 -0600 Subject: [PATCH 51/69] delete_empty_dirs => keep_empty_dirs Mainly matters for the StorageManager --- gufe/storage/stagingregistry.py | 14 +++++++------- gufe/storage/storagemanager.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 2d6f5c99..626e6b8f 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -69,12 +69,12 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, - delete_empty_dirs: bool = True, + keep_empty_dirs: bool = False, ): self.external = external self.scratch = Path(scratch) self.delete_staging = delete_staging - self.delete_empty_dirs = delete_empty_dirs + self.keep_empty_dirs = keep_empty_dirs self.staging = staging self.registry: set[StagingPath] = set() @@ -137,7 +137,7 @@ def cleanup(self): if self._delete_file_safe(file): self._delete_file(file) - if self.delete_empty_dirs: + if not self.keep_empty_dirs: delete_empty_dirs(self.staging_dir) def register_path(self, staging_path: StagingPath): @@ -221,12 +221,12 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, - delete_empty_dirs: bool = True, + keep_empty_dirs: bool = False, read_only: bool = False, ): super().__init__(scratch, external, staging=staging, delete_staging=delete_staging, - delete_empty_dirs=delete_empty_dirs) + keep_empty_dirs=keep_empty_dirs) self.read_only = read_only def transfer_single_file_to_external(self, held_file: StagingPath): @@ -268,11 +268,11 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, - delete_empty_dirs: bool = True, + keep_empty_dirs: bool = False, ): super().__init__(scratch, external, staging=staging, delete_staging=delete_staging, - delete_empty_dirs=delete_empty_dirs) + keep_empty_dirs=keep_empty_dirs) self.shared = shared def _delete_file_safe(self, file): diff --git 
a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 21cc5f9f..4370b5b5 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -23,8 +23,8 @@ def __init__( keep_scratch: bool = False, keep_staging: bool = False, keep_shared: bool = False, + keep_empty_dirs: bool = False, staging: PathLike = Path(".staging"), - delete_empty_dirs: bool = True, ): self.scratch_root = Path(scratch_root) self.shared_root = shared_root @@ -33,7 +33,7 @@ def __init__( self.keep_staging = keep_staging self.keep_shared = keep_shared self.staging = staging - self.delete_empty_dirs = delete_empty_dirs + self.keep_empty_dirs = keep_empty_dirs # these are used to track what files can be deleted from shared if # keep_shared is False @@ -45,14 +45,14 @@ def __init__( external=self.permanent_root, shared=self.shared_root, staging=self.staging, - delete_empty_dirs=delete_empty_dirs, + keep_empty_dirs=keep_empty_dirs, ) self.shared_staging = SharedStaging( scratch=self.scratch_root, external=self.shared_root, staging=self.staging, - delete_empty_dirs=delete_empty_dirs, + keep_empty_dirs=keep_empty_dirs, ) def make_label(self, dag_label, unit_label, attempt, **kwargs): @@ -102,7 +102,7 @@ def running_dag(self, dag_label): if self.shared_root != self.permanent_root: self.shared_root.delete(file.label) - if self.delete_empty_dirs: + if not self.keep_empty_dirs: delete_empty_dirs(self._scratch_base, delete_root=False) @contextmanager From ce12326a2d3d35f13f471f742faa27261a63a7e2 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Mon, 11 Dec 2023 12:46:25 -0600 Subject: [PATCH 52/69] Add logging to not clean up registered directory --- gufe/storage/stagingregistry.py | 12 +++++++++--- gufe/tests/storage/test_stagingregistry.py | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 626e6b8f..6da50d23 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -119,9 +119,15 @@ def transfer_staging_to_external(self): def _delete_file(self, file: StagingPath): path = Path(file.fspath) if path.exists(): - _logger.debug(f"Removing file '{file}'") - # TODO: handle special case of directory? - path.unlink() + if not path.is_dir(): + _logger.debug(f"Removing file '{file}'") + path.unlink() + else: + _logger.debug( + f"During staging cleanup, the directory '{file}' was " + "found as a staged path. This will be deleted only if " + "`keep_empty` is False." + ) self.registry.remove(file) else: _logger.warning( diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index c0f21fc5..e62ab697 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -285,6 +285,23 @@ def test_cleanup_missing(self, root, caplog): record = caplog.records[0] assert "can not be found on disk" in record.msg + def test_cleanup_directory(self, root, caplog): + root.delete_staging = True + dirname = root / "old_unit" + assert dirname not in root.registry + dirname.register() + assert dirname in root.registry + + assert not pathlib.Path(dirname).exists() + file = dirname / "foo.txt" + file.register() + # directory is created when something in the directory registered + assert pathlib.Path(dirname).exists() + logger_name = "gufe.storage.stagingregistry" + caplog.set_level(logging.DEBUG, logger=logger_name) + root.cleanup() + assert "During staging cleanup, the directory" in caplog.text + def 
test_register_cleanup_preexisting_file(self, root): filename = pathlib.Path(root.__fspath__()) / "new_unit/foo.txt" filename.parent.mkdir(parents=True, exist_ok=True) From 2420f6c53e4a939d1af3e5da1295ef623e07cd35 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Mon, 11 Dec 2023 13:09:36 -0600 Subject: [PATCH 53/69] remove unneeded comment --- gufe/storage/stagingregistry.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 6da50d23..708d1221 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -168,9 +168,6 @@ def register_path(self, staging_path: StagingPath): label_exists = self.external.exists(staging_path.label) fspath = Path(staging_path.fspath) - # TODO: what if the staging path is a directory? not sure that we - # have a way to know that; but not sure that adding it to the - # registry is right either if not fspath.parent.exists(): fspath.parent.mkdir(parents=True, exist_ok=True) From fe64baa60d6b928d1425cb0525eab87f710a1cd3 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Mon, 11 Dec 2023 13:42:11 -0600 Subject: [PATCH 54/69] yield Context instead of tuple also update for changes in other PR --- gufe/protocols/protocoldag.py | 6 +----- gufe/protocols/protocolunit.py | 6 +++--- gufe/storage/storagemanager.py | 9 ++++++++- gufe/tests/storage/test_storage_demo.py | 10 ++++------ gufe/tests/storage/test_storagemanager.py | 5 ++++- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/gufe/protocols/protocoldag.py b/gufe/protocols/protocoldag.py index 0ba68a56..27e107ac 100644 --- a/gufe/protocols/protocoldag.py +++ b/gufe/protocols/protocoldag.py @@ -480,11 +480,7 @@ def new_execute_DAG( # TODO: this is a terrible name attempt=attempt) with dag_ctx.running_unit( dag_label, unit.key, attempt=attempt - ) as (scratch, shared, perm): - # TODO: context manager should return context - context = Context(shared=shared, - scratch=scratch, - permanent=perm) + ) as context: _logger.info("Starting unit {label}") _logger.info(context) result = unit.execute( diff --git a/gufe/protocols/protocolunit.py b/gufe/protocols/protocolunit.py index 77e1f913..388cb2b4 100644 --- a/gufe/protocols/protocolunit.py +++ b/gufe/protocols/protocolunit.py @@ -23,7 +23,7 @@ GufeTokenizable, GufeKey, TOKENIZABLE_REGISTRY ) -from ..storage.stagingdirectory import StagingDirectory +from ..storage.stagingregistry import StagingRegistry @dataclass @@ -33,8 +33,8 @@ class Context: """ scratch: PathLike - shared: StagingDirectory - permanent: StagingDirectory + shared: StagingRegistry + permanent: StagingRegistry def _list_dependencies(inputs, cls): diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 4370b5b5..a569b4ab 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -12,6 +12,8 @@ from .stagingregistry import SharedStaging, PermanentStaging from .stagingregistry import StagingPath # typing +from gufe.protocols.protocolunit import Context + class StorageManager: def __init__( @@ -112,8 
+114,13 @@ def running_unit(self, dag_label, unit_label, **kwargs): scratch.mkdir(parents=True, exist_ok=True) shared = self.shared_staging / label permanent = self.permanent_staging / label + context = Context( + scratch=scratch, + shared=shared, + permanent=permanent + ) try: - yield scratch, shared, permanent + yield context finally: # import pdb; pdb.set_trace() # clean up after unit diff --git a/gufe/tests/storage/test_storage_demo.py b/gufe/tests/storage/test_storage_demo.py index f87444b8..7d688bd2 100644 --- a/gufe/tests/storage/test_storage_demo.py +++ b/gufe/tests/storage/test_storage_demo.py @@ -5,7 +5,7 @@ import gufe from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.storagemanager import StorageManager -from gufe.storage.stagingdirectory import StagingPath +from gufe.storage.stagingregistry import StagingPath from gufe.protocols.protocoldag import new_execute_DAG """ @@ -173,7 +173,7 @@ def assert_scratch(self, storage_manager): """ scratch = storage_manager.scratch_root keep_scratch = storage_manager.keep_scratch - del_empty_dirs = storage_manager.delete_empty_dirs + del_empty_dirs = not storage_manager.keep_empty_dirs assert scratch.is_dir() if keep_scratch: @@ -212,7 +212,6 @@ def u2_label(dag): def get_storage_manager(self, keep, tmp_path): keep_scr, keep_sta, keep_sha, empties = self._parse_keep(keep) - del_empty_dirs = not empties shared, permanent = self.get_shared_and_permanent() storage_manager = StorageManager( @@ -222,7 +221,7 @@ def get_storage_manager(self, keep, tmp_path): keep_scratch=keep_scr, keep_staging=keep_sta, keep_shared=keep_sha, - delete_empty_dirs=del_empty_dirs, + keep_empty_dirs=empties, ) return storage_manager @@ -303,7 +302,6 @@ def get_shared_and_permanent(self): def get_storage_manager(self, keep, tmp_path): keep_scr, keep_sta, keep_sha, empties = self._parse_keep(keep) - del_empty_dirs = not empties backend = FileStorage(tmp_path) storage_manager = StorageManager( 
scratch_root=tmp_path, @@ -312,7 +310,7 @@ def get_storage_manager(self, keep, tmp_path): keep_scratch=keep_scr, keep_staging=keep_sta, keep_shared=keep_sha, - delete_empty_dirs=del_empty_dirs, + keep_empty_dirs=empties, staging="", ) return storage_manager diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 779d5552..9be3b3af 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -71,9 +71,12 @@ def test_lifecycle(self, storage_manager, dag_units, tmp_path): for unit in dag_units: label = f"{dag_label}/{unit.key}" with dag_ctx.running_unit(dag_label, unit.key, attempt=0) as ( - scratch, shared, perm + context ): # import pdb; pdb.set_trace() + scratch = context.scratch + shared = context.shared + perm = context.permanent results.append(unit.run(scratch, shared, perm)) self.in_unit_asserts(storage_manager, label) self.after_unit_asserts(storage_manager, label) From 288e6ccbefe710039bd53f7ebae5d3bc7470dbe3 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Mon, 11 Dec 2023 14:47:47 -0600 Subject: [PATCH 55/69] ReproduceOldBehaviorStorageManager.from_old_args This will make it easier to test that we create the same old directories --- gufe/protocols/protocoldag.py | 58 +++++++++++++++++++++++++++------- gufe/tests/test_protocoldag.py | 5 +++ 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/gufe/protocols/protocoldag.py b/gufe/protocols/protocoldag.py index 27e107ac..04348341 100644 --- a/gufe/protocols/protocoldag.py +++ b/gufe/protocols/protocoldag.py @@ -364,6 +364,48 @@ def _scratch_loc(self, dag_label, unit_label, attempt): def make_label(self, dag_label, unit_label, attempt): return f"{dag_label}/shared_{unit_label}_attempt_{attempt}" + @classmethod + def from_old_args( + cls, + shared_basedir: PathLike, + scratch_basedir: PathLike, *, + keep_shared: bool = False, + keep_scratch: bool = False, + ): + """ + Create an new storage manager based on the old execute_DAG args. + + Parameters + ---------- + shared_basedir : Path + Filesystem path to use for shared space that persists across whole DAG + execution. Used by a `ProtocolUnit` to pass file contents to dependent + class:``ProtocolUnit`` instances. + scratch_basedir : Path + Filesystem path to use for `ProtocolUnit` `scratch` space. + keep_shared : bool + If True, don't remove shared directories for `ProtocolUnit`s after + the `ProtocolDAG` is executed. + keep_scratch : bool + If True, don't remove scratch directories for a `ProtocolUnit` after + it is executed. 
+ """ + # doing this here makes it easier to test than putting in + # execute_DAG + shared_basedir = Path(shared_basedir) + shared = FileStorage(shared_basedir.parent) + storage_manager = cls( + scratch_root=scratch_basedir, + shared_root=shared, + permanent_root=shared, + keep_scratch=keep_scratch, + keep_shared=keep_shared, + keep_staging=True, + keep_empty_dirs=True, + # staging=Path(""), # use the actual directories as the staging + ) + return storage_manager + def execute_DAG(protocoldag: ProtocolDAG, *, shared_basedir: PathLike, @@ -408,19 +450,13 @@ def execute_DAG(protocoldag: ProtocolDAG, *, # the directory given as shared_root is actually the directory for this # DAG; the "shared_root" for the storage manager is the parent. We'll # force permanent to be the same. - shared_basedir = Path(shared_basedir) - shared = FileStorage(shared_basedir.parent) - dag_label = shared_basedir.name - storage_manager = ReproduceOldBehaviorStorageManager( - scratch_root=scratch_basedir, - shared_root=shared, - permanent_root=shared, - keep_scratch=keep_scratch, + storage_manager = ReproduceOldBehaviorStorageManager.from_old_args( + shared_basedir=shared_basedir, + scratch_basedir=scratch_basedir, keep_shared=keep_shared, - keep_staging=True, - delete_empty_dirs=False, - # staging=Path(""), # use the actual directories as the staging + keep_scratch=keep_scratch ) + dag_label = shared_basedir.name return new_execute_DAG(protocoldag, dag_label, storage_manager, raise_error, n_retries) diff --git a/gufe/tests/test_protocoldag.py b/gufe/tests/test_protocoldag.py index d248ec30..81afb7fc 100644 --- a/gufe/tests/test_protocoldag.py +++ b/gufe/tests/test_protocoldag.py @@ -72,6 +72,11 @@ def writefile_dag(): return p.create(stateA=s1, stateB=s2, mapping={}) +class TestReproduceOldBehaviorStorageManager: + def test_context(self): + ... 
+ + @pytest.mark.parametrize('keep_shared', [False, True]) @pytest.mark.parametrize('keep_scratch', [False, True]) def test_execute_dag(tmpdir, keep_shared, keep_scratch, writefile_dag): From bc1f36fc008b3b9c2b1162e61b0c8b9248bee1b5 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Mon, 11 Dec 2023 17:47:06 -0600 Subject: [PATCH 56/69] tests for ReproduceOldBehaviorStorageManager.from_old_args --- gufe/protocols/protocoldag.py | 7 +++++-- gufe/tests/test_protocoldag.py | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/gufe/protocols/protocoldag.py b/gufe/protocols/protocoldag.py index 04348341..be6211f3 100644 --- a/gufe/protocols/protocoldag.py +++ b/gufe/protocols/protocoldag.py @@ -359,7 +359,10 @@ class ReproduceOldBehaviorStorageManager(StorageManager): # {dag_label}/scratch_{unit_label} and shared at # {dag_label}/shared_{unit_label}. def _scratch_loc(self, dag_label, unit_label, attempt): - return self.scratch_root / f"scratch_{unit_label}_attempt_{attempt}" + return ( + self.scratch_root + / f"{dag_label}/scratch_{unit_label}_attempt_{attempt}" + ) def make_label(self, dag_label, unit_label, attempt): return f"{dag_label}/shared_{unit_label}_attempt_{attempt}" @@ -402,7 +405,7 @@ def from_old_args( keep_shared=keep_shared, keep_staging=True, keep_empty_dirs=True, - # staging=Path(""), # use the actual directories as the staging + staging=Path(""), # use the actual directories as the staging ) return storage_manager diff --git a/gufe/tests/test_protocoldag.py b/gufe/tests/test_protocoldag.py index 81afb7fc..e5fe8752 100644 --- a/gufe/tests/test_protocoldag.py +++ b/gufe/tests/test_protocoldag.py @@ -7,6 +7,7 @@ import gufe from gufe.protocols import execute_DAG +from gufe.protocols.protocoldag import ReproduceOldBehaviorStorageManager class WriterUnit(gufe.ProtocolUnit): @@ -73,8 +74,37 @@ def writefile_dag(): class TestReproduceOldBehaviorStorageManager: - def test_context(self): - ... 
+ def test_context(self, tmp_path): + # check that the paths are the ones we expect + base = tmp_path / "working" + manager = ReproduceOldBehaviorStorageManager.from_old_args( + scratch_basedir=base, + shared_basedir=base + ) + dag_label = "dag" + unit_label = "unit" + expected_scratch = "working/dag/scratch_unit_attempt_0/scratch.txt" + expected_shared = "working/dag/shared_unit_attempt_0/shared.txt" + expected_perm = "working/dag/shared_unit_attempt_0/perm.txt" + with manager.running_dag(dag_label) as dag_ctx: + with dag_ctx.running_unit( + dag_label, unit_label, attempt=0 + ) as ctx: + scratch_f = ctx.scratch / "scratch.txt" + shared_f = ctx.shared / "shared.txt" + perm_f = ctx.permanent / "perm.txt" + + found_scratch = pathlib.Path(scratch_f).relative_to(tmp_path) + found_shared = pathlib.Path(shared_f.fspath).relative_to(tmp_path) + found_perm = pathlib.Path(perm_f.fspath).relative_to(tmp_path) + + assert str(found_scratch) == expected_scratch + assert str(found_shared) == expected_shared + assert str(found_perm) == expected_perm + # the label is the relative path to the base directory for a + # FileStorage + assert "working/" + shared_f.label == expected_shared + assert "working/" + perm_f.label == expected_perm @pytest.mark.parametrize('keep_shared', [False, True]) From aaa2aab0a77549f84dc745a596ab152c06d3997e Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 13 Dec 2023 16:28:48 -0600 Subject: [PATCH 57/69] StagingPath.fspath => StagingPath.as_path() Make a function that returns a pathlib.Path be the main thing, instead of the property that returns a string. Almost all usage of `.fspath` was to then wrap it in a pathlib.Path. This is more convenient for users. 
--- gufe/storage/stagingregistry.py | 18 +++++++++++------- gufe/storage/storagemanager.py | 8 +++++--- gufe/tests/storage/test_stagingregistry.py | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 6da50d23..34b623c7 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -92,7 +92,7 @@ def _delete_file_safe(self, file): def transfer_single_file_to_external(self, held_file: StagingPath): """Transfer a given file from staging into external storage """ - path = Path(held_file.fspath) + path = held_file.as_path() if not path.exists(): _logger.info(f"Found nonexistent path {path}, not " "transfering to external storage") @@ -117,7 +117,7 @@ def transfer_staging_to_external(self): ] def _delete_file(self, file: StagingPath): - path = Path(file.fspath) + path = file.as_path() if path.exists(): if not path.is_dir(): _logger.debug(f"Removing file '{file}'") @@ -166,7 +166,7 @@ def register_path(self, staging_path: StagingPath): the path to track """ label_exists = self.external.exists(staging_path.label) - fspath = Path(staging_path.fspath) + fspath = staging_path.as_path() # TODO: what if the staging path is a directory? 
not sure that we # have a way to know that; but not sure that adding it to the @@ -290,7 +290,7 @@ def _delete_file_safe(self, file): def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage - path = Path(held_file.fspath) + path = held_file.as_path() if not path.exists(): self._load_file_from_external(self.shared, held_file) @@ -336,13 +336,17 @@ def __eq__(self, other): def __hash__(self): return hash((self.root, self.path)) + def as_path(self): + """Return the pathlib.Path where this is staged""" + return Path(self._fspath) + @property - def fspath(self): + def _fspath(self): return str(self.root.staging_dir / self.path) def __fspath__(self): self.register() - return self.fspath + return self._fspath @property def label(self) -> str: @@ -350,7 +354,7 @@ def label(self) -> str: return str(self.path) def __repr__(self): - return f"StagingPath('{self.fspath}')" + return f"StagingPath('{self._fspath}')" # TODO: how much of the pathlib.Path interface do we want to wrap? # although edge cases may be a pain, we can get most of it with, e.g.: diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 4370b5b5..588f2852 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -91,9 +91,11 @@ def running_dag(self, dag_label): # self.shared_xfer - self.permanent_xfer; however, # StagedPaths have different staging registries. 
This gives # the set of paths we do want to delete - perm_xfer_paths = {p.fspath for p in self.permanent_xfer} - shared_xfer_to_delete = {p for p in self.shared_xfer - if p.fspath not in perm_xfer_paths} + perm_xfer_paths = {p.as_path() for p in self.permanent_xfer} + shared_xfer_to_delete = { + p for p in self.shared_xfer + if p.as_path() not in perm_xfer_paths + } for file in shared_xfer_to_delete: self.shared_root.delete(file.label) diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index e62ab697..de5508af 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -166,7 +166,7 @@ def test_read_old(self, root): # when we create the specific StagingPath, it registers and # "downloads" the file filepath = root / "old_unit/data.txt" - assert pathlib.Path(filepath.fspath) == on_filesystem + assert filepath.as_path() == on_filesystem assert not on_filesystem.exists() filepath.register() @@ -344,7 +344,7 @@ def test_delete_file_safe(self, tmp_path, is_safe): my_file = permanent / "foo.txt" # double check that we set things up correctly - assert (str(external_file_loc) != my_file.fspath) is is_safe + assert (str(external_file_loc) != my_file._fspath) is is_safe # test the code assert permanent._delete_file_safe(my_file) is is_safe From cf60d1b17876567f2c7d06193eba757896114e17 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Wed, 13 Dec 2023 16:41:44 -0600 Subject: [PATCH 58/69] pep8 --- gufe/storage/stagingregistry.py | 1 + gufe/storage/storagemanager.py | 1 - gufe/tests/storage/test_stagingregistry.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 34b623c7..51d2696b 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -11,6 +11,7 @@ import logging _logger = logging.getLogger(__name__) + def _safe_to_delete_file( external: ExternalStorage, path: PathLike diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 588f2852..ac030cb8 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -85,7 +85,6 @@ def running_dag(self, dag_label): if not self.keep_staging: self.permanent_staging.cleanup() - if not self.keep_shared: # we'd like to do something like loop over # self.shared_xfer - self.permanent_xfer; however, diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index de5508af..392d34d9 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -191,7 +191,7 @@ def test_write_new(self, root): @pytest.mark.xfail # Need test that read-only errors on new files def test_write_old_fail(self, root): old_staging = root._get_other_shared("old_unit") - staged = old_,tstaging / "foo.txt" + staged = old_staging / "foo.txt" with pytest.raises(IOError, match="read-only"): staged.__fspath__() From 4861b3ef32721ea7e7df47ee6f11b2644f561c3d Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Thu, 14 Dec 2023 13:08:18 -0600 Subject: [PATCH 59/69] Add tests for nested files in shared --- gufe/tests/storage/test_storage_demo.py | 32 ++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/gufe/tests/storage/test_storage_demo.py b/gufe/tests/storage/test_storage_demo.py index 7d688bd2..3661bb9f 100644 --- a/gufe/tests/storage/test_storage_demo.py +++ b/gufe/tests/storage/test_storage_demo.py @@ -25,6 +25,14 @@ def _execute(self, ctx): with open(share_file, mode='w') as f: f.write("I can be shared") + nested_file = ctx.shared / "nested" / "shared.txt" + with open(nested_file, mode='w') as f: + f.write("Nested files work as well") + + implicit_nested_file = ctx.shared / "implicit/nested.txt" + with open(implicit_nested_file, mode='w') as f: + f.write("Even if the new diretory is implicit") + perm_file = ctx.permanent / "permanent.txt" with open(perm_file, mode='w') as f: f.write("I'm permanent (but I can be shared)") @@ -150,13 +158,17 @@ def assert_shared_and_permanent(self, storage_manager, dag): perm_file = f"{u1_label}/permanent.txt" shared_file = f"{u1_label}/shared.txt" + nested_file = f"{u1_label}/nested/shared.txt" + implicit_nested_file = f"{u1_label}/implicit/nested.txt" assert list(permanent.iter_contents()) == [perm_file] with permanent.load_stream(perm_file) as f: assert f.read() == b"I'm permanent (but I can be shared)" if keep_shared: - assert list(shared.iter_contents()) == [shared_file, perm_file] + assert set(shared.iter_contents()) == { + shared_file, perm_file, nested_file, implicit_nested_file + } with shared.load_stream(shared_file) as f: assert f.read() == b"I can be shared" with shared.load_stream(perm_file) as f: @@ -197,9 +209,17 @@ def assert_staging(self, storage_manager, dag): if keep_staging: assert (u1_staging / "shared.txt").exists() assert (u1_staging / "permanent.txt").exists() + assert (u1_staging / "nested/shared.txt").exists() + assert (u1_staging / 
"implicit/nested.txt").exists() else: assert ".staging" not in list(scratch_root.iterdir()) + def assert_final_directories(self, storage_manager, dag): + if storage_manager.keep_empty_dirs: + ... + else: + ... + @staticmethod def u1_label(dag): """Unit 1 label""" @@ -270,12 +290,15 @@ def assert_shared_and_permanent(self, storage_manager, dag): perm_file = f"{u1_label}/permanent.txt" shared_file = f"{u1_label}/shared.txt" + nested_file = f"{u1_label}/nested/shared.txt" + implicit_nested_file = f"{u1_label}/implicit/nested.txt" assert shared is permanent # we'll test everything in permanent, because shared is identical if keep_shared: - expected = {perm_file, shared_file} + expected = {perm_file, shared_file, nested_file, + implicit_nested_file} else: expected = {perm_file} @@ -324,6 +347,8 @@ def assert_shared_and_permanent(self, storage_manager, dag): perm_file = f"{u1_label}/permanent.txt" shared_file = f"{u1_label}/shared.txt" + nested_file = f"{u1_label}/nested/shared.txt" + implicit_nested_file = f"{u1_label}/implicit/nested.txt" scratch_file = f"scratch/{u1_label}/scratch.txt" assert shared is permanent @@ -332,7 +357,8 @@ def assert_shared_and_permanent(self, storage_manager, dag): expected = {perm_file} if keep_shared: - expected.add(shared_file) + expected.update({shared_file, nested_file, + implicit_nested_file}) if keep_scratch: expected.add(scratch_file) From c0a37b3cd922e531d5bafaeba1ade6fef7496e69 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Thu, 14 Dec 2023 14:50:50 -0600 Subject: [PATCH 60/69] tests for empty directories --- gufe/tests/storage/test_storage_demo.py | 61 +++++++++++++++++++++---- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/gufe/tests/storage/test_storage_demo.py b/gufe/tests/storage/test_storage_demo.py index 3661bb9f..c7a3f147 100644 --- a/gufe/tests/storage/test_storage_demo.py +++ b/gufe/tests/storage/test_storage_demo.py @@ -214,11 +214,27 @@ def assert_staging(self, storage_manager, dag): else: assert ".staging" not in list(scratch_root.iterdir()) - def assert_final_directories(self, storage_manager, dag): - if storage_manager.keep_empty_dirs: - ... - else: - ... + def assert_empty_directories(self, storage_manager, dag): + """Check the final status for empty directories.""" + u1_label = self.u1_label(dag) + staging = storage_manager.scratch_root / storage_manager.staging + + directories = [ + staging / u1_label / "nested", + staging / u1_label / "implicit", + ] + + # name these conditions so the logic takes less thought + expected_empty = (storage_manager.keep_empty_dirs + and not storage_manager.keep_staging) + dir_exists = expected_empty or storage_manager.keep_staging + + for directory in directories: + assert directory.exists() == dir_exists + if dir_exists: + assert directory.is_dir() + actual_empty = len(list(directory.iterdir())) == 0 + assert actual_empty == expected_empty @staticmethod def u1_label(dag): @@ -245,19 +261,24 @@ def get_storage_manager(self, keep, tmp_path): ) return storage_manager + def execute(self, storage_manager, dag, dag_label): + result = new_execute_DAG(dag, dag_label, storage_manager, + raise_error=True, n_retries=2) + return result + @pytest.mark.parametrize('keep', [ 'nothing', 'scratch', 'staging', 'shared', 'scratch,staging', 'scratch,shared', 'staging,shared', 'scratch,staging,shared', - 'scratch,empties', 'scratch,shared,empties', + 'scratch,empties', 'scratch,shared,empties', 'staging,empties', ]) def 
test_execute_dag(self, demo_dag, keep, tmp_path): storage_manager = self.get_storage_manager(keep, tmp_path) dag_label = "dag" - result = new_execute_DAG(demo_dag, dag_label, storage_manager, - raise_error=True, n_retries=2) + result = self.execute(storage_manager, demo_dag, dag_label) self.assert_dag_result(result, demo_dag, storage_manager) + self.assert_empty_directories(storage_manager, demo_dag) self.assert_shared_and_permanent(storage_manager, demo_dag) self.assert_scratch(storage_manager) self.assert_staging(storage_manager, demo_dag) @@ -338,6 +359,11 @@ def get_storage_manager(self, keep, tmp_path): ) return storage_manager + def test_overlap_directories(storage_manager): + # test that the staging and shared/permanent backends overlap as + # expected + pytest.skip() + def assert_shared_and_permanent(self, storage_manager, dag): shared = storage_manager.shared_root permanent = storage_manager.permanent_root @@ -387,3 +413,22 @@ def assert_staging(self, storage_manager, dag): if keep_shared: assert (u1_staging / "shared.txt").exists() + + def assert_empty_directories(self, storage_manager, dag): + # in this case, the staging directories should not have been cleaned + # (because they overlap with storage), so all should exist. They + # will only be empty if `keep_shared is False` + u1_label = self.u1_label(dag) + staging = storage_manager.scratch_root / storage_manager.staging + + directories = [ + staging / u1_label / "nested", + staging / u1_label / "implicit", + ] + + for directory in directories: + assert directory.exists() + assert directory.is_dir() + expected_empty = not storage_manager.keep_shared + actual_empty = len(list(directory.iterdir())) == 0 + assert actual_empty == expected_empty From 3d7ca7b55de18e8e78ac08f92c4e40f9b827c9b8 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Thu, 14 Dec 2023 15:20:55 -0600 Subject: [PATCH 61/69] tests for the directories in the overlap example --- gufe/tests/storage/test_storage_demo.py | 26 +++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/gufe/tests/storage/test_storage_demo.py b/gufe/tests/storage/test_storage_demo.py index c7a3f147..a78b1dca 100644 --- a/gufe/tests/storage/test_storage_demo.py +++ b/gufe/tests/storage/test_storage_demo.py @@ -359,10 +359,26 @@ def get_storage_manager(self, keep, tmp_path): ) return storage_manager - def test_overlap_directories(storage_manager): + def test_overlap_directories(self, tmp_path): # test that the staging and shared/permanent backends overlap as - # expected - pytest.skip() + # expected; basic idea is that creating a file in staging + # automatically creates a file in the shared/permanent external + # resources + storage_manager = self.get_storage_manager(keep="nothing", + tmp_path=tmp_path) + shared = storage_manager.shared_root + perm = storage_manager.permanent_root + stage1 = storage_manager.shared_staging / "foo/file.txt" + assert len(storage_manager.shared_staging.registry) == 0 + assert list(shared.iter_contents()) == [] + assert list(perm.iter_contents()) == [] + pathlib.Path(stage1).touch() + assert len(storage_manager.shared_staging.registry) == 1 + # because iter_contents actually loops over the directory, these + # have effectively been automatically added to the external storage + # resources + assert list(shared.iter_contents()) == ["foo/file.txt"] + assert list(perm.iter_contents()) == ["foo/file.txt"] def assert_shared_and_permanent(self, storage_manager, dag): shared = storage_manager.shared_root @@ -417,7 +433,9 @@ def assert_staging(self, storage_manager, dag): def assert_empty_directories(self, storage_manager, dag): # in this case, the staging directories should not have been cleaned # (because they overlap with storage), so all should exist. 
They - # will only be empty if `keep_shared is False` + # will only be empty if `keep_shared is False`. NOTE: I don't think + # it would be an API break here if code changed to ensure that all + # empty directories were cleaned out u1_label = self.u1_label(dag) staging = storage_manager.scratch_root / storage_manager.staging From c4126116865bea878e937211a88cd6694ba7b81c Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Thu, 14 Dec 2023 15:37:42 -0600 Subject: [PATCH 62/69] update for changes in preceding PRs --- gufe/storage/stagingserialization.py | 2 +- gufe/tests/storage/test_stagingserialization.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gufe/storage/stagingserialization.py b/gufe/storage/stagingserialization.py index 72a6fc48..99deef60 100644 --- a/gufe/storage/stagingserialization.py +++ b/gufe/storage/stagingserialization.py @@ -1,6 +1,6 @@ from gufe.tokenization import JSON_HANDLER from gufe.custom_json import JSONCodec, JSONSerializerDeserializer -from .stagingdirectory import StagingPath +from .stagingregistry import StagingPath class StagingPathSerialization: diff --git a/gufe/tests/storage/test_stagingserialization.py b/gufe/tests/storage/test_stagingserialization.py index 47cc3ed1..e5df650b 100644 --- a/gufe/tests/storage/test_stagingserialization.py +++ b/gufe/tests/storage/test_stagingserialization.py @@ -1,7 +1,7 @@ import pytest from gufe.storage.stagingserialization import StagingPathSerialization -from gufe.storage.stagingdirectory import StagingPath +from gufe.storage.stagingregistry import StagingPath from gufe.storage.storagemanager import StorageManager from gufe.storage.externalresource import MemoryStorage, FileStorage @@ -92,7 +92,7 @@ def test_reload_file_contents(self, pathtype, request): path = request.getfixturevalue(f"{pathtype}_path") # remove the file (remains in the MemoryStorage) - p = pathlib.Path(path.fspath) + p = path.as_path() assert p.exists() p.unlink() assert not 
p.exists() @@ -228,13 +228,13 @@ def test_two_different_permanent_storages(self, tmp_path): json_str2 = json.dumps(path2, cls=handler2.encoder) # delete all staged files - assert pathlib.Path(path1.fspath).exists() + assert path1.as_path().exists() manager1.permanent_staging.cleanup() - assert not pathlib.Path(path1.fspath).exists() + assert not path1.as_path().exists() - assert pathlib.Path(path2.fspath).exists() + assert path2.as_path().exists() manager2.permanent_staging.cleanup() - assert not pathlib.Path(path2.fspath).exists() + assert not path2.as_path().exists() # reload and check contents of both permanent files reloaded1 = json.loads(json_str1, cls=handler1.decoder) @@ -242,12 +242,12 @@ def test_two_different_permanent_storages(self, tmp_path): assert isinstance(reloaded1, StagingPath) assert reloaded1.label == path1.label - assert not pathlib.Path(reloaded1.fspath).exists() + assert not reloaded1.as_path().exists() with open(reloaded1, mode='r') as f: assert f.read() == "contents 1" assert isinstance(reloaded2, StagingPath) assert reloaded2.label == path2.label - assert not pathlib.Path(reloaded2.fspath).exists() + assert not reloaded2.as_path().exists() with open(reloaded2, mode='r') as f: assert f.read() == "contents 2" From bf49c345ae858e9f87fd90f0b7d4a3899d53f8e6 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Thu, 14 Dec 2023 16:51:45 -0600 Subject: [PATCH 63/69] First version of the change storage backend story --- .../storage/test_stagingserialization.py | 55 +++++++++++++++++-- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/gufe/tests/storage/test_stagingserialization.py b/gufe/tests/storage/test_stagingserialization.py index e5df650b..bd0181df 100644 --- a/gufe/tests/storage/test_stagingserialization.py +++ b/gufe/tests/storage/test_stagingserialization.py @@ -196,10 +196,10 @@ def test_permanent_storage_moved(self, move, tmp_path, monkeypatch): assert contents == "contents here" def test_two_different_permanent_storages(self, tmp_path): - # I'm working with files from two different permanent storages. I - # need to be able to load from both in the same Python process. - # (NOTE: this user story is primarily to prevent us from changing to - # a solution based on global/class vars to set context.) + # USER STORY: I'm working with files from two different permanent + # storages. I need to be able to load from both in the same Python + # process. (NOTE: this user story is primarily to prevent us from + # changing to a solution based on global/class vars to set context.) manager1 = StorageManager( scratch_root=tmp_path / "working1", shared_root=MemoryStorage(), @@ -251,3 +251,50 @@ def test_two_different_permanent_storages(self, tmp_path): assert not reloaded2.as_path().exists() with open(reloaded2, mode='r') as f: assert f.read() == "contents 2" + + def test_change_storage_backend(self, tmp_path): + # USER STORY: I have generated data in one backend, and I transferred + # it to another backend. It needs to be readable from the other + # backend. (Use case: data is in long-term cloud storage that + # requires credentials, but I want to share some part of that data + # with someone else by transferring it to a disk.)
+ cloud_manager = StorageManager( + scratch_root=tmp_path / "cloud", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage(), + ) + cloud_serialization = StagingPathSerialization(cloud_manager) + + local_manager = StorageManager( + scratch_root=tmp_path / "local_scratch", + shared_root=MemoryStorage(), + permanent_root=FileStorage(tmp_path / "local_perm"), + ) + local_serialization = StagingPathSerialization(local_manager) + + # TODO: maybe add some more safety asserts in here? that each step + # goes as expected, to better diagnose potential failures? + # load data into the cloud storage + with cloud_manager.running_dag("dag") as dag_ctx: + with dag_ctx.running_unit("dag", "unit", attempt=0) as ctx: + cloud_path = ctx.permanent / "data.txt" + with open(cloud_path, mode='w') as f: + f.write("will store on cloud") + + # serialize the cloud_path (assume it is saved somewhere) + serialized = json.dumps(cloud_path, cls=cloud_serialization.encoder) + + # transfer from cloud storage to the local_manager + for label in cloud_manager.permanent_root.iter_contents("dag/unit"): + with cloud_manager.permanent_root.load_stream(label) as f: + local_manager.permanent_root.store_bytes(label, f.read()) + + # ensure that we can reload objects from the local manager + local_path = json.loads(serialized, cls=local_serialization.decoder) + + assert local_path != cloud_path + + with open(local_path, mode='r') as f: + contents = f.read() + + assert contents == "will store on cloud" From 10a40eb6aac37e8b9d4069cee5054b423e4175e5 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 15 Dec 2023 15:26:28 -0600 Subject: [PATCH 64/69] docstrings; sketch out yet ANOTHER user story --- gufe/storage/stagingserialization.py | 70 ++++++++++++++++--- .../storage/test_stagingserialization.py | 14 ++++ 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/gufe/storage/stagingserialization.py b/gufe/storage/stagingserialization.py index 99deef60..39ead2b9 100644 --- a/gufe/storage/stagingserialization.py +++ b/gufe/storage/stagingserialization.py @@ -4,13 +4,21 @@ class StagingPathSerialization: - # TODO: where should this go? I think maybe on the storage manager + """Class for managing serialization of a :class:`.StagingPath`. + Serialization of a :class:`.StagingPath` needs to strip the specific + storage context (path to external files storage) because we should be able + to change that out-of-process (e.g., move the directory containing + results) and still be able to deserialize correctly. This class is + responsible for abstracting/injecting the storage context when a + :class:`.StagingPath` is serialized/deserialized. + """ + # TODO: this long comment should probably go somewhere where it will + # show up in docs as well? Maybe just bump it into the class docstring? + # # Serializing staging paths # ------------------------- # - # Some important user stories to consider: - # # 1. I am loading my results object, and I will want to use the # associated files. This should be transparent, regardless of where # the permanent storage is located. @@ -22,6 +30,16 @@ class StagingPathSerialization: # pwd. # 4. I'm working with files from two different permanent storages. I # need to be able to load from both in the same Python process. + # 5. I have generated data in one backend, and I transferred it to + # another backend. It needs to be readable from the other backend.
+ # (Use case: data is in long-term cloud storage that requires + # credentials, but I want to share some part of that data with + # someone else by transferring it to a disk.) + # 6. I am interfacing with a package that adds serialization types to + # the gufe JSON_HANDLER via an external JSONCodec. Maybe, in the + # worst case, the external codec gets added *after* I've created my + # serialization object. I need to be able to serialize those custom + # types. # # Outputs from a protocol may contain a :class:`.StagingPath`. Note that # a :class:`.StagingPath` is inherently not a complete description of @@ -43,14 +61,16 @@ class StagingPathSerialization: # directory. However, the reference to the file can exist in the results # object without downloading the file. # - # User stories 3 and 4 are handled by this + # User stories 3--6 are handled by this # :class:`.StagingPathSerialization` class. Story 3 is handled by # allowing the appropriate context (in the form of a # :class:`.StorageManager`) to be injected into the deserialization # process. Story 4 can be handled by using more than one # :class:`.StagingPathSerialization` context (associated with different - # :class:`.StorageManager` objects. - + # :class:`.StorageManager` objects. Story 5 is handled by injecting + # the appropriate context (and, in principle, is a variant of story 3.) + # Story 6 is handled by doing a just-in-time generation of the + # JSONSerializerDeserializer that we use for this class. 
def __init__(self, manager): self.manager = manager self.codec = JSONCodec( @@ -61,35 +81,69 @@ def __init__(self, manager): self.refresh_handler() def refresh_handler(self): + """Ensure that the current handler includes all registered codecs""" codecs = JSON_HANDLER.codecs + [self.codec] self.json_handler = JSONSerializerDeserializer(codecs) @property def encoder(self): + """ + JSONEncoder class to use when serializing a :class:`.StagingPath` + """ + self.refresh_handler() return self.json_handler.encoder @property def decoder(self): + """ + JSONdecoder class to use when deserializing a :class:`.StagingPath` + """ + self.refresh_handler() return self.json_handler.decoder - def to_dict(self, path): + def to_dict(self, path: StagingPath): + """ + Dict representation of a StagingPath, abstracting specific context. + + This provides a JSON-serializable representation of a StagingPath + where the specific context of the StagingPath (the specific storage + backend where it is located) is replaced by a generic representation + of 'scratch', 'shared', or 'permanent', allowing a new specific + context to be injected on deserialization. + """ # scratch, shared, permanent may form nested with progressively # smaller contexts, so the last of those it is in is where it should # be labelled. TODO: opportunity for performance improvement if # needed loc = None if path.label in self.manager.scratch_root.iterdir(): + # TODO: does this happen? we should only trigger this function + # on a StagingPath, and anything in scratch will only be + # pathlib.Path, right? loc = "scratch" if path.label in self.manager.shared_root.iter_contents(): loc = "shared" if path.label in self.manager.permanent_root.iter_contents(): loc = "permanent" + if loc is None: + raise RuntimeError( + f"Unable to serialize {path}: it does not appear to be " + "associated with storage managed by the context manager " + f"{self.manager}." 
+ ) + return { ':container:': loc, ':label:': path.label, } - def from_dict(self, dct): + def from_dict(self, dct: dict) -> StagingPath: + """Recreate a StagingPath from its dict represnetation. + + This undoes the process from :method:`.to_dict`. It injects the + storage context in ``self.storage_manager`` into the deserialized + :class:`.StagingPath` instance. + """ staging = getattr(self.manager, f"{dct[':container:']}_staging") return staging / dct[':label:'] diff --git a/gufe/tests/storage/test_stagingserialization.py b/gufe/tests/storage/test_stagingserialization.py index bd0181df..d41256be 100644 --- a/gufe/tests/storage/test_stagingserialization.py +++ b/gufe/tests/storage/test_stagingserialization.py @@ -298,3 +298,17 @@ def test_change_storage_backend(self, tmp_path): contents = f.read() assert contents == "will store on cloud" + + def test_requires_new_codec(self, tmp_path): + # USER STORY: I am interfacing with a package that adds + # serialization types to the gufe JSON_HANDLER via an external + # JSONCodec. Maybe, in the worst case, the external codec gets added + # *after* I've created my serialization object. I need to be able to + # serialize those custom types. (NOTE: A better solution here is to + # have JSONCodecs also include some codec identifier in their + # `:is_custom:` field. That would allow us to dynamically add any + # missing codec, and only to do so when deserialization is needed. + # This is a change to the custom JSON stuff which hasn't been made + # yet. This might also allow faster deserialization by having + # :is_custom: map to something that can be used in a dispatch table.) + # TODO: implement test based on this user story From 4b2510cb7cf3ed4010192e93ac28e03cc5453ef6 Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Fri, 15 Dec 2023 15:50:47 -0600 Subject: [PATCH 65/69] minor docs update --- gufe/storage/stagingserialization.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gufe/storage/stagingserialization.py b/gufe/storage/stagingserialization.py index 39ead2b9..c4db2df7 100644 --- a/gufe/storage/stagingserialization.py +++ b/gufe/storage/stagingserialization.py @@ -6,6 +6,11 @@ class StagingPathSerialization: """Class for managing serialization of a :class:`.StagingPath`. + This class is only created internally. Developers of executors will + interface with this indirectly through the :class:`.StorageManager`; the + expectation is that the only thing they will need is access to the + ``encoder`` and ``decoder`` properties. + Serialization of a :class:`.StagingPath` needs to strip the specific storage context (path to external files storage) because we should able to change that out-of-process (e.g., move the directory containing @@ -139,7 +144,7 @@ def to_dict(self, path: StagingPath): } def from_dict(self, dct: dict) -> StagingPath: - """Recreate a StagingPath from its dict represnetation. + """Recreate a StagingPath from its dict representation. This undoes the process from :method:`.to_dict`. It injects the storage context in ``self.storage_manager`` into the deserialized From 3390bb0273a5efaa3b8c80f4dfff606816e32198 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Thu, 21 Dec 2023 17:44:27 -0500 Subject: [PATCH 66/69] test new codec added in staging serialization Adds a user story test where a type supported by a custom JSON codec is added to the result dict, and that codec isn't registered as of serialization object creation. 
--- .../storage/test_stagingserialization.py | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/gufe/tests/storage/test_stagingserialization.py b/gufe/tests/storage/test_stagingserialization.py index d41256be..55c61ff0 100644 --- a/gufe/tests/storage/test_stagingserialization.py +++ b/gufe/tests/storage/test_stagingserialization.py @@ -5,6 +5,9 @@ from gufe.storage.storagemanager import StorageManager from gufe.storage.externalresource import MemoryStorage, FileStorage +from gufe.tokenization import GufeTokenizable, from_dict +from gufe.custom_json import JSONCodec + import json import pathlib import shutil @@ -53,6 +56,15 @@ def serialization_handler(storage_manager): return StagingPathSerialization(storage_manager) +class NewType: + # used in new codec test (putting as a nested class required fancier + # serialization approaches, easier to put it at module level) + # class where any instance is equivalent (carries no data) + def __eq__(self, other): + return isinstance(other, self.__class__) + + + class TestStagingPathSerialization: @pytest.mark.parametrize('pathtype', ['scratch', 'shared', 'permanent']) def test_round_trip(self, serialization_handler, pathtype, request): @@ -311,4 +323,44 @@ def test_requires_new_codec(self, tmp_path): # This is a change to the custom JSON stuff which hasn't been made # yet. This might also allow faster deserialization by having # :is_custom: map to something that can be used in a dispatch table.) - # TODO: implement test based on this user story + manager = StorageManager( + scratch_root=tmp_path / "working", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage(), + ) + serialization = StagingPathSerialization(manager) + + # add a new custom codec for serialization + new_type_codec = JSONCodec( + cls=NewType, + to_dict=lambda obj: {}, + from_dict=lambda dct: NewType(), + ) + + # Create a dict to serialize; this represents the output dict that + # might come from a unit_result object. 
NB: including the file stuff + # here is actually extraneous (unless implementation changes + # significantly). + with manager.running_dag("dag") as dag_ctx: + with manager.running_unit("dag", "unit", attempt=0) as context: + file = context.permanent / "dag/unit/file.txt" + with open(file, mode='w') as f: + f.write("contents") + + output_dict = { + 'new_type_result': NewType(), + 'file_result': file, + } + + # before codec registration, error as not JSON serializable + with pytest.raises(TypeError, match="not JSON serializable"): + _ = json.dumps(output_dict, cls=serialization.encoder) + + # register codec and it works + from gufe.tokenization import JSON_HANDLER + JSON_HANDLER.add_codec(new_type_codec) + dumped = json.dumps(output_dict, cls=serialization.encoder) + + reloaded = json.loads(dumped, cls=serialization.decoder) + + assert reloaded == output_dict From da9955e713efaa8a61a67962d080ea839ffc72ef Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 22 Dec 2023 10:43:54 -0500 Subject: [PATCH 67/69] remove fspath from StagingRegistry This is to avoid likely footguns related to using os.path.join. Includes test of error on os.path.join. --- gufe/storage/stagingregistry.py | 5 +---- gufe/tests/storage/test_stagingregistry.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 51d2696b..40b311a9 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -32,7 +32,7 @@ def _safe_to_delete_file( class StagingRegistry: - """PathLike local representation of an :class:`.ExternalStorage`. + """Local representation of an :class:`.ExternalStorage`. This connects objects on a local filesystem to the key-value store of a (possibly remote) :class:`.ExternalStorage`. 
It presents a FileLike @@ -202,9 +202,6 @@ def _load_file_from_external(self, external: ExternalStorage, def __truediv__(self, path: Union[PathLike, str]): return StagingPath(root=self, path=path) - def __fspath__(self): - return str(self.staging_dir) - def __repr__(self): return ( f"{self.__class__.__name__}('{self.scratch}', {self.external})" diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index 392d34d9..d5e5d71e 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -42,7 +42,7 @@ def read_only_with_overwritten(root_with_contents): delete_staging=root_with_contents.delete_staging, read_only=True ) - filename = pathlib.Path(read_only) / "old_unit/data.txt" + filename = (read_only / "old_unit/data.txt").as_path() assert not filename.exists() staged = read_only / "old_unit/data.txt" assert not filename.exists() @@ -140,6 +140,12 @@ def test_repr(self, root): assert r.startswith("SharedStaging") assert "MemoryStorage" in r + def test_fspath_fail(self, root): + # ensure that we get an error on os.path.join (or really, anything + # that hits os.fspath) + with pytest.raises(TypeError): + os.path.join(root, "filename.txt") + @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] ]) @@ -266,8 +272,7 @@ def test_transfer_read_only(self, read_only_with_overwritten, caplog): def test_cleanup(self, root_with_contents): root_with_contents.delete_staging = True # slightly naughty - root_path = pathlib.Path(root_with_contents.__fspath__()) - path = root_path / "new_unit/data.txt" + path = (root_with_contents / "new_unit/data.txt").as_path() assert path.exists() root_with_contents.cleanup() assert not path.exists() @@ -303,7 +308,7 @@ def test_cleanup_directory(self, root, caplog): assert "During staging cleanup, the directory" in caplog.text def test_register_cleanup_preexisting_file(self, root): - filename = pathlib.Path(root.__fspath__()) / 
"new_unit/foo.txt" + filename = (root / "new_unit/foo.txt").as_path() filename.parent.mkdir(parents=True, exist_ok=True) filename.touch() root.external.store_bytes("new_unit/foo.txt", b"") @@ -350,7 +355,7 @@ def test_delete_file_safe(self, tmp_path, is_safe): assert permanent._delete_file_safe(my_file) is is_safe def test_load_missing_for_transfer(self, permanent): - fname = pathlib.Path(permanent) / "old_unit/data.txt" + fname = (permanent / "old_unit/data.txt").as_path() assert not fname.exists() staging = permanent / "old_unit/data.txt" staging.__fspath__() From b22445bb3e043d13849a9eba0c3ae32e25799093 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 22 Dec 2023 11:57:49 -0500 Subject: [PATCH 68/69] Attach StorageSerialization to StorageManager --- gufe/storage/storagemanager.py | 11 +++++++ .../storage/test_stagingserialization.py | 30 +++++++------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 08f83e6a..ef95286a 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -10,6 +10,7 @@ from .externalresource import ExternalStorage, FileStorage from .stagingregistry import SharedStaging, PermanentStaging +from .stagingserialization import StagingPathSerialization from .stagingregistry import StagingPath # typing from gufe.protocols.protocolunit import Context @@ -57,6 +58,16 @@ def __init__( keep_empty_dirs=keep_empty_dirs, ) + self._serialization = StagingPathSerialization(self) + + @property + def json_encoder(self): + return self._serialization.encoder + + @property + def json_decoder(self): + return self._serialization.decoder + def make_label(self, dag_label, unit_label, attempt, **kwargs): """ diff --git a/gufe/tests/storage/test_stagingserialization.py b/gufe/tests/storage/test_stagingserialization.py index 55c61ff0..f36de19a 100644 --- a/gufe/tests/storage/test_stagingserialization.py +++ 
b/gufe/tests/storage/test_stagingserialization.py @@ -157,7 +157,6 @@ def test_permanent_storage_moved(self, move, tmp_path, monkeypatch): shared_root=FileStorage("old/shared"), permanent_root=FileStorage("old/permanent") ) - old_handler = StagingPathSerialization(old_manager) old_path = old_manager.permanent_staging / "dag/unit/result.txt" with open(old_path, mode='w') as f: f.write("contents here") @@ -167,7 +166,7 @@ def test_permanent_storage_moved(self, move, tmp_path, monkeypatch): assert perm_p.exists() # serialize the path object - json_str = json.dumps(old_path, cls=old_handler.encoder) + json_str = json.dumps(old_path, cls=old_manager.json_encoder) # move the storage subdirectory; create a new, associated storage # manager/serialization handler @@ -192,10 +191,8 @@ def test_permanent_storage_moved(self, move, tmp_path, monkeypatch): raise RuntimeWarning(f"Bad test parameter '{move}': should be " "'relative' or 'absolute'") - new_handler = StagingPathSerialization(new_manager) - # deserialize the path using the new serialization handler - reloaded = json.loads(json_str, cls=new_handler.decoder) + reloaded = json.loads(json_str, cls=new_manager.json_decoder) # ensure that the path exists and that the data can be reloaded assert isinstance(reloaded, StagingPath) @@ -222,8 +219,6 @@ def test_two_different_permanent_storages(self, tmp_path): shared_root=MemoryStorage(), permanent_root=MemoryStorage(), ) - handler1 = StagingPathSerialization(manager1) - handler2 = StagingPathSerialization(manager2) path1 = manager1.permanent_staging / "file1.txt" with open(path1, mode='w') as f: @@ -236,8 +231,8 @@ def test_two_different_permanent_storages(self, tmp_path): manager2.permanent_staging.transfer_staging_to_external() # serialize the paths - json_str1 = json.dumps(path1, cls=handler1.encoder) - json_str2 = json.dumps(path2, cls=handler2.encoder) + json_str1 = json.dumps(path1, cls=manager1.json_encoder) + json_str2 = json.dumps(path2, cls=manager2.json_encoder) # 
delete all staged files assert path1.as_path().exists() @@ -249,8 +244,8 @@ def test_two_different_permanent_storages(self, tmp_path): assert not path2.as_path().exists() # reload and check contents of both permanent files - reloaded1 = json.loads(json_str1, cls=handler1.decoder) - reloaded2 = json.loads(json_str2, cls=handler2.decoder) + reloaded1 = json.loads(json_str1, cls=manager1.json_decoder) + reloaded2 = json.loads(json_str2, cls=manager2.json_decoder) assert isinstance(reloaded1, StagingPath) assert reloaded1.label == path1.label @@ -275,14 +270,12 @@ def test_change_storage_backend(self, tmp_path): shared_root=MemoryStorage(), permanent_root=MemoryStorage(), ) - cloud_serialization = StagingPathSerialization(cloud_manager) local_manager = StorageManager( scratch_root=tmp_path / "local_scratch", shared_root=MemoryStorage(), permanent_root=FileStorage(tmp_path / "local_perm"), ) - local_serialization = StagingPathSerialization(local_manager) # TODO: maybe add some more safety asserts in here? that each step # goes as expected, to better diagnose potential failures? 
@@ -294,7 +287,7 @@ def test_change_storage_backend(self, tmp_path): f.write("will store on cloud") # serialize the cloud_path (assume it is saved somewhere) - serialized = json.dumps(cloud_path, cls=cloud_serialization.encoder) + serialized = json.dumps(cloud_path, cls=cloud_manager.json_encoder) # transfer from cloud storage to the local_manager for label in cloud_manager.permanent_root.iter_contents("dag/unit"): @@ -302,7 +295,7 @@ def test_change_storage_backend(self, tmp_path): local_manager.permanent_root.store_bytes(label, f.read()) # ensure that we can reload objects from the local manager - local_path = json.loads(serialized, cls=local_serialization.decoder) + local_path = json.loads(serialized, cls=local_manager.json_decoder) assert local_path != cloud_path @@ -328,7 +321,6 @@ def test_requires_new_codec(self, tmp_path): shared_root=MemoryStorage(), permanent_root=MemoryStorage(), ) - serialization = StagingPathSerialization(manager) # add a new custom codec for serialization new_type_codec = JSONCodec( @@ -354,13 +346,13 @@ def test_requires_new_codec(self, tmp_path): # before codec registration, error as not JSON serializable with pytest.raises(TypeError, match="not JSON serializable"): - _ = json.dumps(output_dict, cls=serialization.encoder) + _ = json.dumps(output_dict, cls=manager.json_encoder) # register codec and it works from gufe.tokenization import JSON_HANDLER JSON_HANDLER.add_codec(new_type_codec) - dumped = json.dumps(output_dict, cls=serialization.encoder) + dumped = json.dumps(output_dict, cls=manager.json_encoder) - reloaded = json.loads(dumped, cls=serialization.decoder) + reloaded = json.loads(dumped, cls=manager.json_decoder) assert reloaded == output_dict From 8534d71ecd21db392ec5e734f1dc6a980ae8e95a Mon Sep 17 00:00:00 2001 From: "David W.H. 
Swenson" Date: Wed, 17 Jan 2024 13:24:31 -0600 Subject: [PATCH 69/69] update for some review comments --- gufe/protocols/protocoldag.py | 2 +- gufe/protocols/protocolunit.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gufe/protocols/protocoldag.py b/gufe/protocols/protocoldag.py index be6211f3..41b8f529 100644 --- a/gufe/protocols/protocoldag.py +++ b/gufe/protocols/protocoldag.py @@ -520,7 +520,7 @@ def new_execute_DAG( # TODO: this is a terrible name with dag_ctx.running_unit( dag_label, unit.key, attempt=attempt ) as context: - _logger.info("Starting unit {label}") + _logger.info(f"Starting unit {label}") _logger.info(context) result = unit.execute( context=context, diff --git a/gufe/protocols/protocolunit.py b/gufe/protocols/protocolunit.py index 388cb2b4..8e99e97a 100644 --- a/gufe/protocols/protocolunit.py +++ b/gufe/protocols/protocolunit.py @@ -23,7 +23,7 @@ GufeTokenizable, GufeKey, TOKENIZABLE_REGISTRY ) -from ..storage.stagingregistry import StagingRegistry +from ..storage.stagingregistry import StagingPath @dataclass @@ -33,8 +33,8 @@ class Context: """ scratch: PathLike - shared: StagingRegistry - permanent: StagingRegistry + shared: StagingPath + permanent: StagingPath def _list_dependencies(inputs, cls):