diff --git a/aiida/common/hashing.py b/aiida/common/hashing.py index 411cbd0bc7..ee8600bc73 100644 --- a/aiida/common/hashing.py +++ b/aiida/common/hashing.py @@ -41,9 +41,6 @@ HASHING_KEY = 'HashingKey' -# The key that is used to store the hash in the node extras -_HASH_EXTRA_KEY = '_aiida_hash' - ################################################################### # THE FOLLOWING WAS TAKEN FROM DJANGO BUT IT CAN BE EASILY REPLACED ################################################################### diff --git a/aiida/orm/nodes/node.py b/aiida/orm/nodes/node.py index 1a68ea3a2e..36a600c155 100644 --- a/aiida/orm/nodes/node.py +++ b/aiida/orm/nodes/node.py @@ -32,7 +32,7 @@ from aiida.common import exceptions from aiida.common.escaping import sql_string_match -from aiida.common.hashing import _HASH_EXTRA_KEY, make_hash +from aiida.common.hashing import make_hash from aiida.common.lang import classproperty, type_check from aiida.common.links import LinkType from aiida.manage.manager import get_manager @@ -122,6 +122,10 @@ class Node( """ # pylint: disable=too-many-public-methods + # The keys in the extras that are used to store the hash of the node and whether it should be used in caching. 
+ _HASH_EXTRA_KEY: str = '_aiida_hash' + _VALID_CACHE_KEY: str = '_aiida_valid_cache' + # added by metaclass _plugin_type_string: ClassVar[str] _query_type_string: ClassVar[str] @@ -742,7 +746,7 @@ def _store(self, with_transaction: bool = True, clean: bool = True) -> 'Node': self._backend_entity.store(links, with_transaction=with_transaction, clean=clean) self._incoming_cache = [] - self._backend_entity.set_extra(_HASH_EXTRA_KEY, self.get_hash()) + self._backend_entity.set_extra(self._HASH_EXTRA_KEY, self.get_hash()) return self @@ -848,11 +852,11 @@ def _get_objects_to_hash(self) -> List[Any]: def rehash(self) -> None: """Regenerate the stored hash of the Node.""" - self.set_extra(_HASH_EXTRA_KEY, self.get_hash()) + self.set_extra(self._HASH_EXTRA_KEY, self.get_hash()) def clear_hash(self) -> None: """Sets the stored hash of the Node to None.""" - self.set_extra(_HASH_EXTRA_KEY, None) + self.set_extra(self._HASH_EXTRA_KEY, None) def get_cache_source(self) -> Optional[str]: """Return the UUID of the node that was used in creating this node from the cache, or None if it was not cached. 
@@ -905,22 +909,38 @@ def _iter_all_same_nodes(self, allow_before_store=False) -> Iterator['Node']: """ if not allow_before_store and not self.is_stored: raise exceptions.InvalidOperation('You can get the hash only after having stored the node') + node_hash = self._get_hash() if not node_hash or not self._cachable: return iter(()) builder = QueryBuilder(backend=self.backend) - builder.append(self.__class__, filters={'extras._aiida_hash': node_hash}, project='*', subclassing=False) - nodes_identical = (n[0] for n in builder.iterall()) + builder.append(self.__class__, filters={f'extras.{self._HASH_EXTRA_KEY}': node_hash}, subclassing=False) - return (node for node in nodes_identical if node.is_valid_cache) + return (node for node in builder.all(flat=True) if node.is_valid_cache) # type: ignore[misc,union-attr] @property def is_valid_cache(self) -> bool: - """Hook to exclude certain `Node` instances from being considered a valid cache.""" - # pylint: disable=no-self-use - return True + """Hook to exclude certain ``Node`` classes from being considered a valid cache. + + The base class assumes that all node instances are valid to cache from, unless the ``_VALID_CACHE_KEY`` extra + has been set to ``False`` explicitly. Subclasses can override this property with more specific logic, but should + probably also consider the value returned by this base class. + """ + return self.get_extra(self._VALID_CACHE_KEY, True) + + @is_valid_cache.setter + def is_valid_cache(self, valid: bool) -> None: + """Set whether this node instance is considered valid for caching or not. + + If a node instance has this property set to ``False``, it will never be used in the caching mechanism, unless + the subclass overrides the ``is_valid_cache`` property and ignores its implementation completely. + + :param valid: whether the node is valid or invalid for use in caching. 
+ """ + type_check(valid, bool) + self.set_extra(self._VALID_CACHE_KEY, valid) def get_description(self) -> str: """Return a string with a description of the node. diff --git a/aiida/orm/nodes/process/process.py b/aiida/orm/nodes/process/process.py index e0582af59b..c61b874ca8 100644 --- a/aiida/orm/nodes/process/process.py +++ b/aiida/orm/nodes/process/process.py @@ -496,6 +496,14 @@ def is_valid_cache(self) -> bool: return is_valid_cache_func(self) + @is_valid_cache.setter + def is_valid_cache(self, valid: bool) -> None: + """Set whether this node instance is considered valid for caching or not. + + :param valid: whether the node is valid or invalid for use in caching. + """ + super().is_valid_cache = valid # type: ignore[misc] + def _get_objects_to_hash(self) -> List[Any]: """ Return a list of objects which should be included in the hash. diff --git a/docs/source/howto/run_codes.rst b/docs/source/howto/run_codes.rst index b525827d89..110381da05 100644 --- a/docs/source/howto/run_codes.rst +++ b/docs/source/howto/run_codes.rst @@ -497,7 +497,7 @@ Besides the on/off switch set by ``caching.default_enabled``, caching can be con caching.default_enabled profile True caching.disabled_for profile aiida.calculations:core.templatereplacer caching.enabled_for profile aiida.calculations:quantumespresso.pw - aiida.calculations:other + aiida.calculations:other In this example, caching is enabled by default, but explicitly disabled for calculations of the ``TemplatereplacerCalculation`` class, identified by its corresponding ``aiida.calculations:core.templatereplacer`` entry point string. It also shows how to enable caching for particular calculations (which has no effect here due to the profile-wide default). @@ -573,22 +573,9 @@ Caching can be enabled or disabled on a case-by-case basis by using the :class:` This affects only the current Python interpreter and won't change the behavior of the daemon workers. 
This means that this technique is only useful when using :py:class:`~aiida.engine.run`, and **not** with :py:class:`~aiida.engine.submit`. -If you suspect a node is being reused in error (e.g. during development), you can also manually *prevent* a specific node from being reused: - -#. Load one of the nodes you suspect to be a clone. - Check that :meth:`~aiida.orm.nodes.Node.get_cache_source` returns a UUID. - If it returns `None`, the node was not cloned. - -#. Clear the hashes of all nodes that are considered identical to this node: - - .. code:: python - - for node in node.get_all_same_nodes(): - node.clear_hash() - -#. Run your calculation again. - The node in question should no longer be reused. +Besides controlling which process classes are cached, it may be useful or necessary to control what already *stored* nodes are used as caching *sources*. +Section :ref:`topics:provenance:caching:control-caching` provides details on how AiiDA decides which stored nodes are equivalent to the node being stored and which are considered valid caching sources. .. |Computer| replace:: :py:class:`~aiida.orm.Computer` .. |CalcJob| replace:: :py:class:`~aiida.engine.processes.calcjobs.calcjob.CalcJob` diff --git a/docs/source/topics/provenance/caching.rst b/docs/source/topics/provenance/caching.rst index 2cb4e45e1e..d4c38e5eb4 100644 --- a/docs/source/topics/provenance/caching.rst +++ b/docs/source/topics/provenance/caching.rst @@ -87,13 +87,58 @@ For implementation details of the hashing mechanism for process nodes, see :ref: Controlling Caching ------------------- -Caching can be configured at runtime (see :ref:`how-to:run-codes:caching:configure`) and when implementing a new process class: +In the caching mechanism, there are two different types of roles played by the nodes: the node that is currently being stored is called the `target`, and the nodes already stored in the database that are considered to be equivalent are referred to as a `source`. + +Targets +....... 
+ +Controlling whether nodes will look in the database for existing equivalents when being stored is done on the class level. +Section :ref:`how-to:run-codes:caching:configure` explains how this can be controlled globally through the profile configuration, or locally through context managers. + +Sources +....... + +When a node is being stored (the `target`) and caching is enabled for its node class (see section above), a valid cache `source` is obtained through the method :meth:`~aiida.orm.nodes.node.Node._get_same_node`. +This method calls the iterator :meth:`~aiida.orm.nodes.node.Node._iter_all_same_nodes` and takes the first one it returns if there are any. +To find the list of `source` nodes that are equivalent to the `target` that is being stored, :meth:`~aiida.orm.nodes.node.Node._iter_all_same_nodes` performs the following steps: + + 1. It queries the database for all nodes that have the same hash as the `target` node. + 2. From the result, only those nodes are returned where the property :meth:`~aiida.orm.nodes.node.Node.is_valid_cache` returns ``True``. + +The property :meth:`~aiida.orm.nodes.node.Node.is_valid_cache` therefore allows one to control whether a stored node can be used as a `source` in the caching mechanism. +By default, for all nodes, the property returns ``True``. +However, this can be changed on a per-node basis by setting it to ``False``: + +.. code-block:: python + + node = load_node() + node.is_valid_cache = False + +Setting the property to ``False`` will cause an extra to be stored on the node in the database, such that even when it is loaded at a later point in time, ``is_valid_cache`` returns ``False``. + +.. code-block:: python + + node = load_node() + assert node.is_valid_cache is False + +Through this method, it is possible to guarantee that individual nodes are never used as a `source` for caching. 
+ +The :class:`~aiida.engine.processes.process.Process` class overrides the :meth:`~aiida.orm.nodes.node.Node.is_valid_cache` property to give more fine-grained control on process nodes as caching sources. +If either :meth:`~aiida.orm.nodes.node.Node.is_valid_cache` of the base class or :meth:`~aiida.orm.nodes.process.process.ProcessNode.is_finished` returns ``False``, the process node is not a valid source. +Likewise, if the process class cannot be loaded from the node through the :meth:`~aiida.orm.nodes.process.process.ProcessNode.process_class` property, the node is not a valid caching source. +Finally, if the associated process class implements the :meth:`~aiida.engine.processes.process.Process.is_valid_cache` method, it is called, passing the node as an argument. +If that returns ``True``, the node is considered to be a valid caching source. + +The :meth:`~aiida.engine.processes.process.Process.is_valid_cache` method is implemented on the :class:`~aiida.engine.processes.process.Process` class. +It will check whether the exit code that is set on the node, if any, has the keyword argument ``invalidates_cache`` set to ``True``, in which case the property will return ``False`` indicating the node is not a valid caching source. +Whether an exit code invalidates the cache is controlled with the ``invalidates_cache`` argument when it is defined on the process spec through the :meth:`spec.exit_code <aiida.engine.processes.process_spec.ProcessSpec.exit_code>` method. + +.. warning:: + + Process plugins can override the :meth:`~aiida.engine.processes.process.Process.is_valid_cache` method to further control how nodes are considered valid caching sources. + When doing so, make sure to call :meth:`super().is_valid_cache(node) <aiida.engine.processes.process.Process.is_valid_cache>` and respect its output: if it is ``False``, your implementation should also return ``False``. + If you do not comply with this, the ``invalidates_cache`` keyword on exit codes will no longer work. -* The :meth:`spec.exit_code ` has a keyword argument ``invalidates_cache``. 
- If this is set to ``True``, that means that a calculation with this exit code will not be used as a cache source for another one, even if their hashes match. -* The :class:`Process ` parent class from which calcjobs inherit has an :meth:`is_valid_cache ` method, which can be overridden in the plugin to implement custom ways of invalidating the cache. - When doing this, make sure to call :meth:`super().is_valid_cache(node)` and respect its output: if it is `False`, your implementation should also return `False`. - If you do not comply with this, the 'invalidates_cache' keyword on exit codes will not work. .. _topics:provenance:caching:limitations: diff --git a/tests/orm/node/test_node.py b/tests/orm/node/test_node.py index 258d87d398..d1cb0c028b 100644 --- a/tests/orm/node/test_node.py +++ b/tests/orm/node/test_node.py @@ -917,56 +917,66 @@ def test_remove_comment(self): @pytest.mark.usefixtures('clear_database_before_test') -def test_store_from_cache(): - """Regression test for storing a Node with (nested) repository content with caching.""" - data = Data() - with tempfile.TemporaryDirectory() as tmpdir: - dir_path = os.path.join(tmpdir, 'directory') - os.makedirs(dir_path) - with open(os.path.join(dir_path, 'file'), 'w', encoding='utf8') as file: - file.write('content') - data.put_object_from_tree(tmpdir) +class TestNodeCaching: + """Tests the caching behavior of the ``Node`` class.""" - data.store() + def test_is_valid_cache(self): + """Test the ``Node.is_valid_cache`` property.""" + node = Node() + assert node.is_valid_cache - clone = data.clone() - clone._store_from_cache(data, with_transaction=True) # pylint: disable=protected-access + node.is_valid_cache = False + assert not node.is_valid_cache - assert clone.is_stored - assert clone.get_cache_source() == data.uuid - assert data.get_hash() == clone.get_hash() - - -@pytest.mark.usefixtures('clear_database_before_test') -def test_hashing_errors(aiida_caplog): - """Tests that ``get_hash`` fails in an expected 
manner.""" - node = Data().store() - node.__module__ = 'unknown' # this will inhibit package version determination - result = node.get_hash(ignore_errors=True) - assert result is None - assert aiida_caplog.record_tuples == [(node.logger.name, logging.ERROR, 'Node hashing failed')] - - with pytest.raises(exceptions.HashingError, match='package version could not be determined'): - result = node.get_hash(ignore_errors=False) - assert result is None + with pytest.raises(TypeError): + node.is_valid_cache = 'false' + def test_store_from_cache(self): + """Regression test for storing a Node with (nested) repository content with caching.""" + data = Data() + with tempfile.TemporaryDirectory() as tmpdir: + dir_path = os.path.join(tmpdir, 'directory') + os.makedirs(dir_path) + with open(os.path.join(dir_path, 'file'), 'w', encoding='utf8') as file: + file.write('content') + data.put_object_from_tree(tmpdir) -@pytest.mark.usefixtures('clear_database_before_test') -def test_uuid_equality_fallback(): - """Tests the fallback mechanism of checking equality by comparing uuids and hash.""" - node_0 = Data().store() + data.store() - nodepk = Data().store().pk - node_a = load_node(pk=nodepk) - node_b = load_node(pk=nodepk) + clone = data.clone() + clone._store_from_cache(data, with_transaction=True) # pylint: disable=protected-access - assert node_a == node_b - assert node_a != node_0 - assert node_b != node_0 + assert clone.is_stored + assert clone.get_cache_source() == data.uuid + assert data.get_hash() == clone.get_hash() - assert hash(node_a) == hash(node_b) - assert hash(node_a) != hash(node_0) - assert hash(node_b) != hash(node_0) + def test_hashing_errors(self, aiida_caplog): + """Tests that ``get_hash`` fails in an expected manner.""" + node = Data().store() + node.__module__ = 'unknown' # this will inhibit package version determination + result = node.get_hash(ignore_errors=True) + assert result is None + assert aiida_caplog.record_tuples == [(node.logger.name, logging.ERROR, 
'Node hashing failed')] + + with pytest.raises(exceptions.HashingError, match='package version could not be determined'): + result = node.get_hash(ignore_errors=False) + assert result is None + + def test_uuid_equality_fallback(self): + """Tests the fallback mechanism of checking equality by comparing uuids and hash.""" + node_0 = Data().store() + + nodepk = Data().store().pk + node_a = load_node(pk=nodepk) + node_b = load_node(pk=nodepk) + + assert node_a == node_b + assert node_a != node_0 + assert node_b != node_0 + + assert hash(node_a) == hash(node_b) + assert hash(node_a) != hash(node_0) + assert hash(node_b) != hash(node_0) @pytest.mark.usefixtures('clear_database_before_test')