
Merge pull request #120 from Oxid15/develop
Patch 0.7.3
Oxid15 authored Oct 6, 2022
2 parents 6c93aa2 + 187fd03 commit fb92b6f
Showing 41 changed files with 543 additions and 306 deletions.
2 changes: 1 addition & 1 deletion cascade/__init__.py
@@ -15,7 +15,7 @@
"""


__version__ = '0.7.2'
__version__ = '0.7.3'
__author__ = 'Ilia Moiseev'
__author_email__ = '[email protected]'

112 changes: 64 additions & 48 deletions cascade/base/meta_handler.py
@@ -16,9 +16,8 @@

 import os
 import json
-from typing import Union
 import datetime
-from typing import List, Dict
+from typing import Union, List, Dict
 from json import JSONEncoder

 import yaml
@@ -61,15 +60,15 @@ def default(self, obj):

return super(CustomEncoder, self).default(obj)

def obj_to_dict(self, obj):
def obj_to_dict(self, obj) -> Dict:
return json.loads(self.encode(obj))


class BaseHandler:
def read(self, path) -> List[Dict]:
def read(self, path: str) -> Union[Dict, List[Dict]]:
raise NotImplementedError()

def write(self, path, obj, overwrite=True) -> None:
def write(self, path: str, obj, overwrite=True) -> None:
raise NotImplementedError()

def _raise_io_error(self, path, exc):
@@ -80,24 +79,7 @@ def _raise_io_error(self, path, exc):


 class JSONHandler(BaseHandler):
-    """
-    Handles the logic of dumping and loading json files
-    """
-    def read(self, path) -> Union[Dict, List[Dict]]:
-        """
-        Reads json from path
-
-        Parameters
-        ----------
-        path:
-            Path to the file. If no extension provided,
-            then .json will be added
-
-        Raises
-        ------
-        IOError
-            when decoding errors occur
-        """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
         _, ext = os.path.splitext(path)
         if ext == '':
             path += '.json'
@@ -111,32 +93,16 @@ def read(self, path) -> Union[Dict, List[Dict]]:
             self._raise_io_error(path, e)
         return meta

-    def write(self, name, obj: List[Dict], overwrite=True) -> None:
-        """
-        Writes json to path using custom encoder
-        """
-        if not overwrite and os.path.exists(name):
+    def write(self, path: str, obj: List[Dict], overwrite=True) -> None:
+        if not overwrite and os.path.exists(path):
             return

-        with open(name, 'w') as f:
+        with open(path, 'w') as f:
             json.dump(obj, f, cls=CustomEncoder, indent=4)


 class YAMLHandler(BaseHandler):
-    def read(self, path) -> Union[Dict, List[Dict]]:
-        """
-        Reads yaml from path
-
-        Parameters
-        ----------
-        path:
-            Path to the file. If no extension provided, then .yml will be added
-
-        Raises
-        ------
-        IOError
-            when decoding errors occur
-        """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
         _, ext = os.path.splitext(path)
         if ext == '':
             path += '.yml'
@@ -148,7 +114,7 @@ def read(self, path) -> Union[Dict, List[Dict]]:
             self._raise_io_error(path, e)
         return meta

-    def write(self, path, obj, overwrite=True) -> None:
+    def write(self, path: str, obj, overwrite=True) -> None:
         if not overwrite and os.path.exists(path):
             return

@@ -158,14 +124,14 @@ def write(self, path, obj, overwrite=True) -> None:


 class TextHandler(BaseHandler):
-    def read(self, path) -> Dict:
+    def read(self, path: str) -> Dict:
         """
         Reads a text file from path and returns a dict
         in the form {path: 'text from file'}

         Parameters
         ----------
-        path:
+        path: str
             Path to the file
         """

@@ -179,11 +145,61 @@ def write(self, path, obj, overwrite=True) -> None:


 class MetaHandler:
-    def read(self, path) -> List[Dict]:
+    """
+    Encapsulates the logic of reading and writing metadata to disk.
+
+    Supported read-write formats are `json` and `yml`. Other formats
+    are supported as read-only. For example, one can read meta from a `txt` or `md` file.
+
+    Examples
+    --------
+    >>> from cascade.base import MetaHandler
+    >>> mh = MetaHandler()
+    >>> mh.write('meta.json', {'hello': 'world'})
+    >>> obj = mh.read('meta.json')
+    >>> mh.write('meta.yml', {'hello': 'world'})
+    >>> obj = mh.read('meta.yml')
+    """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
+        """
+        Reads an object from the given path.
+
+        Parameters
+        ----------
+        path: str
+            Path to the object.
+
+        Returns
+        -------
+        obj: Union[Dict, List[Dict]]
+
+        Raises
+        ------
+        IOError
+            when decoding errors occur
+        """
         handler = self._get_handler(path)
         return handler.read(path)

-    def write(self, path, obj, overwrite=True) -> None:
+    def write(self, path: str, obj, overwrite: bool = True) -> None:
+        """
+        Writes an object to the given path.
+
+        Parameters
+        ----------
+        path: str
+            Path where to write the object, with name and extension
+        obj
+            An object to be serialized and saved
+        overwrite: bool, optional
+            Whether to overwrite the file if it already exists. If False
+            and the file already exists, returns silently without saving.
+
+        Raises
+        ------
+        IOError
+            when encoding errors occur
+        """
         handler = self._get_handler(path)
         return handler.write(path, obj, overwrite=overwrite)

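For orientation, a minimal sketch of how the reworked handlers fit together. It assumes `_get_handler` dispatches on the file extension (implied by the class docstring and the `.json`/`.yml` defaults above) and that `CustomEncoder` serializes `datetime` objects, as the `datetime` import suggests; its `default` body is truncated in the hunk.

    from datetime import datetime
    from cascade.base import MetaHandler

    mh = MetaHandler()
    # the extension decides which handler does the work
    mh.write('meta.json', {'created_at': datetime.now(), 'hello': 'world'})
    obj = mh.read('meta.json')  # Dict or List[Dict], depending on what was saved
    mh.write('meta.yml', obj)   # the YAML handler follows the same interface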
26 changes: 21 additions & 5 deletions cascade/base/traceable.py
@@ -3,7 +3,23 @@


 class Traceable:
-    def __init__(self, *args, meta_prefix=None, **kwargs) -> None:
+    """
+    Base class for everything that has metadata in cascade.
+    Handles the logic of getting and updating the internal meta prefix.
+    """
+    def __init__(self, *args, meta_prefix: Union[Dict, str] = None, **kwargs) -> None:
+        """
+        Parameters
+        ----------
+        meta_prefix: Union[Dict, str], optional
+            A dictionary that updates the object's meta in the `get_meta` call.
+            Since `dict.update` is used, it can overwrite default values.
+            If a str is passed, it is assumed to be a path and is loaded using MetaHandler.
+
+        See also
+        --------
+        cascade.base.MetaHandler
+        """
         if meta_prefix is None:
             meta_prefix = {}
         elif isinstance(meta_prefix, str):
@@ -22,8 +38,8 @@ def get_meta(self) -> List[Dict]:
         meta: List[Dict]
             A list where the last element is this object's metadata.
             Meta can be anything that is worth documenting about
-            the object and its properties. This is done in form
-            of list to enable cascade-like calls in Modifiers and Samplers.
+            the object and its properties.
+            Meta is a list to allow the formation of pipelines.
         """
         meta = {
             'name': repr(self)
@@ -36,8 +52,8 @@ def update_meta(self, obj: Union[Dict, str]) -> None:

     def update_meta(self, obj: Union[Dict, str]) -> None:
         """
-        Updates _meta_prefix, which is then updates
-        dataset's meta when get_meta() is called
+        Updates `_meta_prefix`, which then updates
+        the dataset's meta when `get_meta()` is called
         """
         if isinstance(obj, str):
             obj = self._read_meta_from_file(obj)
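A short sketch of the new `meta_prefix` contract, based on the docstring above; the `'author'` key is a hypothetical example value.

    from cascade.base import Traceable

    # dict prefix: merged into the object's meta on every get_meta() call
    tr = Traceable(meta_prefix={'author': 'Ilia Moiseev'})
    meta = tr.get_meta()  # last element holds 'name' plus the prefix keys

    # str prefix: treated as a path and loaded with MetaHandler
    tr = Traceable(meta_prefix='meta.json')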
16 changes: 13 additions & 3 deletions cascade/data/apply_modifier.py
@@ -20,17 +20,27 @@

 class ApplyModifier(Modifier):
     """
-    Modifier that maps a function to previous dataset's elements in a lazy way.
+    Modifier that maps a function to the given dataset's items in a lazy way.
     """
     def __init__(self, dataset: Dataset, func: Callable, *args, **kwargs) -> None:
         """
         Parameters
         ----------
         dataset: Dataset
-            a dataset to modify
+            A dataset to modify
         func: Callable
-            a function to be applied to every item of a dataset -
+            A function to be applied to every item of a dataset -
             each `__getitem__` call applies `func` to an item obtained from the previous dataset
+
+        Examples
+        --------
+        >>> from cascade import data as cdd
+        >>> ds = cdd.Wrapper([0, 1, 2, 3, 4])
+        >>> ds = cdd.ApplyModifier(ds, lambda x: x ** 2)
+
+        Now the function will only be applied when items are retrieved
+
+        >>> assert [item for item in ds] == [0, 1, 4, 9, 16]
+        """
         super().__init__(dataset, *args, **kwargs)
         self._func = func
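The hunk cuts off before `__getitem__`, but given the docstring, the lazy application plausibly reduces to the following sketch (not the verbatim source):

    def __getitem__(self, index):
        item = self._dataset[index]  # fetch the item from the wrapped dataset
        return self._func(item)      # apply the stored function only on access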
33 changes: 30 additions & 3 deletions cascade/data/bruteforce_cacher.py
@@ -20,14 +20,41 @@

 class BruteforceCacher(Modifier):
     """
-    Unusual modifier which loads everything in memory in initialization phase
-    and then returns values from cache
+    Identity modifier that runs the whole previous pipeline in `__init__`,
+    loading everything into memory. This is useful in combination with `Pickler`
+    when the pipeline has heavy operations upstream: you can load everything once
+    and pickle it to cut off the heavy part of the pipeline.
+
+    Examples
+    --------
+    >>> from cascade import data as cdd
+    >>> ds = cdd.Wrapper([0 for _ in range(1000000)])
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+
+    Cache the heavy upstream part once
+
+    >>> ds = cdd.BruteforceCacher(ds)
+
+    Then pickle it
+
+    >>> ds = cdd.Pickler('ds', ds)
+
+    Unpickle and use further
+
+    >>> ds = cdd.Pickler('ds')
+    >>> ds = cdd.RandomSampler(ds, 1000)
+
     See also
     --------
-    Cascade.data.SequentialCacher
+    cascade.data.SequentialCacher
+    cascade.data.Pickler
     """
     def __init__(self, dataset: Dataset, *args, **kwargs) -> None:
+        """
+        Loads every item of the dataset into an internal list.
+        """
         super().__init__(dataset, *args, **kwargs)
         # forcibly calling all previous datasets in the init
         if hasattr(self._dataset, '__len__') and hasattr(self._dataset, '__getitem__'):
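The `__init__` is truncated right after the `hasattr` check; a plausible continuation, assuming items end up in an internal list as the added docstring says:

    if hasattr(self._dataset, '__len__') and hasattr(self._dataset, '__getitem__'):
        # indexable dataset: pull every item by index
        self._data = [self._dataset[i] for i in range(len(self._dataset))]
    else:
        # otherwise fall back to plain iteration
        self._data = [item for item in self._dataset]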
16 changes: 11 additions & 5 deletions cascade/data/concatenator.py
@@ -23,15 +23,23 @@
 class Concatenator(Dataset):
     """
     Unifies several Datasets under one, calling them sequentially in the provided order.
+
+    Examples
+    --------
+    >>> from cascade.data import Wrapper, Concatenator
+    >>> ds_1 = Wrapper([0, 1, 2])
+    >>> ds_2 = Wrapper([2, 1, 0])
+    >>> ds = Concatenator((ds_1, ds_2))
+    >>> assert [item for item in ds] == [0, 1, 2, 2, 1, 0]
     """
     def __init__(self, datasets: Iterable[Dataset], *args, **kwargs) -> None:
         """
         Creates a concatenated dataset from the list of datasets provided

         Parameters
         ----------
-        datasets: Iterable[Dataset]
-            a list or tuple of datasets to concatenate
+        datasets: Iterable[Dataset]
+            A list or tuple of datasets to concatenate
         """
         self._datasets = datasets
         lengths = [len(ds) for ds in self._datasets]
@@ -67,7 +75,5 @@ def get_meta(self) -> List[Dict]:
         Concatenator calls `get_meta()` of all its datasets
         """
         meta = super().get_meta()
-        meta[0]['data'] = {}
-        for ds in self._datasets:
-            meta[0]['data'][repr(ds)] = ds.get_meta()
+        meta[0]['data'] = [ds.get_meta() for ds in self._datasets]
         return meta
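Schematically, the `get_meta` change above replaces the `repr`-keyed dict with a list, which preserves order and, presumably, avoids collisions when two datasets share the same `repr`:

    # before the patch
    meta[0]['data'] == {repr(ds_1): ds_1.get_meta(), repr(ds_2): ds_2.get_meta()}
    # after the patch
    meta[0]['data'] == [ds_1.get_meta(), ds_2.get_meta()]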
11 changes: 2 additions & 9 deletions cascade/data/cyclic_sampler.py
@@ -25,15 +25,8 @@ class CyclicSampler(Sampler):
     -------
     >>> from cascade.data import CyclicSampler, Wrapper
     >>> ds = Wrapper([1,2,3])
-    >>> ds = CyclicSampler(ds, 5)
-    >>> for item in ds:
-    ...     print(item)
-    ...
-    1
-    2
-    3
-    1
-    2
+    >>> ds = CyclicSampler(ds, 7)
+    >>> assert [item for item in ds] == [1, 2, 3, 1, 2, 3, 1]
     """
     def __getitem__(self, index) -> T:
         internal_index = index % len(self._dataset)
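The method is truncated after computing `internal_index`; the rest presumably just indexes the wrapped dataset, as in this sketch:

    def __getitem__(self, index) -> T:
        internal_index = index % len(self._dataset)
        return self._dataset[internal_index]  # wrap around the underlying data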
