
Merge pull request #120 from Oxid15/develop
Patch 0.7.3
Oxid15 authored Oct 6, 2022
2 parents 6c93aa2 + 187fd03 commit fb92b6f
Showing 41 changed files with 543 additions and 306 deletions.
2 changes: 1 addition & 1 deletion cascade/__init__.py
@@ -15,7 +15,7 @@
"""


__version__ = '0.7.2'
__version__ = '0.7.3'
__author__ = 'Ilia Moiseev'
__author_email__ = '[email protected]'

112 changes: 64 additions & 48 deletions cascade/base/meta_handler.py
@@ -16,9 +16,8 @@

 import os
 import json
-from typing import Union
 import datetime
-from typing import List, Dict
+from typing import Union, List, Dict
 from json import JSONEncoder

 import yaml
@@ -61,15 +60,15 @@ def default(self, obj):

return super(CustomEncoder, self).default(obj)

def obj_to_dict(self, obj):
def obj_to_dict(self, obj) -> Dict:
return json.loads(self.encode(obj))


class BaseHandler:
def read(self, path) -> List[Dict]:
def read(self, path: str) -> Union[Dict, List[Dict]]:
raise NotImplementedError()

def write(self, path, obj, overwrite=True) -> None:
def write(self, path: str, obj, overwrite=True) -> None:
raise NotImplementedError()

def _raise_io_error(self, path, exc):
@@ -80,24 +79,7 @@ def _raise_io_error(self, path, exc):


 class JSONHandler(BaseHandler):
-    """
-    Handles the logic of dumping and loading json files
-    """
-    def read(self, path) -> Union[Dict, List[Dict]]:
-        """
-        Reads json from path
-
-        Parameters
-        ----------
-        path:
-            Path to the file. If no extension provided,
-            then .json will be added
-
-        Raises
-        ------
-        IOError
-            when decoding errors occur
-        """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
         _, ext = os.path.splitext(path)
         if ext == '':
             path += '.json'
@@ -111,32 +93,16 @@ def read(self, path) -> Union[Dict, List[Dict]]:
             self._raise_io_error(path, e)
         return meta

-    def write(self, name, obj: List[Dict], overwrite=True) -> None:
-        """
-        Writes json to path using custom encoder
-        """
-        if not overwrite and os.path.exists(name):
+    def write(self, path: str, obj: List[Dict], overwrite=True) -> None:
+        if not overwrite and os.path.exists(path):
             return

-        with open(name, 'w') as f:
+        with open(path, 'w') as f:
             json.dump(obj, f, cls=CustomEncoder, indent=4)


 class YAMLHandler(BaseHandler):
-    def read(self, path) -> Union[Dict, List[Dict]]:
-        """
-        Reads yaml from path
-
-        Parameters
-        ----------
-        path:
-            Path to the file. If no extension provided, then .yml will be added
-
-        Raises
-        ------
-        IOError
-            when decoding errors occur
-        """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
         _, ext = os.path.splitext(path)
         if ext == '':
             path += '.yml'
@@ -148,7 +114,7 @@ def read(self, path) -> Union[Dict, List[Dict]]:
             self._raise_io_error(path, e)
         return meta

-    def write(self, path, obj, overwrite=True) -> None:
+    def write(self, path: str, obj, overwrite=True) -> None:
         if not overwrite and os.path.exists(path):
             return

@@ -158,14 +124,14 @@ def write(self, path, obj, overwrite=True) -> None:


 class TextHandler(BaseHandler):
-    def read(self, path) -> Dict:
+    def read(self, path: str) -> Dict:
         """
         Reads a text file from path and returns a dict
         in the form {path: 'text from file'}

         Parameters
         ----------
-        path:
+        path: str
             Path to the file
         """

@@ -179,11 +145,61 @@ def write(self, path, obj, overwrite=True) -> None:


 class MetaHandler:
-    def read(self, path) -> List[Dict]:
+    """
+    Encapsulates the logic of reading and writing metadata to disk.
+
+    Supported read-write formats are `json` and `yml`. Other formats
+    are supported as read-only. For example, one can read meta from a `txt` or `md` file.
+
+    Examples
+    --------
+    >>> from cascade.base import MetaHandler
+    >>> mh = MetaHandler()
+    >>> mh.write('meta.json', {'hello': 'world'})
+    >>> obj = mh.read('meta.json')
+    >>> mh.write('meta.yml', {'hello': 'world'})
+    >>> obj = mh.read('meta.yml')
+    """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
+        """
+        Reads an object from the given path.
+
+        Parameters
+        ----------
+        path: str
+            Path to the object.
+
+        Returns
+        -------
+        obj: Union[Dict, List[Dict]]
+
+        Raises
+        ------
+        IOError
+            when decoding errors occur
+        """
         handler = self._get_handler(path)
         return handler.read(path)

-    def write(self, path, obj, overwrite=True) -> None:
+    def write(self, path: str, obj, overwrite: bool = True) -> None:
+        """
+        Writes an object to the given path.
+
+        Parameters
+        ----------
+        path: str
+            Path where to write the object, with name and extension
+        obj
+            An object to be serialized and saved
+        overwrite: bool, optional
+            Whether to overwrite the file if it already exists. If False
+            and the file already exists, returns silently without saving.
+
+        Raises
+        ------
+        IOError
+            when encoding errors occur
+        """
         handler = self._get_handler(path)
         return handler.write(path, obj, overwrite=overwrite)

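For orientation, a minimal sketch of how the reworked handlers fit together. It assumes `_get_handler` dispatches on the file extension (implied by the class docstring and the `.json`/`.yml` defaults above) and that `CustomEncoder` serializes `datetime` objects, as the `datetime` import suggests; its `default` body is truncated in the hunk.

    from datetime import datetime
    from cascade.base import MetaHandler

    mh = MetaHandler()
    # the extension decides which handler does the work
    mh.write('meta.json', {'created_at': datetime.now(), 'hello': 'world'})
    obj = mh.read('meta.json')  # Dict or List[Dict], depending on what was saved
    mh.write('meta.yml', obj)   # the YAML handler follows the same interface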
26 changes: 21 additions & 5 deletions cascade/base/traceable.py
@@ -3,7 +3,23 @@


 class Traceable:
-    def __init__(self, *args, meta_prefix=None, **kwargs) -> None:
+    """
+    Base class for everything that has metadata in cascade.
+    Handles the logic of getting and updating the internal meta prefix.
+    """
+    def __init__(self, *args, meta_prefix: Union[Dict, str] = None, **kwargs) -> None:
+        """
+        Parameters
+        ----------
+        meta_prefix: Union[Dict, str], optional
+            A dictionary that updates the object's meta in the `get_meta` call.
+            Since `dict.update` is used, it can overwrite default values.
+            If a str is passed, it is assumed to be a path and is loaded using MetaHandler.
+
+        See also
+        --------
+        cascade.base.MetaHandler
+        """
         if meta_prefix is None:
             meta_prefix = {}
         elif isinstance(meta_prefix, str):
@@ -22,8 +38,8 @@ def get_meta(self) -> List[Dict]:
         meta: List[Dict]
             A list where the last element is this object's metadata.
             Meta can be anything that is worth documenting about
-            the object and its properties. This is done in form
-            of list to enable cascade-like calls in Modifiers and Samplers.
+            the object and its properties.
+            Meta is a list to allow the formation of pipelines.
         """
         meta = {
             'name': repr(self)
@@ -36,8 +52,8 @@ def update_meta(self, obj: Union[Dict, str]) -> None:

     def update_meta(self, obj: Union[Dict, str]) -> None:
         """
-        Updates _meta_prefix, which is then updates
-        dataset's meta when get_meta() is called
+        Updates `_meta_prefix`, which then updates
+        the dataset's meta when `get_meta()` is called
         """
         if isinstance(obj, str):
             obj = self._read_meta_from_file(obj)
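A short sketch of the new `meta_prefix` contract, based on the docstring above; the `'author'` key is a hypothetical example value.

    from cascade.base import Traceable

    # dict prefix: merged into the object's meta on every get_meta() call
    tr = Traceable(meta_prefix={'author': 'Ilia Moiseev'})
    meta = tr.get_meta()  # last element holds 'name' plus the prefix keys

    # str prefix: treated as a path and loaded with MetaHandler
    tr = Traceable(meta_prefix='meta.json')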
16 changes: 13 additions & 3 deletions cascade/data/apply_modifier.py
@@ -20,17 +20,27 @@

 class ApplyModifier(Modifier):
     """
-    Modifier that maps a function to previous dataset's elements in a lazy way.
+    Modifier that maps a function to the given dataset's items in a lazy way.
     """
     def __init__(self, dataset: Dataset, func: Callable, *args, **kwargs) -> None:
         """
         Parameters
         ----------
         dataset: Dataset
-            a dataset to modify
+            A dataset to modify
         func: Callable
-            a function to be applied to every item of a dataset -
+            A function to be applied to every item of a dataset -
             each `__getitem__` call applies `func` to an item obtained from the previous dataset
+
+        Examples
+        --------
+        >>> from cascade import data as cdd
+        >>> ds = cdd.Wrapper([0, 1, 2, 3, 4])
+        >>> ds = cdd.ApplyModifier(ds, lambda x: x ** 2)
+
+        Now the function will only be applied when items are retrieved
+
+        >>> assert [item for item in ds] == [0, 1, 4, 9, 16]
+        """
         super().__init__(dataset, *args, **kwargs)
         self._func = func
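The hunk cuts off before `__getitem__`, but given the docstring, the lazy application plausibly reduces to the following sketch (not the verbatim source):

    def __getitem__(self, index):
        item = self._dataset[index]  # fetch the item from the wrapped dataset
        return self._func(item)      # apply the stored function only on access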
33 changes: 30 additions & 3 deletions cascade/data/bruteforce_cacher.py
@@ -20,14 +20,41 @@

 class BruteforceCacher(Modifier):
     """
-    Unusual modifier which loads everything in memory in initialization phase
-    and then returns values from cache
+    Identity modifier that runs the whole previous pipeline in `__init__`,
+    loading everything into memory. This is useful in combination with `Pickler`
+    when the pipeline has heavy operations upstream: you can load everything once
+    and pickle it to cut off the heavy part of the pipeline.
+
+    Examples
+    --------
+    >>> from cascade import data as cdd
+    >>> ds = cdd.Wrapper([0 for _ in range(1000000)])
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+
+    Cache the heavy upstream part once
+
+    >>> ds = cdd.BruteforceCacher(ds)
+
+    Then pickle it
+
+    >>> ds = cdd.Pickler('ds', ds)
+
+    Unpickle and use further
+
+    >>> ds = cdd.Pickler('ds')
+    >>> ds = cdd.RandomSampler(ds, 1000)
+
     See also
     --------
-    Cascade.data.SequentialCacher
+    cascade.data.SequentialCacher
+    cascade.data.Pickler
     """
     def __init__(self, dataset: Dataset, *args, **kwargs) -> None:
+        """
+        Loads every item of the dataset into an internal list.
+        """
         super().__init__(dataset, *args, **kwargs)
         # forcibly calling all previous datasets in the init
         if hasattr(self._dataset, '__len__') and hasattr(self._dataset, '__getitem__'):
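The `__init__` is truncated right after the `hasattr` check; a plausible continuation, assuming items end up in an internal list as the added docstring says:

    if hasattr(self._dataset, '__len__') and hasattr(self._dataset, '__getitem__'):
        # indexable dataset: pull every item by index
        self._data = [self._dataset[i] for i in range(len(self._dataset))]
    else:
        # otherwise fall back to plain iteration
        self._data = [item for item in self._dataset]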
16 changes: 11 additions & 5 deletions cascade/data/concatenator.py
@@ -23,15 +23,23 @@
 class Concatenator(Dataset):
     """
     Unifies several Datasets under one, calling them sequentially in the provided order.
+
+    Examples
+    --------
+    >>> from cascade.data import Wrapper, Concatenator
+    >>> ds_1 = Wrapper([0, 1, 2])
+    >>> ds_2 = Wrapper([2, 1, 0])
+    >>> ds = Concatenator((ds_1, ds_2))
+    >>> assert [item for item in ds] == [0, 1, 2, 2, 1, 0]
     """
     def __init__(self, datasets: Iterable[Dataset], *args, **kwargs) -> None:
         """
         Creates a concatenated dataset from the list of datasets provided

         Parameters
         ----------
-        datasets: Iterable[Dataset]
-            a list or tuple of datasets to concatenate
+        datasets: Iterable[Dataset]
+            A list or tuple of datasets to concatenate
         """
         self._datasets = datasets
         lengths = [len(ds) for ds in self._datasets]
@@ -67,7 +75,5 @@ def get_meta(self) -> List[Dict]:
         Concatenator calls `get_meta()` of all its datasets
         """
         meta = super().get_meta()
-        meta[0]['data'] = {}
-        for ds in self._datasets:
-            meta[0]['data'][repr(ds)] = ds.get_meta()
+        meta[0]['data'] = [ds.get_meta() for ds in self._datasets]
         return meta
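Schematically, the `get_meta` change above replaces the `repr`-keyed dict with a list, which preserves order and, presumably, avoids collisions when two datasets share the same `repr`:

    # before the patch
    meta[0]['data'] == {repr(ds_1): ds_1.get_meta(), repr(ds_2): ds_2.get_meta()}
    # after the patch
    meta[0]['data'] == [ds_1.get_meta(), ds_2.get_meta()]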
11 changes: 2 additions & 9 deletions cascade/data/cyclic_sampler.py
@@ -25,15 +25,8 @@ class CyclicSampler(Sampler):
     -------
     >>> from cascade.data import CyclicSampler, Wrapper
     >>> ds = Wrapper([1,2,3])
-    >>> ds = CyclicSampler(ds, 5)
-    >>> for item in ds:
-    ...     print(item)
-    ...
-    1
-    2
-    3
-    1
-    2
+    >>> ds = CyclicSampler(ds, 7)
+    >>> assert [item for item in ds] == [1, 2, 3, 1, 2, 3, 1]
     """
     def __getitem__(self, index) -> T:
         internal_index = index % len(self._dataset)
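The method is truncated after computing `internal_index`; the rest presumably just indexes the wrapped dataset, as in this sketch:

    def __getitem__(self, index) -> T:
        internal_index = index % len(self._dataset)
        return self._dataset[internal_index]  # wrap around the underlying data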
