From e9af6d7641d83a50e144802b66fa59fb5f50f957 Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 29 Sep 2022 22:36:46 +0300 Subject: [PATCH 01/12] Fix indentation size --- cascade/docs/source/cascade.data.rst | 26 +++++++++++++------------- cascade/docs/source/cascade.meta.rst | 14 +++++++------- cascade/docs/source/cascade.models.rst | 16 ++++++++-------- cascade/docs/source/cascade.utils.rst | 26 +++++++++++++------------- 4 files changed, 41 insertions(+), 41 deletions(-) diff --git a/cascade/docs/source/cascade.data.rst b/cascade/docs/source/cascade.data.rst index afcd9a89..85dd5491 100644 --- a/cascade/docs/source/cascade.data.rst +++ b/cascade/docs/source/cascade.data.rst @@ -1,7 +1,7 @@ cascade.data ============ .. autoclass:: cascade.data.ApplyModifier - :members: + :members: | @@ -10,7 +10,7 @@ cascade.data | .. autoclass:: cascade.data.BruteforceCacher - :members: + :members: | @@ -19,7 +19,7 @@ cascade.data | .. autoclass:: cascade.data.Concatenator - :members: + :members: | @@ -28,7 +28,7 @@ cascade.data | .. autoclass:: cascade.data.CyclicSampler - :members: + :members: | @@ -37,7 +37,7 @@ cascade.data | .. autoclass:: cascade.data.Dataset - :members: + :members: | @@ -46,7 +46,7 @@ cascade.data | .. autoclass:: cascade.data.Iterator - :members: + :members: | @@ -55,7 +55,7 @@ cascade.data | .. autoclass:: cascade.data.Wrapper - :members: + :members: | @@ -65,7 +65,7 @@ cascade.data .. autoclass:: cascade.data.Modifier - :members: + :members: | @@ -74,7 +74,7 @@ cascade.data | .. autoclass:: cascade.data.Sampler - :members: + :members: | @@ -83,7 +83,7 @@ cascade.data | .. autoclass:: cascade.data.FolderDataset - :members: + :members: | @@ -92,7 +92,7 @@ cascade.data | .. autoclass:: cascade.data.Pickler - :members: + :members: | @@ -101,7 +101,7 @@ cascade.data | .. autoclass:: cascade.data.RandomSampler - :members: + :members: | @@ -110,7 +110,7 @@ cascade.data | .. autoclass:: cascade.data.SequentialCacher - :members: + :members: | diff --git a/cascade/docs/source/cascade.meta.rst b/cascade/docs/source/cascade.meta.rst index c12a0144..016685dd 100644 --- a/cascade/docs/source/cascade.meta.rst +++ b/cascade/docs/source/cascade.meta.rst @@ -1,7 +1,7 @@ cascade.meta ============ .. autoclass:: cascade.meta.HistoryViewer - :members: + :members: | @@ -10,7 +10,7 @@ cascade.meta | .. autoclass:: cascade.meta.MetaValidator - :members: + :members: | @@ -20,7 +20,7 @@ cascade.meta .. autoclass:: cascade.meta.MetaViewer - :members: + :members: | @@ -29,7 +29,7 @@ cascade.meta | .. autoclass:: cascade.meta.MetricViewer - :members: + :members: | @@ -38,7 +38,7 @@ cascade.meta | .. autoclass:: cascade.meta.Validator - :members: + :members: | @@ -47,7 +47,7 @@ cascade.meta | .. autoclass:: cascade.meta.AggregateValidator - :members: + :members: | @@ -56,7 +56,7 @@ cascade.meta | .. autoclass:: cascade.meta.PredicateValidator - :members: + :members: | diff --git a/cascade/docs/source/cascade.models.rst b/cascade/docs/source/cascade.models.rst index 7a9263d3..a4129fdb 100644 --- a/cascade/docs/source/cascade.models.rst +++ b/cascade/docs/source/cascade.models.rst @@ -2,7 +2,7 @@ cascade.models ============== .. autoclass:: cascade.models.Model - :members: + :members: | @@ -11,7 +11,7 @@ cascade.models | .. autoclass:: cascade.models.ModelModifier - :members: + :members: | @@ -20,7 +20,7 @@ cascade.models | .. autoclass:: cascade.models.BasicModel - :members: + :members: | @@ -29,7 +29,7 @@ cascade.models | .. 
autoclass:: cascade.models.BasicModelModifier - :members: + :members: | @@ -38,7 +38,7 @@ cascade.models | .. autoclass:: cascade.models.ModelRepo - :members: + :members: | @@ -47,7 +47,7 @@ cascade.models | .. autoclass:: cascade.models.ModelLine - :members: + :members: | @@ -56,7 +56,7 @@ cascade.models | .. autoclass:: cascade.models.Trainer - :members: + :members: | @@ -65,7 +65,7 @@ cascade.models | .. autoclass:: cascade.models.BasicTrainer - :members: + :members: | diff --git a/cascade/docs/source/cascade.utils.rst b/cascade/docs/source/cascade.utils.rst index d79eca13..c2e0e586 100644 --- a/cascade/docs/source/cascade.utils.rst +++ b/cascade/docs/source/cascade.utils.rst @@ -2,7 +2,7 @@ cascade.utils ============== .. autoclass:: cascade.utils.ConstantBaseline - :members: + :members: | @@ -11,7 +11,7 @@ cascade.utils | .. autoclass:: cascade.utils.FolderImageDataset - :members: + :members: | @@ -20,7 +20,7 @@ cascade.utils | .. autoclass:: cascade.utils.NumpyWrapper - :members: + :members: | @@ -29,7 +29,7 @@ cascade.utils | .. autoclass:: cascade.utils.OverSampler - :members: + :members: | @@ -38,7 +38,7 @@ cascade.utils | .. autoclass:: cascade.utils.PaSchemaValidator - :members: + :members: | @@ -47,7 +47,7 @@ cascade.utils | .. autoclass:: cascade.utils.SkModel - :members: + :members: | @@ -56,7 +56,7 @@ cascade.utils | .. autoclass:: cascade.utils.TableDataset - :members: + :members: | @@ -65,7 +65,7 @@ cascade.utils | .. autoclass:: cascade.utils.TableFilter - :members: + :members: | @@ -74,7 +74,7 @@ cascade.utils | .. autoclass:: cascade.utils.CSVDataset - :members: + :members: | @@ -84,7 +84,7 @@ cascade.utils .. autoclass:: cascade.utils.PartedTableLoader - :members: + :members: | @@ -93,7 +93,7 @@ cascade.utils | .. autoclass:: cascade.utils.TableIterator - :members: + :members: | @@ -102,7 +102,7 @@ cascade.utils | .. autoclass:: cascade.utils.LargeCSVDataset - :members: + :members: | @@ -111,7 +111,7 @@ cascade.utils | .. 
autoclass:: cascade.utils.NullValidator - :members: + :members: | From 384e30eb71d1c4d1145d71d3933308c790e7f7f3 Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 29 Sep 2022 22:37:02 +0300 Subject: [PATCH 02/12] Fix typo --- cascade/data/bruteforce_cacher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cascade/data/bruteforce_cacher.py b/cascade/data/bruteforce_cacher.py index 47c6c260..02b448ae 100644 --- a/cascade/data/bruteforce_cacher.py +++ b/cascade/data/bruteforce_cacher.py @@ -25,7 +25,7 @@ class BruteforceCacher(Modifier): See also -------- - Cascade.data.SequentialCacher + cascade.data.SequentialCacher """ def __init__(self, dataset: Dataset, *args, **kwargs) -> None: super().__init__(dataset, *args, **kwargs) From 0b8b60e553e4b0757df4de6c48aa662dd384ce31 Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 29 Sep 2022 23:21:32 +0300 Subject: [PATCH 03/12] Write and refine base documentation --- cascade/base/meta_handler.py | 112 ++++++++++++++++++++--------------- cascade/base/traceable.py | 26 ++++++-- 2 files changed, 85 insertions(+), 53 deletions(-) diff --git a/cascade/base/meta_handler.py b/cascade/base/meta_handler.py index 886da83b..28139baa 100644 --- a/cascade/base/meta_handler.py +++ b/cascade/base/meta_handler.py @@ -16,9 +16,8 @@ import os import json -from typing import Union import datetime -from typing import List, Dict +from typing import Union, List, Dict from json import JSONEncoder import yaml @@ -61,15 +60,15 @@ def default(self, obj): return super(CustomEncoder, self).default(obj) - def obj_to_dict(self, obj): + def obj_to_dict(self, obj) -> Dict: return json.loads(self.encode(obj)) class BaseHandler: - def read(self, path) -> List[Dict]: + def read(self, path: str) -> Union[Dict, List[Dict]]: raise NotImplementedError() - def write(self, path, obj, overwrite=True) -> None: + def write(self, path: str, obj, overwrite=True) -> None: raise NotImplementedError() def _raise_io_error(self, path, exc): @@ -80,24 +79,7 @@ def _raise_io_error(self, path, exc): class JSONHandler(BaseHandler): - """ - Handles the logic of dumping and loading json files - """ - def read(self, path) -> Union[Dict, List[Dict]]: - """ - Reads json from path - - Parameters - ---------- - path: - Path to the file. If no extension provided, - then .json will be added - - Raises - ------ - IOError - when decoding errors occur - """ + def read(self, path: str) -> Union[Dict, List[Dict]]: _, ext = os.path.splitext(path) if ext == '': path += '.json' @@ -111,32 +93,16 @@ def read(self, path) -> Union[Dict, List[Dict]]: self._raise_io_error(path, e) return meta - def write(self, name, obj: List[Dict], overwrite=True) -> None: - """ - Writes json to path using custom encoder - """ - if not overwrite and os.path.exists(name): + def write(self, path: str, obj: List[Dict], overwrite=True) -> None: + if not overwrite and os.path.exists(path): return - with open(name, 'w') as f: + with open(path, 'w') as f: json.dump(obj, f, cls=CustomEncoder, indent=4) class YAMLHandler(BaseHandler): - def read(self, path) -> Union[Dict, List[Dict]]: - """ - Reads yaml from path - - Parameters - ---------- - path: - Path to the file. 
If no extension provided, then .yml will be added - - Raises - ------ - IOError - when decoding errors occur - """ + def read(self, path: str) -> Union[Dict, List[Dict]]: _, ext = os.path.splitext(path) if ext == '': path += '.yml' @@ -148,7 +114,7 @@ def read(self, path) -> Union[Dict, List[Dict]]: self._raise_io_error(path, e) return meta - def write(self, path, obj, overwrite=True) -> None: + def write(self, path: str, obj, overwrite=True) -> None: if not overwrite and os.path.exists(path): return @@ -158,14 +124,14 @@ def write(self, path, obj, overwrite=True) -> None: class TextHandler(BaseHandler): - def read(self, path) -> Dict: + def read(self, path: str) -> Dict: """ Reads text file from path and returns dict in the form {path: 'text from file'} Parameters ---------- - path: + path: str Path to the file """ @@ -179,11 +145,61 @@ def write(self, path, obj, overwrite=True) -> None: class MetaHandler: - def read(self, path) -> List[Dict]: + """ + Encapsulates the logic of reading and writing metadata to disk. + + Supported read-write formats are `json` and `yml`. Other formats + are supported as read-only. For example, one can read meta from a txt or md file. + + Examples + -------- + >>> from cascade.base import MetaHandler + >>> mh = MetaHandler() + >>> mh.write('meta.json', {'hello': 'world'}) + >>> obj = mh.read('meta.json') + >>> mh.write('meta.yml', {'hello': 'world'}) + >>> obj = mh.read('meta.yml') + """ + def read(self, path: str) -> Union[Dict, List[Dict]]: + """ + Reads object from path. + + Parameters + ---------- + path: str + Path to the object. + + Returns + ------- + obj: Union[Dict, List[Dict]] + + Raises + ------ + IOError + when decoding errors occur + """ handler = self._get_handler(path) return handler.read(path) - def write(self, path, obj, overwrite=True) -> None: + def write(self, path: str, obj, overwrite: bool = True) -> None: + """ + Writes object to path. + + Parameters + ---------- + path: str + Path where to write the object, with name and extension + obj + An object to be serialized and saved + overwrite: bool, optional + Whether to overwrite the file if it already exists. If False + and the file already exists, will silently return without saving. + + Raises + ------ + IOError + when encoding errors occur + """ handler = self._get_handler(path) return handler.write(path, obj, overwrite=overwrite) diff --git a/cascade/base/traceable.py b/cascade/base/traceable.py index 1de4d4a3..15ea19b9 100644 --- a/cascade/base/traceable.py +++ b/cascade/base/traceable.py @@ -3,7 +3,23 @@ class Traceable: - def __init__(self, *args, meta_prefix=None, **kwargs) -> None: + """ + Base class for everything that has metadata in cascade. + Handles the logic of getting and updating internal meta prefix. + """ + def __init__(self, *args, meta_prefix: Union[Dict, str] = None, **kwargs) -> None: + """ + Parameters + ---------- + meta_prefix: Union[Dict, str], optional + The dictionary that is used to update the object's meta in the `get_meta` call. + Since `update` is used, it can overwrite default values. + If str, the prefix is assumed to be a path and is loaded using MetaHandler. + + See also + -------- + cascade.base.MetaHandler + """ if meta_prefix is None: meta_prefix = {} elif isinstance(meta_prefix, str): @@ -22,8 +38,8 @@ def get_meta(self) -> List[Dict]: meta: List[Dict] A list where last element is this object's metadata. Meta can be anything that is worth to document about - the object and its properties. This is done in form - of list to enable cascade-like calls in Modifiers and Samplers. 
+ the object and its properties. + Meta is a list to allow the formation of pipelines. """ meta = { 'name': repr(self) } @@ -36,8 +52,8 @@ def get_meta(self) -> List[Dict]: def update_meta(self, obj: Union[Dict, str]) -> None: """ - Updates _meta_prefix, which is then updates - dataset's meta when get_meta() is called + Updates `_meta_prefix`, which then updates + dataset's meta when `get_meta()` is called """ if isinstance(obj, str): obj = self._read_meta_from_file(obj) From 6456a8a127b457db449106082075865a520c06b7 Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 29 Sep 2022 23:46:32 +0300 Subject: [PATCH 04/12] Failed test of Concatenator's meta --- cascade/tests/test_concatenator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cascade/tests/test_concatenator.py b/cascade/tests/test_concatenator.py index 3359d9a8..dc4f56ef 100644 --- a/cascade/tests/test_concatenator.py +++ b/cascade/tests/test_concatenator.py @@ -31,6 +31,7 @@ def test_meta(): c = Concatenator([n1, n2], meta_prefix={'num': 1}) assert c.get_meta()[0]['num'] == 1 + assert len(c.get_meta()[0]['data']) == 2 @pytest.mark.parametrize( From a20ba8205c43988ef9261d867a6eb67ae19f23fd Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 29 Sep 2022 23:49:15 +0300 Subject: [PATCH 05/12] Return to the data of Concatenator as list --- cascade/data/concatenator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cascade/data/concatenator.py b/cascade/data/concatenator.py index 61154651..e7c286c4 100644 --- a/cascade/data/concatenator.py +++ b/cascade/data/concatenator.py @@ -67,7 +67,5 @@ def get_meta(self) -> List[Dict]: Concatenator calls `get_meta()` of all its datasets """ meta = super().get_meta() - meta[0]['data'] = {} - for ds in self._datasets: - meta[0]['data'][repr(ds)] = ds.get_meta() + meta[0]['data'] = [ds.get_meta() for ds in self._datasets] return meta From fc88f06d1a56e9eac244d50e80eae6bd23f81f61 Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Fri, 30 Sep 2022 23:35:54 +0300 Subject: [PATCH 06/12] Write and refine data documentation --- cascade/data/apply_modifier.py | 16 ++++++++++--- cascade/data/bruteforce_cacher.py | 31 ++++++++++++++++++++++-- cascade/data/concatenator.py | 12 ++++++++-- cascade/data/cyclic_sampler.py | 11 ++------- cascade/data/dataset.py | 39 +++++++++++++++++++++---------- cascade/data/folder_dataset.py | 12 +++++++--- cascade/data/pickler.py | 12 +++++----- cascade/data/random_sampler.py | 6 ++--- cascade/data/range_sampler.py | 19 ++++++++++++++- cascade/data/sequential_cacher.py | 12 ++++++---- cascade/data/utils.py | 12 +++++----- 11 files changed, 131 insertions(+), 51 deletions(-) diff --git a/cascade/data/apply_modifier.py b/cascade/data/apply_modifier.py index a464d381..97636927 100644 --- a/cascade/data/apply_modifier.py +++ b/cascade/data/apply_modifier.py @@ -20,17 +20,27 @@ class ApplyModifier(Modifier): """ - Modifier that maps a function to previous dataset's elements in a lazy way. + Modifier that maps a function to the given dataset's items in a lazy way. 
""" def __init__(self, dataset: Dataset, func: Callable, *args, **kwargs) -> None: """ Parameters ---------- dataset: Dataset - a dataset to modify + A dataset to modify func: Callable - a function to be applied to every item of a dataset - + A function to be applied to every item of a dataset - each `__getitem__` would call `func` on an item obtained from a previous dataset + + Examples + -------- + >>> from cascade import data as cdd + >>> ds = cdd.Wrapper([0, 1, 2, 3, 4]) + >>> ds = cdd.ApplyModifier(ds, lambda x: x ** 2) + + Now function will only be applied when items are retrieved + + >>> assert [item for item in ds] == [0, 1, 4, 9, 16] """ super().__init__(dataset, *args, **kwargs) self._func = func diff --git a/cascade/data/bruteforce_cacher.py b/cascade/data/bruteforce_cacher.py index 02b448ae..b07422d2 100644 --- a/cascade/data/bruteforce_cacher.py +++ b/cascade/data/bruteforce_cacher.py @@ -20,14 +20,41 @@ class BruteforceCacher(Modifier): """ - Unusual modifier which loads everything in memory in initialization phase - and then returns values from cache + Identity modifier that calls all previous pipeline in __init__ loading everything + in memory. This is useful in combination with `Pickler` when pipeline + has heavy operations upstream. You can load everything and pickle it to turn off + heavy part of the pipeline. + + Examples + -------- + >>> from cascade import data as cdd + >>> ds = cdd.Wrapper([0 for _ in range(1000000)]) + >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1) + >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1) + >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1) + + Cache heavy upstream part once + + >>> ds = cdd.BruteforceCacher(ds) + + Then pickle it + + >>> ds = cdd.Pickler('ds', ds) + + Unpickle and use further + + >>> ds = cdd.Pickler('ds') + >>> ds = cdd.RandomSampler(ds, 1000) See also -------- cascade.data.SequentialCacher + cascade.data.Pickler """ def __init__(self, dataset: Dataset, *args, **kwargs) -> None: + """ + Loads every item in dataset in internal list. + """ super().__init__(dataset, *args, **kwargs) # forcibly calling all previous datasets in the init if hasattr(self._dataset, '__len__') and hasattr(self._dataset, '__getitem__'): diff --git a/cascade/data/concatenator.py b/cascade/data/concatenator.py index 61154651..9c4c420e 100644 --- a/cascade/data/concatenator.py +++ b/cascade/data/concatenator.py @@ -23,6 +23,14 @@ class Concatenator(Dataset): """ Unifies several Datasets under one, calling them sequentially in the provided order. 
+ + Examples + -------- + >>> from cascade.data import Wrapper, Concatenator + >>> ds_1 = Wrapper([0, 1, 2]) + >>> ds_2 = Wrapper([2, 1, 0]) + >>> ds = Concatenator((ds_1, ds_2)) + >>> assert [item for item in ds] == [0, 1, 2, 2, 1, 0] """ def __init__(self, datasets: Iterable[Dataset], *args, **kwargs) -> None: """ Parameters ---------- - datasets: Iterable[Dataset] - a list or tuple of datasets to concatenate + datasets: Iterable[Dataset] + A list or tuple of datasets to concatenate """ self._datasets = datasets lengths = [len(ds) for ds in self._datasets] diff --git a/cascade/data/cyclic_sampler.py b/cascade/data/cyclic_sampler.py index ed3eeddc..9b4023ed 100644 --- a/cascade/data/cyclic_sampler.py +++ b/cascade/data/cyclic_sampler.py @@ -25,15 +25,8 @@ class CyclicSampler(Sampler): ------- >>> from cascade.data import CyclicSampler, Wrapper >>> ds = Wrapper([1,2,3]) - >>> ds = CyclicSampler(ds, 5) - >>> for item in ds: ... print(item) ... - 1 - 2 - 3 - 1 - 2 + >>> ds = CyclicSampler(ds, 7) + >>> assert [item for item in ds] == [1, 2, 3, 1, 2, 3, 1] """ def __getitem__(self, index) -> T: internal_index = index % len(self._dataset) diff --git a/cascade/data/dataset.py b/cascade/data/dataset.py index 4223d5c9..37c4e3ba 100644 --- a/cascade/data/dataset.py +++ b/cascade/data/dataset.py @@ -11,7 +11,7 @@ limitations under the License. """ -from typing import Dict, Generic, Iterable, List, TypeVar +from typing import Dict, Generic, Iterable, List, Mapping, TypeVar from ..base import Traceable T = TypeVar('T') @@ -35,7 +35,7 @@ def __getitem__(self, index) -> T: """ Abstract method - should be defined in every successor """ - raise NotImplementedError + raise NotImplementedError() def get_meta(self) -> List[Dict]: """ @@ -50,22 +50,25 @@ meta[0]['type'] = 'dataset' return meta - def __repr__(self): + def __repr__(self) -> str: """ Returns ------- - string representation of a Dataset. This repr used as a name for get_meta() method - by default gives the name of class from basic repr + repr: str + Representation of a Dataset. This repr is used as a name by the get_meta() method; + by default it gives the name of the class from the basic repr See also -------- cascade.data.Dataset.get_meta() """ - rp = super().__repr__() - return rp[1:].split()[0] + return super().__repr__().split()[0] class Iterator(Dataset): + """ + Wraps Dataset around any Iterable. Does not have map-like interface. + """ def __init__(self, data: Iterable, *args, **kwargs): super().__init__(*args, **kwargs) self._data = data @@ -87,7 +90,7 @@ class Wrapper(Dataset): """ Wraps Dataset around any list-like object. """ - def __init__(self, obj, *args, **kwargs) -> None: + def __init__(self, obj: Mapping, *args, **kwargs) -> None: self._data = obj super().__init__(*args, **kwargs) @@ -120,7 +123,7 @@ def __init__(self, dataset: Dataset, *args, **kwargs) -> None: Parameters ---------- dataset: Dataset - a dataset to modify + A dataset to modify """ self._dataset = dataset super().__init__(*args, **kwargs) @@ -128,7 +131,7 @@ def __init__(self, dataset: Dataset, *args, **kwargs) -> None: def __getitem__(self, index) -> T: return self._dataset[index] - def __iter__(self): + def __iter__(self) -> T: for i in range(len(self)): yield self.__getitem__(i) @@ -150,14 +153,26 @@ def get_meta(self) -> List[Dict]: class Sampler(Modifier): """ Defines certain sampling over a Dataset. 
Its distinctive feature is that it changes the number of - items in dataset. It can constitute a batch sampler or random sampler or sample in cycling manner. + items in the dataset. It can be used to build a batch sampler, a random sampler, etc. See also -------- cascade.data.CyclicSampler + cascade.data.RandomSampler + cascade.data.RangeSampler """ def __init__(self, dataset: Dataset, num_samples: int, *args, **kwargs) -> None: - assert num_samples > 0 + """ + Constructs a Sampler. + + Parameters + ---------- + dataset: Dataset + A dataset to sample from + num_samples: int + The number of samples + """ + assert num_samples > 0, 'The number of samples should be positive' super().__init__(dataset, *args, **kwargs) self._num_samples = num_samples diff --git a/cascade/data/folder_dataset.py b/cascade/data/folder_dataset.py index e9fb13b2..92b3174f 100644 --- a/cascade/data/folder_dataset.py +++ b/cascade/data/folder_dataset.py @@ -8,13 +8,19 @@ class FolderDataset(Dataset): """ Basic "folder of files" dataset. Accepts root folder in which considers all files. - Is abstract - getitem is not defined, since it is specific for each file type + Is abstract - getitem is not defined, since it is specific for each file type. See also -------- cascade.utils.FolderImageDataset """ - def __init__(self, root, *args, **kwargs) -> None: + def __init__(self, root: str, *args, **kwargs) -> None: + """ + Parameters + ---------- + root: str + A path to the folder of files + """ super().__init__(*args, **kwargs) self._root = os.path.abspath(root) if not os.path.exists(self._root): @@ -39,5 +45,5 @@ def get_meta(self) -> List[Dict]: meta[0]['md5sums'].append(md5(f.read()).hexdigest()) return meta - def __len__(self): + def __len__(self) -> int: return len(self._names) diff --git a/cascade/data/pickler.py b/cascade/data/pickler.py index 76ec32ec..8ad30731 100644 --- a/cascade/data/pickler.py +++ b/cascade/data/pickler.py @@ -16,14 +16,14 @@ import os import pickle -from . import Modifier +from . import Dataset, Modifier class Pickler(Modifier): """ - Pickles an input dataset or unpickles one + Pickles an input dataset or unpickles one """ - def __init__(self, path, dataset=None, *args, **kwargs) -> None: + def __init__(self, path: str, dataset: Dataset = None, *args, **kwargs) -> None: """ Loads pickled dataset or dumps one depending on parameters passed: Parameters ---------- - path: - path to the pickled dataset + path: str + Path to the pickled dataset dataset: Dataset, optional - a dataset to be pickled + A dataset to be pickled Raises ------ diff --git a/cascade/data/random_sampler.py b/cascade/data/random_sampler.py index f7b2ba4d..ed20eb1d 100644 --- a/cascade/data/random_sampler.py +++ b/cascade/data/random_sampler.py @@ -15,7 +15,7 @@ """ from numpy.random import random_integers, shuffle -from . import Dataset, Sampler +from . import Dataset, Sampler, T class RandomSampler(Sampler): """ Shuffles dataset. Can randomly sample from dataset if num_samples is not None and less than length of dataset. 
""" - def __init__(self, dataset: Dataset, num_samples=None, **kwargs) -> None: + def __init__(self, dataset: Dataset, num_samples: int = None, **kwargs) -> None: """ Parameters ---------- @@ -44,5 +44,5 @@ def __init__(self, dataset: Dataset, num_samples=None, **kwargs) -> None: self._indices = random_integers(0, len(dataset) - 1, num_samples) super().__init__(dataset, num_samples, **kwargs) - def __getitem__(self, index): + def __getitem__(self, index) -> T: return super().__getitem__(self._indices[index]) diff --git a/cascade/data/range_sampler.py b/cascade/data/range_sampler.py index af4b72f8..349bd97b 100644 --- a/cascade/data/range_sampler.py +++ b/cascade/data/range_sampler.py @@ -42,7 +42,24 @@ class RangeSampler(Sampler): 2 3 """ - def __init__(self, dataset: Dataset, start=None, stop=None, step=1, *args, **kwargs) -> None: + def __init__(self, + dataset: Dataset, + start:int = None, + stop:int = None, + step:int = 1, + *args, **kwargs) -> None: + """ + Parameters + ---------- + dataset: Dataset + A dataset to sampler from + start: int + Start index in range - included + stop: int + Stop index in range - excluded + step: int, optional + Step of range + """ if start is not None and stop is None: stop = start start = 0 diff --git a/cascade/data/sequential_cacher.py b/cascade/data/sequential_cacher.py index abdd6369..292f0d71 100644 --- a/cascade/data/sequential_cacher.py +++ b/cascade/data/sequential_cacher.py @@ -29,14 +29,18 @@ class SequentialCacher(Modifier): -------- BruteforceCacher """ - def __init__(self, dataset: Dataset, batch_size=2, *args, **kwargs) -> None: + def __init__( + self, + dataset: Dataset, + batch_size: int = 2, + *args, **kwargs) -> None: """ Parameters ---------- dataset: Dataset - dataset to cache sequentially - batch_size: int, default: 2 - a number of items to load and keep in each moment + Dataset to cache sequentially + batch_size: int, optional + A number of items to load and keep in each moment """ # TODO: make something to release this assert assert hasattr(dataset, '__len__'), 'Dataset should have __len__' diff --git a/cascade/data/utils.py b/cascade/data/utils.py index c8c62b0d..6c877b56 100644 --- a/cascade/data/utils.py +++ b/cascade/data/utils.py @@ -22,24 +22,24 @@ def split(ds: Dataset, frac=0.5, num=None) -> Tuple[Dataset]: >>> ds1, ds2 = cdd.split(ds) >>> print([item for item in ds1]) - ... [0, 1] + [0, 1] >>> print([item for item in ds2]) - ... [2, 3, 4] + [2, 3, 4] >>> ds1, ds2 = cdd.split(ds, 0.6) >>> print([item for item in ds1]) - ... [0, 1, 2] + [0, 1, 2] >>> print([item for item in ds2]) - ... [3, 4] + [3, 4] >>> ds1, ds2 = cdd.split(ds, num=4) >>> print([item for item in ds1]) - ... [0, 1, 2, 3] + [0, 1, 2, 3] >>> print([item for item in ds2]) - ... [4] + [4] ''' if num is None: num = floor(len(ds) * frac) From af0532507d58c9b31cb05b20fe182e563f221eac Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 6 Oct 2022 00:23:46 +0300 Subject: [PATCH 07/12] Write and fix meta documentation --- cascade/meta/history_viewer.py | 24 ++++++++++++++++++------ cascade/meta/meta_validator.py | 16 +++++++++------- cascade/meta/meta_viewer.py | 17 +++++++++-------- cascade/meta/metric_viewer.py | 25 +++++++++++++++++++------ cascade/meta/validator.py | 11 +++++++---- 5 files changed, 62 insertions(+), 31 deletions(-) diff --git a/cascade/meta/history_viewer.py b/cascade/meta/history_viewer.py index 42a4551a..2003726b 100644 --- a/cascade/meta/history_viewer.py +++ b/cascade/meta/history_viewer.py @@ -27,15 +27,20 @@ from . 
import MetaViewer from .. import __version__ +from ..data import Dataset class HistoryViewer: """ The tool which allows the user to visualize training history of model versions. - Uses plotly to show how metrics of models changed in time and how - models with different hyperparameters depend on each other + Shows how metrics of models changed over time and how + models with different hyperparameters depend on each other. """ - def __init__(self, repo, last_lines=None, last_models=None) -> None: + def __init__( + self, + repo, + last_lines: int = None, + last_models: int = None) -> None: """ Parameters ---------- @@ -112,7 +117,7 @@ def _specific_argmin(arr, self_index) -> int: arg_min = i return arg_min - def plot(self, metric: str, show=False) -> plotly.graph_objects.Figure: + def plot(self, metric: str, show: bool = False) -> plotly.graph_objects.Figure: """ Plots training history of model versions using plotly. @@ -120,7 +125,7 @@ def plot(self, metric: str, show=False) -> plotly.graph_objects.Figure: ---------- metric: str Metric should be present in meta of at least one model in repo - show: bool + show: bool, optional Whether to return and show or just return figure """ @@ -199,7 +204,7 @@ def plot(self, metric: str, show=False) -> plotly.graph_objects.Figure: return fig - def serve(self, metric, **kwargs): + def serve(self, metric: str, **kwargs): + """ + Runs a dash-based server with HistoryViewer, updating plots in real time. + + Note + ---- + This feature needs `dash` to be installed. + """ # Conditional import try: import dash diff --git a/cascade/meta/meta_validator.py b/cascade/meta/meta_validator.py index aa4b94fa..25a1da2b 100644 --- a/cascade/meta/meta_validator.py +++ b/cascade/meta/meta_validator.py @@ -45,10 +45,6 @@ class MetaValidator(Validator): If the structure of pipeline is different it saves new meta file. - Raises - ------ - cascade.meta.DataValidationException - See also -------- cascade.data.Modifier @@ -58,10 +54,16 @@ def __init__(self, dataset: Dataset, root=None, meta_fmt='.json') -> None: Parameters ---------- dataset: Dataset - dataset to validate - root: str - path to the folder in which to store meta + Dataset to validate + root: str, optional + Path to the folder in which to store meta; + default is './.cascade' + meta_fmt: str, optional + Format of metadata files + + Raises + ------ + cascade.meta.DataValidationException """ super().__init__(dataset, lambda x: True) self._mh = MetaHandler() diff --git a/cascade/meta/meta_viewer.py b/cascade/meta/meta_viewer.py index b8cb31a1..8afe4d05 100644 --- a/cascade/meta/meta_viewer.py +++ b/cascade/meta/meta_viewer.py @@ -21,22 +21,20 @@ class MetaViewer: """ - The class to read and write meta data. + The class to view all metadata in folders and subfolders. 
""" - def __init__(self, root, filt=None) -> None: + def __init__(self, root: str, filt: Dict=None) -> None: """ Parameters ---------- - root: + root: str path to the folder containing metadata files - to dump and load metadata files MetaHandler is used filt Dict, optional: - dictionary that specifies which values should be present in meta + dictionary that specifies which values that should be present in meta for example to find all models use `filt={'type': 'model'}` See also -------- - cascade.meta.ModelRepo cascade.meta.MetaHandler """ if not os.path.exists(root): @@ -55,12 +53,12 @@ def __init__(self, root, filt=None) -> None: if filt is not None: self.names = list(filter(self._filter, self.names)) - def __getitem__(self, index) -> List[Dict]: + def __getitem__(self, index: int) -> List[Dict]: """ Returns ------- meta: List[Dict] - object containing meta + Meta object """ return self.read(self.names[index]) @@ -92,4 +90,7 @@ def _filter(self, name): @staticmethod def obj_to_dict(obj): + """ + Serializes the object using extended JSONEncoder + """ return JSONEncoder().obj_to_dict(obj) diff --git a/cascade/meta/metric_viewer.py b/cascade/meta/metric_viewer.py index d3db721e..3c58a2ae 100644 --- a/cascade/meta/metric_viewer.py +++ b/cascade/meta/metric_viewer.py @@ -15,6 +15,7 @@ """ import os +from typing import List import warnings import pendulum from flatten_json import flatten @@ -28,8 +29,9 @@ class MetricViewer: """ Interface for viewing metrics in model meta files - uses ModelRepo to extract metrics of all models if any - constructs a `pd.DataFrame` of metrics internally, which is showed in `__repr__` + uses ModelRepo to extract metrics of all models if any. + As metrics it uses data from `metrics` field in models' + meta and as parameters it uses `params` field. """ def __init__(self, repo) -> None: """ @@ -42,7 +44,10 @@ def __init__(self, repo) -> None: self._metrics = [] self.reload_table() - def reload_table(self): + def reload_table(self) -> None: + """ + Updates internal state + """ self._metrics = [] for line in self._repo: viewer_root = line.root @@ -91,7 +96,10 @@ def reload_table(self): def __repr__(self) -> str: return repr(self.table) - def plot_table(self, show=False): + def plot_table(self, show: bool = False): + """ + Uses plotly to graphically show table with metrics and parameters. + """ data = pd.DataFrame(map(flatten, self.table.to_dict('records'))) fig = go.Figure(data=[ go.Table( @@ -107,13 +115,18 @@ def plot_table(self, show=False): fig.show() return fig - def serve(self, page_size=50, include=None, exclude=None, **kwargs) -> None: + def serve( + self, + page_size: int = 50, + include: List[str] = None, + exclude: List[str] = None, + **kwargs) -> None: """ Runs dash-based server with interactive table of metrics and parameters. Parameters ---------- - page_size: + page_size: int, optional Size of the table in rows on one page include: List[str], optional: List of parameters or metrics to be added. Only them will be present along with some default. 
diff --git a/cascade/meta/validator.py b/cascade/meta/validator.py index 25425827..a3ccda24 100644 --- a/cascade/meta/validator.py +++ b/cascade/meta/validator.py @@ -18,7 +18,9 @@ class DataValidationException(Exception): - pass + """ + Raised when data validation fails + """ class Validator(Modifier): @@ -37,7 +39,8 @@ def __init__(self, dataset: Dataset, class AggregateValidator(Validator): """ - This validator accepts an aggregate function that accepts a `Dataset` and return `True` of `False` + This validator accepts an aggregate function + that accepts a `Dataset` and returns `True` or `False` Example ------- @@ -61,8 +64,8 @@ def __init__(self, dataset: Dataset, func: Callable[[Dataset], bool], **kwargs) class PredicateValidator(Validator): """ - This validator accepts function that is applied to each item in dataset and return `True` or `False` - Calls all previous lazy datasets in __init__ + This validator accepts a function that is applied to each item in a dataset + and returns `True` or `False`. Calls `__getitem__` of all previous datasets in `__init__`. Example ------- From 6d4a9f44506126305849c967efd277cc54cefd8f Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 6 Oct 2022 00:24:06 +0300 Subject: [PATCH 08/12] Order records in data docs --- cascade/docs/source/cascade.data.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cascade/docs/source/cascade.data.rst b/cascade/docs/source/cascade.data.rst index 85dd5491..a14c8284 100644 --- a/cascade/docs/source/cascade.data.rst +++ b/cascade/docs/source/cascade.data.rst @@ -109,8 +109,7 @@ cascade.data | -.. autoclass:: cascade.data.SequentialCacher - :members: +.. autoclass:: cascade.data.RangeSampler | @@ -118,8 +117,8 @@ cascade.data | - -.. autoclass:: cascade.data.RangeSampler +.. autoclass:: cascade.data.SequentialCacher + :members: | From 68dd95a2c0c6ef5c998a52af01c0ca84d262ce1c Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 6 Oct 2022 00:52:24 +0300 Subject: [PATCH 09/12] Write and fix models documentation --- cascade/docs/source/cascade.models.rst | 17 +++---- cascade/models/basic_model.py | 21 +++++++-- cascade/models/model.py | 12 ++++- cascade/models/model_line.py | 24 +++++----- cascade/models/model_repo.py | 64 ++++++++++++++------------ cascade/models/trainer.py | 20 ++++---- 6 files changed, 93 insertions(+), 65 deletions(-) diff --git a/cascade/docs/source/cascade.models.rst b/cascade/docs/source/cascade.models.rst index a4129fdb..324208b0 100644 --- a/cascade/docs/source/cascade.models.rst +++ b/cascade/docs/source/cascade.models.rst @@ -1,7 +1,7 @@ cascade.models ============== -.. autoclass:: cascade.models.Model +.. autoclass:: cascade.models.BasicModel :members: | @@ -10,7 +10,7 @@ cascade.models | -.. autoclass:: cascade.models.ModelModifier +.. autoclass:: cascade.models.BasicModelModifier :members: | @@ -19,7 +19,7 @@ cascade.models | -.. autoclass:: cascade.models.BasicModel +.. autoclass:: cascade.models.ModelLine :members: | @@ -28,7 +28,7 @@ cascade.models | -.. autoclass:: cascade.models.BasicModelModifier +.. autoclass:: cascade.models.ModelRepo :members: | @@ -37,7 +37,7 @@ cascade.models | -.. autoclass:: cascade.models.ModelRepo +.. autoclass:: cascade.models.Model :members: | @@ -46,7 +46,7 @@ cascade.models | -.. autoclass:: cascade.models.ModelLine +.. autoclass:: cascade.models.ModelModifier :members: | @@ -55,7 +55,7 @@ cascade.models | -.. autoclass:: cascade.models.Trainer +.. 
autoclass:: cascade.models.BasicTrainer :members: | @@ -64,7 +64,8 @@ cascade.models | -.. autoclass:: cascade.models.BasicTrainer + +.. autoclass:: cascade.models.Trainer :members: | diff --git a/cascade/models/basic_model.py b/cascade/models/basic_model.py index 7455581f..64a9c7ad 100644 --- a/cascade/models/basic_model.py +++ b/cascade/models/basic_model.py @@ -15,7 +15,7 @@ """ -from typing import Dict, Callable, AnyStr +from typing import Dict, Callable from .model import Model, ModelModifier @@ -39,12 +39,24 @@ def save(self, filepath) -> None: def predict(self, x, *args, **kwargs): raise NotImplementedError() - def evaluate(self, x, y, metrics_dict: Dict[AnyStr, Callable], *args, **kwargs) -> None: + def evaluate(self, x, y, metrics_dict: Dict[str, Callable], *args, **kwargs) -> None: """ - Receives x and y batches. Passes x to the model's predict method along with any args or kwargs needed. + Receives x and y validation sequences. Passes x to the model's predict + method along with any args or kwargs needed. Then updates self.metrics with what functions in `metrics_dict` return. `metrics_dict` should contain names of the metrics and the functions with the interface: - f(true, predicted) -> metric_value + f(true, predicted) -> metric_value, where metric_value is not always a scalar - it can be + an array or a dict, for example a confusion matrix. + + Parameters + ---------- + x: + Input of the model. + y: + Desired output to compare with the values predicted. + metrics_dict: Dict[str, Callable] + Dictionary with functions that, given ground-truth and + predicted values, return metrics. """ preds = self.predict(x, *args, **kwargs) self.metrics.update({key: metrics_dict[key](y, preds) for key in metrics_dict}) @@ -57,4 +69,3 @@ class BasicModelModifier(ModelModifier, BasicModel): """ Interface to unify BasicModel and ModelModifier. """ - pass diff --git a/cascade/models/model.py b/cascade/models/model.py index ce311d32..8880f9e2 100644 --- a/cascade/models/model.py +++ b/cascade/models/model.py @@ -57,7 +57,7 @@ def predict(self, *args, **kwargs): def evaluate(self, *args, **kwargs) -> None: """ - Evaluates model against any metrics. Should not return any values, just populating self.metrics dict. + Evaluates model against any metrics. Should not return any value, just populate self.metrics dict. """ raise NotImplementedError() @@ -104,7 +104,17 @@ def get_meta(self) -> List[Dict]: class ModelModifier(Model): + """ + Analog of dataset's Modifier. Can be used to chain + two models in one. + """ def __init__(self, model: Model, **kwargs): + """ + Parameters + ---------- + model: Model + A model to modify. + """ self._model = model super().__init__(**kwargs) diff --git a/cascade/models/model_line.py b/cascade/models/model_line.py index 29c6d3f6..033c3f3e 100644 --- a/cascade/models/model_line.py +++ b/cascade/models/model_line.py @@ -31,19 +31,19 @@ class ModelLine(Traceable): A line of models is typically models with the same hyperparameters and architecture, but different epochs or using different data. - def __init__(self, folder, model_cls=Model, meta_fmt='.json', **kwargs) -> None: + def __init__(self, folder: str, model_cls=Model, meta_fmt='.json', **kwargs) -> None: """ All models in line should be instances of the same class. Parameters ---------- - folder: - Path to a folder where ModelLine will be created or already was created - if folder does not exist, creates it - model_cls: - A class of models in repo. 
ModelLine uses this class to reconstruct a model - meta_fmt: - Format in which to store meta data. '.json', '.yml' are supported. .json is default. + folder: str + Path to a folder where ModelLine will be created or already was created. + If folder does not exist, creates it + model_cls: type, optional + A class of models in line. ModelLine uses this class to reconstruct a model + meta_fmt: str, optional + Format in which to store metadata. See also -------- cascade.models.ModelRepo """ super().__init__(**kwargs) @@ -96,20 +96,20 @@ def __len__(self) -> int: def save(self, model: Model, only_meta=False) -> None: """ - Saves a model and its metadata to a line folder. + Saves a model and its metadata to a line's folder. Model is automatically assigned a number and a model is saved using Model's method `save` in its own folder. Folder's name is assigned using f'{idx:0>5d}'. For example: 00001 or 00042. The name passed to model's save is just "model" without extension. It is Model's responsibility to correctly assign extension and save its own state. - Additionally, saves ModelLine's meta to the Line's root + Additionally, saves ModelLine's meta to the Line's root. Parameters ---------- - model: cascade.models.Model + model: Model Model to be saved - only_meta: bool + only_meta: bool, optional Flag that indicates whether to save model's binaries. If True, saves only metadata. """ idx = len(self.model_names) diff --git a/cascade/models/model_repo.py b/cascade/models/model_repo.py index 6e53023a..a997ab37 100644 --- a/cascade/models/model_repo.py +++ b/cascade/models/model_repo.py @@ -51,40 +51,41 @@ class ModelRepo(Repo): An interface to manage experiments with several lines of models. When created, initializes an empty folder constituting a repository of model lines. - Stores meta-data in file meta.json in the root folder. With every run if the repo was already + Stores its meta-data in its root folder. 
With every run, if the repo was already created earlier, it updates its meta and logs changes in human-readable format in the file history.log Example ------- >>> from cascade.models import ModelRepo + >>> from cascade.utils import ConstantBaseline >>> repo = ModelRepo('repo', _meta_prefix={'description': 'This is a repo with one line for the example.'}) - >>> vgg16_line = repo.add_line('vgg16', VGG16Model) - >>> vgg16 = VGG16Model() - >>> vgg16.fit() - >>> vgg16_line.save(vgg16) + >>> line = repo.add_line('model', ConstantBaseline) + >>> model = ConstantBaseline(1) + >>> model.fit() + >>> line.save(model) >>> from cascade.models import ModelRepo - >>> repo = ModelRepo('repo', lines=[dict(name='vgg16', model_cls=VGGModel)]) - >>> vgg16 = VGG16Model() - >>> vgg16.fit() - >>> repo['vgg16'].save(vgg16) + >>> from cascade.utils import ConstantBaseline + >>> repo = ModelRepo('repo', lines=[dict(name='constant', model_cls=ConstantBaseline)]) + >>> model = ConstantBaseline() + >>> model.fit() + >>> repo['constant'].save(model) """ - def __init__(self, folder, lines=None, overwrite=False, meta_fmt='.json', **kwargs): + def __init__(self, folder, lines: List[Dict] = None, + overwrite: bool = False, meta_fmt: str = '.json', **kwargs): """ Parameters ---------- folder: Path to a folder where ModelRepo needs to be created or already was created. If folder does not exist, creates it automatically. lines: List[Dict], optional A list with parameters of model lines to add at creation or to initialize (alias for `add_model`). overwrite: bool, optional If True will remove folder that is passed in first argument and start a new repo in that place. meta_fmt: str, optional Extension of repo's metadata files, which will also be assigned to the lines by default. See also -------- cascade.models.ModelLine """ @@ -119,18 +120,17 @@ def _load_lines(self): def add_line(self, name, *args, meta_fmt=None, **kwargs): """ - Adds new line to repo if it doesn't exist and returns it - If line exists, defines it in repo + Adds new line to repo if it doesn't exist and returns it. + If line exists, defines it in repo with parameters provided. Supports all the parameters of ModelLine using args and kwargs. Parameters: name: str Name of the line. It is used to name the line's folder. Repo prepends it with `self._root` before creating. meta_fmt: str Format of meta files. If omitted, inherits format from repo. See also -------- cascade.models.ModelLine """ @@ -214,6 +214,9 @@ def get_meta(self) -> List[Dict]: return meta def reload(self) -> None: + """ + Updates internal state. + """ self._load_lines() self._update_meta() @@ -228,6 +231,9 @@ def __add__(self, repo): return ModelRepoConcatenator([self, repo]) def get_line_names(self) -> List[str]: + """ + Returns list of line names. 
+ """ # TODO: write test covering this return list(self.lines.keys()) @@ -236,7 +242,7 @@ class ModelRepoConcatenator(Repo): """ The class to concatenate different Repos. For the ease of use please, don't use it directly. - Just do repo = repo_1 + repo_2 to unify repos. + Just do `repo = repo_1 + repo_2` to unify two or more repos. """ def __init__(self, repos: Iterable[Repo], *args, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/cascade/models/trainer.py b/cascade/models/trainer.py index f2f754e1..c24ebe37 100644 --- a/cascade/models/trainer.py +++ b/cascade/models/trainer.py @@ -19,8 +19,8 @@ def __init__(self, repo: Union[ModelRepo, str], *args, **kwargs) -> None: """ Parameters ---------- - repo: Union[ModelRepo, str] - Either repo or path to it + repo: Union[ModelRepo, str] + Either repo or path to it """ if isinstance(repo, str): self._repo = ModelRepo(repo) @@ -56,10 +56,10 @@ def train(self, train_data: Iterable, test_data: Iterable, *args, - train_kwargs=None, - test_kwargs=None, - epochs=1, - start_from=None, + train_kwargs: Dict = None, + test_kwargs: Dict = None, + epochs: int = 1, + start_from: str = None, **kwargs) -> None: """ Trains, evaluates and saves given model. If specified, loads model from checkpoint. @@ -71,13 +71,13 @@ def train(self, train data to be passed to model's fit() test_data: Iterable test data to be passed to model's evaluate() - train_kwargs: + train_kwargs: Dict, optional arguments for fit() - test_kwargs: + test_kwargs: Dict, optional arguments for evaluate() - the most common is the dict of metrics - epochs: + epochs: int, optional how many times to repeat training on data - start_from: str + start_from: str, optional name of line from which to start, start from the latest model in line """ From 61e6904a025ce836f770d0dc32117f033b7d5849 Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 6 Oct 2022 11:47:25 +0300 Subject: [PATCH 10/12] Write and fix utils documentation --- cascade/utils/baselines.py | 7 ++- cascade/utils/numpy_wrapper.py | 7 +-- cascade/utils/oversampler.py | 10 ++-- cascade/utils/pa_schema_validator.py | 4 ++ cascade/utils/sk_model.py | 6 +-- cascade/utils/table_dataset.py | 48 +++++++++++--------- cascade/utils/text_classification_dataset.py | 9 +++- cascade/utils/time_series_dataset.py | 38 +++++++++++++--- cascade/utils/torch_model.py | 31 +++++++++++-- cascade/utils/undersampler.py | 8 ++-- 10 files changed, 118 insertions(+), 50 deletions(-) diff --git a/cascade/utils/baselines.py b/cascade/utils/baselines.py index ccbd9262..36339b17 100644 --- a/cascade/utils/baselines.py +++ b/cascade/utils/baselines.py @@ -31,7 +31,12 @@ def __init__(self, constant=None, **kwargs) -> None: def fit(self, x, y, *args, **kwargs) -> None: pass - def predict(self, x, *args, **kwargs): + def predict(self, x, *args, **kwargs) -> np.ndarray: + """ + Returns the array of the same shape as input full of + given constant. + """ + # TODO: make more universal when work with input shape return np.full_like(x, self._constant) def save(self, path) -> None: diff --git a/cascade/utils/numpy_wrapper.py b/cascade/utils/numpy_wrapper.py index f25be255..5004a111 100644 --- a/cascade/utils/numpy_wrapper.py +++ b/cascade/utils/numpy_wrapper.py @@ -14,18 +14,19 @@ limitations under the License. """ +from typing import Dict, List import numpy as np from ..data import Wrapper class NumpyWrapper(Wrapper): """ - A wrapper around .npy files. Loads file on `__init__`. + A wrapper around .npy files. Loads file in `__init__`. 
""" - def __init__(self, path, *args, **kwargs): + def __init__(self, path: str, *args, **kwargs) -> None: self._path = path super().__init__(np.load(path), *args, **kwargs) - def get_meta(self): + def get_meta(self) -> List[Dict]: meta = super().get_meta() meta[0]['root'] = self._path diff --git a/cascade/utils/oversampler.py b/cascade/utils/oversampler.py index 5001bfbf..528f2182 100644 --- a/cascade/utils/oversampler.py +++ b/cascade/utils/oversampler.py @@ -14,14 +14,14 @@ limitations under the License. """ -from ..data import Sampler +from ..data import T, Dataset, Sampler import numpy as np from tqdm import trange class OverSampler(Sampler): """ - Accepts datasets which return tuples of objects and labels. + Accepts datasets which return tuples of objects and labels in the respected order. Isn't lazy - runs through all the items ones to determine key order. Doesn't store values afterwards. @@ -29,7 +29,7 @@ class OverSampler(Sampler): of times needed to make equal distribution. Works for any number of classes. """ - def __init__(self, dataset, *args, **kwargs): + def __init__(self, dataset: Dataset, *args, **kwargs) -> None: labels = [int(dataset[i][1]) for i in trange(len(dataset))] ulabels = np.unique(labels) label_nums, _ = np.histogram(labels, bins=len(ulabels)) @@ -47,12 +47,12 @@ def __init__(self, dataset, *args, **kwargs): super().__init__(dataset, num_samples=ln, *args, **kwargs) - def __getitem__(self, index): + def __getitem__(self, index: int) -> T: if index < len(self._dataset): return self._dataset[index] else: idx = self._add_indices[index - len(self._dataset)] return self._dataset[idx] - def __len__(self): + def __len__(self) -> int: return len(self._dataset) + len(self._add_indices) diff --git a/cascade/utils/pa_schema_validator.py b/cascade/utils/pa_schema_validator.py index 04172b2c..e29081d9 100644 --- a/cascade/utils/pa_schema_validator.py +++ b/cascade/utils/pa_schema_validator.py @@ -19,6 +19,10 @@ def __init__(self, dataset, schema, *args, **kwargs) -> None: Schema of the table in the format that is acceptable by pandera or path to the YAML file with schema. For more details on schemas see pandera's documentation. + + Raises + ------ + DataValidationException """ super().__init__(dataset, *args, func=lambda x: self._validate(x, schema), **kwargs) diff --git a/cascade/utils/sk_model.py b/cascade/utils/sk_model.py index cb94f3c8..4d4d3a2f 100644 --- a/cascade/utils/sk_model.py +++ b/cascade/utils/sk_model.py @@ -38,7 +38,7 @@ def __init__(self, name=None, blocks=None, **kwargs) -> None: ---------- name: str, optional Name of the model - blocks: list + blocks: list, optional List of sklearn transformers to make a pipeline from """ if name is not None: @@ -86,7 +86,7 @@ def predict_proba(self, x, *args, **kwargs): # hash from meta: {meta["md5sum"]}\n \ # hash from .pkl: {file_hash}') - def load(self, path) -> None: + def load(self, path: str) -> None: """ Loads the model from path provided. If no extension, .pkl is added. """ @@ -102,7 +102,7 @@ def load(self, path) -> None: with open(path, 'rb') as f: self._pipeline = pickle.load(f) - def save(self, path) -> None: + def save(self, path: str) -> None: """ Saves model to the path provided. If no extension, then .pkl is added. diff --git a/cascade/utils/table_dataset.py b/cascade/utils/table_dataset.py index ad5b05ed..fa5706bc 100644 --- a/cascade/utils/table_dataset.py +++ b/cascade/utils/table_dataset.py @@ -14,7 +14,7 @@ limitations under the License. 
""" -from typing import List, Dict +from typing import List, Dict, Iterable import pandas as pd from dask import dataframe as dd @@ -24,13 +24,14 @@ class TableDataset(Dataset): """ - Wrapper for `pd.DataFrame`s + Wrapper for `pd.DataFrame`s which allows to manage metadata and perform + validation. """ def __init__(self, *args, t=None, **kwargs): """ Parameters ---------- - t: + t: optional pd.DataFrame or TableDataset to be set as table """ super().__init__(*args, **kwargs) @@ -45,7 +46,7 @@ def __init__(self, *args, t=None, **kwargs): def __getitem__(self, index): """ - Returns row from table by index + Returns a row from table by index """ return self._table.iloc[index] @@ -54,7 +55,7 @@ def __repr__(self): def __len__(self): """ - Return len of the table + Returns length of the table """ return len(self._table) @@ -70,7 +71,8 @@ def get_meta(self) -> List[Dict]: def to_csv(self, path, **kwargs): """ - Saves the table to .csv + Saves the table to .csv file. Any kwargs are sent to + `pd.DataFrame.to_csv`. """ self._table.to_csv(path, **kwargs) @@ -79,14 +81,15 @@ class TableFilter(TableDataset, Modifier): """ Filter for table values """ - def __init__(self, dataset, mask, *args, **kwargs): + def __init__(self, dataset: TableDataset, + mask: Iterable[bool], *args, **kwargs): """ Parameters ---------- dataset: TableDataset - Dataset to be filtered + Dataset to be filtered. mask: Iterable[bool] - Binary mask to select values from table + Binary mask to select values from table. """ super().__init__(dataset, t=dataset._table, *args, **kwargs) init_len = len(dataset) @@ -101,7 +104,7 @@ class CSVDataset(TableDataset): """ def __init__(self, csv_file_path, *args, **kwargs): """ - Passes all args and kwargs to the read_csv + Passes all args and kwargs to `pd.read_csv` Parameters ---------- @@ -115,7 +118,11 @@ def __init__(self, csv_file_path, *args, **kwargs): class PartedTableLoader(Dataset): """ Works like CSVDataset, but uses dask to load tables - and returns partitions on __getitem__ + and returns partitions on `__getitem__`. + + See also + -------- + cascade.utils.CSVDataset """ def __init__(self, csv_file_path, *args, **kwargs): super().__init__(**kwargs) @@ -123,13 +130,13 @@ def __init__(self, csv_file_path, *args, **kwargs): def __getitem__(self, index): """ - Returns partition under the index + Returns partition under the index. """ return self._table.get_partition(index).compute() def __len__(self): """ - The number of partitions + Returns the number of partitions. """ return self._table.npartitions @@ -138,14 +145,13 @@ class TableIterator(Iterator): """ Iterates over the table from path by the chunks. """ - def __init__(self, csv_file_path, *args, chunk_size=1000, **kwargs): + def __init__(self, csv_file_path: str, *args, chunk_size:int = 1000, **kwargs): """ Parameters ---------- - csv_file_path: - path to the .csv file - - chunk_size: int + csv_file_path: str + Path to the .csv file + chunk_size: int, optional number of rows to return in one __next__ """ self.chunk_size = chunk_size @@ -178,13 +184,13 @@ def __len__(self): class NullValidator(TableDataset, AggregateValidator): """ - Checks there are no null values in the table. + Checks that there are no null values in the table. 
""" def __init__(self, dataset: TableDataset, *args, **kwargs) -> None: - super().__init__(dataset, self.check_nulls, + super().__init__(dataset, self._check_nulls, *args, t=dataset._table, **kwargs) - def check_nulls(self, x): + def _check_nulls(self, x): mask = x._table.isnull().values if ~(mask.any()): return True diff --git a/cascade/utils/text_classification_dataset.py b/cascade/utils/text_classification_dataset.py index 41021b87..11c5fa61 100644 --- a/cascade/utils/text_classification_dataset.py +++ b/cascade/utils/text_classification_dataset.py @@ -26,13 +26,15 @@ class TextClassificationDataset(Dataset): Dataset to simplify loading of data for text classification. Texts of different classes should be placed in different folders. """ - def __init__(self, path, encoding='utf-8', *args, **kwargs): + def __init__(self, path: str, encoding: str = 'utf-8', *args, **kwargs): """ Parameters ---------- - path: + path: str Path to the folder with folders of text files. In each folder should be only one class of texts. + encoding: str, optional + Encoding that is used to open files. """ super().__init__(*args, *kwargs) self._encoding = encoding @@ -57,6 +59,9 @@ def __getitem__(self, index): return text, label def __len__(self): + """ + Total number of files. + """ return len(self._paths) def get_meta(self) -> List[Dict]: diff --git a/cascade/utils/time_series_dataset.py b/cascade/utils/time_series_dataset.py index 89c78851..171e429e 100644 --- a/cascade/utils/time_series_dataset.py +++ b/cascade/utils/time_series_dataset.py @@ -14,7 +14,7 @@ limitations under the License. """ -from typing import Iterable +from typing import Iterable, Literal import pendulum from datetime import datetime @@ -35,9 +35,9 @@ def __init__(self, *args, time=None, data=None, **kwargs): """ Parameters ---------- - time: Iterable[datetime] + time: Iterable[datetime], optional The time dimension. Should be represented subclasses of datetime - data: Iterable + data: Iterable, optional The data dimension. Should be 1D array or list. """ if time is not None and data is not None: @@ -72,6 +72,8 @@ def __init__(self, *args, time=None, data=None, **kwargs): def to_numpy(self): """ + Returns only data without time in numpy array format. + Returns ------- data: np.ndarray @@ -92,8 +94,8 @@ def get_data(self): """ Returns ------- - data: tuple(time, data) - (time as it is and data as np.array) + data: tuple + Time and data as np.array """ return self._time, self.to_numpy() @@ -153,8 +155,23 @@ def __len__(self): class Average(TimeSeriesDataset, Modifier): + """ + Averages values over some time step. + """ def __init__(self, dataset: TimeSeriesDataset, - unit='years', amount=1, *args, **kwargs): + unit: str = 'years', + amount=1, *args, **kwargs): + """ + Parameters + ---------- + dataset: TimeSeriesDataset, + A dataset to average + unit: str, optional + Time unit over which to average - years, month, etc. + amount: + The amount of units over which to average. For example for six month periods use + `unit='months'` and `amount=6`. + """ time, data = dataset.get_data() reg_time = [d for d in pendulum .period(time[0], time[-1]) @@ -180,6 +197,9 @@ def _avg(arr, arr_dates, dates): class Interpolate(TimeSeriesDataset, Modifier): + """ + The wrapper around pd.Series.interpolate. 
+ """ def __init__(self, dataset, method='linear', limit_direction='both', **kwargs): t = dataset.to_pandas() @@ -190,6 +210,12 @@ def __init__(self, dataset, method='linear', class Align(TimeSeriesDataset, Modifier): + """ + Given dataset and some time scale selects + data from dataset using time scale. Works + only if dataset has data in given points + in time. + """ def __init__(self, dataset, time, *args, **kwargs): super().__init__(dataset, time=time, data=dataset[time], *args, **kwargs) diff --git a/cascade/utils/torch_model.py b/cascade/utils/torch_model.py index 2ddcca43..bc5142fc 100644 --- a/cascade/utils/torch_model.py +++ b/cascade/utils/torch_model.py @@ -14,28 +14,49 @@ limitations under the License. """ +from typing import Dict, List import torch -from typing import ClassVar from ..models import Model class TorchModel(Model): - def __init__(self, model_class: ClassVar, *args, **kwargs) -> None: + """ + The wrapper around `nn.Module`s. + """ + def __init__(self, model_class: type, *args, **kwargs) -> None: + """ + Parameters + ---------- + model_class: type + The class created when new nn.Module was defined. Will be used + to construct model. If any arguments needed, please pass them + into `args` and `kwargs`. + """ self._model = model_class(*args, **kwargs) super().__init__(*args, **kwargs) def predict(self, *args, **kwargs): + """ + Calls internal module with whatever arguments. + """ return self._model(*args, **kwargs) - def save(self, path, *args, **kwargs) -> None: + def save(self, path: str, *args, **kwargs) -> None: + """ + Saves the model using `torch.save`. + """ with open(path, 'wb') as f: + # TODO: pass args and kwargs torch.save(self._model, f) - def load(self, path, *args, **kwargs) -> None: + def load(self, path: str, *args, **kwargs) -> None: + """ + Loads the model using `torch.load`. + """ with open(path, 'rb') as f: self._model = torch.load(f) - def get_meta(self): + def get_meta(self) -> List[Dict]: meta = super().get_meta() meta[0]['module'] = repr(self._model) return meta diff --git a/cascade/utils/undersampler.py b/cascade/utils/undersampler.py index 7092ea32..b61e0551 100644 --- a/cascade/utils/undersampler.py +++ b/cascade/utils/undersampler.py @@ -14,7 +14,7 @@ limitations under the License. """ -from ..data import Sampler +from ..data import T, Dataset, Sampler from numpy import unique, min, histogram from tqdm import trange @@ -23,13 +23,13 @@ class UnderSampler(Sampler): """ Accepts datasets which return tuples of objects and labels. Isn't lazy - runs through all the items ones to determine key order. - Doesn't store values afterwards. + Doesn't store values in memory afterwards. To undersample it removes items of majority class for the amount of times needed to make equal distribution. Works for any number of classes. 
""" - def __init__(self, dataset): + def __init__(self, dataset: Dataset) -> None: labels = [int(dataset[i][1]) for i in trange(len(dataset))] ulabels = unique(labels) label_nums, _ = histogram(labels, bins=len(ulabels)) @@ -46,7 +46,7 @@ def __init__(self, dataset): print(f'Original length was {len(dataset)} and new is {ln}') super().__init__(dataset, ln) - def __getitem__(self, index): + def __getitem__(self, index: int) -> T: idx = self._rem_indices[index] return self._dataset[idx] From fbc793b988b6935e5e9ec078e41f8125429c58c7 Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 6 Oct 2022 12:05:39 +0300 Subject: [PATCH 11/12] Update versions of requirements, make them more compatible --- requirements.txt | 6 +++--- setup.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2bafcdbf..83178784 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ tqdm>=4.64.1 -numpy>=1.23.3 -pandas>=1.4.2 +numpy>=1.18.5 +pandas>=1.4.0 deepdiff>=5.8.0 pendulum>=2.1.2 plotly>=5.7.0 flatten_json>=0.1.13 -pyyaml>=6.0 +pyyaml>=5.4.1 diff --git a/setup.py b/setup.py index 73ef0890..10b5d010 100644 --- a/setup.py +++ b/setup.py @@ -27,12 +27,12 @@ python_requires=">=3.8", install_requires=[ 'tqdm>=4.64.1', - 'numpy>=1.23.3', - 'pandas>=1.4.2', + 'numpy>=1.18.5', + 'pandas>=1.4.0', 'deepdiff>=5.8.0', 'pendulum>=2.1.2', 'plotly>=5.7.0', 'flatten_json>=0.1.13', - 'pyyaml>=6.0' + 'pyyaml>=5.4.1' ] ) From 187fd03e16fa2f793928d9ecd93e1b22c5dfdb50 Mon Sep 17 00:00:00 2001 From: Ilia Moiseev Date: Thu, 6 Oct 2022 12:28:26 +0300 Subject: [PATCH 12/12] Bump version --- cascade/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cascade/__init__.py b/cascade/__init__.py index a1bcd6ab..3c3e69c1 100644 --- a/cascade/__init__.py +++ b/cascade/__init__.py @@ -15,7 +15,7 @@ """ -__version__ = '0.7.2' +__version__ = '0.7.3' __author__ = 'Ilia Moiseev' __author_email__ = 'ilia.moiseev.5@yandex.ru' diff --git a/setup.py b/setup.py index 10b5d010..ddd971fe 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name="cascade-ml", - version='0.7.2', + version='0.7.3', author='Ilia Moiseev', author_email='ilia.moiseev.5@yandex.ru', license='Apache License 2.0',