diff --git a/.travis.yml b/.travis.yml
index ece2993d..4863d49e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,8 @@ language: python
python:
- "3.6"
- - "3.7-dev"
+ - "3.7"
+ - "3.8"
install:
- make install-dev
@@ -24,7 +25,7 @@ env:
#
script:
- mypy $SOURCE_FILES --ignore-missing-imports
- - pylint $SOURCE_FILES -d 'C0103, R0913, R0902, R0914, C0302, R0904, R0801, E1101'
+ - pylint $SOURCE_FILES -d 'C0103, R0913, R0902, R0914, C0302, R0904, R0801, E1101, C0330'
- pytest tests/ --showlocals -v --cov=pymfe/
- make html
diff --git a/Makefile b/Makefile
index 0ae253fe..aad26ecb 100644
--- a/Makefile
+++ b/Makefile
@@ -26,7 +26,7 @@ t: test-cov ## Shortcut to test-cov
code-check: ## Execute the code check with flake8, pylint, mypy.
flake8 $(PACKAGE)
- pylint $(PACKAGE) -d 'C0103, R0913, R0902, R0914, C0302, R0904, R0801, E1101'
+ pylint $(PACKAGE) -d 'C0103, R0913, R0902, R0914, C0302, R0904, R0801, E1101, C0330'
mypy $(PACKAGE) --ignore-missing-imports
c: code-check # Shortcut to code-check
@@ -52,3 +52,6 @@ help: ## List target command description.
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
h: help ## Shortcut to help
+
+format: ## Format the whole package using black
+ @black --line-length 79 pymfe/
diff --git a/README.md b/README.md
index 8a7791dd..25ce199e 100644
--- a/README.md
+++ b/README.md
@@ -1,62 +1,95 @@
-# pymfe: Python Meta-Feature Extractor
[![Build Status](https://travis-ci.org/ealcobaca/pymfe.svg?branch=master)](https://travis-ci.org/ealcobaca/pymfe)
[![codecov](https://codecov.io/gh/ealcobaca/pymfe/branch/master/graph/badge.svg)](https://codecov.io/gh/ealcobaca/pymfe)
[![Documentation Status](https://readthedocs.org/projects/pymfe/badge/?version=latest)](https://pymfe.readthedocs.io/en/latest/?badge=latest)
[![PythonVersion](https://img.shields.io/pypi/pyversions/pymfe.svg)](https://www.python.org/downloads/release/python-370/)
[![Pypi](https://badge.fury.io/py/pymfe.svg)](https://badge.fury.io/py/pymfe)
-The pymfe (**py**thon **m**eta-**f**eature **e**xtractor) provides a comprehensive set of meta-features implemented in python.
-The package brings cutting edge meta-features, following recent literature propose.
-The pymfe architecture was thought to systematically make the extraction, which can produce a robust set of meta-features.
-Moreover, pymfe follows recent meta-feature formalization aiming to make MtL reproducible.
+# pymfe: Python Meta-Feature Extractor
+The pymfe (**py**thon **m**eta-**f**eature **e**xtractor) provides a
+comprehensive set of meta-features implemented in python. The package brings
+cutting-edge meta-features, following recent proposals in the literature. The
+pymfe architecture was designed to extract meta-features systematically, which
+can produce a robust set of meta-features. Moreover, pymfe follows a recent
+meta-feature formalization aiming to make MtL reproducible.
-Here, you can use different measures and summary functions, setting their hyperparameters, and also measuring automatically the elapsed time.
-Moreover, you can extract meta-features from specific models, or even extract meta-features with confidence intervals using bootstrap.
-There are a lot of other interesting features and you can see more about it looking at the documentation.
+Here, you can use different measures and summary functions, set their
+hyperparameters, and also automatically measure the elapsed time. Moreover,
+you can extract meta-features from specific models, or even extract
+meta-features with confidence intervals using bootstrap. There are many
+other interesting features, and you can read more about them in the
+documentation.
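+
+A minimal sketch of these options (the summary functions and the time
+measurement mode chosen below are only illustrative):
+
+```python
+from sklearn.datasets import load_iris
+from pymfe.mfe import MFE
+
+data = load_iris()
+
+# Pick summary functions and ask pymfe to also measure the elapsed time.
+mfe = MFE(summary=["mean", "sd"], measure_time="total")
+mfe.fit(data.data, data.target)
+ft = mfe.extract()
+print(ft)
+```
+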
## Meta-feature
-In the Meta-learning (MtL) literature, meta-features are measures used to characterize data sets and/or their relations with algorithm bias.
-According to Brazdil et al. (2008), "Meta-learning is the study of principled methods that exploit meta-knowledge to obtain efficient models and solutions by adapting the machine learning and data mining process".
-
-Meta-features are used in MtL and AutoML tasks in general, to represent/understand a dataset, to understanding a learning bias, to create machine learning (or data mining) recommendations systems, and to create surrogates models, to name a few.
-
-Pinto et al. (2016) and Rivolli et al. (2018) defined a meta-feature as follows.
-Let be a dataset,
- be a characterization measure,
-and be a summarization function.
-Both and
- have also hyperparameters associated,
+In the Meta-learning (MtL) literature, meta-features are measures used to
+characterize data sets and/or their relations with algorithm bias.
+According to Brazdil et al. (2008), "Meta-learning is the study of principled
+methods that exploit meta-knowledge to obtain efficient models and solutions by
+adapting the machine learning and data mining process".
+
+Meta-features are used in MtL and AutoML tasks in general, to
+represent/understand a dataset, to understand a learning bias, to create
+machine learning (or data mining) recommendation systems, and to create
+surrogate models, to name a few.
+
+Pinto et al. (2016) and Rivolli et al. (2018) defined a meta-feature as
+follows. Let D be a dataset, m be a characterization measure, and σ be a
+summarization function. Both m and σ also have associated hyperparameters,
+h_m and h_σ, respectively.
-Thus, a meta-feature for a given dataset is:
+Thus, a meta-feature f for a given dataset D is:
+f(D) = σ(m(D, h_m), h_σ).
-The measure can extract more than one value from each data set, i.e., can vary according to
-, which can be mapped to a vector of fixed length
- using a summarization function
+The measure m can extract more than one value from each data set, i.e., k'
+can vary according to D, which can be mapped to a vector of fixed length k
+using the summarization function σ.
+
In this package, we provide the following meta-feature groups:
-- **General**: General information related to the dataset, also known as simple measures, such as the number of instances, attributes and classes.
-- **Statistical**: Standard statistical measures to describe the numerical properties of data distribution.
-- **Information-theoretic**: Particularly appropriate to describe discrete (categorical) attributes and their relationship with the classes.
-- **Model-based**: Measures designed to extract characteristics from simple machine learning models.
+- **General**: General information related to the dataset, also known as simple
+ measures, such as the number of instances, attributes and classes.
+- **Statistical**: Standard statistical measures to describe the numerical
+ properties of data distribution.
+- **Information-theoretic**: Particularly appropriate to describe discrete
+ (categorical) attributes and their relationship with the classes.
+- **Model-based**: Measures designed to extract characteristics from simple
+ machine learning models.
- **Landmarking**: Performance of simple and efficient learning algorithms.
-- **Relative Landmarking**: Relative performance of simple and efficient learning algorithms.
-- **Subsampling Landmarking**: Performance of simple and efficient learning algorithms from a subsample of the dataset.
-- **Clustering**: Clustering measures extract information about dataset based on external validation indexes.
-- **Concept**: Estimate the variability of class labels among examples and the examples density.
+- **Relative Landmarking**: Relative performance of simple and efficient
+ learning algorithms.
+- **Subsampling Landmarking**: Performance of simple and efficient learning
+ algorithms from a subsample of the dataset.
+- **Clustering**: Clustering measures extract information about the dataset
+  based on external validation indexes.
+- **Concept**: Estimate the variability of class labels among examples and the
+  examples' density.
- **Itemset**: Compute the correlation between binary attributes.
-- **Complexity**: Estimate the difficulty in separating the data points into their expected classes.
+- **Complexity**: Estimate the difficulty in separating the data points into
+ their expected classes.
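+
+The groups above can be selected individually when instantiating the extractor.
+A minimal sketch (the group names passed below are assumptions based on this
+list):
+
+```python
+from sklearn.datasets import load_iris
+from pymfe.mfe import MFE
+
+data = load_iris()
+
+# Restrict the extraction to two of the meta-feature groups listed above.
+mfe = MFE(groups=["general", "statistical"])
+mfe.fit(data.data, data.target)
+ft = mfe.extract()
+print(ft)
+```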
-In the pymfe package, you can use different measures and summary functions, setting their hyperparameters, and automatically measure the elapsed time.
-Moreover, you can extract meta-features from specific models, or even obtain meta-features with confidence intervals using bootstrap.
-There are many other exciting features. You can see more about it looking at the [documentation](https://pymfe.readthedocs.io/en/latest/api.html).
+In the pymfe package, you can use different measures and summary functions,
+setting their hyperparameters, and automatically measure the elapsed time.
+Moreover, you can extract meta-features from specific models, or even obtain
+meta-features with confidence intervals using bootstrap.
+There are many other exciting features; you can read more about them in
+the [documentation](https://pymfe.readthedocs.io/en/latest/api.html).
## Dependencies
@@ -87,7 +120,12 @@ python3 setup.py install
## Example of use
-The simplest way to extract meta-features is by instantiating the `MFE` class. The parameters are the measures, the group of measures, and the summarization functions to be extracted. The default parameter is to extract all the measures. The `fit` function can be called by passing the `X` and `y`. The `extract` function is used to extract the related measures. A simple example using `pymfe` for supervised tasks is given next:
+The simplest way to extract meta-features is by instantiating the `MFE` class.
+It computes five meta-feature groups by default using mean and standard
+deviation as summary functions: General, Statistical, Information-theoretic,
+Model-based, and Landmarking. The `fit` method can be called by passing the `X`
+and `y`. Then the `extract` method is used to extract the related measures.
+A simple example using `pymfe` for supervised tasks is given next:
```python
# Load a dataset
@@ -117,7 +155,10 @@ ft = mfe.extract()
print(ft)
```
-You can simply omit the target attribute for unsupervised tasks while fitting the data into the MFE model. The `pymfe` package automatically finds and extracts only the metafeatures suitable for this type of task. Examples are given next:
+You can simply omit the target attribute for unsupervised tasks while fitting
+the data into the `MFE` model. The `pymfe` package automatically finds and
+extracts only the metafeatures suitable for this type of task. Examples are
+given next:
```python
# Load a dataset
@@ -141,7 +182,11 @@ ft = mfe.extract()
print(ft)
```
-Several measures return more than one value. To aggregate the returned values, summarization function can be used. This method can compute `min`, `max`, `mean`, `median`, `kurtosis`, `standard deviation`, among others. The default methods are the `mean` and the `sd`. Next, it is possible to see an example of the use of this method:
+Several measures return more than one value. To aggregate the returned values,
+a summarization function can be used. This method can compute `min`, `max`,
+`mean`, `median`, `kurtosis`, `standard deviation`, among others. The default
+methods are the `mean` and the `sd`. Next, it is possible to see an example of
+the use of this method:
```python
## Extract default measures using min, median and max
@@ -157,7 +202,13 @@ ft = mfe.extract()
print(ft)
```
-It is possible to pass custom arguments to every metafeature using MFE `extract` method kwargs. The keywords must be the target metafeature name, and the value must be a dictionary in the format {`argument`: `value`}, i.e., each key in the dictionary is a target argument with its respective value. In the example below, the extraction of metafeatures `min` and `max` happens as usual, but the metafeatures `sd,` `nr_norm` and `nr_cor_attr` will receive user custom argument values, which will interfere in each metafeature result.
+It is possible to pass custom arguments to every metafeature using `MFE`
+`extract` method kwargs. The keywords must be the target metafeature name, and
+the value must be a dictionary in the format {`argument`: `value`}, i.e., each
+key in the dictionary is a target argument with its respective value. In the
+example below, the extraction of metafeatures `min` and `max` happens as
+usual, but the metafeatures `sd`, `nr_norm`, and `nr_cor_attr` will receive
+custom user argument values, which will affect each metafeature result.
```python
# Extract measures with custom user arguments
@@ -171,7 +222,9 @@ ft = mfe.extract(
print(ft)
```
-If you want to extract metafeatures from a pre-fitted machine learning model (from `sklearn package`), you can use the `extract_from_model` method without needing to use the training data:
+If you want to extract metafeatures from a pre-fitted machine learning model
+(from the `sklearn` package), you can use the `extract_from_model` method without
+needing to use the training data:
```python
import sklearn.tree
@@ -196,7 +249,10 @@ ft = extractor.extract_from_model(
print(ft)
```
-You can also extract your metafeatures with confidence intervals using bootstrap. Keep in mind that this method extracts each metafeature several times, and may be very expensive depending mainly on your data and the number of metafeature extract methods called.
+You can also extract your metafeatures with confidence intervals using
+bootstrap. Keep in mind that this method extracts each metafeature several
+times, and may be very expensive depending mainly on your data and the number
+of metafeature extract methods called.
```python
# Extract metafeatures with confidence interval
@@ -213,7 +269,8 @@ print(ft)
```
## Documentation
-We write a great Documentation to guide you on how to use the pymfe library. You can find the Documentation in this [link](https://pymfe.readthedocs.io/en/latest/?badge=latest).
+We provide a detailed [Documentation](https://pymfe.readthedocs.io/en/latest/?badge=latest)
+to guide you on how to use the pymfe library.
You can find in the documentation interesting pages like:
* [Getting started](https://pymfe.readthedocs.io/en/latest/install.html)
* [API documentation](https://pymfe.readthedocs.io/en/latest/api.html)
@@ -222,19 +279,27 @@ You can find in the documentation interesting pages like:
## Developer notes
-* We are glad to accept any contributions, please check [Contributing](https://github.com/ealcobaca/pymfe/blob/master/CONTRIBUTING.md) and the [Documentation](https://pymfe.readthedocs.io/en/latest/?badge=latest).
-* To submit bugs and feature requests, report at [project issues](https://github.com/ealcobaca/pymfe/issues).
-* In the current version, the meta-feature extractor supports only classification problems. The authors plan to extend the package to add clustering and regression measures and to support MtL evaluation measures. For more specific information on how to extract each group of measures, please refer to the functions documentation page and the examples contained therein. For a general overview of the `pymfe` package, please have a look at the associated documentation.
+* We are glad to accept any contributions; please check
+ [Contributing](https://github.com/ealcobaca/pymfe/blob/master/CONTRIBUTING.md)
+ and the [Documentation](https://pymfe.readthedocs.io/en/latest/?badge=latest).
+* To submit bugs and feature requests, report at
+ [project issues](https://github.com/ealcobaca/pymfe/issues).
## License
-This project is licensed under the MIT License - see the [License](LICENSE) file for details.
+This project is licensed under the MIT License - see the
+[License](https://github.com/ealcobaca/pymfe/blob/master/LICENCE) file for
+details.
## Cite Us
-If you use the `pymfe` or [`mfe`](https://github.com/rivolli/mfe) in scientific publication, we would appreciate citations to the following paper:
+If you use `pymfe` in a scientific publication, we would appreciate citations
+to the following paper:
-Edesio Alcobaça, Felipe Siqueira, Adriano Rivolli, Luís P. F. Garcia, Jefferson T. Oliva, & André C. P. L. F. de Carvalho (2020). MFE: Towards reproducible meta-feature extraction. Journal of Machine Learning Research, 21(111), 1-5. http://jmlr.org/papers/v21/19-348.html
+[Edesio Alcobaça, Felipe Siqueira, Adriano Rivolli, Luís P. F. Garcia,
+Jefferson T. Oliva, & André C. P. L. F. de Carvalho (2020).
+MFE: Towards reproducible meta-feature extraction.
+Journal of Machine Learning Research, 21(111), 1-5.](http://jmlr.org/papers/v21/19-348.html)
You can also use the bibtex format:
```bibtex
@@ -257,10 +322,20 @@ You can also use the bibtex format:
```
## Acknowledgments
-We would like to thank every [Contributor](https://github.com/ealcobaca/pymfe/graphs/contributors) directly or indirectly has helped this project to happen. Thank you all.
+We would like to thank every
+[Contributor](https://github.com/ealcobaca/pymfe/graphs/contributors)
+who has directly or indirectly helped this project happen. Thank you all.
## References
-
-1. Rivolli, A., Garcia, L. P. F., Soares, C., Vanschoren, J., and de Carvalho, A. C. P. L. F. (2018). Towards Reproducible Empirical Research in Meta-Learning. arXiv:1808.10406.
-2. Pinto, F., Soares, C., & Mendes-Moreira, J. (2016, April). Towards automatic generation of metafeatures. In Pacific-Asia Conference on Knowledge Discovery and Data Mining (pp. 215-226). Springer, Cham.
-3. Brazdil, P., Carrier, C. G., Soares, C., & Vilalta, R. (2008). Metalearning: Applications to data mining. Springer Science & Business Media.
+1. [Brazdil, P., Carrier, C. G., Soares, C., & Vilalta, R. (2008). Metalearning:
+Applications to data mining. Springer Science
+and Business Media.](https://www.springer.com/gp/book/9783540732624)
+2. [Pinto, F., Soares, C., & Mendes-Moreira, J. (2016, April). Towards automatic
+generation of metafeatures. In Pacific-Asia Conference on Knowledge Discovery
+and Data Mining (pp. 215-226). Springer,
+Cham.](https://link.springer.com/chapter/10.1007/978-3-319-31753-3_18)
+3. [Rivolli, A., Garcia, L. P. F., Soares, C., Vanschoren, J., and de Carvalho,
+A. C. P. L. F. (2018). Characterizing classification datasets: a study of
+meta-features for meta-learning.
+arXiv:1808.10406.](https://arxiv.org/abs/1808.10406v2)
+
diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css
deleted file mode 100644
index 63ee6cc7..00000000
--- a/docs/source/_static/theme_overrides.css
+++ /dev/null
@@ -1,13 +0,0 @@
-/* override table width restrictions */
-@media screen and (min-width: 767px) {
-
- .wy-table-responsive table td {
- /* !important prevents the common CSS stylesheets from overriding
- this as on RTD they are loaded after this stylesheet */
- white-space: normal !important;
- }
-
- .wy-table-responsive {
- overflow: visible !important;
- }
-}
diff --git a/docs/source/about.rst b/docs/source/about.rst
index 4de30d5c..35cc5765 100644
--- a/docs/source/about.rst
+++ b/docs/source/about.rst
@@ -13,12 +13,30 @@ You can find the contributors of this package here_.
Citing PyMFE
------------
-If you use PyMFE in a scientific publication, we would appreciate
-citations to the following paper::
+If you use ``pymfe`` in a scientific publication, we would appreciate citations
+to the following paper:
+
+`Edesio Alcobaça, Felipe Siqueira, Adriano Rivolli, Luís P. F. Garcia,
+Jefferson T. Oliva, & André C. P. L. F. de Carvalho (2020).
+MFE: Towards reproducible meta-feature extraction. Journal of Machine Learning
+Research, 21(111), 1-5. <http://jmlr.org/papers/v21/19-348.html>`_
+
+You can also use the bibtex format::
+
+ @article{JMLR:v21:19-348,
+ author = {Edesio Alcobaça and
+ Felipe Siqueira and
+ Adriano Rivolli and
+ Luís P. F. Garcia and
+ Jefferson T. Oliva and
+ André C. P. L. F. de Carvalho
+ },
+ title = {MFE: Towards reproducible meta-feature extraction},
+ journal = {Journal of Machine Learning Research},
+ year = {2020},
+ volume = {21},
+ number = {111},
+ pages = {1-5},
+ url = {http://jmlr.org/papers/v21/19-348.html}
+ }
- None
-
-Extra information
------------------
-See the `README `_
-file from GitHub for extra information.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 29eb553e..28f56c86 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -87,13 +87,7 @@
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-html_context = {
- 'css_files': [
- '_static/theme_overrides.css', # override wide tables in RTD theme
- ],
- }
+# html_static_path = ['_static']
# Output file base name for HTML help builder.
@@ -124,8 +118,8 @@
'reference_url': {
# The module you locally document uses None
'pymfe': None,
- }
- # 'plot_gallery': True,
+ },
+ # 'thumbnail_size': (50, 50),
# 'junit': '../test-results/sphinx-gallery/junit.xml',
# 'log_level': {'backreference_missing': 'warning'},
# 'subsection_order': ExplicitOrder(['../examples/sin_func',
diff --git a/docs/source/new.rst b/docs/source/new.rst
index 40c9679e..882ec055 100644
--- a/docs/source/new.rst
+++ b/docs/source/new.rst
@@ -6,8 +6,8 @@ The PyMFE releases are available in PyPI_ and GitHub_.
.. _GitHub: https://github.com/ealcobaca/pymfe/releases
-Version 0.3.0 (Available on PyPI)
----------------------------------
+Version 0.3.0
+-------------
* Metafeature extraction with confidence intervals
* Pydoc fixes and package documentation/code consistency improvements
@@ -41,8 +41,8 @@ Version 0.3.0 (Available on PyPI)
* Online documentation improvement
-Version 0.2.0 (Available on PyPI)
----------------------------------
+Version 0.2.0
+-------------
* New meta-feature groups
* Complexity
@@ -70,8 +70,8 @@ Version 0.2.0 (Available on PyPI)
* Statistical group updated
-Version 0.1.1 (Available on PyPI)
----------------------------------
+Version 0.1.1
+-------------
* Bugs solved
* False positive of mypy fixed
@@ -88,8 +88,8 @@ Version 0.1.1 (Available on PyPI)
current percentage of progress done so far.
-Version 0.1.0 (Available on PyPI)
----------------------------------
+Version 0.1.0
+-------------
* Meta-feature groups available
* Relative landmarking
@@ -123,15 +123,12 @@ Version 0.1.0 (Available on PyPI)
* Several new tests added
-Version 0.0.3 (Available on PyPI)
----------------------------------
+Version 0.0.3
+-------------
* Documentation improvement
* Setup improvement
-
-Initial Release
----------------
* Meta-feature groups available:
* Simple
@@ -144,4 +141,3 @@ Initial Release
* Landmarking
-
diff --git a/docs/source/using.rst b/docs/source/using.rst
index 31347c25..27668f46 100644
--- a/docs/source/using.rst
+++ b/docs/source/using.rst
@@ -1,12 +1,13 @@
Using PyMFE
###########
Extracting metafeatures with PyMFE is easy.
-
-The parameters are the measures, the group of measures and the summarization
-functions to be extracted. The default behavior is to extract all default
-measures, which is. The ``fit`` function can be called by passing the ``X``
-and ``y``. The ``extract`` function is used to extract the related measures.
-See this example::
+
+The simplest way to extract meta-features is by instantiating the ``MFE`` class.
+It computes five meta-feature groups by default using mean and standard
+deviation as summary functions: General, Statistical, Information-theoretic,
+Model-based, and Landmarking. The ``fit`` method can be called by passing ``X``
+and ``y``. Then the ``extract`` method is used to extract the related measures.
+A simple example using ``pymfe`` for supervised tasks is given next::
# Load a dataset
from sklearn.datasets import load_iris
diff --git a/docs/sphinxext/github_link.py b/docs/sphinxext/github_link.py
index 19150e02..351361c9 100644
--- a/docs/sphinxext/github_link.py
+++ b/docs/sphinxext/github_link.py
@@ -1,3 +1,5 @@
+# Authors : https://github.com/scikit-learn-contrib/imbalanced-learn/blob/master/doc/sphinxext/github_link.py
+
from operator import attrgetter
import inspect
import subprocess
diff --git a/examples/01_introductory_examples/plot_extract_from_model.py b/examples/01_introductory_examples/plot_extract_from_model.py
new file mode 100644
index 00000000..39ca8bca
--- /dev/null
+++ b/examples/01_introductory_examples/plot_extract_from_model.py
@@ -0,0 +1,36 @@
+"""
+Meta-features from a model
+==========================
+
+In this example, we will show you how to extract meta-features from a
+pre-fitted model.
+"""
+
+# Load a dataset
+import sklearn.tree
+from sklearn.datasets import load_iris
+from pymfe.mfe import MFE
+
+iris = load_iris()
+
+###############################################################################
+# If you want to extract metafeatures from a pre-fitted machine learning model
+# (from sklearn package), you can use the `extract_from_model` method without
+# needing to use the training data:
+
+# Extract from model
+
+model = sklearn.tree.DecisionTreeClassifier().fit(iris.data, iris.target)
+extractor = MFE()
+ft = extractor.extract_from_model(model)
+print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))
+
+# Extract specific metafeatures from model
+extractor = MFE(features=["tree_shape", "nodes_repeated"], summary="histogram")
+
+ft = extractor.extract_from_model(
+ model,
+ arguments_fit={"verbose": 1},
+ arguments_extract={"verbose": 1, "histogram": {"bins": 5}})
+
+print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))
diff --git a/examples/01_introductory_examples/plot_unsupervised_meta_features.py b/examples/01_introductory_examples/plot_unsupervised_meta_features.py
new file mode 100644
index 00000000..d467cdfc
--- /dev/null
+++ b/examples/01_introductory_examples/plot_unsupervised_meta_features.py
@@ -0,0 +1,33 @@
+"""
+Extracting meta-features from unsupervised learning
+===================================================
+
+In this example we will show you how to extract meta-features from unsupervised
+machine learning tasks.
+"""
+
+# Load a dataset
+from sklearn.datasets import load_iris
+from pymfe.mfe import MFE
+
+data = load_iris()
+y = data.target
+X = data.data
+
+###############################################################################
+#
+# You can simply omit the target attribute for unsupervised tasks while
+# fitting the data into the MFE model. The `pymfe` package automatically finds
+# and extracts only the metafeatures suitable for this type of task.
+
+# Extract default unsupervised measures
+mfe = MFE()
+mfe.fit(X)
+ft = mfe.extract()
+print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))
+
+# Extract all available unsupervised measures
+mfe = MFE(groups="all")
+mfe.fit(X)
+ft = mfe.extract()
+print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))
diff --git a/examples/02_advanced_examples/README.txt b/examples/02_advanced_examples/README.txt
index 0f8e08f1..b5ade66b 100644
--- a/examples/02_advanced_examples/README.txt
+++ b/examples/02_advanced_examples/README.txt
@@ -2,4 +2,4 @@ Advanced Examples
-----------------
These examples will show you how to use some advanced configurations and tricks
-to make codification more comfortable.
+to make coding more comfortable.
diff --git a/examples/02_advanced_examples/plot_confidence_interval.py b/examples/02_advanced_examples/plot_confidence_interval.py
new file mode 100644
index 00000000..f1927264
--- /dev/null
+++ b/examples/02_advanced_examples/plot_confidence_interval.py
@@ -0,0 +1,35 @@
+"""
+Meta-feature confidence interval
+================================
+
+In this example, we will show you how to extract meta-features with confidence
+intervals.
+"""
+
+# Load a dataset
+from sklearn.datasets import load_iris
+from pymfe.mfe import MFE
+
+data = load_iris()
+y = data.target
+X = data.data
+
+# You can also extract your meta-features with confidence intervals using
+# bootstrap. Keep in mind that this method extracts each meta-feature several
+# times, and may be very expensive depending mainly on your data and the
+# number of meta-feature extract methods called.
+
+# Extract meta-features with confidence interval
+mfe = MFE(features=["mean", "nr_cor_attr", "sd", "max"])
+mfe.fit(X, y)
+
+ft = mfe.extract_with_confidence(
+ sample_num=256,
+ confidence=0.99,
+ verbose=1,
+)
+
+print("\n".join("{:50} {:30} {:30}".format(x, y[0], y[1])
+ for x, y in zip(ft[0], ft[2])))
+
diff --git a/examples/02_advanced_examples/plot_custom_arguments.py b/examples/02_advanced_examples/plot_custom_arguments.py
index 324188b4..cf2a2a9f 100644
--- a/examples/02_advanced_examples/plot_custom_arguments.py
+++ b/examples/02_advanced_examples/plot_custom_arguments.py
@@ -1,6 +1,6 @@
"""
Customizing measures arguments
-===================================
+==============================
In this example we will show you how to customize the measures.
"""
diff --git a/examples/03_miscellaneous_examples/plot_default_value_for_attr_conc.py b/examples/03_miscellaneous_examples/plot_default_value_for_attr_conc.py
new file mode 100644
index 00000000..9a0d0c3e
--- /dev/null
+++ b/examples/03_miscellaneous_examples/plot_default_value_for_attr_conc.py
@@ -0,0 +1,57 @@
+"""
+Default value for the attr_conc meta-feature
+============================================
+
+In this example, we will show you how the default value `max_attr_num` of
+the meta-feature `attr_conc` was chosen.
+"""
+
+# Load a dataset
+from sklearn.datasets import load_iris
+import numpy as np
+import pymfe.mfe
+import matplotlib.pyplot as plt
+
+iris = load_iris()
+
+# Added a default value for `max_attr_num` parameter of the `attr_conc`
+# meta-feature extraction method, which is the most expensive meta-feature
+# extraction method by far.
+
+# The default parameter was determined by a simple inspection of how the
+# feature extraction time grows with the number of attributes in the fitted
+# data. The accepted threshold for the extraction time is a value below 2
+# seconds.
+
+# The test dataset was the iris dataset. The test code used is reproduced
+# below.
+np.random.seed(0)
+
+arrsize = np.zeros(10)
+time = np.zeros(10)
+
+X = np.empty((iris.target.size, 0))
+
+for i in np.arange(10):
+ X = np.hstack((X, iris.data))
+ print(f"{i}. Number of attributes: {X.shape[1]} ...")
+ model = pymfe.mfe.MFE(features="attr_conc",
+ summary="mean",
+ measure_time="total").fit(X)
+ res = model.extract(suppress_warnings=True)
+
+ arrsize[i] = model._custom_args_ft["C"].shape[1]
+ time[i] = res[2][0]
+
+plt.plot(arrsize, time, label="time elapsed")
+plt.hlines(y=np.arange(1, 1 + int(np.ceil(np.max(time)))),
+ xmin=0,
+ xmax=arrsize[-1],
+ linestyle="dotted",
+ color="red")
+plt.legend()
+plt.show()
+
+# The time cost of extraction for the attr_conc meta-feature does not grow
+# significantly with the number of instances and, hence, it is not necessary to
+# sample in the instance axis.
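+
+# If the default does not suit your data, `max_attr_num` can be overridden
+# through the `extract` kwargs, in which each target metafeature name maps to
+# a dict of its arguments. The value below is only illustrative:
+model = pymfe.mfe.MFE(features="attr_conc", summary="mean").fit(iris.data)
+res = model.extract(attr_conc={"max_attr_num": 8}, suppress_warnings=True)
+print(res)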
diff --git a/examples/README.rst b/examples/README.rst
new file mode 100644
index 00000000..cb596e34
--- /dev/null
+++ b/examples/README.rst
@@ -0,0 +1,54 @@
+The PyMFE example gallery
+=========================
+
+In this gallery, we will show a set of examples to help you to use this package and guide you on the meta-feature extraction process.
+
+In the Meta-learning (MtL) literature, meta-features are measures used to characterize data sets and/or their relations with algorithm bias.
+According to Brazdil et al. (2008), "Meta-learning is the study of principled methods that exploit meta-knowledge to obtain efficient models and solutions by adapting the machine learning and data mining process".
+
+Meta-features are used in MtL and AutoML tasks in general, to represent/understand a dataset, to understand a learning bias, to create machine learning (or data mining) recommendation systems, and to create surrogate models, to name a few.
+
+Pinto et al. (2016) and Rivolli et al. (2018) defined a meta-feature as follows.
+Let :math:`D \in \mathcal{D}` be a dataset,
+:math:`m\colon \mathcal{D} \to \mathbb{R}^{k'}` be a characterization measure,
+and :math:`\sigma\colon \mathbb{R}^{k'} \to \mathbb{R}^{k}` be a summarization function.
+Both :math:`m` and
+:math:`\sigma` have also hyperparameters associated,
+:math:`h_m` and
+:math:`h_\sigma` respectively.
+Thus, a meta-feature :math:`f\colon \mathcal{D} \to \mathbb{R}^{k}` for a given dataset :math:`D` is:
+
+.. math::
+ f\big(D\big) = \sigma\big(m(D,h_m), h_\sigma\big).
+
+The measure :math:`m` can extract more than one value from each data set, i.e.,
+:math:`k'` can vary according to
+:math:`D`, which can be mapped to a vector of fixed length
+:math:`k` using a summarization function
+:math:`\sigma`.
+
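+For instance, a hypothetical sketch of this formalization (not the pymfe
+implementation; the measure and summarization functions below are only
+illustrative choices):
+
+.. code-block:: python
+
+    import numpy as np
+
+    # D: a dataset with 100 instances and 4 attributes.
+    D = np.random.rand(100, 4)
+
+    def m(data):
+        # Characterization measure: one value per attribute (k' = 4 here).
+        return data.std(axis=0)
+
+    def sigma(values):
+        # Summarization function: maps the k' values to a fixed-length vector.
+        return np.array([values.mean(), values.std()])
+
+    # f(D) = sigma(m(D, h_m), h_sigma), with the hyperparameters left implicit.
+    print(sigma(m(D)))
+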
+In this package, we provide the following meta-feature groups:
+
+* **General**: General information related to the dataset, also known as simple measures, such as the number of instances, attributes and classes.
+
+* **Statistical**: Standard statistical measures to describe the numerical properties of data distribution.
+
+* **Information-theoretic**: Particularly appropriate to describe discrete (categorical) attributes and their relationship with the classes.
+
+* **Model-based**: Measures designed to extract characteristics from simple machine learning models.
+
+* **Landmarking**: Performance of simple and efficient learning algorithms.
+
+* **Relative Landmarking**: Relative performance of simple and efficient learning algorithms.
+
+* **Subsampling Landmarking**: Performance of simple and efficient learning algorithms from a subsample of the dataset.
+
+* **Clustering**: Clustering measures extract information about the dataset based on external validation indexes.
+
+* **Concept**: Estimate the variability of class labels among examples and the examples' density.
+
+* **Itemset**: Compute the correlation between binary attributes.
+
+* **Complexity**: Estimate the difficulty in separating the data points into their expected classes.
+
+Below is a gallery of examples:
diff --git a/examples/README.txt b/examples/README.txt
deleted file mode 100644
index 93346aa4..00000000
--- a/examples/README.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-The PyMFE example gallery
-=========================
-
-In this gallery, we will show a set of examples to help you to use this package and guide you on the meta-feature extraction process.
-
-
-Extracts meta-features from datasets to support the design of recommendation systems based on Meta-Learning (MtL). The meta-features, also called characterization measures, are able to characterize the complexity of datasets and to provide estimates of algorithm performance. The package contains not only the standard, but also more recent characterization measures. By making available a large set of meta-feature extraction functions, this package allows a comprehensive data characterization, a deep data exploration and a large number of MtL-based data analysis.
-
-Measures
---------
-
-In MtL, meta-features are designed to extract general properties able to characterize datasets. The meta-feature values should provide relevant evidences about the performance of algorithms, allowing the design of MtL-based recommendation systems. Thus, these measures must be able to predict, with a low computational cost, the performance of the algorithms under evaluation. In this package, the meta-feature measures are divided into 11 groups:
-
-- **General**: General information related to the dataset, also known as simple measures, such as the number of instances, attributes and classes.
-- **Statistical**: Standard statistical measures to describe the numerical properties of data distribution.
-- **Information-theoretic**: Particularly appropriate to describe discrete (categorical) attributes and their relationship with the classes.
-- **Model-based**: Measures designed to extract characteristics from simple machine learning models.
-- **Landmarking**: Performance of simple and efficient learning algorithms.
-- **Relative Landmarking**: Relative performance of simple and efficient learning algorithms.
-- **Subsampling Landmarking**: Performance of simple and efficient learning algorithms from a subsample of the dataset.
-- **Clustering**: Clustering measures extract information about dataset based on external validation indexes.
-- **Concept**: Estimate the variability of class labels among examples and the examples density.
-- **Itemset**: Compute the correlation between binary attributes.
-- **Complexity**: Estimate the difficulty in separating the data points into their expected classes.
-
-Below is a gallery of examples:
diff --git a/pymfe/_dev.py b/pymfe/_dev.py
index e2356234..ea6d2d72 100644
--- a/pymfe/_dev.py
+++ b/pymfe/_dev.py
@@ -259,10 +259,12 @@ class post-processing methods (these methods will be explained
# Important detail: all methods must be classmethods; there is no class
# instantiation in the pymfe framework.
@classmethod
- def precompute_basic_precomp_method(cls,
- y: t.Optional[np.ndarray] = None,
- argument_bar: t.Optional[int] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_basic_precomp_method(
+ cls,
+ y: t.Optional[np.ndarray] = None,
+ argument_bar: t.Optional[int] = None,
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""A precomputation method example.
The pydoc of each method must explain cleary what is the purpose of
@@ -395,9 +397,9 @@ def precompute_basic_precomp_method(cls,
return precomp_vals
@classmethod
- def precompute_more_info(cls,
- argument_bar: t.Optional[int] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_more_info(
+ cls, argument_bar: t.Optional[int] = None, **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Highly relevant information about precomputation methods.
1. How many precomputation methods per class?
@@ -479,14 +481,15 @@ def precompute_more_info(cls,
precomp_vals["qux"] = 1.0
precomp_vals["quux"] = 5 + 1.0j * (precomp_vals["qux"])
precomp_vals["quuz"] = np.array(
- [precomp_vals["qux"] + i for i in np.arange(5)])
+ [precomp_vals["qux"] + i for i in np.arange(5)]
+ )
return precomp_vals
@classmethod
- def precompute_random_values(cls,
- random_state: t.Optional[int] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_random_values(
+ cls, random_state: t.Optional[int] = None, **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precomputation method with pseudo-random behavior.
1. An important pymfe default argument for you: 'random_state'
@@ -539,12 +542,12 @@ def precompute_random_values(cls,
@classmethod
def ft_metafeature_name(
- cls,
- X: np.ndarray,
- y: np.ndarray,
- random_state: t.Optional[int] = None,
- opt_arg_bar: float = 1.0,
- opt_arg_baz: np.ndarray = None,
+ cls,
+ X: np.ndarray,
+ y: np.ndarray,
+ random_state: t.Optional[int] = None,
+ opt_arg_bar: float = 1.0,
+ opt_arg_baz: np.ndarray = None,
) -> int:
"""Single-line description of this feature extraction method.
@@ -673,8 +676,9 @@ def ft_metafeature_name(
return ret
@classmethod
- def ft_fitted_data_arguments(cls, X: np.ndarray, N: np.ndarray,
- C: np.ndarray, y: np.ndarray) -> int:
+ def ft_fitted_data_arguments(
+ cls, X: np.ndarray, N: np.ndarray, C: np.ndarray, y: np.ndarray
+ ) -> int:
"""Information about some arguments related to fitted data.
1. Handling Numerical, Categorical and Mixed data types
@@ -728,10 +732,11 @@ def ft_fitted_data_arguments(cls, X: np.ndarray, N: np.ndarray,
@classmethod
def ft_using_precomputed_values(
- cls,
- y: np.ndarray,
- # y_unique: np.ndarray, # Wrong! Need an default value.
- y_unique: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ y: np.ndarray,
+ # y_unique: np.ndarray, # Wrong! Need an default value.
+ y_unique: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Metafeature extraction method using precomputed values.
1. How to use precomputed arguments
@@ -811,10 +816,7 @@ def ft_using_precomputed_values(
return res
@classmethod
- def ft_about_return_values(
- cls,
- y: np.ndarray,
- ) -> np.ndarray:
+ def ft_about_return_values(cls, y: np.ndarray,) -> np.ndarray:
"""Information about return values of feature extraction methods.
1. You have two return options for metafeature extraction methods
@@ -938,14 +940,19 @@ def non_protected_methods_without_any_prefixes(cls) -> None:
fact that it is not of the user's interest.
"""
raise NotImplementedError(
- "Hide me prefixing my name with a single '_'.")
+ "Hide me prefixing my name with a single '_'."
+ )
@classmethod
def postprocess_groupName1_groupName2(
- cls, mtf_names: t.List[str], mtf_vals: t.List[float],
- mtf_time: t.List[float], class_indexes: t.Sequence[int],
- groups: t.Tuple[str, ...], inserted_group_dep: t.FrozenSet[str],
- **kwargs
+ cls,
+ mtf_names: t.List[str],
+ mtf_vals: t.List[float],
+ mtf_time: t.List[float],
+ class_indexes: t.Sequence[int],
+ groups: t.Tuple[str, ...],
+ inserted_group_dep: t.FrozenSet[str],
+ **kwargs
) -> t.Optional[t.Tuple[t.List[str], t.List[float], t.List[float]]]:
"""Introduction to post-processing methods.
diff --git a/pymfe/_internal.py b/pymfe/_internal.py
index ada92f76..310e7e0d 100644
--- a/pymfe/_internal.py
+++ b/pymfe/_internal.py
@@ -109,7 +109,7 @@
"clustering",
"complexity",
"itemset",
- "concept"
+ "concept",
) # type: t.Tuple[str, ...]
GROUP_PREREQUISITES = (
@@ -122,7 +122,7 @@
None,
None,
None,
- None
+ None,
) # type: t.Tuple[t.Optional[str], ...]
VALID_MFECLASSES = (
@@ -135,10 +135,10 @@
clustering.MFEClustering,
complexity.MFEComplexity,
itemset.MFEItemset,
- concept.MFEConcept
+ concept.MFEConcept,
) # type: t.Tuple
-VALID_SUMMARY = (*_summary.SUMMARY_METHODS, ) # type: t.Tuple[str, ...]
+VALID_SUMMARY = (*_summary.SUMMARY_METHODS,) # type: t.Tuple[str, ...]
VALID_TIMEOPT = (
"avg",
@@ -159,7 +159,7 @@
"robust": sklearn.preprocessing.RobustScaler,
}
-VALID_RESCALE = (*_RESCALE_SCALERS, )
+VALID_RESCALE = (*_RESCALE_SCALERS,)
TIMEOPT_AVG_PREFIX = "avg"
@@ -174,8 +174,9 @@
TypeMtdTuple = t.Tuple[str, t.Callable[..., t.Any]]
"""Type annotation which describes the a metafeature method tuple."""
-TypeExtMtdTuple = t.Tuple[str, t.Callable[..., t.Any],
- t.Tuple[str, ...], t.Tuple[str, ...]]
+TypeExtMtdTuple = t.Tuple[
+ str, t.Callable[..., t.Any], t.Tuple[str, ...], t.Tuple[str, ...]
+]
"""Type annotation which extends TypeMtdTuple with extra fields."""
_TYPE_NUMERIC = (
@@ -185,12 +186,7 @@
)
"""Tuple with generic numeric types."""
-TypeNumeric = t.TypeVar(
- "TypeNumeric",
- int,
- float,
- np.number,
-)
+TypeNumeric = t.TypeVar("TypeNumeric", int, float, np.number,)
"""Typing alias of generic numeric types for static code checking."""
@@ -217,11 +213,13 @@
"""Common exceptions of metafeature extraction."""
-def warning_format(message: str,
- category: t.Type[Warning],
- filename: str,
- lineno: int,
- line: str = None) -> str:
+def warning_format(
+ message: str,
+ category: t.Type[Warning],
+ filename: str,
+ lineno: int,
+ line: str = None,
+) -> str:
"""Change warnings format to a simpler one.
Args:
@@ -245,10 +243,11 @@ def warning_format(message: str,
warnings.formatwarning = warning_format
-def _check_values_in_group(value: t.Union[str, t.Iterable[str]],
- valid_group: t.Iterable[str],
- wildcard: t.Optional[str] = "all"
- ) -> t.Tuple[t.Tuple[str, ...], t.Tuple[str, ...]]:
+def _check_values_in_group(
+ value: t.Union[str, t.Iterable[str]],
+ valid_group: t.Iterable[str],
+ wildcard: t.Optional[str] = "all",
+) -> t.Tuple[t.Tuple[str, ...], t.Tuple[str, ...]]:
"""Checks if a value is in a set or a set of values is a subset of a set.
Args:
@@ -274,8 +273,9 @@ def _check_values_in_group(value: t.Union[str, t.Iterable[str]],
"""
if not isinstance(value, collections.Iterable):
- raise TypeError("Parameter type is not "
- "consistent ({0}).".format(type(value)))
+ raise TypeError(
+ "Parameter type is not consistent ({0}).".format(type(value))
+ )
in_group = tuple() # type: t.Tuple[str, ...]
not_in_group = tuple() # type: t.Tuple[str, ...]
@@ -286,10 +286,10 @@ def _check_values_in_group(value: t.Union[str, t.Iterable[str]],
in_group = tuple(valid_group)
elif value in valid_group:
- in_group = (value, )
+ in_group = (value,)
else:
- not_in_group = (value, )
+ not_in_group = (value,)
else:
value_set = set(map(str.lower, value))
@@ -303,11 +303,11 @@ def _check_values_in_group(value: t.Union[str, t.Iterable[str]],
def get_prefixed_mtds_from_class(
- class_obj: t.Any,
- prefix: str,
- only_name: bool = False,
- prefix_removal: bool = False,
- ) -> t.Union[t.List[str], t.List[TypeMtdTuple]]:
+ class_obj: t.Any,
+ prefix: str,
+ only_name: bool = False,
+ prefix_removal: bool = False,
+) -> t.Union[t.List[str], t.List[TypeMtdTuple]]:
"""Get all class methods from ``class_obj`` prefixed with ``prefix``.
Args:
@@ -331,7 +331,8 @@ def get_prefixed_mtds_from_class(
method names.
"""
class_methods = inspect.getmembers(
- class_obj, predicate=inspect.ismethod) # type: t.List[TypeMtdTuple]
+ class_obj, predicate=inspect.ismethod
+ ) # type: t.List[TypeMtdTuple]
# It is assumed that all feature-extraction related methods
# name are all prefixed with "MTF_PREFIX" and all precomputa-
@@ -358,13 +359,12 @@ def get_prefixed_mtds_from_class(
def _get_all_prefixed_mtds(
- prefix: str,
- groups: t.Tuple[str, ...],
- update_groups_by: t.Optional[t.Union[t.FrozenSet[str],
- t.Set[str]]] = None,
- prefix_removal: bool = False,
- custom_class_: t.Any = None,
- ) -> t.Dict[str, t.Tuple]:
+ prefix: str,
+ groups: t.Tuple[str, ...],
+ update_groups_by: t.Optional[t.Union[t.FrozenSet[str], t.Set[str]]] = None,
+ prefix_removal: bool = False,
+ custom_class_: t.Any = None,
+) -> t.Dict[str, t.Tuple]:
"""Get all methods prefixed with ``prefix`` in predefined feature ``groups``.
The predefined metafeature groups are inside ``VALID_GROUPS`` attribute.
@@ -416,14 +416,13 @@ def _get_all_prefixed_mtds(
verify_classes = tuple(VALID_MFECLASSES)
else:
- verify_groups = ("test_methods", )
- verify_classes = (custom_class_, )
+ verify_groups = ("test_methods",)
+ verify_classes = (custom_class_,)
methods_by_group = {
ft_type_id: get_prefixed_mtds_from_class(
- class_obj=mfe_class,
- prefix=prefix,
- prefix_removal=prefix_removal)
+ class_obj=mfe_class, prefix=prefix, prefix_removal=prefix_removal
+ )
for ft_type_id, mfe_class in zip(verify_groups, verify_classes)
if ft_type_id in groups or custom_class_ is not None
}
@@ -457,7 +456,8 @@ def _get_all_prefixed_mtds(
def _preprocess_iterable_arg(
- values: t.Union[str, t.Iterable[str]]) -> t.List[str]:
+ values: t.Union[str, t.Iterable[str]]
+) -> t.List[str]:
"""Process ``values`` to a canonical form.
This canonical form consists in removing repeated elements from ``values``,
@@ -477,8 +477,9 @@ def _preprocess_iterable_arg(
return list(map(str.lower, set(values)))
-def _extract_mtd_args(ft_mtd_callable: t.Callable,
- ) -> t.Tuple[t.Tuple[str, ...], t.Tuple[str, ...]]:
+def _extract_mtd_args(
+ ft_mtd_callable: t.Callable,
+) -> t.Tuple[t.Tuple[str, ...], t.Tuple[str, ...]]:
"""Extracts arguments from given method.
Args:
@@ -512,10 +513,10 @@ def _extract_mtd_args(ft_mtd_callable: t.Callable,
def summarize(
- features: t.Union[np.ndarray, t.Sequence],
- callable_sum: t.Callable,
- callable_args: t.Optional[t.Dict[str, t.Any]] = None,
- ) -> t.Union[t.Sequence, TypeNumeric]:
+ features: t.Union[np.ndarray, t.Sequence],
+ callable_sum: t.Callable,
+ callable_args: t.Optional[t.Dict[str, t.Any]] = None,
+) -> t.Union[t.Sequence, TypeNumeric]:
"""Returns ``feature`` values summarized by ``callable_sum``.
Args:
@@ -559,10 +560,11 @@ def array_is_returned(mtd_callable: t.Callable) -> bool:
def get_feat_value(
- mtd_name: str,
- mtd_args: t.Dict[str, t.Any],
- mtd_callable: t.Callable,
- suppress_warnings: bool = False) -> t.Union[TypeNumeric, np.ndarray]:
+ mtd_name: str,
+ mtd_args: t.Dict[str, t.Any],
+ mtd_callable: t.Callable,
+ suppress_warnings: bool = False,
+) -> t.Union[TypeNumeric, np.ndarray]:
"""Extract features from ``mtd_callable`` with ``mtd_args`` as args.
Args:
@@ -598,22 +600,30 @@ def get_feat_value(
if not suppress_warnings:
warnings.warn(
"Can't extract feature '{0}'.\n Exception message: {1}.{2}"
- .format(mtd_name, repr(type_e), "\n Will set it as 'np.nan' "
- "for all summary functions." if is_array else ""),
- RuntimeWarning)
+ .format(
+ mtd_name,
+ repr(type_e),
+ "\n Will set it as 'np.nan' for all summary functions."
+ if is_array
+ else "",
+ ),
+ RuntimeWarning,
+ )
features = np.empty(0) if is_array else np.nan
return features
-def build_mtd_kwargs(mtd_name: str,
- mtd_args: t.Iterable[str],
- mtd_mandatory: t.Iterable[str],
- inner_custom_args: t.Optional[t.Dict[str, t.Any]] = None,
- user_custom_args: t.Optional[t.Dict[str, t.Any]] = None,
- precomp_args: t.Optional[t.Dict[str, t.Any]] = None,
- suppress_warnings: bool = False) -> t.Dict[str, t.Any]:
+def build_mtd_kwargs(
+ mtd_name: str,
+ mtd_args: t.Iterable[str],
+ mtd_mandatory: t.Iterable[str],
+ inner_custom_args: t.Optional[t.Dict[str, t.Any]] = None,
+ user_custom_args: t.Optional[t.Dict[str, t.Any]] = None,
+ precomp_args: t.Optional[t.Dict[str, t.Any]] = None,
+ suppress_warnings: bool = False,
+) -> t.Dict[str, t.Any]:
"""Build a ``kwargs`` (:obj:`dict`) for a feature-extraction :obj:`callable`.
Args:
@@ -667,28 +677,36 @@ def build_mtd_kwargs(mtd_name: str,
callable_args = {
custom_arg: combined_args[custom_arg]
- for custom_arg in combined_args if custom_arg in mtd_args
+ for custom_arg in combined_args
+ if custom_arg in mtd_args
}
if not set(mtd_mandatory).issubset(callable_args):
raise RuntimeError("Method mandatory arguments not satisfied.")
if not suppress_warnings:
- unknown_arg_set = (unknown_arg
- for unknown_arg in user_custom_args.keys()
- if unknown_arg not in mtd_args
- ) # type: t.Generator[str, None, None]
+ unknown_arg_set = (
+ unknown_arg
+ for unknown_arg in user_custom_args.keys()
+ if unknown_arg not in mtd_args
+ ) # type: t.Generator[str, None, None]
for unknown_arg in unknown_arg_set:
warnings.warn(
"Unknown argument '{0}' for method '{1}'.".format(
- unknown_arg, mtd_name), UserWarning)
+ unknown_arg, mtd_name
+ ),
+ UserWarning,
+ )
return callable_args
-def check_summary_warnings(value: t.Union[TypeNumeric, t.Sequence, np.ndarray],
- name_feature: str, name_summary: str) -> None:
+def check_summary_warnings(
+ value: t.Union[TypeNumeric, t.Sequence, np.ndarray],
+ name_feature: str,
+ name_summary: str,
+) -> None:
"""Check if there is :obj:`np.nan` within summarized values.
Args:
@@ -708,12 +726,14 @@ def check_summary_warnings(value: t.Union[TypeNumeric, t.Sequence, np.ndarray],
warnings.warn(
"Can't summarize feature '{0}' with summary '{1}'. "
"Will set it as 'np.nan'.".format(name_feature, name_summary),
- RuntimeWarning)
+ RuntimeWarning,
+ )
-def convert_alias(groups_alias: t.Iterable[t.Iterable],
- values: t.Optional[t.Union[t.Iterable[str], str]] = None
- ) -> t.List[str]:
+def convert_alias(
+ groups_alias: t.Iterable[t.Iterable],
+ values: t.Optional[t.Union[t.Iterable[str], str]] = None,
+) -> t.List[str]:
"""Change the values of the alias to the groups."""
if not values:
values = []
@@ -734,13 +754,13 @@ def convert_alias(groups_alias: t.Iterable[t.Iterable],
def process_generic_set(
- values: t.Optional[t.Union[t.Iterable[str], str]],
- group_name: str,
- wildcard: t.Optional[str] = "all",
- groups_alias: t.Iterable[t.Iterable] = None,
- allow_none: bool = False,
- allow_empty: bool = False,
- ) -> t.Tuple[str, ...]:
+ values: t.Optional[t.Union[t.Iterable[str], str]],
+ group_name: str,
+ wildcard: t.Optional[str] = "all",
+ groups_alias: t.Iterable[t.Iterable] = None,
+ allow_none: bool = False,
+ allow_empty: bool = False,
+) -> t.Tuple[str, ...]:
"""Check if given ``values`` are in an internal valid set named ``group_name``.
Args:
@@ -796,50 +816,62 @@ def process_generic_set(
if allow_none:
return tuple()
- raise ValueError('"Values" can not be None. (while checking '
- 'group "{}").'.format(group_name))
+ raise ValueError(
+ '"Values" can not be None. (while checking group "{}").'.format(
+ group_name
+ )
+ )
if values is not None and not values:
if allow_empty:
return tuple()
- raise ValueError('"Values" can not be empty. (while checking '
- 'group "{}")'.format(group_name))
+ raise ValueError(
+ '"Values" can not be empty. (while checking group "{}")'.format(
+ group_name
+ )
+ )
if group_name.upper() in ("SUMMARY", "FEATURES"):
- raise ValueError('Forbidden "group_name" option ({}). There is a '
- "specify processing method for it".format(group_name))
+ raise ValueError(
+ 'Forbidden "group_name" option ({}). There is a '
+ "specify processing method for it".format(group_name)
+ )
_module_name = sys.modules[__name__]
try:
valid_values = inspect.getattr_static(
- _module_name, "{0}{1}".format(VALID_VALUE_PREFIX,
- group_name.upper()))
+ _module_name,
+ "{0}{1}".format(VALID_VALUE_PREFIX, group_name.upper()),
+ )
except AttributeError:
- raise ValueError('Invalid "group_name" "{}". Check _internal '
- "module documentation to verify which ones "
- "are available for use.".format(group_name))
+ raise ValueError(
+ 'Invalid "group_name" "{}". Check _internal '
+ "module documentation to verify which ones "
+ "are available for use.".format(group_name)
+ )
if groups_alias:
values = convert_alias(groups_alias, values)
in_valid_set, not_in_valid_set = _check_values_in_group(
- value=values,
- valid_group=valid_values,
- wildcard=wildcard)
+ value=values, valid_group=valid_values, wildcard=wildcard
+ )
if not_in_valid_set:
- raise ValueError("Unknown values: {0}. "
- "Please select values in {1}.".format(
- not_in_valid_set, valid_values))
+ raise ValueError(
+ "Unknown values: {0}. Please select values in {1}.".format(
+ not_in_valid_set, valid_values
+ )
+ )
return in_valid_set
def solve_group_dependencies(
- groups: t.Tuple[str, ...],
- ) -> t.Tuple[t.Tuple[str, ...], t.FrozenSet[str]]:
+ groups: t.Tuple[str, ...],
+) -> t.Tuple[t.Tuple[str, ...], t.FrozenSet[str]]:
"""Solve dependencies between groups.
Those dependencies must be registered in ``GROUP_PREREQUISITES`` tuple.
@@ -856,7 +888,8 @@ def solve_group_dependencies(
cur_dependencies = {cur_dependencies}
inserted_dependencies.update(
- set(cur_dependencies).difference(groups))
+ set(cur_dependencies).difference(groups)
+ )
groups = tuple(set(groups).union(inserted_dependencies))
@@ -864,11 +897,11 @@ def solve_group_dependencies(
def process_generic_option(
- value: t.Optional[str],
- group_name: str,
- allow_none: bool = False,
- allow_empty: bool = False,
- ) -> t.Optional[str]:
+ value: t.Optional[str],
+ group_name: str,
+ allow_none: bool = False,
+ allow_empty: bool = False,
+) -> t.Optional[str]:
"""Check if given ``value`` is in an internal reference group of values.
This function is essentially a wrapper for the ``process_generic_set``
@@ -895,16 +928,18 @@ def process_generic_option(
"""
if value is not None and not isinstance(value, str):
- raise TypeError('"value" (group name {}) must be a string-'
- "type object (got {}).".format(group_name,
- type(value)))
+ raise TypeError(
+ '"value" (group name {}) must be a string-'
+ "type object (got {}).".format(group_name, type(value))
+ )
processed_value = process_generic_set(
values=value,
group_name=group_name,
wildcard=None,
allow_none=allow_none,
- allow_empty=allow_empty)
+ allow_empty=allow_empty,
+ )
canonical_value = None
@@ -918,9 +953,8 @@ def process_generic_option(
def process_summary(
- summary: t.Union[str, t.Iterable[str]],
- wildcard: str = "all"
- ) -> t.Tuple[t.Tuple[str, ...], t.Tuple[TypeExtMtdTuple, ...]]:
+ summary: t.Union[str, t.Iterable[str]], wildcard: str = "all"
+) -> t.Tuple[t.Tuple[str, ...], t.Tuple[TypeExtMtdTuple, ...]]:
"""Generate metadata from ``summary`` MFE instantiation argument.
Args:
@@ -952,14 +986,14 @@ def process_summary(
return tuple(), tuple()
in_group, not_in_group = _check_values_in_group(
- value=summary,
- valid_group=VALID_SUMMARY,
- wildcard=wildcard)
+ value=summary, valid_group=VALID_SUMMARY, wildcard=wildcard
+ )
if not_in_group:
- raise ValueError("Unknown summary function '{0}'. "
- "Please select values in {1}.".format(
- not_in_group, VALID_SUMMARY))
+ raise ValueError(
+ "Unknown summary function '{0}'. "
+ "Please select values in {1}.".format(not_in_group, VALID_SUMMARY)
+ )
summary_methods = [] # type: t.List[TypeExtMtdTuple]
available_sum_methods = [] # type: t.List[str]
@@ -968,13 +1002,16 @@ def process_summary(
summary_mtd_callable = _summary.SUMMARY_METHODS.get(summary_func)
if not summary_mtd_callable:
- warnings.warn("Missing summary function "
- "'{0}' at _summary module.".format(
- summary_func),
- RuntimeWarning)
+ warnings.warn(
+ "Missing summary function '{0}' at _summary module.".format(
+ summary_func
+ ),
+ RuntimeWarning,
+ )
else:
summary_mtd_args, mandatory = _extract_mtd_args(
- summary_mtd_callable)
+ summary_mtd_callable
+ )
summary_mtd_pack = (
summary_func,
@@ -990,14 +1027,14 @@ def process_summary(
def process_features(
- features: t.Union[str, t.Iterable[str]],
- groups: t.Tuple[str, ...],
- wildcard: str = "all",
- suppress_warnings: bool = False,
- custom_class_: t.Any = None,
- ) -> t.Tuple[t.Tuple[str, ...],
- t.Tuple[TypeExtMtdTuple, ...],
- t.Tuple[str, ...]]:
+ features: t.Union[str, t.Iterable[str]],
+ groups: t.Tuple[str, ...],
+ wildcard: str = "all",
+ suppress_warnings: bool = False,
+ custom_class_: t.Any = None,
+) -> t.Tuple[
+ t.Tuple[str, ...], t.Tuple[TypeExtMtdTuple, ...], t.Tuple[str, ...]
+]:
"""Generate metadata from ``features`` MFE instantiation argument.
The use of this function should happen after the ``process_groups`` function, as
@@ -1046,7 +1083,7 @@ def process_features(
groups = tuple()
else:
- groups = ("custom", )
+ groups = ("custom",)
processed_ft = _preprocess_iterable_arg(features) # type: t.List[str]
@@ -1063,7 +1100,8 @@ def process_features(
) # type: t.Dict[str, t.Tuple]
ft_mtds_filtered = mtds_metadata.get(
- "methods", tuple()) # type: t.Tuple[TypeMtdTuple, ...]
+ "methods", tuple()
+ ) # type: t.Tuple[TypeMtdTuple, ...]
groups = mtds_metadata.get("groups", groups)
@@ -1081,9 +1119,11 @@ def process_features(
if ft_mtd_name in processed_ft:
mtd_callable_args, mandatory = _extract_mtd_args(ft_mtd_callable)
- extended_item = (*ft_mtd_tuple,
- mtd_callable_args,
- mandatory) # type: TypeExtMtdTuple
+ extended_item = (
+ *ft_mtd_tuple,
+ mtd_callable_args,
+ mandatory,
+ ) # type: TypeExtMtdTuple
ft_mtd_processed.append(extended_item)
available_feat_names.append(ft_mtd_name)
@@ -1091,18 +1131,20 @@ def process_features(
if not suppress_warnings:
for unknown_ft in processed_ft:
- warnings.warn("Unknown feature '{}'. You can check available "
- "feature names with either 'valid_metafeatures()'"
- " or 'metafeature_description()' methods."
- .format(unknown_ft), UserWarning)
+ warnings.warn(
+ "Unknown feature '{}'. You can check available "
+ "feature names with either 'valid_metafeatures()'"
+ " or 'metafeature_description()' methods.".format(unknown_ft),
+ UserWarning,
+ )
return tuple(available_feat_names), tuple(ft_mtd_processed), groups
def _patch_precomp_groups(
- precomp_groups: t.Optional[t.Union[str, t.Iterable[str]]],
- groups: t.Optional[t.Tuple[str, ...]] = None,
- ) -> t.Union[str, t.Iterable[str]]:
+ precomp_groups: t.Optional[t.Union[str, t.Iterable[str]]],
+ groups: t.Optional[t.Tuple[str, ...]] = None,
+) -> t.Union[str, t.Iterable[str]]:
"""Enforce precomputation in model-based metafeatures."""
if not precomp_groups:
precomp_groups = set()
@@ -1117,13 +1159,14 @@ def _patch_precomp_groups(
def process_precomp_groups(
- precomp_groups: t.Optional[t.Union[str, t.Iterable[str]]],
- groups: t.Optional[t.Tuple[str, ...]] = None,
- wildcard: str = "all",
- suppress_warnings: bool = False,
- verbose: int = 0,
- custom_class_: t.Any = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ precomp_groups: t.Optional[t.Union[str, t.Iterable[str]]],
+ groups: t.Optional[t.Tuple[str, ...]] = None,
+ wildcard: str = "all",
+ suppress_warnings: bool = False,
+ verbose: int = 0,
+ custom_class_: t.Any = None,
+ **kwargs
+) -> t.Dict[str, t.Any]:
"""Process ``precomp_groups`` argument while fitting into a MFE model.
This function is expected to be used after ``process_groups`` function,
@@ -1174,7 +1217,8 @@ def process_precomp_groups(
return {}
processed_precomp_groups = _preprocess_iterable_arg(
- precomp_groups) # type: t.Sequence[str]
+ precomp_groups
+ ) # type: t.Sequence[str]
if wildcard in processed_precomp_groups:
processed_precomp_groups = groups
@@ -1187,10 +1231,14 @@ def process_precomp_groups(
warnings.warn(
" {} Unknown precomp_groups '{}'. You can check available "
"metafeature groups using 'valid_groups()' method.".format(
- VERBOSE_WARNING_SYMBOL, unknown_precomp), UserWarning)
+ VERBOSE_WARNING_SYMBOL, unknown_precomp
+ ),
+ UserWarning,
+ )
processed_precomp_groups = tuple(
- set(processed_precomp_groups).intersection(groups))
+ set(processed_precomp_groups).intersection(groups)
+ )
mtds_metadata = _get_all_prefixed_mtds(
prefix=PRECOMPUTE_PREFIX,
@@ -1199,7 +1247,8 @@ def process_precomp_groups(
) # type: t.Dict[str, t.Tuple]
precomp_mtds_filtered = mtds_metadata.get(
- "methods", tuple()) # type: t.Tuple[TypeMtdTuple, ...]
+ "methods", tuple()
+ ) # type: t.Tuple[TypeMtdTuple, ...]
del mtds_metadata
@@ -1221,12 +1270,16 @@ def process_precomp_groups(
new_precomp_vals = {}
if not suppress_warnings:
- warnings.warn(" {} Something went wrong while "
- "precomputing '{}'. Will ignore "
- "this method. Error message:\n"
- "{}.".format(VERBOSE_WARNING_SYMBOL,
- precomp_mtd_name,
- repr(type_err)))
+ warnings.warn(
+ " {} Something went wrong while "
+ "precomputing '{}'. Will ignore "
+ "this method. Error message:\n"
+ "{}.".format(
+ VERBOSE_WARNING_SYMBOL,
+ precomp_mtd_name,
+ repr(type_err),
+ )
+ )
error_count += 1
@@ -1240,8 +1293,11 @@ def process_precomp_groups(
_prev_precomp_len = len(precomp_items)
if verbose >= 2 and new_item_count > 0:
- print(" {} Got {} new precomputed values.".format(
- VERBOSE_BLOCK_END_SYMBOL, new_item_count))
+ print(
+ " {} Got {} new precomputed values.".format(
+ VERBOSE_BLOCK_END_SYMBOL, new_item_count
+ )
+ )
# Update kwargs to avoid recalculations iteratively
kwargs = {
@@ -1254,26 +1310,33 @@ def process_precomp_groups(
cur_progress=100 * ind / len(precomp_mtds_filtered),
cur_mtf_name=precomp_mtd_name,
item_type="precomputation",
- verbose=verbose)
+ verbose=verbose,
+ )
if verbose == 1:
_t_num_cols, _ = shutil.get_terminal_size()
- print("\r{:<{fill}}".format(
- "Process of precomputation finished.", fill=_t_num_cols))
+ print(
+ "\r{:<{fill}}".format(
+ "Process of precomputation finished.", fill=_t_num_cols
+ )
+ )
if verbose >= 2 and error_count > 0:
- print("\nNote: can't precompute a total of {} metafeatures, "
- "out of {} ({:.2f}%).".format(
- error_count,
- len(precomp_mtds_filtered),
- 100 * error_count / len(precomp_mtds_filtered)))
+ print(
+ "\nNote: can't precompute a total of {} metafeatures, "
+ "out of {} ({:.2f}%).".format(
+ error_count,
+ len(precomp_mtds_filtered),
+ 100 * error_count / len(precomp_mtds_filtered),
+ )
+ )
return precomp_items
-def check_data(X: t.Union[np.ndarray, list],
- y: t.Union[np.ndarray, list]
- ) -> t.Tuple[np.ndarray, t.Optional[np.ndarray]]:
+def check_data(
+ X: t.Union[np.ndarray, list], y: t.Union[np.ndarray, list]
+) -> t.Tuple[np.ndarray, t.Optional[np.ndarray]]:
"""Checks ``X`` and ``y`` data type and shape and transform it if necessary.
Args:
@@ -1326,17 +1389,16 @@ def check_data(X: t.Union[np.ndarray, list],
if y is not None:
if X.shape[0] != y.shape[0]:
- raise ValueError('"X" number of rows and "y" '
- "length shapes do not match.")
+ raise ValueError(
+ '"X" number of rows and "y" length shapes do not match.'
+ )
return np.copy(X), np.copy(y)
return np.copy(X), None
-def isnumeric(
- value: t.Any,
- check_subtype: bool = True) -> bool:
+def isnumeric(value: t.Any, check_subtype: bool = True) -> bool:
"""Checks if ``value`` is a numeric type or a collection of numerics.
The ``Numeric Type`` is assumed to be one of the following:
@@ -1356,9 +1418,11 @@ def isnumeric(
bool: True if `value` is a numeric type object or a collection of
numeric-only elements. False otherwise.
"""
- if (check_subtype
- and isinstance(value, (collections.Iterable, np.ndarray))
- and not isinstance(value, str)):
+ if (
+ check_subtype
+ and isinstance(value, (collections.Iterable, np.ndarray))
+ and not isinstance(value, str)
+ ):
value = np.array(value)
@@ -1384,7 +1448,8 @@ def remove_prefix(value: str, prefix: str) -> str:
TypeError: if ``value`` is not a string.
"""
if value.startswith(prefix):
- return value[len(prefix):]
+ l_prefix = len(prefix)
+ return value[l_prefix:]
return value
@@ -1425,15 +1490,13 @@ def transform_cat_gray(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
_, num_col = data_categoric.shape
- dummy_attr_names = [
- "C{}".format(i) for i in range(num_col)
- ]
+ dummy_attr_names = ["C{}".format(i) for i in range(num_col)]
named_data = {
# attr_name: data_categoric[:, attr_index]
# We need to cast to 'str' because sometimes categorical can be set as
# 'string'.
- attr_name: data_categoric[:, attr_index].astype('str')
+ attr_name: data_categoric[:, attr_index].astype("str")
for attr_index, attr_name in enumerate(dummy_attr_names)
}
@@ -1444,15 +1507,17 @@ def transform_cat_gray(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
return np.asarray(enc_data, dtype=float)
except patsy.PatsyError:
- raise ValueError("Categorical data encoding of type 'gray' has no "
- "support for missing values. Please handle the "
- "missing data manually before fitting it into the "
- "MFE model.")
+ raise ValueError(
+ "Categorical data encoding of type 'gray' has no "
+ "support for missing values. Please handle the "
+ "missing data manually before fitting it into the "
+ "MFE model."
+ )
def transform_cat_onehot(
- data_categoric: np.ndarray,
- use_all_columns: bool = True) -> t.Optional[np.ndarray]:
+ data_categoric: np.ndarray, use_all_columns: bool = True
+) -> t.Optional[np.ndarray]:
"""Transform categorical data using one-hot encoding."""
if data_categoric.size == 0:
return None
@@ -1469,27 +1534,30 @@ def transform_cat_onehot(
cur_attr = data_categoric[:, attr_ind, np.newaxis]
if not use_all_columns and len(set(cur_attr.ravel())) <= 1:
- raise ValueError("This type of one-hot encoding does not "
- "support features with 1 or less distinct "
- "values. Drop the {}th categorical feature "
- "or select another encoding strategy.".format(
- attr_ind + 1))
+ raise ValueError(
+ "This type of one-hot encoding does not "
+ "support features with 1 or less distinct "
+ "values. Drop the {}th categorical feature "
+ "or select another encoding strategy.".format(attr_ind + 1)
+ )
try:
one_cat_attrs.append(ohe.fit_transform(cur_attr))
except ValueError:
- raise ValueError("Categorical data encoding of type 'one-hot' has "
- "no support for missing values. Please handle the"
- " missing data manually before fitting it into "
- "the MFE model.")
+ raise ValueError(
+ "Categorical data encoding of type 'one-hot' has "
+ "no support for missing values. Please handle the"
+ " missing data manually before fitting it into "
+ "the MFE model."
+ )
return np.hstack(one_cat_attrs)
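As a side note, a minimal standalone sketch of the per-attribute one-hot strategy used by ``transform_cat_onehot`` above; the toy matrix and variable names are illustrative only.

import numpy as np
import sklearn.preprocessing

# Toy categorical matrix: three instances, two attributes (illustrative only).
data_categoric = np.array([["red", "small"],
                           ["blue", "large"],
                           ["red", "large"]])

ohe = sklearn.preprocessing.OneHotEncoder()
one_cat_attrs = []

# Encode each attribute independently, mirroring the loop in the hunk above.
for attr_ind in range(data_categoric.shape[1]):
    cur_attr = data_categoric[:, attr_ind, np.newaxis]
    one_cat_attrs.append(ohe.fit_transform(cur_attr).toarray())

encoded = np.hstack(one_cat_attrs)  # shape (3, 4): two dummy columns per attribute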
-def _equal_freq_discretization(data: np.ndarray,
- num_bins: int,
- tol: float = 1e-8) -> np.ndarray:
+def _equal_freq_discretization(
+ data: np.ndarray, num_bins: int, tol: float = 1e-8
+) -> np.ndarray:
"""Discretize a 1-D numeric array into an equal-frequency histogram."""
hist_divs = np.quantile(data, np.linspace(0, 1, num_bins + 1)[1:])
@@ -1511,8 +1579,9 @@ def _equal_freq_discretization(data: np.ndarray,
return np.digitize(x=data, bins=hist_divs, right=True)
-def transform_num(data_numeric: np.ndarray,
- num_bins: t.Optional[int] = None) -> t.Optional[np.ndarray]:
+def transform_num(
+ data_numeric: np.ndarray, num_bins: t.Optional[int] = None
+) -> t.Optional[np.ndarray]:
"""Discretize numeric data with an equal-frequency histogram.
The index of the histogram bin overwrites its corresponding numeric
@@ -1542,13 +1611,14 @@ def transform_num(data_numeric: np.ndarray,
raise TypeError('"num_bins" must be integer or NoneType.')
if num_bins <= 0:
- raise ValueError('"num_bins" must be a positive'
- "integer or NoneType.")
+ raise ValueError(
+ '"num_bins" must be a positiveinteger or NoneType.'
+ )
num_inst, _ = data_numeric.shape
if not num_bins:
- num_bins = int(num_inst**(1/3))
+ num_bins = int(num_inst ** (1 / 3))
data_numeric = data_numeric.astype(float)
@@ -1556,14 +1626,15 @@ def transform_num(data_numeric: np.ndarray,
func1d=_equal_freq_discretization,
axis=0,
arr=data_numeric,
- num_bins=num_bins)
+ num_bins=num_bins,
+ )
return digitalized_data
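For reference, a tiny self-contained sketch of the equal-frequency binning performed per column above; ``num_bins`` is fixed to 3 here, while ``transform_num`` defaults it to the cubic root of the number of instances.

import numpy as np

values = np.array([1.0, 2.0, 2.5, 3.0, 10.0, 50.0])
num_bins = 3  # pymfe default would be int(num_inst ** (1 / 3))

# Bin edges placed at quantiles, so each bin holds roughly the same number of values.
hist_divs = np.quantile(values, np.linspace(0, 1, num_bins + 1)[1:])
codes = np.digitize(x=values, bins=hist_divs, right=True)
# codes -> array([0, 0, 1, 1, 2, 2])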
-def rescale_data(data: np.ndarray,
- option: str,
- args: t.Optional[t.Dict[str, t.Any]] = None) -> np.ndarray:
+def rescale_data(
+ data: np.ndarray, option: str, args: t.Optional[t.Dict[str, t.Any]] = None
+) -> np.ndarray:
"""Rescale numeric fitted data accordingly to user select option.
Args:
@@ -1591,8 +1662,10 @@ def rescale_data(data: np.ndarray,
scaler model is also raised by this function.
"""
if option not in VALID_RESCALE:
- raise ValueError("Unknown data rescaling option '{0}'. Please choose "
- "one value among {1}".format(option, VALID_RESCALE))
+ raise ValueError(
+ "Unknown data rescaling option '{0}'. Please choose "
+ "one value among {1}".format(option, VALID_RESCALE)
+ )
if not args:
args = {}
@@ -1627,20 +1700,26 @@ def check_score(score: str, groups: t.Tuple[str, ...]):
} # type: t.Dict[str, t.Callable[[np.ndarray, np.ndarray], float]]
if score is not None and not isinstance(score, str):
- raise ValueError('"score" is not None or str but "{0}" was passed.'
- 'The valid values are {1}'.format(
- score, list(valid_scoring.keys())))
+ raise ValueError(
+ '"score" is not None or str but "{0}" was passed.'
+ "The valid values are {1}".format(
+ score, list(valid_scoring.keys())
+ )
+ )
if "landmarking" in groups:
if score is None:
raise ValueError(
- 'Landmarking metafeatures need a score metric.'
+ "Landmarking metafeatures need a score metric."
'One of the following "score" values is required:'
- '{0}'.format(list(valid_scoring.keys())))
+ "{0}".format(list(valid_scoring.keys()))
+ )
if score not in valid_scoring:
raise ValueError(
- 'One of the following "score" values is required:'
- '{0}'.format(list(valid_scoring.keys())))
+ 'One of the following "score" values is required:{0}'.format(
+ list(valid_scoring.keys())
+ )
+ )
return valid_scoring[score]
return None
@@ -1662,9 +1741,10 @@ def check_group_dependencies(groups: t.Iterable[str]) -> t.Set[str]:
def select_results_by_classes(
- mtf_names: t.Sequence[str],
- class_names: t.Union[str, t.Iterable[str]],
- include_dependencies: bool = False) -> t.List[int]:
+ mtf_names: t.Sequence[str],
+ class_names: t.Union[str, t.Iterable[str]],
+ include_dependencies: bool = False,
+) -> t.List[int]:
"""Get indexes of metafeatures related to given ``class_names``."""
if isinstance(class_names, str):
class_names = {class_names}
@@ -1683,7 +1763,8 @@ def select_results_by_classes(
class_obj=VALID_MFECLASSES[VALID_GROUPS.index(class_name)],
prefix=MTF_PREFIX,
only_name=True,
- prefix_removal=True)
+ prefix_removal=True,
+ )
classes_mtd_names.update(_aux) # type: ignore
@@ -1701,11 +1782,12 @@ def select_results_by_classes(
def post_processing(
- results: t.Tuple[t.List, ...],
- groups: t.Tuple[str, ...],
- suppress_warnings: bool = False,
- custom_class_: t.Any = None,
- **kwargs) -> None:
+ results: t.Tuple[t.List, ...],
+ groups: t.Tuple[str, ...],
+ suppress_warnings: bool = False,
+ custom_class_: t.Any = None,
+ **kwargs
+) -> None:
"""Detect and apply post-processing methods in metafeatures.
This function should be used after the metafeature extraction.
@@ -1732,13 +1814,12 @@ def post_processing(
ds.
"""
mtds_metadata = _get_all_prefixed_mtds(
- prefix=POSTPROCESS_PREFIX,
- groups=groups,
- custom_class_=custom_class_,
+ prefix=POSTPROCESS_PREFIX, groups=groups, custom_class_=custom_class_,
) # type: t.Dict[str, t.Tuple]
postprocess_mtds = mtds_metadata.get(
- "methods", tuple()) # type: t.Tuple[TypeMtdTuple, ...]
+ "methods", tuple()
+ ) # type: t.Tuple[TypeMtdTuple, ...]
del mtds_metadata
@@ -1760,47 +1841,54 @@ def post_processing(
for postprocess_mtd_name, postprocess_mtd_callable in postprocess_mtds:
extra_inner_args["class_indexes"] = select_results_by_classes(
mtf_names=mtf_names,
- class_names=remove_prefix(value=postprocess_mtd_name,
- prefix=POSTPROCESS_PREFIX).split("_"))
+ class_names=remove_prefix(
+ value=postprocess_mtd_name, prefix=POSTPROCESS_PREFIX
+ ).split("_"),
+ )
try:
new_results = postprocess_mtd_callable( # type: ignore
- **extra_inner_args,
- **kwargs)
+ **extra_inner_args, **kwargs
+ )
if new_results:
if len(new_results) != len(results):
- raise ValueError("Postprocessing result has length '{}'. "
- "Expecting '{}'.".format(len(new_results),
- len(results)))
+ raise ValueError(
+ "Postprocessing result has length '{}'. "
+ "Expecting '{}'.".format(
+ len(new_results), len(results)
+ )
+ )
for res_list_old, res_list_new in zip(results, new_results):
res_list_old += res_list_new
except _EXCEPTIONS as type_err:
if not suppress_warnings:
- warnings.warn("Something went wrong while "
- "postprocessing '{0}'. Will ignore "
- "this method. Error message:\n"
- "{1}.".format(postprocess_mtd_name,
- repr(type_err)))
+ warnings.warn(
+ "Something went wrong while "
+ "postprocessing '{0}'. Will ignore "
+ "this method. Error message:\n"
+ "{1}.".format(postprocess_mtd_name, repr(type_err))
+ )
if remove_groups:
kwargs.pop("groups")
def print_verbose_progress(
- cur_progress: float,
- cur_mtf_name: str,
- item_type: str,
- verbose: int = 0) -> None:
+ cur_progress: float, cur_mtf_name: str, item_type: str, verbose: int = 0
+) -> None:
"""Print messages about extraction progress based on ``verbose``."""
if verbose <= 0:
return
if verbose >= 2:
- print("Done with '{}' {} (progress of {:.2f}%)."
- .format(cur_mtf_name, item_type, cur_progress))
+ print(
+ "Done with '{}' {} (progress of {:.2f}%).".format(
+ cur_mtf_name, item_type, cur_progress
+ )
+ )
return
_t_num_cols, _ = shutil.get_terminal_size()
@@ -1811,8 +1899,14 @@ def print_verbose_progress(
_total_prog_symb = int(cur_progress * _t_num_cols / 100)
- print("".join([
- "\r[",
- _total_prog_symb * "#",
- (_t_num_cols - _total_prog_symb) * ".",
- "]{:.2f}%".format(cur_progress)]), end="")
+ print(
+ "".join(
+ [
+ "\r[",
+ _total_prog_symb * "#",
+ (_t_num_cols - _total_prog_symb) * ".",
+ "]{:.2f}%".format(cur_progress),
+ ]
+ ),
+ end="",
+ )
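Purely for illustration, the carriage-return progress bar assembled above collapses to something like the snippet below (terminal width hard-coded instead of calling ``shutil.get_terminal_size``).

cur_progress = 42.0   # percentage of items already processed
_t_num_cols = 40      # pretend terminal width
_total_prog_symb = int(cur_progress * _t_num_cols / 100)

print(
    "".join(
        [
            "\r[",
            _total_prog_symb * "#",
            (_t_num_cols - _total_prog_symb) * ".",
            "]{:.2f}%".format(cur_progress),
        ]
    ),
    end="",
)
# prints: [################........................]42.00%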
diff --git a/pymfe/_summary.py b/pymfe/_summary.py
index 67c8cae2..14198d47 100644
--- a/pymfe/_summary.py
+++ b/pymfe/_summary.py
@@ -25,9 +25,9 @@ def _remove_nan(values: TypeValList) -> TypeValList:
return values[~np.isnan(values)]
-def sum_histogram(values: TypeValList,
- bins: int = 10,
- normalize: bool = True) -> TypeValList:
+def sum_histogram(
+ values: TypeValList, bins: int = 10, normalize: bool = True
+) -> TypeValList:
"""Returns a list of frequencies of a histogram of given values.
Args:
@@ -61,11 +61,13 @@ def sum_histogram(values: TypeValList,
return freqs
-def sum_quantiles(values: TypeValList,
- package: str = "numpy",
- numpy_interpolation: str = "linear",
- scipy_alphap: float = 0.4,
- scipy_betap: float = 0.4) -> TypeValList:
+def sum_quantiles(
+ values: TypeValList,
+ package: str = "numpy",
+ numpy_interpolation: str = "linear",
+ scipy_alphap: float = 0.4,
+ scipy_betap: float = 0.4,
+) -> TypeValList:
"""Calc. min, first, second and third quartiles, and max from ``values``.
Args:
@@ -106,21 +108,28 @@ def sum_quantiles(values: TypeValList,
valid_packages = ("numpy", "scipy")
if package not in valid_packages:
- raise ValueError('"package" must be in {} '
- "(got {}).".format(valid_packages, package))
+ raise ValueError(
+ '"package" must be in {} (got {}).'.format(valid_packages, package)
+ )
if package == "numpy":
- return np.quantile(values, (0.00, 0.25, 0.50, 0.75, 1.00),
- interpolation=numpy_interpolation)
-
- return scipy.stats.mstats.mquantiles(values,
- (0.00, 0.25, 0.50, 0.75, 1.00),
- alphap=scipy_alphap,
- betap=scipy_betap)
-
-
-def sum_nanquantiles(values: TypeValList,
- numpy_interpolation: str = "linear") -> TypeValList:
+ return np.quantile(
+ values,
+ (0.00, 0.25, 0.50, 0.75, 1.00),
+ interpolation=numpy_interpolation,
+ )
+
+ return scipy.stats.mstats.mquantiles(
+ values,
+ (0.00, 0.25, 0.50, 0.75, 1.00),
+ alphap=scipy_alphap,
+ betap=scipy_betap,
+ )
+
+
+def sum_nanquantiles(
+ values: TypeValList, numpy_interpolation: str = "linear"
+) -> TypeValList:
"""Calculate the ``values`` quantiles, ignoring `nan` values.
The quantiles calculated corresponds to the minimum, maximum,
@@ -129,13 +138,16 @@ def sum_nanquantiles(values: TypeValList,
if len(values) == 0:
return np.full(5, fill_value=np.nan)
- return np.nanquantile(values, (0.00, 0.25, 0.50, 0.75, 1.00),
- interpolation=numpy_interpolation)
+ return np.nanquantile(
+ values,
+ (0.00, 0.25, 0.50, 0.75, 1.00),
+ interpolation=numpy_interpolation,
+ )
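A short usage sketch contrasting the two quantile back-ends handled above, with made-up values (recent NumPy releases rename the ``interpolation`` keyword to ``method``, so the defaults are used here).

import numpy as np
import scipy.stats

values = [2.0, 4.0, 4.0, 5.0, 7.0, 9.0]
probs = (0.00, 0.25, 0.50, 0.75, 1.00)

np_quartiles = np.quantile(values, probs)  # NumPy: linear interpolation by default
sp_quartiles = scipy.stats.mstats.mquantiles(values, probs, alphap=0.4, betap=0.4)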
-def sum_skewness(values: TypeValList,
- method: int = 3,
- bias: bool = True) -> float:
+def sum_skewness(
+ values: TypeValList, method: int = 3, bias: bool = True
+) -> float:
"""Calculate the skewness from ``values`` using ``method`` strategy.
Args:
@@ -176,8 +188,9 @@ def sum_skewness(values: TypeValList,
ValueError: if ``method`` is not 1, 2 nor 3.
"""
if method not in (1, 2, 3):
- raise ValueError('Invalid method "{}" for '
- "extracting the skewness".format(method))
+ raise ValueError(
+ 'Invalid method "{}" for extracting the skewness'.format(method)
+ )
num_vals = len(values)
@@ -187,17 +200,17 @@ def sum_skewness(values: TypeValList,
skew_val = scipy.stats.skew(values, bias=bias)
if method == 2 and num_vals != 2:
- skew_val *= (num_vals * (num_vals - 1.0))**0.5 / (num_vals - 2.0)
+ skew_val *= (num_vals * (num_vals - 1.0)) ** 0.5 / (num_vals - 2.0)
elif method == 3:
- skew_val *= ((num_vals - 1.0) / num_vals)**(1.5)
+ skew_val *= ((num_vals - 1.0) / num_vals) ** (1.5)
return skew_val
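A self-contained sketch of the three skewness estimators distinguished above: SciPy's biased moment coefficient (method 1) plus the sample-size corrections applied for methods 2 and 3, with toy data.

import numpy as np
import scipy.stats

values = np.array([1.0, 2.0, 2.0, 3.0, 8.0])
n = len(values)

g1 = scipy.stats.skew(values, bias=True)          # method 1: biased coefficient
G1 = g1 * (n * (n - 1.0)) ** 0.5 / (n - 2.0)      # method 2 correction
b1 = g1 * ((n - 1.0) / n) ** 1.5                  # method 3 correction (pymfe default)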
-def sum_kurtosis(values: TypeValList,
- method: int = 3,
- bias: bool = True) -> float:
+def sum_kurtosis(
+ values: TypeValList, method: int = 3, bias: bool = True
+) -> float:
"""Calculate the kurtosis of ``values`` using ``method`` strategy.
Args:
@@ -238,8 +251,9 @@ def sum_kurtosis(values: TypeValList,
ValueError: if ``method`` is not 1, 2 nor 3.
"""
if method not in (1, 2, 3):
- raise ValueError('Invalid method "{}" for '
- "extracting the kurtosis".format(method))
+ raise ValueError(
+ 'Invalid method "{}" for extracting the kurtosis'.format(method)
+ )
num_vals = len(values)
@@ -253,7 +267,7 @@ def sum_kurtosis(values: TypeValList,
kurt_val *= (num_vals - 1.0) / ((num_vals - 2.0) * (num_vals - 3.0))
elif method == 3:
- kurt_val = (kurt_val + 3.0) * (1.0 - 1.0 / num_vals)**2.0 - 3.0
+ kurt_val = (kurt_val + 3.0) * (1.0 - 1.0 / num_vals) ** 2.0 - 3.0
return kurt_val
@@ -305,47 +319,48 @@ def sum_nanptp(values: TypeValList) -> float:
return np.nanmax(values) - np.nanmin(values)
-def sum_nanhistogram(values: TypeValList,
- bins: int = 10,
- normalize: bool = True) -> TypeValList:
+def sum_nanhistogram(
+ values: TypeValList, bins: int = 10, normalize: bool = True
+) -> TypeValList:
"""Create a histogram ignoring `nan` values."""
if not isinstance(values, np.ndarray):
values = np.asarray(values, dtype=float)
- return sum_histogram(values=_remove_nan(values=values),
- bins=bins,
- normalize=normalize)
+ return sum_histogram(
+ values=_remove_nan(values=values), bins=bins, normalize=normalize
+ )
-def sum_nankurtosis(values: TypeValList,
- method: int = 3,
- bias: bool = True) -> float:
+def sum_nankurtosis(
+ values: TypeValList, method: int = 3, bias: bool = True
+) -> float:
"""Estimate data kurtosis ignoring `nan` values."""
if not isinstance(values, np.ndarray):
values = np.asarray(values, dtype=float)
- return sum_kurtosis(values=_remove_nan(values=values),
- method=method,
- bias=bias)
+ return sum_kurtosis(
+ values=_remove_nan(values=values), method=method, bias=bias
+ )
-def sum_nanskewness(values: TypeValList,
- method: int = 3,
- bias: bool = True) -> float:
+def sum_nanskewness(
+ values: TypeValList, method: int = 3, bias: bool = True
+) -> float:
"""Estimate data skewness ignoring `nan` values."""
if not isinstance(values, np.ndarray):
values = np.asarray(values, dtype=float)
- return sum_skewness(values=_remove_nan(values=values),
- method=method,
- bias=bias)
+ return sum_skewness(
+ values=_remove_nan(values=values), method=method, bias=bias
+ )
def _apply_power_func(
- values: TypeValList,
- p_func: t.Callable[[TypeValList, t.Union[int, float]], t.Union[int,
- float]],
- p: t.Union[int, float, t.Iterable[t.Union[int, float]]],
+ values: TypeValList,
+ p_func: t.Callable[
+ [TypeValList, t.Union[int, float]], t.Union[int, float]
+ ],
+ p: t.Union[int, float, t.Iterable[t.Union[int, float]]],
) -> t.Union[float, np.ndarray]:
"""Apply a power function to ``values`` using ``p_func``."""
if len(values) == 0:
@@ -366,12 +381,14 @@ def _apply_power_func(
def sum_powersum(
- values: TypeValList,
- p: t.Union[int, float, t.Iterable[t.Union[int, float]]] = 2,
+ values: TypeValList,
+ p: t.Union[int, float, t.Iterable[t.Union[int, float]]] = 2,
) -> t.Union[float, np.ndarray]:
"""Calculate the power sum of ``values``."""
- def ps_func(arr: TypeValList, p: t.Union[int,
- float]) -> t.Union[int, float]:
+
+ def ps_func(
+ arr: TypeValList, p: t.Union[int, float]
+ ) -> t.Union[int, float]:
if np.any(np.isnan(arr)):
return np.nan
@@ -381,20 +398,22 @@ def ps_func(arr: TypeValList, p: t.Union[int,
def sum_nanpowersum(
- values: TypeValList,
- p: t.Union[int, float, t.Iterable[t.Union[int, float]]] = 2,
+ values: TypeValList,
+ p: t.Union[int, float, t.Iterable[t.Union[int, float]]] = 2,
) -> t.Union[float, np.ndarray]:
"""Calculate the power sum of ``values`` ignoring nan values."""
return sum_powersum(_remove_nan(values=values), p=p)
def sum_pnorm(
- values: TypeValList,
- p: t.Union[int, float, t.Iterable[t.Union[int, float]]] = 2,
+ values: TypeValList,
+ p: t.Union[int, float, t.Iterable[t.Union[int, float]]] = 2,
) -> t.Union[float, np.ndarray]:
"""Calculate the p-norm of ``values``."""
- def pn_func(arr: TypeValList, p: t.Union[int,
- float]) -> t.Union[int, float]:
+
+ def pn_func(
+ arr: TypeValList, p: t.Union[int, float]
+ ) -> t.Union[int, float]:
if np.any(np.isnan(arr)):
return np.nan
@@ -404,8 +423,8 @@ def pn_func(arr: TypeValList, p: t.Union[int,
def sum_nanpnorm(
- values: TypeValList,
- p: t.Union[int, float, t.Iterable[t.Union[int, float]]] = 2,
+ values: TypeValList,
+ p: t.Union[int, float, t.Iterable[t.Union[int, float]]] = 2,
) -> t.Union[float, np.ndarray]:
"""Calculate the p-norm of ``values`` ignoring nan values."""
return sum_pnorm(_remove_nan(values=values), p=p)
@@ -427,37 +446,39 @@ def sum_nansum(values: TypeValList) -> float:
return np.nansum(values)
-SUMMARY_METHODS = collections.OrderedDict((
- ("mean", np.mean),
- ("nanmean", np.nanmean),
- ("sd", sum_std),
- ("nansd", sum_nanstd),
- ("var", sum_var),
- ("nanvar", sum_nanvar),
- ("count", len),
- ("nancount", sum_nancount),
- ("histogram", sum_histogram),
- ("nanhistogram", sum_nanhistogram),
- ("iq_range", scipy.stats.iqr),
- ("naniq_range", sum_naniq_range),
- ("kurtosis", sum_kurtosis),
- ("nankurtosis", sum_nankurtosis),
- ("max", np.max),
- ("nanmax", np.nanmax),
- ("median", np.median),
- ("nanmedian", np.nanmedian),
- ("min", np.min),
- ("nanmin", np.nanmin),
- ("quantiles", sum_quantiles),
- ("nanquantiles", sum_nanquantiles),
- ("range", np.ptp),
- ("nanrange", sum_nanptp),
- ("skewness", sum_skewness),
- ("nanskewness", sum_nanskewness),
- ("sum", sum_sum),
- ("nansum", sum_nansum),
- ("powersum", sum_powersum),
- ("pnorm", sum_pnorm),
- ("nanpowersum", sum_nanpowersum),
- ("nanpnorm", sum_nanpnorm),
-))
+SUMMARY_METHODS = collections.OrderedDict(
+ (
+ ("mean", np.mean),
+ ("nanmean", np.nanmean),
+ ("sd", sum_std),
+ ("nansd", sum_nanstd),
+ ("var", sum_var),
+ ("nanvar", sum_nanvar),
+ ("count", len),
+ ("nancount", sum_nancount),
+ ("histogram", sum_histogram),
+ ("nanhistogram", sum_nanhistogram),
+ ("iq_range", scipy.stats.iqr),
+ ("naniq_range", sum_naniq_range),
+ ("kurtosis", sum_kurtosis),
+ ("nankurtosis", sum_nankurtosis),
+ ("max", np.max),
+ ("nanmax", np.nanmax),
+ ("median", np.median),
+ ("nanmedian", np.nanmedian),
+ ("min", np.min),
+ ("nanmin", np.nanmin),
+ ("quantiles", sum_quantiles),
+ ("nanquantiles", sum_nanquantiles),
+ ("range", np.ptp),
+ ("nanrange", sum_nanptp),
+ ("skewness", sum_skewness),
+ ("nanskewness", sum_nanskewness),
+ ("sum", sum_sum),
+ ("nansum", sum_nansum),
+ ("powersum", sum_powersum),
+ ("pnorm", sum_pnorm),
+ ("nanpowersum", sum_nanpowersum),
+ ("nanpnorm", sum_nanpnorm),
+ )
+)
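The registry above is what the summary-name strings passed to the MFE class resolve to; a rough usage sketch follows (``pymfe._summary`` is an internal module, so this is for illustration only).

import numpy as np
from pymfe import _summary

feature_values = np.array([0.1, 0.4, np.nan, 0.9])

# Look up summary callables by name, exactly as process_summary does.
mean_val = _summary.SUMMARY_METHODS["nanmean"](feature_values)       # NaN ignored
quartiles = _summary.SUMMARY_METHODS["nanquantiles"](feature_values)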
diff --git a/pymfe/_utils.py b/pymfe/_utils.py
index c4b28376..73705495 100644
--- a/pymfe/_utils.py
+++ b/pymfe/_utils.py
@@ -4,8 +4,9 @@
import numpy as np
-def calc_cls_inds(y: np.ndarray,
- classes: t.Optional[np.ndarray] = None) -> np.ndarray:
+def calc_cls_inds(
+ y: np.ndarray, classes: t.Optional[np.ndarray] = None
+) -> np.ndarray:
"""Compute the ``cls_inds`` variable.
The ``cls_inds`` variable is a boolean array which marks with
@@ -16,7 +17,8 @@ def calc_cls_inds(y: np.ndarray,
if classes is None:
classes = np.unique(y)
- cls_inds = np.array([np.equal(y, cur_cls) for cur_cls in classes],
- dtype=bool)
+ cls_inds = np.array(
+ [np.equal(y, cur_cls) for cur_cls in classes], dtype=bool
+ )
return cls_inds
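For clarity, a toy run of the helper above, which turns a label vector into one boolean indicator row per class.

import numpy as np

y = np.array(["a", "b", "a", "c"])
classes = np.unique(y)  # array(['a', 'b', 'c'])

cls_inds = np.array(
    [np.equal(y, cur_cls) for cur_cls in classes], dtype=bool
)
# cls_inds[0] -> [ True, False,  True, False]   instances of class 'a'
# cls_inds[1] -> [False,  True, False, False]   instances of class 'b'
# cls_inds[2] -> [False, False, False,  True]   instances of class 'c'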
diff --git a/pymfe/_version.py b/pymfe/_version.py
index 564b7a22..28cb5ddb 100644
--- a/pymfe/_version.py
+++ b/pymfe/_version.py
@@ -21,4 +21,4 @@
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-__version__ = '0.4rc0'
+__version__ = "0.4"
diff --git a/pymfe/clustering.py b/pymfe/clustering.py
index 6d3ac86a..3182a231 100644
--- a/pymfe/clustering.py
+++ b/pymfe/clustering.py
@@ -58,9 +58,9 @@ class MFEClustering:
"""
@classmethod
- def precompute_clustering_class(cls,
- y: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_clustering_class(
+ cls, y: t.Optional[np.ndarray] = None, **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute distinct classes and its frequencies from ``y``.
Parameters
@@ -103,12 +103,14 @@ def precompute_clustering_class(cls,
return precomp_vals
@classmethod
- def precompute_group_distances(cls,
- N: np.ndarray,
- y: t.Optional[np.ndarray] = None,
- dist_metric: str = "euclidean",
- classes: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_group_distances(
+ cls,
+ N: np.ndarray,
+ y: t.Optional[np.ndarray] = None,
+ dist_metric: str = "euclidean",
+ classes: t.Optional[np.ndarray] = None,
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute distance metrics between instances.
Parameters
@@ -136,13 +138,13 @@ def precompute_group_distances(cls,
-------
:obj:`dict`
The following precomputed items are returned:
- * ``pairwise_norm_interclass_dist`` (:obj:`np.ndarray`):
+ * ``pairwise_norm_intercls_dist`` (:obj:`np.ndarray`):
normalized distance between each distinct pair of
instances of different classes.
- * ``pairwise_intraclass_dists`` (:obj:`np.ndarray`):
+ * ``pairwise_intracls_dists`` (:obj:`np.ndarray`):
distance between each distinct pair of instances of
the same class.
- * ``intraclass_dists`` (:obj:`np.ndarray`): the distance
+ * ``intracls_dists`` (:obj:`np.ndarray`): the distance
between the farthest pair of instances of the same class.
The following precomputed items are necessary and are also
@@ -158,10 +160,15 @@ def precompute_group_distances(cls,
"""
precomp_vals = {}
- if N is not None and y is not None and not {
- "pairwise_norm_interclass_dist", "pairwise_intraclass_dists",
- "intraclass_dists"
- }.issubset(kwargs):
+ if (
+ N is not None
+ and y is not None
+ and not {
+ "pairwise_norm_intercls_dist",
+ "pairwise_intracls_dists",
+ "intracls_dists",
+ }.issubset(kwargs)
+ ):
cls_inds = kwargs.get("cls_inds")
if cls_inds is None:
@@ -170,42 +177,53 @@ def precompute_group_distances(cls,
classes = new_vals["classes"]
precomp_vals.update(new_vals)
- precomp_vals["pairwise_norm_interclass_dist"] = (
- cls._calc_pairwise_norm_interclass_dist(
- N=N,
- y=y,
- dist_metric=dist_metric,
- classes=classes,
- cls_inds=cls_inds))
+ precomp_vals[
+ "pairwise_norm_intercls_dist"
+ ] = cls._calc_pwise_norm_intercls_dist(
+ N=N,
+ y=y,
+ dist_metric=dist_metric,
+ classes=classes,
+ cls_inds=cls_inds,
+ )
- precomp_vals["pairwise_intraclass_dists"] = (
- cls._calc_all_intraclass_dists(
- N=N,
- y=y,
- dist_metric=dist_metric,
- cls_inds=cls_inds,
- classes=classes,
- get_max_dist=False))
+ precomp_vals[
+ "pairwise_intracls_dists"
+ ] = cls._calc_all_intracls_dists(
+ N=N,
+ y=y,
+ dist_metric=dist_metric,
+ cls_inds=cls_inds,
+ classes=classes,
+ get_max_dist=False,
+ )
- if precomp_vals["pairwise_intraclass_dists"].ndim == 2:
- precomp_vals["intraclass_dists"] = (
- precomp_vals["pairwise_intraclass_dists"].max(axis=1))
+ if precomp_vals["pairwise_intracls_dists"].ndim == 2:
+ precomp_vals["intracls_dists"] = precomp_vals[
+ "pairwise_intracls_dists"
+ ].max(axis=1)
else:
- precomp_vals["intraclass_dists"] = np.array([
- np.max(class_arr)
- for class_arr in precomp_vals["pairwise_intraclass_dists"]
- ])
+ precomp_vals["intracls_dists"] = np.array(
+ [
+ np.max(class_arr)
+ for class_arr in precomp_vals[
+ "pairwise_intracls_dists"
+ ]
+ ]
+ )
return precomp_vals
@classmethod
- def precompute_nearest_neighbors(cls,
- N: np.ndarray,
- y: t.Optional[np.ndarray] = None,
- n_neighbors: t.Optional[int] = None,
- dist_metric: str = "euclidean",
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_nearest_neighbors(
+ cls,
+ N: np.ndarray,
+ y: t.Optional[np.ndarray] = None,
+ n_neighbors: t.Optional[int] = None,
+ dist_metric: str = "euclidean",
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute the ``n_neighbors`` Nearest Neighbors of every instance.
Parameters
@@ -233,14 +251,17 @@ def precompute_nearest_neighbors(cls,
-------
:obj:`dict`
The following precomputed items are returned:
- * ``pairwise_intraclass_dists`` (:obj:`np.ndarray`):
+ * ``pairwise_intracls_dists`` (:obj:`np.ndarray`):
distance between each distinct pair of instances of
the same class.
"""
precomp_vals = {}
- if (N is not None and y is not None
- and not {"nearest_neighbors"}.issubset(kwargs)):
+ if (
+ N is not None
+ and y is not None
+ and not {"nearest_neighbors"}.issubset(kwargs)
+ ):
class_freqs = kwargs.get("class_freqs")
if class_freqs is None:
@@ -250,18 +271,20 @@ def precompute_nearest_neighbors(cls,
n_neighbors = int(np.sqrt(class_freqs.min()))
precomp_vals["nearest_neighbors"] = cls._get_nearest_neighbors(
- N=N, n_neighbors=n_neighbors, dist_metric=dist_metric)
+ N=N, n_neighbors=n_neighbors, dist_metric=dist_metric
+ )
return precomp_vals
@classmethod
def precompute_class_representatives(
- cls,
- N: np.ndarray,
- y: t.Optional[np.ndarray] = None,
- representative: str = "mean",
- classes: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ cls,
+ N: np.ndarray,
+ y: t.Optional[np.ndarray] = None,
+ representative: str = "mean",
+ classes: t.Optional[np.ndarray] = None,
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precomputations related to cluster representative instances.
Parameters
@@ -313,44 +336,50 @@ class (effectively holding the same result as if the argument
-------
:obj:`dict`
The following precomputed items are returned:
- * ``pairwise_intraclass_dists`` (:obj:`np.ndarray`):
+ * ``pairwise_intracls_dists`` (:obj:`np.ndarray`):
distance between each distinct pair of instances of
the same class.
"""
precomp_vals = {}
- if (N is not None and y is not None
- and not {"representative"}.issubset(kwargs)):
+ if (
+ N is not None
+ and y is not None
+ and not {"representative"}.issubset(kwargs)
+ ):
precomp_vals["representative"] = cls._get_class_representatives(
- N=N, y=y, representative=representative, classes=classes)
+ N=N, y=y, representative=representative, classes=classes
+ )
return precomp_vals
@classmethod
- def _calc_normalized_interclass_dist(
- cls,
- group_inst_a: np.ndarray,
- group_inst_b: np.ndarray,
- dist_metric: str = "euclidean",
+ def _calc_normalized_intercls_dist(
+ cls,
+ group_inst_a: np.ndarray,
+ group_inst_b: np.ndarray,
+ dist_metric: str = "euclidean",
) -> np.ndarray:
"""Calculate the distance between instances of different classes.
The distance is normalized by the number of distinct pairs
between ``group_inst_a`` and ``group_inst_b``.
"""
- norm_interclass_dist = scipy.spatial.distance.cdist(
- group_inst_a, group_inst_b, metric=dist_metric)
+ norm_intercls_dist = scipy.spatial.distance.cdist(
+ group_inst_a, group_inst_b, metric=dist_metric
+ )
- return norm_interclass_dist / norm_interclass_dist.size
+ return norm_intercls_dist / norm_intercls_dist.size
@classmethod
- def _calc_pairwise_norm_interclass_dist(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- dist_metric: str = "euclidean",
- classes: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def _calc_pwise_norm_intercls_dist(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ dist_metric: str = "euclidean",
+ classes: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Calculate all pairwise normalized interclass distances."""
if cls_inds is None:
if classes is None:
@@ -358,42 +387,47 @@ def _calc_pairwise_norm_interclass_dist(
cls_inds = _utils.calc_cls_inds(y=y, classes=classes)
- interclass_dists = [
- cls._calc_normalized_interclass_dist(
+ intercls_dists = [
+ cls._calc_normalized_intercls_dist(
N[cls_inds[id_cls_a, :], :],
N[cls_inds[id_cls_b, :], :],
- dist_metric=dist_metric)
+ dist_metric=dist_metric,
+ )
for id_cls_a, id_cls_b in itertools.combinations(
- np.arange(cls_inds.shape[0]), 2)
+ np.arange(cls_inds.shape[0]), 2
+ )
]
- return interclass_dists
+ return intercls_dists
@classmethod
- def _calc_intraclass_dists(cls,
- instances: np.ndarray,
- dist_metric: str = "euclidean",
- get_max_dist: bool = True) -> float:
+ def _calc_intracls_dists(
+ cls,
+ instances: np.ndarray,
+ dist_metric: str = "euclidean",
+ get_max_dist: bool = True,
+ ) -> float:
"""Calculate the intraclass distance of the given instances.
The intraclass distance is the maximum distance between two distinct
instances of the same class. If ``get_max_dist`` is false, then
all distances are returned instead.
"""
- intraclass_dists = scipy.spatial.distance.pdist(
- instances, metric=dist_metric)
+ intracls_dists = scipy.spatial.distance.pdist(
+ instances, metric=dist_metric
+ )
- return intraclass_dists.max() if get_max_dist else intraclass_dists
+ return intracls_dists.max() if get_max_dist else intracls_dists
@classmethod
- def _calc_all_intraclass_dists(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- dist_metric: str = "euclidean",
- get_max_dist: bool = True,
- cls_inds: t.Optional[np.ndarray] = None,
- classes: t.Optional[np.ndarray] = None,
+ def _calc_all_intracls_dists(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ dist_metric: str = "euclidean",
+ get_max_dist: bool = True,
+ cls_inds: t.Optional[np.ndarray] = None,
+ classes: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Calculate all intraclass (internal to a class) distances."""
if cls_inds is None:
@@ -402,21 +436,22 @@ def _calc_all_intraclass_dists(
cls_inds = _utils.calc_cls_inds(y=y, classes=classes)
- intraclass_dists = np.array([
- cls._calc_intraclass_dists(
- N[cur_class, :],
- dist_metric=dist_metric,
- get_max_dist=get_max_dist) for cur_class in cls_inds
- ])
+ intracls_dists = np.array(
+ [
+ cls._calc_intracls_dists(
+ N[cur_class, :],
+ dist_metric=dist_metric,
+ get_max_dist=get_max_dist,
+ )
+ for cur_class in cls_inds
+ ]
+ )
- return intraclass_dists
+ return intracls_dists
@classmethod
def _get_nearest_neighbors(
- cls,
- N: np.ndarray,
- n_neighbors: int,
- dist_metric: str = "euclidean",
+ cls, N: np.ndarray, n_neighbors: int, dist_metric: str = "euclidean",
) -> np.ndarray:
"""Indexes of ``n_neighbors`` nearest neighbors for each instance."""
model = sklearn.neighbors.KDTree(N, metric=dist_metric)
@@ -424,18 +459,20 @@ def _get_nearest_neighbors(
# Note: skip the first column because it's always the
# instance itself
nearest_neighbors = model.query(
- N, k=n_neighbors + 1, return_distance=False)[:, 1:]
+ N, k=n_neighbors + 1, return_distance=False
+ )[:, 1:]
return nearest_neighbors
@classmethod
def _get_class_representatives(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- representative: t.Union[t.Sequence, np.ndarray, str] = "mean",
- cls_inds: t.Optional[np.ndarray] = None,
- classes: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ representative: t.Union[t.Sequence, np.ndarray, str] = "mean",
+ cls_inds: t.Optional[np.ndarray] = None,
+ classes: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Get a representative instance for each distinct class.
If ``representative`` argument is a string, then it must be
@@ -452,14 +489,15 @@ def _get_class_representatives(
classes = np.unique(y)
if isinstance(representative, str):
- center_method = {
- "mean": np.mean,
- "median": np.median,
- }.get(representative)
+ center_method = {"mean": np.mean, "median": np.median}.get(
+ representative
+ )
if center_method is None:
- raise ValueError("'representative' must be 'mean' or "
- "'median'. Got '{}'.".format(representative))
+ raise ValueError(
+ "'representative' must be 'mean' or "
+ "'median'. Got '{}'.".format(representative)
+ )
if cls_inds is None:
cls_inds = _utils.calc_cls_inds(y=y, classes=classes)
@@ -469,11 +507,14 @@ def _get_class_representatives(
for cur_class in cls_inds
]
- elif not isinstance(representative,
- (collections.Sequence, np.ndarray)):
- raise TypeError("'representative' type must be string "
- "or a sequence or a numpy array. "
- "Got '{}'.".format(type(representative)))
+ elif not isinstance(
+ representative, (collections.Sequence, np.ndarray)
+ ):
+ raise TypeError(
+ "'representative' type must be string "
+ "or a sequence or a numpy array. "
+ "Got '{}'.".format(type(representative))
+ )
representative_arr = np.asarray(representative)
@@ -481,27 +522,31 @@ def _get_class_representatives(
_, num_attr = N.shape
if num_repr != classes.size:
- raise ValueError("There must exist one class representative "
- "for every distinct class. (Expected '{}', "
- "got '{}'".format(classes.size, num_repr))
+ raise ValueError(
+ "There must exist one class representative "
+ "for every distinct class. (Expected '{}', "
+ "got '{}'".format(classes.size, num_repr)
+ )
if repr_dim != num_attr:
- raise ValueError("The dimension of each class representative "
- "must match the instances dimension. (Expected "
- "'{}', got '{}'".format(classes.size, repr_dim))
+ raise ValueError(
+ "The dimension of each class representative "
+ "must match the instances dimension. (Expected "
+ "'{}', got '{}'".format(classes.size, repr_dim)
+ )
return representative_arr
@classmethod
def ft_vdu(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- dist_metric: str = "euclidean",
- cls_inds: t.Optional[np.ndarray] = None,
- classes: t.Optional[np.ndarray] = None,
- intraclass_dists: t.Optional[np.ndarray] = None,
- pairwise_norm_interclass_dist: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ dist_metric: str = "euclidean",
+ cls_inds: t.Optional[np.ndarray] = None,
+ classes: t.Optional[np.ndarray] = None,
+ intracls_dists: t.Optional[np.ndarray] = None,
+ pairwise_norm_intercls_dist: t.Optional[np.ndarray] = None,
) -> float:
"""Compute the Dunn Index.
@@ -531,11 +576,11 @@ def ft_vdu(
classes : :obj:`np.ndarray`, optional
Distinct classes in ``y``. Used to exploit precomputations.
- intraclass_dists : :obj:`np.ndarray`, optional
+ intracls_dists : :obj:`np.ndarray`, optional
Distance between the farthest pair of instances in the same
class, for each class. Used to exploit precomputations.
- pairwise_norm_interclass_dists : :obj:`np.ndarray`, optional
+ pairwise_norm_intercls_dist : :obj:`np.ndarray`, optional
Normalized pairwise distances between instances of different
classes.
@@ -550,29 +595,30 @@ def ft_vdu(
partitions, J. Cybern. 4 (1) (1974) 95–104.
"""
- if pairwise_norm_interclass_dist is None:
- pairwise_norm_interclass_dist = (
- cls._calc_pairwise_norm_interclass_dist(
- N=N,
- y=y,
- dist_metric=dist_metric,
- classes=classes,
- cls_inds=cls_inds))
+ if pairwise_norm_intercls_dist is None:
+ pairwise_norm_intercls_dist = cls._calc_pwise_norm_intercls_dist(
+ N=N,
+ y=y,
+ dist_metric=dist_metric,
+ classes=classes,
+ cls_inds=cls_inds,
+ )
- if intraclass_dists is None:
- intraclass_dists = cls._calc_all_intraclass_dists(
+ if intracls_dists is None:
+ intracls_dists = cls._calc_all_intracls_dists(
N=N,
y=y,
dist_metric=dist_metric,
classes=classes,
- cls_inds=cls_inds).max()
+ cls_inds=cls_inds,
+ ).max()
- _min_interclass_dist = np.inf
+ _min_intercls_dist = np.inf
- for vals in pairwise_norm_interclass_dist:
- _min_interclass_dist = min(_min_interclass_dist, np.min(vals))
+ for vals in pairwise_norm_intercls_dist:
+ _min_intercls_dist = min(_min_intercls_dist, np.min(vals))
- vdu = _min_interclass_dist / intraclass_dists.max()
+ vdu = _min_intercls_dist / intracls_dists.max()
return vdu
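A minimal standalone sketch of what ``ft_vdu`` ends up computing once the precomputed pieces exist: the smallest pair-normalized between-class distance divided by the largest within-class diameter. The two-class toy data is illustrative only.

import numpy as np
import scipy.spatial

N = np.array([[0.0, 0.0], [0.1, 0.0], [1.0, 1.0], [1.1, 1.0]])
y = np.array([0, 0, 1, 1])
cls_inds = np.array([y == 0, y == 1])

# Largest distance between two instances of the same class, per class.
intracls_dists = np.array(
    [scipy.spatial.distance.pdist(N[mask]).max() for mask in cls_inds]
)

# Between-class distances, normalized by the number of distinct pairs.
intercls = scipy.spatial.distance.cdist(N[cls_inds[0]], N[cls_inds[1]])
intercls_norm = intercls / intercls.size

vdu = intercls_norm.min() / intracls_dists.max()  # Dunn Index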
@@ -602,13 +648,13 @@ def ft_vdb(cls, N: np.ndarray, y: np.ndarray) -> float:
@classmethod
def ft_int(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- dist_metric: str = "euclidean",
- cls_inds: t.Optional[np.ndarray] = None,
- classes: t.Optional[np.ndarray] = None,
- pairwise_norm_interclass_dist: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ dist_metric: str = "euclidean",
+ cls_inds: t.Optional[np.ndarray] = None,
+ classes: t.Optional[np.ndarray] = None,
+ pairwise_norm_intercls_dist: t.Optional[np.ndarray] = None,
) -> float:
"""Compute the INT index.
@@ -638,7 +684,7 @@ def ft_int(
classes : :obj:`np.ndarray`, optional
Distinct classes in ``y``. Used to exploit precomputations.
- pairwise_norm_interclass_dists : :obj:`np.ndarray`, optional
+ pairwise_norm_intercls_dist : :obj:`np.ndarray`, optional
Normalized pairwise distances between instances of different
classes. Used to exploit precomputations.
@@ -662,31 +708,33 @@ def ft_int(
if class_num == 1:
return np.nan
- if pairwise_norm_interclass_dist is None:
- pairwise_norm_interclass_dist = (
- cls._calc_pairwise_norm_interclass_dist(
- N=N,
- y=y,
- dist_metric=dist_metric,
- classes=classes,
- cls_inds=cls_inds))
+ if pairwise_norm_intercls_dist is None:
+ pairwise_norm_intercls_dist = cls._calc_pwise_norm_intercls_dist(
+ N=N,
+ y=y,
+ dist_metric=dist_metric,
+ classes=classes,
+ cls_inds=cls_inds,
+ )
norm_factor = 2.0 / (class_num * (class_num - 1.0))
- _sum_interclass_dist = 0.0
+ _sum_intercls_dist = 0.0
- for vals in pairwise_norm_interclass_dist:
- _sum_interclass_dist += np.sum(vals)
+ for vals in pairwise_norm_intercls_dist:
+ _sum_intercls_dist += np.sum(vals)
- return _sum_interclass_dist * norm_factor
+ return _sum_intercls_dist * norm_factor
@classmethod
- def ft_sil(cls,
- N: np.ndarray,
- y: np.ndarray,
- dist_metric: str = "euclidean",
- sample_frac: t.Optional[int] = None,
- random_state: t.Optional[int] = None) -> float:
+ def ft_sil(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ dist_metric: str = "euclidean",
+ sample_frac: t.Optional[int] = None,
+ random_state: t.Optional[int] = None,
+ ) -> float:
"""Compute the mean silhouette value.
Metric range is -1 to +1 (both inclusive).
@@ -736,16 +784,14 @@ def ft_sil(cls,
labels=y,
metric=dist_metric,
sample_size=sample_size,
- random_state=random_state)
+ random_state=random_state,
+ )
return silhouette
@classmethod
def ft_pb(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- dist_metric: str = "euclidean",
+ cls, N: np.ndarray, y: np.ndarray, dist_metric: str = "euclidean",
) -> float:
"""Compute the pearson correlation between class matching and instance
distances.
@@ -778,13 +824,16 @@ def ft_pb(
"""
inst_dists = scipy.spatial.distance.pdist(X=N, metric=dist_metric)
- inst_matching_classes = np.array([
- inst_class_a == inst_class_b
- for inst_class_a, inst_class_b in itertools.combinations(y, 2)
- ])
+ inst_matching_classes = np.array(
+ [
+ inst_class_a == inst_class_b
+ for inst_class_a, inst_class_b in itertools.combinations(y, 2)
+ ]
+ )
correlation, _ = scipy.stats.pointbiserialr(
- x=inst_matching_classes, y=inst_dists)
+ x=inst_matching_classes, y=inst_dists
+ )
return correlation
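``ft_pb`` reduces to a point-biserial correlation between "same class?" flags and the corresponding pairwise distances; a compact sketch with toy data:

import itertools
import numpy as np
import scipy.spatial
import scipy.stats

N = np.array([[0.0, 0.0], [0.2, 0.1], [5.0, 5.0], [5.1, 4.9]])
y = np.array([0, 0, 1, 1])

inst_dists = scipy.spatial.distance.pdist(X=N, metric="euclidean")
inst_matching_classes = np.array(
    [a == b for a, b in itertools.combinations(y, 2)]
)

correlation, _ = scipy.stats.pointbiserialr(x=inst_matching_classes, y=inst_dists)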
@@ -817,9 +866,7 @@ def ft_ch(cls, N: np.ndarray, y: np.ndarray) -> float:
@classmethod
def ft_nre(
- cls,
- y: np.ndarray,
- class_freqs: t.Optional[np.ndarray] = None,
+ cls, y: np.ndarray, class_freqs: t.Optional[np.ndarray] = None,
) -> float:
"""Compute the normalized relative entropy.
@@ -854,11 +901,11 @@ def ft_nre(
@classmethod
def ft_sc(
- cls,
- y: np.ndarray,
- size: int = 15,
- normalize: bool = False,
- class_freqs: t.Optional[np.ndarray] = None,
+ cls,
+ y: np.ndarray,
+ size: int = 15,
+ normalize: bool = False,
+ class_freqs: t.Optional[np.ndarray] = None,
) -> int:
"""Compute the number of clusters with size smaller than a given size.
diff --git a/pymfe/complexity.py b/pymfe/complexity.py
index dcfefd33..d4eb9e94 100644
--- a/pymfe/complexity.py
+++ b/pymfe/complexity.py
@@ -57,10 +57,11 @@ class MFEComplexity:
computed in module ``statistical`` can freely be used for any
precomputation or feature extraction method of module ``landmarking``).
"""
+
@classmethod
- def precompute_complexity(cls,
- y: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_complexity(
+ cls, y: t.Optional[np.ndarray] = None, **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute some useful things to support feature-based measures.
Parameters
@@ -91,7 +92,7 @@ def precompute_complexity(cls,
"""
precomp_vals = {} # type: t.Dict[str, t.Any]
- if (y is not None and not {"classes", "class_freqs"}.issubset(kwargs)):
+ if y is not None and not {"classes", "class_freqs"}.issubset(kwargs):
sub_dic = MFEGeneral.precompute_general_class(y)
precomp_vals.update(sub_dic)
@@ -108,11 +109,13 @@ def precompute_complexity(cls,
return precomp_vals
@classmethod
- def precompute_pca_tx(cls,
- N: np.ndarray,
- tx_n_components: float = 0.95,
- random_state: t.Optional[int] = None,
- **kwargs) -> t.Dict[str, int]:
+ def precompute_pca_tx(
+ cls,
+ N: np.ndarray,
+ tx_n_components: float = 0.95,
+ random_state: t.Optional[int] = None,
+ **kwargs
+ ) -> t.Dict[str, int]:
"""Precompute PCA to support dimensionality measures.
Parameters
@@ -149,8 +152,9 @@ def precompute_pca_tx(cls,
precomp_vals = {}
if N is not None and "num_attr_pca" not in kwargs:
- pca = sklearn.decomposition.PCA(n_components=tx_n_components,
- random_state=random_state)
+ pca = sklearn.decomposition.PCA(
+ n_components=tx_n_components, random_state=random_state
+ )
pca.fit(N)
@@ -162,11 +166,12 @@ def precompute_pca_tx(cls,
@classmethod
def precompute_complexity_svm(
- cls,
- y: t.Optional[np.ndarray] = None,
- max_iter: t.Union[int, float] = 1e5,
- random_state: t.Optional[int] = None,
- **kwargs) -> t.Dict[str, sklearn.pipeline.Pipeline]:
+ cls,
+ y: t.Optional[np.ndarray] = None,
+ max_iter: t.Union[int, float] = 1e5,
+ random_state: t.Optional[int] = None,
+ **kwargs
+ ) -> t.Dict[str, sklearn.pipeline.Pipeline]:
"""Init a Support Vector Classifier pipeline (with data standardization.)
Parameters
@@ -204,12 +209,14 @@ def precompute_complexity_svm(
# Note: 'C' parameter is inversely proportional to the
# regularization strength, which is '0.5' in the reference
# paper.
- svc = sklearn.svm.LinearSVC(penalty="l2",
- loss="hinge",
- C=2.0,
- tol=10e-3,
- max_iter=int(max_iter),
- random_state=random_state)
+ svc = sklearn.svm.LinearSVC(
+ penalty="l2",
+ loss="hinge",
+ C=2.0,
+ tol=10e-3,
+ max_iter=int(max_iter),
+ random_state=random_state,
+ )
pip = sklearn.pipeline.Pipeline([("scaler", scaler), ("svc", svc)])
@@ -218,11 +225,13 @@ def precompute_complexity_svm(
return precomp_vals
@classmethod
- def precompute_norm_dist_mat(cls,
- N: np.ndarray,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- **kwargs) -> t.Dict[str, np.ndarray]:
+ def precompute_norm_dist_mat(
+ cls,
+ N: np.ndarray,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ **kwargs
+ ) -> t.Dict[str, np.ndarray]:
"""Precompute normalized ``N`` and pairwise distance among instances.
Parameters
@@ -276,13 +285,13 @@ def precompute_norm_dist_mat(cls,
N_scaled = kwargs.get("N_scaled")
if N_scaled is not None and "norm_dist_mat" not in kwargs:
- norm_dist_mat, orig_dist_mat_min, orig_dist_mat_ptp = (
- cls._calc_norm_dist_mat(
- N=N,
- metric=metric,
- p=p,
- N_scaled=N_scaled,
- return_scalers=True))
+ (
+ norm_dist_mat,
+ orig_dist_mat_min,
+ orig_dist_mat_ptp,
+ ) = cls._calc_norm_dist_mat(
+ N=N, metric=metric, p=p, N_scaled=N_scaled, return_scalers=True
+ )
precomp_vals["norm_dist_mat"] = norm_dist_mat
precomp_vals["orig_dist_mat_min"] = orig_dist_mat_min
@@ -291,12 +300,14 @@ def precompute_norm_dist_mat(cls,
return precomp_vals
@classmethod
- def precompute_nearest_enemy(cls,
- N: np.ndarray,
- y: t.Optional[np.ndarray] = None,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- **kwargs) -> t.Dict[str, np.ndarray]:
+ def precompute_nearest_enemy(
+ cls,
+ N: np.ndarray,
+ y: t.Optional[np.ndarray] = None,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ **kwargs
+ ) -> t.Dict[str, np.ndarray]:
"""Precompute instances nearest enemy related values.
The instance nearest enemy is the nearest instance from a
@@ -347,28 +358,33 @@ def precompute_nearest_enemy(cls,
"""
precomp_vals = {}
- if (y is not None and N.size
- and not {"nearest_enemy_dist", "nearest_enemy_ind"
- }.issubset(kwargs)):
+ if (
+ y is not None
+ and N.size
+ and not {"nearest_enemy_dist", "nearest_enemy_ind"}.issubset(
+ kwargs
+ )
+ ):
norm_dist_mat = kwargs.get("norm_dist_mat")
cls_inds = kwargs.get("cls_inds")
if norm_dist_mat is None:
precomp_vals.update(
- cls.precompute_norm_dist_mat(N=N,
- metric=metric,
- p=p,
- **kwargs))
+ cls.precompute_norm_dist_mat(
+ N=N, metric=metric, p=p, **kwargs
+ )
+ )
norm_dist_mat = precomp_vals["norm_dist_mat"]
if cls_inds is None:
precomp_vals.update(cls.precompute_complexity(y=y, **kwargs))
cls_inds = precomp_vals["cls_inds"]
- nearest_enemy_dist, nearest_enemy_ind = (cls._calc_nearest_enemies(
+ nearest_enemy_dist, nearest_enemy_ind = cls._calc_nearest_enemies(
norm_dist_mat=norm_dist_mat,
cls_inds=cls_inds,
- return_inds=True))
+ return_inds=True,
+ )
precomp_vals["nearest_enemy_dist"] = nearest_enemy_dist
precomp_vals["nearest_enemy_ind"] = nearest_enemy_ind
@@ -377,13 +393,13 @@ def precompute_nearest_enemy(cls,
@classmethod
def _calc_norm_dist_mat(
- cls,
- N: np.ndarray,
- metric: str,
- p: t.Union[int, float] = 2,
- N_scaled: t.Optional[np.ndarray] = None,
- normalize: bool = True,
- return_scalers: bool = False,
+ cls,
+ N: np.ndarray,
+ metric: str,
+ p: t.Union[int, float] = 2,
+ N_scaled: t.Optional[np.ndarray] = None,
+ normalize: bool = True,
+ return_scalers: bool = False,
) -> np.ndarray:
"""Calculate a pairwise normalized distance matrix.
@@ -406,14 +422,16 @@ def _calc_norm_dist_mat(
N_scaled = cls._scale_N(N=N)
norm_dist_mat = scipy.spatial.distance.cdist(
- N_scaled, N_scaled, metric=metric, p=p)
+ N_scaled, N_scaled, metric=metric, p=p
+ )
orig_dist_mat_min = np.min(norm_dist_mat)
orig_dist_mat_ptp = np.ptp(norm_dist_mat)
if normalize and np.not_equal(0.0, orig_dist_mat_ptp):
norm_dist_mat = (
- norm_dist_mat - orig_dist_mat_min) / orig_dist_mat_ptp
+ norm_dist_mat - orig_dist_mat_min
+ ) / orig_dist_mat_ptp
if return_scalers:
return norm_dist_mat, orig_dist_mat_min, orig_dist_mat_ptp
@@ -498,9 +516,7 @@ def _calc_maxmin(N_cls_1: np.ndarray, N_cls_2: np.ndarray) -> np.ndarray:
@staticmethod
def _calc_overlap(
- N: np.ndarray,
- minmax: np.ndarray,
- maxmin: np.ndarray,
+ N: np.ndarray, minmax: np.ndarray, maxmin: np.ndarray,
) -> t.Tuple[int, np.ndarray, np.ndarray]:
"""Compute the instances in overlapping region by feature."""
# True if the example is in the overlapping region
@@ -513,11 +529,11 @@ def _calc_overlap(
@classmethod
def _interpolate(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- cls_inds: np.ndarray,
- random_state: t.Optional[int] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ cls_inds: np.ndarray,
+ random_state: t.Optional[int] = None,
) -> t.Tuple[np.ndarray, np.ndarray]:
"""Create a new dataset using interpolated instances from ``N``.
@@ -554,10 +570,10 @@ def _interpolate(
@classmethod
def _calc_nearest_enemies(
- cls,
- norm_dist_mat: np.ndarray,
- cls_inds: np.ndarray,
- return_inds: bool = True,
+ cls,
+ norm_dist_mat: np.ndarray,
+ cls_inds: np.ndarray,
+ return_inds: bool = True,
) -> t.Union[np.ndarray, t.Tuple[np.ndarray, ...]]:
"""Calculate each instances nearest enemies.
@@ -593,19 +609,23 @@ def _scale_N(N: np.ndarray) -> np.ndarray:
"""Scale all features of N to [0, 1] range."""
N_scaled = N
- if (not np.allclose(1.0, np.max(N, axis=0))
- or not np.allclose(0.0, np.min(N, axis=0))):
+ if not np.allclose(1.0, np.max(N, axis=0)) or not np.allclose(
+ 0.0, np.min(N, axis=0)
+ ):
N_scaled = sklearn.preprocessing.MinMaxScaler(
- feature_range=(0, 1)).fit_transform(N)
+ feature_range=(0, 1)
+ ).fit_transform(N)
return N_scaled
@classmethod
- def ft_f1(cls,
- N: np.ndarray,
- y: np.ndarray,
- cls_inds: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_f1(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ cls_inds: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Maximum Fisher's discriminant ratio.
It measures the overlap between the values of the features in
@@ -659,15 +679,20 @@ def ft_f1(cls,
centroids = np.asarray(
[np.mean(N[inds_cur_cls, :], axis=0) for inds_cur_cls in cls_inds],
- dtype=float)
+ dtype=float,
+ )
- _numer = np.sum(np.square(centroids - mean_global).T * class_freqs,
- axis=1)
+ _numer = np.sum(
+ np.square(centroids - mean_global).T * class_freqs, axis=1
+ )
- _denom = np.sum([
- sum(np.square(N[inds_cur_cls, :] - centroids[cls_ind, :]))
- for cls_ind, inds_cur_cls in enumerate(cls_inds)
- ], axis=0)
+ _denom = np.sum(
+ [
+ sum(np.square(N[inds_cur_cls, :] - centroids[cls_ind, :]))
+ for cls_ind, inds_cur_cls in enumerate(cls_inds)
+ ],
+ axis=0,
+ )
attr_discriminant_ratio = _numer / _denom
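
The ratio assembled above is the per-feature Fisher criterion. A self-contained sketch of the same quantity (function name only illustrative), assuming `N` holds the numeric features and `y` the class labels; in one common formulation, F1 is then 1 / (1 + r) for the best feature:

    import numpy as np

    def fisher_ratio_sketch(N, y):
        """Between-class over within-class variance, one value per feature."""
        classes, class_freqs = np.unique(y, return_counts=True)
        mean_global = N.mean(axis=0)
        numer = np.zeros(N.shape[1], dtype=float)
        denom = np.zeros(N.shape[1], dtype=float)
        for cls, freq in zip(classes, class_freqs):
            N_cls = N[y == cls, :]
            centroid = N_cls.mean(axis=0)
            numer += freq * np.square(centroid - mean_global)
            denom += np.square(N_cls - centroid).sum(axis=0)
        return numer / denom  # larger ratio: easier separation on that feature
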
@@ -680,12 +705,14 @@ def ft_f1(cls,
return f1
@classmethod
- def ft_f1v(cls,
- N: np.ndarray,
- y: np.ndarray,
- ovo_comb: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_f1v(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ ovo_comb: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Directional-vector maximum Fisher's discriminant ratio.
This measure searches for a vector which can separate the two
@@ -747,18 +774,21 @@ def ft_f1v(cls,
for cls_ind, inds_cur_cls in enumerate(cls_inds):
cur_cls_inst = N[inds_cur_cls, :]
mat_scatter_within.append(
- np.cov(cur_cls_inst, rowvar=False, ddof=1))
+ np.cov(cur_cls_inst, rowvar=False, ddof=1)
+ )
centroids[cls_ind, :] = cur_cls_inst.mean(axis=0)
for ind, (cls_id_1, cls_id_2) in enumerate(ovo_comb):
- centroid_diff = (centroids[cls_id_1, :] -
- centroids[cls_id_2, :]).reshape(-1, 1)
+ centroid_diff = (
+ centroids[cls_id_1, :] - centroids[cls_id_2, :]
+ ).reshape(-1, 1)
total_inst_num = class_freqs[cls_id_1] + class_freqs[cls_id_2]
- W_mat = (class_freqs[cls_id_1] * mat_scatter_within[cls_id_1] +
- class_freqs[cls_id_2] *
- mat_scatter_within[cls_id_2]) / total_inst_num
+ W_mat = (
+ class_freqs[cls_id_1] * mat_scatter_within[cls_id_1]
+ + class_freqs[cls_id_2] * mat_scatter_within[cls_id_2]
+ ) / total_inst_num
# Note: the result of np.linalg.pinv 'Moore-Penrose' pseudo-inverse
# does not match with the result of MASS::ginv 'Moore-Penrose'
@@ -777,11 +807,13 @@ def ft_f1v(cls,
return f1v
@classmethod
- def ft_f2(cls,
- N: np.ndarray,
- y: np.ndarray,
- ovo_comb: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None) -> float:
+ def ft_f2(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ ovo_comb: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Volume of the overlapping region.
This measure calculates the overlap of the distributions of
@@ -844,18 +876,19 @@ def ft_f2(cls,
maxmin = cls._calc_maxmin(N_cls_1, N_cls_2)
f4[ind] = np.prod(
- np.maximum(0.0, minmax - maxmin) / (maxmax - minmin))
+ np.maximum(0.0, minmax - maxmin) / (maxmax - minmin)
+ )
return f4
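
For a single pair of classes, the product above reduces to the normalized overlap width per feature. A minimal sketch (helper name illustrative; constant features would need guarding against a zero denominator):

    import numpy as np

    def overlap_volume_sketch(N_cls_1, N_cls_2):
        """F2-style overlap volume for two class subsets of the same features."""
        minmax = np.minimum(N_cls_1.max(axis=0), N_cls_2.max(axis=0))
        maxmin = np.maximum(N_cls_1.min(axis=0), N_cls_2.min(axis=0))
        maxmax = np.maximum(N_cls_1.max(axis=0), N_cls_2.max(axis=0))
        minmin = np.minimum(N_cls_1.min(axis=0), N_cls_2.min(axis=0))
        return np.prod(np.maximum(0.0, minmax - maxmin) / (maxmax - minmin))
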
@classmethod
def ft_f3(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- ovo_comb: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ ovo_comb: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute feature maximum individual efficiency.
@@ -911,10 +944,12 @@ def ft_f3(
ind_less_overlap, feat_overlap_num, _ = cls._calc_overlap(
N=N,
minmax=cls._calc_minmax(N_cls_1, N_cls_2),
- maxmin=cls._calc_maxmin(N_cls_1, N_cls_2))
+ maxmin=cls._calc_maxmin(N_cls_1, N_cls_2),
+ )
- f3[ind] = (feat_overlap_num[ind_less_overlap] /
- (class_freqs[cls_id_1] + class_freqs[cls_id_2]))
+ f3[ind] = feat_overlap_num[ind_less_overlap] / (
+ class_freqs[cls_id_1] + class_freqs[cls_id_2]
+ )
# The measure is computed in the literature using the mean. However, it
# is formulated here as a meta-feature. Therefore, the post-processing
@@ -923,12 +958,12 @@ def ft_f3(
@classmethod
def ft_f4(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- ovo_comb: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ ovo_comb: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute the collective feature efficiency.
@@ -978,8 +1013,9 @@ def ft_f4(
f4 = np.zeros(ovo_comb.shape[0], dtype=float)
for ind, (cls_id_1, cls_id_2) in enumerate(ovo_comb):
- cls_subset_union = np.logical_or(cls_inds[cls_id_1, :],
- cls_inds[cls_id_2, :])
+ cls_subset_union = np.logical_or(
+ cls_inds[cls_id_1, :], cls_inds[cls_id_2, :]
+ )
cls_1 = cls_inds[cls_id_1, cls_subset_union]
cls_2 = cls_inds[cls_id_2, cls_subset_union]
@@ -994,11 +1030,15 @@ def ft_f4(
# Note: 'feat_overlapped_region' is a boolean vector with
# True values if the example is in the overlapping region
- ind_less_overlap, _, feat_overlapped_region = (
- cls._calc_overlap(
- N=N_view,
- minmax=cls._calc_minmax(N_cls_1, N_cls_2),
- maxmin=cls._calc_maxmin(N_cls_1, N_cls_2)))
+ (
+ ind_less_overlap,
+ _,
+ feat_overlapped_region,
+ ) = cls._calc_overlap(
+ N=N_view,
+ minmax=cls._calc_minmax(N_cls_1, N_cls_2),
+ maxmin=cls._calc_maxmin(N_cls_1, N_cls_2),
+ )
# Boolean that if True, this example is in the overlapping
# region
@@ -1018,8 +1058,9 @@ def ft_f4(
subset_size = N_subset.shape[0]
- f4[ind] = subset_size / (class_freqs[cls_id_1] +
- class_freqs[cls_id_2])
+ f4[ind] = subset_size / (
+ class_freqs[cls_id_1] + class_freqs[cls_id_2]
+ )
# The measure is computed in the literature using the mean. However, it
# is formulated here as a meta-feature. Therefore, the post-processing
@@ -1027,15 +1068,17 @@ def ft_f4(
return f4
@classmethod
- def ft_l1(cls,
- N: np.ndarray,
- y: np.ndarray,
- ovo_comb: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None,
- svc_pipeline: t.Optional[sklearn.pipeline.Pipeline] = None,
- max_iter: t.Union[int, float] = 1e5,
- random_state: t.Optional[int] = None) -> np.ndarray:
+ def ft_l1(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ ovo_comb: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ svc_pipeline: t.Optional[sklearn.pipeline.Pipeline] = None,
+ max_iter: t.Union[int, float] = 1e5,
+ random_state: t.Optional[int] = None,
+ ) -> np.ndarray:
"""Sum of error distance by linear programming.
This measure assesses if the data are linearly separable by
@@ -1111,9 +1154,9 @@ def ft_l1(cls,
class_freqs = sub_dic["class_freqs"]
if svc_pipeline is None:
- sub_dic = cls.precompute_complexity_svm(max_iter=max_iter,
- y=y,
- random_state=random_state)
+ sub_dic = cls.precompute_complexity_svm(
+ max_iter=max_iter, y=y, random_state=random_state
+ )
svc_pipeline = sub_dic["svc_pipeline"]
@@ -1131,13 +1174,15 @@ def ft_l1(cls,
if misclassified_insts.size:
insts_dists = svc_pipeline.decision_function(
- misclassified_insts)
+ misclassified_insts
+ )
else:
insts_dists = np.array([0.0], dtype=float)
- sum_err_dist[ind] = (np.linalg.norm(insts_dists, ord=1) /
- (class_freqs[cls_1] + class_freqs[cls_2]))
+ sum_err_dist[ind] = np.linalg.norm(insts_dists, ord=1) / (
+ class_freqs[cls_1] + class_freqs[cls_2]
+ )
l1 = 1.0 - 1.0 / (1.0 + sum_err_dist)
# The measure is computed in the literature using the mean. However, it
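
The pipeline behind `svc_pipeline` is built in `precompute_complexity_svm`, which is not part of this hunk; the sketch below assumes a plain scaler plus LinearSVC pipeline, only to make the error-distance idea concrete for one OVO pair:

    import numpy as np
    import sklearn.pipeline
    import sklearn.preprocessing
    import sklearn.svm

    def l1_pair_sketch(N_subset, y_subset, random_state=None):
        """Sum of error distance of a linear classifier for one class pair."""
        svc_pipeline = sklearn.pipeline.Pipeline(
            [
                ("scaler", sklearn.preprocessing.StandardScaler()),
                ("svc", sklearn.svm.LinearSVC(random_state=random_state)),
            ]
        )
        svc_pipeline.fit(N_subset, y_subset)
        wrong = N_subset[svc_pipeline.predict(N_subset) != y_subset]
        dists = (
            svc_pipeline.decision_function(wrong)
            if wrong.size
            else np.array([0.0])
        )
        sum_err_dist = np.linalg.norm(dists, ord=1) / y_subset.size
        return 1.0 - 1.0 / (1.0 + sum_err_dist)
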
@@ -1146,14 +1191,16 @@ def ft_l1(cls,
return l1
@classmethod
- def ft_l2(cls,
- N: np.ndarray,
- y: np.ndarray,
- ovo_comb: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None,
- svc_pipeline: t.Optional[sklearn.pipeline.Pipeline] = None,
- max_iter: t.Union[int, float] = 1e5,
- random_state: t.Optional[int] = None) -> np.ndarray:
+ def ft_l2(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ ovo_comb: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ svc_pipeline: t.Optional[sklearn.pipeline.Pipeline] = None,
+ max_iter: t.Union[int, float] = 1e5,
+ random_state: t.Optional[int] = None,
+ ) -> np.ndarray:
"""Compute the OVO subsets error rate of linear classifier.
The linear model used is induced by the Support Vector
@@ -1219,9 +1266,9 @@ def ft_l2(cls,
cls_inds = sub_dic["cls_inds"]
if svc_pipeline is None:
- sub_dic = cls.precompute_complexity_svm(max_iter=max_iter,
- y=y,
- random_state=random_state)
+ sub_dic = cls.precompute_complexity_svm(
+ max_iter=max_iter, y=y, random_state=random_state
+ )
svc_pipeline = sub_dic["svc_pipeline"]
@@ -1236,9 +1283,9 @@ def ft_l2(cls,
svc_pipeline.fit(N_subset, y_subset)
y_pred = svc_pipeline.predict(N_subset)
- error = sklearn.metrics.zero_one_loss(y_true=y_subset,
- y_pred=y_pred,
- normalize=True)
+ error = sklearn.metrics.zero_one_loss(
+ y_true=y_subset, y_pred=y_pred, normalize=True
+ )
l2[ind] = error
@@ -1248,14 +1295,16 @@ def ft_l2(cls,
return l2
@classmethod
- def ft_l3(cls,
- N: np.ndarray,
- y: np.ndarray,
- ovo_comb: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None,
- svc_pipeline: t.Optional[sklearn.pipeline.Pipeline] = None,
- max_iter: t.Union[int, float] = 1e5,
- random_state: t.Optional[int] = None) -> np.ndarray:
+ def ft_l3(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ ovo_comb: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ svc_pipeline: t.Optional[sklearn.pipeline.Pipeline] = None,
+ max_iter: t.Union[int, float] = 1e5,
+ random_state: t.Optional[int] = None,
+ ) -> np.ndarray:
"""Non-Linearity of a linear classifier.
This index is sensitive to how the data from a class are
@@ -1325,9 +1374,9 @@ def ft_l3(cls,
cls_inds = sub_dic["cls_inds"]
if svc_pipeline is None:
- sub_dic = cls.precompute_complexity_svm(max_iter=max_iter,
- y=y,
- random_state=random_state)
+ sub_dic = cls.precompute_complexity_svm(
+ max_iter=max_iter, y=y, random_state=random_state
+ )
svc_pipeline = sub_dic["svc_pipeline"]
@@ -1353,13 +1402,14 @@ def ft_l3(cls,
N=N_subset,
y=y_subset,
cls_inds=cls_inds_subset,
- random_state=random_state)
+ random_state=random_state,
+ )
y_pred = svc_pipeline.predict(N_interpol)
- error = sklearn.metrics.zero_one_loss(y_true=y_interpol,
- y_pred=y_pred,
- normalize=True)
+ error = sklearn.metrics.zero_one_loss(
+ y_true=y_interpol, y_pred=y_pred, normalize=True
+ )
l3[ind] = error
@@ -1369,13 +1419,15 @@ def ft_l3(cls,
return l3
@classmethod
- def ft_n1(cls,
- N: np.ndarray,
- y: np.ndarray,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None) -> float:
+ def ft_n1(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Compute the fraction of borderline points.
This measure is in [0, 1] range.
@@ -1425,15 +1477,17 @@ def ft_n1(cls,
"""
if norm_dist_mat is None:
norm_dist_mat = cls._calc_norm_dist_mat(
- N=N, metric=metric, p=p, N_scaled=N_scaled)
+ N=N, metric=metric, p=p, N_scaled=N_scaled
+ )
# Compute the minimum spanning tree using Kruskal's Minimum
# Spanning Tree algorithm.
# Note: in the paper, the authors used Prim's algorithm.
# Our implementation may change it in a future version due to
# time complexity advantages of Prim's algorithm in this context.
- mst = scipy.sparse.csgraph.minimum_spanning_tree(csgraph=np.triu(
- norm_dist_mat, k=1), overwrite=True)
+ mst = scipy.sparse.csgraph.minimum_spanning_tree(
+ csgraph=np.triu(norm_dist_mat, k=1), overwrite=True
+ )
node_id_i, node_id_j = np.nonzero(mst)
@@ -1442,25 +1496,30 @@ def ft_n1(cls,
# Number of vertices connected with different classes
borderline_inst_num = np.unique(
- np.concatenate([
- node_id_i[which_have_diff_cls],
- node_id_j[which_have_diff_cls],
- ])).size
+ np.concatenate(
+ [
+ node_id_i[which_have_diff_cls],
+ node_id_j[which_have_diff_cls],
+ ]
+ )
+ ).size
n1 = borderline_inst_num / y.size
return n1
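
A condensed restatement of the N1 computation above, for reference (same MST construction and same edge test):

    import numpy as np
    import scipy.sparse.csgraph

    def n1_sketch(norm_dist_mat, y):
        """Fraction of instances touched by an MST edge linking two classes."""
        mst = scipy.sparse.csgraph.minimum_spanning_tree(
            csgraph=np.triu(norm_dist_mat, k=1), overwrite=True
        )
        node_i, node_j = np.nonzero(mst)
        diff_cls = y[node_i] != y[node_j]
        borderline = np.unique(
            np.concatenate([node_i[diff_cls], node_j[diff_cls]])
        )
        return borderline.size / y.size
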
@classmethod
- def ft_n2(cls,
- N: np.ndarray,
- y: np.ndarray,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- class_freqs: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_n2(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ class_freqs: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Ratio of intra and extra class nearest neighbor distance.
This measure computes the ratio of two sums:
@@ -1533,27 +1592,33 @@ def ft_n2(cls,
if norm_dist_mat is None:
norm_dist_mat = cls._calc_norm_dist_mat(
- N=N, metric=metric, p=p, N_scaled=N_scaled)
+ N=N, metric=metric, p=p, N_scaled=N_scaled
+ )
intra_extra = np.zeros(y.size, dtype=float)
cur_ind = 0
for cls_ind, inds_cur_cls in enumerate(cls_inds):
- norm_dist_mat_intracls = (
- norm_dist_mat[inds_cur_cls, :][:, inds_cur_cls])
- norm_dist_mat_intercls = (
- norm_dist_mat[~inds_cur_cls, :][:, inds_cur_cls])
+ norm_dist_mat_intracls = norm_dist_mat[inds_cur_cls, :][
+ :, inds_cur_cls
+ ]
+ norm_dist_mat_intercls = norm_dist_mat[~inds_cur_cls, :][
+ :, inds_cur_cls
+ ]
- norm_dist_mat_intracls[np.diag_indices_from(
- norm_dist_mat_intracls)] = np.inf
+ norm_dist_mat_intracls[
+ np.diag_indices_from(norm_dist_mat_intracls)
+ ] = np.inf
_aux = np.arange(class_freqs[cls_ind])
intra = norm_dist_mat_intracls[
- np.argmin(norm_dist_mat_intracls, axis=0), _aux]
+ np.argmin(norm_dist_mat_intracls, axis=0), _aux
+ ]
extra = norm_dist_mat_intercls[
- np.argmin(norm_dist_mat_intercls, axis=0), _aux]
+ np.argmin(norm_dist_mat_intercls, axis=0), _aux
+ ]
next_ind = cur_ind + class_freqs[cls_ind]
intra_extra[cur_ind:next_ind] = intra / extra
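
A looped (non-vectorized) sketch of the same intra/extra ratio, assuming every class has at least two instances; the final N2 value is obtained from this vector by the summary step:

    import numpy as np

    def intra_extra_sketch(norm_dist_mat, y):
        """Nearest same-class over nearest other-class distance per instance."""
        ratios = np.zeros(y.size, dtype=float)
        for ind in range(y.size):
            same_cls = y == y[ind]
            same_cls[ind] = False  # an instance is not its own neighbor
            intra = norm_dist_mat[ind, same_cls].min()
            extra = norm_dist_mat[ind, y != y[ind]].min()
            ratios[ind] = intra / extra
        return ratios
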
@@ -1566,13 +1631,15 @@ def ft_n2(cls,
return n2
@classmethod
- def ft_n3(cls,
- N: np.ndarray,
- y: np.ndarray,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_n3(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Error rate of the nearest neighbor classifier.
The N3 measure refers to the error rate of a 1-NN classifier
@@ -1625,10 +1692,12 @@ def ft_n3(cls,
"""
if norm_dist_mat is None:
norm_dist_mat = cls._calc_norm_dist_mat(
- N=N, metric=metric, p=p, N_scaled=N_scaled)
+ N=N, metric=metric, p=p, N_scaled=N_scaled
+ )
model = sklearn.neighbors.KNeighborsClassifier(
- n_neighbors=1, metric="precomputed").fit(norm_dist_mat, y)
+ n_neighbors=1, metric="precomputed"
+ ).fit(norm_dist_mat, y)
neighbor_inds = model.kneighbors(return_distance=False).ravel()
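
Functionally, this amounts to a leave-one-out 1-NN pass over the precomputed distances; a minimal sketch (tie handling may differ slightly from scikit-learn's):

    import numpy as np

    def n3_sketch(norm_dist_mat, y):
        """Leave-one-out 1-NN misclassification vector from a distance matrix."""
        dist = norm_dist_mat.astype(float)
        np.fill_diagonal(dist, np.inf)  # an instance cannot be its own neighbor
        nearest = np.argmin(dist, axis=1)
        return y[nearest] != y  # the mean of this vector is the scalar N3
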
@@ -1640,18 +1709,20 @@ def ft_n3(cls,
return misclassifications
@classmethod
- def ft_n4(cls,
- N: np.ndarray,
- y: np.ndarray,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- n_neighbors: int = 1,
- random_state: t.Optional[int] = None,
- cls_inds: t.Optional[np.ndarray] = None,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None,
- orig_dist_mat_min: t.Optional[float] = None,
- orig_dist_mat_ptp: t.Optional[float] = None) -> np.ndarray:
+ def ft_n4(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ n_neighbors: int = 1,
+ random_state: t.Optional[int] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ orig_dist_mat_min: t.Optional[float] = None,
+ orig_dist_mat_ptp: t.Optional[float] = None,
+ ) -> np.ndarray:
"""Compute the non-linearity of the k-NN Classifier.
The average value of this measure is in [0, 1] range.
@@ -1728,30 +1799,30 @@ def ft_n4(cls,
if N_scaled is None:
N_scaled = cls._scale_N(N=N)
- if (norm_dist_mat is None or orig_dist_mat_min is None or
- orig_dist_mat_ptp is None):
- norm_dist_mat, orig_dist_mat_min, orig_dist_mat_ptp = (
- cls._calc_norm_dist_mat(
- N=N,
- metric=metric,
- p=p,
- N_scaled=N_scaled,
- return_scalers=True))
-
- N_interpol, y_interpol = cls._interpolate(N=N_scaled,
- y=y,
- cls_inds=cls_inds,
- random_state=random_state)
-
- knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
- metric="precomputed").fit(
- norm_dist_mat, y)
+ if (
+ norm_dist_mat is None
+ or orig_dist_mat_min is None
+ or orig_dist_mat_ptp is None
+ ):
+ (
+ norm_dist_mat,
+ orig_dist_mat_min,
+ orig_dist_mat_ptp,
+ ) = cls._calc_norm_dist_mat(
+ N=N, metric=metric, p=p, N_scaled=N_scaled, return_scalers=True
+ )
+
+ N_interpol, y_interpol = cls._interpolate(
+ N=N_scaled, y=y, cls_inds=cls_inds, random_state=random_state
+ )
+
+ knn = sklearn.neighbors.KNeighborsClassifier(
+ n_neighbors=n_neighbors, metric="precomputed"
+ ).fit(norm_dist_mat, y)
test_dist = scipy.spatial.distance.cdist(
- N_interpol,
- N_scaled,
- metric=metric,
- p=p)
+ N_interpol, N_scaled, metric=metric, p=p
+ )
# Note: normalizing test data distances with original data
# information in order to provide unbiased predictions (i.e.
@@ -1769,9 +1840,9 @@ def ft_n4(cls,
return misclassifications
@classmethod
- def ft_c1(cls,
- y: np.array,
- class_freqs: t.Optional[np.ndarray] = None) -> float:
+ def ft_c1(
+ cls, y: np.array, class_freqs: t.Optional[np.ndarray] = None
+ ) -> float:
"""Compute the entropy of class proportions.
This measure is in [0, 1] range.
@@ -1805,15 +1876,16 @@ def ft_c1(cls,
# Note: calling 'ft_nre' just to make explicit the link
# between this metafeature and 'C1'.
- c1 = MFEClustering.ft_nre(y=y,
- class_freqs=class_freqs) / np.log(num_class)
+ c1 = MFEClustering.ft_nre(y=y, class_freqs=class_freqs) / np.log(
+ num_class
+ )
return c1
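
Written out, C1 is simply the entropy of the class proportions normalized by its maximum value; a short sketch assuming at least two classes. C2, computed next, complements it with an imbalance ratio over the same class frequencies.

    import numpy as np

    def c1_sketch(y):
        """Normalized Shannon entropy of the class proportions."""
        _, class_freqs = np.unique(y, return_counts=True)
        probs = class_freqs / y.size
        entropy = -np.sum(probs * np.log(probs))
        return entropy / np.log(probs.size)  # 1.0 for perfectly balanced data
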
@classmethod
- def ft_c2(cls,
- y: np.ndarray,
- class_freqs: t.Optional[np.ndarray] = None) -> float:
+ def ft_c2(
+ cls, y: np.ndarray, class_freqs: t.Optional[np.ndarray] = None
+ ) -> float:
"""Compute the imbalance ratio.
This measure is in [0, 1] range.
@@ -1854,16 +1926,18 @@ def ft_c2(cls,
return c2
@classmethod
- def ft_t1(cls,
- N: np.ndarray,
- y: np.ndarray,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- cls_inds: t.Optional[np.ndarray] = None,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None,
- orig_dist_mat_min: t.Optional[float] = None,
- orig_dist_mat_ptp: t.Optional[float] = None) -> np.ndarray:
+ def ft_t1(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ cls_inds: t.Optional[np.ndarray] = None,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ orig_dist_mat_min: t.Optional[float] = None,
+ orig_dist_mat_ptp: t.Optional[float] = None,
+ ) -> np.ndarray:
"""Fraction of hyperspheres covering data.
This measure uses a process that builds hyperspheres centered
@@ -1939,10 +2013,12 @@ def ft_t1(cls,
classification problems. IEEE Transactions on Pattern Analysis and
Machine Intelligence, 24(3):289–300, 2002.
"""
+
def _calc_hyperspheres_radius(
- nearest_enemy_ind: np.ndarray,
- nearest_enemy_dist: np.ndarray) -> np.ndarray:
+ nearest_enemy_ind: np.ndarray, nearest_enemy_dist: np.ndarray
+ ) -> np.ndarray:
"""Get the radius of hyperspheres which cover the given dataset."""
+
def _recurse_radius_calc(ind_inst: int) -> float:
"""Recursively calculate hyperspheres to cover dataset."""
if radius[ind_inst] >= 0.0:
@@ -1955,7 +2031,8 @@ def _recurse_radius_calc(ind_inst: int) -> float:
# each other, thus the hypersphere radius of both is
# half the distance between the instances.
radius[ind_enemy] = radius[ind_inst] = (
- 0.5 * nearest_enemy_dist[ind_inst])
+ 0.5 * nearest_enemy_dist[ind_inst]
+ )
return radius[ind_inst]
# Note: set the current instance radius to '0' before
@@ -1965,14 +2042,15 @@ def _recurse_radius_calc(ind_inst: int) -> float:
radius_enemy = _recurse_radius_calc(ind_inst=ind_enemy)
- radius[ind_inst] = abs(nearest_enemy_dist[ind_inst] -
- radius_enemy)
+ radius[ind_inst] = abs(
+ nearest_enemy_dist[ind_inst] - radius_enemy
+ )
return radius[ind_inst]
- radius = np.full(nearest_enemy_ind.size,
- fill_value=-1.0,
- dtype=float)
+ radius = np.full(
+ nearest_enemy_ind.size, fill_value=-1.0, dtype=float
+ )
for ind in np.arange(radius.size):
if radius[ind] < 0.0:
@@ -1980,20 +2058,26 @@ def _recurse_radius_calc(ind_inst: int) -> float:
return radius
- def _is_hypersphere_in(center_a: np.ndarray, center_b: np.ndarray,
- radius_a: float, radius_b: float) -> bool:
+ def _is_hypersphere_in(
+ center_a: np.ndarray,
+ center_b: np.ndarray,
+ radius_a: float,
+ radius_b: float,
+ ) -> bool:
"""Checks if a hypersphere `a` is in a hypersphere `b`."""
upper_a, lower_a = center_a + radius_a, center_a - radius_a
upper_b, lower_b = center_b + radius_b, center_b - radius_b
for ind in np.arange(center_a.size):
- if ((upper_a[ind] > upper_b[ind])
- or (lower_a[ind] < lower_b[ind])):
+ if (upper_a[ind] > upper_b[ind]) or (
+ lower_a[ind] < lower_b[ind]
+ ):
return False
return True
- def _agglomerate_hyperspheres(centers: np.ndarray,
- radius: np.ndarray) -> np.ndarray:
+ def _agglomerate_hyperspheres(
+ centers: np.ndarray, radius: np.ndarray
+ ) -> np.ndarray:
"""Agglomerate internal hyperspheres into outer hyperspheres.
Returns the number of training instances within each
@@ -2005,13 +2089,16 @@ def _agglomerate_hyperspheres(centers: np.ndarray,
for ind_a, ind_sphere_a in enumerate(sorted_sphere_inds[:-1]):
for ind_sphere_b in sorted_sphere_inds[:ind_a:-1]:
- if _is_hypersphere_in(center_a=centers[ind_sphere_a, :],
- center_b=centers[ind_sphere_b, :],
- radius_a=radius[ind_sphere_a],
- radius_b=radius[ind_sphere_b]):
-
- sphere_inst_num[ind_sphere_b] += (
- sphere_inst_num[ind_sphere_a])
+ if _is_hypersphere_in(
+ center_a=centers[ind_sphere_a, :],
+ center_b=centers[ind_sphere_b, :],
+ radius_a=radius[ind_sphere_a],
+ radius_b=radius[ind_sphere_b],
+ ):
+
+ sphere_inst_num[ind_sphere_b] += sphere_inst_num[
+ ind_sphere_a
+ ]
sphere_inst_num[ind_sphere_a] = 0
break
@@ -2023,35 +2110,37 @@ def _agglomerate_hyperspheres(centers: np.ndarray,
if cls_inds is None:
cls_inds = _utils.calc_cls_inds(y)
- if (norm_dist_mat is None or orig_dist_mat_min is None or
- orig_dist_mat_ptp is None):
+ if (
+ norm_dist_mat is None
+ or orig_dist_mat_min is None
+ or orig_dist_mat_ptp is None
+ ):
orig_dist_mat = cls._calc_norm_dist_mat(
- N=N,
- metric=metric,
- p=p,
- N_scaled=N_scaled,
- normalize=False)
+ N=N, metric=metric, p=p, N_scaled=N_scaled, normalize=False
+ )
else:
orig_dist_mat = (
- norm_dist_mat * orig_dist_mat_ptp + orig_dist_mat_min)
+ norm_dist_mat * orig_dist_mat_ptp + orig_dist_mat_min
+ )
# Note: using the original pairwise distances between instances,
# instead of the normalized ones, to preserve geometrical/spatial
# coherence between the sphere centers, radius, and placements.
# That is why neither the precomputed 'nearest_enemy_dist' nor the
# 'nearest_enemy_ind' values are used here.
- nearest_enemy_dist, nearest_enemy_ind = (cls._calc_nearest_enemies(
- norm_dist_mat=orig_dist_mat,
- cls_inds=cls_inds,
- return_inds=True))
+ nearest_enemy_dist, nearest_enemy_ind = cls._calc_nearest_enemies(
+ norm_dist_mat=orig_dist_mat, cls_inds=cls_inds, return_inds=True
+ )
radius = _calc_hyperspheres_radius(
nearest_enemy_ind=nearest_enemy_ind,
- nearest_enemy_dist=nearest_enemy_dist)
+ nearest_enemy_dist=nearest_enemy_dist,
+ )
- sphere_inst_count = _agglomerate_hyperspheres(centers=N_scaled,
- radius=radius)
+ sphere_inst_count = _agglomerate_hyperspheres(
+ centers=N_scaled, radius=radius
+ )
# Note: in the reference paper, just the fraction of
# remaining hyperspheres to the size of the dataset is
@@ -2098,10 +2187,10 @@ def ft_t2(cls, N: np.ndarray) -> float:
@classmethod
def ft_t3(
- cls,
- N: np.ndarray,
- num_attr_pca: t.Optional[int] = None,
- random_state: t.Optional[int] = None,
+ cls,
+ N: np.ndarray,
+ num_attr_pca: t.Optional[int] = None,
+ random_state: t.Optional[int] = None,
) -> float:
"""Compute the average number of PCA dimensions per points.
@@ -2147,10 +2236,12 @@ def ft_t3(
return num_attr_pca / num_inst
@classmethod
- def ft_t4(cls,
- N: np.ndarray,
- num_attr_pca: t.Optional[int] = None,
- random_state: t.Optional[int] = None) -> float:
+ def ft_t4(
+ cls,
+ N: np.ndarray,
+ num_attr_pca: t.Optional[int] = None,
+ random_state: t.Optional[int] = None,
+ ) -> float:
"""Compute the ratio of the PCA dimension to the original dimension.
The components kept in the PCA dimension explains at least 95% of
@@ -2197,15 +2288,17 @@ def ft_t4(cls,
return num_attr_pca / num_attr
@classmethod
- def ft_lsc(cls,
- N: np.ndarray,
- y: np.ndarray,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- cls_inds: t.Optional[np.ndarray] = None,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None,
- nearest_enemy_dist: t.Optional[np.ndarray] = None) -> float:
+ def ft_lsc(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ cls_inds: t.Optional[np.ndarray] = None,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ nearest_enemy_dist: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Local set average cardinality.
The Local-Set (LS) of an example `x_i` in a dataset ``N`` is
@@ -2279,7 +2372,8 @@ def ft_lsc(cls,
"""
if norm_dist_mat is None:
norm_dist_mat = cls._calc_norm_dist_mat(
- N=N, metric=metric, p=p, N_scaled=N_scaled)
+ N=N, metric=metric, p=p, N_scaled=N_scaled
+ )
if nearest_enemy_dist is None:
if cls_inds is None:
@@ -2288,22 +2382,25 @@ def ft_lsc(cls,
nearest_enemy_dist = cls._calc_nearest_enemies(
norm_dist_mat=norm_dist_mat,
cls_inds=cls_inds,
- return_inds=False)
+ return_inds=False,
+ )
- lsc = (1.0 - np.sum(norm_dist_mat < nearest_enemy_dist) / (y.size**2))
+ lsc = 1.0 - np.sum(norm_dist_mat < nearest_enemy_dist) / (y.size ** 2)
return lsc
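
Restating the formula above: the local set of `x_i` collects the instances closer to it than its nearest enemy, and LSC is one minus the average local-set size over the dataset:

    import numpy as np

    def lsc_sketch(norm_dist_mat, nearest_enemy_dist):
        """Local set average cardinality from precomputed distances."""
        in_local_set = norm_dist_mat < nearest_enemy_dist  # broadcasts per column
        return 1.0 - np.sum(in_local_set) / norm_dist_mat.shape[0] ** 2
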
@classmethod
- def ft_density(cls,
- N: np.ndarray,
- y: np.ndarray,
- radius: t.Union[int, float] = 0.15,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- cls_inds: t.Optional[np.ndarray] = None,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None) -> float:
+ def ft_density(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ radius: t.Union[int, float] = 0.15,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ cls_inds: t.Optional[np.ndarray] = None,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Average density of the network.
This measure considers the number of edges that are retained in the
@@ -2373,7 +2470,8 @@ class to both be considered neighbors of each other. Note that
if norm_dist_mat is None:
norm_dist_mat = cls._calc_norm_dist_mat(
- N=N, metric=metric, p=p, N_scaled=N_scaled)
+ N=N, metric=metric, p=p, N_scaled=N_scaled
+ )
# Note: -y.size to discount self-loops
total_edges = -y.size
@@ -2392,15 +2490,16 @@ class to both be considered neighbors of each other. Note that
@classmethod
def ft_cls_coef(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- radius: t.Union[int, float] = 0.15,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- cls_inds: t.Optional[np.ndarray] = None,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ radius: t.Union[int, float] = 0.15,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ cls_inds: t.Optional[np.ndarray] = None,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Clustering coefficient.
The clustering coefficient of a vertex `v_i` is given by the
@@ -2469,7 +2568,8 @@ class to both be considered neighbors of each other. Note that
if norm_dist_mat is None:
norm_dist_mat = cls._calc_norm_dist_mat(
- N=N, metric=metric, p=p, N_scaled=N_scaled)
+ N=N, metric=metric, p=p, N_scaled=N_scaled
+ )
# Note: -1 to discount self-loops
neighbor_edges = np.full(y.size, fill_value=-1, dtype=int)
@@ -2485,8 +2585,9 @@ class to both be considered neighbors of each other. Note that
# number of the node neighbors, as the paper seems to claim.
total_nodes = np.sum(norm_dist_mat < radius, axis=1)
- cls_coef = 1.0 - 2 * np.mean(neighbor_edges / (1e-8 + total_nodes *
- (total_nodes - 1)))
+ cls_coef = 1.0 - 2 * np.mean(
+ neighbor_edges / (1e-8 + total_nodes * (total_nodes - 1))
+ )
# Note: the R mfe implementation calculates cls_coef as:
# cls_coef = 1 - transitivity(g), like the code below:
@@ -2503,15 +2604,17 @@ class to both be considered neighbors of each other. Note that
return cls_coef
@classmethod
- def ft_hubs(cls,
- N: np.ndarray,
- y: np.ndarray,
- radius: t.Union[int, float] = 0.15,
- metric: str = "minkowski",
- p: t.Union[int, float] = 2,
- cls_inds: t.Optional[np.ndarray] = None,
- N_scaled: t.Optional[np.ndarray] = None,
- norm_dist_mat: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_hubs(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ radius: t.Union[int, float] = 0.15,
+ metric: str = "minkowski",
+ p: t.Union[int, float] = 2,
+ cls_inds: t.Optional[np.ndarray] = None,
+ N_scaled: t.Optional[np.ndarray] = None,
+ norm_dist_mat: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Hub score.
The hub score scores each node by the number of connections it
@@ -2582,7 +2685,8 @@ class to both be considered neighbors of each other. Note that
if norm_dist_mat is None:
norm_dist_mat = cls._calc_norm_dist_mat(
- N=N, metric=metric, p=p, N_scaled=N_scaled)
+ N=N, metric=metric, p=p, N_scaled=N_scaled
+ )
adj_mat = np.zeros_like(norm_dist_mat, dtype=norm_dist_mat.dtype)
diff --git a/pymfe/concept.py b/pymfe/concept.py
index c12b5abb..1994ef54 100644
--- a/pymfe/concept.py
+++ b/pymfe/concept.py
@@ -53,10 +53,9 @@ class MFEConcept:
"""
@classmethod
- def precompute_concept_dist(cls,
- N: np.ndarray,
- concept_dist_metric: str = "euclidean",
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_concept_dist(
+ cls, N: np.ndarray, concept_dist_metric: str = "euclidean", **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute some useful things to support complexity measures.
Parameters
@@ -85,11 +84,13 @@ def precompute_concept_dist(cls,
if N is not None and "concept_distances" not in kwargs:
# 0-1 scaling
N = sklearn.preprocessing.MinMaxScaler(
- feature_range=(0, 1)).fit_transform(N)
+ feature_range=(0, 1)
+ ).fit_transform(N)
# distance matrix
concept_distances = scipy.spatial.distance.cdist(
- N, N, metric=concept_dist_metric)
+ N, N, metric=concept_dist_metric
+ )
precomp_vals["concept_distances"] = concept_distances
@@ -97,13 +98,13 @@ def precompute_concept_dist(cls,
@classmethod
def ft_conceptvar(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- conceptvar_alpha: float = 2.0,
- concept_dist_metric: str = "euclidean",
- concept_minimum: float = 10e-10,
- concept_distances: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ conceptvar_alpha: float = 2.0,
+ concept_dist_metric: str = "euclidean",
+ concept_minimum: float = 10e-10,
+ concept_distances: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute the concept variation that estimates the variability of
class labels among examples.
@@ -158,12 +159,13 @@ class labels among examples.
rep_class_matrix = np.repeat([y], y.shape[0], axis=0)
# check if class is different
- class_diff = np.not_equal(rep_class_matrix.T,
- rep_class_matrix).astype(int)
+ class_diff = np.not_equal(rep_class_matrix.T, rep_class_matrix).astype(
+ int
+ )
- conceptvar_by_example = np.sum(
- weights * class_diff, axis=0) / np.sum(
- weights, axis=0)
+ conceptvar_by_example = np.sum(weights * class_diff, axis=0) / np.sum(
+ weights, axis=0
+ )
# The original meta-feature is the mean of the return.
# It will be done by the summary functions.
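
The weighting scheme itself is defined earlier in the method and is not part of this hunk; the sketch below uses an exponentially decaying weight with an explicit `div` rescaling parameter as an assumption, only to make the weighted-disagreement structure concrete:

    import numpy as np

    def conceptvar_sketch(concept_distances, y, alpha=2.0, div=1.0):
        """Distance-weighted label disagreement per example."""
        weights = np.power(2.0, -alpha * concept_distances / div)
        np.fill_diagonal(weights, 0.0)  # ignore self-comparisons
        class_diff = np.not_equal(y[:, None], y[None, :]).astype(int)
        return np.sum(weights * class_diff, axis=0) / np.sum(weights, axis=0)
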
@@ -171,12 +173,12 @@ class labels among examples.
@classmethod
def ft_wg_dist(
- cls,
- N: np.ndarray,
- wg_dist_alpha: float = 2.0,
- concept_dist_metric: str = "euclidean",
- concept_minimum: float = 10e-10,
- concept_distances: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ wg_dist_alpha: float = 2.0,
+ concept_dist_metric: str = "euclidean",
+ concept_minimum: float = 10e-10,
+ concept_distances: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute the weighted distance, that captures how dense or sparse
is the example distribution.
@@ -226,9 +228,9 @@ def ft_wg_dist(
weights = np.power(2, -wg_dist_alpha * (concept_distances / div))
np.fill_diagonal(weights, 0.0)
- wg_dist_example = np.sum(
- weights * concept_distances, axis=0) / np.sum(
- weights, axis=0)
+ wg_dist_example = np.sum(weights * concept_distances, axis=0) / np.sum(
+ weights, axis=0
+ )
# The original meta-feature is the mean of the return.
# It will be done by summary functions.
@@ -236,12 +238,12 @@ def ft_wg_dist(
@classmethod
def ft_impconceptvar(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- impconceptvar_alpha: float = 1.0,
- concept_dist_metric: str = "euclidean",
- concept_distances: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ impconceptvar_alpha: float = 1.0,
+ concept_dist_metric: str = "euclidean",
+ concept_distances: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute the improved concept variation that estimates the
variability of class labels among examples.
@@ -290,8 +292,9 @@ def ft_impconceptvar(
rep_class_matrix = np.repeat([y], y.shape[0], axis=0)
# check if class is different
- class_diff = np.not_equal(rep_class_matrix.T,
- rep_class_matrix).astype(int)
+ class_diff = np.not_equal(rep_class_matrix.T, rep_class_matrix).astype(
+ int
+ )
impconceptvar_by_example = np.sum(weights * class_diff, axis=0)
@@ -301,11 +304,11 @@ def ft_impconceptvar(
@classmethod
def ft_cohesiveness(
- cls,
- N: np.ndarray,
- cohesiveness_alpha: float = 1.0,
- concept_dist_metric: str = "euclidean",
- concept_distances: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ cohesiveness_alpha: float = 1.0,
+ concept_dist_metric: str = "euclidean",
+ concept_distances: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute the improved version of the weighted distance, that
captures how dense or sparse is the example distribution.
diff --git a/pymfe/general.py b/pymfe/general.py
index b6e6f12c..4ff8793a 100644
--- a/pymfe/general.py
+++ b/pymfe/general.py
@@ -50,9 +50,9 @@ class MFEGeneral:
"""
@classmethod
- def precompute_general_class(cls,
- y: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_general_class(
+ cls, y: t.Optional[np.ndarray] = None, **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute distinct classes and its frequencies from ``y``.
Parameters
@@ -110,8 +110,9 @@ def ft_attr_to_inst(cls, X: np.ndarray) -> int:
return X.shape[1] / X.shape[0]
@classmethod
- def ft_cat_to_num(cls, X: np.ndarray,
- cat_cols: t.Sequence[int]) -> t.Union[int, np.float]:
+ def ft_cat_to_num(
+ cls, X: np.ndarray, cat_cols: t.Sequence[int]
+ ) -> t.Union[int, np.float]:
"""Compute the ratio between the number of categoric and numeric
features.
@@ -150,9 +151,7 @@ def ft_cat_to_num(cls, X: np.ndarray,
@classmethod
def ft_freq_class(
- cls,
- y: np.ndarray,
- class_freqs: t.Optional[np.ndarray] = None,
+ cls, y: np.ndarray, class_freqs: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute the relative frequency of each distinct class.
@@ -253,7 +252,8 @@ def ft_nr_bin(cls, X: np.ndarray) -> int:
Classification, volume 37. Ellis Horwood Upper Saddle River, 1994.
"""
bin_cols = np.apply_along_axis(
- func1d=lambda col: np.unique(col).size == 2, axis=0, arr=X)
+ func1d=lambda col: np.unique(col).size == 2, axis=0, arr=X
+ )
return np.sum(bin_cols)
@@ -282,8 +282,9 @@ def ft_nr_cat(cls, cat_cols: t.Sequence[int]) -> int:
return len(cat_cols)
@classmethod
- def ft_nr_class(cls, y: np.ndarray,
- classes: t.Optional[np.ndarray] = None) -> int:
+ def ft_nr_class(
+ cls, y: np.ndarray, classes: t.Optional[np.ndarray] = None
+ ) -> int:
"""Compute the number of distinct classes.
Parameters
@@ -361,8 +362,9 @@ def ft_nr_num(cls, X: np.ndarray, cat_cols: t.Sequence[int]) -> int:
return X.shape[1] - len(cat_cols)
@classmethod
- def ft_num_to_cat(cls, X: np.ndarray,
- cat_cols: t.Sequence[int]) -> t.Union[int, np.float]:
+ def ft_num_to_cat(
+ cls, X: np.ndarray, cat_cols: t.Sequence[int]
+ ) -> t.Union[int, np.float]:
"""Compute the number of numerical and categorical features.
If the number of categoric features is zero, :obj:`np.nan` is returned
diff --git a/pymfe/info_theory.py b/pymfe/info_theory.py
index 24082129..fd053525 100644
--- a/pymfe/info_theory.py
+++ b/pymfe/info_theory.py
@@ -54,8 +54,9 @@ class MFEInfoTheory:
"""
@classmethod
- def precompute_class_freq(cls, y: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_class_freq(
+ cls, y: t.Optional[np.ndarray] = None, **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute each distinct class (absolute) frequencies.
Parameters
@@ -86,11 +87,13 @@ def precompute_class_freq(cls, y: t.Optional[np.ndarray] = None,
return precomp_vals
@classmethod
- def precompute_entropy(cls,
- y: t.Optional[np.ndarray] = None,
- C: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_entropy(
+ cls,
+ y: t.Optional[np.ndarray] = None,
+ C: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute various values related to Shannon's Entropy.
Parameters
@@ -128,7 +131,8 @@ def precompute_entropy(cls,
if y is not None and "class_ent" not in kwargs:
precomp_vals["class_ent"] = cls.ft_class_ent(
- y, class_freqs=class_freqs)
+ y, class_freqs=class_freqs
+ )
if C is not None and C.size and "attr_ent" not in kwargs:
precomp_vals["attr_ent"] = cls.ft_attr_ent(C)
@@ -136,7 +140,8 @@ def precompute_entropy(cls,
if y is not None and C is not None and C.size:
if "joint_ent" not in kwargs:
precomp_vals["joint_ent"] = np.apply_along_axis(
- func1d=cls._calc_joint_ent, axis=0, arr=C, vec_y=y)
+ func1d=cls._calc_joint_ent, axis=0, arr=C, vec_y=y
+ )
if "mut_inf" not in kwargs:
precomp_vals["mut_inf"] = cls.ft_mut_inf(
@@ -144,14 +149,17 @@ def precompute_entropy(cls,
y=y,
attr_ent=precomp_vals.get("attr_ent"),
class_ent=precomp_vals.get("class_ent"),
- joint_ent=precomp_vals.get("joint_ent"))
+ joint_ent=precomp_vals.get("joint_ent"),
+ )
return precomp_vals
@classmethod
- def _calc_entropy(cls,
- values: t.Union[np.ndarray, t.List],
- value_freqs: t.Optional[np.ndarray] = None) -> float:
+ def _calc_entropy(
+ cls,
+ values: t.Union[np.ndarray, t.List],
+ value_freqs: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Calculate Shannon's entropy within array ``values``.
Check ``ft_attr_ent`` and ``ft_class_ent`` methods for more informa-
@@ -172,24 +180,24 @@ def _calc_entropy(cls,
return scipy.stats.entropy(value_freqs, base=2)
@classmethod
- def _calc_joint_ent(cls,
- vec_x: np.ndarray,
- vec_y: np.ndarray,
- epsilon: float = 1.0e-8) -> float:
+ def _calc_joint_ent(
+ cls, vec_x: np.ndarray, vec_y: np.ndarray, epsilon: float = 1.0e-8
+ ) -> float:
"""Compute joint entropy between ``vec_x`` and ``vec_y``."""
- joint_prob_mat = pd.crosstab(
- vec_y, vec_x, normalize=True).values + epsilon
+ joint_prob_mat = (
+ pd.crosstab(vec_y, vec_x, normalize=True).values + epsilon
+ )
joint_ent = np.sum(
- np.multiply(joint_prob_mat, np.log2(joint_prob_mat)))
+ np.multiply(joint_prob_mat, np.log2(joint_prob_mat))
+ )
return -1.0 * joint_ent
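
For reference, the joint entropy above comes straight from a normalized contingency table, and `ft_mut_inf` later combines it with the marginal entropies via the usual identity MI(x, y) = H(x) + H(y) - H(x, y). A condensed sketch:

    import numpy as np
    import pandas as pd

    def joint_ent_sketch(vec_x, vec_y, epsilon=1.0e-8):
        """Joint Shannon entropy (base 2) of two discrete vectors."""
        joint_prob = pd.crosstab(vec_y, vec_x, normalize=True).values + epsilon
        return -np.sum(joint_prob * np.log2(joint_prob))
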
@classmethod
- def _calc_conc(cls,
- vec_x: np.ndarray,
- vec_y: np.ndarray,
- epsilon: float = 1.0e-8) -> float:
+ def _calc_conc(
+ cls, vec_x: np.ndarray, vec_y: np.ndarray, epsilon: float = 1.0e-8
+ ) -> float:
"""Concentration coefficient between two arrays ``vec_x`` and
``vec_y``.
@@ -198,17 +206,19 @@ def _calc_conc(cls,
pij = pd.crosstab(vec_x, vec_y, normalize=True).values + epsilon
isum = pij.sum(axis=0)
- jsum2 = np.sum(pij.sum(axis=1)**2)
+ jsum2 = np.sum(pij.sum(axis=1) ** 2)
- conc = (np.sum(pij**2 / isum) - jsum2) / (1.0 - jsum2)
+ conc = (np.sum(pij ** 2 / isum) - jsum2) / (1.0 - jsum2)
return conc
@classmethod
- def ft_attr_conc(cls,
- C: np.ndarray,
- max_attr_num: t.Optional[int] = 12,
- random_state: t.Optional[int] = None) -> np.ndarray:
+ def ft_attr_conc(
+ cls,
+ C: np.ndarray,
+ max_attr_num: t.Optional[int] = 12,
+ random_state: t.Optional[int] = None,
+ ) -> np.ndarray:
"""Compute concentration coef. of each pair of distinct attributes.
Parameters
@@ -249,21 +259,24 @@ def ft_attr_conc(cls,
np.random.seed(random_state)
col_inds = np.random.choice(
- col_inds, size=max_attr_num, replace=False)
+ col_inds, size=max_attr_num, replace=False
+ )
col_permutations = itertools.permutations(col_inds, 2)
- attr_conc = np.array([
- cls._calc_conc(C[:, ind_attr_a], C[:, ind_attr_b])
- for ind_attr_a, ind_attr_b in col_permutations
- ])
+ attr_conc = np.array(
+ [
+ cls._calc_conc(C[:, ind_attr_a], C[:, ind_attr_b])
+ for ind_attr_a, ind_attr_b in col_permutations
+ ]
+ )
return attr_conc
@classmethod
- def ft_attr_ent(cls,
- C: np.ndarray,
- attr_ent: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_attr_ent(
+ cls, C: np.ndarray, attr_ent: t.Optional[np.ndarray] = None
+ ) -> np.ndarray:
"""Compute Shannon's entropy for each predictive attribute.
The Shannon's Entropy H of a vector x is defined as:
@@ -324,13 +337,16 @@ def ft_class_conc(cls, C: np.ndarray, y: np.ndarray) -> np.ndarray:
on Artificial Intelligence Tools, 10(4):525–554, 2001.
"""
return np.apply_along_axis(
- func1d=cls._calc_conc, axis=0, arr=C, vec_y=y)
+ func1d=cls._calc_conc, axis=0, arr=C, vec_y=y
+ )
@classmethod
- def ft_class_ent(cls,
- y: np.ndarray,
- class_ent: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None) -> float:
+ def ft_class_ent(
+ cls,
+ y: np.ndarray,
+ class_ent: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Compute target attribute Shannon's entropy.
The Shannon's Entropy H of a vector y is defined as:
@@ -373,12 +389,14 @@ def ft_class_ent(cls,
return cls._calc_entropy(y, value_freqs=class_freqs)
@classmethod
- def ft_eq_num_attr(cls,
- C: np.ndarray,
- y: np.ndarray,
- class_ent: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None,
- mut_inf: t.Optional[np.ndarray] = None) -> float:
+ def ft_eq_num_attr(
+ cls,
+ C: np.ndarray,
+ y: np.ndarray,
+ class_ent: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ mut_inf: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Compute the number of attributes equivalent for a predictive task.
The attribute equivalence E is defined as:
@@ -436,10 +454,12 @@ def ft_eq_num_attr(cls,
return num_col * class_ent / np.sum(mut_inf)
@classmethod
- def ft_joint_ent(cls,
- C: np.ndarray,
- y: np.ndarray,
- joint_ent: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_joint_ent(
+ cls,
+ C: np.ndarray,
+ y: np.ndarray,
+ joint_ent: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the joint entropy between each attribute and class.
The Joint Entropy H between a predictive attribute x and target
@@ -482,19 +502,22 @@ def ft_joint_ent(cls,
"""
if joint_ent is None:
joint_ent = np.apply_along_axis(
- func1d=cls._calc_joint_ent, axis=0, arr=C, vec_y=y)
+ func1d=cls._calc_joint_ent, axis=0, arr=C, vec_y=y
+ )
return joint_ent
@classmethod
- def ft_mut_inf(cls,
- C: np.ndarray,
- y: np.ndarray,
- mut_inf: t.Optional[np.ndarray] = None,
- attr_ent: t.Optional[np.ndarray] = None,
- class_ent: t.Optional[float] = None,
- joint_ent: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_mut_inf(
+ cls,
+ C: np.ndarray,
+ y: np.ndarray,
+ mut_inf: t.Optional[np.ndarray] = None,
+ attr_ent: t.Optional[np.ndarray] = None,
+ class_ent: t.Optional[float] = None,
+ joint_ent: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the mutual information between each attribute and target.
The mutual Information MI between an independent attribute `x` and
@@ -565,11 +588,13 @@ def ft_mut_inf(cls,
return attr_ent + class_ent - joint_ent
@classmethod
- def ft_ns_ratio(cls,
- C: np.ndarray,
- y: np.ndarray,
- attr_ent: t.Optional[np.ndarray] = None,
- mut_inf: t.Optional[np.ndarray] = None) -> float:
+ def ft_ns_ratio(
+ cls,
+ C: np.ndarray,
+ y: np.ndarray,
+ attr_ent: t.Optional[np.ndarray] = None,
+ mut_inf: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Compute the noisiness of attributes.
Let ``y`` be a target attribute and `x` one predictive attribute in
diff --git a/pymfe/itemset.py b/pymfe/itemset.py
index 33eb854d..548e333d 100644
--- a/pymfe/itemset.py
+++ b/pymfe/itemset.py
@@ -51,8 +51,9 @@ class MFEItemset:
"""
@classmethod
- def precompute_binary_matrix(cls, C: t.Optional[np.ndarray],
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_binary_matrix(
+ cls, C: t.Optional[np.ndarray], **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute the binary representation of attributes.
Parameters
@@ -96,9 +97,7 @@ def _matrix_to_binary(cls, C: np.array) -> t.List[np.ndarray]:
@classmethod
def ft_two_itemset(
- cls,
- C: np.ndarray,
- itemset_binary_matrix: t.List[np.ndarray] = None,
+ cls, C: np.ndarray, itemset_binary_matrix: t.List[np.ndarray] = None,
) -> np.ndarray:
"""Compute the two itemset meta-feature.
@@ -147,10 +146,9 @@ def ft_two_itemset(
return twoitem_by_attr
@classmethod
- def ft_one_itemset(cls,
- C: np.ndarray,
- itemset_binary_matrix: t.List[np.ndarray] = None
- ) -> np.ndarray:
+ def ft_one_itemset(
+ cls, C: np.ndarray, itemset_binary_matrix: t.List[np.ndarray] = None
+ ) -> np.ndarray:
"""Compute the one itemset meta-feature.
The one itemset is the individual frequency of each attribute
diff --git a/pymfe/landmarking.py b/pymfe/landmarking.py
index 4418f7c0..5b328ed4 100644
--- a/pymfe/landmarking.py
+++ b/pymfe/landmarking.py
@@ -54,11 +54,13 @@ class MFELandmarking:
"""
@classmethod
- def precompute_landmarking_sample(cls,
- N: np.ndarray,
- lm_sample_frac: float,
- random_state: t.Optional[int] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_landmarking_sample(
+ cls,
+ N: np.ndarray,
+ lm_sample_frac: float,
+ random_state: t.Optional[int] = None,
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute subsampling landmarking subsample indices.
Parameters
@@ -94,20 +96,22 @@ def precompute_landmarking_sample(cls,
precomp_vals["sample_inds"] = cls._get_sample_inds(
num_inst=num_inst,
lm_sample_frac=lm_sample_frac,
- random_state=random_state)
+ random_state=random_state,
+ )
return precomp_vals
@classmethod
def precompute_landmarking_kfolds(
- cls,
- N: np.ndarray,
- y: t.Optional[np.ndarray] = None,
- num_cv_folds: int = 10,
- shuffle_cv_folds: t.Optional[bool] = False,
- random_state: t.Optional[int] = None,
- lm_sample_frac: float = 1.0,
- **kwargs) -> t.Dict[str, t.Any]:
+ cls,
+ N: np.ndarray,
+ y: t.Optional[np.ndarray] = None,
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: t.Optional[bool] = False,
+ random_state: t.Optional[int] = None,
+ lm_sample_frac: float = 1.0,
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute k-fold cross validation related values.
Parameters
@@ -155,10 +159,14 @@ def precompute_landmarking_kfolds(
precomp_vals["skf"] = sklearn.model_selection.StratifiedKFold(
n_splits=num_cv_folds,
shuffle=shuffle_cv_folds,
- random_state=random_state if shuffle_cv_folds else None)
-
- if (not shuffle_cv_folds or random_state is not None and
- "cv_folds_imp_rank" not in kwargs):
+ random_state=random_state if shuffle_cv_folds else None,
+ )
+
+ if (
+ not shuffle_cv_folds
+ or random_state is not None
+ and "cv_folds_imp_rank" not in kwargs
+ ):
skf = precomp_vals.get("skf", kwargs.get("skf"))
sample_inds = kwargs.get("sample_inds")
@@ -167,40 +175,50 @@ def precompute_landmarking_kfolds(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
-
- attr_fold_imp = np.array([
- cls._rank_feat_importance(
- N=N[inds_train, :],
- y=y[inds_train],
- random_state=random_state)
- for inds_train, inds_test in skf.split(N, y)
- ], dtype=int)
+ sample_inds=sample_inds,
+ )
+
+ attr_fold_imp = np.array(
+ [
+ cls._rank_feat_importance(
+ N=N[inds_train, :],
+ y=y[inds_train],
+ random_state=random_state,
+ )
+ for inds_train, inds_test in skf.split(N, y)
+ ],
+ dtype=int,
+ )
precomp_vals["cv_folds_imp_rank"] = attr_fold_imp
return precomp_vals
@classmethod
- def _get_sample_inds(cls, num_inst: int, lm_sample_frac: float,
- random_state: t.Optional[int]) -> np.ndarray:
+ def _get_sample_inds(
+ cls,
+ num_inst: int,
+ lm_sample_frac: float,
+ random_state: t.Optional[int],
+ ) -> np.ndarray:
"""Sample indices to calculate subsampling landmarking metafeatures."""
if random_state is not None:
np.random.seed(random_state)
sample_inds = np.random.choice(
- a=num_inst, size=int(lm_sample_frac * num_inst), replace=False)
+ a=num_inst, size=int(lm_sample_frac * num_inst), replace=False
+ )
return sample_inds
@classmethod
def _sample_data(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- lm_sample_frac: float,
- random_state: t.Optional[int] = None,
- sample_inds: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ lm_sample_frac: float,
+ random_state: t.Optional[int] = None,
+ sample_inds: t.Optional[np.ndarray] = None,
) -> t.Tuple[np.ndarray, np.ndarray]:
"""Select ``lm_sample_frac`` percent of data from ``N`` and ``y``."""
if lm_sample_frac >= 1.0 and sample_inds is None:
@@ -212,18 +230,19 @@ def _sample_data(
sample_inds = cls._get_sample_inds(
num_inst=num_inst,
lm_sample_frac=lm_sample_frac,
- random_state=random_state)
+ random_state=random_state,
+ )
return N[sample_inds, :], y[sample_inds]
@classmethod
def _rank_feat_importance(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- lm_sample_frac: float = 1.0,
- sample_inds: t.Optional[np.ndarray] = None,
- random_state: t.Optional[int] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ lm_sample_frac: float = 1.0,
+ sample_inds: t.Optional[np.ndarray] = None,
+ random_state: t.Optional[int] = None,
) -> np.ndarray:
"""Rank the feature importances of a DT model.
@@ -261,25 +280,28 @@ def _rank_feat_importance(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
+ sample_inds=sample_inds,
+ )
clf = sklearn.tree.DecisionTreeClassifier(
- random_state=random_state).fit(N, y)
+ random_state=random_state
+ ).fit(N, y)
return np.argsort(clf.feature_importances_)
@classmethod
def ft_best_node(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
- skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
- num_cv_folds: int = 10,
- shuffle_cv_folds: bool = False,
- lm_sample_frac: float = 1.0,
- sample_inds: t.Optional[np.ndarray] = None,
- random_state: t.Optional[int] = None) -> np.ndarray:
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
+ skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: bool = False,
+ lm_sample_frac: float = 1.0,
+ sample_inds: t.Optional[np.ndarray] = None,
+ random_state: t.Optional[int] = None,
+ ) -> np.ndarray:
"""Performance of a the best single decision tree node.
Construct a single decision tree node model induced by the most
@@ -344,16 +366,19 @@ def ft_best_node(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
+ sample_inds=sample_inds,
+ )
if skf is None:
skf = sklearn.model_selection.StratifiedKFold(
n_splits=num_cv_folds,
shuffle=shuffle_cv_folds,
- random_state=random_state if shuffle_cv_folds else None)
+ random_state=random_state if shuffle_cv_folds else None,
+ )
model = sklearn.tree.DecisionTreeClassifier(
- max_depth=1, random_state=random_state)
+ max_depth=1, random_state=random_state
+ )
res = np.zeros(skf.n_splits, dtype=float)
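
The per-fold loop of `ft_best_node` is not included in this hunk; the sketch below fits the stump on all attributes and scores it with accuracy, a simplification of the best-node idea (the helper name and the scorer are assumptions):

    import numpy as np
    import sklearn.metrics
    import sklearn.model_selection
    import sklearn.tree

    def best_node_sketch(N, y, num_cv_folds=10, random_state=None):
        """Cross-validated accuracy of a depth-1 decision tree (a stump)."""
        skf = sklearn.model_selection.StratifiedKFold(n_splits=num_cv_folds)
        model = sklearn.tree.DecisionTreeClassifier(
            max_depth=1, random_state=random_state
        )
        res = np.zeros(skf.n_splits, dtype=float)
        for fold, (inds_train, inds_test) in enumerate(skf.split(N, y)):
            model.fit(N[inds_train, :], y[inds_train])
            y_pred = model.predict(N[inds_test, :])
            res[fold] = sklearn.metrics.accuracy_score(y[inds_test], y_pred)
        return res
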
@@ -370,16 +395,17 @@ def ft_best_node(
@classmethod
def ft_random_node(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
- skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
- num_cv_folds: int = 10,
- shuffle_cv_folds: bool = False,
- lm_sample_frac: float = 1.0,
- sample_inds: t.Optional[np.ndarray] = None,
- random_state: t.Optional[int] = None) -> np.ndarray:
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
+ skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: bool = False,
+ lm_sample_frac: float = 1.0,
+ sample_inds: t.Optional[np.ndarray] = None,
+ random_state: t.Optional[int] = None,
+ ) -> np.ndarray:
"""Performance of the single decision tree node model induced by a
random attribute.
@@ -442,13 +468,15 @@ def ft_random_node(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
+ sample_inds=sample_inds,
+ )
if skf is None:
skf = sklearn.model_selection.StratifiedKFold(
n_splits=num_cv_folds,
shuffle=shuffle_cv_folds,
- random_state=random_state if shuffle_cv_folds else None)
+ random_state=random_state if shuffle_cv_folds else None,
+ )
if random_state is not None:
np.random.seed(random_state)
@@ -456,7 +484,8 @@ def ft_random_node(
rand_ind_attr = np.random.randint(0, N.shape[1], size=1)
model = sklearn.tree.DecisionTreeClassifier(
- max_depth=1, random_state=random_state)
+ max_depth=1, random_state=random_state
+ )
res = np.zeros(skf.n_splits, dtype=float)
@@ -473,17 +502,17 @@ def ft_random_node(
@classmethod
def ft_worst_node(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
- skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
- num_cv_folds: int = 10,
- shuffle_cv_folds: bool = False,
- lm_sample_frac: float = 1.0,
- sample_inds: t.Optional[np.ndarray] = None,
- random_state: t.Optional[int] = None,
- cv_folds_imp_rank: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
+ skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: bool = False,
+ lm_sample_frac: float = 1.0,
+ sample_inds: t.Optional[np.ndarray] = None,
+ random_state: t.Optional[int] = None,
+ cv_folds_imp_rank: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Performance of the single decision tree node model induced by the
worst informative attribute.
@@ -555,16 +584,19 @@ def ft_worst_node(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
+ sample_inds=sample_inds,
+ )
if skf is None:
skf = sklearn.model_selection.StratifiedKFold(
n_splits=num_cv_folds,
shuffle=shuffle_cv_folds,
- random_state=random_state if shuffle_cv_folds else None)
+ random_state=random_state if shuffle_cv_folds else None,
+ )
model = sklearn.tree.DecisionTreeClassifier(
- max_depth=1, random_state=random_state)
+ max_depth=1, random_state=random_state
+ )
res = np.zeros(skf.n_splits, dtype=float)
@@ -576,7 +608,8 @@ def ft_worst_node(
imp_rank = cls._rank_feat_importance(
N=N[inds_train, :],
y=y[inds_train],
- random_state=random_state)
+ random_state=random_state,
+ )
X_train = N[inds_train, imp_rank[0], np.newaxis]
X_test = N[inds_test, imp_rank[0], np.newaxis]
@@ -590,16 +623,16 @@ def ft_worst_node(
@classmethod
def ft_linear_discr(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
- skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
- num_cv_folds: int = 10,
- shuffle_cv_folds: bool = False,
- lm_sample_frac: float = 1.0,
- sample_inds: t.Optional[np.ndarray] = None,
- random_state: t.Optional[int] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
+ skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: bool = False,
+ lm_sample_frac: float = 1.0,
+ sample_inds: t.Optional[np.ndarray] = None,
+ random_state: t.Optional[int] = None,
) -> np.ndarray:
"""Performance of the Linear Discriminant classifier.
@@ -665,13 +698,15 @@ def ft_linear_discr(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
+ sample_inds=sample_inds,
+ )
if skf is None:
skf = sklearn.model_selection.StratifiedKFold(
n_splits=num_cv_folds,
shuffle=shuffle_cv_folds,
- random_state=random_state if shuffle_cv_folds else None)
+ random_state=random_state if shuffle_cv_folds else None,
+ )
model = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()
@@ -690,16 +725,16 @@ def ft_linear_discr(
@classmethod
def ft_naive_bayes(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
- skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
- num_cv_folds: int = 10,
- shuffle_cv_folds: bool = False,
- lm_sample_frac: float = 1.0,
- sample_inds: t.Optional[np.ndarray] = None,
- random_state: t.Optional[int] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
+ skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: bool = False,
+ lm_sample_frac: float = 1.0,
+ sample_inds: t.Optional[np.ndarray] = None,
+ random_state: t.Optional[int] = None,
) -> np.ndarray:
"""Performance of the Naive Bayes classifier.
@@ -765,13 +800,15 @@ def ft_naive_bayes(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
+ sample_inds=sample_inds,
+ )
if skf is None:
skf = sklearn.model_selection.StratifiedKFold(
n_splits=num_cv_folds,
shuffle=shuffle_cv_folds,
- random_state=random_state if shuffle_cv_folds else None)
+ random_state=random_state if shuffle_cv_folds else None,
+ )
model = sklearn.naive_bayes.GaussianNB()
@@ -790,16 +827,16 @@ def ft_naive_bayes(
@classmethod
def ft_one_nn(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
- skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
- num_cv_folds: int = 10,
- shuffle_cv_folds: bool = False,
- lm_sample_frac: float = 1.0,
- sample_inds: t.Optional[np.ndarray] = None,
- random_state: t.Optional[int] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
+ skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: bool = False,
+ lm_sample_frac: float = 1.0,
+ sample_inds: t.Optional[np.ndarray] = None,
+ random_state: t.Optional[int] = None,
) -> np.ndarray:
"""Performance of the 1-Nearest Neighbor classifier.
@@ -861,20 +898,23 @@ def ft_one_nn(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
+ sample_inds=sample_inds,
+ )
if skf is None:
skf = sklearn.model_selection.StratifiedKFold(
n_splits=num_cv_folds,
shuffle=shuffle_cv_folds,
- random_state=random_state if shuffle_cv_folds else None)
+ random_state=random_state if shuffle_cv_folds else None,
+ )
model = sklearn.neighbors.KNeighborsClassifier(
n_neighbors=1,
algorithm="auto",
weights="uniform",
p=2,
- metric="minkowski")
+ metric="minkowski",
+ )
res = np.zeros(skf.n_splits, dtype=float)
@@ -891,17 +931,17 @@ def ft_one_nn(
@classmethod
def ft_elite_nn(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
- skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
- num_cv_folds: int = 10,
- shuffle_cv_folds: bool = False,
- lm_sample_frac: float = 1.0,
- sample_inds: t.Optional[np.ndarray] = None,
- random_state: t.Optional[int] = None,
- cv_folds_imp_rank: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ score: t.Callable[[np.ndarray, np.ndarray], np.ndarray],
+ skf: t.Optional[sklearn.model_selection.StratifiedKFold] = None,
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: bool = False,
+ lm_sample_frac: float = 1.0,
+ sample_inds: t.Optional[np.ndarray] = None,
+ random_state: t.Optional[int] = None,
+ cv_folds_imp_rank: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Performance of Elite Nearest Neighbor.
@@ -974,13 +1014,15 @@ def ft_elite_nn(
y=y,
lm_sample_frac=lm_sample_frac,
random_state=random_state,
- sample_inds=sample_inds)
+ sample_inds=sample_inds,
+ )
if skf is None:
skf = sklearn.model_selection.StratifiedKFold(
n_splits=num_cv_folds,
shuffle=shuffle_cv_folds,
- random_state=random_state if shuffle_cv_folds else None)
+ random_state=random_state if shuffle_cv_folds else None,
+ )
model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1)
@@ -994,7 +1036,8 @@ def ft_elite_nn(
imp_rank = cls._rank_feat_importance(
N=N[inds_train, :],
y=y[inds_train],
- random_state=random_state)
+ random_state=random_state,
+ )
X_train = N[inds_train, imp_rank[-1], np.newaxis]
X_test = N[inds_test, imp_rank[-1], np.newaxis]
diff --git a/pymfe/mfe.py b/pymfe/mfe.py
index 6d16fdc1..4a4eaedf 100644
--- a/pymfe/mfe.py
+++ b/pymfe/mfe.py
@@ -12,8 +12,9 @@
import pymfe._internal as _internal
-_TypeSeqExt = t.Sequence[t.Tuple[str, t.Callable, t.Tuple[str, ...],
- t.Tuple[str, ...]]]
+_TypeSeqExt = t.Sequence[
+ t.Tuple[str, t.Callable, t.Tuple[str, ...], t.Tuple[str, ...]]
+]
"""Type annotation for a sequence of TypeExtMtdTuple objects."""
@@ -41,21 +42,24 @@ class MFE:
Tuple object which contains summary function names for feature
summarization.
"""
- groups_alias = [('default', _internal.DEFAULT_GROUP)]
-
- def __init__(self,
- groups: t.Union[str, t.Iterable[str]] = "default",
- features: t.Union[str, t.Iterable[str]] = "all",
- summary: t.Union[str, t.Iterable[str]] = ("mean", "sd"),
- measure_time: t.Optional[str] = None,
- wildcard: str = "all",
- score: str = "accuracy",
- num_cv_folds: int = 10,
- shuffle_cv_folds: bool = False,
- lm_sample_frac: float = 1.0,
- hypparam_model_dt: t.Optional[t.Dict[str, t.Any]] = None,
- suppress_warnings: bool = False,
- random_state: t.Optional[int] = None) -> None:
+
+ groups_alias = [("default", _internal.DEFAULT_GROUP)]
+
+ def __init__(
+ self,
+ groups: t.Union[str, t.Iterable[str]] = "default",
+ features: t.Union[str, t.Iterable[str]] = "all",
+ summary: t.Union[str, t.Iterable[str]] = ("mean", "sd"),
+ measure_time: t.Optional[str] = None,
+ wildcard: str = "all",
+ score: str = "accuracy",
+ num_cv_folds: int = 10,
+ shuffle_cv_folds: bool = False,
+ lm_sample_frac: float = 1.0,
+ hypparam_model_dt: t.Optional[t.Dict[str, t.Any]] = None,
+ suppress_warnings: bool = False,
+ random_state: t.Optional[int] = None,
+ ) -> None:
"""Provides easy access for metafeature extraction from datasets.
It is expected that the user first calls the ``fit`` method after instantiation
@@ -244,13 +248,16 @@ def __init__(self,
"""
self.groups = _internal.process_generic_set(
- values=groups, group_name="groups",
+ values=groups,
+ group_name="groups",
groups_alias=MFE.groups_alias,
- wildcard=wildcard) # type: t.Tuple[str, ...]
+ wildcard=wildcard,
+ ) # type: t.Tuple[str, ...]
- self.groups, self.inserted_group_dep = (
- _internal.solve_group_dependencies(
- groups=self.groups))
+ (
+ self.groups,
+ self.inserted_group_dep,
+ ) = _internal.solve_group_dependencies(groups=self.groups)
proc_feat = _internal.process_features(
features=features,
@@ -263,12 +270,12 @@ def __init__(self,
del proc_feat
self.summary, self._metadata_mtd_sm = _internal.process_summary(
- summary,
- wildcard=wildcard) # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt]
+ summary, wildcard=wildcard
+ ) # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt]
self.timeopt = _internal.process_generic_option(
- value=measure_time, group_name="timeopt",
- allow_none=True) # type: t.Optional[str]
+ value=measure_time, group_name="timeopt", allow_none=True
+ ) # type: t.Optional[str]
self.X = None # type: t.Optional[np.ndarray]
self.y = None # type: t.Optional[np.ndarray]
@@ -298,7 +305,8 @@ def __init__(self,
else:
raise ValueError(
'Invalid "random_state" argument ({0}). '
- 'Expecting None or an integer.'.format(random_state))
+ "Expecting None or an integer.".format(random_state)
+ )
self.shuffle_cv_folds = shuffle_cv_folds
@@ -306,24 +314,27 @@ def __init__(self,
self.num_cv_folds = num_cv_folds
else:
- raise ValueError('Invalid "num_cv_folds" argument ({0}). '
- 'Expecting an integer.'.format(random_state))
+ raise ValueError(
+ 'Invalid "num_cv_folds" argument ({0}). '
+ "Expecting an integer.".format(random_state)
+ )
if isinstance(lm_sample_frac, int):
lm_sample_frac = float(lm_sample_frac)
- if isinstance(lm_sample_frac, float)\
- and 0.5 <= lm_sample_frac <= 1.0:
+ if isinstance(lm_sample_frac, float) and 0.5 <= lm_sample_frac <= 1.0:
self.lm_sample_frac = lm_sample_frac
else:
- raise ValueError('Invalid "lm_sample_frac" argument ({0}). '
- 'Expecting an float [0.5, 1].'
- .format(random_state))
+ raise ValueError(
+ 'Invalid "lm_sample_frac" argument ({0}). '
+ "Expecting an float [0.5, 1].".format(random_state)
+ )
self.score = _internal.check_score(score, self.groups)
- self.hypparam_model_dt = (hypparam_model_dt.copy()
- if hypparam_model_dt else None)
+ self.hypparam_model_dt = (
+ hypparam_model_dt.copy() if hypparam_model_dt else None
+ )
# """Total time elapsed for precomputations."""
self.time_precomp = -1.0
@@ -335,14 +346,15 @@ def __init__(self,
self.time_total = -1.0
def _call_summary_methods(
- self,
- feature_values: t.Sequence[_internal.TypeNumeric],
- feature_name: str,
- verbose: int = 0,
- suppress_warnings: bool = False,
- **kwargs
- ) -> t.Tuple[t.List[str], t.List[t.Union[float, t.Sequence]], t.
- List[float]]:
+ self,
+ feature_values: t.Sequence[_internal.TypeNumeric],
+ feature_name: str,
+ verbose: int = 0,
+ suppress_warnings: bool = False,
+ **kwargs,
+ ) -> t.Tuple[
+ t.List[str], t.List[t.Union[float, t.Sequence]], t.List[float]
+ ]:
"""Invoke summary functions loaded in the model on given feature
values.
@@ -412,8 +424,10 @@ def _call_summary_methods(
"function...".format(
_internal.VERBOSE_BLOCK_MID_SYMBOL,
feature_name,
- sm_mtd_name),
- end=" ")
+ sm_mtd_name,
+ ),
+ end=" ",
+ )
sm_mtd_args_pack = _internal.build_mtd_kwargs(
mtd_name=sm_mtd_name,
@@ -421,30 +435,37 @@ def _call_summary_methods(
mtd_mandatory=set(),
user_custom_args=kwargs.get(sm_mtd_name),
inner_custom_args=self._custom_args_sum,
- suppress_warnings=suppress_warnings)
+ suppress_warnings=suppress_warnings,
+ )
summarized_val, time_sm = _internal.timeit(
- _internal.summarize, feature_values, sm_mtd_callable,
- sm_mtd_args_pack)
+ _internal.summarize,
+ feature_values,
+ sm_mtd_callable,
+ sm_mtd_args_pack,
+ )
if not suppress_warnings:
_internal.check_summary_warnings(
value=summarized_val,
name_feature=feature_name,
- name_summary=sm_mtd_name)
+ name_summary=sm_mtd_name,
+ )
if isinstance(summarized_val, np.ndarray):
summarized_val = summarized_val.flatten().tolist()
- if (isinstance(summarized_val, collections.Sequence)
- and not isinstance(summarized_val, str)):
+ if isinstance(
+ summarized_val, collections.Sequence
+ ) and not isinstance(summarized_val, str):
metafeat_vals += summarized_val
metafeat_names += [
".".join((feature_name, sm_mtd_name, str(i)))
for i in range(len(summarized_val))
]
- metafeat_times += ([time_sm] + (
- (len(summarized_val) - 1) * [0.0]))
+ metafeat_times += [time_sm] + (
+ (len(summarized_val) - 1) * [0.0]
+ )
else:
metafeat_vals.append(summarized_val)
@@ -455,20 +476,23 @@ def _call_summary_methods(
print("Done.")
if verbose >= 2:
- print(" {} Done summarizing '{}' feature.".format(
- _internal.VERBOSE_BLOCK_END_SYMBOL,
- feature_name))
+ print(
+ " {} Done summarizing '{}' feature.".format(
+ _internal.VERBOSE_BLOCK_END_SYMBOL, feature_name
+ )
+ )
return metafeat_names, metafeat_vals, metafeat_times
def _call_feature_methods(
- self,
- verbose: int = 0,
- # enable_parallel: bool = False,
- suppress_warnings: bool = False,
- **kwargs) -> t.Tuple[t.List[str],
- t.List[t.Union[int, float, t.Sequence]],
- t.List[float]]:
+ self,
+ verbose: int = 0,
+ # enable_parallel: bool = False,
+ suppress_warnings: bool = False,
+ **kwargs,
+ ) -> t.Tuple[
+ t.List[str], t.List[t.Union[int, float, t.Sequence]], t.List[float]
+ ]:
"""Invoke feature methods loaded in the model and gather results.
The returned values are already summarized if needed.
@@ -482,11 +506,16 @@ def _call_feature_methods(
skipped_count = 0
for ind, cur_metadata in enumerate(self._metadata_mtd_ft, 1):
- (ft_mtd_name, ft_mtd_callable,
- ft_mtd_args, ft_mandatory) = cur_metadata
+ (
+ ft_mtd_name,
+ ft_mtd_callable,
+ ft_mtd_args,
+ ft_mandatory,
+ ) = cur_metadata
ft_name_without_prefix = _internal.remove_prefix(
- value=ft_mtd_name, prefix=_internal.MTF_PREFIX)
+ value=ft_mtd_name, prefix=_internal.MTF_PREFIX
+ )
try:
ft_mtd_args_pack = _internal.build_mtd_kwargs(
@@ -496,28 +525,40 @@ def _call_feature_methods(
user_custom_args=kwargs.get(ft_name_without_prefix),
inner_custom_args=self._custom_args_ft,
precomp_args=self._precomp_args_ft,
- suppress_warnings=suppress_warnings)
+ suppress_warnings=suppress_warnings,
+ )
except RuntimeError:
# Not all method's mandatory arguments were satisfied.
# Skip the current method.
if verbose >= 2:
- print("\nSkipped '{}' ({} of {}).".format(
- ft_mtd_name, ind, len(self._metadata_mtd_ft)))
+ print(
+ "\nSkipped '{}' ({} of {}).".format(
+ ft_mtd_name, ind, len(self._metadata_mtd_ft)
+ )
+ )
skipped_count += 1
continue
if verbose >= 2:
- print("\nExtracting '{}' feature ({} of {})..."
- .format(ft_mtd_name, ind, len(self._metadata_mtd_ft)))
+ print(
+ "\nExtracting '{}' feature ({} of {})...".format(
+ ft_mtd_name, ind, len(self._metadata_mtd_ft)
+ )
+ )
features, time_ft = _internal.timeit(
- _internal.get_feat_value, ft_mtd_name, ft_mtd_args_pack,
- ft_mtd_callable, suppress_warnings)
+ _internal.get_feat_value,
+ ft_mtd_name,
+ ft_mtd_args_pack,
+ ft_mtd_callable,
+ suppress_warnings,
+ )
- ft_has_length = isinstance(features,
- (np.ndarray, collections.Sequence))
+ ft_has_length = isinstance(
+ features, (np.ndarray, collections.Sequence)
+ )
if ft_has_length and self._timeopt_type_is_avg():
time_ft /= len(features)
@@ -528,7 +569,8 @@ def _call_feature_methods(
feature_name=ft_name_without_prefix,
verbose=verbose,
suppress_warnings=suppress_warnings,
- **kwargs)
+ **kwargs,
+ )
summarized_names, summarized_vals, times_sm = sm_ret
@@ -546,27 +588,35 @@ def _call_feature_methods(
cur_progress=100 * ind / len(self._metadata_mtd_ft),
cur_mtf_name=ft_mtd_name,
item_type="feature",
- verbose=verbose)
+ verbose=verbose,
+ )
if verbose == 1:
_t_num_cols, _ = shutil.get_terminal_size()
- print("\r{:<{fill}}".format(
- "Process of metafeature extraction finished.",
- fill=_t_num_cols))
+ print(
+ "\r{:<{fill}}".format(
+ "Process of metafeature extraction finished.",
+ fill=_t_num_cols,
+ )
+ )
if verbose >= 2 and skipped_count > 0:
- print("\nNote: skipped a total of {} metafeatures, "
- "out of {} ({:.2f}%).".format(
- skipped_count,
- len(self._metadata_mtd_ft),
- 100 * skipped_count / len(self._metadata_mtd_ft)))
+ print(
+ "\nNote: skipped a total of {} metafeatures, "
+ "out of {} ({:.2f}%).".format(
+ skipped_count,
+ len(self._metadata_mtd_ft),
+ 100 * skipped_count / len(self._metadata_mtd_ft),
+ )
+ )
return metafeat_names, metafeat_vals, metafeat_times
def _fill_col_ind_by_type(
- self,
- cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
- check_bool: bool = True) -> None:
+ self,
+ cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
+ check_bool: bool = True,
+ ) -> None:
"""Select ``X`` column indexes based in its data type.
The indexes for numerical and categorical attributes are kept,
@@ -606,7 +656,8 @@ def _fill_col_ind_by_type(
axis=0,
arr=self.X,
check_subtype=True,
- ))
+ )
+ )
if check_bool:
categorical_cols |= np.apply_along_axis(
@@ -615,34 +666,40 @@ def _fill_col_ind_by_type(
arr=self.X,
)
- elif (isinstance(cat_cols, (np.ndarray, collections.Iterable))
- and not isinstance(cat_cols, str)):
+ elif isinstance(
+ cat_cols, (np.ndarray, collections.Iterable)
+ ) and not isinstance(cat_cols, str):
# and all(isinstance(x, int) for x in cat_cols)):
categorical_cols = [i in cat_cols for i in range(self.X.shape[1])]
else:
raise ValueError(
'Invalid "cat_cols" argument ({0}). '
- 'Expecting "auto" or an integer Iterable.'.format(cat_cols))
+ 'Expecting "auto" or an integer Iterable.'.format(cat_cols)
+ )
categorical_cols = np.array(categorical_cols)
self._attr_indexes_num = tuple(
- np.where(np.logical_not(categorical_cols))[0])
+ np.where(np.logical_not(categorical_cols))[0]
+ )
self._attr_indexes_cat = tuple(np.where(categorical_cols)[0])
def _timeopt_type_is_avg(self) -> bool:
"""Checks if user selected time option is an ``average`` type."""
- return (isinstance(self.timeopt, str)
- and self.timeopt.startswith(_internal.TIMEOPT_AVG_PREFIX))
+ return isinstance(self.timeopt, str) and self.timeopt.startswith(
+ _internal.TIMEOPT_AVG_PREFIX
+ )
def _timeopt_include_summary(self) -> bool:
"""Checks if user selected time option includes ``summary`` time."""
- return (isinstance(self.timeopt, str)
- and self.timeopt.endswith(_internal.TIMEOPT_SUMMARY_SUFFIX))
+ return isinstance(self.timeopt, str) and self.timeopt.endswith(
+ _internal.TIMEOPT_SUMMARY_SUFFIX
+ )
- def _combine_time(self, time_ft: float,
- times_sm: t.List[float]) -> t.List[float]:
+ def _combine_time(
+ self, time_ft: float, times_sm: t.List[float]
+ ) -> t.List[float]:
"""Treat time from feature extraction and summarization based in
``timeopt``.
@@ -679,8 +736,9 @@ def _combine_time(self, time_ft: float,
return total_time.tolist()
- def _set_data_categoric(self, transform_num: bool,
- num_bins: bool = None) -> np.ndarray:
+ def _set_data_categoric(
+ self, transform_num: bool, num_bins: bool = None
+ ) -> np.ndarray:
"""Returns categorical data from the fitted dataset.
Parameters
@@ -714,20 +772,25 @@ def _set_data_categoric(self, transform_num: bool,
method.
"""
if self.X is None:
- raise TypeError("It is necessary to fit valid data into the "
- 'model before setting up categoric data. ("X" '
- 'attribute is "NoneType").')
+ raise TypeError(
+ "It is necessary to fit valid data into the "
+ 'model before setting up categoric data. ("X" '
+ 'attribute is "NoneType").'
+ )
if self._attr_indexes_cat is None:
- raise TypeError("No information about indexes of categoric "
- "attributes. Please be sure to call method "
- '"_fill_col_ind_by_type" before this method.')
+ raise TypeError(
+ "No information about indexes of categoric "
+ "attributes. Please be sure to call method "
+ '"_fill_col_ind_by_type" before this method.'
+ )
data_cat = self.X[:, self._attr_indexes_cat]
if transform_num:
data_num_disc = _internal.transform_num(
- self.X[:, self._attr_indexes_num], num_bins=num_bins)
+ self.X[:, self._attr_indexes_num], num_bins=num_bins
+ )
if data_num_disc is not None and data_num_disc.size > 0:
data_cat = np.hstack((data_cat, data_num_disc))
@@ -735,10 +798,11 @@ def _set_data_categoric(self, transform_num: bool,
return data_cat
def _set_data_numeric(
- self,
- transform_cat: str = None,
- rescale: t.Optional[str] = None,
- rescale_args: t.Optional[t.Dict[str, t.Any]] = None) -> np.ndarray:
+ self,
+ transform_cat: str = None,
+ rescale: t.Optional[str] = None,
+ rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
+ ) -> np.ndarray:
"""Returns numeric data from the fitted dataset.
Parameters
@@ -783,62 +847,75 @@ def _set_data_numeric(
`one-hot-full`}.
"""
if self.X is None:
- raise TypeError("It is necessary to fit valid data into the "
- 'model before setting up numeric data. ("X" '
- 'attribute is "NoneType").')
+ raise TypeError(
+ "It is necessary to fit valid data into the "
+ 'model before setting up numeric data. ("X" '
+ 'attribute is "NoneType").'
+ )
if self._attr_indexes_num is None:
- raise TypeError("No information about indexes of numeric "
- "attributes. Please be sure to call method "
- '"_fill_col_ind_by_type" before this method.')
-
- if (transform_cat is not None and
- transform_cat not in _internal.VALID_TRANSFORM_CAT):
- raise ValueError("Invalid 'transform_cat' value ('{}'). Must be "
- "a value in {}.".format(
- transform_cat, _internal.VALID_TRANSFORM_CAT))
+ raise TypeError(
+ "No information about indexes of numeric "
+ "attributes. Please be sure to call method "
+ '"_fill_col_ind_by_type" before this method.'
+ )
+
+ if (
+ transform_cat is not None
+ and transform_cat not in _internal.VALID_TRANSFORM_CAT
+ ):
+ raise ValueError(
+ "Invalid 'transform_cat' value ('{}'). Must be "
+ "a value in {}.".format(
+ transform_cat, _internal.VALID_TRANSFORM_CAT
+ )
+ )
data_num = self.X[:, self._attr_indexes_num]
if transform_cat:
if transform_cat == "gray":
cat_dummies = _internal.transform_cat_gray(
- self.X[:, self._attr_indexes_cat])
+ self.X[:, self._attr_indexes_cat]
+ )
else:
_use_all_ohe_columns = transform_cat == "one-hot-full"
cat_dummies = _internal.transform_cat_onehot(
self.X[:, self._attr_indexes_cat],
- use_all_columns=_use_all_ohe_columns)
+ use_all_columns=_use_all_ohe_columns,
+ )
if cat_dummies is not None and cat_dummies.size > 0:
data_num = np.hstack((data_num, cat_dummies)).astype(float)
if rescale:
data_num = _internal.rescale_data(
- data=data_num, option=rescale, args=rescale_args)
+ data=data_num, option=rescale, args=rescale_args
+ )
if data_num.dtype != float:
data_num = data_num.astype(float)
return data_num
- def fit(self,
- X: t.Sequence,
- y: t.Optional[t.Sequence] = None,
- transform_num: bool = True,
- transform_cat: str = "gray",
- rescale: t.Optional[str] = None,
- rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
- cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
- check_bool: bool = False,
- precomp_groups: t.Optional[str] = "all",
- wildcard: str = "all",
- suppress_warnings: bool = False,
- verbose: int = 0,
- **kwargs,
- ) -> "MFE":
+ def fit(
+ self,
+ X: t.Sequence,
+ y: t.Optional[t.Sequence] = None,
+ transform_num: bool = True,
+ transform_cat: str = "gray",
+ rescale: t.Optional[str] = None,
+ rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
+ cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
+ check_bool: bool = False,
+ precomp_groups: t.Optional[str] = "all",
+ wildcard: str = "all",
+ suppress_warnings: bool = False,
+ verbose: int = 0,
+ **kwargs,
+ ) -> "MFE":
"""Fits dataset into an MFE model.
Parameters
@@ -992,33 +1069,40 @@ def fit(self,
print("Done.")
rescale = _internal.process_generic_option(
- value=rescale, group_name="rescale", allow_none=True)
+ value=rescale, group_name="rescale", allow_none=True
+ )
self._fill_col_ind_by_type(cat_cols=cat_cols, check_bool=check_bool)
if verbose >= 2:
- print("Started data transformation process.",
- " {} Encoding numerical data into discrete values... "
- .format(_internal.VERBOSE_BLOCK_END_SYMBOL),
- sep="\n", end="")
+ print(
+ "Started data transformation process.",
+ " {} Encoding numerical data into discrete values... ".format(
+ _internal.VERBOSE_BLOCK_END_SYMBOL
+ ),
+ sep="\n",
+ end="",
+ )
data_cat = self._set_data_categoric(transform_num=transform_num)
if verbose >= 2:
- print("Done.",
- " {} Enconding categorical data into numerical values... "
- .format(_internal.VERBOSE_BLOCK_END_SYMBOL),
- sep="\n", end="")
+ print(
+ "Done.",
+ " {} Enconding categorical data into numerical values... "
+ .format(_internal.VERBOSE_BLOCK_END_SYMBOL),
+ sep="\n",
+ end="",
+ )
data_num = self._set_data_numeric(
transform_cat=transform_cat,
rescale=rescale,
- rescale_args=rescale_args)
+ rescale_args=rescale_args,
+ )
if verbose >= 2:
- print("Done.",
- "Finished data transformation process.",
- sep="\n")
+ print("Done.", "Finished data transformation process.", sep="\n")
# Custom arguments for metafeature extraction methods
self._custom_args_ft = {
@@ -1049,19 +1133,23 @@ def fit(self,
wildcard=wildcard,
suppress_warnings=suppress_warnings,
verbose=verbose,
- **{**self._custom_args_ft, **kwargs})
+ **{**self._custom_args_ft, **kwargs},
+ )
self.time_precomp = time.time() - _time_start
if verbose >= 2:
- print("\nFinished precomputation process.",
- " {} Total time elapsed: {:.8f} seconds".format(
- _internal.VERBOSE_BLOCK_MID_SYMBOL,
- self.time_precomp),
- " {} Got a total of {} precomputed values.".format(
- _internal.VERBOSE_BLOCK_END_SYMBOL,
- len(self._precomp_args_ft)),
- sep="\n")
+ print(
+ "\nFinished precomputation process.",
+ " {} Total time elapsed: {:.8f} seconds".format(
+ _internal.VERBOSE_BLOCK_MID_SYMBOL, self.time_precomp
+ ),
+ " {} Got a total of {} precomputed values.".format(
+ _internal.VERBOSE_BLOCK_END_SYMBOL,
+ len(self._precomp_args_ft),
+ ),
+ sep="\n",
+ )
# Custom arguments for postprocessing methods
self._postprocess_args_ft = {
@@ -1076,11 +1164,12 @@ def fit(self,
return self
def extract(
- self,
- verbose: int = 0,
- enable_parallel: bool = False,
- suppress_warnings: bool = False,
- **kwargs) -> t.Tuple[t.Sequence, ...]:
+ self,
+ verbose: int = 0,
+ enable_parallel: bool = False,
+ suppress_warnings: bool = False,
+ **kwargs,
+ ) -> t.Tuple[t.Sequence, ...]:
"""Extracts metafeatures from the previously fitted dataset.
Parameters
@@ -1171,11 +1260,13 @@ def extract(
"""
if self.X is None:
- raise TypeError("Fitted data not found. Call "
- '"fit" method before "extract".')
+ raise TypeError(
+ 'Fitted data not found. Call "fit" method before "extract".'
+ )
- if (not isinstance(self.X, np.ndarray)
- or not isinstance(self.y, np.ndarray)):
+ if not isinstance(self.X, np.ndarray) or not isinstance(
+ self.y, np.ndarray
+ ):
self.X, self.y = _internal.check_data(self.X, self.y)
if verbose >= 2:
@@ -1187,14 +1278,16 @@ def extract(
verbose=verbose,
enable_parallel=enable_parallel,
suppress_warnings=suppress_warnings,
- **kwargs) # type: t.Tuple[t.List, ...]
+ **kwargs,
+ ) # type: t.Tuple[t.List, ...]
_internal.post_processing(
results=results,
groups=self.groups,
suppress_warnings=suppress_warnings,
**self._postprocess_args_ft,
- **kwargs)
+ **kwargs,
+ )
self.time_extract = time.time() - _time_start
self.time_total = self.time_extract + self.time_precomp
@@ -1202,8 +1295,10 @@ def extract(
if results and results[0]:
# Sort results by metafeature name
results = tuple(
- map(list, zip(*sorted(zip(*results),
- key=lambda item: item[0]))))
+ map(
+ list, zip(*sorted(zip(*results), key=lambda item: item[0]))
+ )
+ )
res_names, res_vals, res_times = results
@@ -1213,40 +1308,50 @@ def extract(
"\nMetafeature extraction process done.",
" {} Time elapsed in total (precomputations + extraction): "
"{:.8f} seconds.".format(
- _internal.VERBOSE_BLOCK_MID_SYMBOL, self.time_total),
+ _internal.VERBOSE_BLOCK_MID_SYMBOL, self.time_total
+ ),
" {} Time elapsed for extractions: {:.8f} seconds ({:.2f}% "
"from the total).".format(
_internal.VERBOSE_BLOCK_MID_SYMBOL,
self.time_extract,
- _ext_t_pct),
+ _ext_t_pct,
+ ),
" {} Time elapsed for precomputations: {:.8f} seconds "
"({:.2f}% from the total).".format(
_internal.VERBOSE_BLOCK_MID_SYMBOL,
- self.time_precomp, 100 - _ext_t_pct),
+ self.time_precomp,
+ 100 - _ext_t_pct,
+ ),
" {} Total of {} values obtained.".format(
- _internal.VERBOSE_BLOCK_END_SYMBOL, len(res_vals)),
- sep="\n")
+ _internal.VERBOSE_BLOCK_END_SYMBOL, len(res_vals)
+ ),
+ sep="\n",
+ )
if self.timeopt:
return res_names, res_vals, res_times
return res_names, res_vals
- def _extract_with_bootstrap(self,
- extractor: "MFE",
- sample_num: int,
- arguments_fit: t.Dict[str, t.Any],
- arguments_extract: t.Dict[str, t.Any],
- verbose: int = 0) -> t.Tuple[np.ndarray, ...]:
+ def _extract_with_bootstrap(
+ self,
+ extractor: "MFE",
+ sample_num: int,
+ arguments_fit: t.Dict[str, t.Any],
+ arguments_extract: t.Dict[str, t.Any],
+ verbose: int = 0,
+ ) -> t.Tuple[np.ndarray, ...]:
"""Extract metafeatures using bootstrapping."""
if self.X is None:
- raise TypeError("Fitted data not found. Please call 'fit' "
- "method first.")
+ raise TypeError(
+ "Fitted data not found. Please call 'fit' method first."
+ )
def _handle_extract_ret(
- res: t.Tuple[np.ndarray, ...],
- args: t.Tuple[t.Sequence, ...],
- it_num: int) -> t.Tuple[np.ndarray, ...]:
+ res: t.Tuple[np.ndarray, ...],
+ args: t.Tuple[t.Sequence, ...],
+ it_num: int,
+ ) -> t.Tuple[np.ndarray, ...]:
"""Handle each .extraction method return value."""
mtf_names, mtf_vals, mtf_time = res
@@ -1265,7 +1370,8 @@ def _handle_extract_ret(
else:
mtf_names = np.asarray(cur_mtf_names, dtype=str)
mtf_vals = np.zeros(
- (len(cur_mtf_vals), sample_num), dtype=float)
+ (len(cur_mtf_vals), sample_num), dtype=float
+ )
mtf_vals[:, 0] = cur_mtf_vals
if self.timeopt:
@@ -1283,14 +1389,19 @@ def _handle_extract_ret(
bootstrap_random_state = (
self.random_state
if self.random_state is not None
- else np.random.randint(2 ** 20 - 1))
+ else np.random.randint(2 ** 20 - 1)
+ )
for it_num in np.arange(sample_num):
if verbose > 0:
- print("Extracting from sample dataset {} of {} ({:.2f}%)..."
- .format(1 + it_num,
- sample_num,
- 100.0 * (1 + it_num) / sample_num))
+ print(
+ "Extracting from sample dataset {} of {} ({:.2f}%)..."
+ .format(
+ 1 + it_num,
+ sample_num,
+ 100.0 * (1 + it_num) / sample_num,
+ )
+ )
# Note: setting random state to prevent same sample indices due
# to random states set during fit/extraction
@@ -1298,8 +1409,8 @@ def _handle_extract_ret(
bootstrap_random_state += 1
sample_inds = np.random.randint(
- self.X.shape[0],
- size=self.X.shape[0])
+ self.X.shape[0], size=self.X.shape[0]
+ )
X_sample = self.X[sample_inds, :]
y_sample = self.y[sample_inds] if self.y is not None else None
@@ -1309,22 +1420,26 @@ def _handle_extract_ret(
res = _handle_extract_ret(
res=res,
args=extractor.extract(**arguments_extract),
- it_num=it_num)
+ it_num=it_num,
+ )
if verbose > 0:
- print("Done extracting from sample dataset {}.\n"
- .format(1 + it_num))
+ print(
+ "Done extracting from sample dataset {}.\n".format(
+ 1 + it_num
+ )
+ )
return res
def extract_with_confidence(
- self,
- sample_num: int = 128,
- confidence: t.Union[float, t.Sequence[float]] = 0.95,
- return_avg_val: bool = True,
- arguments_fit: t.Optional[t.Dict[str, t.Any]] = None,
- arguments_extract: t.Optional[t.Dict[str, t.Any]] = None,
- verbose: int = 0,
+ self,
+ sample_num: int = 128,
+ confidence: t.Union[float, t.Sequence[float]] = 0.95,
+ return_avg_val: bool = True,
+ arguments_fit: t.Optional[t.Dict[str, t.Any]] = None,
+ arguments_extract: t.Optional[t.Dict[str, t.Any]] = None,
+ verbose: int = 0,
) -> t.Tuple[t.List, ...]:
"""Extract metafeatures with confidence intervals.
@@ -1415,8 +1530,11 @@ def extract_with_confidence(
_confidence = np.asarray(confidence, dtype=float)
if np.any(np.logical_or(_confidence <= 0.0, _confidence >= 1.0)):
- raise ValueError("'_confidence' must be in (0.0, 1.0) range (got "
- "{}.)".format(_confidence))
+ raise ValueError(
+ "'_confidence' must be in (0.0, 1.0) range (got {}.)".format(
+ _confidence
+ )
+ )
if self.random_state is not None:
np.random.seed(self.random_state)
@@ -1434,27 +1552,35 @@ def extract_with_confidence(
if verbose > 0:
print("Started metafeature extract with _confidence interval.")
print("Random seed:")
- print(" {} For extractor model: {}{}".format(
- _internal.VERBOSE_BLOCK_END_SYMBOL,
- _random_state,
- "" if self.random_state else " (chosen by default)"))
+ print(
+ " {} For extractor model: {}{}".format(
+ _internal.VERBOSE_BLOCK_END_SYMBOL,
+ _random_state,
+ "" if self.random_state else " (chosen by default)",
+ )
+ )
- print(" {} For bootstrapping: {}".format(
- _internal.VERBOSE_BLOCK_END_SYMBOL, self.random_state))
+ print(
+ " {} For bootstrapping: {}".format(
+ _internal.VERBOSE_BLOCK_END_SYMBOL, self.random_state
+ )
+ )
extractor = MFE(
features=self.features,
groups=self.groups,
summary=self.summary,
measure_time=self.timeopt,
- random_state=_random_state)
+ random_state=_random_state,
+ )
mtf_names, mtf_vals, mtf_time = self._extract_with_bootstrap(
extractor=extractor,
sample_num=sample_num,
verbose=verbose,
arguments_fit=arguments_fit,
- arguments_extract=arguments_extract)
+ arguments_extract=arguments_extract,
+ )
if verbose > 0:
print("Finished metafeature extract with _confidence interval.")
@@ -1479,11 +1605,11 @@ def extract_with_confidence(
return mtf_names, mtf_vals, mtf_conf_int
def extract_from_model(
- self,
- model: t.Any,
- arguments_fit: t.Optional[t.Dict[str, t.Any]] = None,
- arguments_extract: t.Optional[t.Dict[str, t.Any]] = None,
- verbose: int = 0,
+ self,
+ model: t.Any,
+ arguments_fit: t.Optional[t.Dict[str, t.Any]] = None,
+ arguments_extract: t.Optional[t.Dict[str, t.Any]] = None,
+ verbose: int = 0,
) -> t.Tuple[t.Sequence, ...]:
"""Extract model-based metafeatures from given model.
@@ -1528,27 +1654,33 @@ def extract_from_model(
affected by this method by any means.
"""
if "model-based" not in self.groups:
- raise ValueError("The current MFE model does not have the "
- "'model-based' metafeature group configured ("
- "found groups {}.) Please include it in the "
- "MFE model creation before using 'extract_from"
- "_model' method.".format(self.groups))
+ raise ValueError(
+ "The current MFE model does not have the "
+ "'model-based' metafeature group configured ("
+ "found groups {}.) Please include it in the "
+ "MFE model creation before using 'extract_from"
+ "_model' method.".format(self.groups)
+ )
model_argument = _internal.type_translator.get(type(model), None)
if model_argument is None:
- raise TypeError("'model' from type '{}' not supported. Currently "
- "only supporting classes: {}.".format(
- type(model),
- list(_internal.type_translator.keys())))
+ raise TypeError(
+ "'model' from type '{}' not supported. Currently "
+ "only supporting classes: {}.".format(
+ type(model), list(_internal.type_translator.keys())
+ )
+ )
try:
sklearn.utils.validation.check_is_fitted(model)
except sklearn.exceptions.NotFittedError:
- raise RuntimeError("Given 'model' does not have any fitted data. "
- "Please use its 'fit' method before using the "
- "model with 'extract_from_model' method.")
+ raise RuntimeError(
+ "Given 'model' does not have any fitted data. "
+ "Please use its 'fit' method before using the "
+ "model with 'extract_from_model' method."
+ )
if arguments_fit is None:
arguments_fit = {}
@@ -1557,22 +1689,30 @@ def extract_from_model(
arguments_extract = {}
if model_argument in arguments_fit:
- raise KeyError("Illegal argument '{}' in 'arguments_fit' (used "
- "internally by '.extract_from_model' method.)"
- "".format(model_argument))
+ raise KeyError(
+ "Illegal argument '{}' in 'arguments_fit' (used "
+ "internally by '.extract_from_model' method.)"
+ "".format(model_argument)
+ )
_fts = set(self.features).intersection(
- MFE.valid_metafeatures(groups="model-based"))
+ MFE.valid_metafeatures(groups="model-based")
+ )
if verbose >= 1:
print("Selected features from 'model-based' group:")
for ft_name in _fts:
- print(" {} {}".format(
- _internal.VERBOSE_BLOCK_END_SYMBOL, ft_name))
+ print(
+ " {} {}".format(
+ _internal.VERBOSE_BLOCK_END_SYMBOL, ft_name
+ )
+ )
- print("Total of {} 'model-based' metafeature method candidates."
- .format(len(_fts)))
+ print(
+ "Total of {} 'model-based' metafeature method candidates."
+ .format(len(_fts))
+ )
print("Started extraction from model.")
@@ -1581,11 +1721,14 @@ def extract_from_model(
groups="model-based",
summary=self.summary,
measure_time=self.timeopt,
- random_state=self.random_state).fit(
- X=[1],
- y=None, transform_num=False,
- **{model_argument: model},
- **arguments_fit)
+ random_state=self.random_state,
+ ).fit(
+ X=[1],
+ y=None,
+ transform_num=False,
+ **{model_argument: model},
+ **arguments_fit,
+ )
res = _extractor.extract(**arguments_extract)
@@ -1621,9 +1764,9 @@ def valid_summary(cls) -> t.Tuple[str, ...]:
return _internal.VALID_SUMMARY
@classmethod
- def _check_groups_type(cls,
- groups: t.Optional[t.Union[str, t.Iterable[str]]]
- ) -> t.Set[str]:
+ def _check_groups_type(
+ cls, groups: t.Optional[t.Union[str, t.Iterable[str]]]
+ ) -> t.Set[str]:
"""Cast ``groups`` to a tuple of valid metafeature group names."""
if groups is None:
return set(_internal.VALID_GROUPS)
@@ -1633,20 +1776,16 @@ def _check_groups_type(cls,
return set(groups)
@classmethod
- def _filter_groups(cls,
- groups: t.Set[str]
- ) -> t.Set[str]:
+ def _filter_groups(cls, groups: t.Set[str]) -> t.Set[str]:
"""Filter given groups by the available metafeature group names."""
filtered_group_set = {
- group for group in groups
- if group in _internal.VALID_GROUPS
+ group for group in groups if group in _internal.VALID_GROUPS
}
return filtered_group_set
@classmethod
def valid_metafeatures(
- cls,
- groups: t.Optional[t.Union[str, t.Iterable[str]]] = None,
+ cls, groups: t.Optional[t.Union[str, t.Iterable[str]]] = None,
) -> t.Tuple[str, ...]:
"""Return a tuple with all metafeatures related to given ``groups``.
@@ -1679,20 +1818,20 @@ def valid_metafeatures(
for group in groups.union(deps):
class_ind = _internal.VALID_GROUPS.index(group)
- mtf_names += (
- _internal.get_prefixed_mtds_from_class(
- class_obj=_internal.VALID_MFECLASSES[class_ind],
- prefix=_internal.MTF_PREFIX,
- only_name=True,
- prefix_removal=True))
+ mtf_names += _internal.get_prefixed_mtds_from_class(
+ class_obj=_internal.VALID_MFECLASSES[class_ind],
+ prefix=_internal.MTF_PREFIX,
+ only_name=True,
+ prefix_removal=True,
+ )
return tuple(mtf_names)
@classmethod
def parse_by_group(
- cls,
- groups: t.Union[t.Sequence[str], str],
- extracted_results: t.Tuple[t.Sequence, ...],
+ cls,
+ groups: t.Union[t.Sequence[str], str],
+ extracted_results: t.Tuple[t.Sequence, ...],
) -> t.Tuple[t.List, ...]:
"""Parse the result of ``extract`` for given metafeature ``groups``.
@@ -1728,19 +1867,19 @@ def parse_by_group(
selected_indexes = _internal.select_results_by_classes(
mtf_names=extracted_results[0],
class_names=groups,
- include_dependencies=True)
+ include_dependencies=True,
+ )
filtered_res = (
- [seq[ind] for ind in selected_indexes]
- for seq in extracted_results
+ [seq[ind] for ind in selected_indexes] for seq in extracted_results
)
return tuple(filtered_res)
@staticmethod
- def _parse_description(docstring: str,
- include_references: bool = False
- ) -> t.Tuple[str, str]:
+ def _parse_description(
+ docstring: str, include_references: bool = False
+ ) -> t.Tuple[str, str]:
"""Parse the docstring to get initial description and reference.
Parameters
@@ -1774,19 +1913,20 @@ def _parse_description(docstring: str,
if len(split) >= 2:
del split[0]
for spl in split:
- reference_description += "[" + " ".join(
- spl.split()) + "\n"
+ reference_description += (
+ "[" + " ".join(spl.split()) + "\n"
+ )
return (initial_description, reference_description)
@classmethod
def metafeature_description(
- cls,
- groups: t.Optional[t.Union[str, t.Iterable[str]]] = None,
- sort_by_group: bool = False,
- sort_by_mtf: bool = False,
- print_table: bool = True,
- include_references: bool = False
+ cls,
+ groups: t.Optional[t.Union[str, t.Iterable[str]]] = None,
+ sort_by_group: bool = False,
+ sort_by_mtf: bool = False,
+ print_table: bool = True,
+ include_references: bool = False,
) -> t.Optional[t.Tuple[t.List[t.List[str]], str]]:
"""Print a table with groups, metafeatures and description.
@@ -1848,16 +1988,17 @@ def metafeature_description(
for group in groups.union(deps):
class_ind = _internal.VALID_GROUPS.index(group)
- mtf_names = ( # tipe: t.Collection[t.Tuple[str, t.Callable]]
- _internal.get_prefixed_mtds_from_class(
- class_obj=_internal.VALID_MFECLASSES[class_ind],
- prefix=_internal.MTF_PREFIX,
- only_name=False,
- prefix_removal=True))
+ mtf_names = _internal.get_prefixed_mtds_from_class( # type: ignore
+ class_obj=_internal.VALID_MFECLASSES[class_ind],
+ prefix=_internal.MTF_PREFIX,
+ only_name=False,
+ prefix_removal=True,
+ )
for name, method in mtf_names: # type: ignore
ini_desc, ref_desc = MFE._parse_description(
- str(method.__doc__), include_references)
+ str(method.__doc__), include_references
+ )
mtf_desc_line = [group, name, ini_desc]
mtf_desc.append(mtf_desc_line)
diff --git a/pymfe/model_based.py b/pymfe/model_based.py
index b9b33ac6..b005c640 100644
--- a/pymfe/model_based.py
+++ b/pymfe/model_based.py
@@ -50,15 +50,17 @@ class MFEModelBased:
computed in module ``statistical`` can freely be used for any
precomputation or feature extraction method of module ``landmarking``).
"""
+
@classmethod
def precompute_model_based_class(
- cls,
- N: np.ndarray,
- y: t.Optional[np.ndarray] = None,
- dt_model: t.Optional[sklearn.tree.DecisionTreeClassifier] = None,
- random_state: t.Optional[int] = None,
- hypparam_model_dt: t.Optional[t.Dict[str, t.Any]] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ cls,
+ N: np.ndarray,
+ y: t.Optional[np.ndarray] = None,
+ dt_model: t.Optional[sklearn.tree.DecisionTreeClassifier] = None,
+ random_state: t.Optional[int] = None,
+ hypparam_model_dt: t.Optional[t.Dict[str, t.Any]] = None,
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute the DT Model and some information related to it.
Parameters
@@ -104,25 +106,32 @@ def precompute_model_based_class(
if dt_model is not None:
precomp_vals["dt_model"] = dt_model
- if ((dt_model is not None or
- (N is not None and N.size > 0 and y is not None)) and not {
- "dt_model", "dt_info_table", "dt_node_depths", "leaf_nodes",
- "non_leaf_nodes"
- }.issubset(kwargs)):
+ if (
+ dt_model is not None
+ or (N is not None and N.size > 0 and y is not None)
+ ) and not {
+ "dt_model",
+ "dt_info_table",
+ "dt_node_depths",
+ "leaf_nodes",
+ "non_leaf_nodes",
+ }.issubset(
+ kwargs
+ ):
if hypparam_model_dt is None:
hypparam_model_dt = {}
if dt_model is None:
- dt_model = cls._fit_dt_model(N=N,
- y=y,
- random_state=random_state,
- **hypparam_model_dt)
+ dt_model = cls._fit_dt_model(
+ N=N, y=y, random_state=random_state, **hypparam_model_dt
+ )
leaf_nodes = cls._get_leaf_node_array(dt_model)
nonleaf_nodes = cls._get_nonleaf_node_array(dt_model)
- dt_info_table = cls.extract_table(dt_model=dt_model,
- leaf_nodes=leaf_nodes)
+ dt_info_table = cls.extract_table(
+ dt_model=dt_model, leaf_nodes=leaf_nodes
+ )
dt_node_depths = cls._calc_dt_node_depths(dt_model)
precomp_vals["leaf_nodes"] = np.flatnonzero(leaf_nodes)
@@ -133,37 +142,45 @@ def precompute_model_based_class(
precomp_vals["tree_shape"] = cls.ft_tree_shape(
dt_model=dt_model,
leaf_nodes=leaf_nodes,
- dt_node_depths=dt_node_depths)
+ dt_node_depths=dt_node_depths,
+ )
return precomp_vals
@staticmethod
def _get_leaf_node_array(
- dt_model: sklearn.tree.DecisionTreeClassifier) -> np.ndarray:
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ ) -> np.ndarray:
"""Get a boolean array with value True if a node is a leaf."""
return dt_model.tree_.feature < 0
@staticmethod
def _get_nonleaf_node_array(
- dt_model: sklearn.tree.DecisionTreeClassifier) -> np.ndarray:
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ ) -> np.ndarray:
"""Get a boolean array with value True if a node is non-leaf."""
return dt_model.tree_.feature >= 0
@classmethod
- def _fit_dt_model(cls,
- N: np.ndarray,
- y: np.ndarray,
- random_state: t.Optional[int] = None,
- **kwargs) -> sklearn.tree.DecisionTreeClassifier:
+ def _fit_dt_model(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ random_state: t.Optional[int] = None,
+ **kwargs
+ ) -> sklearn.tree.DecisionTreeClassifier:
"""Build a Decision Tree Classifier model."""
dt_model = sklearn.tree.DecisionTreeClassifier(
- random_state=random_state, **kwargs)
+ random_state=random_state, **kwargs
+ )
return dt_model.fit(X=N, y=y)
@classmethod
- def extract_table(cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- leaf_nodes: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def extract_table(
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ leaf_nodes: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Bookkeep some information table from the ``dt_model`` into an array.
Parameters
@@ -194,20 +211,23 @@ def extract_table(cls,
if leaf_nodes is None:
leaf_nodes = cls._get_leaf_node_array(dt_model)
- dt_info_table = np.zeros((dt_model.tree_.node_count, 3),
- dtype=int) # type: np.ndarray
+ dt_info_table = np.zeros(
+ (dt_model.tree_.node_count, 3), dtype=int
+ ) # type: np.ndarray
dt_info_table[:, 0] = dt_model.tree_.feature
dt_info_table[:, 1] = dt_model.tree_.n_node_samples
- dt_info_table[leaf_nodes, 2] = np.argmax(
- dt_model.tree_.value[leaf_nodes], axis=2).ravel() + 1
+ dt_info_table[leaf_nodes, 2] = (
+ np.argmax(dt_model.tree_.value[leaf_nodes], axis=2).ravel() + 1
+ )
return dt_info_table
@classmethod
def _calc_dt_node_depths(
- cls, dt_model: sklearn.tree.DecisionTreeClassifier) -> np.ndarray:
+ cls, dt_model: sklearn.tree.DecisionTreeClassifier
+ ) -> np.ndarray:
"""Compute the depth of each node in the DT model.
Parameters
@@ -220,6 +240,7 @@ def _calc_dt_node_depths(
:obj:`np.ndarray`
The depth of each node.
"""
+
def node_depth(node_ind: int, cur_depth: int) -> None:
if not 0 <= node_ind < depths.size:
return
@@ -263,9 +284,10 @@ def ft_leaves(cls, dt_model: sklearn.tree.DecisionTreeClassifier) -> int:
@classmethod
def ft_tree_depth(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- dt_node_depths: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ dt_node_depths: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the depth of every node in the DT model.
Parameters
@@ -297,10 +319,11 @@ def ft_tree_depth(
@classmethod
def ft_leaves_branch(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- leaf_nodes: t.Optional[np.ndarray] = None,
- dt_node_depths: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ leaf_nodes: t.Optional[np.ndarray] = None,
+ dt_node_depths: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the size of branches in the DT model.
The size of branches consists of the depths of all leaves of the
@@ -342,10 +365,11 @@ def ft_leaves_branch(
@classmethod
def ft_leaves_corrob(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- leaf_nodes: t.Optional[np.ndarray] = None,
- dt_info_table: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ leaf_nodes: t.Optional[np.ndarray] = None,
+ dt_info_table: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the leaves corroboration of the DT model.
The Leaves corroboration is the proportion of examples that
@@ -392,11 +416,12 @@ def ft_leaves_corrob(
@classmethod
def ft_tree_shape(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- tree_shape: t.Optional[np.ndarray] = None,
- leaf_nodes: t.Optional[np.ndarray] = None,
- dt_node_depths: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ tree_shape: t.Optional[np.ndarray] = None,
+ leaf_nodes: t.Optional[np.ndarray] = None,
+ dt_node_depths: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the tree shape for every leaf node.
The tree shape is the probability of arriving at each leaf given a
@@ -446,11 +471,11 @@ def ft_tree_shape(
@classmethod
def ft_leaves_homo(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- tree_shape: t.Optional[np.ndarray] = None,
- leaf_nodes: t.Optional[np.ndarray] = None,
- dt_node_depths: t.Optional[np.ndarray] = None,
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ tree_shape: t.Optional[np.ndarray] = None,
+ leaf_nodes: t.Optional[np.ndarray] = None,
+ dt_node_depths: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute the DT model Homogeneity for every leaf node.
@@ -494,9 +519,11 @@ def ft_leaves_homo(
if dt_node_depths is None:
dt_node_depths = cls._calc_dt_node_depths(dt_model)
- tree_shape = cls.ft_tree_shape(dt_model=dt_model,
- leaf_nodes=leaf_nodes,
- dt_node_depths=dt_node_depths)
+ tree_shape = cls.ft_tree_shape(
+ dt_model=dt_model,
+ leaf_nodes=leaf_nodes,
+ dt_node_depths=dt_node_depths,
+ )
num_leaves = cls.ft_leaves(dt_model)
@@ -504,9 +531,9 @@ def ft_leaves_homo(
@classmethod
def ft_leaves_per_class(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- dt_info_table: t.Optional[np.ndarray] = None,
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ dt_info_table: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute the proportion of leaves per class in DT model.
@@ -572,7 +599,8 @@ def ft_nodes(cls, dt_model: sklearn.tree.DecisionTreeClassifier) -> int:
@classmethod
def ft_nodes_per_attr(
- cls, dt_model: sklearn.tree.DecisionTreeClassifier) -> float:
+ cls, dt_model: sklearn.tree.DecisionTreeClassifier
+ ) -> float:
"""Compute the ratio of nodes per number of attributes in DT model.
Parameters
@@ -597,7 +625,8 @@ def ft_nodes_per_attr(
@classmethod
def ft_nodes_per_inst(
- cls, dt_model: sklearn.tree.DecisionTreeClassifier) -> float:
+ cls, dt_model: sklearn.tree.DecisionTreeClassifier
+ ) -> float:
"""Compute the ratio of non-leaf nodes per number of instances in DT
model.
@@ -624,10 +653,11 @@ def ft_nodes_per_inst(
@classmethod
def ft_nodes_per_level(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- dt_node_depths: t.Optional[np.ndarray] = None,
- non_leaf_nodes: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ dt_node_depths: t.Optional[np.ndarray] = None,
+ non_leaf_nodes: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the ratio of number of nodes per tree level in DT model.
Parameters
@@ -670,10 +700,11 @@ def ft_nodes_per_level(
@classmethod
def ft_nodes_repeated(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- dt_info_table: t.Optional[np.ndarray] = None,
- non_leaf_nodes: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ dt_info_table: t.Optional[np.ndarray] = None,
+ non_leaf_nodes: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the number of repeated nodes in DT model.
The number of repeated nodes is the number of repeated attributes
@@ -718,7 +749,8 @@ def ft_nodes_repeated(
@classmethod
def ft_var_importance(
- cls, dt_model: sklearn.tree.DecisionTreeClassifier) -> np.ndarray:
+ cls, dt_model: sklearn.tree.DecisionTreeClassifier
+ ) -> np.ndarray:
"""Compute the features importance of the DT model for each
attribute.
@@ -745,10 +777,11 @@ def ft_var_importance(
@classmethod
def ft_tree_imbalance(
- cls,
- dt_model: sklearn.tree.DecisionTreeClassifier,
- leaf_nodes: t.Optional[np.ndarray] = None,
- dt_node_depths: t.Optional[np.ndarray] = None) -> np.ndarray:
+ cls,
+ dt_model: sklearn.tree.DecisionTreeClassifier,
+ leaf_nodes: t.Optional[np.ndarray] = None,
+ dt_node_depths: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the tree imbalance for each leaf node.
Parameters
@@ -783,7 +816,9 @@ def ft_tree_imbalance(
leaf_depths = dt_node_depths[leaf_nodes]
prob_random_arrival = np.power(2.0, -leaf_depths)
- aux = np.power(2.0, -np.multiply(
- *np.unique(prob_random_arrival, return_counts=True))) # np.ndarray
+ aux = np.power(
+ 2.0,
+ -np.multiply(*np.unique(prob_random_arrival, return_counts=True)),
+ ) # np.ndarray
return -aux * np.log2(aux)
diff --git a/pymfe/relative.py b/pymfe/relative.py
index 1238527e..f1c4a0f1 100644
--- a/pymfe/relative.py
+++ b/pymfe/relative.py
@@ -54,10 +54,14 @@ class MFERelativeLandmarking:
@classmethod
def postprocess_landmarking_relative(
- cls, mtf_names: t.List[str], mtf_vals: t.List[float],
- mtf_time: t.List[float], class_indexes: t.Sequence[int],
- groups: t.Tuple[str, ...], inserted_group_dep: t.FrozenSet[str],
- **kwargs
+ cls,
+ mtf_names: t.List[str],
+ mtf_vals: t.List[float],
+ mtf_time: t.List[float],
+ class_indexes: t.Sequence[int],
+ groups: t.Tuple[str, ...],
+ inserted_group_dep: t.FrozenSet[str],
+ **kwargs
) -> t.Optional[t.Tuple[t.List[str], t.List[float], t.List[float]]]:
"""Generate Relative Landmarking from Landmarking metafeatures.
@@ -111,31 +115,35 @@ def postprocess_landmarking_relative(
mtf_rel_time = [] # type: t.List[float]
mtf_by_summ, mtf_orig_indexes = cls.group_mtf_by_summary(
- mtf_names=mtf_names,
- mtf_vals=mtf_vals,
- class_indexes=class_indexes)
+ mtf_names=mtf_names, mtf_vals=mtf_vals, class_indexes=class_indexes
+ )
avg_time = time.time()
mtf_by_summ = {
summary: scipy.stats.rankdata(
- a=mtf_by_summ[summary], method="average")
+ a=mtf_by_summ[summary], method="average"
+ )
for summary in mtf_by_summ
}
- avg_time = ((time.time() - avg_time) /
- (len(mtf_by_summ) if mtf_by_summ else 1.0))
+ avg_time = (time.time() - avg_time) / (
+ len(mtf_by_summ) if mtf_by_summ else 1.0
+ )
mtf_rel_vals, original_indexes = cls._flatten_dictionaries(
- mtf_by_summ, mtf_orig_indexes)
+ mtf_by_summ, mtf_orig_indexes
+ )
for cur_orig_index in original_indexes:
- mtf_rel_names.append("{}.relative".format(
- mtf_names[cur_orig_index]))
+ mtf_rel_names.append(
+ "{}.relative".format(mtf_names[cur_orig_index])
+ )
mtf_rel_time.append(mtf_time[cur_orig_index] + avg_time)
- change_in_place = ("landmarking" not in groups
- or "landmarking" in inserted_group_dep)
+ change_in_place = (
+ "landmarking" not in groups or "landmarking" in inserted_group_dep
+ )
if change_in_place:
for cur_index, cur_orig_index in enumerate(original_indexes):
@@ -149,10 +157,10 @@ def postprocess_landmarking_relative(
@classmethod
def group_mtf_by_summary(
- cls,
- mtf_names: t.List[str],
- mtf_vals: t.List[float],
- class_indexes: t.Sequence[int],
+ cls,
+ mtf_names: t.List[str],
+ mtf_vals: t.List[float],
+ class_indexes: t.Sequence[int],
) -> t.Tuple[t.Dict[str, t.List[float]], t.Dict[str, t.List[int]]]:
"""Group metafeatures by its correspondent summary method.
@@ -164,7 +172,9 @@ def group_mtf_by_summary(
re_get_summ = re.compile(
r"""[^\.]+\. # Feature name with the first separator
(.*) # Summary name (can have more than one suffix)
- """, re.VERBOSE)
+ """,
+ re.VERBOSE,
+ )
mtf_by_summ = {} # type: t.Dict[str, t.List[float]]
mtf_orig_indexes = {} # type: t.Dict[str, t.List[int]]
@@ -186,9 +196,9 @@ def group_mtf_by_summary(
@classmethod
def _flatten_dictionaries(
- cls,
- mtf_by_summ: t.Dict[str, t.List[float]],
- mtf_orig_indexes: t.Dict[str, t.List[int]],
+ cls,
+ mtf_by_summ: t.Dict[str, t.List[float]],
+ mtf_orig_indexes: t.Dict[str, t.List[int]],
) -> t.Tuple[t.List[float], t.List[int]]:
"""Flatten dictionary values to two lists with correspondence."""
ranked_values = [] # type: t.List[float]
diff --git a/pymfe/scoring.py b/pymfe/scoring.py
index 57a10f54..51a2cebf 100644
--- a/pymfe/scoring.py
+++ b/pymfe/scoring.py
@@ -4,30 +4,25 @@
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
- """Calculates the accuracy of a classification model.
- """
+ """Calculates the accuracy of a classification model."""
return sklearn.metrics.accuracy_score(y_true, y_pred)
def balanced_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
- """Calculates the balanced accuracy of a classification model.
- """
+ """Calculates the balanced accuracy of a classification model."""
return sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
def f1(y_true: np.ndarray, y_pred: np.ndarray) -> float:
- """Calculates the F1-score of a classification model.
- """
- return sklearn.metrics.f1_score(y_true, y_pred, average='weighted')
+ """Calculates the F1-score of a classification model."""
+ return sklearn.metrics.f1_score(y_true, y_pred, average="weighted")
def kappa(y_true: np.ndarray, y_pred: np.ndarray) -> float:
- """Calculates the Kappa-score of a classification model.
- """
+ """Calculates the Kappa-score of a classification model."""
raise NotImplementedError('The "kappa" score was not implemented.')
def auc(y_true: np.ndarray, y_pred: np.ndarray) -> float:
- """Calculates the AUC of a classification model.
- """
+ """Calculates the AUC of a classification model."""
raise NotImplementedError('The "auc" score was not implemented.')
diff --git a/pymfe/statistical.py b/pymfe/statistical.py
index 5022510d..17307790 100644
--- a/pymfe/statistical.py
+++ b/pymfe/statistical.py
@@ -58,9 +58,9 @@ class MFEStatistical:
"""
@classmethod
- def precompute_statistical_class(cls,
- y: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_statistical_class(
+ cls, y: t.Optional[np.ndarray] = None, **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute distinct classes and its abs. frequencies from ``y``.
Parameters
@@ -93,10 +93,12 @@ def precompute_statistical_class(cls,
return precomp_vals
@classmethod
- def precompute_can_cors(cls,
- N: t.Optional[np.ndarray] = None,
- y: t.Optional[np.ndarray] = None,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_can_cors(
+ cls,
+ N: t.Optional[np.ndarray] = None,
+ y: t.Optional[np.ndarray] = None,
+ **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precompute canonical correlations and its eigenvalues.
Parameters
@@ -123,8 +125,12 @@ def precompute_can_cors(cls,
"""
precomp_vals = {}
- if (y is not None and N is not None and N.size
- and not {"can_cors", "can_cor_eigvals"}.issubset(kwargs)):
+ if (
+ y is not None
+ and N is not None
+ and N.size
+ and not {"can_cors", "can_cor_eigvals"}.issubset(kwargs)
+ ):
can_cors = cls._calc_can_cors(N=N, y=y)
precomp_vals["can_cors"] = can_cors
@@ -133,10 +139,9 @@ def precompute_can_cors(cls,
return precomp_vals
@classmethod
- def precompute_statistical_cor_cov(cls,
- N: t.Optional[np.ndarray] = None,
- ddof: int = 1,
- **kwargs) -> t.Dict[str, t.Any]:
+ def precompute_statistical_cor_cov(
+ cls, N: t.Optional[np.ndarray] = None, ddof: int = 1, **kwargs
+ ) -> t.Dict[str, t.Any]:
"""Precomputes the correlation and covariance matrix of numerical data.
Be cautious in allowing this precomputation method on huge datasets, as
@@ -174,8 +179,9 @@ def precompute_statistical_cor_cov(cls,
if "abs_corr_mat" not in kwargs:
abs_corr_mat = np.abs(np.corrcoef(N, rowvar=False))
- if (not isinstance(abs_corr_mat, np.ndarray)
- and np.isnan(abs_corr_mat)):
+ if not isinstance(abs_corr_mat, np.ndarray) and np.isnan(
+ abs_corr_mat
+ ):
abs_corr_mat = np.array([np.nan])
precomp_vals["abs_corr_mat"] = abs_corr_mat
@@ -203,9 +209,7 @@ def _can_cor_to_eigval(can_cors: np.ndarray) -> np.ndarray:
@classmethod
def _calc_can_cors(
- cls,
- N: np.ndarray,
- y: np.ndarray,
+ cls, N: np.ndarray, y: np.ndarray,
) -> t.Union[np.ndarray, t.Tuple[np.ndarray, np.ndarray]]:
"""Calculate the Canonical Correlations between ``N`` and ``y.``
@@ -216,7 +220,8 @@ def _calc_can_cors(
kept.
"""
y_bin = sklearn.preprocessing.OneHotEncoder(
- sparse=False).fit_transform(y.reshape(-1, 1))
+ sparse=False
+ ).fit_transform(y.reshape(-1, 1))
num_classes, num_attr = y_bin.shape[1], N.shape[1]
# Note: 'n_components' is a theoretical upper bound, so it is not
@@ -230,7 +235,8 @@ def _calc_can_cors(
warnings.filterwarnings("ignore", category=UserWarning)
N_tf, y_tf = sklearn.cross_decomposition.CCA(
- n_components=n_components).fit_transform(N, y_bin)
+ n_components=n_components
+ ).fit_transform(N, y_bin)
warnings.filterwarnings("default", category=UserWarning)
@@ -247,10 +253,10 @@ def _calc_can_cors(
@classmethod
def ft_can_cor(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- can_cors: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ can_cors: t.Optional[np.ndarray] = None,
) -> np.ndarray:
"""Compute canonical correlations of data.
@@ -286,13 +292,15 @@ def ft_can_cor(
return can_cors
@classmethod
- def ft_gravity(cls,
- N: np.ndarray,
- y: np.ndarray,
- norm_ord: t.Union[int, float] = 2,
- classes: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None,
- cls_inds: t.Optional[np.ndarray] = None) -> float:
+ def ft_gravity(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ norm_ord: t.Union[int, float] = 2,
+ classes: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ cls_inds: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Compute the distance between minority and majority classes center
of mass.
@@ -380,13 +388,15 @@ class has the same number of instances.
gravity = np.linalg.norm(
insts_cls_maj.mean(axis=0) - insts_cls_min.mean(axis=0),
- ord=norm_ord)
+ ord=norm_ord,
+ )
return gravity
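
ft_gravity measures the distance between the centers of mass of the majority and the minority class. A self-contained sketch under that reading, ignoring the tie-breaking that cls_inds handles in the original:

    import numpy as np

    N = np.random.rand(40, 3)
    y = np.random.choice(["a", "b", "c"], size=40)

    classes, class_freqs = np.unique(y, return_counts=True)
    cls_maj = classes[np.argmax(class_freqs)]
    cls_min = classes[np.argmin(class_freqs)]

    # Euclidean distance (norm_ord=2) between the two class centroids.
    gravity = np.linalg.norm(
        N[y == cls_maj].mean(axis=0) - N[y == cls_min].mean(axis=0),
        ord=2,
    )
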
@classmethod
- def ft_cor(cls, N: np.ndarray,
- abs_corr_mat: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_cor(
+ cls, N: np.ndarray, abs_corr_mat: t.Optional[np.ndarray] = None
+ ) -> np.ndarray:
"""Compute the absolute value of the correlation of distinct dataset
column pairs.
@@ -427,10 +437,12 @@ def ft_cor(cls, N: np.ndarray,
return np.abs(inf_triang_vals)
@classmethod
- def ft_cov(cls,
- N: np.ndarray,
- ddof: int = 1,
- cov_mat: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_cov(
+ cls,
+ N: np.ndarray,
+ ddof: int = 1,
+ cov_mat: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the absolute value of the covariance of distinct dataset
attribute pairs.
@@ -474,10 +486,10 @@ def ft_cov(cls,
@classmethod
def ft_nr_disc(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- can_cors: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ can_cors: t.Optional[np.ndarray] = None,
) -> t.Union[int, float]:
"""Compute the number of canonical correlation between each attribute
and class.
@@ -519,10 +531,12 @@ def ft_nr_disc(
return can_cors.size
@classmethod
- def ft_eigenvalues(cls,
- N: np.ndarray,
- ddof: int = 1,
- cov_mat: t.Optional[np.ndarray] = None) -> np.ndarray:
+ def ft_eigenvalues(
+ cls,
+ N: np.ndarray,
+ ddof: int = 1,
+ cov_mat: t.Optional[np.ndarray] = None,
+ ) -> np.ndarray:
"""Compute the eigenvalues of covariance matrix from dataset.
Parameters
@@ -554,10 +568,9 @@ def ft_eigenvalues(cls,
return np.linalg.eigvals(cov_mat)
@classmethod
- def ft_g_mean(cls,
- N: np.ndarray,
- allow_zeros: bool = True,
- epsilon: float = 1.0e-10) -> np.ndarray:
+ def ft_g_mean(
+ cls, N: np.ndarray, allow_zeros: bool = True, epsilon: float = 1.0e-10
+ ) -> np.ndarray:
"""Compute the geometric mean of each attribute.
Parameters
@@ -652,8 +665,9 @@ def ft_iq_range(cls, N: np.ndarray) -> np.ndarray:
return scipy.stats.iqr(N, axis=0)
@classmethod
- def ft_kurtosis(cls, N: np.ndarray, method: int = 3,
- bias: bool = True) -> np.ndarray:
+ def ft_kurtosis(
+ cls, N: np.ndarray, method: int = 3, bias: bool = True
+ ) -> np.ndarray:
"""Compute the kurtosis of each attribute.
Parameters
@@ -705,7 +719,8 @@ def ft_kurtosis(cls, N: np.ndarray, method: int = 3,
axis=0,
arr=N,
method=method,
- bias=bias)
+ bias=bias,
+ )
return kurt_arr
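
ft_kurtosis (and the analogous ft_skewness hunk later in this file) applies a per-column moment estimator. A rough column-wise analogue via scipy, which does not reproduce pymfe's ``method`` variants exactly:

    import numpy as np
    import scipy.stats

    N = np.random.rand(50, 4)
    kurt_per_attr = scipy.stats.kurtosis(N, axis=0, bias=True)  # one value per column
    skew_per_attr = scipy.stats.skew(N, axis=0, bias=True)
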
@@ -830,12 +845,13 @@ def ft_min(cls, N: np.ndarray) -> np.ndarray:
return N.min(axis=0)
@classmethod
- def ft_nr_cor_attr(cls,
- N: np.ndarray,
- threshold: float = 0.5,
- normalize: bool = True,
- abs_corr_mat: t.Optional[np.ndarray] = None
- ) -> t.Union[int, float]:
+ def ft_nr_cor_attr(
+ cls,
+ N: np.ndarray,
+ threshold: float = 0.5,
+ normalize: bool = True,
+ abs_corr_mat: t.Optional[np.ndarray] = None,
+ ) -> t.Union[int, float]:
"""Compute the number of distinct highly correlated pair of attributes.
A pair of attributes is considered highly correlated if the
@@ -884,12 +900,14 @@ def ft_nr_cor_attr(cls,
return np.sum(abs_corr_vals >= threshold) * norm_factor
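
Both ft_cor above and ft_nr_cor_attr here start from the absolute correlation matrix: the former returns the values of the distinct column pairs, the latter counts how many exceed the threshold, optionally normalized by the number of pairs. A sketch of both, assuming the 2 / (d * (d - 1)) normalization is what norm_factor computes when normalize=True:

    import numpy as np

    N = np.random.rand(60, 5)
    abs_corr_mat = np.abs(np.corrcoef(N, rowvar=False))

    # Values of distinct column pairs (strict lower triangle) -> ft_cor.
    tril_inds = np.tril_indices_from(abs_corr_mat, k=-1)
    abs_corr_vals = abs_corr_mat[tril_inds]

    # Thresholded, normalized count -> ft_nr_cor_attr.
    threshold = 0.5
    num_attr = N.shape[1]
    norm_factor = 2.0 / (num_attr * (num_attr - 1.0))
    nr_cor_attr = np.sum(abs_corr_vals >= threshold) * norm_factor
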
@classmethod
- def ft_nr_norm(cls,
- N: np.ndarray,
- method: str = "shapiro-wilk",
- threshold: float = 0.05,
- failure: str = "soft",
- max_samples: int = 5000) -> t.Union[float, int]:
+ def ft_nr_norm(
+ cls,
+ N: np.ndarray,
+ method: str = "shapiro-wilk",
+ threshold: float = 0.05,
+ failure: str = "soft",
+ max_samples: int = 5000,
+ ) -> t.Union[float, int]:
"""Compute the number of attributes normally distributed based in a
given method.
@@ -969,12 +987,17 @@ def ft_nr_norm(cls,
)
if method not in accepted_tests:
- raise ValueError("Unknown method {0}. Select one between "
- "{1}".format(method, accepted_tests))
+ raise ValueError(
+ "Unknown method {0}. Select one between {1}".format(
+ method, accepted_tests
+ )
+ )
if failure not in ("hard", "soft"):
- raise ValueError('"failure" argument must be either "soft" '
- 'or "hard" (got "{}").'.format(failure))
+ raise ValueError(
+ '"failure" argument must be either "soft" '
+ 'or "hard" (got "{}").'.format(failure)
+ )
if max_samples <= 0:
return np.nan
@@ -987,13 +1010,15 @@ def ft_nr_norm(cls,
if method in ("shapiro-wilk", "all"):
_, p_values_shapiro = np.apply_along_axis(
- func1d=scipy.stats.shapiro, axis=0, arr=N[:max_row_index, :])
+ func1d=scipy.stats.shapiro, axis=0, arr=N[:max_row_index, :]
+ )
test_results.append(p_values_shapiro > threshold)
if method in ("dagostino-pearson", "all"):
_, p_values_dagostino = scipy.stats.normaltest(
- N[:max_row_index, :], axis=0)
+ N[:max_row_index, :], axis=0
+ )
test_results.append(p_values_dagostino > threshold)
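
ft_nr_norm counts the attributes whose normality-test p-value exceeds the threshold, on at most max_samples rows. A sketch of the Shapiro-Wilk branch only, with illustrative data:

    import numpy as np
    import scipy.stats

    N = np.random.rand(200, 4)
    threshold, max_samples = 0.05, 5000
    max_row_index = min(max_samples, N.shape[0])

    # p-value of the Shapiro-Wilk test for each column.
    p_values = np.array([
        scipy.stats.shapiro(N[:max_row_index, col])[1]
        for col in range(N.shape[1])
    ])
    nr_norm = int(np.sum(p_values > threshold))
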
@@ -1002,7 +1027,8 @@ def ft_nr_norm(cls,
for attr_ind, attr_vals in enumerate(N[:max_row_index, :].T):
stat_value, crit_values, signif_levels = scipy.stats.anderson(
- attr_vals, dist="norm")
+ attr_vals, dist="norm"
+ )
# As scipy.stats.anderson gives critical values for fixed
# significance levels, then the strategy adopted is to use
@@ -1118,12 +1144,14 @@ def ft_sd(cls, N: np.ndarray, ddof: int = 1) -> np.ndarray:
return N.std(axis=0, ddof=ddof)
@classmethod
- def ft_sd_ratio(cls,
- N: np.ndarray,
- y: np.ndarray,
- ddof: int = 1,
- classes: t.Optional[np.ndarray] = None,
- class_freqs: t.Optional[np.ndarray] = None) -> float:
+ def ft_sd_ratio(
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ ddof: int = 1,
+ classes: t.Optional[np.ndarray] = None,
+ class_freqs: t.Optional[np.ndarray] = None,
+ ) -> float:
"""Compute a statistical test for homogeneity of covariances.
The test applied is the Box's M Test for equivalence of
@@ -1177,45 +1205,57 @@ def ft_sd_ratio(cls,
def calc_sample_cov_mat(N, y, ddof):
"""Calculate the Sample Covariance Matrix for each class."""
- sample_cov_matrices = np.array([
- np.cov(N[y == cl, :], rowvar=False, ddof=ddof)
- for cl in classes
- ])
+ sample_cov_matrices = np.array(
+ [
+ np.cov(N[y == cl, :], rowvar=False, ddof=ddof)
+ for cl in classes
+ ]
+ )
return np.flip(m=sample_cov_matrices, axis=(0, 1))
- def calc_pooled_cov_mat(sample_cov_matrices: np.ndarray,
- vec_weight: np.ndarray, num_inst: int,
- num_classes: int) -> np.ndarray:
+ def calc_pooled_cov_mat(
+ sample_cov_matrices: np.ndarray,
+ vec_weight: np.ndarray,
+ num_inst: int,
+ num_classes: int,
+ ) -> np.ndarray:
"""Calculate the Pooled Covariance Matrix."""
- pooled_cov_mat = np.array([
- weight * S_i
- for weight, S_i in zip(vec_weight, sample_cov_matrices)
- ]).sum(axis=0) / (num_inst - num_classes)
+ pooled_cov_mat = np.array(
+ [
+ weight * S_i
+ for weight, S_i in zip(vec_weight, sample_cov_matrices)
+ ]
+ ).sum(axis=0) / (num_inst - num_classes)
return pooled_cov_mat
def calc_gamma_factor(num_col, num_classes, num_inst):
"""Calculate the gamma factor which adjust the output."""
gamma = 1.0 - (
- (2.0 * num_col**2.0 + 3.0 * num_col - 1.0) /
- (6.0 * (num_col + 1.0) *
- (num_classes - 1.0))) * (np.sum(1.0 / vec_weight) - 1.0 /
- (num_inst - num_classes))
+ (2.0 * num_col ** 2.0 + 3.0 * num_col - 1.0)
+ / (6.0 * (num_col + 1.0) * (num_classes - 1.0))
+ ) * (np.sum(1.0 / vec_weight) - 1.0 / (num_inst - num_classes))
return gamma
- def calc_m_factor(sample_cov_matrices: np.ndarray,
- pooled_cov_mat: np.ndarray, num_inst: int,
- num_classes: int, gamma: float,
- vec_weight: np.ndarray) -> float:
+ def calc_m_factor(
+ sample_cov_matrices: np.ndarray,
+ pooled_cov_mat: np.ndarray,
+ num_inst: int,
+ num_classes: int,
+ gamma: float,
+ vec_weight: np.ndarray,
+ ) -> float:
"""Calculate the M factor."""
vec_logdet = [
np.math.log(np.linalg.det(S_i)) for S_i in sample_cov_matrices
]
- m_factor = (gamma * ((num_inst - num_classes) * np.math.log(
- np.linalg.det(pooled_cov_mat)) - np.dot(
- vec_weight, vec_logdet)))
+ m_factor = gamma * (
+ (num_inst - num_classes)
+ * np.math.log(np.linalg.det(pooled_cov_mat))
+ - np.dot(vec_weight, vec_logdet)
+ )
return m_factor
@@ -1230,19 +1270,27 @@ def calc_m_factor(sample_cov_matrices: np.ndarray,
vec_weight = class_freqs - 1.0
- pooled_cov_mat = calc_pooled_cov_mat(sample_cov_matrices, vec_weight,
- num_inst, num_classes)
+ pooled_cov_mat = calc_pooled_cov_mat(
+ sample_cov_matrices, vec_weight, num_inst, num_classes
+ )
gamma = calc_gamma_factor(num_col, num_classes, num_inst)
- m_factor = calc_m_factor(sample_cov_matrices, pooled_cov_mat,
- num_inst, num_classes, gamma, vec_weight)
+ m_factor = calc_m_factor(
+ sample_cov_matrices,
+ pooled_cov_mat,
+ num_inst,
+ num_classes,
+ gamma,
+ vec_weight,
+ )
return np.exp(m_factor / (num_col * (num_inst - num_classes)))
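
ft_sd_ratio implements Box's M test for homogeneity of the per-class covariance matrices and returns exp(M / (p * (n - q))), with p attributes, n instances and q classes. In practice these features are reached through the MFE front end; a usage-level sketch, with the call pattern assumed from the package documentation:

    import numpy as np
    from pymfe.mfe import MFE

    X = np.random.rand(80, 5)
    y = np.random.randint(0, 2, size=80)

    # Extract only a couple of statistical meta-features (names assumed valid).
    mfe = MFE(groups=["statistical"], features=["sd_ratio", "gravity"])
    mfe.fit(X, y)
    names, values = mfe.extract()
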
@classmethod
- def ft_skewness(cls, N: np.ndarray, method: int = 3,
- bias: bool = True) -> np.ndarray:
+ def ft_skewness(
+ cls, N: np.ndarray, method: int = 3, bias: bool = True
+ ) -> np.ndarray:
"""Compute the skewness for each attribute.
Parameters
@@ -1292,7 +1340,8 @@ def ft_skewness(cls, N: np.ndarray, method: int = 3,
axis=0,
arr=N,
bias=bias,
- method=method)
+ method=method,
+ )
return skew_arr
@@ -1395,11 +1444,11 @@ def ft_var(cls, N: np.ndarray, ddof: int = 1) -> np.ndarray:
@classmethod
def ft_w_lambda(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- can_cor_eigvals: t.Optional[np.ndarray] = None,
- can_cors: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ can_cor_eigvals: t.Optional[np.ndarray] = None,
+ can_cors: t.Optional[np.ndarray] = None,
) -> float:
"""Compute the Wilks' Lambda value.
@@ -1470,10 +1519,10 @@ def ft_w_lambda(
@classmethod
def ft_p_trace(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- can_cors: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ can_cors: t.Optional[np.ndarray] = None,
) -> float:
"""Compute the Pillai's trace.
@@ -1514,11 +1563,11 @@ def ft_p_trace(
@classmethod
def ft_lh_trace(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- can_cor_eigvals: t.Optional[np.ndarray] = None,
- can_cors: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ can_cor_eigvals: t.Optional[np.ndarray] = None,
+ can_cors: t.Optional[np.ndarray] = None,
) -> float:
"""Compute the Lawley-Hotelling trace.
@@ -1590,12 +1639,12 @@ def ft_lh_trace(
@classmethod
def ft_roy_root(
- cls,
- N: np.ndarray,
- y: np.ndarray,
- criterion: str = "eigval",
- can_cors: t.Optional[np.ndarray] = None,
- can_cor_eigvals: t.Optional[np.ndarray] = None,
+ cls,
+ N: np.ndarray,
+ y: np.ndarray,
+ criterion: str = "eigval",
+ can_cors: t.Optional[np.ndarray] = None,
+ can_cor_eigvals: t.Optional[np.ndarray] = None,
) -> float:
"""Compute the Roy's largest root.
@@ -1678,8 +1727,11 @@ def ft_roy_root(
VALID_CRITERIA = ("eigval", "cancor")
if criterion not in VALID_CRITERIA:
- raise ValueError("Roy's largest root 'criterion' must be in {}."
- .format(VALID_CRITERIA))
+ raise ValueError(
+ "Roy's largest root 'criterion' must be in {}.".format(
+ VALID_CRITERIA
+ )
+ )
if criterion == "eigval":
if can_cor_eigvals is None:
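
ft_w_lambda, ft_p_trace, ft_lh_trace and ft_roy_root are all functions of the canonical correlations or of their eigenvalues. A hedged summary of the standard formulas, assuming _can_cor_to_eigval maps each correlation rho to rho**2 / (1 - rho**2):

    import numpy as np

    can_cors = np.array([0.9, 0.6, 0.3])  # illustrative values
    can_cor_eigvals = can_cors ** 2 / (1.0 - can_cors ** 2)

    w_lambda = np.prod(1.0 / (1.0 + can_cor_eigvals))  # Wilks' Lambda
    p_trace = np.sum(can_cors ** 2)                    # Pillai's trace
    lh_trace = np.sum(can_cor_eigvals)                 # Lawley-Hotelling trace
    roy_root = np.max(can_cor_eigvals)                 # Roy's root, "eigval" criterion
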
diff --git a/setup.py b/setup.py
index 2c51bf38..64d66adf 100644
--- a/setup.py
+++ b/setup.py
@@ -40,7 +40,8 @@
'Topic :: Scientific/Engineering',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.6',
- 'Programming Language :: Python :: 3.7']
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8']
INSTALL_REQUIRES = ['numpy', 'scipy', 'scikit-learn', 'patsy', 'pandas',