diff --git a/README.rst b/README.rst
index 3c6e4c0..39a78d6 100644
--- a/README.rst
+++ b/README.rst
@@ -25,9 +25,10 @@
Sklearn-genetic-opt
###################
-scikit-learn models hyperparameters tuning, using evolutionary algorithms.
+Hyperparameters tuning and feature selection for scikit-learn models, using evolutionary algorithms.
-This is meant to be an alternative from popular methods inside scikit-learn such as Grid Search and Randomized Grid Search.
+This is meant to be an alternative to popular methods inside scikit-learn, such as Grid Search and Randomized Grid Search
+for hyperparameters tuning, and to RFE (Recursive Feature Elimination) and SelectFromModel for feature selection.
Sklearn-genetic-opt uses evolutionary algorithms from the DEAP package to choose the set of hyperparameters that
optimizes (max or min) the cross-validation scores, it can be used for both regression and classification problems.
@@ -37,7 +38,8 @@ Documentation is available `here `_
Main Features:
##############
-* **GASearchCV**: Principal class of the package, holds the evolutionary cross-validation optimization routine.
+* **GASearchCV**: Main class of the package for hyperparameters tuning; holds the evolutionary cross-validation optimization routine.
+* **GAFeatureSelectionCV**: Main class of the package for evolutionary feature selection.
* **Algorithms**: Set of different evolutionary algorithms to use as an optimization procedure.
* **Callbacks**: Custom evaluation strategies to generate early stopping rules,
logging (into TensorBoard, .pkl files, etc) or your custom logic.
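+
+Any of these callbacks can be passed to the ``fit`` method of the estimators. As a minimal sketch,
+assuming an ``evolved_estimator`` and training data defined as in the examples below, an
+early-stopping rule could look like this:
+
+.. code-block:: python
+
+    from sklearn_genetic.callbacks import ConsecutiveStopping
+
+    # Stop the optimization if the 'fitness' metric doesn't improve
+    # during 5 consecutive generations
+    callback = ConsecutiveStopping(generations=5, metric='fitness')
+
+    # evolved_estimator, X_train and y_train as defined in the examples below
+    evolved_estimator.fit(X_train, y_train, callbacks=callback)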
@@ -82,8 +84,8 @@ The only optional dependency that the last command does not install, it's Tensor
it is usually advised to look further which distribution works better for you.
-Example
-#######
+Example: Hyperparameters Tuning
+###############################
.. code-block:: python
@@ -134,6 +136,49 @@ Example
print("Best k solutions: ", evolved_estimator.hof)
+Example: Feature Selection
+##########################
+
+.. code-block:: python
+
+ import numpy as np
+ from sklearn_genetic import GAFeatureSelectionCV
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVC
+ from sklearn.datasets import load_iris
+ from sklearn.metrics import accuracy_score
+
+ data = load_iris()
+ X, y = data["data"], data["target"]
+
+ # Add random non-important features
+ noise = np.random.uniform(0, 10, size=(X.shape[0], 5))
+ X = np.hstack((X, noise))
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
+
+ clf = SVC(gamma='auto')
+
+ evolved_estimator = GAFeatureSelectionCV(
+ estimator=clf,
+ scoring="accuracy",
+ population_size=30,
+ generations=20,
+ n_jobs=-1)
+
+ # Train and select the features
+ evolved_estimator.fit(X_train, y_train)
+
+ # Features selected by the algorithm
+ features = evolved_estimator.best_features_
+ print(features)
+
+ # Predict only with the subset of selected features
+ y_predict_ga = evolved_estimator.predict(X_test[:, features])
+ print(accuracy_score(y_test, y_predict_ga))
+
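+You can also inspect how the fitness evolved across the generations; a minimal sketch, assuming the
+``evolved_estimator`` from the example above has already been fitted:
+
+.. code-block:: python
+
+    import matplotlib.pyplot as plt
+    from sklearn_genetic.plots import plot_fitness_evolution
+
+    # Plot the average cross-validation score per generation
+    plot_fitness_evolution(evolved_estimator)
+    plt.show()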
+
Changelog
#########
diff --git a/docs/api/featureselectioncv.rst b/docs/api/featureselectioncv.rst
new file mode 100644
index 0000000..e263f27
--- /dev/null
+++ b/docs/api/featureselectioncv.rst
@@ -0,0 +1,23 @@
+
+FeatureSelectionCV
+------------------
+
+.. currentmodule:: sklearn_genetic
+
+.. autosummary:: GAFeatureSelectionCV
+ GAFeatureSelectionCV.decision_function
+ GAFeatureSelectionCV.fit
+ GAFeatureSelectionCV.get_params
+ GAFeatureSelectionCV.inverse_transform
+ GAFeatureSelectionCV.predict
+ GAFeatureSelectionCV.predict_proba
+ GAFeatureSelectionCV.score
+ GAFeatureSelectionCV.score_samples
+ GAFeatureSelectionCV.set_params
+ GAFeatureSelectionCV.transform
+
+.. autoclass:: sklearn_genetic.GAFeatureSelectionCV
+ :members:
+ :inherited-members:
+ :exclude-members: evaluate, mutate, n_features_in_, classes_
+ :undoc-members: True
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index d123101..e7126d5 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -55,7 +55,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"]
# -- Options for HTML output -------------------------------------------------
diff --git a/docs/images/basic_usage_accuracy_6.PNG b/docs/images/basic_usage_accuracy_6.PNG
new file mode 100644
index 0000000..cf71c5b
Binary files /dev/null and b/docs/images/basic_usage_accuracy_6.PNG differ
diff --git a/docs/images/basic_usage_fitness_plot_7.PNG b/docs/images/basic_usage_fitness_plot_7.PNG
new file mode 100644
index 0000000..7afb3b8
Binary files /dev/null and b/docs/images/basic_usage_fitness_plot_7.PNG differ
diff --git a/docs/images/basic_usage_train_log_5.PNG b/docs/images/basic_usage_train_log_5.PNG
new file mode 100644
index 0000000..f442f60
Binary files /dev/null and b/docs/images/basic_usage_train_log_5.PNG differ
diff --git a/docs/index.rst b/docs/index.rst
index a12c6a4..d7c9b0c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -5,10 +5,13 @@
sklean-genetic-opt
==================
-scikit-learn models hyperparameters tuning, using evolutionary algorithms.
-##########################################################################
+Hyperparameters tuning and feature selection for scikit-learn models, using evolutionary algorithms.
+#####################################################################################################
-This is meant to be an alternative from popular methods inside scikit-learn such as Grid Search and Randomized Grid Search.
+
+This is meant to be an alternative to popular methods inside scikit-learn, such as Grid Search and Randomized Grid Search
+for hyperparameters tuning, and to RFE (Recursive Feature Elimination) and SelectFromModel for feature selection.
Sklearn-genetic-opt uses evolutionary algorithms from the deap package to choose a set of hyperparameters
that optimizes (max or min) the cross-validation scores, it can be used for both regression and classification problems.
@@ -73,6 +76,7 @@ as it is usually advised to look further which distribution works better for you
notebooks/sklearn_comparison.ipynb
notebooks/Boston_Houses_decision_tree.ipynb
+ notebooks/Iris_feature_selection.ipynb
notebooks/Digits_decision_tree.ipynb
notebooks/MLflow_logger.ipynb
@@ -87,6 +91,7 @@ as it is usually advised to look further which distribution works better for you
:caption: API Reference:
api/gasearchcv
+ api/featureselectioncv
api/callbacks
api/plots
api/mlflow
diff --git a/docs/notebooks/Iris_feature_selection.ipynb b/docs/notebooks/Iris_feature_selection.ipynb
new file mode 100644
index 0000000..c2d8dab
--- /dev/null
+++ b/docs/notebooks/Iris_feature_selection.ipynb
@@ -0,0 +1,260 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Iris Feature Selection"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn_genetic import GAFeatureSelectionCV\n",
+ "from sklearn_genetic.plots import plot_fitness_evolution\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.datasets import load_iris\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "import numpy as np\n"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Import the data\n",
+ "Random noise is added to simulate irrelevant features\n"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "(150, 14)"
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = load_iris()\n",
+ "X, y = data[\"data\"], data[\"target\"]\n",
+ "\n",
+ "noise = np.random.uniform(0, 10, size=(X.shape[0], 10))\n",
+ "\n",
+ "X = np.hstack((X, noise))\n",
+ "X.shape"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Split the training and test data"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Define the GAFeatureSelectionCV options\n"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "outputs": [],
+ "source": [
+ "clf = SVC(gamma='auto')\n",
+ "\n",
+ "evolved_estimator = GAFeatureSelectionCV(\n",
+ " estimator=clf,\n",
+ " cv=3,\n",
+ " scoring=\"accuracy\",\n",
+ " population_size=30,\n",
+ " generations=20,\n",
+ " n_jobs=-1,\n",
+ " verbose=True,\n",
+ " keep_top_k=2,\n",
+ " elitism=True,\n",
+ ")"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Fit the model and see some results"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "gen\tnevals\tfitness \tfitness_std\tfitness_max\tfitness_min\n",
+ "0 \t30 \t0.558444\t0.155441 \t0.893333 \t0.253333 \n",
+ "1 \t54 \t0.659333\t0.132948 \t0.893333 \t0.333333 \n",
+ "2 \t54 \t0.742667\t0.0867111 \t0.893333 \t0.586667 \n",
+ "3 \t55 \t0.805778\t0.0740117 \t0.893333 \t0.653333 \n",
+ "4 \t52 \t0.873333\t0.0435125 \t0.906667 \t0.746667 \n",
+ "5 \t53 \t0.896222\t0.00659592 \t0.913333 \t0.893333 \n",
+ "6 \t55 \t0.901111\t0.0131186 \t0.953333 \t0.893333 \n",
+ "7 \t54 \t0.911778\t0.0206332 \t0.953333 \t0.893333 \n",
+ "8 \t50 \t0.926444\t0.0210455 \t0.953333 \t0.893333 \n",
+ "9 \t51 \t0.941333\t0.020177 \t0.966667 \t0.913333 \n",
+ "10 \t49 \t0.955556\t0.00978787 \t0.966667 \t0.913333 \n",
+ "11 \t55 \t0.959111\t0.00660714 \t0.966667 \t0.953333 \n",
+ "12 \t57 \t0.965333\t0.004 \t0.966667 \t0.953333 \n",
+ "13 \t55 \t0.966444\t0.00271257 \t0.973333 \t0.953333 \n",
+ "14 \t58 \t0.966667\t6.66134e-16\t0.966667 \t0.966667 \n",
+ "15 \t53 \t0.966889\t0.0011967 \t0.973333 \t0.966667 \n",
+ "16 \t56 \t0.967556\t0.00226623 \t0.973333 \t0.966667 \n",
+ "17 \t53 \t0.969556\t0.00330357 \t0.973333 \t0.966667 \n",
+ "18 \t51 \t0.971111\t0.0031427 \t0.973333 \t0.966667 \n",
+ "19 \t58 \t0.972889\t0.00166296 \t0.973333 \t0.966667 \n",
+ "20 \t54 \t0.973333\t3.33067e-16\t0.973333 \t0.973333 \n"
+ ]
+ }
+ ],
+ "source": [
+ "evolved_estimator.fit(X_train, y_train)\n",
+ "features = evolved_estimator.best_features_\n",
+ "\n",
+ "# Predict only with the subset of selected features\n",
+ "y_predict_ga = evolved_estimator.predict(X_test[:, features])\n",
+ "accuracy = accuracy_score(y_test, y_predict_ga)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[ True True True True False False False False False False False False\n",
+ " False False]\n",
+ "accuracy score: 0.98\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(evolved_estimator.best_features_)\n",
+ "print(\"accuracy score: \", \"{:.2f}\".format(accuracy))"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "