diff --git a/python_scripts/ensemble_bagging.py b/python_scripts/ensemble_bagging.py index 111ef9684..f3750e642 100644 --- a/python_scripts/ensemble_bagging.py +++ b/python_scripts/ensemble_bagging.py @@ -8,31 +8,28 @@ # %% [markdown] # # Bagging # -# This notebook introduces a very natural strategy to build ensembles of machine -# learning models named "bagging". +# In this notebook we introduce a very natural strategy to build ensembles of +# machine learning models, named "bagging". # # "Bagging" stands for Bootstrap AGGregatING. It uses bootstrap resampling # (random sampling with replacement) to learn several models on random # variations of the training set. At predict time, the predictions of each # learner are aggregated to give the final predictions. # -# First, we will generate a simple synthetic dataset to get insights regarding -# bootstraping. +# We first create a simple synthetic dataset to better understand bootstrapping. # %% import pandas as pd import numpy as np -# create a random number generator that will be used to set the randomness -rng = np.random.RandomState(1) - def generate_data(n_samples=30): """Generate synthetic dataset. Returns `data_train`, `data_test`, `target_train`.""" x_min, x_max = -3, 3 + rng = np.random.default_rng(1) # Create a random number generator x = rng.uniform(x_min, x_max, size=n_samples) - noise = 4.0 * rng.randn(n_samples) + noise = 4.0 * rng.normal(size=(n_samples,)) y = x**3 - 0.5 * (x + 1) ** 2 + noise y /= y.std() @@ -57,9 +54,8 @@ def generate_data(n_samples=30): # %% [markdown] # -# The relationship between our feature and the target to predict is non-linear. -# However, a decision tree is capable of approximating such a non-linear -# dependency: +# The target to predict is a non-linear function of the only feature. However, a +# decision tree is capable of approximating such a non-linear dependency: # %% from sklearn.tree import DecisionTreeRegressor @@ -86,23 +82,24 @@ def generate_data(n_samples=30): # # ## Bootstrap resampling # -# Given a dataset with `n` data points, bootstrapping corresponds to resampling -# with replacement `n` out of such `n` data points uniformly at random. +# Bootstrapping involves uniformly resampling `n` data points from a dataset of +# `n` points, with replacement, ensuring each sample has an equal chance of +# selection. # # As a result, the output of the bootstrap sampling procedure is another dataset -# with also n data points, but likely with duplicates. As a consequence, there -# are also data points from the original dataset that are never selected to -# appear in a bootstrap sample (by chance). Those data points that are left away -# are often referred to as the out-of-bag sample. +# with `n` data points, likely containing duplicates. Consequently, some data +# points from the original dataset may not be selected for a bootstrap sample. +# These unselected data points are often referred to as the out-of-bag sample. # -# We will create a function that given `data` and `target` will return a +# We now create a function that, given `data` and `target`, returns a # resampled variation `data_bootstrap` and `target_bootstrap`. # %% -def bootstrap_sample(data, target): +def bootstrap_sample(data, target, seed=0): # Indices corresponding to a sampling with replacement of the same sample # size than the original data + rng = np.random.default_rng(seed) bootstrap_indices = rng.choice( np.arange(target.shape[0]), size=target.shape[0], @@ -117,7 +114,7 @@ def bootstrap_sample(data, target): # %% [markdown] # -# We will generate 3 bootstrap samples and qualitatively check the difference +# We generate 3 bootstrap samples and qualitatively check the difference # with the original dataset. # %% @@ -127,6 +124,7 @@ def bootstrap_sample(data, target): data_bootstrap, target_bootstrap = bootstrap_sample( data_train, target_train, + seed=bootstrap_idx, # ensure bootstrap samples are different but reproducible ) plt.figure() plt.scatter( @@ -179,9 +177,9 @@ def bootstrap_sample(data, target): # %% [markdown] # # On average, roughly 63.2% of the original data points of the original dataset -# will be present in a given bootstrap sample. Since the bootstrap sample has -# the same size as the original dataset, there will be many samples that are in -# the bootstrap sample multiple times. +# are present in a given bootstrap sample. Since the bootstrap sample has the +# same size as the original dataset, there are many samples that are in the +# bootstrap sample multiple times. # # Using bootstrap we are able to generate many datasets, all slightly different. # We can fit a decision tree for each of these datasets and they all shall be @@ -193,7 +191,7 @@ def bootstrap_sample(data, target): tree = DecisionTreeRegressor(max_depth=3, random_state=0) data_bootstrap_sample, target_bootstrap_sample = bootstrap_sample( - data_train, target_train + data_train, target_train, seed=bootstrap_idx ) tree.fit(data_bootstrap_sample, target_bootstrap_sample) bag_of_trees.append(tree) @@ -224,7 +222,7 @@ def bootstrap_sample(data, target): # %% [markdown] # ## Aggregating # -# Once our trees are fitted, we are able to get predictions for each of them. In +# Once our trees are fitted, we are able to get predictions from each of them. In # regression, the most straightforward way to combine those predictions is just # to average them: for a given test data point, we feed the input feature values # to each of the `n` trained models in the ensemble and as a result compute `n` @@ -262,7 +260,7 @@ def bootstrap_sample(data, target): # %% [markdown] # -# The unbroken red line shows the averaged predictions, which would be the final +# The continuous red line shows the averaged predictions, which would be the final # predictions given by our 'bag' of decision tree regressors. Note that the # predictions of the ensemble is more stable because of the averaging operation. # As a result, the bag of trees as a whole is less likely to overfit than the @@ -298,7 +296,7 @@ def bootstrap_sample(data, target): bagged_trees_predictions = bagged_trees.predict(data_test) plt.plot(data_test["Feature"], bagged_trees_predictions) -_ = plt.title("Predictions from a bagging classifier") +_ = plt.title("Predictions from a bagging regressor") # %% [markdown] # Because we use 100 trees in the ensemble, the average prediction is indeed @@ -338,15 +336,14 @@ def bootstrap_sample(data, target): # %% [markdown] # We used a low value of the opacity parameter `alpha` to better appreciate the -# overlap in the prediction functions of the individual trees. -# -# This visualization gives some insights on the uncertainty in the predictions -# in different areas of the feature space. +# overlap in the prediction functions of the individual trees. Such +# visualization also gives us an intuition on the variance in the predictions +# across different zones of the feature space. # # ## Bagging complex pipelines # -# While we used a decision tree as a base model, nothing prevents us of using -# any other type of model. +# Even if here we used a decision tree as a base model, nothing prevents us from +# using any other type of model. # # As we know that the original data generating function is a noisy polynomial # transformation of the input variable, let us try to fit a bagged polynomial @@ -361,15 +358,14 @@ def bootstrap_sample(data, target): polynomial_regressor = make_pipeline( MinMaxScaler(), - PolynomialFeatures(degree=4), + PolynomialFeatures(degree=4, include_bias=False), Ridge(alpha=1e-10), ) # %% [markdown] -# This pipeline first scales the data to the 0-1 range with `MinMaxScaler`. Then -# it extracts degree-4 polynomial features. The resulting features will all stay -# in the 0-1 range by construction: if `x` lies in the 0-1 range then `x ** n` -# also lies in the 0-1 range for any value of `n`. +# This pipeline first scales the data to the 0-1 range using `MinMaxScaler`. It +# then generates degree-4 polynomial features. By design, these features remain +# in the 0-1 range, as any power of `x` within this range also stays within 0-1. # # Then the pipeline feeds the resulting non-linear features to a regularized # linear regression model for the final prediction of the target variable.