diff --git a/python_scripts/linear_models_feature_engineering_classification.py b/python_scripts/linear_models_feature_engineering_classification.py
index 9fd203f34..12a2997da 100644
--- a/python_scripts/linear_models_feature_engineering_classification.py
+++ b/python_scripts/linear_models_feature_engineering_classification.py
@@ -235,7 +235,10 @@ def plot_decision_boundary(model, title=None):
 # %%
 from sklearn.preprocessing import KBinsDiscretizer
 
-classifier = make_pipeline(KBinsDiscretizer(n_bins=5), LogisticRegression())
+classifier = make_pipeline(
+    KBinsDiscretizer(n_bins=5, encode="onehot"),  # already the default params
+    LogisticRegression(),
+)
 classifier
 
 # %%
@@ -279,15 +282,20 @@ def plot_decision_boundary(model, title=None):
 # We can see that the decision boundary is now smooth, and while it favors
 # axis-aligned decision rules when extrapolating in low density regions, it can
 # adopt a more curvy decision boundary in the high density regions.
-#
-# Note however, that the number of knots is a hyperparameter that needs to be
-# tuned. If we use too few knots, the model would underfit the data, as shown on
-# the moons dataset. If we use too many knots, the model would overfit the data.
-#
 # However, as for the binning transformation, the model still fails to separate
 # the data for the XOR dataset, irrespective of the number of knots, for the
 # same reasons: **the spline transformation is a feature-wise transformation**
 # and thus **cannot capture interactions** between features.
+#
+# Keep in mind that the number of knots is a hyperparameter that needs to be
+# tuned. If we use too few knots, the model underfits the data, as shown on the
+# moons dataset. If we use too many knots, the model overfits the data.
+#
+# ```{note}
+# `KBinsDiscretizer(encode="onehot")` and `SplineTransformer` do not require
+# additional scaling. Indeed, they can replace the scaling step for numerical
+# features: they both create features with values in the [0, 1] range.
+# ```
 
 # %% [markdown]
 #
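For anyone trying the first hunk outside the notebook, here is a minimal, self-contained sketch of the refactored pipeline. The `make_moons` data and the fit/score calls are illustrative assumptions, not part of the patch:

```python
# Minimal sketch of the pipeline from the first hunk, fit on a stand-in
# dataset (make_moons is an assumption; the notebook uses its own data).
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer

X, y = make_moons(n_samples=200, noise=0.2, random_state=0)

classifier = make_pipeline(
    KBinsDiscretizer(n_bins=5, encode="onehot"),  # already the default params
    LogisticRegression(),
)
classifier.fit(X, y)
print(classifier.score(X, y))  # training accuracy, for a quick sanity check
```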
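The relocated knots paragraph describes a classic under/overfitting trade-off. A sketch of how that tuning could be done with a grid search follows; the grid values and the dataset are assumptions for illustration:

```python
# Sketch of tuning the number of knots mentioned in the second hunk;
# the candidate values and dataset are assumptions.
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer

X, y = make_moons(n_samples=200, noise=0.2, random_state=0)

model = make_pipeline(SplineTransformer(), LogisticRegression())
# make_pipeline names the step after the class, hence the prefix below.
param_grid = {"splinetransformer__n_knots": [3, 5, 10, 20]}
search = GridSearchCV(model, param_grid, cv=5).fit(X, y)
print(search.best_params_)
```

Note that no amount of knot tuning helps on the XOR dataset, since the spline expansion remains feature-wise and cannot capture interactions.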
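The new `{note}` makes a concrete, checkable claim about output ranges. A quick sanity check, assuming random Gaussian inputs, might look like:

```python
# Verifies the note's claim that both transformers emit features in the
# [0, 1] range; the random input data is an assumption.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer, SplineTransformer

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))

Xt_bins = KBinsDiscretizer(n_bins=5, encode="onehot").fit_transform(X)
Xt_spline = SplineTransformer().fit_transform(X)

print(Xt_bins.min(), Xt_bins.max())      # 0.0 1.0 (sparse one-hot indicators)
print(Xt_spline.min(), Xt_spline.max())  # within [0, 1] on the training range
```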