diff --git a/sklearn-train-test-split/README.md b/sklearn-train-test-split/README.md
new file mode 100644
index 0000000000..97af37c342
--- /dev/null
+++ b/sklearn-train-test-split/README.md
@@ -0,0 +1,33 @@
+# Split Your Dataset With scikit-learn's `train_test_split()`
+
+The `train_test_split()` function in `sklearn` is a useful tool to prepare your dataset for machine learning tasks. This folder contains the code examples from the tutorial on [splitting your dataset with scikit-learn's `train_test_split()`](https://realpython.com/train-test-split-python-data/).
+
+## Installation
+
+1. Create a Python virtual environment
+
+```sh
+$ python -m venv ./venv
+$ source venv/bin/activate
+(venv) $
+```
+
+2. Install the requirements
+
+```sh
+(venv) $ pip install -r requirements.txt
+```
+
+## Run the Scripts
+
+```sh
+(venv) $ python script_name.py
+```
+
+## About the Author
+
+Martin Breuss - Email: martin@realpython.com
+
+## License
+
+Distributed under the MIT license. See ``LICENSE`` for more information.
diff --git a/sklearn-train-test-split/california_housing_example.py b/sklearn-train-test-split/california_housing_example.py
new file mode 100644
index 0000000000..53ace85e18
--- /dev/null
+++ b/sklearn-train-test-split/california_housing_example.py
@@ -0,0 +1,34 @@
+"""Compare three regressors on the California housing dataset.
+
+Each model is fitted on the training part of a 60/40 split, then its
+``score()`` is printed for the training data and for the test data.
+"""
+
+from sklearn.datasets import fetch_california_housing
+from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+
+x, y = fetch_california_housing(return_X_y=True)
+
+# Hold out 40% of the samples for testing; the fixed seed keeps the
+# split the same from one run to the next.
+x_train, x_test, y_train, y_test = train_test_split(
+    x, y, test_size=0.4, random_state=0
+)
+
+
+model = LinearRegression().fit(x_train, y_train)
+print("LinearRegression:")
+print(model.score(x_train, y_train))
+print(model.score(x_test, y_test), end="\n\n")
+
+model = GradientBoostingRegressor(random_state=0).fit(x_train, y_train)
+print("GradientBoostingRegressor:")
+print(model.score(x_train, y_train))
+print(model.score(x_test, y_test), end="\n\n")
+
+model = RandomForestRegressor(random_state=0).fit(x_train, y_train)
+print("RandomForestRegressor:")
+print(model.score(x_train, y_train))
+print(model.score(x_test, y_test), end="\n\n")
diff --git a/sklearn-train-test-split/explore_train_test_split.py b/sklearn-train-test-split/explore_train_test_split.py
new file mode 100644
index 0000000000..b64a6d3897
--- /dev/null
+++ b/sklearn-train-test-split/explore_train_test_split.py
@@ -0,0 +1,50 @@
+"""Explore how ``train_test_split()`` arguments change the resulting split.
+
+A 12-sample toy dataset is split four times: with the defaults, with an
+absolute test size, with stratification, and without shuffling.
+"""
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+x = np.arange(1, 25).reshape(12, 2)
+y = np.array([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+
+
+# Default arguments: no seed is given, so the split can differ between runs.
+x_train, x_test, y_train, y_test = train_test_split(x, y)
+print(x_train)
+print(x_test)
+print(y_train)
+print(y_test)
+
+# An integer test_size is an absolute count: 4 of the 12 samples.
+x_train, x_test, y_train, y_test = train_test_split(
+    x, y, test_size=4, random_state=4
+)
+# Uncomment to view output
+# print(x_train)
+# print(x_test)
+# print(y_train)
+# print(y_test)
+
+# stratify=y keeps the ratio of 0 and 1 labels similar in both subsets.
+x_train, x_test, y_train, y_test = train_test_split(
+    x, y, test_size=0.33, random_state=4, stratify=y
+)
+# Uncomment to view output
+# print(x_train)
+# print(x_test)
+# print(y_train)
+# print(y_test)
+
+
+# shuffle=False preserves order: the test set is the tail of the data.
+x_train, x_test, y_train, y_test = train_test_split(
+    x, y, test_size=0.33, shuffle=False
+)
+# Uncomment to view output
+# print(x_train)
+# print(x_test)
+# print(y_train)
+# print(y_test)
diff --git a/sklearn-train-test-split/fit_and_score_example.py b/sklearn-train-test-split/fit_and_score_example.py
new file mode 100644
index 0000000000..c37ca28671
--- /dev/null
+++ b/sklearn-train-test-split/fit_and_score_example.py
@@ -0,0 +1,49 @@
+"""Fit a linear regression on a small dataset and score it on held-out data.
+
+Twelve of the twenty samples are used for fitting; the remaining eight
+are used to check how well the fitted line generalizes.
+"""
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+
+x = np.arange(20).reshape(-1, 1)
+y = np.array(
+    [
+        5,
+        12,
+        11,
+        19,
+        30,
+        29,
+        23,
+        40,
+        51,
+        54,
+        74,
+        62,
+        68,
+        73,
+        89,
+        84,
+        89,
+        101,
+        99,
+        106,
+    ]
+)
+
+
+# Reserve 8 of the 20 samples for testing, with a fixed seed.
+x_train, x_test, y_train, y_test = train_test_split(
+    x, y, test_size=8, random_state=0
+)
+
+model = LinearRegression().fit(x_train, y_train)
+print(model.intercept_)
+print(model.coef_)
+
+# score() is the coefficient of determination: training data, then test data.
+print(model.score(x_train, y_train))
+print(model.score(x_test, y_test))
diff --git a/sklearn-train-test-split/requirements.txt b/sklearn-train-test-split/requirements.txt
new file mode 100644
index 0000000000..60449b19b8
--- /dev/null
+++ b/sklearn-train-test-split/requirements.txt
@@ -0,0 +1,5 @@
+joblib==1.4.2
+numpy==2.0.0
+scikit-learn==1.5.0
+scipy==1.14.0
+threadpoolctl==3.5.0