realpython · brendaweles · Jul 3, 2024 · Jun 27, 2024 · Jul 3, 2024 · Jul 3, 2024
diff --git a/sklearn-train-test-split/README.md b/sklearn-train-test-split/README.md
@@ -0,0 +1,33 @@
+# Split Your Dataset With scikit-learn's `train_test_split()`
+
+The `train_test_split()` function in `sklearn` is a useful tool to prepare your dataset for machine learning tasks. This folder contains the code examples from the tutorial on [splitting your dataset with scikit-learn's `train_test_split()`](https://realpython.com/train-test-split-python-data/).
+
+## Installation
+
+1. Create a Python virtual environment
+
+```sh
+$ python -m venv ./venv
+$ source venv/bin/activate
+(venv) $
+```
+
+2. Install the requirements
+
+```sh
+(venv) $ pip install -r requirements.txt
+```
+
+## Run the Scripts
+
+```sh
+(venv) $ python script_name.py
+```
+
+## About the Author
+
+Martin Breuss - Email: martin@realpython.com
+
+## License
+
+Distributed under the MIT license. See ``LICENSE`` for more information.
diff --git a/sklearn-train-test-split/california_housing_example.py b/sklearn-train-test-split/california_housing_example.py
@@ -0,0 +1,30 @@
+"""Compare three regressors on the California housing dataset.
+
+Every model is fitted on the same train/test split, and its score on the
+training data and on the test data is printed.
+"""
+
+from sklearn.datasets import fetch_california_housing
+from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+
+x, y = fetch_california_housing(return_X_y=True)
+
+# A single split, seeded with random_state, is shared by all three models.
+x_train, x_test, y_train, y_test = train_test_split(
+    x, y, test_size=0.4, random_state=0
+)
+
+# Heading to print for each model, paired with the unfitted estimator.
+estimators = [
+    ("LinearRegression:", LinearRegression()),
+    ("GradientBoostingRegressor:", GradientBoostingRegressor(random_state=0)),
+    ("RandomForestRegressor:", RandomForestRegressor(random_state=0)),
+]
+
+for heading, estimator in estimators:
+    model = estimator.fit(x_train, y_train)
+    print(heading)
+    print(model.score(x_train, y_train))  # score on the training data
+    print(model.score(x_test, y_test), end="\n\n")  # score on the test data
diff --git a/sklearn-train-test-split/explore_train_test_split.py b/sklearn-train-test-split/explore_train_test_split.py
@@ -0,0 +1,37 @@
+"""Explore how the arguments of ``train_test_split()`` change the split."""
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+x = np.arange(1, 25).reshape(12, 2)
+y = np.array([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+
+
+def show(*arrays):
+    """Print every array in ``arrays``, one after another."""
+    for array in arrays:
+        print(array)
+
+
+# Default arguments; no random_state is passed for this split.
+split = train_test_split(x, y)
+x_train, x_test, y_train, y_test = split
+show(x_train, x_test, y_train, y_test)
+
+# Integer test_size with a fixed random_state.
+split = train_test_split(x, y, test_size=4, random_state=4)
+x_train, x_test, y_train, y_test = split
+# Uncomment to view output
+# show(x_train, x_test, y_train, y_test)
+
+# Float test_size, stratified on the labels in y.
+split = train_test_split(x, y, test_size=0.33, random_state=4, stratify=y)
+x_train, x_test, y_train, y_test = split
+# Uncomment to view output
+# show(x_train, x_test, y_train, y_test)
+
+# Float test_size with shuffling turned off.
+split = train_test_split(x, y, test_size=0.33, shuffle=False)
+x_train, x_test, y_train, y_test = split
+# Uncomment to view output
+# show(x_train, x_test, y_train, y_test)
diff --git a/sklearn-train-test-split/fit_and_score_example.py b/sklearn-train-test-split/fit_and_score_example.py
@@ -0,0 +1,29 @@
+"""Fit a linear regression on a small dataset and score it on both splits."""
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+
+# One feature column holding 0..19, and twenty matching target values.
+x = np.arange(20).reshape(-1, 1)
+# fmt: off
+y = np.array([
+    5, 12, 11, 19, 30, 29, 23, 40, 51, 54,
+    74, 62, 68, 73, 89, 84, 89, 101, 99, 106,
+])
+# fmt: on
+
+# Integer test_size with a fixed random_state.
+split = train_test_split(x, y, test_size=8, random_state=0)
+x_train, x_test, y_train, y_test = split
+
+model = LinearRegression()
+model.fit(x_train, y_train)
+
+# Fitted parameters of the model.
+print(model.intercept_)
+print(model.coef_)
+
+# Score on the training data first, then on the test data.
+for features, targets in ((x_train, y_train), (x_test, y_test)):
+    print(model.score(features, targets))
diff --git a/sklearn-train-test-split/requirements.txt b/sklearn-train-test-split/requirements.txt
@@ -0,0 +1,5 @@
+joblib==1.4.2
+numpy==2.0.0
+scikit-learn==1.5.0
+scipy==1.14.0
+threadpoolctl==3.5.0