diff --git a/polars-missing-data/README.md b/polars-missing-data/README.md new file mode 100644 index 0000000000..5d1800160b --- /dev/null +++ b/polars-missing-data/README.md @@ -0,0 +1,10 @@ +These files will allow you to work along with the [How to Deal With Missing Data in Polars](https://realpython.com/how-to-deal-with-polars-missing-data/) tutorial. + +The files are: + +tutorial_code.ipynb - Contains the code you see in the tutorial. +tips.parquet - Parquet file containing tips information. +sales_trends.csv - CSV file containing sales trend data. +ft_exercise.parquet - Parquet file containing data used in consolidation exercise. +ft_exercise_solution.csv - Parquet file containing solution to consolidation exercise. + diff --git a/polars-missing-data/ft_exercise.parquet b/polars-missing-data/ft_exercise.parquet new file mode 100644 index 0000000000..7b11aa015e Binary files /dev/null and b/polars-missing-data/ft_exercise.parquet differ diff --git a/polars-missing-data/ft_exercise_solution.csv b/polars-missing-data/ft_exercise_solution.csv new file mode 100644 index 0000000000..f8e50219f0 --- /dev/null +++ b/polars-missing-data/ft_exercise_solution.csv @@ -0,0 +1,13 @@ +episode,series,title,original_date +1,1,A Touch of Class,1975-09-19 +2,1,The Builders,1975-09-26 +3,1,The Wedding Party,1975-10-03 +4,1,The Hotel Inspectors,1975-10-10 +5,1,Gourmet Night,1975-10-17 +6,1,The Germans,1975-10-24 +7,2,Communication Problems,1979-02-19 +8,2,The Psychiatrist,1979-02-26 +9,2,Waldorf Salad,1979-03-05 +10,2,The Kipper and the Corpse,1979-03-12 +11,2,The Anniversary,1979-03-26 +12,2,Basil the Rat,1979-10-25 diff --git a/polars-missing-data/sales_trends.csv b/polars-missing-data/sales_trends.csv new file mode 100644 index 0000000000..76621ce60a --- /dev/null +++ b/polars-missing-data/sales_trends.csv @@ -0,0 +1,6 @@ +product,last_year,current_year,next_year +A,17,19,29.0 +B,35,35,NaN +C,21,19, +D,42,50,-inf +E,23,25,inf diff --git a/polars-missing-data/tips.parquet b/polars-missing-data/tips.parquet new file mode 100644 index 0000000000..4dd2e91f38 Binary files /dev/null and b/polars-missing-data/tips.parquet differ diff --git a/polars-missing-data/tutorial_code.ipynb b/polars-missing-data/tutorial_code.ipynb new file mode 100644 index 0000000000..408e1ce561 --- /dev/null +++ b/polars-missing-data/tutorial_code.ipynb @@ -0,0 +1,404 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e47899a3-0806-41d6-a71f-738e7ef9d8d3", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3c5250e-010c-4b4c-a5fa-43a3cc86df30", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install polars\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a05aa96-ae34-41de-a7ef-1498e6d94cab", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "tips.collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22c85bb2-8b10-4075-ab58-3b212f1ed050", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips\n", + " .null_count()\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c94a5e17-883a-4728-ac18-e4381b793182", + "metadata": {}, + "source": [ + "# How to Work With Missing Data in Polars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11bc9817-6c80-492d-8846-48451e68fcb1", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(\n", + " tips\n", + " .filter(\n", + " pl.col(\"total\").is_null() & pl.col(\"tip\").is_null()\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d79f6c04-cfcd-45e5-aa36-4a097d6e2082", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips\n", + " .drop_nulls(pl.col(\"total\"))\n", + " .filter(\n", + " pl.col(\"total\").is_null() & pl.col(\"tip\").is_null()\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b7de256-b058-4b6d-b802-822019b0b7eb", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips.drop_nulls(pl.col(\"total\"))\n", + " .with_columns(pl.col(\"tip\").fill_null(0))\n", + " .filter(pl.col(\"tip\").is_null())\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c628e41c-fc20-4a56-85ea-9ff631e8d614", + "metadata": {}, + "source": [ + "# Using a More Strategic Approach" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10fd34e7-e94e-47f1-b9da-533b0550c9b7", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(tips.filter(pl.col(\"time\").is_null())).collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a84196c9-5032-4650-83dd-176319b6eed5", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips\n", + " .filter(\n", + " pl.col(\"record_id\").is_in([2, 3, 4, 14, 15, 16])\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acfdafa7-c9e0-49cc-8b1e-e4366ce2ac59", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips\n", + " .drop_nulls(\"total\")\n", + " .with_columns(pl.col(\"tip\").fill_null(0))\n", + " .with_columns(pl.col(\"time\").fill_null(strategy=\"forward\"))\n", + " .filter(pl.col(\"record_id\").is_in([3, 15]))\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "9c007132-c939-47b7-84b6-bf89c3da74a2", + "metadata": {}, + "source": [ + "# Dealing With Nulls Across Multiple Columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19504937-9a8b-48c9-b504-62db2bff178c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(\n", + " tips\n", + " .filter(\n", + " pl.all_horizontal(pl.col(\"total\", \"tip\").is_null())\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d5ba705-e675-4935-8aab-958a539bd66a", + "metadata": {}, + "outputs": [], + "source": [ + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(\n", + " tips\n", + " .filter(\n", + " ~pl.all_horizontal(pl.col(\"total\", \"tip\").is_null())\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29a6aab6-edb5-42cc-998b-7bd82f45ce8c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(\n", + " tips\n", + " .filter(\n", + " ~pl.all_horizontal(pl.col(\"total\", \"tip\").is_null())\n", + " )\n", + " .with_columns(pl.col(\"tip\").fill_null(0))\n", + " .with_columns(pl.col(\"time\").fill_null(strategy=\"forward\"))\n", + ").null_count().collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "32c00cbe-e300-4fd8-9a1e-f40371528fef", + "metadata": {}, + "source": [ + "# Dealing With Nulls by Column Data Type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e29d50f-b9f8-4545-b954-040490e6f15c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "scientists = pl.LazyFrame(\n", + " {\n", + " \"scientist_id\": [1, 2, 3, 4, 5],\n", + " \"first_name\": [\"Isaac\", \"Louis\", None, \"Charles\", \"Marie\"],\n", + " \"last_name\": [None, \"Pasteur\", \"Einstein\", \"Darwin\", \"Curie\"],\n", + " \"birth_year\": [1642, 1822, None, 1809, 1867],\n", + " \"death_year\": [1726, 1895, 1955, None, 1934],\n", + " }\n", + ")\n", + "\n", + "scientists.collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a5a990-d2cf-4dd2-8021-1a59e27c64d2", + "metadata": {}, + "outputs": [], + "source": [ + "import polars.selectors as cs\n", + "\n", + "(\n", + " scientists.with_columns(cs.string().fill_null(\"Unknown\")).with_columns(\n", + " cs.integer().fill_null(0)\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "f211113b-6988-4cf5-a0e9-c1c625b00148", + "metadata": {}, + "source": [ + "# Dealing With Those Pesky NaNs and infs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b706a22-cc6a-49c9-858c-69bb3f72cb48", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "sales_trends = pl.scan_csv(\"sales_trends.csv\")\n", + "\n", + "sales_trends.collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cde06c9-1a4c-45da-991d-cda5cd27542c", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " sales_trends\n", + " .with_columns(\n", + " pl.col(\"next_year\").replace(\n", + " [float(\"inf\"), -float(\"inf\"), float(\"NaN\")], None\n", + " )\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "babf6ca8-101f-40f8-8224-426eeece5a81", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " sales_trends\n", + " .with_columns(\n", + " pl.col(\"next_year\").replace(\n", + " [float(\"inf\"), -float(\"inf\"), float(\"NaN\")], None\n", + " )\n", + " )\n", + " .with_columns(\n", + " pl.col(\"next_year\").fill_null(\n", + " pl.col(\"current_year\")\n", + " + (pl.col(\"current_year\") - pl.col(\"last_year\"))\n", + " )\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "903c4028-c3af-49ba-be08-e98afa785c09", + "metadata": {}, + "source": [ + "# Practicing Your Skills - Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d564123d-42da-462b-a52a-c6a815e59b0d", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "episodes = pl.scan_parquet(\"ft_exercise.parquet\")\n", + "\n", + "episodes.null_count().collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "000b53ba-c5d3-4a75-89d7-86c36881a078", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "episodes = pl.scan_parquet(\"ft_exercise.parquet\")\n", + "\n", + "(\n", + " episodes\n", + " .with_columns(\n", + " pl.when(pl.col(\"episode\") == 6)\n", + " .then(pl.col(\"series\").fill_null(strategy=\"forward\"))\n", + " .otherwise(pl.col(\"series\").fill_null(strategy=\"backward\"))\n", + " )\n", + " .with_columns(\n", + " pl.when(pl.col(\"episode\") == 4)\n", + " .then(pl.col(\"title\").fill_null(\"The Hotel Inspectors\"))\n", + " .otherwise(pl.col(\"title\").fill_null(\"Waldorf Salad\"))\n", + " )\n", + " .with_columns(\n", + " pl.col(\"original_date\").interpolate()\n", + " )\n", + ").null_count().collect()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}