From 8b61efcd62af533a76241defb7e99dd0b43fbd3d Mon Sep 17 00:00:00 2001 From: eyrei123 Date: Thu, 2 Jan 2025 19:44:38 +0000 Subject: [PATCH] Fixed Notebook --- polars-missing-data/README.md | 10 + polars-missing-data/ft_exercise.parquet | Bin 0 -> 1871 bytes polars-missing-data/ft_exercise_solution.csv | 13 + polars-missing-data/sales_trends.csv | 6 + polars-missing-data/tips.parquet | Bin 0 -> 4051 bytes polars-missing-data/tutorial_code.ipynb | 404 +++++++++++++++++++ 6 files changed, 433 insertions(+) create mode 100644 polars-missing-data/README.md create mode 100644 polars-missing-data/ft_exercise.parquet create mode 100644 polars-missing-data/ft_exercise_solution.csv create mode 100644 polars-missing-data/sales_trends.csv create mode 100644 polars-missing-data/tips.parquet create mode 100644 polars-missing-data/tutorial_code.ipynb diff --git a/polars-missing-data/README.md b/polars-missing-data/README.md new file mode 100644 index 0000000000..5d1800160b --- /dev/null +++ b/polars-missing-data/README.md @@ -0,0 +1,10 @@ +These files will allow you to work along with the [How to Deal With Missing Data in Polars](https://realpython.com/how-to-deal-with-polars-missing-data/) tutorial. + +The files are: + +tutorial_code.ipynb - Contains the code you see in the tutorial. +tips.parquet - Parquet file containing tips information. +sales_trends.csv - CSV file containing sales trend data. +ft_exercise.parquet - Parquet file containing data used in the consolidation exercise. +ft_exercise_solution.csv - CSV file containing the solution to the consolidation exercise. 
+ diff --git a/polars-missing-data/ft_exercise.parquet b/polars-missing-data/ft_exercise.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7b11aa015e8a2acff2dfda9c8c70896541cf2e33 GIT binary patch literal 1871 zcmb_dZ)jUp6hH6fC4F6!_6>J0?@0#1k!MQPZXKzs6!Erh+1j|JA+@be?Q4>yp=mSo zV$%f%8=I`3l&wx3{^`h=3>i8T=@-i;qV$vf)QKNN*WtPEHEpzPIN9Z$ z`|q6lJHLC*xx5IAZX*);Tp;_yL?r+a__%S|*Z;5pO9F`C012Gnf+~=q8a6-;xY35U zniy&Y0>np}9>SvvaS@ld83Jz6}(*eAhv)iQFmJA&~+bV}nUwPco56kEDDNBbPs< zD$Ezlz7!wyWe@n;GlprRV=G4A`JFJZVT=D#aI^c%t#0!-BiUhg{(AV-b$xnyYPeu{ zE=?Z&^VM^D>4%f&KZNN;^P{1;#VgW}_n+!LajkxOaeMIU^YTDw;hn!upT4hUA@jzO zA^oj|o)g#O-`0k^%F z=p`o)&VRcyhUfqptB92tkf0U|M0!x>3A979{fuBIX1R`U2T@v4+EF@CWDh_c3PstB z(tyH2h-fqPWOM0Mdc?@QkTCK#(+dKf5%gQtt(sBILrwaUP0xtIxP6kwgYTuodp z+3Izf)ztODsIa%Ls;Sn!`L+m}mmU$waRD9j`DF(wu1rf#GUr&C->N1vmI>Z-|86Ol z&E|=W&&e;YJ|%m#QolglsIt@Lhp-pHiHJM@43FO(&Y*^>npX?!$3^ezGq|hwN#Cy} zDn08akAwb+v<7llZ;5_UODO%x0{LC`-jj->^PJe*9?}o`RE_ul)CgVT5H~&K5uhJb zQ&pGgth|Y*N_DenbYIV7W_&O?Y^dKm8hP8+)y5dxiNbb_;(Dx$PjPK+buwnMcE)h* zVhs`07Ybazhq2U(9k+in&KENk@T1(mXss>dkgdYA2-dcTp<^6(a(rwSE@P1# z9@>HHKGv4vI9qYvHX0w!j2Jz^!GT>ZLw&u`3_oGvAF|F>;&-F!4){fmkGN>Usqc;+ zOoYbP@QM~xB_5}D2fP^KwV;V*gNb6Dx1+7?7 z%caF~h#Zz;SroOR#EY)At;JSr@u;|3E48+^`^_X#+TC{B?e|Oaz4_kv{_lP7z3;tA z3?3^*Xvk3%*-wi?Zcz{jf&~3;Pggb4q8ZTSNB*|vJFm_<`g~|q?(SDK8ijlBS~{K;GwrYsS=_ zpY~;*-{l&q?nu~hBR|xp@Wg|Vx!JAxA?%bb4}z_irRE1Ci-H~mSw-^lgDe7R4+3W| zxRDoV=6>RSfQfT!UVssIi}}y)XI-{wq$f?SG;Tw8>&9IBHI`$}ogH;&g!`&2&p7Q- z)tz>HBg*nL_cfQgQTqy0%h7qIw6fkf3Pgnw2-!?Uwo_dQ7nvh;gw9bw0!AP37BN^1 zSW6JUg!}A{8POpsc(_G@8Ez0vq0$WKh72QP6H~KUv&}6mt(XXE&0^1)%dxSYXJ_xg zug8B9*zzJrwgjy}TFr`1<){ctBuKa7buactm7W^a~4Pl(CEA7RN71 zSem$O`HB};u1ZQyNli=7$jn-uty+_lo0nfuSX8VoS-WoihK;3vd#P;G=JG8SFTb+& z@0G7sy;fcGdTrgd?QiU;-}&aQhTVJizSY>&+|s&l|J!W`-f8dX>^j(esHd0UBQ&-L zL^o%%j0)0Hatl&dW~Cyg=q(C5I0x-v@EJb89RO-9-~H!mh1nc4dTP zEJn@9Bf;_-2Bve@sy(zR1d3xDfoJM>mN50F!xVVHw&H8$V;{`d_;O2LsRDznk1DWf zAU1*oVI)-O0*nNjEx?WwcoYxpu2O)x_wq-?SS+A`AdnVe!sYgl+`2-6dNB`6-!GS9 
z#u4&LxBkG3?&qaLA}mSh;g0Qk6AU(YsSKM36c5;nvHWiNDXHf|$vue-3-&jbz=4_G z*aoIU*j{I2euS)EiZSJdk{X7$+(0a_kYa+*yhFW;T^-Z;A=!$r<(P6nTq^hQ@|C$j zHe$yB$BgiL?-hZ;o)yB`epu_5UZP5ovk!JwBlK~4WXtuFi)o*xpFK3VL?mtkdXZAy7jxYBV0$BvS0T5!R~+^(gET38-r zJiOtAujXpP;oEB;JN8U8q#tRGysct|zE>j|b11F3eyr$D^v26`dc&N~p7h&&%~W-C zSW;*Uc25oc!Y&#v`D@wU-e+_(6OrTpLAECZtA`3mQ4u~s%YwO8{;)Yz8v+T+IDus z)*LhMbCcmKHHVSjms2PDEZ+7qIn!n9CwOI6^T(I=UK7%mSDu?w%j){jnwt6XOPe=M zL@V3jRF8+dzdHVOQ}%_=rYH9>FE#BmQ&}BTHxz|*x=%QG)J{w#A3jpMbo9{a_eYMU zIsGtoXXVYWS~#&L7tb6<8|jEK+YzF(Sq4S9MM)~{HL0^kCr$VrMm~z)NPcincB04= zgPHe)Az*-800s}-6D@V$AIxM`uA=QRfh=DYN%JB9)^`{i3$!*G| zw^1gq;-fMO+rD?bWf@#M5UWf$`P0u*n^E(Mn=T)OJ!;J7N5>^5 ztn!AQjOznKyxr&W5<9MQOO~X;g6dDaRc7y|vD;iP)zu8$>)&B+X1R5RN~3Ok^}V|Z z!!3(*)x#^UxFj2LE>I`>ZGxq9_7`_4xGkCqt|C(WMq;<@wy@~h z4SI103s~P$U?H<3-HuCRoo_1lv}_ErL&}a}EtZf0Y=D^RYBoJ1EhjatKx^$87`+Gc z`)r7c{VVVp`axUMHZ~Y$kqwP(Wi#|z16vHIN(~(#i%VBa>fim&_pAz{J^%%K76P#m zkpA?-HM!Y(tsBv5n7_r2P^v=&-Vh-^Q9zIWRLB8P@(E`YWD(P%q#|-E&H;AWA+lfk zg}SIXheZWj_0r~>w9=r>QUp*J4A;SwXC9!YCau#RzZFJDVg6Nn!jc&nz#2Z{IWt67 zZ$c0enxL$loU{T1Kt&bjq+|kHKrCn4LK7H}92Q->Rc{axV*E++wrHY?uKHLu?id-glnqA>7So5nh|Is5><%cRK%&dO7Gh8giEk zEQHt`w8IFQG||yNBFH#hM{lu3_5#O{d$!mi)q1+m0V&WeYj{W*xr|WQRbjM(+}t8$ z7KG3oDKlP0AjB9vYwN9+CT=J2HCWXpzatdI0g4caSkh*umQSRHo=NN{ocJj3`CB?uiQ5ZaswE1KtV zo@L=5Ee+@wg#4rvA_J$e=YaVCfh&$aCWLkh5C@3ApZi?t|GbUp>mme22L*@-;%gkA z3;w}7gYFjry*C7i976vY4T8vLpk?+##CEkT8nV{sd`V8F2ArHWDJQ#d& zPc>Lrt0YM9OfE&@cp)B$<6u_e&M|;rQbO>n1IsY3~(@FRubpdpAN^}NPm!akw;09N!3ZR_;p~%f5(dj zzW5V(iL&_O#L(w>(S-6}*audA2hW*^X9@6Y1`g~4x(Sy8|E1{c`>S{XL}k>n}@)B{}7h=(4 Ya#cwMh3t)>rv2v!aslls2K=4>HwCV_jQ{`u literal 0 HcmV?d00001 diff --git a/polars-missing-data/tutorial_code.ipynb b/polars-missing-data/tutorial_code.ipynb new file mode 100644 index 0000000000..408e1ce561 --- /dev/null +++ b/polars-missing-data/tutorial_code.ipynb @@ -0,0 +1,404 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e47899a3-0806-41d6-a71f-738e7ef9d8d3", + "metadata": {}, + "source": [ + "# 
Introduction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3c5250e-010c-4b4c-a5fa-43a3cc86df30", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install polars\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a05aa96-ae34-41de-a7ef-1498e6d94cab", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "tips.collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22c85bb2-8b10-4075-ab58-3b212f1ed050", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips\n", + " .null_count()\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c94a5e17-883a-4728-ac18-e4381b793182", + "metadata": {}, + "source": [ + "# How to Work With Missing Data in Polars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11bc9817-6c80-492d-8846-48451e68fcb1", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(\n", + " tips\n", + " .filter(\n", + " pl.col(\"total\").is_null() & pl.col(\"tip\").is_null()\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d79f6c04-cfcd-45e5-aa36-4a097d6e2082", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips\n", + " .drop_nulls(pl.col(\"total\"))\n", + " .filter(\n", + " pl.col(\"total\").is_null() & pl.col(\"tip\").is_null()\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b7de256-b058-4b6d-b802-822019b0b7eb", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips.drop_nulls(pl.col(\"total\"))\n", + " .with_columns(pl.col(\"tip\").fill_null(0))\n", + " .filter(pl.col(\"tip\").is_null())\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c628e41c-fc20-4a56-85ea-9ff631e8d614", + 
"metadata": {}, + "source": [ + "# Using a More Strategic Approach" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10fd34e7-e94e-47f1-b9da-533b0550c9b7", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(tips.filter(pl.col(\"time\").is_null())).collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a84196c9-5032-4650-83dd-176319b6eed5", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips\n", + " .filter(\n", + " pl.col(\"record_id\").is_in([2, 3, 4, 14, 15, 16])\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acfdafa7-c9e0-49cc-8b1e-e4366ce2ac59", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " tips\n", + " .drop_nulls(\"total\")\n", + " .with_columns(pl.col(\"tip\").fill_null(0))\n", + " .with_columns(pl.col(\"time\").fill_null(strategy=\"forward\"))\n", + " .filter(pl.col(\"record_id\").is_in([3, 15]))\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "9c007132-c939-47b7-84b6-bf89c3da74a2", + "metadata": {}, + "source": [ + "# Dealing With Nulls Across Multiple Columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19504937-9a8b-48c9-b504-62db2bff178c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(\n", + " tips\n", + " .filter(\n", + " pl.all_horizontal(pl.col(\"total\", \"tip\").is_null())\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d5ba705-e675-4935-8aab-958a539bd66a", + "metadata": {}, + "outputs": [], + "source": [ + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(\n", + " tips\n", + " .filter(\n", + " ~pl.all_horizontal(pl.col(\"total\", \"tip\").is_null())\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "29a6aab6-edb5-42cc-998b-7bd82f45ce8c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "tips = pl.scan_parquet(\"tips.parquet\")\n", + "\n", + "(\n", + " tips\n", + " .filter(\n", + " ~pl.all_horizontal(pl.col(\"total\", \"tip\").is_null())\n", + " )\n", + " .with_columns(pl.col(\"tip\").fill_null(0))\n", + " .with_columns(pl.col(\"time\").fill_null(strategy=\"forward\"))\n", + ").null_count().collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "32c00cbe-e300-4fd8-9a1e-f40371528fef", + "metadata": {}, + "source": [ + "# Dealing With Nulls by Column Data Type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e29d50f-b9f8-4545-b954-040490e6f15c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "scientists = pl.LazyFrame(\n", + " {\n", + " \"scientist_id\": [1, 2, 3, 4, 5],\n", + " \"first_name\": [\"Isaac\", \"Louis\", None, \"Charles\", \"Marie\"],\n", + " \"last_name\": [None, \"Pasteur\", \"Einstein\", \"Darwin\", \"Curie\"],\n", + " \"birth_year\": [1642, 1822, None, 1809, 1867],\n", + " \"death_year\": [1726, 1895, 1955, None, 1934],\n", + " }\n", + ")\n", + "\n", + "scientists.collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a5a990-d2cf-4dd2-8021-1a59e27c64d2", + "metadata": {}, + "outputs": [], + "source": [ + "import polars.selectors as cs\n", + "\n", + "(\n", + " scientists.with_columns(cs.string().fill_null(\"Unknown\")).with_columns(\n", + " cs.integer().fill_null(0)\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "f211113b-6988-4cf5-a0e9-c1c625b00148", + "metadata": {}, + "source": [ + "# Dealing With Those Pesky NaNs and infs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b706a22-cc6a-49c9-858c-69bb3f72cb48", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "sales_trends = 
pl.scan_csv(\"sales_trends.csv\")\n", + "\n", + "sales_trends.collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cde06c9-1a4c-45da-991d-cda5cd27542c", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " sales_trends\n", + " .with_columns(\n", + " pl.col(\"next_year\").replace(\n", + " [float(\"inf\"), -float(\"inf\"), float(\"NaN\")], None\n", + " )\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "babf6ca8-101f-40f8-8224-426eeece5a81", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " sales_trends\n", + " .with_columns(\n", + " pl.col(\"next_year\").replace(\n", + " [float(\"inf\"), -float(\"inf\"), float(\"NaN\")], None\n", + " )\n", + " )\n", + " .with_columns(\n", + " pl.col(\"next_year\").fill_null(\n", + " pl.col(\"current_year\")\n", + " + (pl.col(\"current_year\") - pl.col(\"last_year\"))\n", + " )\n", + " )\n", + ").collect()\n" + ] + }, + { + "cell_type": "markdown", + "id": "903c4028-c3af-49ba-be08-e98afa785c09", + "metadata": {}, + "source": [ + "# Practicing Your Skills - Solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d564123d-42da-462b-a52a-c6a815e59b0d", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "episodes = pl.scan_parquet(\"ft_exercise.parquet\")\n", + "\n", + "episodes.null_count().collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "000b53ba-c5d3-4a75-89d7-86c36881a078", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "episodes = pl.scan_parquet(\"ft_exercise.parquet\")\n", + "\n", + "(\n", + " episodes\n", + " .with_columns(\n", + " pl.when(pl.col(\"episode\") == 6)\n", + " .then(pl.col(\"series\").fill_null(strategy=\"forward\"))\n", + " .otherwise(pl.col(\"series\").fill_null(strategy=\"backward\"))\n", + " )\n", + " .with_columns(\n", + " pl.when(pl.col(\"episode\") == 4)\n", + " 
.then(pl.col(\"title\").fill_null(\"The Hotel Inspectors\"))\n", + " .otherwise(pl.col(\"title\").fill_null(\"Waldorf Salad\"))\n", + " )\n", + " .with_columns(\n", + " pl.col(\"original_date\").interpolate()\n", + " )\n", + ").null_count().collect()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}