├── pyproject.toml ├── .devcontainer └── devcontainer.json ├── README.md └── polars └── Idiomatic Polars.ipynb /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pres" 3 | version = "0.1.0" 4 | description = "pres" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "altair>=5.4.1", 9 | "catboost>=1.2.7", 10 | "gpxpy>=1.6.2", 11 | "matplotlib>=3.9.2", 12 | "notebook>=7.2.2", 13 | "pandas>=2.2.3", 14 | "plotly>=5.24.1", 15 | "polars>=1.12.0", 16 | "pyarrow>=18.0.0", 17 | "scikit-learn>=1.5.2", 18 | "seaborn>=0.13.2", 19 | "shap>=0.46.0", 20 | "xlrd>=2.0.1", 21 | "yellowbrick>=1.5", 22 | ] 23 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "mcr.microsoft.com/devcontainers/universal:2", 3 | "hostRequirements": { 4 | "cpus": 4 5 | }, 6 | "waitFor": "onCreateCommand", 7 | "updateContentCommand": "python3 -m pip install uv; uv sync", 8 | "postCreateCommand": "", 9 | "customizations": { 10 | "codespaces": { 11 | "openFiles": [] 12 | }, 13 | "vscode": { 14 | "extensions": [ 15 | "ms-toolsai.jupyter", 16 | "ms-python.python" 17 | ] 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyData NYC 2024 2 | 3 | 4 | ## Contents 5 | 6 | - **Notebooks**: 7 | - `polars/Idiomatic Polars.ipynb` : "Slides" for Polars 8 | - **Environment Setup**: Instructions for running these notebooks in various environments (locally, Codespaces, and Google Colab). 9 | 10 | ## Running the Jupyter Notebooks 11 | 12 | You can run the notebooks in several ways, depending on your preferences and setup: 13 | 14 | ### 1. 
Running Locally 15 | 16 | To run the notebooks locally, you will need to create a virtual environment and install the necessary dependencies. 17 | 18 | 1. Clone the repository: 19 | ```sh 20 | git clone git@github.com:mattharrison/pydata_nyc_2024.git 21 | cd pydata_nyc_2024 22 | ``` 23 | 24 | 2. Set up the environment using `uv`: 25 | 26 | Mac/Linux: 27 | ```sh 28 | $ curl -LsSf https://astral.sh/uv/install.sh | sh 29 | ``` 30 | Windows: 31 | ```cmd 32 | > powershell -c "irm https://astral.sh/uv/install.ps1 | iex" 33 | ``` 34 | 35 | Then run: 36 | ``` 37 | uv sync 38 | ``` 39 | 40 | The above commands install `uv`, then create a virtual environment (`.venv`) and install the dependencies into it. The environment is not activated in your shell; instead, `uv run` (next step) executes commands inside it. 41 | 42 | 3. Start Jupyter Notebook: 43 | ```sh 44 | uv run jupyter notebook 45 | ``` 46 | 47 | ### 2. Running on GitHub Codespaces 48 | 49 | GitHub Codespaces allows you to run this project entirely in the cloud without needing to set up a local environment. 50 | 51 | 1. Open the GitHub repository in your browser. 52 | 2. Click the **Code** button and select **Open with Codespaces**. 53 | 3. After the Codespace launches, wait for it to install the environment. 54 | 4. Click on the notebook and select the local kernel. 55 | 56 | The project is pre-configured to install the necessary dependencies when the Codespace is first created. 57 | 58 | ### 3. Running on Google Colab 59 | 60 | You can also run the notebooks on Google Colab, which provides free GPU/TPU resources for faster computations. 61 | 62 | 1. Open the repository on GitHub. 63 | 2. Navigate to the desired Jupyter notebook file (ending in `.ipynb`). 64 | 3. In the browser's address bar, change the domain in the notebook's URL from `github.com` to `githubtocolab.com`. 65 | 4. Once in Colab, you may need to run the first cell to install any required dependencies. 66 | 67 | ## Dependencies 68 | 69 | 70 | To see the complete list of dependencies, please check the `pyproject.toml` file. 
71 | 72 | ## Suggested Reading 73 | 74 | To deepen your understanding of Python for data analysis, I recommend the following books: 75 | 76 | - [*Learning Python for Data*](https://store.metasnake.com/learningpy): This book provides a comprehensive introduction to Python, with a focus on its applications in data analysis. It covers Python fundamentals, essential libraries, and practical examples to help you get started with data-driven projects. 77 | 78 | - [*Effective Polars*](https://store.metasnake.com/a5018258-063b-4802-b395-34e75b6eeb5e): A guide to mastering data manipulation and analysis with Polars. 79 | 80 | ## Contributing 81 | 82 | Contributions are welcome! Feel free to open an issue or submit a pull request if you have suggestions, improvements, or additional notebooks to add. 83 | 84 | ## License 85 | 86 | This project is licensed under the MIT License. 87 | 88 | ## Contact 89 | 90 | If you have any questions, feel free to reach out or open an issue on the repository. 91 | 92 | -------------------------------------------------------------------------------- /polars/Idiomatic Polars.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "lines_to_next_cell": 0, 7 | "pycharm": { 8 | "name": "#%% md\n" 9 | } 10 | }, 11 | "source": [ 12 | "# Idiomatic Polars \n", 13 | "\n", 14 | "## Matt Harrison - PyData NYC 2024\n", 15 | "\n", 16 | "## https://github.com/mattharrison/pydata_nyc_2024\n", 17 | "\n", 18 | "\n", 19 | "\n", 20 | " 5\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_columns\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcol\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43melapsed\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcast\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mInt8\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m#.describe()\u001b[39;00m\n\u001b[1;32m 7\u001b[0m )\n", 1109 | "File \u001b[0;32m~/.envs/menv/lib/python3.10/site-packages/polars/dataframe/frame.py:9194\u001b[0m, in \u001b[0;36mDataFrame.with_columns\u001b[0;34m(self, *exprs, **named_exprs)\u001b[0m\n\u001b[1;32m 9048\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwith_columns\u001b[39m(\n\u001b[1;32m 9049\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 9050\u001b[0m \u001b[38;5;241m*\u001b[39mexprs: IntoExpr \u001b[38;5;241m|\u001b[39m Iterable[IntoExpr],\n\u001b[1;32m 9051\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mnamed_exprs: IntoExpr,\n\u001b[1;32m 9052\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 9053\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 9054\u001b[0m \u001b[38;5;124;03m Add columns to this DataFrame.\u001b[39;00m\n\u001b[1;32m 9055\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 9192\u001b[0m \u001b[38;5;124;03m └─────┴──────┴─────────────┘\u001b[39;00m\n\u001b[1;32m 9193\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 9194\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlazy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_columns\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexprs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mnamed_exprs\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_eager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", 1110 | "File \u001b[0;32m~/.envs/menv/lib/python3.10/site-packages/polars/lazyframe/frame.py:2055\u001b[0m, in \u001b[0;36mLazyFrame.collect\u001b[0;34m(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, streaming, engine, background, _eager, **_kwargs)\u001b[0m\n\u001b[1;32m 2053\u001b[0m \u001b[38;5;66;03m# Only for testing purposes\u001b[39;00m\n\u001b[1;32m 2054\u001b[0m callback \u001b[38;5;241m=\u001b[39m _kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpost_opt_callback\u001b[39m\u001b[38;5;124m\"\u001b[39m, callback)\n\u001b[0;32m-> 2055\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m wrap_df(\u001b[43mldf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcallback\u001b[49m\u001b[43m)\u001b[49m)\n", 1111 | "\u001b[0;31mInvalidOperationError\u001b[0m: conversion from `i64` to `i8` failed in column 'elapsed' for 10321 out of 10430 values: [128, 129, … 10686]" 1112 | ] 1113 | } 1114 | ], 1115 | "source": [ 1116 | "# chaining\n", 1117 | "# polars prevents illegal casts\n", 1118 | "(df\n", 1119 | " .select(cols)\n", 1120 | " .with_columns(pl.col('elapsed').cast(pl.Int8))\n", 1121 | " #.describe()\n", 1122 | ")" 
1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": 80, 1128 | "metadata": { 1129 | "pycharm": { 1130 | "name": "#%%\n" 1131 | }, 1132 | "scrolled": true 1133 | }, 1134 | "outputs": [ 1135 | { 1136 | "data": { 1137 | "text/html": [ 1138 | "
\n", 1145 | "shape: (9, 15)
statisticcoursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
strf64f64f64f64strf64f64f64f64f64f64f64f64f64
"count"0.010430.010430.010430.0"10430"10430.010429.010430.010430.010430.010426.010426.010426.010425.0
"null_count"10430.00.00.00.0"0"0.01.00.00.00.04.04.04.05.0
"mean"null1.78839240.859662-111.822538"2024-09-11 01:11:59.773921+00:…1788.953721.81153410644.587515403.773921NaN10645.093695403.7971232.0651831.81141
"std"null1.7107490.0055360.0137null141.4928821.6577495393.5037243067.406074NaN5391.3826563066.2091140.4188291.506718
"min"null0.040.848371-111.855371"2024-09-10 23:41:56+00:00"1480.00.00.00.00.3742622.2672712.80.376911-1.0081e-14
"25%"null0.61123140.856748-111.829785"2024-09-11 00:27:56+00:00"1696.00.6753615810.4311872760.01.8483935812.4568472761.01.8484390.776866
"50%"null1.32463740.859156-111.82222"2024-09-11 01:12:20+00:00"1816.51.39209712239.4237045424.01.98125612239.471125424.01.9812571.582638
"75%"null2.42447240.86085-111.813012"2024-09-11 01:56:25+00:00"1924.92.50697815087.5662418069.02.23191715086.4493228068.02.2319222.428585
"max"null15.3244340.879271-111.801421"2024-09-11 02:40:02+00:00"1984.012.87138218652.93052810686.03.82815218646.17637210684.03.82721612.491514
" 1146 | ], 1147 | "text/plain": [ 1148 | "shape: (9, 15)\n", 1149 | "┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", 1150 | "│ statistic ┆ course ┆ distance_ ┆ latitude ┆ … ┆ rolling_t ┆ rolling_e ┆ rolling_v ┆ rolling_b │\n", 1151 | "│ --- ┆ --- ┆ 2d ┆ --- ┆ ┆ ravelled ┆ lapsed ┆ elocity ┆ etween │\n", 1152 | "│ str ┆ f64 ┆ --- ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1153 | "│ ┆ ┆ f64 ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1154 | "╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", 1155 | "│ count ┆ 0.0 ┆ 10430.0 ┆ 10430.0 ┆ … ┆ 10426.0 ┆ 10426.0 ┆ 10426.0 ┆ 10425.0 │\n", 1156 | "│ null_count ┆ 10430.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ 4.0 ┆ 4.0 ┆ 4.0 ┆ 5.0 │\n", 1157 | "│ mean ┆ null ┆ 1.788392 ┆ 40.859662 ┆ … ┆ 10645.093 ┆ 5403.7971 ┆ 2.065183 ┆ 1.81141 │\n", 1158 | "│ ┆ ┆ ┆ ┆ ┆ 69 ┆ 23 ┆ ┆ │\n", 1159 | "│ std ┆ null ┆ 1.710749 ┆ 0.005536 ┆ … ┆ 5391.3826 ┆ 3066.2091 ┆ 0.418829 ┆ 1.506718 │\n", 1160 | "│ ┆ ┆ ┆ ┆ ┆ 56 ┆ 14 ┆ ┆ │\n", 1161 | "│ min ┆ null ┆ 0.0 ┆ 40.848371 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.376911 ┆ -1.0081e- │\n", 1162 | "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 14 │\n", 1163 | "│ 25% ┆ null ┆ 0.611231 ┆ 40.856748 ┆ … ┆ 5812.4568 ┆ 2761.0 ┆ 1.848439 ┆ 0.776866 │\n", 1164 | "│ ┆ ┆ ┆ ┆ ┆ 47 ┆ ┆ ┆ │\n", 1165 | "│ 50% ┆ null ┆ 1.324637 ┆ 40.859156 ┆ … ┆ 12239.471 ┆ 5424.0 ┆ 1.981257 ┆ 1.582638 │\n", 1166 | "│ ┆ ┆ ┆ ┆ ┆ 12 ┆ ┆ ┆ │\n", 1167 | "│ 75% ┆ null ┆ 2.424472 ┆ 40.86085 ┆ … ┆ 15086.449 ┆ 8068.0 ┆ 2.231922 ┆ 2.428585 │\n", 1168 | "│ ┆ ┆ ┆ ┆ ┆ 322 ┆ ┆ ┆ │\n", 1169 | "│ max ┆ null ┆ 15.32443 ┆ 40.879271 ┆ … ┆ 18646.176 ┆ 10684.0 ┆ 3.827216 ┆ 12.491514 │\n", 1170 | "│ ┆ ┆ ┆ ┆ ┆ 372 ┆ ┆ ┆ │\n", 1171 | "└────────────┴─────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" 1172 | ] 1173 | }, 1174 | "execution_count": 80, 1175 | "metadata": {}, 1176 | "output_type": "execute_result" 1177 | } 1178 | ], 1179 | "source": [ 1180 | "# chaining\n", 1181 | "(df\n", 1182 | " 
.select(cols)\n", 1183 | " .with_columns(pl.col('elapsed').cast(pl.Int16))\n", 1184 | " .describe() \n", 1185 | ")" 1186 | ] 1187 | }, 1188 | { 1189 | "cell_type": "code", 1190 | "execution_count": 81, 1191 | "metadata": { 1192 | "lines_to_next_cell": 0, 1193 | "pycharm": { 1194 | "name": "#%%\n" 1195 | }, 1196 | "scrolled": true 1197 | }, 1198 | "outputs": [ 1199 | { 1200 | "data": { 1201 | "text/plain": [ 1202 | "1028660" 1203 | ] 1204 | }, 1205 | "execution_count": 81, 1206 | "metadata": {}, 1207 | "output_type": "execute_result" 1208 | } 1209 | ], 1210 | "source": [ 1211 | "# chaining\n", 1212 | "(df\n", 1213 | " .select(cols)\n", 1214 | " .with_columns(pl.col('elapsed').cast(pl.Int16)) \n", 1215 | " .estimated_size()\n", 1216 | ")" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "code", 1221 | "execution_count": 82, 1222 | "metadata": {}, 1223 | "outputs": [ 1224 | { 1225 | "data": { 1226 | "text/plain": [ 1227 | "1091240" 1228 | ] 1229 | }, 1230 | "execution_count": 82, 1231 | "metadata": {}, 1232 | "output_type": "execute_result" 1233 | } 1234 | ], 1235 | "source": [ 1236 | "(df\n", 1237 | " .select(cols)\n", 1238 | " .estimated_size()\n", 1239 | ")" 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "code", 1244 | "execution_count": null, 1245 | "metadata": {}, 1246 | "outputs": [], 1247 | "source": [] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "metadata": {}, 1253 | "outputs": [], 1254 | "source": [] 1255 | }, 1256 | { 1257 | "cell_type": "code", 1258 | "execution_count": null, 1259 | "metadata": { 1260 | "lines_to_next_cell": 2, 1261 | "pycharm": { 1262 | "name": "#%%\n" 1263 | } 1264 | }, 1265 | "outputs": [], 1266 | "source": [] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": null, 1271 | "metadata": { 1272 | "lines_to_next_cell": 2, 1273 | "pycharm": { 1274 | "name": "#%%\n" 1275 | } 1276 | }, 1277 | "outputs": [], 1278 | "source": [] 1279 | }, 1280 | { 1281 | "cell_type": "markdown", 
1282 | "metadata": { 1283 | "pycharm": { 1284 | "name": "#%% md\n" 1285 | } 1286 | }, 1287 | "source": [ 1288 | "### Strings" 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "code", 1293 | "execution_count": 83, 1294 | "metadata": { 1295 | "pycharm": { 1296 | "name": "#%%\n" 1297 | }, 1298 | "scrolled": true 1299 | }, 1300 | "outputs": [ 1301 | { 1302 | "data": { 1303 | "text/html": [ 1304 | "
\n", 1311 | "shape: (0, 0)
" 1312 | ], 1313 | "text/plain": [ 1314 | "shape: (0, 0)\n", 1315 | "┌┐\n", 1316 | "╞╡\n", 1317 | "└┘" 1318 | ] 1319 | }, 1320 | "execution_count": 83, 1321 | "metadata": {}, 1322 | "output_type": "execute_result" 1323 | } 1324 | ], 1325 | "source": [ 1326 | "df.select(cols).select(pl.col(pl.String))" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "code", 1331 | "execution_count": 84, 1332 | "metadata": { 1333 | "scrolled": true 1334 | }, 1335 | "outputs": [ 1336 | { 1337 | "data": { 1338 | "text/html": [ 1339 | "
\n", 1346 | "shape: (10_430, 14)
coursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
strf64f64f64datetime[μs, UTC]f64f64f64i16f64f64f64f64f64
"Maple Syrup"0.040.879161-111.8551692024-09-10 23:41:56 UTC1480.0null0.00NaNnullnullnullnull
"Maple Syrup"1.21088340.879167-111.8551812024-09-10 23:41:58 UTC1480.10.6075031.21088320.605442nullnullnullnull
"Maple Syrup"1.22761240.879172-111.8551942024-09-10 23:41:59 UTC1480.11.2276122.43849530.812832nullnullnullnull
"Maple Syrup"0.95223840.879174-111.8552052024-09-10 23:42:00 UTC1480.10.9522383.39073340.847683nullnullnullnull
"Maple Syrup"0.9055140.879177-111.8552152024-09-10 23:42:01 UTC1480.10.905514.29624350.8592492.2672712.80.80974null
"Maple Syrup"4.08514740.857847-111.8238992024-09-11 02:39:58 UTC1696.84.08637118640.85135106821.74507118634.6000410680.01.7448133.177668
"Maple Syrup"1.53572640.857852-111.8239162024-09-11 02:39:59 UTC1696.81.53572618642.387076106831.74505218637.42085210681.01.7449132.821613
"Maple Syrup"3.15613440.857869-111.8239462024-09-11 02:40:00 UTC1696.73.15771818645.54321106841.74518418640.22073810682.01.7450122.800696
"Maple Syrup"3.62648840.857889-111.823982024-09-11 02:40:01 UTC1696.53.63199918649.169698106851.7453618642.94350710683.01.7451042.724433
"Maple Syrup"3.76082940.857909-111.8240162024-09-11 02:40:02 UTC1696.33.76614418652.930528106861.74554818646.17637210684.01.7452433.235592
" 1347 | ], 1348 | "text/plain": [ 1349 | "shape: (10_430, 14)\n", 1350 | "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", 1351 | "│ course ┆ distance_ ┆ latitude ┆ longitude ┆ … ┆ rolling_t ┆ rolling_e ┆ rolling_v ┆ rolling_ │\n", 1352 | "│ --- ┆ 2d ┆ --- ┆ --- ┆ ┆ ravelled ┆ lapsed ┆ elocity ┆ between │\n", 1353 | "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1354 | "│ ┆ f64 ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1355 | "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", 1356 | "│ Maple ┆ 0.0 ┆ 40.879161 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1357 | "│ Syrup ┆ ┆ ┆ 69 ┆ ┆ ┆ ┆ ┆ │\n", 1358 | "│ Maple ┆ 1.210883 ┆ 40.879167 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1359 | "│ Syrup ┆ ┆ ┆ 81 ┆ ┆ ┆ ┆ ┆ │\n", 1360 | "│ Maple ┆ 1.227612 ┆ 40.879172 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1361 | "│ Syrup ┆ ┆ ┆ 94 ┆ ┆ ┆ ┆ ┆ │\n", 1362 | "│ Maple ┆ 0.952238 ┆ 40.879174 ┆ -111.8552 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1363 | "│ Syrup ┆ ┆ ┆ 05 ┆ ┆ ┆ ┆ ┆ │\n", 1364 | "│ Maple ┆ 0.90551 ┆ 40.879177 ┆ -111.8552 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n", 1365 | "│ Syrup ┆ ┆ ┆ 15 ┆ ┆ ┆ ┆ ┆ │\n", 1366 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 1367 | "│ Maple ┆ 4.085147 ┆ 40.857847 ┆ -111.8238 ┆ … ┆ 18634.600 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n", 1368 | "│ Syrup ┆ ┆ ┆ 99 ┆ ┆ 04 ┆ ┆ ┆ │\n", 1369 | "│ Maple ┆ 1.535726 ┆ 40.857852 ┆ -111.8239 ┆ … ┆ 18637.420 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n", 1370 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ 852 ┆ ┆ ┆ │\n", 1371 | "│ Maple ┆ 3.156134 ┆ 40.857869 ┆ -111.8239 ┆ … ┆ 18640.220 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n", 1372 | "│ Syrup ┆ ┆ ┆ 46 ┆ ┆ 738 ┆ ┆ ┆ │\n", 1373 | "│ Maple ┆ 3.626488 ┆ 40.857889 ┆ -111.8239 ┆ … ┆ 18642.943 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n", 1374 | "│ Syrup ┆ ┆ ┆ 8 ┆ ┆ 507 ┆ ┆ ┆ │\n", 1375 | "│ Maple ┆ 3.760829 ┆ 40.857909 ┆ -111.8240 ┆ … ┆ 18646.176 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n", 
1376 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ 372 ┆ ┆ ┆ │\n", 1377 | "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" 1378 | ] 1379 | }, 1380 | "execution_count": 84, 1381 | "metadata": {}, 1382 | "output_type": "execute_result" 1383 | } 1384 | ], 1385 | "source": [ 1386 | "# chaining\n", 1387 | "(df\n", 1388 | " .select(cols)\n", 1389 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1390 | " course=pl.lit('Maple Syrup'))\n", 1391 | ")" 1392 | ] 1393 | }, 1394 | { 1395 | "cell_type": "code", 1396 | "execution_count": 85, 1397 | "metadata": { 1398 | "scrolled": true 1399 | }, 1400 | "outputs": [ 1401 | { 1402 | "data": { 1403 | "text/html": [ 1404 | "
\n", 1411 | "shape: (10_430, 15)
coursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_betweenalt_name
strf64f64f64datetime[μs, UTC]f64f64f64i16f64f64f64f64f64str
"Maple Syrup"0.040.879161-111.8551692024-09-10 23:41:56 UTC1480.0null0.00NaNnullnullnullnull"MAPLE SYRUP"
"Maple Syrup"1.21088340.879167-111.8551812024-09-10 23:41:58 UTC1480.10.6075031.21088320.605442nullnullnullnull"MAPLE SYRUP"
"Maple Syrup"1.22761240.879172-111.8551942024-09-10 23:41:59 UTC1480.11.2276122.43849530.812832nullnullnullnull"MAPLE SYRUP"
"Maple Syrup"0.95223840.879174-111.8552052024-09-10 23:42:00 UTC1480.10.9522383.39073340.847683nullnullnullnull"MAPLE SYRUP"
"Maple Syrup"0.9055140.879177-111.8552152024-09-10 23:42:01 UTC1480.10.905514.29624350.8592492.2672712.80.80974null"MAPLE SYRUP"
"Maple Syrup"4.08514740.857847-111.8238992024-09-11 02:39:58 UTC1696.84.08637118640.85135106821.74507118634.6000410680.01.7448133.177668"MAPLE SYRUP"
"Maple Syrup"1.53572640.857852-111.8239162024-09-11 02:39:59 UTC1696.81.53572618642.387076106831.74505218637.42085210681.01.7449132.821613"MAPLE SYRUP"
"Maple Syrup"3.15613440.857869-111.8239462024-09-11 02:40:00 UTC1696.73.15771818645.54321106841.74518418640.22073810682.01.7450122.800696"MAPLE SYRUP"
"Maple Syrup"3.62648840.857889-111.823982024-09-11 02:40:01 UTC1696.53.63199918649.169698106851.7453618642.94350710683.01.7451042.724433"MAPLE SYRUP"
"Maple Syrup"3.76082940.857909-111.8240162024-09-11 02:40:02 UTC1696.33.76614418652.930528106861.74554818646.17637210684.01.7452433.235592"MAPLE SYRUP"
" 1412 | ], 1413 | "text/plain": [ 1414 | "shape: (10_430, 15)\n", 1415 | "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", 1416 | "│ course ┆ distance_ ┆ latitude ┆ longitude ┆ … ┆ rolling_e ┆ rolling_v ┆ rolling_b ┆ alt_name │\n", 1417 | "│ --- ┆ 2d ┆ --- ┆ --- ┆ ┆ lapsed ┆ elocity ┆ etween ┆ --- │\n", 1418 | "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ str │\n", 1419 | "│ ┆ f64 ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ │\n", 1420 | "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", 1421 | "│ Maple ┆ 0.0 ┆ 40.879161 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ MAPLE │\n", 1422 | "│ Syrup ┆ ┆ ┆ 69 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1423 | "│ Maple ┆ 1.210883 ┆ 40.879167 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ MAPLE │\n", 1424 | "│ Syrup ┆ ┆ ┆ 81 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1425 | "│ Maple ┆ 1.227612 ┆ 40.879172 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ MAPLE │\n", 1426 | "│ Syrup ┆ ┆ ┆ 94 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1427 | "│ Maple ┆ 0.952238 ┆ 40.879174 ┆ -111.8552 ┆ … ┆ null ┆ null ┆ null ┆ MAPLE │\n", 1428 | "│ Syrup ┆ ┆ ┆ 05 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1429 | "│ Maple ┆ 0.90551 ┆ 40.879177 ┆ -111.8552 ┆ … ┆ 2.8 ┆ 0.80974 ┆ null ┆ MAPLE │\n", 1430 | "│ Syrup ┆ ┆ ┆ 15 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1431 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 1432 | "│ Maple ┆ 4.085147 ┆ 40.857847 ┆ -111.8238 ┆ … ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 ┆ MAPLE │\n", 1433 | "│ Syrup ┆ ┆ ┆ 99 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1434 | "│ Maple ┆ 1.535726 ┆ 40.857852 ┆ -111.8239 ┆ … ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 ┆ MAPLE │\n", 1435 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1436 | "│ Maple ┆ 3.156134 ┆ 40.857869 ┆ -111.8239 ┆ … ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 ┆ MAPLE │\n", 1437 | "│ Syrup ┆ ┆ ┆ 46 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1438 | "│ Maple ┆ 3.626488 ┆ 40.857889 ┆ -111.8239 ┆ … ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 ┆ MAPLE │\n", 1439 | "│ Syrup ┆ ┆ ┆ 8 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1440 | "│ Maple ┆ 3.760829 ┆ 40.857909 ┆ -111.8240 ┆ … ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 ┆ 
MAPLE │\n", 1441 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ ┆ ┆ ┆ SYRUP │\n", 1442 | "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" 1443 | ] 1444 | }, 1445 | "execution_count": 85, 1446 | "metadata": {}, 1447 | "output_type": "execute_result" 1448 | } 1449 | ], 1450 | "source": [ 1451 | "# an example of a string operation\n", 1452 | "(df\n", 1453 | " .select(cols)\n", 1454 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1455 | " course=pl.lit('Maple Syrup'))\n", 1456 | " .with_columns(alt_name=pl.col('course').str.to_uppercase()) \n", 1457 | " \n", 1458 | ")" 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": 86, 1464 | "metadata": {}, 1465 | "outputs": [ 1466 | { 1467 | "name": "stdout", 1468 | "output_type": "stream", 1469 | "text": [ 1470 | "['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_accessor', '_pyexpr', 'concat', 'contains', 'contains_any', 'count_matches', 'decode', 'encode', 'ends_with', 'escape_regex', 'explode', 'extract', 'extract_all', 'extract_groups', 'extract_many', 'find', 'head', 'join', 'json_decode', 'json_path_match', 'len_bytes', 'len_chars', 'pad_end', 'pad_start', 'replace', 'replace_all', 'replace_many', 'reverse', 'slice', 'split', 'split_exact', 'splitn', 'starts_with', 'strip_chars', 'strip_chars_end', 'strip_chars_start', 'strip_prefix', 'strip_suffix', 'strptime', 'tail', 'to_date', 'to_datetime', 'to_decimal', 'to_integer', 'to_lowercase', 'to_time', 'to_titlecase', 'to_uppercase', 'zfill']\n" 1471 | ] 1472 | } 1473 | ], 1474 | "source": [ 1475 | "# a bunch of string methods off of .str\n", 1476 | "# note that the spelling might be different from python/pandas\n", 
1477 | "col = pl.col('')\n", 1478 | "print(dir(col.str))" 1479 | ] 1480 | }, 1481 | { 1482 | "cell_type": "code", 1483 | "execution_count": 87, 1484 | "metadata": { 1485 | "lines_to_next_cell": 0 1486 | }, 1487 | "outputs": [], 1488 | "source": [ 1489 | "col.str.to_uppercase?" 1490 | ] 1491 | }, 1492 | { 1493 | "cell_type": "code", 1494 | "execution_count": null, 1495 | "metadata": {}, 1496 | "outputs": [], 1497 | "source": [] 1498 | }, 1499 | { 1500 | "cell_type": "code", 1501 | "execution_count": null, 1502 | "metadata": {}, 1503 | "outputs": [], 1504 | "source": [] 1505 | }, 1506 | { 1507 | "cell_type": "markdown", 1508 | "metadata": {}, 1509 | "source": [ 1510 | "## Convert Date to Local Time" 1511 | ] 1512 | }, 1513 | { 1514 | "cell_type": "code", 1515 | "execution_count": 88, 1516 | "metadata": {}, 1517 | "outputs": [ 1518 | { 1519 | "data": { 1520 | "text/html": [ 1521 | "
\n", 1528 | "shape: (10_430, 14)
coursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
catf64f64f64datetime[μs, America/Denver]f64f64f64i16f64f64f64f64f64
"Maple Syrup"0.040.879161-111.8551692024-09-10 17:41:56 MDT1480.0null0.00NaNnullnullnullnull
"Maple Syrup"1.21088340.879167-111.8551812024-09-10 17:41:58 MDT1480.10.6075031.21088320.605442nullnullnullnull
"Maple Syrup"1.22761240.879172-111.8551942024-09-10 17:41:59 MDT1480.11.2276122.43849530.812832nullnullnullnull
"Maple Syrup"0.95223840.879174-111.8552052024-09-10 17:42:00 MDT1480.10.9522383.39073340.847683nullnullnullnull
"Maple Syrup"0.9055140.879177-111.8552152024-09-10 17:42:01 MDT1480.10.905514.29624350.8592492.2672712.80.80974null
"Maple Syrup"4.08514740.857847-111.8238992024-09-10 20:39:58 MDT1696.84.08637118640.85135106821.74507118634.6000410680.01.7448133.177668
"Maple Syrup"1.53572640.857852-111.8239162024-09-10 20:39:59 MDT1696.81.53572618642.387076106831.74505218637.42085210681.01.7449132.821613
"Maple Syrup"3.15613440.857869-111.8239462024-09-10 20:40:00 MDT1696.73.15771818645.54321106841.74518418640.22073810682.01.7450122.800696
"Maple Syrup"3.62648840.857889-111.823982024-09-10 20:40:01 MDT1696.53.63199918649.169698106851.7453618642.94350710683.01.7451042.724433
"Maple Syrup"3.76082940.857909-111.8240162024-09-10 20:40:02 MDT1696.33.76614418652.930528106861.74554818646.17637210684.01.7452433.235592
" 1529 | ], 1530 | "text/plain": [ 1531 | "shape: (10_430, 14)\n", 1532 | "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", 1533 | "│ course ┆ distance_ ┆ latitude ┆ longitude ┆ … ┆ rolling_t ┆ rolling_e ┆ rolling_v ┆ rolling_ │\n", 1534 | "│ --- ┆ 2d ┆ --- ┆ --- ┆ ┆ ravelled ┆ lapsed ┆ elocity ┆ between │\n", 1535 | "│ cat ┆ --- ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1536 | "│ ┆ f64 ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1537 | "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", 1538 | "│ Maple ┆ 0.0 ┆ 40.879161 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1539 | "│ Syrup ┆ ┆ ┆ 69 ┆ ┆ ┆ ┆ ┆ │\n", 1540 | "│ Maple ┆ 1.210883 ┆ 40.879167 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1541 | "│ Syrup ┆ ┆ ┆ 81 ┆ ┆ ┆ ┆ ┆ │\n", 1542 | "│ Maple ┆ 1.227612 ┆ 40.879172 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1543 | "│ Syrup ┆ ┆ ┆ 94 ┆ ┆ ┆ ┆ ┆ │\n", 1544 | "│ Maple ┆ 0.952238 ┆ 40.879174 ┆ -111.8552 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1545 | "│ Syrup ┆ ┆ ┆ 05 ┆ ┆ ┆ ┆ ┆ │\n", 1546 | "│ Maple ┆ 0.90551 ┆ 40.879177 ┆ -111.8552 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n", 1547 | "│ Syrup ┆ ┆ ┆ 15 ┆ ┆ ┆ ┆ ┆ │\n", 1548 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 1549 | "│ Maple ┆ 4.085147 ┆ 40.857847 ┆ -111.8238 ┆ … ┆ 18634.600 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n", 1550 | "│ Syrup ┆ ┆ ┆ 99 ┆ ┆ 04 ┆ ┆ ┆ │\n", 1551 | "│ Maple ┆ 1.535726 ┆ 40.857852 ┆ -111.8239 ┆ … ┆ 18637.420 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n", 1552 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ 852 ┆ ┆ ┆ │\n", 1553 | "│ Maple ┆ 3.156134 ┆ 40.857869 ┆ -111.8239 ┆ … ┆ 18640.220 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n", 1554 | "│ Syrup ┆ ┆ ┆ 46 ┆ ┆ 738 ┆ ┆ ┆ │\n", 1555 | "│ Maple ┆ 3.626488 ┆ 40.857889 ┆ -111.8239 ┆ … ┆ 18642.943 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n", 1556 | "│ Syrup ┆ ┆ ┆ 8 ┆ ┆ 507 ┆ ┆ ┆ │\n", 1557 | "│ Maple ┆ 3.760829 ┆ 40.857909 ┆ -111.8240 ┆ … ┆ 18646.176 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n", 
1558 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ 372 ┆ ┆ ┆ │\n", 1559 | "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" 1560 | ] 1561 | }, 1562 | "execution_count": 88, 1563 | "metadata": {}, 1564 | "output_type": "execute_result" 1565 | } 1566 | ], 1567 | "source": [ 1568 | "(df\n", 1569 | " .select(cols)\n", 1570 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1571 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 1572 | " time=pl.col('time').dt.convert_time_zone('America/Denver')\n", 1573 | " )\n", 1574 | ")" 1575 | ] 1576 | }, 1577 | { 1578 | "cell_type": "code", 1579 | "execution_count": 89, 1580 | "metadata": {}, 1581 | "outputs": [ 1582 | { 1583 | "name": "stdout", 1584 | "output_type": "stream", 1585 | "text": [ 1586 | "['__abs__', '__add__', '__and__', '__annotations__', '__array_ufunc__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__invert__', '__le__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_from_pyexpr', '_map_batches_wrapper', '_pyexpr', '_repr_html_', 'abs', 'add', 'agg_groups', 'alias', 'all', 'and_', 'any', 'append', 'approx_n_unique', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg_max', 'arg_min', 'arg_sort', 'arg_true', 'arg_unique', 'arr', 'backward_fill', 'bin', 'bitwise_and', 'bitwise_count_ones', 'bitwise_count_zeros', 'bitwise_leading_ones', 'bitwise_leading_zeros', 'bitwise_or', 'bitwise_trailing_ones', 
'bitwise_trailing_zeros', 'bitwise_xor', 'bottom_k', 'bottom_k_by', 'cast', 'cat', 'cbrt', 'ceil', 'clip', 'cos', 'cosh', 'cot', 'count', 'cum_count', 'cum_max', 'cum_min', 'cum_prod', 'cum_sum', 'cumulative_eval', 'cut', 'degrees', 'deserialize', 'diff', 'dot', 'drop_nans', 'drop_nulls', 'dt', 'entropy', 'eq', 'eq_missing', 'ewm_mean', 'ewm_mean_by', 'ewm_std', 'ewm_var', 'exclude', 'exp', 'explode', 'extend_constant', 'fill_nan', 'fill_null', 'filter', 'first', 'flatten', 'floor', 'floordiv', 'forward_fill', 'from_json', 'gather', 'gather_every', 'ge', 'get', 'gt', 'has_nulls', 'hash', 'head', 'hist', 'implode', 'inspect', 'interpolate', 'interpolate_by', 'is_between', 'is_duplicated', 'is_finite', 'is_first_distinct', 'is_in', 'is_infinite', 'is_last_distinct', 'is_nan', 'is_not_nan', 'is_not_null', 'is_null', 'is_unique', 'kurtosis', 'last', 'le', 'len', 'limit', 'list', 'log', 'log10', 'log1p', 'lower_bound', 'lt', 'map_batches', 'map_elements', 'max', 'mean', 'median', 'meta', 'min', 'mod', 'mode', 'mul', 'n_unique', 'name', 'nan_max', 'nan_min', 'ne', 'ne_missing', 'neg', 'not_', 'null_count', 'or_', 'over', 'pct_change', 'peak_max', 'peak_min', 'pipe', 'pow', 'product', 'qcut', 'quantile', 'radians', 'rank', 'rechunk', 'register_plugin', 'reinterpret', 'repeat_by', 'replace', 'replace_strict', 'reshape', 'reverse', 'rle', 'rle_id', 'rolling', 'rolling_map', 'rolling_max', 'rolling_max_by', 'rolling_mean', 'rolling_mean_by', 'rolling_median', 'rolling_median_by', 'rolling_min', 'rolling_min_by', 'rolling_quantile', 'rolling_quantile_by', 'rolling_skew', 'rolling_std', 'rolling_std_by', 'rolling_sum', 'rolling_sum_by', 'rolling_var', 'rolling_var_by', 'round', 'round_sig_figs', 'sample', 'search_sorted', 'set_sorted', 'shift', 'shrink_dtype', 'shuffle', 'sign', 'sin', 'sinh', 'skew', 'slice', 'sort', 'sort_by', 'sqrt', 'std', 'str', 'struct', 'sub', 'sum', 'tail', 'tan', 'tanh', 'to_physical', 'top_k', 'top_k_by', 'truediv', 'unique', 'unique_counts', 
'upper_bound', 'value_counts', 'var', 'where', 'xor']\n" 1587 | ] 1588 | } 1589 | ], 1590 | "source": [ 1591 | "col = pl.col('time')\n", 1592 | "print(dir(col))" 1593 | ] 1594 | }, 1595 | { 1596 | "cell_type": "code", 1597 | "execution_count": 90, 1598 | "metadata": {}, 1599 | "outputs": [ 1600 | { 1601 | "name": "stdout", 1602 | "output_type": "stream", 1603 | "text": [ 1604 | "268\n" 1605 | ] 1606 | } 1607 | ], 1608 | "source": [ 1609 | "print(len(dir(col)))" 1610 | ] 1611 | }, 1612 | { 1613 | "cell_type": "code", 1614 | "execution_count": 91, 1615 | "metadata": {}, 1616 | "outputs": [ 1617 | { 1618 | "name": "stdout", 1619 | "output_type": "stream", 1620 | "text": [ 1621 | "['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_accessor', '_pyexpr', 'add_business_days', 'base_utc_offset', 'cast_time_unit', 'century', 'combine', 'convert_time_zone', 'date', 'datetime', 'day', 'dst_offset', 'epoch', 'hour', 'is_leap_year', 'iso_year', 'microsecond', 'millennium', 'millisecond', 'minute', 'month', 'month_end', 'month_start', 'nanosecond', 'offset_by', 'ordinal_day', 'quarter', 'replace_time_zone', 'round', 'second', 'strftime', 'time', 'timestamp', 'to_string', 'total_days', 'total_hours', 'total_microseconds', 'total_milliseconds', 'total_minutes', 'total_nanoseconds', 'total_seconds', 'truncate', 'week', 'weekday', 'with_time_unit', 'year']\n" 1622 | ] 1623 | } 1624 | ], 1625 | "source": [ 1626 | "print(dir(col.dt))" 1627 | ] 1628 | }, 1629 | { 1630 | "cell_type": "code", 1631 | "execution_count": 92, 1632 | "metadata": {}, 1633 | "outputs": [ 1634 | { 1635 | "name": "stdout", 1636 | "output_type": "stream", 1637 | "text": [ 1638 | "72\n" 1639 | ] 1640 | } 1641 | ], 1642 |
"source": [ 1643 | "print(len(dir(col.dt)))" 1644 | ] 1645 | }, 1646 | { 1647 | "cell_type": "code", 1648 | "execution_count": null, 1649 | "metadata": {}, 1650 | "outputs": [], 1651 | "source": [] 1652 | }, 1653 | { 1654 | "cell_type": "code", 1655 | "execution_count": null, 1656 | "metadata": {}, 1657 | "outputs": [], 1658 | "source": [] 1659 | }, 1660 | { 1661 | "cell_type": "code", 1662 | "execution_count": null, 1663 | "metadata": {}, 1664 | "outputs": [], 1665 | "source": [] 1666 | }, 1667 | { 1668 | "cell_type": "code", 1669 | "execution_count": null, 1670 | "metadata": {}, 1671 | "outputs": [], 1672 | "source": [] 1673 | }, 1674 | { 1675 | "cell_type": "markdown", 1676 | "metadata": {}, 1677 | "source": [ 1678 | "## Missing Data\n", 1679 | "\n", 1680 | "- Use `.fill_null` to fill in missing values\n", 1681 | "- Use `.filter` to filter rows\n", 1682 | "- Use `.select` to select columns\n", 1683 | "\n", 1684 | "To view rows with missing data, use `.filter(pl.col(\"col_name\").is_null())`" 1685 | ] 1686 | }, 1687 | { 1688 | "cell_type": "code", 1689 | "execution_count": 93, 1690 | "metadata": {}, 1691 | "outputs": [ 1692 | { 1693 | "data": { 1694 | "text/html": [ 1695 | "
\n", 1702 | "shape: (1, 14)
coursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
u32u32u32u32u32u32u32u32u32u32u32u32u32u32
00000010004445
" 1703 | ], 1704 | "text/plain": [ 1705 | "shape: (1, 14)\n", 1706 | "┌────────┬────────────┬──────────┬───────────┬───┬────────────┬────────────┬───────────┬───────────┐\n", 1707 | "│ course ┆ distance_2 ┆ latitude ┆ longitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_v ┆ rolling_b │\n", 1708 | "│ --- ┆ d ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ elocity ┆ etween │\n", 1709 | "│ u32 ┆ --- ┆ u32 ┆ u32 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1710 | "│ ┆ u32 ┆ ┆ ┆ ┆ u32 ┆ u32 ┆ u32 ┆ u32 │\n", 1711 | "╞════════╪════════════╪══════════╪═══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡\n", 1712 | "│ 0 ┆ 0 ┆ 0 ┆ 0 ┆ … ┆ 4 ┆ 4 ┆ 4 ┆ 5 │\n", 1713 | "└────────┴────────────┴──────────┴───────────┴───┴────────────┴────────────┴───────────┴───────────┘" 1714 | ] 1715 | }, 1716 | "execution_count": 93, 1717 | "metadata": {}, 1718 | "output_type": "execute_result" 1719 | } 1720 | ], 1721 | "source": [ 1722 | "(df\n", 1723 | " .select(cols)\n", 1724 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1725 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 1726 | " time=pl.col('time').dt.convert_time_zone('America/Denver'))\n", 1727 | " .null_count()\n", 1728 | ")" 1729 | ] 1730 | }, 1731 | { 1732 | "cell_type": "code", 1733 | "execution_count": 94, 1734 | "metadata": { 1735 | "collapsed": true, 1736 | "jupyter": { 1737 | "outputs_hidden": true 1738 | } 1739 | }, 1740 | "outputs": [ 1741 | { 1742 | "data": { 1743 | "text/html": [ 1744 | "
\n", 1751 | "shape: (10_430, 1)
rolling_between
bool
true
true
true
true
true
false
false
false
false
false
" 1752 | ], 1753 | "text/plain": [ 1754 | "shape: (10_430, 1)\n", 1755 | "┌─────────────────┐\n", 1756 | "│ rolling_between │\n", 1757 | "│ --- │\n", 1758 | "│ bool │\n", 1759 | "╞═════════════════╡\n", 1760 | "│ true │\n", 1761 | "│ true │\n", 1762 | "│ true │\n", 1763 | "│ true │\n", 1764 | "│ true │\n", 1765 | "│ … │\n", 1766 | "│ false │\n", 1767 | "│ false │\n", 1768 | "│ false │\n", 1769 | "│ false │\n", 1770 | "│ false │\n", 1771 | "└─────────────────┘" 1772 | ] 1773 | }, 1774 | "execution_count": 94, 1775 | "metadata": {}, 1776 | "output_type": "execute_result" 1777 | } 1778 | ], 1779 | "source": [ 1780 | "# use .select to find where rows are missing\n", 1781 | "(df\n", 1782 | " .select(cols)\n", 1783 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1784 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 1785 | " time=pl.col('time').dt.convert_time_zone('America/Denver'))\n", 1786 | " .select(pl.col('rolling_between').is_null())\n", 1787 | ")" 1788 | ] 1789 | }, 1790 | { 1791 | "cell_type": "code", 1792 | "execution_count": 95, 1793 | "metadata": {}, 1794 | "outputs": [ 1795 | { 1796 | "data": { 1797 | "text/html": [ 1798 | "
\n", 1805 | "shape: (5, 15)
indexcoursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
u32catf64f64f64datetime[μs, America/Denver]f64f64f64i16f64f64f64f64f64
0"Maple Syrup"0.040.879161-111.8551692024-09-10 17:41:56 MDT1480.0null0.00NaNnullnullnullnull
1"Maple Syrup"1.21088340.879167-111.8551812024-09-10 17:41:58 MDT1480.10.6075031.21088320.605442nullnullnullnull
2"Maple Syrup"1.22761240.879172-111.8551942024-09-10 17:41:59 MDT1480.11.2276122.43849530.812832nullnullnullnull
3"Maple Syrup"0.95223840.879174-111.8552052024-09-10 17:42:00 MDT1480.10.9522383.39073340.847683nullnullnullnull
4"Maple Syrup"0.9055140.879177-111.8552152024-09-10 17:42:01 MDT1480.10.905514.29624350.8592492.2672712.80.80974null
" 1806 | ], 1807 | "text/plain": [ 1808 | "shape: (5, 15)\n", 1809 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n", 1810 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n", 1811 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n", 1812 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1813 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1814 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n", 1815 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1816 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 1817 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1818 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 1819 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1820 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 1821 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1822 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 1823 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n", 1824 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 1825 | "└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘" 1826 | ] 1827 | }, 1828 | "execution_count": 95, 1829 | "metadata": {}, 1830 | "output_type": "execute_result" 1831 | } 1832 | ], 1833 | "source": [ 1834 | "# change .select to .filter to view the rows\n", 1835 | "(df\n", 1836 | " .select(cols)\n", 1837 | " .with_row_index()\n", 1838 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1839 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 1840 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n", 1841 | " .filter(pl.col('rolling_between').is_null())\n", 1842 | ")" 1843 | ] 1844 | }, 1845 | { 1846 | "cell_type": "code", 1847 | "execution_count": 96, 1848 | "metadata": {}, 1849 | "outputs": [ 1850 | { 1851 | "data": { 1852 
| "text/html": [ 1853 | "
\n", 1860 | "shape: (1, 12)
distance_2dlatitudelongitudeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
u32u32u32u32u32u32u32u32u32u32u32u32
000000010000
" 1861 | ], 1862 | "text/plain": [ 1863 | "shape: (1, 12)\n", 1864 | "┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", 1865 | "│ distance_ ┆ latitude ┆ longitude ┆ elevation ┆ … ┆ rolling_t ┆ rolling_e ┆ rolling_v ┆ rolling_b │\n", 1866 | "│ 2d ┆ --- ┆ --- ┆ --- ┆ ┆ ravelled ┆ lapsed ┆ elocity ┆ etween │\n", 1867 | "│ --- ┆ u32 ┆ u32 ┆ u32 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1868 | "│ u32 ┆ ┆ ┆ ┆ ┆ u32 ┆ u32 ┆ u32 ┆ u32 │\n", 1869 | "╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", 1870 | "│ 0 ┆ 0 ┆ 0 ┆ 0 ┆ … ┆ 0 ┆ 0 ┆ 0 ┆ 0 │\n", 1871 | "└───────────┴──────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" 1872 | ] 1873 | }, 1874 | "execution_count": 96, 1875 | "metadata": {}, 1876 | "output_type": "execute_result" 1877 | } 1878 | ], 1879 | "source": [ 1880 | "# what about nans?\n", 1881 | "# note that nan and null are different in polars\n", 1882 | "# nan means not a number\n", 1883 | "# null means missing data\n", 1884 | "(df\n", 1885 | " .select(cols)\n", 1886 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1887 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 1888 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n", 1889 | " .select(cs.numeric().is_nan().sum())\n", 1890 | ")" 1891 | ] 1892 | }, 1893 | { 1894 | "cell_type": "code", 1895 | "execution_count": 97, 1896 | "metadata": {}, 1897 | "outputs": [ 1898 | { 1899 | "data": { 1900 | "text/html": [ 1901 | "
\n", 1908 | "shape: (10_430, 1)
avg_velocity
bool
true
false
false
false
false
false
false
false
false
false
" 1909 | ], 1910 | "text/plain": [ 1911 | "shape: (10_430, 1)\n", 1912 | "┌──────────────┐\n", 1913 | "│ avg_velocity │\n", 1914 | "│ --- │\n", 1915 | "│ bool │\n", 1916 | "╞══════════════╡\n", 1917 | "│ true │\n", 1918 | "│ false │\n", 1919 | "│ false │\n", 1920 | "│ false │\n", 1921 | "│ false │\n", 1922 | "│ … │\n", 1923 | "│ false │\n", 1924 | "│ false │\n", 1925 | "│ false │\n", 1926 | "│ false │\n", 1927 | "│ false │\n", 1928 | "└──────────────┘" 1929 | ] 1930 | }, 1931 | "execution_count": 97, 1932 | "metadata": {}, 1933 | "output_type": "execute_result" 1934 | } 1935 | ], 1936 | "source": [ 1937 | "(df\n", 1938 | " .select(cols)\n", 1939 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1940 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 1941 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n", 1942 | " .select(pl.col('avg_velocity').is_nan())\n", 1943 | ")" 1944 | ] 1945 | }, 1946 | { 1947 | "cell_type": "code", 1948 | "execution_count": 98, 1949 | "metadata": { 1950 | "scrolled": true 1951 | }, 1952 | "outputs": [ 1953 | { 1954 | "data": { 1955 | "text/html": [ 1956 | "
\n", 1963 | "shape: (1, 15)
indexcoursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
u32catf64f64f64datetime[μs, America/Denver]f64f64f64i16f64f64f64f64f64
0"Maple Syrup"0.040.879161-111.8551692024-09-10 17:41:56 MDT1480.0null0.00NaNnullnullnullnull
" 1964 | ], 1965 | "text/plain": [ 1966 | "shape: (1, 15)\n", 1967 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n", 1968 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n", 1969 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n", 1970 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1971 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1972 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n", 1973 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 1974 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 1975 | "└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘" 1976 | ] 1977 | }, 1978 | "execution_count": 98, 1979 | "metadata": {}, 1980 | "output_type": "execute_result" 1981 | } 1982 | ], 1983 | "source": [ 1984 | "(df\n", 1985 | " .select(cols)\n", 1986 | " .with_row_index() \n", 1987 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 1988 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 1989 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n", 1990 | " .filter(pl.col('avg_velocity').is_nan())\n", 1991 | ")" 1992 | ] 1993 | }, 1994 | { 1995 | "cell_type": "code", 1996 | "execution_count": 99, 1997 | "metadata": {}, 1998 | "outputs": [ 1999 | { 2000 | "data": { 2001 | "text/html": [ 2002 | "
\n", 2009 | "shape: (10_430, 15)
indexcoursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
u32catf64f64f64datetime[μs, America/Denver]f64f64f64i16f64f64f64f64f64
0"Maple Syrup"0.040.879161-111.8551692024-09-10 17:41:56 MDT1480.0null0.00NaNnullnullnullnull
1"Maple Syrup"1.21088340.879167-111.8551812024-09-10 17:41:58 MDT1480.10.6075031.21088320.605442nullnullnullnull
2"Maple Syrup"1.22761240.879172-111.8551942024-09-10 17:41:59 MDT1480.11.2276122.43849530.812832nullnullnullnull
3"Maple Syrup"0.95223840.879174-111.8552052024-09-10 17:42:00 MDT1480.10.9522383.39073340.847683nullnullnullnull
4"Maple Syrup"0.9055140.879177-111.8552152024-09-10 17:42:01 MDT1480.10.905514.29624350.8592492.2672712.80.80974null
10425"Maple Syrup"4.08514740.857847-111.8238992024-09-10 20:39:58 MDT1696.84.08637118640.85135106821.74507118634.6000410680.01.7448133.177668
10426"Maple Syrup"1.53572640.857852-111.8239162024-09-10 20:39:59 MDT1696.81.53572618642.387076106831.74505218637.42085210681.01.7449132.821613
10427"Maple Syrup"3.15613440.857869-111.8239462024-09-10 20:40:00 MDT1696.73.15771818645.54321106841.74518418640.22073810682.01.7450122.800696
10428"Maple Syrup"3.62648840.857889-111.823982024-09-10 20:40:01 MDT1696.53.63199918649.169698106851.7453618642.94350710683.01.7451042.724433
10429"Maple Syrup"3.76082940.857909-111.8240162024-09-10 20:40:02 MDT1696.33.76614418652.930528106861.74554818646.17637210684.01.7452433.235592
" 2010 | ], 2011 | "text/plain": [ 2012 | "shape: (10_430, 15)\n", 2013 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n", 2014 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n", 2015 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n", 2016 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 2017 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 2018 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n", 2019 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2020 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2021 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2022 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2023 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2024 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2025 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2026 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2027 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n", 2028 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2029 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 2030 | "│ 10425 ┆ Maple ┆ 4.085147 ┆ 40.857847 ┆ … ┆ 18634.6000 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n", 2031 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 4 ┆ ┆ ┆ │\n", 2032 | "│ 10426 ┆ Maple ┆ 1.535726 ┆ 40.857852 ┆ … ┆ 18637.4208 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n", 2033 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 52 ┆ ┆ ┆ │\n", 2034 | "│ 10427 ┆ Maple ┆ 3.156134 ┆ 40.857869 ┆ … ┆ 18640.2207 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n", 2035 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 38 ┆ ┆ ┆ │\n", 2036 | "│ 10428 ┆ Maple ┆ 3.626488 ┆ 40.857889 ┆ … ┆ 18642.9435 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n", 2037 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 07 ┆ ┆ ┆ │\n", 2038 | "│ 10429 ┆ Maple ┆ 3.760829 ┆ 40.857909 ┆ … ┆ 18646.1763 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n", 2039 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 72 ┆ ┆ ┆ │\n", 2040 | 
"└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘" 2041 | ] 2042 | }, 2043 | "execution_count": 99, 2044 | "metadata": {}, 2045 | "output_type": "execute_result" 2046 | } 2047 | ], 2048 | "source": [ 2049 | "# a glorious function\n", 2050 | "\n", 2051 | "def tweak_gpx(df_):\n", 2052 | " return (df_\n", 2053 | " .select(cols)\n", 2054 | " .with_row_index() \n", 2055 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 2056 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 2057 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n", 2058 | " )\n", 2059 | "\n", 2060 | "tweak_gpx(df)" 2061 | ] 2062 | }, 2063 | { 2064 | "cell_type": "code", 2065 | "execution_count": null, 2066 | "metadata": { 2067 | "lines_to_next_cell": 2, 2068 | "pycharm": { 2069 | "name": "#%%\n" 2070 | } 2071 | }, 2072 | "outputs": [], 2073 | "source": [] 2074 | }, 2075 | { 2076 | "cell_type": "code", 2077 | "execution_count": null, 2078 | "metadata": { 2079 | "lines_to_next_cell": 0, 2080 | "pycharm": { 2081 | "name": "#%%\n" 2082 | } 2083 | }, 2084 | "outputs": [], 2085 | "source": [] 2086 | }, 2087 | { 2088 | "cell_type": "code", 2089 | "execution_count": null, 2090 | "metadata": {}, 2091 | "outputs": [], 2092 | "source": [] 2093 | }, 2094 | { 2095 | "cell_type": "code", 2096 | "execution_count": null, 2097 | "metadata": {}, 2098 | "outputs": [], 2099 | "source": [] 2100 | }, 2101 | { 2102 | "cell_type": "markdown", 2103 | "metadata": { 2104 | "pycharm": { 2105 | "name": "#%% md\n" 2106 | } 2107 | }, 2108 | "source": [ 2109 | "## Chain\n", 2110 | "\n", 2111 | "Chaining is also called \"flow\" programming. 
Rather than creating intermediate variables, take advantage of the fact that most operations return a new object, and call the next operation directly on that result.\n", 2112 | "\n", 2113 | "The chain should read like a recipe of ordered steps.\n", 2114 | "\n", 2115 | "(BTW, this is actually what we did above.)" 2116 | ] 2117 | }, 2118 | { 2119 | "cell_type": "code", 2120 | "execution_count": 100, 2121 | "metadata": {}, 2122 | "outputs": [], 2123 | "source": [ 2124 | "# a glorious function\n", 2125 | "\n", 2126 | "def tweak_gpx(df_):\n", 2127 | " return (df_\n", 2128 | " .select(cols)\n", 2129 | " .with_row_index() \n", 2130 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 2131 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 2132 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n", 2133 | " )\n", 2134 | "\n", 2135 | "tweak_gpx(df).write_parquet('Face_plant.parquet')" 2136 | ] 2137 | }, 2138 | { 2139 | "cell_type": "code", 2140 | "execution_count": 101, 2141 | "metadata": {}, 2142 | "outputs": [ 2143 | { 2144 | "data": { 2145 | "text/html": [ 2146 | "

NAIVE QUERY PLAN

run LazyFrame.show_graph() to see the optimized version

\n", 2147 | "\n", 2149 | "\n", 2151 | "\n", 2152 | "\n", 2154 | "\n", 2155 | "polars_query\n", 2156 | "\n", 2157 | "\n", 2158 | "\n", 2159 | "p1\n", 2160 | "\n", 2161 | "WITH COLUMNS [col("elapsed").strict_cast(Int16), String(Maple Syrup).strict_cast(Categorical(None, Physical)).alias("course"), col("time").dt.convert_time_zone().alias("time")]\n", 2162 | "\n", 2163 | "\n", 2164 | "\n", 2165 | "p2\n", 2166 | "\n", 2167 | "ROW_INDEX\n", 2168 | "\n", 2169 | "\n", 2170 | "\n", 2171 | "p1--p2\n", 2172 | "\n", 2173 | "\n", 2174 | "\n", 2175 | "\n", 2176 | "p3\n", 2177 | "\n", 2178 | "π 14/14\n", 2179 | "\n", 2180 | "\n", 2181 | "\n", 2182 | "p2--p3\n", 2183 | "\n", 2184 | "\n", 2185 | "\n", 2186 | "\n", 2187 | "p4\n", 2188 | "\n", 2189 | "Parquet SCAN [Face_plant.parquet]\n", 2190 | "π */15;\n", 2191 | "\n", 2192 | "\n", 2193 | "\n", 2194 | "p3--p4\n", 2195 | "\n", 2196 | "\n", 2197 | "\n", 2198 | "\n" 2199 | ], 2200 | "text/plain": [ 2201 | "" 2202 | ] 2203 | }, 2204 | "execution_count": 101, 2205 | "metadata": {}, 2206 | "output_type": "execute_result" 2207 | } 2208 | ], 2209 | "source": [ 2210 | "# laziness\n", 2211 | "gpx_lazy = pl.scan_parquet('Face_plant.parquet') \n", 2212 | "tweak_gpx(gpx_lazy)" 2213 | ] 2214 | }, 2215 | { 2216 | "cell_type": "code", 2217 | "execution_count": 102, 2218 | "metadata": {}, 2219 | "outputs": [ 2220 | { 2221 | "data": { 2222 | "text/html": [ 2223 | "
\n", 2230 | "shape: (10_430, 15)
indexcoursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
u32catf64f64f64datetime[μs, America/Denver]f64f64f64i16f64f64f64f64f64
0"Maple Syrup"0.040.879161-111.8551692024-09-10 17:41:56 MDT1480.0null0.00NaNnullnullnullnull
1"Maple Syrup"1.21088340.879167-111.8551812024-09-10 17:41:58 MDT1480.10.6075031.21088320.605442nullnullnullnull
2"Maple Syrup"1.22761240.879172-111.8551942024-09-10 17:41:59 MDT1480.11.2276122.43849530.812832nullnullnullnull
3"Maple Syrup"0.95223840.879174-111.8552052024-09-10 17:42:00 MDT1480.10.9522383.39073340.847683nullnullnullnull
4"Maple Syrup"0.9055140.879177-111.8552152024-09-10 17:42:01 MDT1480.10.905514.29624350.8592492.2672712.80.80974null
10425"Maple Syrup"4.08514740.857847-111.8238992024-09-10 20:39:58 MDT1696.84.08637118640.85135106821.74507118634.6000410680.01.7448133.177668
10426"Maple Syrup"1.53572640.857852-111.8239162024-09-10 20:39:59 MDT1696.81.53572618642.387076106831.74505218637.42085210681.01.7449132.821613
10427"Maple Syrup"3.15613440.857869-111.8239462024-09-10 20:40:00 MDT1696.73.15771818645.54321106841.74518418640.22073810682.01.7450122.800696
10428"Maple Syrup"3.62648840.857889-111.823982024-09-10 20:40:01 MDT1696.53.63199918649.169698106851.7453618642.94350710683.01.7451042.724433
10429"Maple Syrup"3.76082940.857909-111.8240162024-09-10 20:40:02 MDT1696.33.76614418652.930528106861.74554818646.17637210684.01.7452433.235592
" 2231 | ], 2232 | "text/plain": [ 2233 | "shape: (10_430, 15)\n", 2234 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n", 2235 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n", 2236 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n", 2237 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 2238 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 2239 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n", 2240 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2241 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2242 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2243 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2244 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2245 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2246 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2247 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2248 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n", 2249 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2250 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 2251 | "│ 10425 ┆ Maple ┆ 4.085147 ┆ 40.857847 ┆ … ┆ 18634.6000 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n", 2252 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 4 ┆ ┆ ┆ │\n", 2253 | "│ 10426 ┆ Maple ┆ 1.535726 ┆ 40.857852 ┆ … ┆ 18637.4208 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n", 2254 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 52 ┆ ┆ ┆ │\n", 2255 | "│ 10427 ┆ Maple ┆ 3.156134 ┆ 40.857869 ┆ … ┆ 18640.2207 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n", 2256 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 38 ┆ ┆ ┆ │\n", 2257 | "│ 10428 ┆ Maple ┆ 3.626488 ┆ 40.857889 ┆ … ┆ 18642.9435 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n", 2258 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 07 ┆ ┆ ┆ │\n", 2259 | "│ 10429 ┆ Maple ┆ 3.760829 ┆ 40.857909 ┆ … ┆ 18646.1763 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n", 2260 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 72 ┆ ┆ ┆ │\n", 2261 | 
"└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘" 2262 | ] 2263 | }, 2264 | "execution_count": 102, 2265 | "metadata": {}, 2266 | "output_type": "execute_result" 2267 | } 2268 | ], 2269 | "source": [ 2270 | "# use .collect to generate plan and materialize\n", 2271 | "tweak_gpx(gpx_lazy).collect()" 2272 | ] 2273 | }, 2274 | { 2275 | "cell_type": "code", 2276 | "execution_count": 103, 2277 | "metadata": {}, 2278 | "outputs": [ 2279 | { 2280 | "ename": "TypeError", 2281 | "evalue": "LazyFrame.collect() takes 1 positional argument but 2 were given", 2282 | "output_type": "error", 2283 | "traceback": [ 2284 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 2285 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 2286 | "Cell \u001b[0;32mIn[103], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# using GPU!\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mtweak_gpx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgpx_lazy\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgpu\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", 2287 | "\u001b[0;31mTypeError\u001b[0m: LazyFrame.collect() takes 1 positional argument but 2 were given" 2288 | ] 2289 | } 2290 | ], 2291 | "source": [ 2292 | "# using GPU!\n", 2293 | "tweak_gpx(gpx_lazy).collect('gpu')" 2294 | ] 2295 | }, 2296 | { 2297 | "cell_type": "code", 2298 | "execution_count": 104, 2299 | "metadata": {}, 2300 | "outputs": [ 2301 | { 2302 | "name": "stdout", 2303 | "output_type": "stream", 2304 | "text": [ 2305 | "(10430, 15)\n" 2306 | ] 2307 | }, 2308 | { 2309 | "data": { 2310 | "text/html": [ 2311 | "
\n", 2318 | "shape: (10_430, 15)
indexcoursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
u32catf64f64f64datetime[μs, America/Denver]f64f64f64i16f64f64f64f64f64
0"Maple Syrup"0.040.879161-111.8551692024-09-10 17:41:56 MDT1480.0null0.00NaNnullnullnullnull
1"Maple Syrup"1.21088340.879167-111.8551812024-09-10 17:41:58 MDT1480.10.6075031.21088320.605442nullnullnullnull
2"Maple Syrup"1.22761240.879172-111.8551942024-09-10 17:41:59 MDT1480.11.2276122.43849530.812832nullnullnullnull
3"Maple Syrup"0.95223840.879174-111.8552052024-09-10 17:42:00 MDT1480.10.9522383.39073340.847683nullnullnullnull
4"Maple Syrup"0.9055140.879177-111.8552152024-09-10 17:42:01 MDT1480.10.905514.29624350.8592492.2672712.80.80974null
10425"Maple Syrup"4.08514740.857847-111.8238992024-09-10 20:39:58 MDT1696.84.08637118640.85135106821.74507118634.6000410680.01.7448133.177668
10426"Maple Syrup"1.53572640.857852-111.8239162024-09-10 20:39:59 MDT1696.81.53572618642.387076106831.74505218637.42085210681.01.7449132.821613
10427"Maple Syrup"3.15613440.857869-111.8239462024-09-10 20:40:00 MDT1696.73.15771818645.54321106841.74518418640.22073810682.01.7450122.800696
10428"Maple Syrup"3.62648840.857889-111.823982024-09-10 20:40:01 MDT1696.53.63199918649.169698106851.7453618642.94350710683.01.7451042.724433
10429"Maple Syrup"3.76082940.857909-111.8240162024-09-10 20:40:02 MDT1696.33.76614418652.930528106861.74554818646.17637210684.01.7452433.235592
" 2319 | ], 2320 | "text/plain": [ 2321 | "shape: (10_430, 15)\n", 2322 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n", 2323 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n", 2324 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n", 2325 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 2326 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 2327 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n", 2328 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2329 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2330 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2331 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2332 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2333 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2334 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2335 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2336 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n", 2337 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2338 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 2339 | "│ 10425 ┆ Maple ┆ 4.085147 ┆ 40.857847 ┆ … ┆ 18634.6000 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n", 2340 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 4 ┆ ┆ ┆ │\n", 2341 | "│ 10426 ┆ Maple ┆ 1.535726 ┆ 40.857852 ┆ … ┆ 18637.4208 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n", 2342 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 52 ┆ ┆ ┆ │\n", 2343 | "│ 10427 ┆ Maple ┆ 3.156134 ┆ 40.857869 ┆ … ┆ 18640.2207 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n", 2344 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 38 ┆ ┆ ┆ │\n", 2345 | "│ 10428 ┆ Maple ┆ 3.626488 ┆ 40.857889 ┆ … ┆ 18642.9435 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n", 2346 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 07 ┆ ┆ ┆ │\n", 2347 | "│ 10429 ┆ Maple ┆ 3.760829 ┆ 40.857909 ┆ … ┆ 18646.1763 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n", 2348 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 72 ┆ ┆ ┆ │\n", 2349 | 
"└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘" 2350 | ] 2351 | }, 2352 | "execution_count": 104, 2353 | "metadata": {}, 2354 | "output_type": "execute_result" 2355 | } 2356 | ], 2357 | "source": [ 2358 | "# debugging\n", 2359 | "# some folks really want the intermediate data...\n", 2360 | "def get_var(df, var_name):\n", 2361 | " globals()[var_name] = df\n", 2362 | " return df\n", 2363 | "\n", 2364 | "def tweak_gpx(df_):\n", 2365 | " return (df_\n", 2366 | " .pipe(lambda df: print(df.shape) or df) # Look! 🤯\n", 2367 | " .select(cols)\n", 2368 | " .with_row_index() \n", 2369 | " .pipe(get_var, 'intermediate') # Debugging! 💪\n", 2370 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 2371 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 2372 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n", 2373 | " )\n", 2374 | "\n", 2375 | "raw = pl.read_parquet('Face_plant.parquet')\n", 2376 | "tweak_gpx(raw)" 2377 | ] 2378 | }, 2379 | { 2380 | "cell_type": "code", 2381 | "execution_count": 105, 2382 | "metadata": { 2383 | "scrolled": true 2384 | }, 2385 | "outputs": [ 2386 | { 2387 | "data": { 2388 | "text/html": [ 2389 | "
\n", 2396 | "shape: (10_430, 15)
indexcoursedistance_2dlatitudelongitudetimeelevationspeed_betweentravelledelapsedavg_velocityrolling_travelledrolling_elapsedrolling_velocityrolling_between
u32catf64f64f64datetime[μs, America/Denver]f64f64f64i16f64f64f64f64f64
0"Maple Syrup"0.040.879161-111.8551692024-09-10 17:41:56 MDT1480.0null0.00NaNnullnullnullnull
1"Maple Syrup"1.21088340.879167-111.8551812024-09-10 17:41:58 MDT1480.10.6075031.21088320.605442nullnullnullnull
2"Maple Syrup"1.22761240.879172-111.8551942024-09-10 17:41:59 MDT1480.11.2276122.43849530.812832nullnullnullnull
3"Maple Syrup"0.95223840.879174-111.8552052024-09-10 17:42:00 MDT1480.10.9522383.39073340.847683nullnullnullnull
4"Maple Syrup"0.9055140.879177-111.8552152024-09-10 17:42:01 MDT1480.10.905514.29624350.8592492.2672712.80.80974null
10425"Maple Syrup"4.08514740.857847-111.8238992024-09-10 20:39:58 MDT1696.84.08637118640.85135106821.74507118634.6000410680.01.7448133.177668
10426"Maple Syrup"1.53572640.857852-111.8239162024-09-10 20:39:59 MDT1696.81.53572618642.387076106831.74505218637.42085210681.01.7449132.821613
10427"Maple Syrup"3.15613440.857869-111.8239462024-09-10 20:40:00 MDT1696.73.15771818645.54321106841.74518418640.22073810682.01.7450122.800696
10428"Maple Syrup"3.62648840.857889-111.823982024-09-10 20:40:01 MDT1696.53.63199918649.169698106851.7453618642.94350710683.01.7451042.724433
10429"Maple Syrup"3.76082940.857909-111.8240162024-09-10 20:40:02 MDT1696.33.76614418652.930528106861.74554818646.17637210684.01.7452433.235592
" 2397 | ], 2398 | "text/plain": [ 2399 | "shape: (10_430, 15)\n", 2400 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n", 2401 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n", 2402 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n", 2403 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", 2404 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 2405 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n", 2406 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2407 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2408 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2409 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2410 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2411 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2412 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n", 2413 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2414 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n", 2415 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 2416 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 2417 | "│ 10425 ┆ Maple ┆ 4.085147 ┆ 40.857847 ┆ … ┆ 18634.6000 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n", 2418 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 4 ┆ ┆ ┆ │\n", 2419 | "│ 10426 ┆ Maple ┆ 1.535726 ┆ 40.857852 ┆ … ┆ 18637.4208 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n", 2420 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 52 ┆ ┆ ┆ │\n", 2421 | "│ 10427 ┆ Maple ┆ 3.156134 ┆ 40.857869 ┆ … ┆ 18640.2207 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n", 2422 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 38 ┆ ┆ ┆ │\n", 2423 | "│ 10428 ┆ Maple ┆ 3.626488 ┆ 40.857889 ┆ … ┆ 18642.9435 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n", 2424 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 07 ┆ ┆ ┆ │\n", 2425 | "│ 10429 ┆ Maple ┆ 3.760829 ┆ 40.857909 ┆ … ┆ 18646.1763 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n", 2426 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 72 ┆ ┆ ┆ │\n", 2427 | 
"└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘" 2428 | ] 2429 | }, 2430 | "execution_count": 105, 2431 | "metadata": {}, 2432 | "output_type": "execute_result" 2433 | } 2434 | ], 2435 | "source": [ 2436 | "intermediate" 2437 | ] 2438 | }, 2439 | { 2440 | "cell_type": "code", 2441 | "execution_count": null, 2442 | "metadata": { 2443 | "lines_to_next_cell": 2, 2444 | "pycharm": { 2445 | "name": "#%%\n" 2446 | } 2447 | }, 2448 | "outputs": [], 2449 | "source": [] 2450 | }, 2451 | { 2452 | "cell_type": "code", 2453 | "execution_count": null, 2454 | "metadata": { 2455 | "lines_to_next_cell": 2, 2456 | "pycharm": { 2457 | "name": "#%%\n" 2458 | } 2459 | }, 2460 | "outputs": [], 2461 | "source": [] 2462 | }, 2463 | { 2464 | "cell_type": "code", 2465 | "execution_count": null, 2466 | "metadata": { 2467 | "lines_to_next_cell": 2, 2468 | "pycharm": { 2469 | "name": "#%%\n" 2470 | } 2471 | }, 2472 | "outputs": [], 2473 | "source": [] 2474 | }, 2475 | { 2476 | "cell_type": "code", 2477 | "execution_count": null, 2478 | "metadata": { 2479 | "lines_to_next_cell": 2, 2480 | "pycharm": { 2481 | "name": "#%%\n" 2482 | } 2483 | }, 2484 | "outputs": [], 2485 | "source": [] 2486 | }, 2487 | { 2488 | "cell_type": "code", 2489 | "execution_count": null, 2490 | "metadata": { 2491 | "lines_to_next_cell": 2, 2492 | "pycharm": { 2493 | "name": "#%%\n" 2494 | } 2495 | }, 2496 | "outputs": [], 2497 | "source": [] 2498 | }, 2499 | { 2500 | "cell_type": "markdown", 2501 | "metadata": { 2502 | "pycharm": { 2503 | "name": "#%% md\n" 2504 | } 2505 | }, 2506 | "source": [ 2507 | "## Don't Apply (map_elements) if you can" 2508 | ] 2509 | }, 2510 | { 2511 | "cell_type": "code", 2512 | "execution_count": 106, 2513 | "metadata": {}, 2514 | "outputs": [ 2515 | { 2516 | "name": "stdout", 2517 | "output_type": "stream", 2518 | "text": [ 2519 | "(10430, 15)\n" 2520 | ] 2521 | } 2522 | ], 2523 | "source": [ 2524 | "# debugging\n", 2525 | "def 
get_var(df, var_name):\n", 2526 | " globals()[var_name] = df\n", 2527 | " return df\n", 2528 | "\n", 2529 | "def tweak_gpx(df_):\n", 2530 | " return (df_\n", 2531 | " .pipe(lambda df: print(df.shape) or df)\n", 2532 | " .select(cols)\n", 2533 | " .with_row_index() \n", 2534 | " .pipe(get_var, 'intermediate')\n", 2535 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n", 2536 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n", 2537 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n", 2538 | " )\n", 2539 | "\n", 2540 | "raw = pl.read_parquet('Face_plant.parquet')\n", 2541 | "df = tweak_gpx(raw)" 2542 | ] 2543 | }, 2544 | { 2545 | "cell_type": "code", 2546 | "execution_count": 107, 2547 | "metadata": {}, 2548 | "outputs": [ 2549 | { 2550 | "name": "stderr", 2551 | "output_type": "stream", 2552 | "text": [ 2553 | "/var/folders/qn/r8_0pgj1645dn1w69vqls6cw0000gn/T/ipykernel_70391/613516876.py:7: PolarsInefficientMapWarning: \n", 2554 | "Expr.map_elements is significantly slower than the native expressions API.\n", 2555 | "Only use if you absolutely CANNOT implement your logic otherwise.\n", 2556 | "Replace this expression...\n", 2557 | " - pl.col(\"elevation\").map_elements(meters_to_feet)\n", 2558 | "with this one instead:\n", 2559 | " + pl.col(\"elevation\") * 3.28084\n", 2560 | "\n", 2561 | " ele_ft=pl.col('elevation').map_elements(meters_to_feet))\n", 2562 | "sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.\n" 2563 | ] 2564 | }, 2565 | { 2566 | "data": { 2567 | "text/html": [ 2568 | "
\n", 2575 | "shape: (10_430, 2)
elevationele_ft
f64f64
1480.04855.6432
1480.14855.971284
1480.14855.971284
1480.14855.971284
1480.14855.971284
1696.85566.929312
1696.85566.929312
1696.75566.601228
1696.55565.94506
1696.35565.288892
" 2576 | ], 2577 | "text/plain": [ 2578 | "shape: (10_430, 2)\n", 2579 | "┌───────────┬─────────────┐\n", 2580 | "│ elevation ┆ ele_ft │\n", 2581 | "│ --- ┆ --- │\n", 2582 | "│ f64 ┆ f64 │\n", 2583 | "╞═══════════╪═════════════╡\n", 2584 | "│ 1480.0 ┆ 4855.6432 │\n", 2585 | "│ 1480.1 ┆ 4855.971284 │\n", 2586 | "│ 1480.1 ┆ 4855.971284 │\n", 2587 | "│ 1480.1 ┆ 4855.971284 │\n", 2588 | "│ 1480.1 ┆ 4855.971284 │\n", 2589 | "│ … ┆ … │\n", 2590 | "│ 1696.8 ┆ 5566.929312 │\n", 2591 | "│ 1696.8 ┆ 5566.929312 │\n", 2592 | "│ 1696.7 ┆ 5566.601228 │\n", 2593 | "│ 1696.5 ┆ 5565.94506 │\n", 2594 | "│ 1696.3 ┆ 5565.288892 │\n", 2595 | "└───────────┴─────────────┘" 2596 | ] 2597 | }, 2598 | "execution_count": 107, 2599 | "metadata": {}, 2600 | "output_type": "execute_result" 2601 | } 2602 | ], 2603 | "source": [ 2604 | "# convert elevation from meters to feet\n", 2605 | "def meters_to_feet(m):\n", 2606 | " return m * 3.28084\n", 2607 | "\n", 2608 | "(df\n", 2609 | " .select('elevation', \n", 2610 | " ele_ft=pl.col('elevation').map_elements(meters_to_feet)) \n", 2611 | ")" 2612 | ] 2613 | }, 2614 | { 2615 | "cell_type": "code", 2616 | "execution_count": 108, 2617 | "metadata": {}, 2618 | "outputs": [ 2619 | { 2620 | "data": { 2621 | "text/html": [ 2622 | "
\n", 2629 | "shape: (10_430, 2)
elevationele_ft
f64f64
1480.04855.6432
1480.14855.971284
1480.14855.971284
1480.14855.971284
1480.14855.971284
1696.85566.929312
1696.85566.929312
1696.75566.601228
1696.55565.94506
1696.35565.288892
" 2630 | ], 2631 | "text/plain": [ 2632 | "shape: (10_430, 2)\n", 2633 | "┌───────────┬─────────────┐\n", 2634 | "│ elevation ┆ ele_ft │\n", 2635 | "│ --- ┆ --- │\n", 2636 | "│ f64 ┆ f64 │\n", 2637 | "╞═══════════╪═════════════╡\n", 2638 | "│ 1480.0 ┆ 4855.6432 │\n", 2639 | "│ 1480.1 ┆ 4855.971284 │\n", 2640 | "│ 1480.1 ┆ 4855.971284 │\n", 2641 | "│ 1480.1 ┆ 4855.971284 │\n", 2642 | "│ 1480.1 ┆ 4855.971284 │\n", 2643 | "│ … ┆ … │\n", 2644 | "│ 1696.8 ┆ 5566.929312 │\n", 2645 | "│ 1696.8 ┆ 5566.929312 │\n", 2646 | "│ 1696.7 ┆ 5566.601228 │\n", 2647 | "│ 1696.5 ┆ 5565.94506 │\n", 2648 | "│ 1696.3 ┆ 5565.288892 │\n", 2649 | "└───────────┴─────────────┘" 2650 | ] 2651 | }, 2652 | "execution_count": 108, 2653 | "metadata": {}, 2654 | "output_type": "execute_result" 2655 | } 2656 | ], 2657 | "source": [ 2658 | "# convert elevation from meters to feet\n", 2659 | "def meters_to_feet(m):\n", 2660 | " return m * 3.28084\n", 2661 | "\n", 2662 | "(df\n", 2663 | " .select('elevation', \n", 2664 | " ele_ft=meters_to_feet(pl.col('elevation')))\n", 2665 | ")" 2666 | ] 2667 | }, 2668 | { 2669 | "cell_type": "code", 2670 | "execution_count": 109, 2671 | "metadata": {}, 2672 | "outputs": [ 2673 | { 2674 | "data": { 2675 | "text/html": [ 2676 | "
\n", 2683 | "shape: (10_430, 2)
elevationele_ft
f64f64
1480.04855.6432
1480.14855.971284
1480.14855.971284
1480.14855.971284
1480.14855.971284
1696.85566.929312
1696.85566.929312
1696.75566.601228
1696.55565.94506
1696.35565.288892
" 2684 | ], 2685 | "text/plain": [ 2686 | "shape: (10_430, 2)\n", 2687 | "┌───────────┬─────────────┐\n", 2688 | "│ elevation ┆ ele_ft │\n", 2689 | "│ --- ┆ --- │\n", 2690 | "│ f64 ┆ f64 │\n", 2691 | "╞═══════════╪═════════════╡\n", 2692 | "│ 1480.0 ┆ 4855.6432 │\n", 2693 | "│ 1480.1 ┆ 4855.971284 │\n", 2694 | "│ 1480.1 ┆ 4855.971284 │\n", 2695 | "│ 1480.1 ┆ 4855.971284 │\n", 2696 | "│ 1480.1 ┆ 4855.971284 │\n", 2697 | "│ … ┆ … │\n", 2698 | "│ 1696.8 ┆ 5566.929312 │\n", 2699 | "│ 1696.8 ┆ 5566.929312 │\n", 2700 | "│ 1696.7 ┆ 5566.601228 │\n", 2701 | "│ 1696.5 ┆ 5565.94506 │\n", 2702 | "│ 1696.3 ┆ 5565.288892 │\n", 2703 | "└───────────┴─────────────┘" 2704 | ] 2705 | }, 2706 | "execution_count": 109, 2707 | "metadata": {}, 2708 | "output_type": "execute_result" 2709 | } 2710 | ], 2711 | "source": [ 2712 | "# Perhaps more readable\n", 2713 | "# convert elevation from meters to feet\n", 2714 | "def meters_to_feet(m):\n", 2715 | " return m * 3.28084\n", 2716 | "\n", 2717 | "(df\n", 2718 | " .select('elevation', \n", 2719 | " ele_ft=pl.col('elevation').pipe(meters_to_feet))\n", 2720 | ")" 2721 | ] 2722 | }, 2723 | { 2724 | "cell_type": "code", 2725 | "execution_count": null, 2726 | "metadata": { 2727 | "collapsed": true, 2728 | "jupyter": { 2729 | "outputs_hidden": true 2730 | } 2731 | }, 2732 | "outputs": [], 2733 | "source": [ 2734 | "%%timeit\n", 2735 | "# takes 965 µs on my machine\n", 2736 | "(df\n", 2737 | " .select('elevation', ele_ft=pl.col('elevation').map_elements(meters_to_feet)) \n", 2738 | ")" 2739 | ] 2740 | }, 2741 | { 2742 | "cell_type": "code", 2743 | "execution_count": 110, 2744 | "metadata": {}, 2745 | "outputs": [ 2746 | { 2747 | "name": "stdout", 2748 | "output_type": "stream", 2749 | "text": [ 2750 | "38 µs ± 514 ns per loop (mean ± std. dev. 
of 7 runs, 10,000 loops each)\n" 2751 | ] 2752 | } 2753 | ], 2754 | "source": [ 2755 | "%%timeit\n", 2756 | "(df\n", 2757 | " .select('elevation', \n", 2758 | " ele_ft=pl.col('elevation').pipe(meters_to_feet))\n", 2759 | ")" 2760 | ] 2761 | }, 2762 | { 2763 | "cell_type": "code", 2764 | "execution_count": 111, 2765 | "metadata": {}, 2766 | "outputs": [ 2767 | { 2768 | "name": "stdout", 2769 | "output_type": "stream", 2770 | "text": [ 2771 | "37.7 µs ± 1.11 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" 2772 | ] 2773 | } 2774 | ], 2775 | "source": [ 2776 | "%%timeit\n", 2777 | "(df\n", 2778 | " .select('elevation', \n", 2779 | " ele_ft=pl.col('elevation')*3.28084)\n", 2780 | ")" 2781 | ] 2782 | }, 2783 | { 2784 | "cell_type": "code", 2785 | "execution_count": 112, 2786 | "metadata": {}, 2787 | "outputs": [ 2788 | { 2789 | "data": { 2790 | "text/plain": [ 2791 | "24.125" 2792 | ] 2793 | }, 2794 | "execution_count": 112, 2795 | "metadata": {}, 2796 | "output_type": "execute_result" 2797 | } 2798 | ], 2799 | "source": [ 2800 | "965/40" 2801 | ] 2802 | }, 2803 | { 2804 | "cell_type": "code", 2805 | "execution_count": null, 2806 | "metadata": { 2807 | "lines_to_next_cell": 2, 2808 | "pycharm": { 2809 | "name": "#%%\n" 2810 | } 2811 | }, 2812 | "outputs": [], 2813 | "source": [] 2814 | }, 2815 | { 2816 | "cell_type": "markdown", 2817 | "metadata": { 2818 | "lines_to_next_cell": 2, 2819 | "pycharm": { 2820 | "name": "#%%\n" 2821 | } 2822 | }, 2823 | "source": [ 2824 | "## Benchmark caveat\n", 2825 | "- Benchmark with data of the same size you use in the real world" 2826 | ] 2827 | }, 2828 | { 2829 | "cell_type": "code", 2830 | "execution_count": null, 2831 | "metadata": { 2832 | "lines_to_next_cell": 2, 2833 | "pycharm": { 2834 | "name": "#%%\n" 2835 | } 2836 | }, 2837 | "outputs": [], 2838 | "source": [] 2839 | }, 2840 | { 2841 | "cell_type": "code", 2842 | "execution_count": null, 2843 | "metadata": { 2844 | "lines_to_next_cell": 2, 2845 | "pycharm": { 2846 | 
"name": "#%%\n" 2847 | } 2848 | }, 2849 | "outputs": [], 2850 | "source": [] 2851 | }, 2852 | { 2853 | "cell_type": "code", 2854 | "execution_count": null, 2855 | "metadata": { 2856 | "lines_to_next_cell": 2, 2857 | "pycharm": { 2858 | "name": "#%%\n" 2859 | } 2860 | }, 2861 | "outputs": [], 2862 | "source": [] 2863 | }, 2864 | { 2865 | "cell_type": "code", 2866 | "execution_count": null, 2867 | "metadata": { 2868 | "lines_to_next_cell": 2, 2869 | "pycharm": { 2870 | "name": "#%%\n" 2871 | } 2872 | }, 2873 | "outputs": [], 2874 | "source": [] 2875 | }, 2876 | { 2877 | "cell_type": "code", 2878 | "execution_count": null, 2879 | "metadata": { 2880 | "lines_to_next_cell": 2, 2881 | "pycharm": { 2882 | "name": "#%%\n" 2883 | } 2884 | }, 2885 | "outputs": [], 2886 | "source": [] 2887 | }, 2888 | { 2889 | "cell_type": "markdown", 2890 | "metadata": { 2891 | "pycharm": { 2892 | "name": "#%% md\n" 2893 | } 2894 | }, 2895 | "source": [ 2896 | "## Master Aggregation\n", 2897 | "\n", 2898 | "Let's speed (and distance) by 10 minute intervals" 2899 | ] 2900 | }, 2901 | { 2902 | "cell_type": "code", 2903 | "execution_count": 113, 2904 | "metadata": {}, 2905 | "outputs": [ 2906 | { 2907 | "name": "stdout", 2908 | "output_type": "stream", 2909 | "text": [ 2910 | "(10430, 15)\n" 2911 | ] 2912 | }, 2913 | { 2914 | "data": { 2915 | "text/html": [ 2916 | "
\n", 2923 | "shape: (19, 4)
timetravelledspeedmph
datetime[μs, America/Denver]f64f64f64
2024-09-10 17:40:00 MDT1620.9560033.3560177.507208
2024-09-10 17:50:00 MDT1081.9738281.80634.040585
2024-09-10 18:00:00 MDT99.7248380.1664860.372418
2024-09-10 18:10:00 MDT1703.0068852.8430836.359807
2024-09-10 18:20:00 MDT1602.5146982.6753175.984523
2024-09-10 20:00:00 MDT973.9531591.6259653.637187
2024-09-10 20:10:00 MDT680.1180681.1354222.539872
2024-09-10 20:20:00 MDT796.693771.330042.975219
2024-09-10 20:30:00 MDT763.9670621.2754042.853002
2024-09-10 20:40:00 MDT7.3873183.6936598.262493
" 2924 | ], 2925 | "text/plain": [ 2926 | "shape: (19, 4)\n", 2927 | "┌──────────────────────────────┬─────────────┬──────────┬──────────┐\n", 2928 | "│ time ┆ travelled ┆ speed ┆ mph │\n", 2929 | "│ --- ┆ --- ┆ --- ┆ --- │\n", 2930 | "│ datetime[μs, America/Denver] ┆ f64 ┆ f64 ┆ f64 │\n", 2931 | "╞══════════════════════════════╪═════════════╪══════════╪══════════╡\n", 2932 | "│ 2024-09-10 17:40:00 MDT ┆ 1620.956003 ┆ 3.356017 ┆ 7.507208 │\n", 2933 | "│ 2024-09-10 17:50:00 MDT ┆ 1081.973828 ┆ 1.8063 ┆ 4.040585 │\n", 2934 | "│ 2024-09-10 18:00:00 MDT ┆ 99.724838 ┆ 0.166486 ┆ 0.372418 │\n", 2935 | "│ 2024-09-10 18:10:00 MDT ┆ 1703.006885 ┆ 2.843083 ┆ 6.359807 │\n", 2936 | "│ 2024-09-10 18:20:00 MDT ┆ 1602.514698 ┆ 2.675317 ┆ 5.984523 │\n", 2937 | "│ … ┆ … ┆ … ┆ … │\n", 2938 | "│ 2024-09-10 20:00:00 MDT ┆ 973.953159 ┆ 1.625965 ┆ 3.637187 │\n", 2939 | "│ 2024-09-10 20:10:00 MDT ┆ 680.118068 ┆ 1.135422 ┆ 2.539872 │\n", 2940 | "│ 2024-09-10 20:20:00 MDT ┆ 796.69377 ┆ 1.33004 ┆ 2.975219 │\n", 2941 | "│ 2024-09-10 20:30:00 MDT ┆ 763.967062 ┆ 1.275404 ┆ 2.853002 │\n", 2942 | "│ 2024-09-10 20:40:00 MDT ┆ 7.387318 ┆ 3.693659 ┆ 8.262493 │\n", 2943 | "└──────────────────────────────┴─────────────┴──────────┴──────────┘" 2944 | ] 2945 | }, 2946 | "execution_count": 113, 2947 | "metadata": {}, 2948 | "output_type": "execute_result" 2949 | } 2950 | ], 2951 | "source": [ 2952 | "def meters_per_second_to_mph(mps):\n", 2953 | " return mps * 2.23694\n", 2954 | "\n", 2955 | "(tweak_gpx(raw)\n", 2956 | " .group_by_dynamic(index_column='time', every='10m')\n", 2957 | " .agg(pl.col('travelled').last() - pl.col('travelled').first(),\n", 2958 | " speed=(pl.col('travelled').last() - pl.col('travelled').first()) / \n", 2959 | " ((pl.col('time').last() - pl.col('time').first()).dt.total_seconds())\n", 2960 | " ) \n", 2961 | " .with_columns(mph=pl.col('speed').pipe(meters_per_second_to_mph))\n", 2962 | " )" 2963 | ] 2964 | }, 2965 | { 2966 | "cell_type": "code", 2967 | "execution_count": 114, 
2968 | "metadata": {}, 2969 | "outputs": [ 2970 | { 2971 | "name": "stdout", 2972 | "output_type": "stream", 2973 | "text": [ 2974 | "(10430, 15)\n" 2975 | ] 2976 | }, 2977 | { 2978 | "data": { 2979 | "text/html": [ 2980 | "\n", 2981 | "\n", 2992 | "
\n", 2993 | "" 3046 | ], 3047 | "text/plain": [ 3048 | "alt.Chart(...)" 3049 | ] 3050 | }, 3051 | "execution_count": 114, 3052 | "metadata": {}, 3053 | "output_type": "execute_result" 3054 | } 3055 | ], 3056 | "source": [ 3057 | "def meters_per_second_to_mph(mps):\n", 3058 | " return mps * 2.23694\n", 3059 | "\n", 3060 | "(tweak_gpx(raw)\n", 3061 | " .group_by_dynamic(index_column='time', every='10m')\n", 3062 | " .agg(pl.col('travelled').last() - pl.col('travelled').first(),\n", 3063 | " speed=(pl.col('travelled').last() - pl.col('travelled').first()) / \n", 3064 | " ((pl.col('time').last() - pl.col('time').first()).dt.total_seconds())\n", 3065 | " ) \n", 3066 | " .with_columns(mph=pl.col('speed').pipe(meters_per_second_to_mph))\n", 3067 | " .plot.bar(x='time', y='mph')\n", 3068 | " )" 3069 | ] 3070 | }, 3071 | { 3072 | "cell_type": "code", 3073 | "execution_count": 115, 3074 | "metadata": {}, 3075 | "outputs": [ 3076 | { 3077 | "name": "stdout", 3078 | "output_type": "stream", 3079 | "text": [ 3080 | "(10430, 15)\n" 3081 | ] 3082 | }, 3083 | { 3084 | "data": { 3085 | "text/html": [ 3086 | "
\n", 3093 | "shape: (2, 2)
climbingdistance_2d
boolf64
false5.46702
true6.123374
" 3094 | ], 3095 | "text/plain": [ 3096 | "shape: (2, 2)\n", 3097 | "┌──────────┬─────────────┐\n", 3098 | "│ climbing ┆ distance_2d │\n", 3099 | "│ --- ┆ --- │\n", 3100 | "│ bool ┆ f64 │\n", 3101 | "╞══════════╪═════════════╡\n", 3102 | "│ false ┆ 5.46702 │\n", 3103 | "│ true ┆ 6.123374 │\n", 3104 | "└──────────┴─────────────┘" 3105 | ] 3106 | }, 3107 | "execution_count": 115, 3108 | "metadata": {}, 3109 | "output_type": "execute_result" 3110 | } 3111 | ], 3112 | "source": [ 3113 | "# uphill vs downhill\n", 3114 | "def meters_to_feet(m):\n", 3115 | " return m * 3.28084\n", 3116 | "\n", 3117 | "def feet_to_miles(f):\n", 3118 | " return f / 5280\n", 3119 | "\n", 3120 | "(tweak_gpx(raw)\n", 3121 | " .with_columns(climbing=pl.col('elevation').diff().gt(0))\n", 3122 | " .group_by('climbing')\n", 3123 | " .agg(pl.col('distance_2d').sum().pipe(meters_to_feet).pipe(feet_to_miles))\n", 3124 | " .filter(~pl.col('climbing').is_null())\n", 3125 | ")" 3126 | ] 3127 | }, 3128 | { 3129 | "cell_type": "code", 3130 | "execution_count": 116, 3131 | "metadata": {}, 3132 | "outputs": [ 3133 | { 3134 | "name": "stdout", 3135 | "output_type": "stream", 3136 | "text": [ 3137 | "(10430, 15)\n" 3138 | ] 3139 | }, 3140 | { 3141 | "data": { 3142 | "text/html": [ 3143 | "\n", 3144 | "\n", 3155 | "
\n", 3156 | "" 3209 | ], 3210 | "text/plain": [ 3211 | "alt.Chart(...)" 3212 | ] 3213 | }, 3214 | "execution_count": 116, 3215 | "metadata": {}, 3216 | "output_type": "execute_result" 3217 | } 3218 | ], 3219 | "source": [ 3220 | "# uphill vs downhill\n", 3221 | "def meters_to_feet(m):\n", 3222 | " return m * 3.28084\n", 3223 | "\n", 3224 | "def feet_to_miles(f):\n", 3225 | " return f / 5280\n", 3226 | "\n", 3227 | "(tweak_gpx(raw)\n", 3228 | " .with_columns(climbing=pl.col('elevation').diff().gt(0))\n", 3229 | " .group_by('climbing')\n", 3230 | " .agg(pl.col('distance_2d').sum().pipe(meters_to_feet).pipe(feet_to_miles))\n", 3231 | " .filter(~pl.col('climbing').is_null())\n", 3232 | " .plot.bar(x='climbing', y='distance_2d')\n", 3233 | ")" 3234 | ] 3235 | }, 3236 | { 3237 | "cell_type": "code", 3238 | "execution_count": null, 3239 | "metadata": { 3240 | "lines_to_next_cell": 2, 3241 | "pycharm": { 3242 | "name": "#%%\n" 3243 | } 3244 | }, 3245 | "outputs": [], 3246 | "source": [] 3247 | }, 3248 | { 3249 | "cell_type": "code", 3250 | "execution_count": null, 3251 | "metadata": { 3252 | "lines_to_next_cell": 2, 3253 | "pycharm": { 3254 | "name": "#%%\n" 3255 | } 3256 | }, 3257 | "outputs": [], 3258 | "source": [] 3259 | }, 3260 | { 3261 | "cell_type": "code", 3262 | "execution_count": null, 3263 | "metadata": { 3264 | "lines_to_next_cell": 2, 3265 | "pycharm": { 3266 | "name": "#%%\n" 3267 | } 3268 | }, 3269 | "outputs": [], 3270 | "source": [] 3271 | }, 3272 | { 3273 | "cell_type": "code", 3274 | "execution_count": null, 3275 | "metadata": { 3276 | "lines_to_next_cell": 2, 3277 | "pycharm": { 3278 | "name": "#%%\n" 3279 | } 3280 | }, 3281 | "outputs": [], 3282 | "source": [] 3283 | }, 3284 | { 3285 | "cell_type": "code", 3286 | "execution_count": null, 3287 | "metadata": { 3288 | "lines_to_next_cell": 2, 3289 | "pycharm": { 3290 | "name": "#%%\n" 3291 | } 3292 | }, 3293 | "outputs": [], 3294 | "source": [] 3295 | }, 3296 | { 3297 | "cell_type": "code", 3298 | 
"execution_count": null, 3299 | "metadata": { 3300 | "lines_to_next_cell": 2, 3301 | "pycharm": { 3302 | "name": "#%%\n" 3303 | } 3304 | }, 3305 | "outputs": [], 3306 | "source": [] 3307 | }, 3308 | { 3309 | "cell_type": "code", 3310 | "execution_count": null, 3311 | "metadata": { 3312 | "lines_to_next_cell": 2, 3313 | "pycharm": { 3314 | "name": "#%%\n" 3315 | } 3316 | }, 3317 | "outputs": [], 3318 | "source": [] 3319 | }, 3320 | { 3321 | "cell_type": "code", 3322 | "execution_count": null, 3323 | "metadata": { 3324 | "lines_to_next_cell": 2, 3325 | "pycharm": { 3326 | "name": "#%%\n" 3327 | } 3328 | }, 3329 | "outputs": [], 3330 | "source": [] 3331 | }, 3332 | { 3333 | "cell_type": "markdown", 3334 | "metadata": { 3335 | "pycharm": { 3336 | "name": "#%% md\n" 3337 | } 3338 | }, 3339 | "source": [ 3340 | "## Summary\n", 3341 | "\n", 3342 | "* Correct types save space and enable convenient math, string, and date functionality\n", 3343 | "* Chaining operations will:\n", 3344 | " * Make code readable\n", 3345 | " * Remove bugs\n", 3346 | " * Easier to debug\n", 3347 | "* ``.map_elements`` is slow for math\n", 3348 | "* Aggregations are powerful. Play with them until they make sense\n", 3349 | "\n", 3350 | "\n", 3351 | "Let's connect! 
Happy to discuss how your team can better leverage tabular technologies.\n", 3352 | "\n", 3353 | "Twitter ``@__mharrison__``, LinkedIn\n", 3354 | "\n", 3355 | "Book giveaway" 3356 | ] 3357 | }, 3358 | { 3359 | "cell_type": "code", 3360 | "execution_count": 117, 3361 | "metadata": {}, 3362 | "outputs": [], 3363 | "source": [ 3364 | "import random" 3365 | ] 3366 | }, 3367 | { 3368 | "cell_type": "code", 3369 | "execution_count": 118, 3370 | "metadata": {}, 3371 | "outputs": [ 3372 | { 3373 | "data": { 3374 | "text/plain": [ 3375 | "10" 3376 | ] 3377 | }, 3378 | "execution_count": 118, 3379 | "metadata": {}, 3380 | "output_type": "execute_result" 3381 | } 3382 | ], 3383 | "source": [ 3384 | "random.randrange(0, 12)" 3385 | ] 3386 | }, 3387 | { 3388 | "cell_type": "code", 3389 | "execution_count": null, 3390 | "metadata": {}, 3391 | "outputs": [], 3392 | "source": [ 3393 | "random.randrange(0, 3)" 3394 | ] 3395 | }, 3396 | { 3397 | "cell_type": "code", 3398 | "execution_count": null, 3399 | "metadata": {}, 3400 | "outputs": [], 3401 | "source": [ 3402 | "import random\n", 3403 | "random.choice([0,1])" 3404 | ] 3405 | }, 3406 | { 3407 | "cell_type": "code", 3408 | "execution_count": null, 3409 | "metadata": { 3410 | "lines_to_next_cell": 2, 3411 | "pycharm": { 3412 | "name": "#%%\n" 3413 | } 3414 | }, 3415 | "outputs": [], 3416 | "source": [ 3417 | "import random\n", 3418 | "random.randrange(1,4)" 3419 | ] 3420 | }, 3421 | { 3422 | "cell_type": "code", 3423 | "execution_count": null, 3424 | "metadata": { 3425 | "lines_to_next_cell": 2, 3426 | "pycharm": { 3427 | "name": "#%%\n" 3428 | } 3429 | }, 3430 | "outputs": [], 3431 | "source": [] 3432 | }, 3433 | { 3434 | "cell_type": "code", 3435 | "execution_count": null, 3436 | "metadata": { 3437 | "lines_to_next_cell": 2, 3438 | "pycharm": { 3439 | "name": "#%%\n" 3440 | } 3441 | }, 3442 | "outputs": [], 3443 | "source": [] 3444 | }, 3445 | { 3446 | "cell_type": "code", 3447 | "execution_count": null, 3448 | "metadata": { 
3449 | "pycharm": { 3450 | "name": "#%%\n" 3451 | } 3452 | }, 3453 | "outputs": [], 3454 | "source": [] 3455 | } 3456 | ], 3457 | "metadata": { 3458 | "jupytext": { 3459 | "encoding": "# -*- coding: utf-8 -*-", 3460 | "formats": "ipynb,py:light" 3461 | }, 3462 | "kernelspec": { 3463 | "display_name": "Python 3 (ipykernel)", 3464 | "language": "python", 3465 | "name": "python3" 3466 | }, 3467 | "language_info": { 3468 | "codemirror_mode": { 3469 | "name": "ipython", 3470 | "version": 3 3471 | }, 3472 | "file_extension": ".py", 3473 | "mimetype": "text/x-python", 3474 | "name": "python", 3475 | "nbconvert_exporter": "python", 3476 | "pygments_lexer": "ipython3", 3477 | "version": "3.10.15" 3478 | } 3479 | }, 3480 | "nbformat": 4, 3481 | "nbformat_minor": 4 3482 | } 3483 | --------------------------------------------------------------------------------