├── pyproject.toml
├── .devcontainer
└── devcontainer.json
├── README.md
└── polars
└── Idiomatic Polars.ipynb
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "pres"
3 | version = "0.1.0"
4 | description = "pres"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "altair>=5.4.1",
9 | "catboost>=1.2.7",
10 | "gpxpy>=1.6.2",
11 | "matplotlib>=3.9.2",
12 | "notebook>=7.2.2",
13 | "pandas>=2.2.3",
14 | "plotly>=5.24.1",
15 | "polars>=1.12.0",
16 | "pyarrow>=18.0.0",
17 | "scikit-learn>=1.5.2",
18 | "seaborn>=0.13.2",
19 | "shap>=0.46.0",
20 | "xlrd>=2.0.1",
21 | "yellowbrick>=1.5",
22 | ]
23 |
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 | "image": "mcr.microsoft.com/devcontainers/universal:2",
3 | "hostRequirements": {
4 | "cpus": 4
5 | },
6 | "waitFor": "onCreateCommand",
7 | "updateContentCommand": "python3 -m pip install uv; uv sync",
8 | "postCreateCommand": "",
9 | "customizations": {
10 | "codespaces": {
11 | "openFiles": []
12 | },
13 | "vscode": {
14 | "extensions": [
15 | "ms-toolsai.jupyter",
16 | "ms-python.python"
17 | ]
18 | }
19 | }
20 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyData NYC 2024
2 |
3 |
4 | ## Contents
5 |
6 | - **Notebooks**:
7 | - `polars/Idiomatic Polars.ipynb`: the "slides" (a Jupyter notebook) for the *Idiomatic Polars* talk
8 | - **Environment Setup**: Instructions for running these notebooks in various environments (locally, Codespaces, and Google Colab).
9 |
10 | ## Running the Jupyter Notebooks
11 |
12 | You can run the notebooks in several ways, depending on your preferences and setup:
13 |
14 | ### 1. Running Locally
15 |
16 | To run the notebooks locally, you will need to create a virtual environment and install the necessary dependencies.
17 |
18 | 1. Clone the repository:
19 | ```sh
20 | git clone git@github.com:mattharrison/pydata_nyc_2024.git
21 | cd pydata_nyc_2024
22 | ```
23 |
24 | 2. Set up the environment using `uv`:
25 |
26 | Mac/Linux:
27 | ```sh
28 | $ curl -LsSf https://astral.sh/uv/install.sh | sh
29 | ```
30 | Windows:
31 | ```cmd
32 | > powershell -c "irm https://astral.sh/uv/install.ps1 | iex"
33 | ```
34 |
35 | Then run:
36 | ```
37 | uv sync
38 | ```
39 |
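40 | The above commands install `uv`, create a virtual environment (in `.venv`), and install the dependencies into it. You do not need to activate the environment yourself: `uv run` (used in the next step) runs commands inside it automatically.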
41 |
42 | 3. Start Jupyter Notebook:
43 | ```sh
44 | uv run jupyter notebook
45 | ```
46 |
47 | ### 2. Running on GitHub Codespaces
48 |
49 | GitHub Codespaces allows you to run this project entirely in the cloud without needing to set up a local environment.
50 |
51 | 1. Open the GitHub repository in your browser.
52 | 2. Click the **Code** button, select the **Codespaces** tab, and create a new codespace (older versions of the GitHub UI label this **Open with Codespaces**).
53 | 3. After the Codespace launches, wait for it to install the environment.
54 | 4. Click on the notebook and select the local kernel.
55 |
56 | The project is pre-configured to install the necessary dependencies when the Codespace is first created.
57 |
58 | ### 3. Running on Google Colab
59 |
60 | You can also run the notebooks on Google Colab, which provides free GPU/TPU resources for faster computations.
61 |
62 | 1. Open the repository on GitHub.
63 | 2. Navigate to the desired Jupyter notebook file (ending in `.ipynb`).
64 | 3. In your browser's address bar, change the domain in the notebook's URL from `github.com` to `githubtocolab.com` and press Enter.
65 | 4. Once in Colab, you may need to run the first cell to install any required dependencies.
66 |
67 | ## Dependencies
68 |
69 |
70 | To see the complete list of dependencies, please check the `pyproject.toml` file.
71 |
72 | ## Suggested Reading
73 |
74 | To deepen your understanding of Python for data analysis, I recommend the following books:
75 |
76 | - [*Learning Python for Data*](https://store.metasnake.com/learningpy): This book provides a comprehensive introduction to Python, with a focus on its applications in data analysis. It covers Python fundamentals, essential libraries, and practical examples to help you get started with data-driven projects.
77 |
78 | - [*Effective Polars*](https://store.metasnake.com/a5018258-063b-4802-b395-34e75b6eeb5e): A guide to mastering data manipulation and analysis with Polars.
79 |
80 | ## Contributing
81 |
82 | Contributions are welcome! Feel free to open an issue or submit a pull request if you have suggestions, improvements, or additional notebooks to add.
83 |
84 | ## License
85 |
86 | This project is licensed under the MIT License.
87 |
88 | ## Contact
89 |
90 | If you have any questions, feel free to reach out or open an issue on the repository.
91 |
92 |
--------------------------------------------------------------------------------
/polars/Idiomatic Polars.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "lines_to_next_cell": 0,
7 | "pycharm": {
8 | "name": "#%% md\n"
9 | }
10 | },
11 | "source": [
12 | "# Idiomatic Polars \n",
13 | "\n",
14 | "## Matt Harrison - PyData NYC 2024\n",
15 | "\n",
16 | "## https://github.com/mattharrison/pydata_nyc_2024\n",
17 | "\n",
18 | "\n",
19 | "\n",
20 | " 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_columns\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcol\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43melapsed\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcast\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mInt8\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m#.describe()\u001b[39;00m\n\u001b[1;32m 7\u001b[0m )\n",
1109 | "File \u001b[0;32m~/.envs/menv/lib/python3.10/site-packages/polars/dataframe/frame.py:9194\u001b[0m, in \u001b[0;36mDataFrame.with_columns\u001b[0;34m(self, *exprs, **named_exprs)\u001b[0m\n\u001b[1;32m 9048\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwith_columns\u001b[39m(\n\u001b[1;32m 9049\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 9050\u001b[0m \u001b[38;5;241m*\u001b[39mexprs: IntoExpr \u001b[38;5;241m|\u001b[39m Iterable[IntoExpr],\n\u001b[1;32m 9051\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mnamed_exprs: IntoExpr,\n\u001b[1;32m 9052\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 9053\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 9054\u001b[0m \u001b[38;5;124;03m Add columns to this DataFrame.\u001b[39;00m\n\u001b[1;32m 9055\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 9192\u001b[0m \u001b[38;5;124;03m └─────┴──────┴─────────────┘\u001b[39;00m\n\u001b[1;32m 9193\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 9194\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlazy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_columns\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexprs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mnamed_exprs\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_eager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
1110 | "File \u001b[0;32m~/.envs/menv/lib/python3.10/site-packages/polars/lazyframe/frame.py:2055\u001b[0m, in \u001b[0;36mLazyFrame.collect\u001b[0;34m(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, streaming, engine, background, _eager, **_kwargs)\u001b[0m\n\u001b[1;32m 2053\u001b[0m \u001b[38;5;66;03m# Only for testing purposes\u001b[39;00m\n\u001b[1;32m 2054\u001b[0m callback \u001b[38;5;241m=\u001b[39m _kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpost_opt_callback\u001b[39m\u001b[38;5;124m\"\u001b[39m, callback)\n\u001b[0;32m-> 2055\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m wrap_df(\u001b[43mldf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcallback\u001b[49m\u001b[43m)\u001b[49m)\n",
1111 | "\u001b[0;31mInvalidOperationError\u001b[0m: conversion from `i64` to `i8` failed in column 'elapsed' for 10321 out of 10430 values: [128, 129, … 10686]"
1112 | ]
1113 | }
1114 | ],
1115 | "source": [
1116 | "# chaining\n",
1117 | "# polars prevents illegal casts\n",
1118 | "(df\n",
1119 | " .select(cols)\n",
1120 | " .with_columns(pl.col('elapsed').cast(pl.Int8))\n",
1121 | " #.describe()\n",
1122 | ")"
1123 | ]
1124 | },
1125 | {
1126 | "cell_type": "code",
1127 | "execution_count": 80,
1128 | "metadata": {
1129 | "pycharm": {
1130 | "name": "#%%\n"
1131 | },
1132 | "scrolled": true
1133 | },
1134 | "outputs": [
1135 | {
1136 | "data": {
1137 | "text/html": [
1138 | "
\n",
1145 | "
shape: (9, 15)| statistic | course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| str | f64 | f64 | f64 | f64 | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| "count" | 0.0 | 10430.0 | 10430.0 | 10430.0 | "10430" | 10430.0 | 10429.0 | 10430.0 | 10430.0 | 10430.0 | 10426.0 | 10426.0 | 10426.0 | 10425.0 |
| "null_count" | 10430.0 | 0.0 | 0.0 | 0.0 | "0" | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 4.0 | 4.0 | 4.0 | 5.0 |
| "mean" | null | 1.788392 | 40.859662 | -111.822538 | "2024-09-11 01:11:59.773921+00:… | 1788.95372 | 1.811534 | 10644.58751 | 5403.773921 | NaN | 10645.09369 | 5403.797123 | 2.065183 | 1.81141 |
| "std" | null | 1.710749 | 0.005536 | 0.0137 | null | 141.492882 | 1.657749 | 5393.503724 | 3067.406074 | NaN | 5391.382656 | 3066.209114 | 0.418829 | 1.506718 |
| "min" | null | 0.0 | 40.848371 | -111.855371 | "2024-09-10 23:41:56+00:00" | 1480.0 | 0.0 | 0.0 | 0.0 | 0.374262 | 2.267271 | 2.8 | 0.376911 | -1.0081e-14 |
| "25%" | null | 0.611231 | 40.856748 | -111.829785 | "2024-09-11 00:27:56+00:00" | 1696.0 | 0.675361 | 5810.431187 | 2760.0 | 1.848393 | 5812.456847 | 2761.0 | 1.848439 | 0.776866 |
| "50%" | null | 1.324637 | 40.859156 | -111.82222 | "2024-09-11 01:12:20+00:00" | 1816.5 | 1.392097 | 12239.423704 | 5424.0 | 1.981256 | 12239.47112 | 5424.0 | 1.981257 | 1.582638 |
| "75%" | null | 2.424472 | 40.86085 | -111.813012 | "2024-09-11 01:56:25+00:00" | 1924.9 | 2.506978 | 15087.566241 | 8069.0 | 2.231917 | 15086.449322 | 8068.0 | 2.231922 | 2.428585 |
| "max" | null | 15.32443 | 40.879271 | -111.801421 | "2024-09-11 02:40:02+00:00" | 1984.0 | 12.871382 | 18652.930528 | 10686.0 | 3.828152 | 18646.176372 | 10684.0 | 3.827216 | 12.491514 |
"
1146 | ],
1147 | "text/plain": [
1148 | "shape: (9, 15)\n",
1149 | "┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n",
1150 | "│ statistic ┆ course ┆ distance_ ┆ latitude ┆ … ┆ rolling_t ┆ rolling_e ┆ rolling_v ┆ rolling_b │\n",
1151 | "│ --- ┆ --- ┆ 2d ┆ --- ┆ ┆ ravelled ┆ lapsed ┆ elocity ┆ etween │\n",
1152 | "│ str ┆ f64 ┆ --- ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
1153 | "│ ┆ ┆ f64 ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
1154 | "╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n",
1155 | "│ count ┆ 0.0 ┆ 10430.0 ┆ 10430.0 ┆ … ┆ 10426.0 ┆ 10426.0 ┆ 10426.0 ┆ 10425.0 │\n",
1156 | "│ null_count ┆ 10430.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ 4.0 ┆ 4.0 ┆ 4.0 ┆ 5.0 │\n",
1157 | "│ mean ┆ null ┆ 1.788392 ┆ 40.859662 ┆ … ┆ 10645.093 ┆ 5403.7971 ┆ 2.065183 ┆ 1.81141 │\n",
1158 | "│ ┆ ┆ ┆ ┆ ┆ 69 ┆ 23 ┆ ┆ │\n",
1159 | "│ std ┆ null ┆ 1.710749 ┆ 0.005536 ┆ … ┆ 5391.3826 ┆ 3066.2091 ┆ 0.418829 ┆ 1.506718 │\n",
1160 | "│ ┆ ┆ ┆ ┆ ┆ 56 ┆ 14 ┆ ┆ │\n",
1161 | "│ min ┆ null ┆ 0.0 ┆ 40.848371 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.376911 ┆ -1.0081e- │\n",
1162 | "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 14 │\n",
1163 | "│ 25% ┆ null ┆ 0.611231 ┆ 40.856748 ┆ … ┆ 5812.4568 ┆ 2761.0 ┆ 1.848439 ┆ 0.776866 │\n",
1164 | "│ ┆ ┆ ┆ ┆ ┆ 47 ┆ ┆ ┆ │\n",
1165 | "│ 50% ┆ null ┆ 1.324637 ┆ 40.859156 ┆ … ┆ 12239.471 ┆ 5424.0 ┆ 1.981257 ┆ 1.582638 │\n",
1166 | "│ ┆ ┆ ┆ ┆ ┆ 12 ┆ ┆ ┆ │\n",
1167 | "│ 75% ┆ null ┆ 2.424472 ┆ 40.86085 ┆ … ┆ 15086.449 ┆ 8068.0 ┆ 2.231922 ┆ 2.428585 │\n",
1168 | "│ ┆ ┆ ┆ ┆ ┆ 322 ┆ ┆ ┆ │\n",
1169 | "│ max ┆ null ┆ 15.32443 ┆ 40.879271 ┆ … ┆ 18646.176 ┆ 10684.0 ┆ 3.827216 ┆ 12.491514 │\n",
1170 | "│ ┆ ┆ ┆ ┆ ┆ 372 ┆ ┆ ┆ │\n",
1171 | "└────────────┴─────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
1172 | ]
1173 | },
1174 | "execution_count": 80,
1175 | "metadata": {},
1176 | "output_type": "execute_result"
1177 | }
1178 | ],
1179 | "source": [
1180 | "# chaining\n",
1181 | "(df\n",
1182 | " .select(cols)\n",
1183 | " .with_columns(pl.col('elapsed').cast(pl.Int16))\n",
1184 | " .describe() \n",
1185 | ")"
1186 | ]
1187 | },
1188 | {
1189 | "cell_type": "code",
1190 | "execution_count": 81,
1191 | "metadata": {
1192 | "lines_to_next_cell": 0,
1193 | "pycharm": {
1194 | "name": "#%%\n"
1195 | },
1196 | "scrolled": true
1197 | },
1198 | "outputs": [
1199 | {
1200 | "data": {
1201 | "text/plain": [
1202 | "1028660"
1203 | ]
1204 | },
1205 | "execution_count": 81,
1206 | "metadata": {},
1207 | "output_type": "execute_result"
1208 | }
1209 | ],
1210 | "source": [
1211 | "# chaining\n",
1212 | "(df\n",
1213 | " .select(cols)\n",
1214 | " .with_columns(pl.col('elapsed').cast(pl.Int16)) \n",
1215 | " .estimated_size()\n",
1216 | ")"
1217 | ]
1218 | },
1219 | {
1220 | "cell_type": "code",
1221 | "execution_count": 82,
1222 | "metadata": {},
1223 | "outputs": [
1224 | {
1225 | "data": {
1226 | "text/plain": [
1227 | "1091240"
1228 | ]
1229 | },
1230 | "execution_count": 82,
1231 | "metadata": {},
1232 | "output_type": "execute_result"
1233 | }
1234 | ],
1235 | "source": [
1236 | "(df\n",
1237 | " .select(cols)\n",
1238 | " .estimated_size()\n",
1239 | ")"
1240 | ]
1241 | },
1242 | {
1243 | "cell_type": "code",
1244 | "execution_count": null,
1245 | "metadata": {},
1246 | "outputs": [],
1247 | "source": []
1248 | },
1249 | {
1250 | "cell_type": "code",
1251 | "execution_count": null,
1252 | "metadata": {},
1253 | "outputs": [],
1254 | "source": []
1255 | },
1256 | {
1257 | "cell_type": "code",
1258 | "execution_count": null,
1259 | "metadata": {
1260 | "lines_to_next_cell": 2,
1261 | "pycharm": {
1262 | "name": "#%%\n"
1263 | }
1264 | },
1265 | "outputs": [],
1266 | "source": []
1267 | },
1268 | {
1269 | "cell_type": "code",
1270 | "execution_count": null,
1271 | "metadata": {
1272 | "lines_to_next_cell": 2,
1273 | "pycharm": {
1274 | "name": "#%%\n"
1275 | }
1276 | },
1277 | "outputs": [],
1278 | "source": []
1279 | },
1280 | {
1281 | "cell_type": "markdown",
1282 | "metadata": {
1283 | "pycharm": {
1284 | "name": "#%% md\n"
1285 | }
1286 | },
1287 | "source": [
1288 | "### Strings"
1289 | ]
1290 | },
1291 | {
1292 | "cell_type": "code",
1293 | "execution_count": 83,
1294 | "metadata": {
1295 | "pycharm": {
1296 | "name": "#%%\n"
1297 | },
1298 | "scrolled": true
1299 | },
1300 | "outputs": [
1301 | {
1302 | "data": {
1303 | "text/html": [
1304 | "\n",
1311 | "
shape: (0, 0) "
1312 | ],
1313 | "text/plain": [
1314 | "shape: (0, 0)\n",
1315 | "┌┐\n",
1316 | "╞╡\n",
1317 | "└┘"
1318 | ]
1319 | },
1320 | "execution_count": 83,
1321 | "metadata": {},
1322 | "output_type": "execute_result"
1323 | }
1324 | ],
1325 | "source": [
1326 | "df.select(cols).select(pl.col(pl.String))"
1327 | ]
1328 | },
1329 | {
1330 | "cell_type": "code",
1331 | "execution_count": 84,
1332 | "metadata": {
1333 | "scrolled": true
1334 | },
1335 | "outputs": [
1336 | {
1337 | "data": {
1338 | "text/html": [
1339 | "\n",
1346 | "
shape: (10_430, 14)| course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| str | f64 | f64 | f64 | datetime[μs, UTC] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 |
| "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 23:41:56 UTC | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null |
| "Maple Syrup" | 1.210883 | 40.879167 | -111.855181 | 2024-09-10 23:41:58 UTC | 1480.1 | 0.607503 | 1.210883 | 2 | 0.605442 | null | null | null | null |
| "Maple Syrup" | 1.227612 | 40.879172 | -111.855194 | 2024-09-10 23:41:59 UTC | 1480.1 | 1.227612 | 2.438495 | 3 | 0.812832 | null | null | null | null |
| "Maple Syrup" | 0.952238 | 40.879174 | -111.855205 | 2024-09-10 23:42:00 UTC | 1480.1 | 0.952238 | 3.390733 | 4 | 0.847683 | null | null | null | null |
| "Maple Syrup" | 0.90551 | 40.879177 | -111.855215 | 2024-09-10 23:42:01 UTC | 1480.1 | 0.90551 | 4.296243 | 5 | 0.859249 | 2.267271 | 2.8 | 0.80974 | null |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| "Maple Syrup" | 4.085147 | 40.857847 | -111.823899 | 2024-09-11 02:39:58 UTC | 1696.8 | 4.086371 | 18640.85135 | 10682 | 1.745071 | 18634.60004 | 10680.0 | 1.744813 | 3.177668 |
| "Maple Syrup" | 1.535726 | 40.857852 | -111.823916 | 2024-09-11 02:39:59 UTC | 1696.8 | 1.535726 | 18642.387076 | 10683 | 1.745052 | 18637.420852 | 10681.0 | 1.744913 | 2.821613 |
| "Maple Syrup" | 3.156134 | 40.857869 | -111.823946 | 2024-09-11 02:40:00 UTC | 1696.7 | 3.157718 | 18645.54321 | 10684 | 1.745184 | 18640.220738 | 10682.0 | 1.745012 | 2.800696 |
| "Maple Syrup" | 3.626488 | 40.857889 | -111.82398 | 2024-09-11 02:40:01 UTC | 1696.5 | 3.631999 | 18649.169698 | 10685 | 1.74536 | 18642.943507 | 10683.0 | 1.745104 | 2.724433 |
| "Maple Syrup" | 3.760829 | 40.857909 | -111.824016 | 2024-09-11 02:40:02 UTC | 1696.3 | 3.766144 | 18652.930528 | 10686 | 1.745548 | 18646.176372 | 10684.0 | 1.745243 | 3.235592 |
"
1347 | ],
1348 | "text/plain": [
1349 | "shape: (10_430, 14)\n",
1350 | "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n",
1351 | "│ course ┆ distance_ ┆ latitude ┆ longitude ┆ … ┆ rolling_t ┆ rolling_e ┆ rolling_v ┆ rolling_ │\n",
1352 | "│ --- ┆ 2d ┆ --- ┆ --- ┆ ┆ ravelled ┆ lapsed ┆ elocity ┆ between │\n",
1353 | "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
1354 | "│ ┆ f64 ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
1355 | "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n",
1356 | "│ Maple ┆ 0.0 ┆ 40.879161 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1357 | "│ Syrup ┆ ┆ ┆ 69 ┆ ┆ ┆ ┆ ┆ │\n",
1358 | "│ Maple ┆ 1.210883 ┆ 40.879167 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1359 | "│ Syrup ┆ ┆ ┆ 81 ┆ ┆ ┆ ┆ ┆ │\n",
1360 | "│ Maple ┆ 1.227612 ┆ 40.879172 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1361 | "│ Syrup ┆ ┆ ┆ 94 ┆ ┆ ┆ ┆ ┆ │\n",
1362 | "│ Maple ┆ 0.952238 ┆ 40.879174 ┆ -111.8552 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1363 | "│ Syrup ┆ ┆ ┆ 05 ┆ ┆ ┆ ┆ ┆ │\n",
1364 | "│ Maple ┆ 0.90551 ┆ 40.879177 ┆ -111.8552 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n",
1365 | "│ Syrup ┆ ┆ ┆ 15 ┆ ┆ ┆ ┆ ┆ │\n",
1366 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
1367 | "│ Maple ┆ 4.085147 ┆ 40.857847 ┆ -111.8238 ┆ … ┆ 18634.600 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n",
1368 | "│ Syrup ┆ ┆ ┆ 99 ┆ ┆ 04 ┆ ┆ ┆ │\n",
1369 | "│ Maple ┆ 1.535726 ┆ 40.857852 ┆ -111.8239 ┆ … ┆ 18637.420 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n",
1370 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ 852 ┆ ┆ ┆ │\n",
1371 | "│ Maple ┆ 3.156134 ┆ 40.857869 ┆ -111.8239 ┆ … ┆ 18640.220 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n",
1372 | "│ Syrup ┆ ┆ ┆ 46 ┆ ┆ 738 ┆ ┆ ┆ │\n",
1373 | "│ Maple ┆ 3.626488 ┆ 40.857889 ┆ -111.8239 ┆ … ┆ 18642.943 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n",
1374 | "│ Syrup ┆ ┆ ┆ 8 ┆ ┆ 507 ┆ ┆ ┆ │\n",
1375 | "│ Maple ┆ 3.760829 ┆ 40.857909 ┆ -111.8240 ┆ … ┆ 18646.176 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n",
1376 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ 372 ┆ ┆ ┆ │\n",
1377 | "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘"
1378 | ]
1379 | },
1380 | "execution_count": 84,
1381 | "metadata": {},
1382 | "output_type": "execute_result"
1383 | }
1384 | ],
1385 | "source": [
1386 | "# chaining\n",
1387 | "(df\n",
1388 | " .select(cols)\n",
1389 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1390 | " course=pl.lit('Maple Syrup'))\n",
1391 | ")"
1392 | ]
1393 | },
1394 | {
1395 | "cell_type": "code",
1396 | "execution_count": 85,
1397 | "metadata": {
1398 | "scrolled": true
1399 | },
1400 | "outputs": [
1401 | {
1402 | "data": {
1403 | "text/html": [
1404 | "\n",
1411 | "
shape: (10_430, 15)| course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between | alt_name |
|---|
| str | f64 | f64 | f64 | datetime[μs, UTC] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 | str |
| "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 23:41:56 UTC | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null | "MAPLE SYRUP" |
| "Maple Syrup" | 1.210883 | 40.879167 | -111.855181 | 2024-09-10 23:41:58 UTC | 1480.1 | 0.607503 | 1.210883 | 2 | 0.605442 | null | null | null | null | "MAPLE SYRUP" |
| "Maple Syrup" | 1.227612 | 40.879172 | -111.855194 | 2024-09-10 23:41:59 UTC | 1480.1 | 1.227612 | 2.438495 | 3 | 0.812832 | null | null | null | null | "MAPLE SYRUP" |
| "Maple Syrup" | 0.952238 | 40.879174 | -111.855205 | 2024-09-10 23:42:00 UTC | 1480.1 | 0.952238 | 3.390733 | 4 | 0.847683 | null | null | null | null | "MAPLE SYRUP" |
| "Maple Syrup" | 0.90551 | 40.879177 | -111.855215 | 2024-09-10 23:42:01 UTC | 1480.1 | 0.90551 | 4.296243 | 5 | 0.859249 | 2.267271 | 2.8 | 0.80974 | null | "MAPLE SYRUP" |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| "Maple Syrup" | 4.085147 | 40.857847 | -111.823899 | 2024-09-11 02:39:58 UTC | 1696.8 | 4.086371 | 18640.85135 | 10682 | 1.745071 | 18634.60004 | 10680.0 | 1.744813 | 3.177668 | "MAPLE SYRUP" |
| "Maple Syrup" | 1.535726 | 40.857852 | -111.823916 | 2024-09-11 02:39:59 UTC | 1696.8 | 1.535726 | 18642.387076 | 10683 | 1.745052 | 18637.420852 | 10681.0 | 1.744913 | 2.821613 | "MAPLE SYRUP" |
| "Maple Syrup" | 3.156134 | 40.857869 | -111.823946 | 2024-09-11 02:40:00 UTC | 1696.7 | 3.157718 | 18645.54321 | 10684 | 1.745184 | 18640.220738 | 10682.0 | 1.745012 | 2.800696 | "MAPLE SYRUP" |
| "Maple Syrup" | 3.626488 | 40.857889 | -111.82398 | 2024-09-11 02:40:01 UTC | 1696.5 | 3.631999 | 18649.169698 | 10685 | 1.74536 | 18642.943507 | 10683.0 | 1.745104 | 2.724433 | "MAPLE SYRUP" |
| "Maple Syrup" | 3.760829 | 40.857909 | -111.824016 | 2024-09-11 02:40:02 UTC | 1696.3 | 3.766144 | 18652.930528 | 10686 | 1.745548 | 18646.176372 | 10684.0 | 1.745243 | 3.235592 | "MAPLE SYRUP" |
"
1412 | ],
1413 | "text/plain": [
1414 | "shape: (10_430, 15)\n",
1415 | "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n",
1416 | "│ course ┆ distance_ ┆ latitude ┆ longitude ┆ … ┆ rolling_e ┆ rolling_v ┆ rolling_b ┆ alt_name │\n",
1417 | "│ --- ┆ 2d ┆ --- ┆ --- ┆ ┆ lapsed ┆ elocity ┆ etween ┆ --- │\n",
1418 | "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ str │\n",
1419 | "│ ┆ f64 ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ │\n",
1420 | "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n",
1421 | "│ Maple ┆ 0.0 ┆ 40.879161 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ MAPLE │\n",
1422 | "│ Syrup ┆ ┆ ┆ 69 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1423 | "│ Maple ┆ 1.210883 ┆ 40.879167 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ MAPLE │\n",
1424 | "│ Syrup ┆ ┆ ┆ 81 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1425 | "│ Maple ┆ 1.227612 ┆ 40.879172 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ MAPLE │\n",
1426 | "│ Syrup ┆ ┆ ┆ 94 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1427 | "│ Maple ┆ 0.952238 ┆ 40.879174 ┆ -111.8552 ┆ … ┆ null ┆ null ┆ null ┆ MAPLE │\n",
1428 | "│ Syrup ┆ ┆ ┆ 05 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1429 | "│ Maple ┆ 0.90551 ┆ 40.879177 ┆ -111.8552 ┆ … ┆ 2.8 ┆ 0.80974 ┆ null ┆ MAPLE │\n",
1430 | "│ Syrup ┆ ┆ ┆ 15 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1431 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
1432 | "│ Maple ┆ 4.085147 ┆ 40.857847 ┆ -111.8238 ┆ … ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 ┆ MAPLE │\n",
1433 | "│ Syrup ┆ ┆ ┆ 99 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1434 | "│ Maple ┆ 1.535726 ┆ 40.857852 ┆ -111.8239 ┆ … ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 ┆ MAPLE │\n",
1435 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1436 | "│ Maple ┆ 3.156134 ┆ 40.857869 ┆ -111.8239 ┆ … ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 ┆ MAPLE │\n",
1437 | "│ Syrup ┆ ┆ ┆ 46 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1438 | "│ Maple ┆ 3.626488 ┆ 40.857889 ┆ -111.8239 ┆ … ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 ┆ MAPLE │\n",
1439 | "│ Syrup ┆ ┆ ┆ 8 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1440 | "│ Maple ┆ 3.760829 ┆ 40.857909 ┆ -111.8240 ┆ … ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 ┆ MAPLE │\n",
1441 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ ┆ ┆ ┆ SYRUP │\n",
1442 | "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘"
1443 | ]
1444 | },
1445 | "execution_count": 85,
1446 | "metadata": {},
1447 | "output_type": "execute_result"
1448 | }
1449 | ],
1450 | "source": [
1451 | "# an example of a string operation\n",
1452 | "(df\n",
1453 | " .select(cols)\n",
1454 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1455 | " course=pl.lit('Maple Syrup'))\n",
1456 | " .with_columns(alt_name=pl.col('course').str.to_uppercase()) \n",
1457 | " \n",
1458 | ")"
1459 | ]
1460 | },
1461 | {
1462 | "cell_type": "code",
1463 | "execution_count": 86,
1464 | "metadata": {},
1465 | "outputs": [
1466 | {
1467 | "name": "stdout",
1468 | "output_type": "stream",
1469 | "text": [
1470 | "['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_accessor', '_pyexpr', 'concat', 'contains', 'contains_any', 'count_matches', 'decode', 'encode', 'ends_with', 'escape_regex', 'explode', 'extract', 'extract_all', 'extract_groups', 'extract_many', 'find', 'head', 'join', 'json_decode', 'json_path_match', 'len_bytes', 'len_chars', 'pad_end', 'pad_start', 'replace', 'replace_all', 'replace_many', 'reverse', 'slice', 'split', 'split_exact', 'splitn', 'starts_with', 'strip_chars', 'strip_chars_end', 'strip_chars_start', 'strip_prefix', 'strip_suffix', 'strptime', 'tail', 'to_date', 'to_datetime', 'to_decimal', 'to_integer', 'to_lowercase', 'to_time', 'to_titlecase', 'to_uppercase', 'zfill']\n"
1471 | ]
1472 | }
1473 | ],
1474 | "source": [
1475 | "# a bunch of string methods off of .str\n",
1476 | "# note that the spelling might be different from python/pandas\n",
1477 | "col = pl.col('')\n",
1478 | "print(dir(col.str))"
1479 | ]
1480 | },
1481 | {
1482 | "cell_type": "code",
1483 | "execution_count": 87,
1484 | "metadata": {
1485 | "lines_to_next_cell": 0
1486 | },
1487 | "outputs": [],
1488 | "source": [
1489 | "col.str.to_uppercase?"
1490 | ]
1491 | },
1492 | {
1493 | "cell_type": "code",
1494 | "execution_count": null,
1495 | "metadata": {},
1496 | "outputs": [],
1497 | "source": []
1498 | },
1499 | {
1500 | "cell_type": "code",
1501 | "execution_count": null,
1502 | "metadata": {},
1503 | "outputs": [],
1504 | "source": []
1505 | },
1506 | {
1507 | "cell_type": "markdown",
1508 | "metadata": {},
1509 | "source": [
1510 | "## Convert Date to Local Time"
1511 | ]
1512 | },
1513 | {
1514 | "cell_type": "code",
1515 | "execution_count": 88,
1516 | "metadata": {},
1517 | "outputs": [
1518 | {
1519 | "data": {
1520 | "text/html": [
1521 | "\n",
1528 | "
shape: (10_430, 14)| course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| cat | f64 | f64 | f64 | datetime[μs, America/Denver] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 |
| "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 17:41:56 MDT | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null |
| "Maple Syrup" | 1.210883 | 40.879167 | -111.855181 | 2024-09-10 17:41:58 MDT | 1480.1 | 0.607503 | 1.210883 | 2 | 0.605442 | null | null | null | null |
| "Maple Syrup" | 1.227612 | 40.879172 | -111.855194 | 2024-09-10 17:41:59 MDT | 1480.1 | 1.227612 | 2.438495 | 3 | 0.812832 | null | null | null | null |
| "Maple Syrup" | 0.952238 | 40.879174 | -111.855205 | 2024-09-10 17:42:00 MDT | 1480.1 | 0.952238 | 3.390733 | 4 | 0.847683 | null | null | null | null |
| "Maple Syrup" | 0.90551 | 40.879177 | -111.855215 | 2024-09-10 17:42:01 MDT | 1480.1 | 0.90551 | 4.296243 | 5 | 0.859249 | 2.267271 | 2.8 | 0.80974 | null |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| "Maple Syrup" | 4.085147 | 40.857847 | -111.823899 | 2024-09-10 20:39:58 MDT | 1696.8 | 4.086371 | 18640.85135 | 10682 | 1.745071 | 18634.60004 | 10680.0 | 1.744813 | 3.177668 |
| "Maple Syrup" | 1.535726 | 40.857852 | -111.823916 | 2024-09-10 20:39:59 MDT | 1696.8 | 1.535726 | 18642.387076 | 10683 | 1.745052 | 18637.420852 | 10681.0 | 1.744913 | 2.821613 |
| "Maple Syrup" | 3.156134 | 40.857869 | -111.823946 | 2024-09-10 20:40:00 MDT | 1696.7 | 3.157718 | 18645.54321 | 10684 | 1.745184 | 18640.220738 | 10682.0 | 1.745012 | 2.800696 |
| "Maple Syrup" | 3.626488 | 40.857889 | -111.82398 | 2024-09-10 20:40:01 MDT | 1696.5 | 3.631999 | 18649.169698 | 10685 | 1.74536 | 18642.943507 | 10683.0 | 1.745104 | 2.724433 |
| "Maple Syrup" | 3.760829 | 40.857909 | -111.824016 | 2024-09-10 20:40:02 MDT | 1696.3 | 3.766144 | 18652.930528 | 10686 | 1.745548 | 18646.176372 | 10684.0 | 1.745243 | 3.235592 |
"
1529 | ],
1530 | "text/plain": [
1531 | "shape: (10_430, 14)\n",
1532 | "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n",
1533 | "│ course ┆ distance_ ┆ latitude ┆ longitude ┆ … ┆ rolling_t ┆ rolling_e ┆ rolling_v ┆ rolling_ │\n",
1534 | "│ --- ┆ 2d ┆ --- ┆ --- ┆ ┆ ravelled ┆ lapsed ┆ elocity ┆ between │\n",
1535 | "│ cat ┆ --- ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
1536 | "│ ┆ f64 ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
1537 | "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n",
1538 | "│ Maple ┆ 0.0 ┆ 40.879161 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1539 | "│ Syrup ┆ ┆ ┆ 69 ┆ ┆ ┆ ┆ ┆ │\n",
1540 | "│ Maple ┆ 1.210883 ┆ 40.879167 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1541 | "│ Syrup ┆ ┆ ┆ 81 ┆ ┆ ┆ ┆ ┆ │\n",
1542 | "│ Maple ┆ 1.227612 ┆ 40.879172 ┆ -111.8551 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1543 | "│ Syrup ┆ ┆ ┆ 94 ┆ ┆ ┆ ┆ ┆ │\n",
1544 | "│ Maple ┆ 0.952238 ┆ 40.879174 ┆ -111.8552 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1545 | "│ Syrup ┆ ┆ ┆ 05 ┆ ┆ ┆ ┆ ┆ │\n",
1546 | "│ Maple ┆ 0.90551 ┆ 40.879177 ┆ -111.8552 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n",
1547 | "│ Syrup ┆ ┆ ┆ 15 ┆ ┆ ┆ ┆ ┆ │\n",
1548 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
1549 | "│ Maple ┆ 4.085147 ┆ 40.857847 ┆ -111.8238 ┆ … ┆ 18634.600 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n",
1550 | "│ Syrup ┆ ┆ ┆ 99 ┆ ┆ 04 ┆ ┆ ┆ │\n",
1551 | "│ Maple ┆ 1.535726 ┆ 40.857852 ┆ -111.8239 ┆ … ┆ 18637.420 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n",
1552 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ 852 ┆ ┆ ┆ │\n",
1553 | "│ Maple ┆ 3.156134 ┆ 40.857869 ┆ -111.8239 ┆ … ┆ 18640.220 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n",
1554 | "│ Syrup ┆ ┆ ┆ 46 ┆ ┆ 738 ┆ ┆ ┆ │\n",
1555 | "│ Maple ┆ 3.626488 ┆ 40.857889 ┆ -111.8239 ┆ … ┆ 18642.943 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n",
1556 | "│ Syrup ┆ ┆ ┆ 8 ┆ ┆ 507 ┆ ┆ ┆ │\n",
1557 | "│ Maple ┆ 3.760829 ┆ 40.857909 ┆ -111.8240 ┆ … ┆ 18646.176 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n",
1558 | "│ Syrup ┆ ┆ ┆ 16 ┆ ┆ 372 ┆ ┆ ┆ │\n",
1559 | "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘"
1560 | ]
1561 | },
1562 | "execution_count": 88,
1563 | "metadata": {},
1564 | "output_type": "execute_result"
1565 | }
1566 | ],
1567 | "source": [
1568 | "(df\n",
1569 | " .select(cols)\n",
1570 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1571 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
1572 | " time=pl.col('time').dt.convert_time_zone('America/Denver')\n",
1573 | " )\n",
1574 | ")"
1575 | ]
1576 | },
1577 | {
1578 | "cell_type": "code",
1579 | "execution_count": 89,
1580 | "metadata": {},
1581 | "outputs": [
1582 | {
1583 | "name": "stdout",
1584 | "output_type": "stream",
1585 | "text": [
1586 | "['__abs__', '__add__', '__and__', '__annotations__', '__array_ufunc__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__invert__', '__le__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_from_pyexpr', '_map_batches_wrapper', '_pyexpr', '_repr_html_', 'abs', 'add', 'agg_groups', 'alias', 'all', 'and_', 'any', 'append', 'approx_n_unique', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg_max', 'arg_min', 'arg_sort', 'arg_true', 'arg_unique', 'arr', 'backward_fill', 'bin', 'bitwise_and', 'bitwise_count_ones', 'bitwise_count_zeros', 'bitwise_leading_ones', 'bitwise_leading_zeros', 'bitwise_or', 'bitwise_trailing_ones', 'bitwise_trailing_zeros', 'bitwise_xor', 'bottom_k', 'bottom_k_by', 'cast', 'cat', 'cbrt', 'ceil', 'clip', 'cos', 'cosh', 'cot', 'count', 'cum_count', 'cum_max', 'cum_min', 'cum_prod', 'cum_sum', 'cumulative_eval', 'cut', 'degrees', 'deserialize', 'diff', 'dot', 'drop_nans', 'drop_nulls', 'dt', 'entropy', 'eq', 'eq_missing', 'ewm_mean', 'ewm_mean_by', 'ewm_std', 'ewm_var', 'exclude', 'exp', 'explode', 'extend_constant', 'fill_nan', 'fill_null', 'filter', 'first', 'flatten', 'floor', 'floordiv', 'forward_fill', 'from_json', 'gather', 'gather_every', 'ge', 'get', 'gt', 'has_nulls', 'hash', 'head', 'hist', 'implode', 'inspect', 'interpolate', 'interpolate_by', 'is_between', 'is_duplicated', 'is_finite', 'is_first_distinct', 'is_in', 'is_infinite', 'is_last_distinct', 'is_nan', 'is_not_nan', 
'is_not_null', 'is_null', 'is_unique', 'kurtosis', 'last', 'le', 'len', 'limit', 'list', 'log', 'log10', 'log1p', 'lower_bound', 'lt', 'map_batches', 'map_elements', 'max', 'mean', 'median', 'meta', 'min', 'mod', 'mode', 'mul', 'n_unique', 'name', 'nan_max', 'nan_min', 'ne', 'ne_missing', 'neg', 'not_', 'null_count', 'or_', 'over', 'pct_change', 'peak_max', 'peak_min', 'pipe', 'pow', 'product', 'qcut', 'quantile', 'radians', 'rank', 'rechunk', 'register_plugin', 'reinterpret', 'repeat_by', 'replace', 'replace_strict', 'reshape', 'reverse', 'rle', 'rle_id', 'rolling', 'rolling_map', 'rolling_max', 'rolling_max_by', 'rolling_mean', 'rolling_mean_by', 'rolling_median', 'rolling_median_by', 'rolling_min', 'rolling_min_by', 'rolling_quantile', 'rolling_quantile_by', 'rolling_skew', 'rolling_std', 'rolling_std_by', 'rolling_sum', 'rolling_sum_by', 'rolling_var', 'rolling_var_by', 'round', 'round_sig_figs', 'sample', 'search_sorted', 'set_sorted', 'shift', 'shrink_dtype', 'shuffle', 'sign', 'sin', 'sinh', 'skew', 'slice', 'sort', 'sort_by', 'sqrt', 'std', 'str', 'struct', 'sub', 'sum', 'tail', 'tan', 'tanh', 'to_physical', 'top_k', 'top_k_by', 'truediv', 'unique', 'unique_counts', 'upper_bound', 'value_counts', 'var', 'where', 'xor']\n"
1587 | ]
1588 | }
1589 | ],
1590 | "source": [
1591 | "col = pl.col('time')\n",
1592 | "print(dir(col))"
1593 | ]
1594 | },
1595 | {
1596 | "cell_type": "code",
1597 | "execution_count": 90,
1598 | "metadata": {},
1599 | "outputs": [
1600 | {
1601 | "name": "stdout",
1602 | "output_type": "stream",
1603 | "text": [
1604 | "268\n"
1605 | ]
1606 | }
1607 | ],
1608 | "source": [
1609 | "print(len(dir(col)))"
1610 | ]
1611 | },
1612 | {
1613 | "cell_type": "code",
1614 | "execution_count": 91,
1615 | "metadata": {},
1616 | "outputs": [
1617 | {
1618 | "name": "stdout",
1619 | "output_type": "stream",
1620 | "text": [
1621 | "['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_accessor', '_pyexpr', 'add_business_days', 'base_utc_offset', 'cast_time_unit', 'century', 'combine', 'convert_time_zone', 'date', 'datetime', 'day', 'dst_offset', 'epoch', 'hour', 'is_leap_year', 'iso_year', 'microsecond', 'millennium', 'millisecond', 'minute', 'month', 'month_end', 'month_start', 'nanosecond', 'offset_by', 'ordinal_day', 'quarter', 'replace_time_zone', 'round', 'second', 'strftime', 'time', 'timestamp', 'to_string', 'total_days', 'total_hours', 'total_microseconds', 'total_milliseconds', 'total_minutes', 'total_nanoseconds', 'total_seconds', 'truncate', 'week', 'weekday', 'with_time_unit', 'year']\n"
1622 | ]
1623 | }
1624 | ],
1625 | "source": [
1626 | "print(dir(col.dt))"
1627 | ]
1628 | },
1629 | {
1630 | "cell_type": "code",
1631 | "execution_count": 92,
1632 | "metadata": {},
1633 | "outputs": [
1634 | {
1635 | "name": "stdout",
1636 | "output_type": "stream",
1637 | "text": [
1638 | "72\n"
1639 | ]
1640 | }
1641 | ],
1642 | "source": [
1643 | "print(len(dir(col.dt)))"
1644 | ]
1645 | },
1646 | {
1647 | "cell_type": "code",
1648 | "execution_count": null,
1649 | "metadata": {},
1650 | "outputs": [],
1651 | "source": []
1652 | },
1653 | {
1654 | "cell_type": "code",
1655 | "execution_count": null,
1656 | "metadata": {},
1657 | "outputs": [],
1658 | "source": []
1659 | },
1660 | {
1661 | "cell_type": "code",
1662 | "execution_count": null,
1663 | "metadata": {},
1664 | "outputs": [],
1665 | "source": []
1666 | },
1667 | {
1668 | "cell_type": "code",
1669 | "execution_count": null,
1670 | "metadata": {},
1671 | "outputs": [],
1672 | "source": []
1673 | },
1674 | {
1675 | "cell_type": "markdown",
1676 | "metadata": {},
1677 | "source": [
1678 | "## Missing Data\n",
1679 | "\n",
1680 | "- Use `.fill_null` to replace missing (null) values\n",
1681 | "- Use `.filter` to filter rows\n",
1682 | "- Use `.select` to select columns\n",
1683 | "\n",
1684 | "To view rows with missing data use `.filter(pl.col(\"col_name\").is_null())`"
1685 | ]
1686 | },
1687 | {
1688 | "cell_type": "code",
1689 | "execution_count": 93,
1690 | "metadata": {},
1691 | "outputs": [
1692 | {
1693 | "data": {
1694 | "text/html": [
1695 | "\n",
1702 | "
shape: (1, 14)| course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | 4 | 4 | 5 |
"
1703 | ],
1704 | "text/plain": [
1705 | "shape: (1, 14)\n",
1706 | "┌────────┬────────────┬──────────┬───────────┬───┬────────────┬────────────┬───────────┬───────────┐\n",
1707 | "│ course ┆ distance_2 ┆ latitude ┆ longitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_v ┆ rolling_b │\n",
1708 | "│ --- ┆ d ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ elocity ┆ etween │\n",
1709 | "│ u32 ┆ --- ┆ u32 ┆ u32 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
1710 | "│ ┆ u32 ┆ ┆ ┆ ┆ u32 ┆ u32 ┆ u32 ┆ u32 │\n",
1711 | "╞════════╪════════════╪══════════╪═══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡\n",
1712 | "│ 0 ┆ 0 ┆ 0 ┆ 0 ┆ … ┆ 4 ┆ 4 ┆ 4 ┆ 5 │\n",
1713 | "└────────┴────────────┴──────────┴───────────┴───┴────────────┴────────────┴───────────┴───────────┘"
1714 | ]
1715 | },
1716 | "execution_count": 93,
1717 | "metadata": {},
1718 | "output_type": "execute_result"
1719 | }
1720 | ],
1721 | "source": [
1722 | "(df\n",
1723 | " .select(cols)\n",
1724 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1725 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
1726 | " time=pl.col('time').dt.convert_time_zone('America/Denver'))\n",
1727 | " .null_count()\n",
1728 | ")"
1729 | ]
1730 | },
1731 | {
1732 | "cell_type": "code",
1733 | "execution_count": 94,
1734 | "metadata": {
1735 | "collapsed": true,
1736 | "jupyter": {
1737 | "outputs_hidden": true
1738 | }
1739 | },
1740 | "outputs": [
1741 | {
1742 | "data": {
1743 | "text/html": [
1744 | "\n",
1751 | "
shape: (10_430, 1)| rolling_between |
|---|
| bool |
| true |
| true |
| true |
| true |
| true |
| … |
| false |
| false |
| false |
| false |
| false |
"
1752 | ],
1753 | "text/plain": [
1754 | "shape: (10_430, 1)\n",
1755 | "┌─────────────────┐\n",
1756 | "│ rolling_between │\n",
1757 | "│ --- │\n",
1758 | "│ bool │\n",
1759 | "╞═════════════════╡\n",
1760 | "│ true │\n",
1761 | "│ true │\n",
1762 | "│ true │\n",
1763 | "│ true │\n",
1764 | "│ true │\n",
1765 | "│ … │\n",
1766 | "│ false │\n",
1767 | "│ false │\n",
1768 | "│ false │\n",
1769 | "│ false │\n",
1770 | "│ false │\n",
1771 | "└─────────────────┘"
1772 | ]
1773 | },
1774 | "execution_count": 94,
1775 | "metadata": {},
1776 | "output_type": "execute_result"
1777 | }
1778 | ],
1779 | "source": [
1780 | "# use .select with .is_null() to get a boolean mask of the rows where rolling_between is missing\n",
1781 | "(df\n",
1782 | " .select(cols)\n",
1783 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1784 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
1785 | " time=pl.col('time').dt.convert_time_zone('America/Denver'))\n",
1786 | " .select(pl.col('rolling_between').is_null())\n",
1787 | ")"
1788 | ]
1789 | },
1790 | {
1791 | "cell_type": "code",
1792 | "execution_count": 95,
1793 | "metadata": {},
1794 | "outputs": [
1795 | {
1796 | "data": {
1797 | "text/html": [
1798 | "\n",
1805 | "
shape: (5, 15)| index | course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| u32 | cat | f64 | f64 | f64 | datetime[μs, America/Denver] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 |
| 0 | "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 17:41:56 MDT | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null |
| 1 | "Maple Syrup" | 1.210883 | 40.879167 | -111.855181 | 2024-09-10 17:41:58 MDT | 1480.1 | 0.607503 | 1.210883 | 2 | 0.605442 | null | null | null | null |
| 2 | "Maple Syrup" | 1.227612 | 40.879172 | -111.855194 | 2024-09-10 17:41:59 MDT | 1480.1 | 1.227612 | 2.438495 | 3 | 0.812832 | null | null | null | null |
| 3 | "Maple Syrup" | 0.952238 | 40.879174 | -111.855205 | 2024-09-10 17:42:00 MDT | 1480.1 | 0.952238 | 3.390733 | 4 | 0.847683 | null | null | null | null |
| 4 | "Maple Syrup" | 0.90551 | 40.879177 | -111.855215 | 2024-09-10 17:42:01 MDT | 1480.1 | 0.90551 | 4.296243 | 5 | 0.859249 | 2.267271 | 2.8 | 0.80974 | null |
"
1806 | ],
1807 | "text/plain": [
1808 | "shape: (5, 15)\n",
1809 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n",
1810 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n",
1811 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n",
1812 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
1813 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
1814 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n",
1815 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1816 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
1817 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1818 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
1819 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1820 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
1821 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1822 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
1823 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n",
1824 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
1825 | "└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘"
1826 | ]
1827 | },
1828 | "execution_count": 95,
1829 | "metadata": {},
1830 | "output_type": "execute_result"
1831 | }
1832 | ],
1833 | "source": [
1834 | "# change .select to .filter to view the rows\n",
1835 | "(df\n",
1836 | " .select(cols)\n",
1837 | " .with_row_index()\n",
1838 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1839 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
1840 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n",
1841 | " .filter(pl.col('rolling_between').is_null())\n",
1842 | ")"
1843 | ]
1844 | },
1845 | {
1846 | "cell_type": "code",
1847 | "execution_count": 96,
1848 | "metadata": {},
1849 | "outputs": [
1850 | {
1851 | "data": {
1852 | "text/html": [
1853 | "\n",
1860 | "
shape: (1, 12)| distance_2d | latitude | longitude | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
"
1861 | ],
1862 | "text/plain": [
1863 | "shape: (1, 12)\n",
1864 | "┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n",
1865 | "│ distance_ ┆ latitude ┆ longitude ┆ elevation ┆ … ┆ rolling_t ┆ rolling_e ┆ rolling_v ┆ rolling_b │\n",
1866 | "│ 2d ┆ --- ┆ --- ┆ --- ┆ ┆ ravelled ┆ lapsed ┆ elocity ┆ etween │\n",
1867 | "│ --- ┆ u32 ┆ u32 ┆ u32 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
1868 | "│ u32 ┆ ┆ ┆ ┆ ┆ u32 ┆ u32 ┆ u32 ┆ u32 │\n",
1869 | "╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n",
1870 | "│ 0 ┆ 0 ┆ 0 ┆ 0 ┆ … ┆ 0 ┆ 0 ┆ 0 ┆ 0 │\n",
1871 | "└───────────┴──────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
1872 | ]
1873 | },
1874 | "execution_count": 96,
1875 | "metadata": {},
1876 | "output_type": "execute_result"
1877 | }
1878 | ],
1879 | "source": [
1880 | "# what about nans?\n",
1881 | "# note that nan and null are different in polars\n",
1882 | "# nan means not a number\n",
1883 | "# null means missing data\n",
1884 | "(df\n",
1885 | " .select(cols)\n",
1886 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1887 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
1888 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n",
1889 | " .select(cs.numeric().is_nan().sum())\n",
1890 | ")"
1891 | ]
1892 | },
1893 | {
1894 | "cell_type": "code",
1895 | "execution_count": 97,
1896 | "metadata": {},
1897 | "outputs": [
1898 | {
1899 | "data": {
1900 | "text/html": [
1901 | "\n",
1908 | "
shape: (10_430, 1)| avg_velocity |
|---|
| bool |
| true |
| false |
| false |
| false |
| false |
| … |
| false |
| false |
| false |
| false |
| false |
"
1909 | ],
1910 | "text/plain": [
1911 | "shape: (10_430, 1)\n",
1912 | "┌──────────────┐\n",
1913 | "│ avg_velocity │\n",
1914 | "│ --- │\n",
1915 | "│ bool │\n",
1916 | "╞══════════════╡\n",
1917 | "│ true │\n",
1918 | "│ false │\n",
1919 | "│ false │\n",
1920 | "│ false │\n",
1921 | "│ false │\n",
1922 | "│ … │\n",
1923 | "│ false │\n",
1924 | "│ false │\n",
1925 | "│ false │\n",
1926 | "│ false │\n",
1927 | "│ false │\n",
1928 | "└──────────────┘"
1929 | ]
1930 | },
1931 | "execution_count": 97,
1932 | "metadata": {},
1933 | "output_type": "execute_result"
1934 | }
1935 | ],
1936 | "source": [
1937 | "(df\n",
1938 | " .select(cols)\n",
1939 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1940 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
1941 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n",
1942 | " .select(pl.col('avg_velocity').is_nan())\n",
1943 | ")"
1944 | ]
1945 | },
1946 | {
1947 | "cell_type": "code",
1948 | "execution_count": 98,
1949 | "metadata": {
1950 | "scrolled": true
1951 | },
1952 | "outputs": [
1953 | {
1954 | "data": {
1955 | "text/html": [
1956 | "\n",
1963 | "
shape: (1, 15)| index | course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| u32 | cat | f64 | f64 | f64 | datetime[μs, America/Denver] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 |
| 0 | "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 17:41:56 MDT | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null |
"
1964 | ],
1965 | "text/plain": [
1966 | "shape: (1, 15)\n",
1967 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n",
1968 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n",
1969 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n",
1970 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
1971 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
1972 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n",
1973 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
1974 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
1975 | "└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘"
1976 | ]
1977 | },
1978 | "execution_count": 98,
1979 | "metadata": {},
1980 | "output_type": "execute_result"
1981 | }
1982 | ],
1983 | "source": [
1984 | "(df\n",
1985 | " .select(cols)\n",
1986 | " .with_row_index() \n",
1987 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
1988 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
1989 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n",
1990 | " .filter(pl.col('avg_velocity').is_nan())\n",
1991 | ")"
1992 | ]
1993 | },
1994 | {
1995 | "cell_type": "code",
1996 | "execution_count": 99,
1997 | "metadata": {},
1998 | "outputs": [
1999 | {
2000 | "data": {
2001 | "text/html": [
2002 | "\n",
2009 | "
shape: (10_430, 15)| index | course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| u32 | cat | f64 | f64 | f64 | datetime[μs, America/Denver] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 |
| 0 | "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 17:41:56 MDT | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null |
| 1 | "Maple Syrup" | 1.210883 | 40.879167 | -111.855181 | 2024-09-10 17:41:58 MDT | 1480.1 | 0.607503 | 1.210883 | 2 | 0.605442 | null | null | null | null |
| 2 | "Maple Syrup" | 1.227612 | 40.879172 | -111.855194 | 2024-09-10 17:41:59 MDT | 1480.1 | 1.227612 | 2.438495 | 3 | 0.812832 | null | null | null | null |
| 3 | "Maple Syrup" | 0.952238 | 40.879174 | -111.855205 | 2024-09-10 17:42:00 MDT | 1480.1 | 0.952238 | 3.390733 | 4 | 0.847683 | null | null | null | null |
| 4 | "Maple Syrup" | 0.90551 | 40.879177 | -111.855215 | 2024-09-10 17:42:01 MDT | 1480.1 | 0.90551 | 4.296243 | 5 | 0.859249 | 2.267271 | 2.8 | 0.80974 | null |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 10425 | "Maple Syrup" | 4.085147 | 40.857847 | -111.823899 | 2024-09-10 20:39:58 MDT | 1696.8 | 4.086371 | 18640.85135 | 10682 | 1.745071 | 18634.60004 | 10680.0 | 1.744813 | 3.177668 |
| 10426 | "Maple Syrup" | 1.535726 | 40.857852 | -111.823916 | 2024-09-10 20:39:59 MDT | 1696.8 | 1.535726 | 18642.387076 | 10683 | 1.745052 | 18637.420852 | 10681.0 | 1.744913 | 2.821613 |
| 10427 | "Maple Syrup" | 3.156134 | 40.857869 | -111.823946 | 2024-09-10 20:40:00 MDT | 1696.7 | 3.157718 | 18645.54321 | 10684 | 1.745184 | 18640.220738 | 10682.0 | 1.745012 | 2.800696 |
| 10428 | "Maple Syrup" | 3.626488 | 40.857889 | -111.82398 | 2024-09-10 20:40:01 MDT | 1696.5 | 3.631999 | 18649.169698 | 10685 | 1.74536 | 18642.943507 | 10683.0 | 1.745104 | 2.724433 |
| 10429 | "Maple Syrup" | 3.760829 | 40.857909 | -111.824016 | 2024-09-10 20:40:02 MDT | 1696.3 | 3.766144 | 18652.930528 | 10686 | 1.745548 | 18646.176372 | 10684.0 | 1.745243 | 3.235592 |
"
2010 | ],
2011 | "text/plain": [
2012 | "shape: (10_430, 15)\n",
2013 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n",
2014 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n",
2015 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n",
2016 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
2017 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
2018 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n",
2019 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2020 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2021 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2022 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2023 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2024 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2025 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2026 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2027 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n",
2028 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2029 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
2030 | "│ 10425 ┆ Maple ┆ 4.085147 ┆ 40.857847 ┆ … ┆ 18634.6000 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n",
2031 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 4 ┆ ┆ ┆ │\n",
2032 | "│ 10426 ┆ Maple ┆ 1.535726 ┆ 40.857852 ┆ … ┆ 18637.4208 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n",
2033 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 52 ┆ ┆ ┆ │\n",
2034 | "│ 10427 ┆ Maple ┆ 3.156134 ┆ 40.857869 ┆ … ┆ 18640.2207 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n",
2035 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 38 ┆ ┆ ┆ │\n",
2036 | "│ 10428 ┆ Maple ┆ 3.626488 ┆ 40.857889 ┆ … ┆ 18642.9435 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n",
2037 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 07 ┆ ┆ ┆ │\n",
2038 | "│ 10429 ┆ Maple ┆ 3.760829 ┆ 40.857909 ┆ … ┆ 18646.1763 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n",
2039 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 72 ┆ ┆ ┆ │\n",
2040 | "└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘"
2041 | ]
2042 | },
2043 | "execution_count": 99,
2044 | "metadata": {},
2045 | "output_type": "execute_result"
2046 | }
2047 | ],
2048 | "source": [
2049 | "# a glorious function\n",
2050 | "\n",
2051 | "def tweak_gpx(df_):\n",
2052 | " return (df_\n",
2053 | " .select(cols)\n",
2054 | " .with_row_index() \n",
2055 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
2056 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
2057 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n",
2058 | " )\n",
2059 | "\n",
2060 | "tweak_gpx(df)"
2061 | ]
2062 | },
2063 | {
2064 | "cell_type": "code",
2065 | "execution_count": null,
2066 | "metadata": {
2067 | "lines_to_next_cell": 2,
2068 | "pycharm": {
2069 | "name": "#%%\n"
2070 | }
2071 | },
2072 | "outputs": [],
2073 | "source": []
2074 | },
2075 | {
2076 | "cell_type": "code",
2077 | "execution_count": null,
2078 | "metadata": {
2079 | "lines_to_next_cell": 0,
2080 | "pycharm": {
2081 | "name": "#%%\n"
2082 | }
2083 | },
2084 | "outputs": [],
2085 | "source": []
2086 | },
2087 | {
2088 | "cell_type": "code",
2089 | "execution_count": null,
2090 | "metadata": {},
2091 | "outputs": [],
2092 | "source": []
2093 | },
2094 | {
2095 | "cell_type": "code",
2096 | "execution_count": null,
2097 | "metadata": {},
2098 | "outputs": [],
2099 | "source": []
2100 | },
2101 | {
2102 | "cell_type": "markdown",
2103 | "metadata": {
2104 | "pycharm": {
2105 | "name": "#%% md\n"
2106 | }
2107 | },
2108 | "source": [
2109 | "## Chain\n",
2110 | "\n",
2111 | "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n",
2112 | "\n",
2113 | "The chain should read like a recipe of ordered steps.\n",
2114 | "\n",
2115 | "(BTW, this is actually what we did above.)"
2116 | ]
2117 | },
2118 | {
2119 | "cell_type": "code",
2120 | "execution_count": 100,
2121 | "metadata": {},
2122 | "outputs": [],
2123 | "source": [
2124 | "# a glorious function\n",
2125 | "\n",
2126 | "def tweak_gpx(df_):\n",
2127 | " return (df_\n",
2128 | " .select(cols)\n",
2129 | " .with_row_index() \n",
2130 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
2131 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
2132 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n",
2133 | " )\n",
2134 | "\n",
2135 | "tweak_gpx(df).write_parquet('Face_plant.parquet')"
2136 | ]
2137 | },
2138 | {
2139 | "cell_type": "code",
2140 | "execution_count": 101,
2141 | "metadata": {},
2142 | "outputs": [
2143 | {
2144 | "data": {
2145 | "text/html": [
2146 | "NAIVE QUERY PLAN
run LazyFrame.show_graph() to see the optimized version
\n",
2147 | "\n",
2149 | "\n",
2151 | "\n",
2152 | "\n"
2199 | ],
2200 | "text/plain": [
2201 | ""
2202 | ]
2203 | },
2204 | "execution_count": 101,
2205 | "metadata": {},
2206 | "output_type": "execute_result"
2207 | }
2208 | ],
2209 | "source": [
2210 | "# laziness\n",
2211 | "gpx_lazy = pl.scan_parquet('Face_plant.parquet') \n",
2212 | "tweak_gpx(gpx_lazy)"
2213 | ]
2214 | },
2215 | {
2216 | "cell_type": "code",
2217 | "execution_count": 102,
2218 | "metadata": {},
2219 | "outputs": [
2220 | {
2221 | "data": {
2222 | "text/html": [
2223 | "\n",
2230 | "
shape: (10_430, 15)| index | course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| u32 | cat | f64 | f64 | f64 | datetime[μs, America/Denver] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 |
| 0 | "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 17:41:56 MDT | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null |
| 1 | "Maple Syrup" | 1.210883 | 40.879167 | -111.855181 | 2024-09-10 17:41:58 MDT | 1480.1 | 0.607503 | 1.210883 | 2 | 0.605442 | null | null | null | null |
| 2 | "Maple Syrup" | 1.227612 | 40.879172 | -111.855194 | 2024-09-10 17:41:59 MDT | 1480.1 | 1.227612 | 2.438495 | 3 | 0.812832 | null | null | null | null |
| 3 | "Maple Syrup" | 0.952238 | 40.879174 | -111.855205 | 2024-09-10 17:42:00 MDT | 1480.1 | 0.952238 | 3.390733 | 4 | 0.847683 | null | null | null | null |
| 4 | "Maple Syrup" | 0.90551 | 40.879177 | -111.855215 | 2024-09-10 17:42:01 MDT | 1480.1 | 0.90551 | 4.296243 | 5 | 0.859249 | 2.267271 | 2.8 | 0.80974 | null |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 10425 | "Maple Syrup" | 4.085147 | 40.857847 | -111.823899 | 2024-09-10 20:39:58 MDT | 1696.8 | 4.086371 | 18640.85135 | 10682 | 1.745071 | 18634.60004 | 10680.0 | 1.744813 | 3.177668 |
| 10426 | "Maple Syrup" | 1.535726 | 40.857852 | -111.823916 | 2024-09-10 20:39:59 MDT | 1696.8 | 1.535726 | 18642.387076 | 10683 | 1.745052 | 18637.420852 | 10681.0 | 1.744913 | 2.821613 |
| 10427 | "Maple Syrup" | 3.156134 | 40.857869 | -111.823946 | 2024-09-10 20:40:00 MDT | 1696.7 | 3.157718 | 18645.54321 | 10684 | 1.745184 | 18640.220738 | 10682.0 | 1.745012 | 2.800696 |
| 10428 | "Maple Syrup" | 3.626488 | 40.857889 | -111.82398 | 2024-09-10 20:40:01 MDT | 1696.5 | 3.631999 | 18649.169698 | 10685 | 1.74536 | 18642.943507 | 10683.0 | 1.745104 | 2.724433 |
| 10429 | "Maple Syrup" | 3.760829 | 40.857909 | -111.824016 | 2024-09-10 20:40:02 MDT | 1696.3 | 3.766144 | 18652.930528 | 10686 | 1.745548 | 18646.176372 | 10684.0 | 1.745243 | 3.235592 |
"
2231 | ],
2232 | "text/plain": [
2233 | "shape: (10_430, 15)\n",
2234 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n",
2235 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n",
2236 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n",
2237 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
2238 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
2239 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n",
2240 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2241 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2242 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2243 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2244 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2245 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2246 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2247 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2248 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n",
2249 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2250 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
2251 | "│ 10425 ┆ Maple ┆ 4.085147 ┆ 40.857847 ┆ … ┆ 18634.6000 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n",
2252 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 4 ┆ ┆ ┆ │\n",
2253 | "│ 10426 ┆ Maple ┆ 1.535726 ┆ 40.857852 ┆ … ┆ 18637.4208 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n",
2254 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 52 ┆ ┆ ┆ │\n",
2255 | "│ 10427 ┆ Maple ┆ 3.156134 ┆ 40.857869 ┆ … ┆ 18640.2207 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n",
2256 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 38 ┆ ┆ ┆ │\n",
2257 | "│ 10428 ┆ Maple ┆ 3.626488 ┆ 40.857889 ┆ … ┆ 18642.9435 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n",
2258 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 07 ┆ ┆ ┆ │\n",
2259 | "│ 10429 ┆ Maple ┆ 3.760829 ┆ 40.857909 ┆ … ┆ 18646.1763 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n",
2260 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 72 ┆ ┆ ┆ │\n",
2261 | "└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘"
2262 | ]
2263 | },
2264 | "execution_count": 102,
2265 | "metadata": {},
2266 | "output_type": "execute_result"
2267 | }
2268 | ],
2269 | "source": [
2270 | "# use .collect to run the optimized query plan and materialize the result as a DataFrame\n",
2271 | "tweak_gpx(gpx_lazy).collect()"
2272 | ]
2273 | },
2274 | {
2275 | "cell_type": "code",
2276 | "execution_count": null,
2277 | "metadata": {},
2278 | "outputs": [],
2291 | "source": [
2292 | "# using GPU! (engine is keyword-only; needs the optional GPU extra: pip install polars[gpu])\n",
2293 | "tweak_gpx(gpx_lazy).collect(engine='gpu')"
2294 | ]
2295 | },
2296 | {
2297 | "cell_type": "code",
2298 | "execution_count": 104,
2299 | "metadata": {},
2300 | "outputs": [
2301 | {
2302 | "name": "stdout",
2303 | "output_type": "stream",
2304 | "text": [
2305 | "(10430, 15)\n"
2306 | ]
2307 | },
2308 | {
2309 | "data": {
2310 | "text/html": [
2311 | "\n",
2318 | "
shape: (10_430, 15)| index | course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| u32 | cat | f64 | f64 | f64 | datetime[μs, America/Denver] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 |
| 0 | "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 17:41:56 MDT | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null |
| 1 | "Maple Syrup" | 1.210883 | 40.879167 | -111.855181 | 2024-09-10 17:41:58 MDT | 1480.1 | 0.607503 | 1.210883 | 2 | 0.605442 | null | null | null | null |
| 2 | "Maple Syrup" | 1.227612 | 40.879172 | -111.855194 | 2024-09-10 17:41:59 MDT | 1480.1 | 1.227612 | 2.438495 | 3 | 0.812832 | null | null | null | null |
| 3 | "Maple Syrup" | 0.952238 | 40.879174 | -111.855205 | 2024-09-10 17:42:00 MDT | 1480.1 | 0.952238 | 3.390733 | 4 | 0.847683 | null | null | null | null |
| 4 | "Maple Syrup" | 0.90551 | 40.879177 | -111.855215 | 2024-09-10 17:42:01 MDT | 1480.1 | 0.90551 | 4.296243 | 5 | 0.859249 | 2.267271 | 2.8 | 0.80974 | null |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 10425 | "Maple Syrup" | 4.085147 | 40.857847 | -111.823899 | 2024-09-10 20:39:58 MDT | 1696.8 | 4.086371 | 18640.85135 | 10682 | 1.745071 | 18634.60004 | 10680.0 | 1.744813 | 3.177668 |
| 10426 | "Maple Syrup" | 1.535726 | 40.857852 | -111.823916 | 2024-09-10 20:39:59 MDT | 1696.8 | 1.535726 | 18642.387076 | 10683 | 1.745052 | 18637.420852 | 10681.0 | 1.744913 | 2.821613 |
| 10427 | "Maple Syrup" | 3.156134 | 40.857869 | -111.823946 | 2024-09-10 20:40:00 MDT | 1696.7 | 3.157718 | 18645.54321 | 10684 | 1.745184 | 18640.220738 | 10682.0 | 1.745012 | 2.800696 |
| 10428 | "Maple Syrup" | 3.626488 | 40.857889 | -111.82398 | 2024-09-10 20:40:01 MDT | 1696.5 | 3.631999 | 18649.169698 | 10685 | 1.74536 | 18642.943507 | 10683.0 | 1.745104 | 2.724433 |
| 10429 | "Maple Syrup" | 3.760829 | 40.857909 | -111.824016 | 2024-09-10 20:40:02 MDT | 1696.3 | 3.766144 | 18652.930528 | 10686 | 1.745548 | 18646.176372 | 10684.0 | 1.745243 | 3.235592 |
"
2319 | ],
2320 | "text/plain": [
2321 | "shape: (10_430, 15)\n",
2322 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n",
2323 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n",
2324 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n",
2325 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
2326 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
2327 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n",
2328 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2329 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2330 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2331 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2332 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2333 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2334 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2335 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2336 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n",
2337 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2338 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
2339 | "│ 10425 ┆ Maple ┆ 4.085147 ┆ 40.857847 ┆ … ┆ 18634.6000 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n",
2340 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 4 ┆ ┆ ┆ │\n",
2341 | "│ 10426 ┆ Maple ┆ 1.535726 ┆ 40.857852 ┆ … ┆ 18637.4208 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n",
2342 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 52 ┆ ┆ ┆ │\n",
2343 | "│ 10427 ┆ Maple ┆ 3.156134 ┆ 40.857869 ┆ … ┆ 18640.2207 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n",
2344 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 38 ┆ ┆ ┆ │\n",
2345 | "│ 10428 ┆ Maple ┆ 3.626488 ┆ 40.857889 ┆ … ┆ 18642.9435 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n",
2346 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 07 ┆ ┆ ┆ │\n",
2347 | "│ 10429 ┆ Maple ┆ 3.760829 ┆ 40.857909 ┆ … ┆ 18646.1763 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n",
2348 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 72 ┆ ┆ ┆ │\n",
2349 | "└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘"
2350 | ]
2351 | },
2352 | "execution_count": 104,
2353 | "metadata": {},
2354 | "output_type": "execute_result"
2355 | }
2356 | ],
2357 | "source": [
2358 | "# debugging\n",
2359 | "# some folks really want the intermediate data...\n",
2360 | "def get_var(df, var_name):\n",
2361 | " globals()[var_name] = df\n",
2362 | " return df\n",
2363 | "\n",
2364 | "def tweak_gpx(df_):\n",
2365 | " return (df_\n",
2366 | " .pipe(lambda df: print(df.shape) or df) # Look! 🤯\n",
2367 | " .select(cols)\n",
2368 | " .with_row_index() \n",
2369 | " .pipe(get_var, 'intermediate') # Debugging! 💪\n",
2370 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
2371 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
2372 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n",
2373 | " )\n",
2374 | "\n",
2375 | "raw = pl.read_parquet('Face_plant.parquet')\n",
2376 | "tweak_gpx(raw)"
2377 | ]
2378 | },
2379 | {
2380 | "cell_type": "code",
2381 | "execution_count": 105,
2382 | "metadata": {
2383 | "scrolled": true
2384 | },
2385 | "outputs": [
2386 | {
2387 | "data": {
2388 | "text/html": [
2389 | "\n",
2396 | "
shape: (10_430, 15)| index | course | distance_2d | latitude | longitude | time | elevation | speed_between | travelled | elapsed | avg_velocity | rolling_travelled | rolling_elapsed | rolling_velocity | rolling_between |
|---|
| u32 | cat | f64 | f64 | f64 | datetime[μs, America/Denver] | f64 | f64 | f64 | i16 | f64 | f64 | f64 | f64 | f64 |
| 0 | "Maple Syrup" | 0.0 | 40.879161 | -111.855169 | 2024-09-10 17:41:56 MDT | 1480.0 | null | 0.0 | 0 | NaN | null | null | null | null |
| 1 | "Maple Syrup" | 1.210883 | 40.879167 | -111.855181 | 2024-09-10 17:41:58 MDT | 1480.1 | 0.607503 | 1.210883 | 2 | 0.605442 | null | null | null | null |
| 2 | "Maple Syrup" | 1.227612 | 40.879172 | -111.855194 | 2024-09-10 17:41:59 MDT | 1480.1 | 1.227612 | 2.438495 | 3 | 0.812832 | null | null | null | null |
| 3 | "Maple Syrup" | 0.952238 | 40.879174 | -111.855205 | 2024-09-10 17:42:00 MDT | 1480.1 | 0.952238 | 3.390733 | 4 | 0.847683 | null | null | null | null |
| 4 | "Maple Syrup" | 0.90551 | 40.879177 | -111.855215 | 2024-09-10 17:42:01 MDT | 1480.1 | 0.90551 | 4.296243 | 5 | 0.859249 | 2.267271 | 2.8 | 0.80974 | null |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 10425 | "Maple Syrup" | 4.085147 | 40.857847 | -111.823899 | 2024-09-10 20:39:58 MDT | 1696.8 | 4.086371 | 18640.85135 | 10682 | 1.745071 | 18634.60004 | 10680.0 | 1.744813 | 3.177668 |
| 10426 | "Maple Syrup" | 1.535726 | 40.857852 | -111.823916 | 2024-09-10 20:39:59 MDT | 1696.8 | 1.535726 | 18642.387076 | 10683 | 1.745052 | 18637.420852 | 10681.0 | 1.744913 | 2.821613 |
| 10427 | "Maple Syrup" | 3.156134 | 40.857869 | -111.823946 | 2024-09-10 20:40:00 MDT | 1696.7 | 3.157718 | 18645.54321 | 10684 | 1.745184 | 18640.220738 | 10682.0 | 1.745012 | 2.800696 |
| 10428 | "Maple Syrup" | 3.626488 | 40.857889 | -111.82398 | 2024-09-10 20:40:01 MDT | 1696.5 | 3.631999 | 18649.169698 | 10685 | 1.74536 | 18642.943507 | 10683.0 | 1.745104 | 2.724433 |
| 10429 | "Maple Syrup" | 3.760829 | 40.857909 | -111.824016 | 2024-09-10 20:40:02 MDT | 1696.3 | 3.766144 | 18652.930528 | 10686 | 1.745548 | 18646.176372 | 10684.0 | 1.745243 | 3.235592 |
"
2397 | ],
2398 | "text/plain": [
2399 | "shape: (10_430, 15)\n",
2400 | "┌───────┬────────┬─────────────┬───────────┬───┬────────────┬────────────┬────────────┬────────────┐\n",
2401 | "│ index ┆ course ┆ distance_2d ┆ latitude ┆ … ┆ rolling_tr ┆ rolling_el ┆ rolling_ve ┆ rolling_be │\n",
2402 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ avelled ┆ apsed ┆ locity ┆ tween │\n",
2403 | "│ u32 ┆ cat ┆ f64 ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
2404 | "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
2405 | "╞═══════╪════════╪═════════════╪═══════════╪═══╪════════════╪════════════╪════════════╪════════════╡\n",
2406 | "│ 0 ┆ Maple ┆ 0.0 ┆ 40.879161 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2407 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2408 | "│ 1 ┆ Maple ┆ 1.210883 ┆ 40.879167 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2409 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2410 | "│ 2 ┆ Maple ┆ 1.227612 ┆ 40.879172 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2411 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2412 | "│ 3 ┆ Maple ┆ 0.952238 ┆ 40.879174 ┆ … ┆ null ┆ null ┆ null ┆ null │\n",
2413 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2414 | "│ 4 ┆ Maple ┆ 0.90551 ┆ 40.879177 ┆ … ┆ 2.267271 ┆ 2.8 ┆ 0.80974 ┆ null │\n",
2415 | "│ ┆ Syrup ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
2416 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
2417 | "│ 10425 ┆ Maple ┆ 4.085147 ┆ 40.857847 ┆ … ┆ 18634.6000 ┆ 10680.0 ┆ 1.744813 ┆ 3.177668 │\n",
2418 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 4 ┆ ┆ ┆ │\n",
2419 | "│ 10426 ┆ Maple ┆ 1.535726 ┆ 40.857852 ┆ … ┆ 18637.4208 ┆ 10681.0 ┆ 1.744913 ┆ 2.821613 │\n",
2420 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 52 ┆ ┆ ┆ │\n",
2421 | "│ 10427 ┆ Maple ┆ 3.156134 ┆ 40.857869 ┆ … ┆ 18640.2207 ┆ 10682.0 ┆ 1.745012 ┆ 2.800696 │\n",
2422 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 38 ┆ ┆ ┆ │\n",
2423 | "│ 10428 ┆ Maple ┆ 3.626488 ┆ 40.857889 ┆ … ┆ 18642.9435 ┆ 10683.0 ┆ 1.745104 ┆ 2.724433 │\n",
2424 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 07 ┆ ┆ ┆ │\n",
2425 | "│ 10429 ┆ Maple ┆ 3.760829 ┆ 40.857909 ┆ … ┆ 18646.1763 ┆ 10684.0 ┆ 1.745243 ┆ 3.235592 │\n",
2426 | "│ ┆ Syrup ┆ ┆ ┆ ┆ 72 ┆ ┆ ┆ │\n",
2427 | "└───────┴────────┴─────────────┴───────────┴───┴────────────┴────────────┴────────────┴────────────┘"
2428 | ]
2429 | },
2430 | "execution_count": 105,
2431 | "metadata": {},
2432 | "output_type": "execute_result"
2433 | }
2434 | ],
2435 | "source": [
2436 | "intermediate"
2437 | ]
2438 | },
2439 | {
2440 | "cell_type": "code",
2441 | "execution_count": null,
2442 | "metadata": {
2443 | "lines_to_next_cell": 2,
2444 | "pycharm": {
2445 | "name": "#%%\n"
2446 | }
2447 | },
2448 | "outputs": [],
2449 | "source": []
2450 | },
2451 | {
2452 | "cell_type": "code",
2453 | "execution_count": null,
2454 | "metadata": {
2455 | "lines_to_next_cell": 2,
2456 | "pycharm": {
2457 | "name": "#%%\n"
2458 | }
2459 | },
2460 | "outputs": [],
2461 | "source": []
2462 | },
2463 | {
2464 | "cell_type": "code",
2465 | "execution_count": null,
2466 | "metadata": {
2467 | "lines_to_next_cell": 2,
2468 | "pycharm": {
2469 | "name": "#%%\n"
2470 | }
2471 | },
2472 | "outputs": [],
2473 | "source": []
2474 | },
2475 | {
2476 | "cell_type": "code",
2477 | "execution_count": null,
2478 | "metadata": {
2479 | "lines_to_next_cell": 2,
2480 | "pycharm": {
2481 | "name": "#%%\n"
2482 | }
2483 | },
2484 | "outputs": [],
2485 | "source": []
2486 | },
2487 | {
2488 | "cell_type": "code",
2489 | "execution_count": null,
2490 | "metadata": {
2491 | "lines_to_next_cell": 2,
2492 | "pycharm": {
2493 | "name": "#%%\n"
2494 | }
2495 | },
2496 | "outputs": [],
2497 | "source": []
2498 | },
2499 | {
2500 | "cell_type": "markdown",
2501 | "metadata": {
2502 | "pycharm": {
2503 | "name": "#%% md\n"
2504 | }
2505 | },
2506 | "source": [
2507 | "## Don't Apply (map_elements) if you can avoid it"
2508 | ]
2509 | },
2510 | {
2511 | "cell_type": "code",
2512 | "execution_count": 106,
2513 | "metadata": {},
2514 | "outputs": [
2515 | {
2516 | "name": "stdout",
2517 | "output_type": "stream",
2518 | "text": [
2519 | "(10430, 15)\n"
2520 | ]
2521 | }
2522 | ],
2523 | "source": [
2524 | "# debugging\n",
2525 | "def get_var(df, var_name):\n",
2526 | " globals()[var_name] = df\n",
2527 | " return df\n",
2528 | "\n",
2529 | "def tweak_gpx(df_):\n",
2530 | " return (df_\n",
2531 | " .pipe(lambda df: print(df.shape) or df)\n",
2532 | " .select(cols)\n",
2533 | " .with_row_index() \n",
2534 | " .pipe(get_var, 'intermediate')\n",
2535 | " .with_columns(pl.col('elapsed').cast(pl.Int16),\n",
2536 | " course=pl.lit('Maple Syrup').cast(pl.Categorical),\n",
2537 | " time=pl.col('time').dt.convert_time_zone('America/Denver')) \n",
2538 | " )\n",
2539 | "\n",
2540 | "raw = pl.read_parquet('Face_plant.parquet')\n",
2541 | "df = tweak_gpx(raw)"
2542 | ]
2543 | },
2544 | {
2545 | "cell_type": "code",
2546 | "execution_count": 107,
2547 | "metadata": {},
2548 | "outputs": [
2549 | {
2550 | "name": "stderr",
2551 | "output_type": "stream",
2552 | "text": [
2553 | "/var/folders/qn/r8_0pgj1645dn1w69vqls6cw0000gn/T/ipykernel_70391/613516876.py:7: PolarsInefficientMapWarning: \n",
2554 | "Expr.map_elements is significantly slower than the native expressions API.\n",
2555 | "Only use if you absolutely CANNOT implement your logic otherwise.\n",
2556 | "Replace this expression...\n",
2557 | " - pl.col(\"elevation\").map_elements(meters_to_feet)\n",
2558 | "with this one instead:\n",
2559 | " + pl.col(\"elevation\") * 3.28084\n",
2560 | "\n",
2561 | " ele_ft=pl.col('elevation').map_elements(meters_to_feet))\n",
2562 | "sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.\n"
2563 | ]
2564 | },
2565 | {
2566 | "data": {
2567 | "text/html": [
2568 | "\n",
2575 | "
shape: (10_430, 2)| elevation | ele_ft |
|---|
| f64 | f64 |
| 1480.0 | 4855.6432 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| … | … |
| 1696.8 | 5566.929312 |
| 1696.8 | 5566.929312 |
| 1696.7 | 5566.601228 |
| 1696.5 | 5565.94506 |
| 1696.3 | 5565.288892 |
"
2576 | ],
2577 | "text/plain": [
2578 | "shape: (10_430, 2)\n",
2579 | "┌───────────┬─────────────┐\n",
2580 | "│ elevation ┆ ele_ft │\n",
2581 | "│ --- ┆ --- │\n",
2582 | "│ f64 ┆ f64 │\n",
2583 | "╞═══════════╪═════════════╡\n",
2584 | "│ 1480.0 ┆ 4855.6432 │\n",
2585 | "│ 1480.1 ┆ 4855.971284 │\n",
2586 | "│ 1480.1 ┆ 4855.971284 │\n",
2587 | "│ 1480.1 ┆ 4855.971284 │\n",
2588 | "│ 1480.1 ┆ 4855.971284 │\n",
2589 | "│ … ┆ … │\n",
2590 | "│ 1696.8 ┆ 5566.929312 │\n",
2591 | "│ 1696.8 ┆ 5566.929312 │\n",
2592 | "│ 1696.7 ┆ 5566.601228 │\n",
2593 | "│ 1696.5 ┆ 5565.94506 │\n",
2594 | "│ 1696.3 ┆ 5565.288892 │\n",
2595 | "└───────────┴─────────────┘"
2596 | ]
2597 | },
2598 | "execution_count": 107,
2599 | "metadata": {},
2600 | "output_type": "execute_result"
2601 | }
2602 | ],
2603 | "source": [
2604 | "# convert elevation from meters to feet\n",
2605 | "def meters_to_feet(m):\n",
2606 | " return m * 3.28084\n",
2607 | "\n",
2608 | "(df\n",
2609 | " .select('elevation', \n",
2610 | " ele_ft=pl.col('elevation').map_elements(meters_to_feet)) \n",
2611 | ")"
2612 | ]
2613 | },
2614 | {
2615 | "cell_type": "code",
2616 | "execution_count": 108,
2617 | "metadata": {},
2618 | "outputs": [
2619 | {
2620 | "data": {
2621 | "text/html": [
2622 | "\n",
2629 | "
shape: (10_430, 2)| elevation | ele_ft |
|---|
| f64 | f64 |
| 1480.0 | 4855.6432 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| … | … |
| 1696.8 | 5566.929312 |
| 1696.8 | 5566.929312 |
| 1696.7 | 5566.601228 |
| 1696.5 | 5565.94506 |
| 1696.3 | 5565.288892 |
"
2630 | ],
2631 | "text/plain": [
2632 | "shape: (10_430, 2)\n",
2633 | "┌───────────┬─────────────┐\n",
2634 | "│ elevation ┆ ele_ft │\n",
2635 | "│ --- ┆ --- │\n",
2636 | "│ f64 ┆ f64 │\n",
2637 | "╞═══════════╪═════════════╡\n",
2638 | "│ 1480.0 ┆ 4855.6432 │\n",
2639 | "│ 1480.1 ┆ 4855.971284 │\n",
2640 | "│ 1480.1 ┆ 4855.971284 │\n",
2641 | "│ 1480.1 ┆ 4855.971284 │\n",
2642 | "│ 1480.1 ┆ 4855.971284 │\n",
2643 | "│ … ┆ … │\n",
2644 | "│ 1696.8 ┆ 5566.929312 │\n",
2645 | "│ 1696.8 ┆ 5566.929312 │\n",
2646 | "│ 1696.7 ┆ 5566.601228 │\n",
2647 | "│ 1696.5 ┆ 5565.94506 │\n",
2648 | "│ 1696.3 ┆ 5565.288892 │\n",
2649 | "└───────────┴─────────────┘"
2650 | ]
2651 | },
2652 | "execution_count": 108,
2653 | "metadata": {},
2654 | "output_type": "execute_result"
2655 | }
2656 | ],
2657 | "source": [
2658 | "# convert elevation from meters to feet\n",
2659 | "def meters_to_feet(m):\n",
2660 | " return m * 3.28084\n",
2661 | "\n",
2662 | "(df\n",
2663 | " .select('elevation', \n",
2664 | " ele_ft=meters_to_feet(pl.col('elevation')))\n",
2665 | ")"
2666 | ]
2667 | },
2668 | {
2669 | "cell_type": "code",
2670 | "execution_count": 109,
2671 | "metadata": {},
2672 | "outputs": [
2673 | {
2674 | "data": {
2675 | "text/html": [
2676 | "\n",
2683 | "
shape: (10_430, 2)| elevation | ele_ft |
|---|
| f64 | f64 |
| 1480.0 | 4855.6432 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| 1480.1 | 4855.971284 |
| … | … |
| 1696.8 | 5566.929312 |
| 1696.8 | 5566.929312 |
| 1696.7 | 5566.601228 |
| 1696.5 | 5565.94506 |
| 1696.3 | 5565.288892 |
"
2684 | ],
2685 | "text/plain": [
2686 | "shape: (10_430, 2)\n",
2687 | "┌───────────┬─────────────┐\n",
2688 | "│ elevation ┆ ele_ft │\n",
2689 | "│ --- ┆ --- │\n",
2690 | "│ f64 ┆ f64 │\n",
2691 | "╞═══════════╪═════════════╡\n",
2692 | "│ 1480.0 ┆ 4855.6432 │\n",
2693 | "│ 1480.1 ┆ 4855.971284 │\n",
2694 | "│ 1480.1 ┆ 4855.971284 │\n",
2695 | "│ 1480.1 ┆ 4855.971284 │\n",
2696 | "│ 1480.1 ┆ 4855.971284 │\n",
2697 | "│ … ┆ … │\n",
2698 | "│ 1696.8 ┆ 5566.929312 │\n",
2699 | "│ 1696.8 ┆ 5566.929312 │\n",
2700 | "│ 1696.7 ┆ 5566.601228 │\n",
2701 | "│ 1696.5 ┆ 5565.94506 │\n",
2702 | "│ 1696.3 ┆ 5565.288892 │\n",
2703 | "└───────────┴─────────────┘"
2704 | ]
2705 | },
2706 | "execution_count": 109,
2707 | "metadata": {},
2708 | "output_type": "execute_result"
2709 | }
2710 | ],
2711 | "source": [
2712 | "# Perhaps more readable\n",
2713 | "# convert elevation from meters to feet\n",
2714 | "def meters_to_feet(m):\n",
2715 | " return m * 3.28084\n",
2716 | "\n",
2717 | "(df\n",
2718 | " .select('elevation', \n",
2719 | " ele_ft=pl.col('elevation').pipe(meters_to_feet))\n",
2720 | ")"
2721 | ]
2722 | },
2723 | {
2724 | "cell_type": "code",
2725 | "execution_count": null,
2726 | "metadata": {
2727 | "collapsed": true,
2728 | "jupyter": {
2729 | "outputs_hidden": true
2730 | }
2731 | },
2732 | "outputs": [],
2733 | "source": [
2734 | "%%timeit\n",
2735 | "# takes 965 µs on my machine\n",
2736 | "(df\n",
2737 | " .select('elevation', ele_ft=pl.col('elevation').map_elements(meters_to_feet)) \n",
2738 | ")"
2739 | ]
2740 | },
2741 | {
2742 | "cell_type": "code",
2743 | "execution_count": 110,
2744 | "metadata": {},
2745 | "outputs": [
2746 | {
2747 | "name": "stdout",
2748 | "output_type": "stream",
2749 | "text": [
2750 | "38 µs ± 514 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
2751 | ]
2752 | }
2753 | ],
2754 | "source": [
2755 | "%%timeit\n",
2756 | "(df\n",
2757 | " .select('elevation', \n",
2758 | " ele_ft=pl.col('elevation').pipe(meters_to_feet))\n",
2759 | ")"
2760 | ]
2761 | },
2762 | {
2763 | "cell_type": "code",
2764 | "execution_count": 111,
2765 | "metadata": {},
2766 | "outputs": [
2767 | {
2768 | "name": "stdout",
2769 | "output_type": "stream",
2770 | "text": [
2771 | "37.7 µs ± 1.11 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
2772 | ]
2773 | }
2774 | ],
2775 | "source": [
2776 | "%%timeit\n",
2777 | "(df\n",
2778 | " .select('elevation', \n",
2779 | " ele_ft=pl.col('elevation')*3.28084)\n",
2780 | ")"
2781 | ]
2782 | },
2783 | {
2784 | "cell_type": "code",
2785 | "execution_count": 112,
2786 | "metadata": {},
2787 | "outputs": [
2788 | {
2789 | "data": {
2790 | "text/plain": [
2791 | "24.125"
2792 | ]
2793 | },
2794 | "execution_count": 112,
2795 | "metadata": {},
2796 | "output_type": "execute_result"
2797 | }
2798 | ],
2799 | "source": [
2800 | "965/40"
2801 | ]
2802 | },
2803 | {
2804 | "cell_type": "code",
2805 | "execution_count": null,
2806 | "metadata": {
2807 | "lines_to_next_cell": 2,
2808 | "pycharm": {
2809 | "name": "#%%\n"
2810 | }
2811 | },
2812 | "outputs": [],
2813 | "source": []
2814 | },
2815 | {
2816 | "cell_type": "markdown",
2817 | "metadata": {
2818 | "lines_to_next_cell": 2,
2819 | "pycharm": {
2820 | "name": "#%%\n"
2821 | }
2822 | },
2823 | "source": [
2824 | "## Benchmark caveat\n",
2825 | "- Benchmark with data of the same size you use in the real world"
2826 | ]
2827 | },
2828 | {
2829 | "cell_type": "code",
2830 | "execution_count": null,
2831 | "metadata": {
2832 | "lines_to_next_cell": 2,
2833 | "pycharm": {
2834 | "name": "#%%\n"
2835 | }
2836 | },
2837 | "outputs": [],
2838 | "source": []
2839 | },
2840 | {
2841 | "cell_type": "code",
2842 | "execution_count": null,
2843 | "metadata": {
2844 | "lines_to_next_cell": 2,
2845 | "pycharm": {
2846 | "name": "#%%\n"
2847 | }
2848 | },
2849 | "outputs": [],
2850 | "source": []
2851 | },
2852 | {
2853 | "cell_type": "code",
2854 | "execution_count": null,
2855 | "metadata": {
2856 | "lines_to_next_cell": 2,
2857 | "pycharm": {
2858 | "name": "#%%\n"
2859 | }
2860 | },
2861 | "outputs": [],
2862 | "source": []
2863 | },
2864 | {
2865 | "cell_type": "code",
2866 | "execution_count": null,
2867 | "metadata": {
2868 | "lines_to_next_cell": 2,
2869 | "pycharm": {
2870 | "name": "#%%\n"
2871 | }
2872 | },
2873 | "outputs": [],
2874 | "source": []
2875 | },
2876 | {
2877 | "cell_type": "code",
2878 | "execution_count": null,
2879 | "metadata": {
2880 | "lines_to_next_cell": 2,
2881 | "pycharm": {
2882 | "name": "#%%\n"
2883 | }
2884 | },
2885 | "outputs": [],
2886 | "source": []
2887 | },
2888 | {
2889 | "cell_type": "markdown",
2890 | "metadata": {
2891 | "pycharm": {
2892 | "name": "#%% md\n"
2893 | }
2894 | },
2895 | "source": [
2896 | "## Master Aggregation\n",
2897 | "\n",
2898 | "Let's compute speed (and distance) by 10-minute intervals"
2899 | ]
2900 | },
2901 | {
2902 | "cell_type": "code",
2903 | "execution_count": 113,
2904 | "metadata": {},
2905 | "outputs": [
2906 | {
2907 | "name": "stdout",
2908 | "output_type": "stream",
2909 | "text": [
2910 | "(10430, 15)\n"
2911 | ]
2912 | },
2913 | {
2914 | "data": {
2915 | "text/html": [
2916 | "\n",
2923 | "
shape: (19, 4)| time | travelled | speed | mph |
|---|
| datetime[μs, America/Denver] | f64 | f64 | f64 |
| 2024-09-10 17:40:00 MDT | 1620.956003 | 3.356017 | 7.507208 |
| 2024-09-10 17:50:00 MDT | 1081.973828 | 1.8063 | 4.040585 |
| 2024-09-10 18:00:00 MDT | 99.724838 | 0.166486 | 0.372418 |
| 2024-09-10 18:10:00 MDT | 1703.006885 | 2.843083 | 6.359807 |
| 2024-09-10 18:20:00 MDT | 1602.514698 | 2.675317 | 5.984523 |
| … | … | … | … |
| 2024-09-10 20:00:00 MDT | 973.953159 | 1.625965 | 3.637187 |
| 2024-09-10 20:10:00 MDT | 680.118068 | 1.135422 | 2.539872 |
| 2024-09-10 20:20:00 MDT | 796.69377 | 1.33004 | 2.975219 |
| 2024-09-10 20:30:00 MDT | 763.967062 | 1.275404 | 2.853002 |
| 2024-09-10 20:40:00 MDT | 7.387318 | 3.693659 | 8.262493 |
"
2924 | ],
2925 | "text/plain": [
2926 | "shape: (19, 4)\n",
2927 | "┌──────────────────────────────┬─────────────┬──────────┬──────────┐\n",
2928 | "│ time ┆ travelled ┆ speed ┆ mph │\n",
2929 | "│ --- ┆ --- ┆ --- ┆ --- │\n",
2930 | "│ datetime[μs, America/Denver] ┆ f64 ┆ f64 ┆ f64 │\n",
2931 | "╞══════════════════════════════╪═════════════╪══════════╪══════════╡\n",
2932 | "│ 2024-09-10 17:40:00 MDT ┆ 1620.956003 ┆ 3.356017 ┆ 7.507208 │\n",
2933 | "│ 2024-09-10 17:50:00 MDT ┆ 1081.973828 ┆ 1.8063 ┆ 4.040585 │\n",
2934 | "│ 2024-09-10 18:00:00 MDT ┆ 99.724838 ┆ 0.166486 ┆ 0.372418 │\n",
2935 | "│ 2024-09-10 18:10:00 MDT ┆ 1703.006885 ┆ 2.843083 ┆ 6.359807 │\n",
2936 | "│ 2024-09-10 18:20:00 MDT ┆ 1602.514698 ┆ 2.675317 ┆ 5.984523 │\n",
2937 | "│ … ┆ … ┆ … ┆ … │\n",
2938 | "│ 2024-09-10 20:00:00 MDT ┆ 973.953159 ┆ 1.625965 ┆ 3.637187 │\n",
2939 | "│ 2024-09-10 20:10:00 MDT ┆ 680.118068 ┆ 1.135422 ┆ 2.539872 │\n",
2940 | "│ 2024-09-10 20:20:00 MDT ┆ 796.69377 ┆ 1.33004 ┆ 2.975219 │\n",
2941 | "│ 2024-09-10 20:30:00 MDT ┆ 763.967062 ┆ 1.275404 ┆ 2.853002 │\n",
2942 | "│ 2024-09-10 20:40:00 MDT ┆ 7.387318 ┆ 3.693659 ┆ 8.262493 │\n",
2943 | "└──────────────────────────────┴─────────────┴──────────┴──────────┘"
2944 | ]
2945 | },
2946 | "execution_count": 113,
2947 | "metadata": {},
2948 | "output_type": "execute_result"
2949 | }
2950 | ],
2951 | "source": [
2952 | "def meters_per_second_to_mph(mps):\n",
2953 | " return mps * 2.23694\n",
2954 | "\n",
2955 | "(tweak_gpx(raw)\n",
2956 | " .group_by_dynamic(index_column='time', every='10m')\n",
2957 | " .agg(pl.col('travelled').last() - pl.col('travelled').first(),\n",
2958 | " speed=(pl.col('travelled').last() - pl.col('travelled').first()) / \n",
2959 | " ((pl.col('time').last() - pl.col('time').first()).dt.total_seconds())\n",
2960 | " ) \n",
2961 | " .with_columns(mph=pl.col('speed').pipe(meters_per_second_to_mph))\n",
2962 | " )"
2963 | ]
2964 | },
2965 | {
2966 | "cell_type": "code",
2967 | "execution_count": 114,
2968 | "metadata": {},
2969 | "outputs": [
2970 | {
2971 | "name": "stdout",
2972 | "output_type": "stream",
2973 | "text": [
2974 | "(10430, 15)\n"
2975 | ]
2976 | },
2977 | {
2978 | "data": {
2979 | "text/html": [
2980 | "\n",
2981 | "\n",
2992 | "\n",
2993 | ""
3046 | ],
3047 | "text/plain": [
3048 | "alt.Chart(...)"
3049 | ]
3050 | },
3051 | "execution_count": 114,
3052 | "metadata": {},
3053 | "output_type": "execute_result"
3054 | }
3055 | ],
3056 | "source": [
3057 | "def meters_per_second_to_mph(mps):\n",
3058 | " return mps * 2.23694\n",
3059 | "\n",
3060 | "(tweak_gpx(raw)\n",
3061 | " .group_by_dynamic(index_column='time', every='10m')\n",
3062 | " .agg(pl.col('travelled').last() - pl.col('travelled').first(),\n",
3063 | " speed=(pl.col('travelled').last() - pl.col('travelled').first()) / \n",
3064 | " ((pl.col('time').last() - pl.col('time').first()).dt.total_seconds())\n",
3065 | " ) \n",
3066 | " .with_columns(mph=pl.col('speed').pipe(meters_per_second_to_mph))\n",
3067 | " .plot.bar(x='time', y='mph')\n",
3068 | " )"
3069 | ]
3070 | },
3071 | {
3072 | "cell_type": "code",
3073 | "execution_count": 115,
3074 | "metadata": {},
3075 | "outputs": [
3076 | {
3077 | "name": "stdout",
3078 | "output_type": "stream",
3079 | "text": [
3080 | "(10430, 15)\n"
3081 | ]
3082 | },
3083 | {
3084 | "data": {
3085 | "text/html": [
3086 | "\n",
3093 | "
shape: (2, 2)| climbing | distance_2d |
|---|
| bool | f64 |
| false | 5.46702 |
| true | 6.123374 |
"
3094 | ],
3095 | "text/plain": [
3096 | "shape: (2, 2)\n",
3097 | "┌──────────┬─────────────┐\n",
3098 | "│ climbing ┆ distance_2d │\n",
3099 | "│ --- ┆ --- │\n",
3100 | "│ bool ┆ f64 │\n",
3101 | "╞══════════╪═════════════╡\n",
3102 | "│ false ┆ 5.46702 │\n",
3103 | "│ true ┆ 6.123374 │\n",
3104 | "└──────────┴─────────────┘"
3105 | ]
3106 | },
3107 | "execution_count": 115,
3108 | "metadata": {},
3109 | "output_type": "execute_result"
3110 | }
3111 | ],
3112 | "source": [
3113 | "# uphill vs downhill\n",
3114 | "def meters_to_feet(m):\n",
3115 | " return m * 3.28084\n",
3116 | "\n",
3117 | "def feet_to_miles(f):\n",
3118 | " return f / 5280\n",
3119 | "\n",
3120 | "(tweak_gpx(raw)\n",
3121 | " .with_columns(climbing=pl.col('elevation').diff().gt(0))\n",
3122 | " .group_by('climbing')\n",
3123 | " .agg(pl.col('distance_2d').sum().pipe(meters_to_feet).pipe(feet_to_miles))\n",
3124 | " .filter(~pl.col('climbing').is_null())\n",
3125 | ")"
3126 | ]
3127 | },
3128 | {
3129 | "cell_type": "code",
3130 | "execution_count": 116,
3131 | "metadata": {},
3132 | "outputs": [
3133 | {
3134 | "name": "stdout",
3135 | "output_type": "stream",
3136 | "text": [
3137 | "(10430, 15)\n"
3138 | ]
3139 | },
3140 | {
3141 | "data": {
3142 | "text/html": [
3143 | "\n",
3144 | "\n",
3155 | "\n",
3156 | ""
3209 | ],
3210 | "text/plain": [
3211 | "alt.Chart(...)"
3212 | ]
3213 | },
3214 | "execution_count": 116,
3215 | "metadata": {},
3216 | "output_type": "execute_result"
3217 | }
3218 | ],
3219 | "source": [
3220 | "# uphill vs downhill\n",
3221 | "def meters_to_feet(m):\n",
3222 | " return m * 3.28084\n",
3223 | "\n",
3224 | "def feet_to_miles(f):\n",
3225 | " return f / 5280\n",
3226 | "\n",
3227 | "(tweak_gpx(raw)\n",
3228 | " .with_columns(climbing=pl.col('elevation').diff().gt(0))\n",
3229 | " .group_by('climbing')\n",
3230 | " .agg(pl.col('distance_2d').sum().pipe(meters_to_feet).pipe(feet_to_miles))\n",
3231 | " .filter(~pl.col('climbing').is_null())\n",
3232 | " .plot.bar(x='climbing', y='distance_2d')\n",
3233 | ")"
3234 | ]
3235 | },
3236 | {
3237 | "cell_type": "code",
3238 | "execution_count": null,
3239 | "metadata": {
3240 | "lines_to_next_cell": 2,
3241 | "pycharm": {
3242 | "name": "#%%\n"
3243 | }
3244 | },
3245 | "outputs": [],
3246 | "source": []
3247 | },
3248 | {
3249 | "cell_type": "code",
3250 | "execution_count": null,
3251 | "metadata": {
3252 | "lines_to_next_cell": 2,
3253 | "pycharm": {
3254 | "name": "#%%\n"
3255 | }
3256 | },
3257 | "outputs": [],
3258 | "source": []
3259 | },
3260 | {
3261 | "cell_type": "code",
3262 | "execution_count": null,
3263 | "metadata": {
3264 | "lines_to_next_cell": 2,
3265 | "pycharm": {
3266 | "name": "#%%\n"
3267 | }
3268 | },
3269 | "outputs": [],
3270 | "source": []
3271 | },
3272 | {
3273 | "cell_type": "code",
3274 | "execution_count": null,
3275 | "metadata": {
3276 | "lines_to_next_cell": 2,
3277 | "pycharm": {
3278 | "name": "#%%\n"
3279 | }
3280 | },
3281 | "outputs": [],
3282 | "source": []
3283 | },
3284 | {
3285 | "cell_type": "code",
3286 | "execution_count": null,
3287 | "metadata": {
3288 | "lines_to_next_cell": 2,
3289 | "pycharm": {
3290 | "name": "#%%\n"
3291 | }
3292 | },
3293 | "outputs": [],
3294 | "source": []
3295 | },
3296 | {
3297 | "cell_type": "code",
3298 | "execution_count": null,
3299 | "metadata": {
3300 | "lines_to_next_cell": 2,
3301 | "pycharm": {
3302 | "name": "#%%\n"
3303 | }
3304 | },
3305 | "outputs": [],
3306 | "source": []
3307 | },
3308 | {
3309 | "cell_type": "code",
3310 | "execution_count": null,
3311 | "metadata": {
3312 | "lines_to_next_cell": 2,
3313 | "pycharm": {
3314 | "name": "#%%\n"
3315 | }
3316 | },
3317 | "outputs": [],
3318 | "source": []
3319 | },
3320 | {
3321 | "cell_type": "code",
3322 | "execution_count": null,
3323 | "metadata": {
3324 | "lines_to_next_cell": 2,
3325 | "pycharm": {
3326 | "name": "#%%\n"
3327 | }
3328 | },
3329 | "outputs": [],
3330 | "source": []
3331 | },
3332 | {
3333 | "cell_type": "markdown",
3334 | "metadata": {
3335 | "pycharm": {
3336 | "name": "#%% md\n"
3337 | }
3338 | },
3339 | "source": [
3340 | "## Summary\n",
3341 | "\n",
3342 | "* Correct types save space and enable convenient math, string, and date functionality\n",
3343 | "* Chaining operations will:\n",
3344 | " * Make code readable\n",
3345 | " * Remove bugs\n",
3346 | " * Make debugging easier\n",
3347 | "* ``.map_elements`` is slow for math\n",
3348 | "* Aggregations are powerful. Play with them until they make sense\n",
3349 | "\n",
3350 | "\n",
3351 | "Let's connect! Happy to discuss how your team can better leverage tabular technologies.\n",
3352 | "\n",
3353 | "Twitter ``@__mharrison__``, LinkedIn\n",
3354 | "\n",
3355 | "Book giveaway"
3356 | ]
3357 | },
3358 | {
3359 | "cell_type": "code",
3360 | "execution_count": 117,
3361 | "metadata": {},
3362 | "outputs": [],
3363 | "source": [
3364 | "import random"
3365 | ]
3366 | },
3367 | {
3368 | "cell_type": "code",
3369 | "execution_count": 118,
3370 | "metadata": {},
3371 | "outputs": [
3372 | {
3373 | "data": {
3374 | "text/plain": [
3375 | "10"
3376 | ]
3377 | },
3378 | "execution_count": 118,
3379 | "metadata": {},
3380 | "output_type": "execute_result"
3381 | }
3382 | ],
3383 | "source": [
3384 | "random.randrange(0, 12)"
3385 | ]
3386 | },
3387 | {
3388 | "cell_type": "code",
3389 | "execution_count": null,
3390 | "metadata": {},
3391 | "outputs": [],
3392 | "source": [
3393 | "random.randrange(0, 3)"
3394 | ]
3395 | },
3396 | {
3397 | "cell_type": "code",
3398 | "execution_count": null,
3399 | "metadata": {},
3400 | "outputs": [],
3401 | "source": [
3402 | "import random\n",
3403 | "random.choice([0,1])"
3404 | ]
3405 | },
3406 | {
3407 | "cell_type": "code",
3408 | "execution_count": null,
3409 | "metadata": {
3410 | "lines_to_next_cell": 2,
3411 | "pycharm": {
3412 | "name": "#%%\n"
3413 | }
3414 | },
3415 | "outputs": [],
3416 | "source": [
3417 | "import random\n",
3418 | "random.randrange(1,4)"
3419 | ]
3420 | },
3421 | {
3422 | "cell_type": "code",
3423 | "execution_count": null,
3424 | "metadata": {
3425 | "lines_to_next_cell": 2,
3426 | "pycharm": {
3427 | "name": "#%%\n"
3428 | }
3429 | },
3430 | "outputs": [],
3431 | "source": []
3432 | },
3433 | {
3434 | "cell_type": "code",
3435 | "execution_count": null,
3436 | "metadata": {
3437 | "lines_to_next_cell": 2,
3438 | "pycharm": {
3439 | "name": "#%%\n"
3440 | }
3441 | },
3442 | "outputs": [],
3443 | "source": []
3444 | },
3445 | {
3446 | "cell_type": "code",
3447 | "execution_count": null,
3448 | "metadata": {
3449 | "pycharm": {
3450 | "name": "#%%\n"
3451 | }
3452 | },
3453 | "outputs": [],
3454 | "source": []
3455 | }
3456 | ],
3457 | "metadata": {
3458 | "jupytext": {
3459 | "encoding": "# -*- coding: utf-8 -*-",
3460 | "formats": "ipynb,py:light"
3461 | },
3462 | "kernelspec": {
3463 | "display_name": "Python 3 (ipykernel)",
3464 | "language": "python",
3465 | "name": "python3"
3466 | },
3467 | "language_info": {
3468 | "codemirror_mode": {
3469 | "name": "ipython",
3470 | "version": 3
3471 | },
3472 | "file_extension": ".py",
3473 | "mimetype": "text/x-python",
3474 | "name": "python",
3475 | "nbconvert_exporter": "python",
3476 | "pygments_lexer": "ipython3",
3477 | "version": "3.10.15"
3478 | }
3479 | },
3480 | "nbformat": 4,
3481 | "nbformat_minor": 4
3482 | }
3483 |
--------------------------------------------------------------------------------