├── .gitattributes ├── .gitignore ├── 01-intro-to-pandas-part-1-answers.ipynb ├── 01-intro-to-pandas-part-1-exercises.ipynb ├── 01-intro-to-pandas-part-1-slides.ipynb ├── 02-intro-to-pandas-part-2-answers.ipynb ├── 02-intro-to-pandas-part-2-exercises.ipynb ├── 02-intro-to-pandas-part-2-slides.ipynb ├── 03-group-apply-answers.ipynb ├── 03-group-apply-exercises.ipynb ├── 03-group-apply-slides.ipynb ├── 04-time-series-answers.ipynb ├── 04-time-series-exercises.ipynb ├── 04-time-series-slides.ipynb ├── 05-merge-pivot-answers.ipynb ├── 05-merge-pivot-exercises.ipynb ├── 05-merge-pivot-slides.ipynb ├── 06-advanced-merge-reshape-answers.ipynb ├── 06-advanced-merge-reshape-exercises.ipynb ├── 06-advanced-merge-reshape-slides.ipynb ├── LICENSE ├── README.md ├── assets ├── data-label-arrays.png ├── enplus-logo-colored.png ├── enplus-logo-colored.svg ├── full-join.png ├── inner-join.png ├── lag.png ├── lead.png ├── left-join.png ├── lesson-01-key-value.png ├── right-join.png ├── rolling.png ├── split-apply-combine.png ├── stock-trading-1600x1200.jpg ├── vectorized-multiplication.png └── venn-diagrams.sketch ├── binder └── environment.yml ├── build.py ├── build.sh ├── build ├── .gitignore ├── custom.css ├── favicon.ico └── reveal.js │ └── .gitignore ├── config.py ├── data ├── 726505-04845-2009 └── weather-6m.csv ├── docker-compose.yml ├── environment-dev.yml ├── environment.yml ├── scripts └── combine-envs.sh └── sp500.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | *.jpg filter=lfs diff=lfs merge=lfs -text 2 | 3 | *.ipynb diff=jupyternotebook 4 | 5 | *.ipynb merge=jupyternotebook 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # OS X 7 | .DS_Store 8 | 9 | # intellij, vscode 10 | .idea/ 11 | .vscode/ 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | log.txt 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # IPython Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | *.html 99 | reveal.js 100 | !build.sh 101 | -------------------------------------------------------------------------------- /01-intro-to-pandas-part-1-answers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "tags": [ 19 | "setup" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "tags": [ 32 | "exercise" 33 | ] 34 | }, 35 | "source": [ 36 | "**Exercise:**\n", 37 | "\n", 38 | "* Create a `Series` and assign it to `s1a` from the integers 8, 6, 7, 5\n", 39 | "* From a `dict`, create an integer `Series` and assign it to `s1b` with values 8, 6, 7, 5 named `s1b` and a string index 'a', 'b', 'c', and 'd'\n", 40 | "* Convert the `s1b` values to 64-bit floating point values and assign it to `s1c`\n", 41 | "* Extract only the values from the `s1c` as a `PandasArray`" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "s1a = pd.Series([8, 6, 7, 5])" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "s1b = pd.Series({'a': 8, 'b': 6, 'c': 7, 'd': 5})" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "s1c = s1b.astype(np.float64)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 5, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "\n", 80 | "[8.0, 6.0, 7.0, 5.0]\n", 81 | "Length: 4, dtype: float64" 82 | ] 83 | }, 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "s1c.array # s1c.values for pandas < 0.24.0" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "array([8., 6., 7., 5.])" 102 | ] 103 | }, 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "s1c.to_numpy() # to convert to a `numpy.ndarray`" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | 
"metadata": { 116 | "tags": [ 117 | "exercise" 118 | ] 119 | }, 120 | "source": [ 121 | "**Exercise:**\n", 122 | "\n", 123 | "* Select the first element of `s2` using integer based lookup (`iloc`).\n", 124 | "* Select the first element of the `s2` using label indexing (`loc`).\n", 125 | "* Select all elements greater than 6 in `s2` using a boolean `Series`" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": { 132 | "tags": [ 133 | "exercise" 134 | ] 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "s2 = pd.Series([6, 8, 7, 5], index=list('abcd'), dtype='Int64')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "6" 150 | ] 151 | }, 152 | "execution_count": 8, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "s2.iloc[0] # also s2.iat[0]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 9, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "6" 170 | ] 171 | }, 172 | "execution_count": 9, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "s2.loc['a']" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 10, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "b 8\n", 190 | "c 7\n", 191 | "dtype: Int64" 192 | ] 193 | }, 194 | "execution_count": 10, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "s2[s2 > 6]" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "tags": [ 207 | "exercise" 208 | ] 209 | }, 210 | "source": [ 211 | "**Exercise**\n", 212 | "\n", 213 | "* Select all non-NaN values in `s3`\n", 214 | "* What will the result of adding `s2` and `s3` together be? 
\n", 215 | " Figure it out on paper then check in the notebook with `s2 + s3`" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 11, 221 | "metadata": { 222 | "tags": [ 223 | "exercise" 224 | ] 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "s3 = pd.Series([9., 100., np.nan], index=list('ayz'), dtype='Int64')" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "a 9\n", 240 | "y 100\n", 241 | "dtype: Int64" 242 | ] 243 | }, 244 | "execution_count": 12, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "s3[s3.notnull()]" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 13, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/plain": [ 261 | "a 15\n", 262 | "b \n", 263 | "c \n", 264 | "d \n", 265 | "y \n", 266 | "z \n", 267 | "dtype: Int64" 268 | ] 269 | }, 270 | "execution_count": 13, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "s2 + s3" 277 | ] 278 | } 279 | ], 280 | "metadata": { 281 | "celltoolbar": "Tags", 282 | "kernelspec": { 283 | "display_name": "Python 3", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.7.7" 298 | }, 299 | "toc": { 300 | "base_numbering": 1, 301 | "nav_menu": {}, 302 | "number_sections": false, 303 | "sideBar": true, 304 | "skip_h1_title": true, 305 | "title_cell": "Table of Contents", 306 | "title_sidebar": "Contents", 307 | "toc_cell": false, 308 | "toc_position": {}, 309 | "toc_section_display": true, 310 | "toc_window_display": false 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 4 315 | } 316 | -------------------------------------------------------------------------------- /01-intro-to-pandas-part-1-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 
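A short sketch of the alignment rule behind the `s2 + s3` result shown above: arithmetic between two `Series` aligns on index labels first, so any label present in only one operand yields a missing value. With the same `s2` and `s3`, `Series.add` with `fill_value` makes the union alignment explicit:

import pandas as pd

s2 = pd.Series([6, 8, 7, 5], index=list('abcd'), dtype='Int64')
s3 = pd.Series([9, 100, pd.NA], index=list('ayz'), dtype='Int64')

s2 + s3                    # only 'a' overlaps: 6 + 9 = 15; every other label is <NA>
s2.add(s3, fill_value=0)   # absent labels count as 0; 'z' stays <NA> because its value is missing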
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "tags": [ 19 | "setup" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "tags": [ 32 | "exercise" 33 | ] 34 | }, 35 | "source": [ 36 | "**Exercise:**\n", 37 | "\n", 38 | "* Create a `Series` and assign it to `s1a` from the integers 8, 6, 7, 5\n", 39 | "* From a `dict`, create an integer `Series` and assign it to `s1b` with values 8, 6, 7, 5 named `s1b` and a string index 'a', 'b', 'c', and 'd'\n", 40 | "* Convert the `s1b` values to 64-bit floating point values and assign it to `s1c`\n", 41 | "* Extract only the values from the `s1c` as a `PandasArray`" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "tags": [ 48 | "exercise" 49 | ] 50 | }, 51 | "source": [ 52 | "**Exercise:**\n", 53 | "\n", 54 | "* Select the first element of `s2` using integer based lookup (`iloc`).\n", 55 | "* Select the first element of the `s2` using label indexing (`loc`).\n", 56 | "* Select all elements greater than 6 in `s2` using a boolean `Series`" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "tags": [ 64 | "exercise" 65 | ] 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "s2 = pd.Series([6, 8, 7, 5], index=list('abcd'), dtype='Int64')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "tags": [ 76 | "exercise" 77 | ] 78 | }, 79 | "source": [ 80 | "**Exercise**\n", 81 | "\n", 82 | "* Select all non-NaN values in `s3`\n", 83 | "* What will the result of adding `s2` and `s3` together be? \n", 84 | " Figure it out on paper then check in the notebook with `s2 + s3`" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "tags": [ 92 | "exercise" 93 | ] 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "s3 = pd.Series([9., 100., np.nan], index=list('ayz'), dtype='Int64')" 98 | ] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.7.7" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 4 122 | } 123 | -------------------------------------------------------------------------------- /02-intro-to-pandas-part-2-answers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "notes" 8 | }, 9 | "tags": [ 10 | "setup" 11 | ] 12 | }, 13 | "source": [ 14 | "(c) 2016 - present. Enplus Advisors, Inc." 
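Ahead of the column-selection exercises below, one caveat on attribute lookup: `df.open` works only because `open` is a valid Python identifier that does not shadow an existing `DataFrame` attribute. A hypothetical column named `count` shows the collision; dict-style access is the safe general form:

df_demo = pd.DataFrame({'count': [1, 2]})  # df_demo and its 'count' column are illustrative only
df_demo.count     # resolves to the DataFrame.count method, not the column
df_demo['count']  # unambiguously the column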
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "slideshow": { 22 | "slide_type": "skip" 23 | }, 24 | "tags": [ 25 | "setup" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "slideshow": { 39 | "slide_type": "slide" 40 | }, 41 | "tags": [ 42 | "setup" 43 | ] 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.DataFrame({\n", 48 | " 'ticker': ['AAPL', 'AAPL', 'MSFT', 'IBM', 'YHOO'],\n", 49 | " 'date': ['2015-12-30', '2015-12-31', '2015-12-30', '2015-12-30', '2015-12-30'],\n", 50 | " 'open': [426.23, 427.81, 42.3, 101.65, 35.53]\n", 51 | "})" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "tags": [ 58 | "exercise" 59 | ] 60 | }, 61 | "source": [ 62 | "**Exercise:**\n", 63 | "\n", 64 | "* Select the `open` column as a `Series` using attribute lookup\n", 65 | "* Select the `open` column as a `Series` using `dict`-style lookup\n", 66 | "* Select the `date` column as a `DataFrame`" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "res1a = df.open\n", 76 | "res1b = df['open']\n", 77 | "res1c = df[['open']]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "tags": [ 84 | "exercise" 85 | ] 86 | }, 87 | "source": [ 88 | "**Exercise:**\n", 89 | "\n", 90 | "* Select all rows with the `AAPL` ticker and the `date` and `open`\n", 91 | " columns.\n", 92 | "* Assign to the variable `df1` a new `DataFrame` with `ticker` as\n", 93 | " the index.\n", 94 | "* Assign to the variable `df2` a new `DataFrame` with `date` as\n", 95 | " the index. Create this `DataFrame` from `df1` with a single\n", 96 | " statement.\n", 97 | "* Sort `df2` by the index values." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "res2a = df.loc[df.ticker == 'AAPL', ['date', 'open']]\n", 107 | "df1 = df.set_index('ticker')\n", 108 | "df2 = df1.reset_index().set_index('date')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "df2_sorted = df2.sort_index()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "tags": [ 124 | "exercise" 125 | ] 126 | }, 127 | "source": [ 128 | "**Exercise:**\n", 129 | "\n", 130 | "* Create a copy of `df` called `df3`. Add a new column of `NaNs` \n", 131 | " to `df3` called `close`. Assign `close` the same value as `open`\n", 132 | " for all `open` values greater than 100.\n", 133 | "* Sort `df3` by its `close` values." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 6, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/html": [ 144 | "
[file lines 145-206: a rendered DataFrame HTML table whose markup was stripped to unreadable fragments during extraction; the equivalent text/plain output follows]
" 207 | ], 208 | "text/plain": [ 209 | " ticker date open close\n", 210 | "0 AAPL 2015-12-30 426.23 426.23\n", 211 | "1 AAPL 2015-12-31 427.81 427.81\n", 212 | "2 MSFT 2015-12-30 42.30 NaN\n", 213 | "3 IBM 2015-12-30 101.65 101.65\n", 214 | "4 YHOO 2015-12-30 35.53 NaN" 215 | ] 216 | }, 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "df3 = df.copy()\n", 224 | "\n", 225 | "# this could be skipped from a functional standpoint, though\n", 226 | "# the instructions say to do it\n", 227 | "df3['close'] = np.nan \n", 228 | "\n", 229 | "gt100 = df3.open[df3.open > 100]\n", 230 | "df3.close = gt100 # you can use dot syntax b/c `close` already exists\n", 231 | "df3" 232 | ] 233 | } 234 | ], 235 | "metadata": { 236 | "celltoolbar": "Tags", 237 | "kernelspec": { 238 | "display_name": "Python 3", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.7.3" 253 | }, 254 | "toc": { 255 | "base_numbering": 1, 256 | "nav_menu": {}, 257 | "number_sections": false, 258 | "sideBar": false, 259 | "skip_h1_title": true, 260 | "title_cell": "Table of Contents", 261 | "title_sidebar": "Contents", 262 | "toc_cell": false, 263 | "toc_position": {}, 264 | "toc_section_display": false, 265 | "toc_window_display": false 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 1 270 | } 271 | -------------------------------------------------------------------------------- /02-intro-to-pandas-part-2-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "notes" 8 | }, 9 | "tags": [ 10 | "setup" 11 | ] 12 | }, 13 | "source": [ 14 | "(c) 2016 - present. Enplus Advisors, Inc." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "slideshow": { 22 | "slide_type": "skip" 23 | }, 24 | "tags": [ 25 | "setup" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "slideshow": { 39 | "slide_type": "slide" 40 | }, 41 | "tags": [ 42 | "setup" 43 | ] 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.DataFrame({\n", 48 | " 'ticker': ['AAPL', 'AAPL', 'MSFT', 'IBM', 'YHOO'],\n", 49 | " 'date': ['2015-12-30', '2015-12-31', '2015-12-30', '2015-12-30', '2015-12-30'],\n", 50 | " 'open': [426.23, 427.81, 42.3, 101.65, 35.53]\n", 51 | "})" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "tags": [ 58 | "exercise" 59 | ] 60 | }, 61 | "source": [ 62 | "**Exercise:**\n", 63 | "\n", 64 | "* Select the `open` column as a `Series` using attribute lookup\n", 65 | "* Select the `open` column as a `Series` using `dict`-style lookup\n", 66 | "* Select the `date` column as a `DataFrame`" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "tags": [ 73 | "exercise" 74 | ] 75 | }, 76 | "source": [ 77 | "**Exercise:**\n", 78 | "\n", 79 | "* Select all rows with the `AAPL` ticker and the `date` and `open`\n", 80 | " columns.\n", 81 | "* Assign to the variable `df1` a new `DataFrame` with `ticker` as\n", 82 | " the index.\n", 83 | "* Assign to the variable `df2` a new `DataFrame` with `date` as\n", 84 | " the index. Create this `DataFrame` from `df1` with a single\n", 85 | " statement.\n", 86 | "* Sort `df2` by the index values." 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "tags": [ 93 | "exercise" 94 | ] 95 | }, 96 | "source": [ 97 | "**Exercise:**\n", 98 | "\n", 99 | "* Create a copy of `df` called `df3`. Add a new column of `NaNs` \n", 100 | " to `df3` called `close`. Assign `close` the same value as `open`\n", 101 | " for all `open` values greater than 100.\n", 102 | "* Sort `df3` by its `close` values." 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.7.3" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 1 127 | } 128 | -------------------------------------------------------------------------------- /03-group-apply-answers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 
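Looking back at the `close` exercise above: the mask-then-assign answer can be collapsed into a single `.loc` statement, and the sorting step the exercise asks for is worth spelling out. A minimal sketch, assuming the same `df` as in the setup cell:

df3 = df.copy()
df3['close'] = np.nan
df3.loc[df3.open > 100, 'close'] = df3.open  # assign only where the condition holds
df3_sorted = df3.sort_values('close')        # NaNs sort last by default (na_position='last')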
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "tags": [ 19 | "setup" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "pd.set_option('display.float_format', '{:,.1f}'.format)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "tags": [ 35 | "setup" 36 | ] 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "dat = pd.read_csv('data/weather-6m.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "tags": [ 47 | "exercise" 48 | ] 49 | }, 50 | "source": [ 51 | "**Exercise:**\n", 52 | "\n", 53 | "Calculate the average `air_temp` by `month`." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "grp = dat.groupby('month')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": { 69 | "scrolled": false 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "month\n", 76 | "1 -10.0\n", 77 | "2 -3.0\n", 78 | "3 2.1\n", 79 | "4 7.0\n", 80 | "5 14.0\n", 81 | "6 18.1\n", 82 | "Name: air_temp, dtype: float64" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "grp['air_temp'].mean()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "tags": [ 98 | "exercise" 99 | ] 100 | }, 101 | "source": [ 102 | "**Exercise:**\n", 103 | "\n", 104 | "Compute summary statistics on `air_temp` and `dew_point` using \n", 105 | "the `describe` method." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": { 112 | "scrolled": true 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "
[file lines 119-299: the rendered describe() HTML table, markup stripped to unreadable fragments during extraction; the equivalent text/plain output follows]
" 300 | ], 301 | "text/plain": [ 302 | " air_temp dew_point \\\n", 303 | " count mean std min 25% 50% 75% max count mean std \n", 304 | "month \n", 305 | "1 712.0 -10.0 6.2 -29.4 -13.3 -10.0 -5.6 2.8 712.0 -14.1 6.8 \n", 306 | "2 644.0 -3.0 6.8 -19.4 -7.2 -2.2 1.7 15.0 644.0 -7.3 7.3 \n", 307 | "3 713.0 2.1 6.7 -13.3 -1.7 2.2 5.6 22.8 713.0 -3.4 6.1 \n", 308 | "4 691.0 7.0 6.0 -2.8 2.8 5.6 9.4 28.3 691.0 0.3 6.1 \n", 309 | "5 713.0 14.0 5.1 1.1 10.6 13.9 17.2 28.3 713.0 6.2 4.6 \n", 310 | "6 688.0 18.1 6.0 3.3 13.8 17.8 22.8 33.3 688.0 12.3 5.5 \n", 311 | "\n", 312 | " \n", 313 | " min 25% 50% 75% max \n", 314 | "month \n", 315 | "1 -32.8 -18.3 -13.9 -8.9 1.0 \n", 316 | "2 -22.8 -12.2 -7.2 -2.2 8.3 \n", 317 | "3 -17.2 -7.8 -2.8 0.6 13.3 \n", 318 | "4 -13.3 -3.9 -1.1 3.9 16.7 \n", 319 | "5 -6.1 2.8 6.7 10.0 18.3 \n", 320 | "6 -3.3 8.9 11.7 17.2 23.3 " 321 | ] 322 | }, 323 | "execution_count": 5, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "grp[['air_temp', 'dew_point']].describe()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": { 335 | "tags": [ 336 | "exercise" 337 | ] 338 | }, 339 | "source": [ 340 | "**Exercise:**\n", 341 | "\n", 342 | "For January and February and 0 - 11 hours, calculate the average and standard deviation of `air_temp` grouping by month and hour of the day. Name your result columns `air_temp_mean` and `air_temp_sd`.\n", 343 | "\n", 344 | "Your result `DataFrame` should have 24 rows, the number of months (2) times the number of hours (12). \n", 345 | "\n", 346 | "$2 * 12 = 24$\n", 347 | "\n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 6, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "idx = dat.month.isin([1, 2]) & (dat.hour < 12)\n", 357 | "grp2 = dat[idx].groupby(['month', 'hour'])\n", 358 | "hourly_temp = grp2.agg(\n", 359 | " air_temp_mean=('air_temp', 'mean'),\n", 360 | " air_temp_sd=('air_temp', 'std')\n", 361 | ")" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "tags": [ 368 | "exercise" 369 | ] 370 | }, 371 | "source": [ 372 | "**Exercise:**\n", 373 | "\n", 374 | "By month, calculate quantiles for `air_temp` using the quantiles defined in `breaks`. \n", 375 | "\n", 376 | "Hint: Use the `quantile` method defined on a `Series` (`pd.Series.quantile`).\n" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 7, 382 | "metadata": { 383 | "tags": [ 384 | "exercise" 385 | ] 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "breaks = [0.01, 0.25, 0.5, 0.75, 0.99]" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 8, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/html": [ 400 | "
[file lines 401-484: the rendered quantile-by-month HTML table, markup stripped to unreadable fragments during extraction; the equivalent text/plain output follows]
" 485 | ], 486 | "text/plain": [ 487 | "air_temp 0.0 0.2 0.5 0.8 1.0\n", 488 | "month \n", 489 | "1 -25.0 -13.3 -10.0 -5.6 1.1\n", 490 | "2 -18.3 -7.2 -2.2 1.7 12.8\n", 491 | "3 -11.0 -1.7 2.2 5.6 19.3\n", 492 | "4 -2.2 2.8 5.6 9.4 23.4\n", 493 | "5 2.8 10.6 13.9 17.2 27.8\n", 494 | "6 6.1 13.8 17.8 22.8 32.2" 495 | ] 496 | }, 497 | "execution_count": 8, 498 | "metadata": {}, 499 | "output_type": "execute_result" 500 | } 501 | ], 502 | "source": [ 503 | "grp3 = dat.groupby('month')\n", 504 | "grp3.apply(lambda x: x.air_temp.quantile(breaks))" 505 | ] 506 | } 507 | ], 508 | "metadata": { 509 | "celltoolbar": "Tags", 510 | "kernelspec": { 511 | "display_name": "Python 3", 512 | "language": "python", 513 | "name": "python3" 514 | }, 515 | "language_info": { 516 | "codemirror_mode": { 517 | "name": "ipython", 518 | "version": 3 519 | }, 520 | "file_extension": ".py", 521 | "mimetype": "text/x-python", 522 | "name": "python", 523 | "nbconvert_exporter": "python", 524 | "pygments_lexer": "ipython3", 525 | "version": "3.7.7" 526 | }, 527 | "toc": { 528 | "base_numbering": 1, 529 | "nav_menu": {}, 530 | "number_sections": false, 531 | "sideBar": true, 532 | "skip_h1_title": true, 533 | "title_cell": "Table of Contents", 534 | "title_sidebar": "Contents", 535 | "toc_cell": false, 536 | "toc_position": {}, 537 | "toc_section_display": true, 538 | "toc_window_display": false 539 | } 540 | }, 541 | "nbformat": 4, 542 | "nbformat_minor": 1 543 | } 544 | -------------------------------------------------------------------------------- /03-group-apply-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "tags": [ 19 | "setup" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "pd.set_option('display.float_format', '{:,.1f}'.format)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "tags": [ 35 | "setup" 36 | ] 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "dat = pd.read_csv('data/weather-6m.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "tags": [ 47 | "exercise" 48 | ] 49 | }, 50 | "source": [ 51 | "**Exercise:**\n", 52 | "\n", 53 | "Calculate the average `air_temp` by `month`." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": { 59 | "tags": [ 60 | "exercise" 61 | ] 62 | }, 63 | "source": [ 64 | "**Exercise:**\n", 65 | "\n", 66 | "Compute summary statistics on `air_temp` and `dew_point` using \n", 67 | "the `describe` method." 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "tags": [ 74 | "exercise" 75 | ] 76 | }, 77 | "source": [ 78 | "**Exercise:**\n", 79 | "\n", 80 | "For January and February and 0 - 11 hours, calculate the average and standard deviation of `air_temp` grouping by month and hour of the day. Name your result columns `air_temp_mean` and `air_temp_sd`.\n", 81 | "\n", 82 | "Your result `DataFrame` should have 24 rows, the number of months (2) times the number of hours (12). 
\n", 83 | "\n", 84 | "$2 * 12 = 24$\n", 85 | "\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "tags": [ 92 | "exercise" 93 | ] 94 | }, 95 | "source": [ 96 | "**Exercise:**\n", 97 | "\n", 98 | "By month, calculate quantiles for `air_temp` using the quantiles defined in `breaks`. \n", 99 | "\n", 100 | "Hint: Use the `quantile` method defined on a `Series` (`pd.Series.quantile`).\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "tags": [ 108 | "exercise" 109 | ] 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "breaks = [0.01, 0.25, 0.5, 0.75, 0.99]" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.7.7" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 1 138 | } 139 | -------------------------------------------------------------------------------- /04-time-series-answers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "tags": [ 18 | "setup" 19 | ] 20 | }, 21 | "source": [ 22 | "This module uses:\n", 23 | "* SP500 returns" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "tags": [ 31 | "setup" 32 | ] 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import datetime as dt\n", 37 | "\n", 38 | "import numpy as np\n", 39 | "import pandas as pd\n", 40 | "\n", 41 | "pd.set_option('display.precision', 2)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": { 48 | "tags": [ 49 | "setup" 50 | ] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "sp5_df = pd.read_csv(\n", 55 | " 'sp500.csv', usecols=['date', 'adj_close'], \n", 56 | " parse_dates=['date'])" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "tags": [ 63 | "exercise" 64 | ] 65 | }, 66 | "source": [ 67 | "**Exercise:**\n", 68 | "\n", 69 | "Create a `pandas` Timestamp for January 1st, 1993 16:00 (don't worry about timezone)." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "Timestamp('1993-01-01 16:00:00')" 81 | ] 82 | }, 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "pd.Timestamp('1993-01-01 16:00') # __" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "tags": [ 96 | "exercise" 97 | ] 98 | }, 99 | "source": [ 100 | "**Exercise:**\n", 101 | "\n", 102 | "Generate a an Index of:\n", 103 | "* 5 calendar days starting on January 1, 2010.\n", 104 | "* All US business days (weekdays) starting on January 1, 2010\n", 105 | " and ending on January 15, 2010.\n", 106 | " \n", 107 | "__Hint:__ You can view the help for a function by running `help(function_name)`, e.g. `help(pd.Timestamp)`. 
Try looking at the help for `pd.date_range`." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',\n", 119 | " '2010-01-05'],\n", 120 | " dtype='datetime64[ns]', freq='D')" 121 | ] 122 | }, 123 | "execution_count": 4, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "pd.date_range(start='2010-01-01', periods=5, freq='D') # __" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "DatetimeIndex(['2010-01-01', '2010-01-04', '2010-01-05', '2010-01-06',\n", 141 | " '2010-01-07', '2010-01-08', '2010-01-11', '2010-01-12',\n", 142 | " '2010-01-13', '2010-01-14', '2010-01-15'],\n", 143 | " dtype='datetime64[ns]', freq='B')" 144 | ] 145 | }, 146 | "execution_count": 5, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "pd.date_range(start='2010-01-01', end='2010-01-15', freq='B') # __" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "tags": [ 159 | "exercise" 160 | ] 161 | }, 162 | "source": [ 163 | "**Exercise:**\n", 164 | "\n", 165 | "Create a Series named `sp5` from the `adj_close` column `sp5_df`, using `date` as the\n", 166 | "index. Make sure you call `sort_index()` to make sure the index is sorted.\n", 167 | "\n", 168 | "__Hint:__ The first two parameters of `pd.Series` are `data` and `index`. When both `data` and `index` are `Series`, the `index` of `data` is aligned against the values in `Series`. You can always force positional alignment by converting a `Series` to an `PandasArray` (`pd.Series.array`)\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 6, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "sp5 = pd.Series( # __\n", 178 | " sp5_df.adj_close.array, index=sp5_df.date, \n", 179 | " name='adj_close').sort_index()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "tags": [ 186 | "exercise" 187 | ] 188 | }, 189 | "source": [ 190 | "**Exercise:**\n", 191 | "\n", 192 | "Write 2 different ways to select January 3, 1995 from the `sp5` series. 
\n", 193 | "\n", 194 | "_There are more than 2 ways to do this, but you only need 2!_" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 7, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "d1a = sp5['19950103'] # __\n", 204 | "d1b = sp5['1995-01-03'] # __\n", 205 | "d1c = sp5[dt.datetime(1995, 1, 3)] # __" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "tags": [ 212 | "exercise" 213 | ] 214 | }, 215 | "source": [ 216 | "**Exercise:**\n", 217 | "\n", 218 | "Select from `sp5` all observations for:\n", 219 | "* March 1995\n", 220 | "* Year of 1995" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 8, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "mar_95 = sp5['1995-03'] # __" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 9, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "y_95 = sp5['1995'] # __" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "tags": [ 245 | "exercise" 246 | ] 247 | }, 248 | "source": [ 249 | "**Exercise**\n", 250 | "\n", 251 | "For `sp5`:\n", 252 | "\n", 253 | "Calculate the day-over-day percent change in the values and to assign the result to the variable `sp5_rtn`.\n", 254 | "\n", 255 | "Hint: Use `shift`" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 10, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "sp5_rtn = sp5 / sp5.shift(1) - 1 # __" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 11, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "# alternative solution\n", 274 | "# sp5.pct_change()" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "tags": [ 281 | "exercise" 282 | ] 283 | }, 284 | "source": [ 285 | "**Exercise**\n", 286 | "\n", 287 | "Resample the data from daily to monthly to calculate average day-over-day percent change." 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 12, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "rtn_mnth = sp5_rtn.resample('M').mean() # __" 297 | ] 298 | } 299 | ], 300 | "metadata": { 301 | "celltoolbar": "Tags", 302 | "kernelspec": { 303 | "display_name": "Python 3 (ipykernel)", 304 | "language": "python", 305 | "name": "python3" 306 | }, 307 | "language_info": { 308 | "codemirror_mode": { 309 | "name": "ipython", 310 | "version": 3 311 | }, 312 | "file_extension": ".py", 313 | "mimetype": "text/x-python", 314 | "name": "python", 315 | "nbconvert_exporter": "python", 316 | "pygments_lexer": "ipython3", 317 | "version": "3.10.8" 318 | }, 319 | "toc": { 320 | "base_numbering": 1, 321 | "nav_menu": {}, 322 | "number_sections": false, 323 | "sideBar": true, 324 | "skip_h1_title": true, 325 | "title_cell": "Table of Contents", 326 | "title_sidebar": "Contents", 327 | "toc_cell": false, 328 | "toc_position": {}, 329 | "toc_section_display": true, 330 | "toc_window_display": false 331 | } 332 | }, 333 | "nbformat": 4, 334 | "nbformat_minor": 4 335 | } 336 | -------------------------------------------------------------------------------- /04-time-series-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 
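A version note on the `resample('M')` answer above: from pandas 2.2 the month-end alias 'M' is deprecated in favor of 'ME', so on newer installs the monthly average reads as below (a sketch, assuming `sp5_rtn` from the previous cell):

rtn_mnth = sp5_rtn.resample('ME').mean()  # 'ME' = month end; keep 'M' on pandas < 2.2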
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "tags": [ 18 | "setup" 19 | ] 20 | }, 21 | "source": [ 22 | "This module uses:\n", 23 | "* SP500 returns" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "tags": [ 31 | "setup" 32 | ] 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import datetime as dt\n", 37 | "\n", 38 | "import numpy as np\n", 39 | "import pandas as pd\n", 40 | "\n", 41 | "pd.set_option('display.precision', 2)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "tags": [ 49 | "setup" 50 | ] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "sp5_df = pd.read_csv(\n", 55 | " 'sp500.csv', usecols=['date', 'adj_close'], \n", 56 | " parse_dates=['date'])" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "tags": [ 63 | "exercise" 64 | ] 65 | }, 66 | "source": [ 67 | "**Exercise:**\n", 68 | "\n", 69 | "Create a `pandas` Timestamp for January 1st, 1993 16:00 (don't worry about timezone)." 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "tags": [ 76 | "exercise" 77 | ] 78 | }, 79 | "source": [ 80 | "**Exercise:**\n", 81 | "\n", 82 | "Generate a an Index of:\n", 83 | "* 5 calendar days starting on January 1, 2010.\n", 84 | "* All US business days (weekdays) starting on January 1, 2010\n", 85 | " and ending on January 15, 2010.\n", 86 | " \n", 87 | "__Hint:__ You can view the help for a function by running `help(function_name)`, e.g. `help(pd.Timestamp)`. Try looking at the help for `pd.date_range`." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "tags": [ 94 | "exercise" 95 | ] 96 | }, 97 | "source": [ 98 | "**Exercise:**\n", 99 | "\n", 100 | "Create a Series named `sp5` from the `adj_close` column `sp5_df`, using `date` as the\n", 101 | "index. Make sure you call `sort_index()` to make sure the index is sorted.\n", 102 | "\n", 103 | "__Hint:__ The first two parameters of `pd.Series` are `data` and `index`. When both `data` and `index` are `Series`, the `index` of `data` is aligned against the values in `Series`. You can always force positional alignment by converting a `Series` to an `PandasArray` (`pd.Series.array`)\n" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "tags": [ 110 | "exercise" 111 | ] 112 | }, 113 | "source": [ 114 | "**Exercise:**\n", 115 | "\n", 116 | "Write 2 different ways to select January 3, 1995 from the `sp5` series. 
\n", 117 | "\n", 118 | "_There are more than 2 ways to do this, but you only need 2!_" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "tags": [ 125 | "exercise" 126 | ] 127 | }, 128 | "source": [ 129 | "**Exercise:**\n", 130 | "\n", 131 | "Select from `sp5` all observations for:\n", 132 | "* March 1995\n", 133 | "* Year of 1995" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "tags": [ 140 | "exercise" 141 | ] 142 | }, 143 | "source": [ 144 | "**Exercise**\n", 145 | "\n", 146 | "For `sp5`:\n", 147 | "\n", 148 | "Calculate the day-over-day percent change in the values and to assign the result to the variable `sp5_rtn`.\n", 149 | "\n", 150 | "Hint: Use `shift`" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "tags": [ 157 | "exercise" 158 | ] 159 | }, 160 | "source": [ 161 | "**Exercise**\n", 162 | "\n", 163 | "Resample the data from daily to monthly to calculate average day-over-day percent change." 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.10.8" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 4 188 | } 189 | -------------------------------------------------------------------------------- /05-merge-pivot-answers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 
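On the date-based selections restated above: partial string indexing is what makes `sp5['1995-03']` work, and recent pandas prefers routing label lookups through `.loc` rather than bare `[]`. A sketch, assuming `sp5` carries a sorted `DatetimeIndex` as built earlier:

sp5.loc['1995-03']                   # every observation in March 1995
sp5.loc['1995']                      # every observation in 1995
sp5.loc['1995-01-03':'1995-01-06']   # label slices include both endpoints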
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "tags": [ 19 | "setup" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import io\n", 25 | "\n", 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "\n", 29 | "pd.set_option('display.precision', 2)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "tags": [ 37 | "setup" 38 | ] 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "prc = pd.read_csv(\n", 43 | " io.StringIO('ticker,open,date,close\\nAAPL,426.23,2018-01-04,435.23\\nMSFT,42.3,2018-01-04,51.3\\nAAPL,436.23,2018-01-05,\\nMSFT,52.3,2018-01-05,\\n'),\n", 44 | " parse_dates=['date']\n", 45 | ")\n", 46 | "prc2 = prc.assign(\n", 47 | " date=pd.to_datetime('2018-01-06'),\n", 48 | " close=prc.open + np.random.randn(len(prc.open))\n", 49 | ").drop('open', axis=1)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "tags": [ 57 | "setup" 58 | ] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "volume = pd.DataFrame({\n", 63 | " 'ticker': ['AAPL', 'MSFT', 'IBM', 'YHOO', 'GOOG'],\n", 64 | " 'volume': [1954.73, 335.83, 362.79, 858.18, 629.79]\n", 65 | "}).assign(date=pd.to_datetime('2018-01-05'))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "prc" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "volume" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "tags": [ 90 | "exercise" 91 | ] 92 | }, 93 | "source": [ 94 | "**Exercise**\n", 95 | "\n", 96 | "Merge `prc` and `volume` on `ticker, date`:\n", 97 | "\n", 98 | "* Preserving only the records with common `ticker`s and `date`s\n", 99 | "* Preserving all the records in `prc`\n", 100 | "* Preserving the records in both `prc` and `volume`\n", 101 | "\n", 102 | "_All of these merges should be performed on `ticker` and `date`_" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "cols = ['ticker', 'date']\n", 112 | "pd.merge(prc, volume, on=cols)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "pd.merge(prc, volume, on=cols, how='left')" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "pd.merge(prc, volume, on=cols, how='outer')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "tags": [ 137 | "exercise" 138 | ] 139 | }, 140 | "source": [ 141 | "**Exercise:**\n", 142 | "\n", 143 | "Using `pd.concat`, concatenate the rows of `prc` and `prc2`, making\n", 144 | "a single call to `pd.concat` for each bulleted sub-exercise:\n", 145 | "\n", 146 | "* Make sure your result generates a new index like in the previous\n", 147 | " exercise\n", 148 | "* Only include the columns in both `prc` and `prc2` in the result,\n", 149 | " additionally generating a new index\n", 150 | "* Make your result include a `MultiIndex` with a value of `prc`\n", 151 | " or `prc2` to indicate which `DataFrame` provided the values" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": 
[ 160 | "pd.concat([prc, prc2], ignore_index=True)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "pd.concat([prc, prc2], join='inner', ignore_index=True)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "pd.concat([prc, prc2], keys=['prc', 'prc2'])" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "celltoolbar": "Tags", 184 | "kernelspec": { 185 | "display_name": "Python 3 (ipykernel)", 186 | "language": "python", 187 | "name": "python3" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 3 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": "text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython3", 199 | "version": "3.10.8" 200 | }, 201 | "toc": { 202 | "base_numbering": 1, 203 | "nav_menu": {}, 204 | "number_sections": false, 205 | "sideBar": true, 206 | "skip_h1_title": true, 207 | "title_cell": "Table of Contents", 208 | "title_sidebar": "Contents", 209 | "toc_cell": false, 210 | "toc_position": {}, 211 | "toc_section_display": true, 212 | "toc_window_display": false 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 4 217 | } 218 | -------------------------------------------------------------------------------- /05-merge-pivot-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "tags": [ 19 | "setup" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import io\n", 25 | "\n", 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "\n", 29 | "pd.set_option('display.precision', 2)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "tags": [ 37 | "setup" 38 | ] 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "prc = pd.read_csv(\n", 43 | " io.StringIO('ticker,open,date,close\\nAAPL,426.23,2018-01-04,435.23\\nMSFT,42.3,2018-01-04,51.3\\nAAPL,436.23,2018-01-05,\\nMSFT,52.3,2018-01-05,\\n'),\n", 44 | " parse_dates=['date']\n", 45 | ")\n", 46 | "prc2 = prc.assign(\n", 47 | " date=pd.to_datetime('2018-01-06'),\n", 48 | " close=prc.open + np.random.randn(len(prc.open))\n", 49 | ").drop('open', axis=1)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "tags": [ 57 | "setup" 58 | ] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "volume = pd.DataFrame({\n", 63 | " 'ticker': ['AAPL', 'MSFT', 'IBM', 'YHOO', 'GOOG'],\n", 64 | " 'volume': [1954.73, 335.83, 362.79, 858.18, 629.79]\n", 65 | "}).assign(date=pd.to_datetime('2018-01-05'))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "tags": [ 72 | "exercise" 73 | ] 74 | }, 75 | "source": [ 76 | "**Exercise**\n", 77 | "\n", 78 | "Merge `prc` and `volume` on `ticker, date`:\n", 79 | "\n", 80 | "* Preserving only the records with common `ticker`s and `date`s\n", 81 | "* Preserving all the records in `prc`\n", 82 | "* Preserving the records in both `prc` and `volume`\n", 83 | "\n", 84 | "_All of these merges should be performed on `ticker` and `date`_" 85 | ] 
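When verifying which records each kind of merge keeps, `indicator=True` is handy: it appends a `_merge` column recording whether a row came from the left frame, the right frame, or both. A sketch with the same `prc` and `volume`:

cols = ['ticker', 'date']
chk = pd.merge(prc, volume, on=cols, how='outer', indicator=True)
chk['_merge'].value_counts()  # counts of 'left_only', 'right_only', and 'both'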
86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "tags": [ 91 | "exercise" 92 | ] 93 | }, 94 | "source": [ 95 | "**Exercise:**\n", 96 | "\n", 97 | "Using `pd.concat`, concatenate the rows of `prc` and `prc2`, making\n", 98 | "a single call to `pd.concat` for each bulleted sub-exercise:\n", 99 | "\n", 100 | "* Make sure your result generates a new index like in the previous\n", 101 | " exercise\n", 102 | "* Only include the columns in both `prc` and `prc2` in the result,\n", 103 | " additionally generating a new index\n", 104 | "* Make your result include a `MultiIndex` with a value of `prc`\n", 105 | " or `prc2` to indicate which `DataFrame` provided the values" 106 | ] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3 (ipykernel)", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.10.8" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 4 130 | } 131 | -------------------------------------------------------------------------------- /06-advanced-merge-reshape-answers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "tags": [ 19 | "setup" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "pd.set_option('display.precision', 2)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "tags": [ 34 | "setup" 35 | ] 36 | }, 37 | "source": [ 38 | "**Data**\n", 39 | "\n", 40 | "* `sp5_jan` is SP500 market close prices and trading volume for\n", 41 | " January 2015.\n", 42 | "* `sales` is weekly sales data for Acme Widgets Co. for January\n", 43 | " 2015 in thousands of widgets sold and \\$ millions in revenue" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "tags": [ 51 | "setup" 52 | ] 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "np.random.seed(100)\n", 57 | "\n", 58 | "sp5 = pd.read_csv(\n", 59 | " 'sp500.csv', parse_dates=['date'], index_col=['date'],\n", 60 | " usecols=['date', 'close', 'volume'])\\\n", 61 | " .sort_index()\n", 62 | "\n", 63 | "sp5_jan = sp5.loc['2015-01', :].copy()\n", 64 | "sp5_jan['volume'] = sp5_jan['volume'] / 1e6\n", 65 | "sales = pd.DataFrame({\n", 66 | " 'date': pd.date_range('2015-01-01', '2015-01-31', freq='W'),\n", 67 | "})\n", 68 | "sales['widgets_sold'] = abs(10 * np.random.randn(sales.shape[0])).round()\n", 69 | "sales['revenue'] = sales.widgets_sold * 20" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "tags": [ 76 | "exercise" 77 | ] 78 | }, 79 | "source": [ 80 | "**Exercise:**\n", 81 | "\n", 82 | "Merge `sp5_jan` with `sales`, filling sales data forward. Save\n", 83 | "the result as `res_1`. Your result should have the same number of records\n", 84 | "as `sp5_jan`." 
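Besides the reindex-and-ffill answer that follows, `pd.merge_asof` expresses the same carry-the-latest-observation-forward join directly; both inputs must be sorted on the key, and each left row matches the most recent right row at or before it. A sketch, assuming `sp5_jan` and `sales` from the setup cell:

res_asof = pd.merge_asof(
    sp5_jan.reset_index(),  # merge_asof wants 'date' as a sorted column, not an index
    sales,                  # already sorted by 'date'
    on='date',
    direction='backward',   # the default: latest sales row at or before each trading day
).set_index('date')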
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "sales_jan = sales.set_index(\"date\").reindex(sp5_jan.index, method=\"ffill\")\n", 94 | "res_1 = sp5_jan.join(sales_jan)\n", 95 | "\n", 96 | "assert res_1.shape[0] == sp5_jan.shape[0]" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "tags": [ 103 | "exercise" 104 | ] 105 | }, 106 | "source": [ 107 | "**Exercise:**\n", 108 | "\n", 109 | "Convert the output from the previous exercise to long format with\n", 110 | "`date` as the ID variable, saving the result as `res_2`" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "res_2 = pd.melt(res_1.reset_index(), id_vars='date')\n", 120 | "res_2.head()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "tags": [ 127 | "exercise" 128 | ] 129 | }, 130 | "source": [ 131 | "**Exercise**\n", 132 | "\n", 133 | "Convert `res_2` back to wide format using the `pivot` method." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "res_3a = res_2.pivot(index='date', columns='variable')\n", 143 | "res_3a.head()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "tags": [ 150 | "exercise" 151 | ] 152 | }, 153 | "source": [ 154 | "**Exercise**\n", 155 | "\n", 156 | "Convert `res_2` back to wide format using the `unstack` method." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "res_3b = res_2.set_index(['date', 'variable']).unstack()\n", 166 | "res_3b.head()" 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "celltoolbar": "Tags", 172 | "kernelspec": { 173 | "display_name": "Python 3 (ipykernel)", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.10.8" 188 | }, 189 | "toc": { 190 | "base_numbering": 1, 191 | "nav_menu": {}, 192 | "number_sections": false, 193 | "sideBar": true, 194 | "skip_h1_title": true, 195 | "title_cell": "Table of Contents", 196 | "title_sidebar": "Contents", 197 | "toc_cell": false, 198 | "toc_position": {}, 199 | "toc_section_display": true, 200 | "toc_window_display": false 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 4 205 | } 206 | -------------------------------------------------------------------------------- /06-advanced-merge-reshape-exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "tags": [ 7 | "setup" 8 | ] 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "tags": [ 19 | "setup" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "pd.set_option('display.precision', 2)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "tags": [ 34 | "setup" 35 | ] 36 | }, 37 | "source": [ 38 | "**Data**\n", 39 | "\n", 40 | "* `sp5_jan` is SP500 market close prices and trading volume for\n", 41 | " January 2015.\n", 42 | "* `sales` is weekly sales data for Acme Widgets Co. for January\n", 43 | " 2015 in thousands of widgets sold and \\$ millions in revenue" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "tags": [ 51 | "setup" 52 | ] 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "np.random.seed(100)\n", 57 | "\n", 58 | "sp5 = pd.read_csv(\n", 59 | " 'sp500.csv', parse_dates=['date'], index_col=['date'],\n", 60 | " usecols=['date', 'close', 'volume'])\\\n", 61 | " .sort_index()\n", 62 | "\n", 63 | "sp5_jan = sp5.loc['2015-01', :].copy()\n", 64 | "sp5_jan['volume'] = sp5_jan['volume'] / 1e6\n", 65 | "sales = pd.DataFrame({\n", 66 | " 'date': pd.date_range('2015-01-01', '2015-01-31', freq='W'),\n", 67 | "})\n", 68 | "sales['widgets_sold'] = abs(10 * np.random.randn(sales.shape[0])).round()\n", 69 | "sales['revenue'] = sales.widgets_sold * 20" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "tags": [ 76 | "exercise" 77 | ] 78 | }, 79 | "source": [ 80 | "**Exercise:**\n", 81 | "\n", 82 | "Merge `sp5_jan` with `sales`, filling sales data forward. Save\n", 83 | "the result as `res_1`. Your result should have the same number of records\n", 84 | "as `sp5_jan`." 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "tags": [ 91 | "exercise" 92 | ] 93 | }, 94 | "source": [ 95 | "**Exercise:**\n", 96 | "\n", 97 | "Convert the output from the previous exercise to long format with\n", 98 | "`date` as the ID variable, saving the result as `res_2`" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "tags": [ 105 | "exercise" 106 | ] 107 | }, 108 | "source": [ 109 | "**Exercise**\n", 110 | "\n", 111 | "Convert `res_2` back to wide format using the `pivot` method." 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": { 117 | "tags": [ 118 | "exercise" 119 | ] 120 | }, 121 | "source": [ 122 | "**Exercise**\n", 123 | "\n", 124 | "Convert `res_2` back to wide format using the `unstack` method." 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3 (ipykernel)", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.10.8" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 4 149 | } 150 | -------------------------------------------------------------------------------- /06-advanced-merge-reshape-slides.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "skip" 8 | } 9 | }, 10 | "source": [ 11 | "(c) 2016 - present. Enplus Advisors, Inc." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "slideshow": { 19 | "slide_type": "skip" 20 | } 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import datetime as dt\n", 25 | "\n", 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "\n", 29 | "pd.set_option('display.precision', 2)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "slideshow": { 37 | "slide_type": "skip" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "# Sample dataset w/ 5 days of data\n", 43 | "\n", 44 | "def make_dataset(n_days=4):\n", 45 | " if n_days not in (4, 5):\n", 46 | " raise ValueError(f\"n_days must be 4 or 5, got {n_days:d}.\")\n", 47 | " \n", 48 | " data = {\n", 49 | " 'date': ['2015-12-28', '2015-12-29', '2015-12-30', \n", 50 | " '2015-12-31', '2016-01-04'],\n", 51 | " 'goog': [762.51, 776.60, 771.00, 758.88, 741.84],\n", 52 | " 'aapl': [106.82, 108.74, 107.32, 105.26, 105.35]\n", 53 | " }\n", 54 | " \n", 55 | " n_max = len(data['date'])\n", 56 | " slice_ = slice(n_max - n_days, n_max)\n", 57 | " \n", 58 | " sub = {k: v[slice_] for k,v in data.items()}\n", 59 | " dates = sub['date']\n", 60 | " n = len(dates)\n", 61 | " \n", 62 | " # breakpoint()\n", 63 | " rv = pd.DataFrame({\n", 64 | " 'ticker': ['GOOG'] * n + ['AAPL'] * n,\n", 65 | " 'date': [pd.to_datetime(x) for x in dates] * 2,\n", 66 | " 'close': sub['goog'] + sub['aapl']\n", 67 | " })\n", 68 | " \n", 69 | " return rv" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": { 76 | "slideshow": { 77 | "slide_type": "skip" 78 | } 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "def make_long():\n", 83 | " day_1 = dt.date(2015, 12, 29)\n", 84 | " day_2 = dt.date(2015, 12, 30)\n", 85 | " \n", 86 | " col_close = 'close'\n", 87 | " col_open = 'open'\n", 88 | " \n", 89 | " cols = ['date', 'ticker', 'variable', 'value']\n", 90 | " \n", 91 | " rv = pd.DataFrame([\n", 92 | " {'ticker': 'GOOG', 'date': day_1, 'variable': col_close, 'value': 776.60},\n", 93 | " {'ticker': 'GOOG', 'date': day_2, 'variable': col_close, 'value': 771.00},\n", 94 | " {'ticker': 'AAPL', 'date': day_1, 'variable': col_open, 'value': 107.01},\n", 95 | " {'ticker': 'AAPL', 'date': day_1, 'variable': col_close, 'value': 105.26}\n", 96 | " ], columns=cols)\n", 97 | " \n", 98 | " return rv" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "slideshow": { 105 | "slide_type": "slide" 106 | } 107 | }, 108 | "source": [ 109 | "# Programming with Data:
Advanced Python and Pandas\n", 110 | "\n", 111 | "# Advanced Merging & Reshaping" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": { 117 | "slideshow": { 118 | "slide_type": "slide" 119 | } 120 | }, 121 | "source": [ 122 | "## Grouped and Ordered Data" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": { 128 | "slideshow": { 129 | "slide_type": "fragment" 130 | } 131 | }, 132 | "source": [ 133 | "We're working again with securities market data; daily stock prices are a\n", 134 | "common data type in quantitative finance." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "slide" 142 | } 143 | }, 144 | "source": [ 145 | "### Display the data" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 4, 151 | "metadata": { 152 | "slideshow": { 153 | "slide_type": "fragment" 154 | } 155 | }, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/html": [ 160 | "
" 235 | ], 236 | "text/plain": [ 237 | " ticker date close\n", 238 | "0 GOOG 2015-12-29 776.60\n", 239 | "1 GOOG 2015-12-30 771.00\n", 240 | "2 GOOG 2015-12-31 758.88\n", 241 | "3 GOOG 2016-01-04 741.84\n", 242 | "4 AAPL 2015-12-29 108.74\n", 243 | "5 AAPL 2015-12-30 107.32\n", 244 | "6 AAPL 2015-12-31 105.26\n", 245 | "7 AAPL 2016-01-04 105.35" 246 | ] 247 | }, 248 | "execution_count": 4, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "_dts = ['2015-12-29', '2015-12-30', '2015-12-31', '2016-01-04']\n", 255 | "_goog = [776.60, 771.00, 758.88, 741.84]\n", 256 | "_aapl = [108.74, 107.32, 105.26, 105.35]\n", 257 | "\n", 258 | "df = pd.DataFrame({\n", 259 | " 'ticker': ['GOOG'] * 4 + ['AAPL'] * 4,\n", 260 | " 'date': [pd.to_datetime(x) for x in _dts] * 2,\n", 261 | " 'close': _goog + _aapl\n", 262 | "})\n", 263 | "df\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "slideshow": { 270 | "slide_type": "slide" 271 | } 272 | }, 273 | "source": [ 274 | "### A single, ordered series" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 5, 280 | "metadata": { 281 | "slideshow": { 282 | "slide_type": "fragment" 283 | } 284 | }, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
" 325 | ], 326 | "text/plain": [ 327 | " date rate\n", 328 | "0 2015-12-30 2.40\n", 329 | "1 2016-01-04 2.56" 330 | ] 331 | }, 332 | "execution_count": 5, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "tbill = pd.DataFrame({\n", 339 | " 'date': [pd.to_datetime(x) for x in ['2015-12-30', '2016-01-04']],\n", 340 | " 'rate': [2.40, 2.56]\n", 341 | "})\n", 342 | "tbill" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": { 348 | "slideshow": { 349 | "slide_type": "slide" 350 | } 351 | }, 352 | "source": [ 353 | "## Merge data that is grouped and ordered\n", 354 | "\n", 355 | "* Left panel is irregularly spaced, e.g. business days\n", 356 | "* Right time series also irregularly spaced, e.g. a sparse subset of the first\n", 357 | " series" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": { 363 | "slideshow": { 364 | "slide_type": "slide" 365 | } 366 | }, 367 | "source": [ 368 | "### How not to do the merge" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": { 374 | "slideshow": { 375 | "slide_type": "fragment" 376 | } 377 | }, 378 | "source": [ 379 | "Don't use plain `pd.merge` and fill forward across groups." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 6, 385 | "metadata": { 386 | "slideshow": { 387 | "slide_type": "fragment" 388 | } 389 | }, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/html": [ 394 | "
" 478 | ], 479 | "text/plain": [ 480 | " ticker date close rate\n", 481 | "0 GOOG 2015-12-29 776.60 NaN\n", 482 | "1 GOOG 2015-12-30 771.00 2.40\n", 483 | "2 GOOG 2015-12-31 758.88 2.40\n", 484 | "3 GOOG 2016-01-04 741.84 2.56\n", 485 | "4 AAPL 2015-12-29 108.74 2.56\n", 486 | "5 AAPL 2015-12-30 107.32 2.40\n", 487 | "6 AAPL 2015-12-31 105.26 2.40\n", 488 | "7 AAPL 2016-01-04 105.35 2.56" 489 | ] 490 | }, 491 | "execution_count": 6, 492 | "metadata": {}, 493 | "output_type": "execute_result" 494 | } 495 | ], 496 | "source": [ 497 | "pd.merge(df, tbill, on='date', how='left').ffill()" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "slideshow": { 504 | "slide_type": "slide" 505 | } 506 | }, 507 | "source": [ 508 | "### Merge Ordered V2" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 7, 514 | "metadata": { 515 | "slideshow": { 516 | "slide_type": "fragment" 517 | } 518 | }, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/html": [ 523 | "
" 607 | ], 608 | "text/plain": [ 609 | " ticker date close rate\n", 610 | "0 GOOG 2015-12-29 776.60 NaN\n", 611 | "1 GOOG 2015-12-30 771.00 2.40\n", 612 | "2 GOOG 2015-12-31 758.88 2.40\n", 613 | "3 GOOG 2016-01-04 741.84 2.56\n", 614 | "4 AAPL 2015-12-29 108.74 NaN\n", 615 | "5 AAPL 2015-12-30 107.32 2.40\n", 616 | "6 AAPL 2015-12-31 105.26 2.40\n", 617 | "7 AAPL 2016-01-04 105.35 2.56" 618 | ] 619 | }, 620 | "execution_count": 7, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "mkt = pd.merge_ordered(df, tbill, on='date', left_by='ticker', fill_method='ffill')\n", 627 | "mkt" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": { 633 | "slideshow": { 634 | "slide_type": "slide" 635 | } 636 | }, 637 | "source": [ 638 | "# Reshaping & Pivoting" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": { 644 | "slideshow": { 645 | "slide_type": "slide" 646 | } 647 | }, 648 | "source": [ 649 | "## Wide and Long Formats\n", 650 | "\n", 651 | "* Depending on the operation or the data storage location, data is stored\n", 652 | "  in a \"wide\" or \"long\" format" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": { 658 | "slideshow": { 659 | "slide_type": "slide" 660 | } 661 | }, 662 | "source": [ 663 | "### Long Format\n", 664 | "\n", 665 | "* Common format for data in relational databases because it allows\n", 666 | "  new attributes without a schema change\n", 667 | "* \"Long\" format is also called \"stacked\" or \"record\" format in the\n", 668 | "  `pandas` documentation. Also called `Entity-Attribute-Value (EAV)`\n", 669 | "* \"Sparse\" by design" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": { 675 | "slideshow": { 676 | "slide_type": "skip" 677 | } 678 | }, 679 | "source": [ 680 | "TODO: Include diagram of Long Format" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 8, 686 | "metadata": { 687 | "slideshow": { 688 | "slide_type": "skip" 689 | } 690 | }, 691 | "outputs": [], 692 | "source": [ 693 | "def make_long_aapl():\n", 694 | "    day_1 = dt.date(2015, 12, 29)\n", 695 | "    day_2 = dt.date(2015, 12, 30)\n", 696 | "    \n", 697 | "    col_close = 'close'\n", 698 | "    col_open = 'open'\n", 699 | "    \n", 700 | "    cols = ['date', 'ticker', 'variable', 'value']\n", 701 | "    \n", 702 | "    rv = pd.DataFrame([\n", 703 | "        {'ticker': 'AAPL', 'date': day_1, 'variable': col_open, 'value': 106.96},\n", 704 | "        {'ticker': 'AAPL', 'date': day_1, 'variable': col_close, 'value': 108.74},\n", 705 | "        {'ticker': 'AAPL', 'date': day_2, 'variable': col_open, 'value': 108.58},\n", 706 | "        {'ticker': 'AAPL', 'date': day_2, 'variable': col_close, 'value': 107.32}\n", 707 | "    ], columns=cols)\n", 708 | "    \n", 709 | "    return rv" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": { 715 | "slideshow": { 716 | "slide_type": "slide" 717 | } 718 | }, 719 | "source": [ 720 | "### Simplest Long Format\n", 721 | "\n", 722 | "* Multiple attributes for a single entity (AAPL)\n", 723 | "* Row for every period (12/29 & 12/30) x (number of attributes)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 9, 729 | "metadata": { 730 | "slideshow": { 731 | "slide_type": "fragment" 732 | } 733 | }, 734 | "outputs": [ 735 | { 736 | "data": { 737 | "text/html": [ 738 | "
" 794 | ], 795 | "text/plain": [ 796 | " date ticker variable value\n", 797 | "0 2015-12-29 AAPL open 106.96\n", 798 | "1 2015-12-29 AAPL close 108.74\n", 799 | "2 2015-12-30 AAPL open 108.58\n", 800 | "3 2015-12-30 AAPL close 107.32" 801 | ] 802 | }, 803 | "execution_count": 9, 804 | "metadata": {}, 805 | "output_type": "execute_result" 806 | } 807 | ], 808 | "source": [ 809 | "aapl_long = make_long_aapl()\n", 810 | "aapl_long" 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": { 816 | "slideshow": { 817 | "slide_type": "slide" 818 | } 819 | }, 820 | "source": [ 821 | "### Wide Format\n", 822 | "\n", 823 | "* Identifiers stored in the index\n", 824 | "* Each attribute has its own column\n", 825 | "* Common format for use by machine learning algorithms" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "metadata": { 831 | "slideshow": { 832 | "slide_type": "skip" 833 | } 834 | }, 835 | "source": [ 836 | "TODO: Include diagram of wide format" 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": { 842 | "slideshow": { 843 | "slide_type": "slide" 844 | } 845 | }, 846 | "source": [ 847 | "### Long-to-Wide" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 10, 853 | "metadata": { 854 | "slideshow": { 855 | "slide_type": "fragment" 856 | } 857 | }, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/html": [ 862 | "
" 918 | ], 919 | "text/plain": [ 920 | " date ticker variable value\n", 921 | "0 2015-12-29 AAPL open 106.96\n", 922 | "1 2015-12-29 AAPL close 108.74\n", 923 | "2 2015-12-30 AAPL open 108.58\n", 924 | "3 2015-12-30 AAPL close 107.32" 925 | ] 926 | }, 927 | "execution_count": 10, 928 | "metadata": {}, 929 | "output_type": "execute_result" 930 | } 931 | ], 932 | "source": [ 933 | "aapl_long" 934 | ] 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": 11, 939 | "metadata": { 940 | "slideshow": { 941 | "slide_type": "fragment" 942 | } 943 | }, 944 | "outputs": [ 945 | { 946 | "data": { 947 | "text/html": [ 948 | "
" 989 | ], 990 | "text/plain": [ 991 | "variable close open\n", 992 | "date \n", 993 | "2015-12-29 108.74 106.96\n", 994 | "2015-12-30 107.32 108.58" 995 | ] 996 | }, 997 | "execution_count": 11, 998 | "metadata": {}, 999 | "output_type": "execute_result" 1000 | } 1001 | ], 1002 | "source": [ 1003 | "aapl_long.pivot(index='date', columns='variable', values='value')" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": { 1009 | "slideshow": { 1010 | "slide_type": "slide" 1011 | } 1012 | }, 1013 | "source": [ 1014 | "### Long-to-Wide with multiple ID columns" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": 12, 1020 | "metadata": { 1021 | "slideshow": { 1022 | "slide_type": "fragment" 1023 | } 1024 | }, 1025 | "outputs": [ 1026 | { 1027 | "data": { 1028 | "text/html": [ 1029 | "
" 1083 | ], 1084 | "text/plain": [ 1085 | " value \n", 1086 | "variable close open\n", 1087 | "date ticker \n", 1088 | "2015-12-29 AAPL 108.74 106.96\n", 1089 | "2015-12-30 AAPL 107.32 108.58" 1090 | ] 1091 | }, 1092 | "execution_count": 12, 1093 | "metadata": {}, 1094 | "output_type": "execute_result" 1095 | } 1096 | ], 1097 | "source": [ 1098 | "aapl_wide = aapl_long.set_index(['date', 'ticker', 'variable']).unstack()\n", 1099 | "aapl_wide" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "markdown", 1104 | "metadata": { 1105 | "slideshow": { 1106 | "slide_type": "slide" 1107 | } 1108 | }, 1109 | "source": [ 1110 | "### Wide-to-Long" 1111 | ] 1112 | }, 1113 | { 1114 | "cell_type": "code", 1115 | "execution_count": 13, 1116 | "metadata": { 1117 | "slideshow": { 1118 | "slide_type": "fragment" 1119 | } 1120 | }, 1121 | "outputs": [ 1122 | { 1123 | "data": { 1124 | "text/html": [ 1125 | "
" 1181 | ], 1182 | "text/plain": [ 1183 | " date ticker variable value\n", 1184 | "0 2015-12-29 AAPL close 108.74\n", 1185 | "1 2015-12-29 AAPL open 106.96\n", 1186 | "2 2015-12-30 AAPL close 107.32\n", 1187 | "3 2015-12-30 AAPL open 108.58" 1188 | ] 1189 | }, 1190 | "execution_count": 13, 1191 | "metadata": {}, 1192 | "output_type": "execute_result" 1193 | } 1194 | ], 1195 | "source": [ 1196 | "aapl_wide.stack().reset_index()" 1197 | ] 1198 | }, 1199 | { 1200 | "cell_type": "markdown", 1201 | "metadata": { 1202 | "slideshow": { 1203 | "slide_type": "slide" 1204 | } 1205 | }, 1206 | "source": [ 1207 | "## Pivot Tables" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "code", 1212 | "execution_count": 14, 1213 | "metadata": { 1214 | "slideshow": { 1215 | "slide_type": "fragment" 1216 | } 1217 | }, 1218 | "outputs": [ 1219 | { 1220 | "data": { 1221 | "text/html": [ 1222 | "
" 1306 | ], 1307 | "text/plain": [ 1308 | " ticker date close rate\n", 1309 | "0 GOOG 2015-12-29 776.60 NaN\n", 1310 | "1 GOOG 2015-12-30 771.00 2.40\n", 1311 | "2 GOOG 2015-12-31 758.88 2.40\n", 1312 | "3 GOOG 2016-01-04 741.84 2.56\n", 1313 | "4 AAPL 2015-12-29 108.74 NaN\n", 1314 | "5 AAPL 2015-12-30 107.32 2.40\n", 1315 | "6 AAPL 2015-12-31 105.26 2.40\n", 1316 | "7 AAPL 2016-01-04 105.35 2.56" 1317 | ] 1318 | }, 1319 | "execution_count": 14, 1320 | "metadata": {}, 1321 | "output_type": "execute_result" 1322 | } 1323 | ], 1324 | "source": [ 1325 | "mkt" 1326 | ] 1327 | }, 1328 | { 1329 | "cell_type": "markdown", 1330 | "metadata": { 1331 | "slideshow": { 1332 | "slide_type": "slide" 1333 | } 1334 | }, 1335 | "source": [ 1336 | "### Simple Pivot Table" 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": 15, 1342 | "metadata": { 1343 | "slideshow": { 1344 | "slide_type": "fragment" 1345 | } 1346 | }, 1347 | "outputs": [ 1348 | { 1349 | "data": { 1350 | "text/html": [ 1351 | "
" 1392 | ], 1393 | "text/plain": [ 1394 | " close rate\n", 1395 | "ticker \n", 1396 | "AAPL 106.67 2.45\n", 1397 | "GOOG 762.08 2.45" 1398 | ] 1399 | }, 1400 | "execution_count": 15, 1401 | "metadata": {}, 1402 | "output_type": "execute_result" 1403 | } 1404 | ], 1405 | "source": [ 1406 | "select_cols = ['ticker', 'close', 'rate']\n", 1407 | "pd.pivot_table(mkt.loc[:, select_cols], index='ticker', aggfunc='mean')" 1408 | ] 1409 | } 1410 | ], 1411 | "metadata": { 1412 | "celltoolbar": "Slideshow", 1413 | "kernelspec": { 1414 | "display_name": "Python 3 (ipykernel)", 1415 | "language": "python", 1416 | "name": "python3" 1417 | }, 1418 | "language_info": { 1419 | "codemirror_mode": { 1420 | "name": "ipython", 1421 | "version": 3 1422 | }, 1423 | "file_extension": ".py", 1424 | "mimetype": "text/x-python", 1425 | "name": "python", 1426 | "nbconvert_exporter": "python", 1427 | "pygments_lexer": "ipython3", 1428 | "version": "3.10.8" 1429 | }, 1430 | "toc": { 1431 | "base_numbering": 1, 1432 | "nav_menu": {}, 1433 | "number_sections": false, 1434 | "sideBar": true, 1435 | "skip_h1_title": true, 1436 | "title_cell": "Table of Contents", 1437 | "title_sidebar": "Contents", 1438 | "toc_cell": false, 1439 | "toc_position": { 1440 | "height": "calc(100% - 180px)", 1441 | "left": "10px", 1442 | "top": "150px", 1443 | "width": "351px" 1444 | }, 1445 | "toc_section_display": true, 1446 | "toc_window_display": false 1447 | } 1448 | }, 1449 | "nbformat": 4, 1450 | "nbformat_minor": 4 1451 | } 1452 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | (c) 2020 Daniel J. Gerlanc 2 | 3 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 4 | Public License 5 | 6 | By exercising the Licensed Rights (defined below), You accept and agree 7 | to be bound by the terms and conditions of this Creative Commons 8 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 9 | ("Public License"). To the extent this Public License may be 10 | interpreted as a contract, You are granted the Licensed Rights in 11 | consideration of Your acceptance of these terms and conditions, and the 12 | Licensor grants You such rights in consideration of benefits the 13 | Licensor receives from making the Licensed Material available under 14 | these terms and conditions. 15 | 16 | 17 | Section 1 -- Definitions. 18 | 19 | a. Adapted Material means material subject to Copyright and Similar 20 | Rights that is derived from or based upon the Licensed Material 21 | and in which the Licensed Material is translated, altered, 22 | arranged, transformed, or otherwise modified in a manner requiring 23 | permission under the Copyright and Similar Rights held by the 24 | Licensor. For purposes of this Public License, where the Licensed 25 | Material is a musical work, performance, or sound recording, 26 | Adapted Material is always produced where the Licensed Material is 27 | synched in timed relation with a moving image. 28 | 29 | b. Adapter's License means the license You apply to Your Copyright 30 | and Similar Rights in Your contributions to Adapted Material in 31 | accordance with the terms and conditions of this Public License. 32 | 33 | c. BY-NC-SA Compatible License means a license listed at 34 | creativecommons.org/compatiblelicenses, approved by Creative 35 | Commons as essentially the equivalent of this Public License. 36 | 37 | d. 
Copyright and Similar Rights means copyright and/or similar rights 38 | closely related to copyright including, without limitation, 39 | performance, broadcast, sound recording, and Sui Generis Database 40 | Rights, without regard to how the rights are labeled or 41 | categorized. For purposes of this Public License, the rights 42 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 43 | Rights. 44 | 45 | e. Effective Technological Measures means those measures that, in the 46 | absence of proper authority, may not be circumvented under laws 47 | fulfilling obligations under Article 11 of the WIPO Copyright 48 | Treaty adopted on December 20, 1996, and/or similar international 49 | agreements. 50 | 51 | f. Exceptions and Limitations means fair use, fair dealing, and/or 52 | any other exception or limitation to Copyright and Similar Rights 53 | that applies to Your use of the Licensed Material. 54 | 55 | g. License Elements means the license attributes listed in the name 56 | of a Creative Commons Public License. The License Elements of this 57 | Public License are Attribution, NonCommercial, and ShareAlike. 58 | 59 | h. Licensed Material means the artistic or literary work, database, 60 | or other material to which the Licensor applied this Public 61 | License. 62 | 63 | i. Licensed Rights means the rights granted to You subject to the 64 | terms and conditions of this Public License, which are limited to 65 | all Copyright and Similar Rights that apply to Your use of the 66 | Licensed Material and that the Licensor has authority to license. 67 | 68 | j. Licensor means the individual(s) or entity(ies) granting rights 69 | under this Public License. 70 | 71 | k. NonCommercial means not primarily intended for or directed towards 72 | commercial advantage or monetary compensation. For purposes of 73 | this Public License, the exchange of the Licensed Material for 74 | other material subject to Copyright and Similar Rights by digital 75 | file-sharing or similar means is NonCommercial provided there is 76 | no payment of monetary compensation in connection with the 77 | exchange. 78 | 79 | l. Share means to provide material to the public by any means or 80 | process that requires permission under the Licensed Rights, such 81 | as reproduction, public display, public performance, distribution, 82 | dissemination, communication, or importation, and to make material 83 | available to the public including in ways that members of the 84 | public may access the material from a place and at a time 85 | individually chosen by them. 86 | 87 | m. Sui Generis Database Rights means rights other than copyright 88 | resulting from Directive 96/9/EC of the European Parliament and of 89 | the Council of 11 March 1996 on the legal protection of databases, 90 | as amended and/or succeeded, as well as other essentially 91 | equivalent rights anywhere in the world. 92 | 93 | n. You means the individual or entity exercising the Licensed Rights 94 | under this Public License. Your has a corresponding meaning. 95 | 96 | 97 | Section 2 -- Scope. 98 | 99 | a. License grant. 100 | 101 | 1. Subject to the terms and conditions of this Public License, 102 | the Licensor hereby grants You a worldwide, royalty-free, 103 | non-sublicensable, non-exclusive, irrevocable license to 104 | exercise the Licensed Rights in the Licensed Material to: 105 | 106 | a. reproduce and Share the Licensed Material, in whole or 107 | in part, for NonCommercial purposes only; and 108 | 109 | b. 
produce, reproduce, and Share Adapted Material for 110 | NonCommercial purposes only. 111 | 112 | 2. Exceptions and Limitations. For the avoidance of doubt, where 113 | Exceptions and Limitations apply to Your use, this Public 114 | License does not apply, and You do not need to comply with 115 | its terms and conditions. 116 | 117 | 3. Term. The term of this Public License is specified in Section 118 | 6(a). 119 | 120 | 4. Media and formats; technical modifications allowed. The 121 | Licensor authorizes You to exercise the Licensed Rights in 122 | all media and formats whether now known or hereafter created, 123 | and to make technical modifications necessary to do so. The 124 | Licensor waives and/or agrees not to assert any right or 125 | authority to forbid You from making technical modifications 126 | necessary to exercise the Licensed Rights, including 127 | technical modifications necessary to circumvent Effective 128 | Technological Measures. For purposes of this Public License, 129 | simply making modifications authorized by this Section 2(a) 130 | (4) never produces Adapted Material. 131 | 132 | 5. Downstream recipients. 133 | 134 | a. Offer from the Licensor -- Licensed Material. Every 135 | recipient of the Licensed Material automatically 136 | receives an offer from the Licensor to exercise the 137 | Licensed Rights under the terms and conditions of this 138 | Public License. 139 | 140 | b. Additional offer from the Licensor -- Adapted Material. 141 | Every recipient of Adapted Material from You 142 | automatically receives an offer from the Licensor to 143 | exercise the Licensed Rights in the Adapted Material 144 | under the conditions of the Adapter's License You apply. 145 | 146 | c. No downstream restrictions. You may not offer or impose 147 | any additional or different terms or conditions on, or 148 | apply any Effective Technological Measures to, the 149 | Licensed Material if doing so restricts exercise of the 150 | Licensed Rights by any recipient of the Licensed 151 | Material. 152 | 153 | 6. No endorsement. Nothing in this Public License constitutes or 154 | may be construed as permission to assert or imply that You 155 | are, or that Your use of the Licensed Material is, connected 156 | with, or sponsored, endorsed, or granted official status by, 157 | the Licensor or others designated to receive attribution as 158 | provided in Section 3(a)(1)(A)(i). 159 | 160 | b. Other rights. 161 | 162 | 1. Moral rights, such as the right of integrity, are not 163 | licensed under this Public License, nor are publicity, 164 | privacy, and/or other similar personality rights; however, to 165 | the extent possible, the Licensor waives and/or agrees not to 166 | assert any such rights held by the Licensor to the limited 167 | extent necessary to allow You to exercise the Licensed 168 | Rights, but not otherwise. 169 | 170 | 2. Patent and trademark rights are not licensed under this 171 | Public License. 172 | 173 | 3. To the extent possible, the Licensor waives any right to 174 | collect royalties from You for the exercise of the Licensed 175 | Rights, whether directly or through a collecting society 176 | under any voluntary or waivable statutory or compulsory 177 | licensing scheme. In all other cases the Licensor expressly 178 | reserves any right to collect such royalties, including when 179 | the Licensed Material is used other than for NonCommercial 180 | purposes. 181 | 182 | 183 | Section 3 -- License Conditions. 
184 | 185 | Your exercise of the Licensed Rights is expressly made subject to the 186 | following conditions. 187 | 188 | a. Attribution. 189 | 190 | 1. If You Share the Licensed Material (including in modified 191 | form), You must: 192 | 193 | a. retain the following if it is supplied by the Licensor 194 | with the Licensed Material: 195 | 196 | i. identification of the creator(s) of the Licensed 197 | Material and any others designated to receive 198 | attribution, in any reasonable manner requested by 199 | the Licensor (including by pseudonym if 200 | designated); 201 | 202 | ii. a copyright notice; 203 | 204 | iii. a notice that refers to this Public License; 205 | 206 | iv. a notice that refers to the disclaimer of 207 | warranties; 208 | 209 | v. a URI or hyperlink to the Licensed Material to the 210 | extent reasonably practicable; 211 | 212 | b. indicate if You modified the Licensed Material and 213 | retain an indication of any previous modifications; and 214 | 215 | c. indicate the Licensed Material is licensed under this 216 | Public License, and include the text of, or the URI or 217 | hyperlink to, this Public License. 218 | 219 | 2. You may satisfy the conditions in Section 3(a)(1) in any 220 | reasonable manner based on the medium, means, and context in 221 | which You Share the Licensed Material. For example, it may be 222 | reasonable to satisfy the conditions by providing a URI or 223 | hyperlink to a resource that includes the required 224 | information. 225 | 3. If requested by the Licensor, You must remove any of the 226 | information required by Section 3(a)(1)(A) to the extent 227 | reasonably practicable. 228 | 229 | b. ShareAlike. 230 | 231 | In addition to the conditions in Section 3(a), if You Share 232 | Adapted Material You produce, the following conditions also apply. 233 | 234 | 1. The Adapter's License You apply must be a Creative Commons 235 | license with the same License Elements, this version or 236 | later, or a BY-NC-SA Compatible License. 237 | 238 | 2. You must include the text of, or the URI or hyperlink to, the 239 | Adapter's License You apply. You may satisfy this condition 240 | in any reasonable manner based on the medium, means, and 241 | context in which You Share Adapted Material. 242 | 243 | 3. You may not offer or impose any additional or different terms 244 | or conditions on, or apply any Effective Technological 245 | Measures to, Adapted Material that restrict exercise of the 246 | rights granted under the Adapter's License You apply. 247 | 248 | 249 | Section 4 -- Sui Generis Database Rights. 250 | 251 | Where the Licensed Rights include Sui Generis Database Rights that 252 | apply to Your use of the Licensed Material: 253 | 254 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 255 | to extract, reuse, reproduce, and Share all or a substantial 256 | portion of the contents of the database for NonCommercial purposes 257 | only; 258 | 259 | b. if You include all or a substantial portion of the database 260 | contents in a database in which You have Sui Generis Database 261 | Rights, then the database in which You have Sui Generis Database 262 | Rights (but not its individual contents) is Adapted Material, 263 | including for purposes of Section 3(b); and 264 | 265 | c. You must comply with the conditions in Section 3(a) if You Share 266 | all or a substantial portion of the contents of the database. 
267 | 268 | For the avoidance of doubt, this Section 4 supplements and does not 269 | replace Your obligations under this Public License where the Licensed 270 | Rights include other Copyright and Similar Rights. 271 | 272 | 273 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 274 | 275 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 276 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 277 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 278 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 279 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 280 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 281 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 282 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 283 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 284 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 285 | 286 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 287 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 288 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 289 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 290 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 291 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 292 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 293 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 294 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 295 | 296 | c. The disclaimer of warranties and limitation of liability provided 297 | above shall be interpreted in a manner that, to the extent 298 | possible, most closely approximates an absolute disclaimer and 299 | waiver of all liability. 300 | 301 | 302 | Section 6 -- Term and Termination. 303 | 304 | a. This Public License applies for the term of the Copyright and 305 | Similar Rights licensed here. However, if You fail to comply with 306 | this Public License, then Your rights under this Public License 307 | terminate automatically. 308 | 309 | b. Where Your right to use the Licensed Material has terminated under 310 | Section 6(a), it reinstates: 311 | 312 | 1. automatically as of the date the violation is cured, provided 313 | it is cured within 30 days of Your discovery of the 314 | violation; or 315 | 316 | 2. upon express reinstatement by the Licensor. 317 | 318 | For the avoidance of doubt, this Section 6(b) does not affect any 319 | right the Licensor may have to seek remedies for Your violations 320 | of this Public License. 321 | 322 | c. For the avoidance of doubt, the Licensor may also offer the 323 | Licensed Material under separate terms or conditions or stop 324 | distributing the Licensed Material at any time; however, doing so 325 | will not terminate this Public License. 326 | 327 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 328 | License. 329 | 330 | 331 | Section 7 -- Other Terms and Conditions. 332 | 333 | a. The Licensor shall not be bound by any additional or different 334 | terms or conditions communicated by You unless expressly agreed. 335 | 336 | b. Any arrangements, understandings, or agreements regarding the 337 | Licensed Material not stated herein are separate from and 338 | independent of the terms and conditions of this Public License. 339 | 340 | 341 | Section 8 -- Interpretation. 342 | 343 | a. 
For the avoidance of doubt, this Public License does not, and 344 | shall not be interpreted to, reduce, limit, restrict, or impose 345 | conditions on any use of the Licensed Material that could lawfully 346 | be made without permission under this Public License. 347 | 348 | b. To the extent possible, if any provision of this Public License is 349 | deemed unenforceable, it shall be automatically reformed to the 350 | minimum extent necessary to make it enforceable. If the provision 351 | cannot be reformed, it shall be severed from this Public License 352 | without affecting the enforceability of the remaining terms and 353 | conditions. 354 | 355 | c. No term or condition of this Public License will be waived and no 356 | failure to comply consented to unless expressly agreed to by the 357 | Licensor. 358 | 359 | d. Nothing in this Public License constitutes or may be interpreted 360 | as a limitation upon, or waiver of, any privileges and immunities 361 | that apply to the Licensor or You, including from the legal 362 | processes of any jurisdiction or authority. 363 | 364 | ======================================================================= 365 | 366 | Creative Commons is not a party to its public 367 | licenses. Notwithstanding, Creative Commons may elect to apply one of 368 | its public licenses to material it publishes and in those instances 369 | will be considered the “Licensor.” The text of the Creative Commons 370 | public licenses is dedicated to the public domain under the CC0 Public 371 | Domain Dedication. Except for the limited purpose of indicating that 372 | material is shared under a Creative Commons public license or as 373 | otherwise permitted by the Creative Commons policies published at 374 | creativecommons.org/policies, Creative Commons does not authorize the 375 | use of the trademark "Creative Commons" or any other trademark or logo 376 | of Creative Commons without its prior written consent including, 377 | without limitation, in connection with any unauthorized modifications 378 | to any of its public licenses or any other arrangements, 379 | understandings, or agreements concerning use of licensed material. For 380 | the avoidance of doubt, this paragraph does not form part of the 381 | public licenses. 382 | 383 | Creative Commons may be contacted at creativecommons.org. 384 | 385 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Programming with Data: Python and Pandas 2 | 3 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dgerlanc/programming-with-data/main?urlpath=lab) 4 | 5 | This repository contains the slides, exercises, and answers for *Programming 6 | with Data: Python and Pandas*. The goal of this tutorial is to teach you, 7 | someone with experience programming in Python, most of the features available in 8 | Pandas. The material from this course has been presented at conferences 9 | including ODSC and Battlefin Discovery Data and online through the O'Reilly 10 | platform. 11 | 12 | ## Why this course exists 13 | Whether in R, MATLAB, Stata, or python, modern data analysis, for many 14 | researchers, requires some kind of programming. The preponderance of tools and 15 | specialized languages for data analysis suggests that general purpose 16 | programming languages like C and Java do not readily address the needs of data 17 | scientists; something more is needed. 
18 | 19 | In this workshop, you will learn how to accelerate your data analyses using the 20 | Python language and Pandas, a library specifically designed for interactive data 21 | analysis. Pandas is a massive library, so we will focus on its core 22 | functionality, specifically, loading, filtering, grouping, and transforming 23 | data. Having completed this workshop, you will understand the fundamentals of 24 | Pandas, be aware of common pitfalls, and be ready to perform your own analyses. 25 | 26 | ### Prerequisites: 27 | 28 | The workshop assumes that participants have intermediate-level programming ability 29 | in Python. Participants should know the difference between a `dict`, `list`, and 30 | `tuple`. Familiarity with control flow (`if/else/for/while`) and error handling 31 | (`try/except`) is required. 32 | 33 | No statistics background is required. 34 | 35 | ## Installation 36 | 37 | ### Binder 38 | 39 | If you have a stable Internet connection and the free Binder service isn't under 40 | too much load, the easiest way to interactively run the slides and try the 41 | exercises is to click the Binder badge (make sure you open it in a new window). 42 | Keep in mind that Binder aggressively shuts down idle instances, so you'll need to 43 | refresh the link if you're idle for too long. 44 | 45 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dgerlanc/programming-with-data/main) 46 | 47 | ### Prerendered Notebooks 48 | 49 | You may view the HTML versions of the slides and the answers directly in your browser on GitHub, 50 | though you will not be able to run them interactively: 51 | 52 | * [Lesson 1 - Series](https://github.com/dgerlanc/programming-with-data/blob/main/01-intro-to-pandas-part-1-slides.ipynb) 53 | * [Lesson 2 - DataFrames](https://github.com/dgerlanc/programming-with-data/blob/main/02-intro-to-pandas-part-2-slides.ipynb) 54 | * [Lesson 3 - Split, Apply, Combine](https://github.com/dgerlanc/programming-with-data/blob/main/03-group-apply-slides.ipynb) 55 | * [Lesson 4 - Time Series](https://github.com/dgerlanc/programming-with-data/blob/main/04-time-series-slides.ipynb) 56 | * [Lesson 5 - Merge and Concat](https://github.com/dgerlanc/programming-with-data/blob/main/05-merge-pivot-slides.ipynb) 57 | * [Lesson 6 - Advanced Merge and Reshape](https://github.com/dgerlanc/programming-with-data/blob/main/06-advanced-merge-reshape-slides.ipynb) 58 | 59 | ### Local Installation 60 | 61 | If you're taking the course, want to follow along with the slides and do the 62 | exercises, and may not have Internet access, download and 63 | install the Anaconda Python 3 distribution and the `conda` package manager 64 | ahead of time: 65 | 66 | ``` 67 | https://www.anaconda.com/download/ 68 | ``` 69 | 70 | Download the latest version of the course materials 71 | [here](https://github.com/dgerlanc/programming-with-data/archive/main.zip). 72 | 73 | Alternatively, you may clone the course repository using `git`: 74 | 75 | ``` 76 | $ git clone https://github.com/dgerlanc/programming-with-data.git 77 | ``` 78 | 79 | The remainder of the installation requires that you use the command line. 
80 | 81 | To complete the course exercises, you must use `conda` to install the 82 | dependencies specified in the `environment.yml` file in the repository: 83 | 84 | ``` 85 | $ conda env create -f environment.yml 86 | ``` 87 | 88 | This will create a `conda` environment called `progwd`, which may be 89 | "activated" with the following commands: 90 | 91 | * Windows: `activate progwd` 92 | * Linux and Mac: `conda activate progwd` 93 | 94 | Once you've activated the environment, your prompt will probably 95 | look something like this: 96 | 97 | ``` 98 | (progwd) $ 99 | ``` 100 | 101 | The entire course is designed to use `jupyter` notebooks. To get started, 102 | launch the notebook server: 103 | 104 | ``` 105 | (progwd) $ jupyter lab 106 | ``` 107 | 108 | ## Feedback 109 | 110 | Your feedback on the course helps to improve it for future students. 111 | Please leave feedback [here](https://danielgerlanc.typeform.com/to/RyB6AJ). 112 | -------------------------------------------------------------------------------- /assets/data-label-arrays.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/data-label-arrays.png -------------------------------------------------------------------------------- /assets/enplus-logo-colored.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/enplus-logo-colored.png -------------------------------------------------------------------------------- /assets/enplus-logo-colored.svg: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /assets/full-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/full-join.png -------------------------------------------------------------------------------- /assets/inner-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/inner-join.png -------------------------------------------------------------------------------- /assets/lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/lag.png -------------------------------------------------------------------------------- /assets/lead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/lead.png -------------------------------------------------------------------------------- /assets/left-join.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/left-join.png -------------------------------------------------------------------------------- /assets/lesson-01-key-value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/lesson-01-key-value.png -------------------------------------------------------------------------------- /assets/right-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/right-join.png -------------------------------------------------------------------------------- /assets/rolling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/rolling.png -------------------------------------------------------------------------------- /assets/split-apply-combine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/split-apply-combine.png -------------------------------------------------------------------------------- /assets/stock-trading-1600x1200.jpg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bbdaf1957fa2eb0f2cd77f989bdcbad40630f947451baa95b19cfc030b484c28 3 | size 144575 4 | -------------------------------------------------------------------------------- /assets/vectorized-multiplication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/vectorized-multiplication.png -------------------------------------------------------------------------------- /assets/venn-diagrams.sketch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/assets/venn-diagrams.sketch -------------------------------------------------------------------------------- /binder/environment.yml: -------------------------------------------------------------------------------- 1 | name: progwd 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.10.8 6 | - pandas=1.5.2 7 | - numpy 8 | -------------------------------------------------------------------------------- /build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import subprocess 4 | from functools import partial 5 | from multiprocessing import cpu_count, Pool 6 | from pathlib import Path 7 | 8 | # TODO: Only update files changed since last run 9 | # TODO: Add conversion tool from existing answer notebooks 10 | # TODO: add test for errors when individual tasks fail 11 | 12 | 13 | def answer2exercise(infile, outfile): 14 | """ 15 | Convert answer notebooks to exercise notebooks 16 | 17 | TODO: Fail if output notebook is empty? 
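
    Example (hypothetical call; any of the repo's ``*-answers.ipynb``
    notebooks converts to its matching ``*-exercises.ipynb`` name)::

        answer2exercise(
            "01-intro-to-pandas-part-1-answers.ipynb",
            "01-intro-to-pandas-part-1-exercises.ipynb",
        )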
18 | 19 | """ 20 | cmd = "jupyter nbconvert --config config.py --to notebook --output".split() 21 | cmd.extend([outfile, infile]) 22 | subprocess.run(cmd) 23 | 24 | 25 | def slide2html(infile): 26 | """ 27 | Convert slide notebooks to reveal.js 28 | 29 | """ 30 | cmd = ( 31 | "jupyter nbconvert" 32 | " --to slides" 33 | " --reveal-prefix=reveal.js" 34 | " --SlidesExporter.file_extension=.html" 35 | " --output-dir build" 36 | ).split() 37 | cmd.append(str(infile)) 38 | subprocess.run(cmd) 39 | 40 | 41 | def run_slide(infile): 42 | cmd = "jupyter nbconvert --to notebook --inplace --execute".split() 43 | cmd.append(str(infile)) 44 | 45 | devnull = subprocess.DEVNULL 46 | subprocess.run(cmd, check=True, stdout=devnull, stderr=devnull) 47 | 48 | 49 | def main(): 50 | p = Path(".") 51 | 52 | slide_fns = sorted(str(x) for x in p.glob("*slides.ipynb")) 53 | answer_nbs = sorted(str(x) for x in p.glob("*answers.ipynb")) 54 | exercise_nbs = [x.replace("answer", "exercise") for x in answer_nbs] 55 | 56 | n_cpus = cpu_count() 57 | with Pool(n_cpus) as pool: 58 | print("Running notebooks") 59 | pool.map(run_slide, slide_fns) 60 | 61 | print("ipynb slides -> reveal.js html") 62 | pool.map(slide2html, slide_fns) 63 | 64 | print("Convert answers to exercises") 65 | # print(f'{answer_nb} -> {exercise_nb}') 66 | pool.starmap(answer2exercise, zip(answer_nbs, exercise_nbs)) 67 | 68 | # copy over assets 69 | 70 | # html slides -> pdf 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # 1) Run all notebooks and check for errors 6 | # jupyter nbconvert --inplace --to notebook --execute *-slides.ipynb 7 | 8 | # 2) Convert answers to exercises. 
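#    (cell filtering comes from config.py's ExercisePreprocessor, which keeps
#    only cells tagged 'setup' or 'exercise'; ClearOutputPreprocessor then
#    strips the outputs)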
9 | # jupyter nbconvert --config config.py --to notebook \ 10 | # --output 04-merge-pivot-exercises.ipynb 04-merge-pivot-answers.ipynb 11 | 12 | # ls *slides.ipynb | parallel jupyter nbconvert --to slides --output-dir build 13 | jupyter nbconvert --to slides --reveal-prefix=reveal.js --output-dir build *slides.ipynb 14 | cp -a assets build/assets 15 | 16 | # add converting to pdf 17 | # set query string to be ?print-pdf&pdfSeparateFragments=false&pdfMaxPagesPerSlide=1" 18 | 19 | # add combining for classes 20 | # pdfconcat --output programming-with-data-foundations.pdf 0[1-3]*pdf 21 | -------------------------------------------------------------------------------- /build/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | !reveal.js/ 6 | !custom.css 7 | !favicon.ico 8 | -------------------------------------------------------------------------------- /build/custom.css: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /build/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgerlanc/programming-with-data/39e26d54ca885ffb39a10591b7c314db6186e60a/build/favicon.ico -------------------------------------------------------------------------------- /build/reveal.js/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | *.iws 4 | *.eml 5 | out/ 6 | .DS_Store 7 | .svn 8 | log/*.log 9 | tmp/** 10 | node_modules/ 11 | package-lock.json 12 | .sass-cache 13 | css/reveal.min.css 14 | js/reveal.min.js 15 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from nbconvert.preprocessors import Preprocessor 4 | 5 | 6 | def _all_tags(nb): 7 | rv = set() 8 | for cell in nb.cells: 9 | tags = cell.metadata.get("tags") 10 | if tags: 11 | for tag in tags: 12 | rv.add(tag) 13 | return rv 14 | 15 | 16 | class ExercisePreprocessor(Preprocessor): 17 | keep_tags = {"setup", "exercise"} 18 | 19 | def preprocess(self, nb, resources): 20 | cells = [] 21 | for cell in nb.cells: 22 | tags = cell.metadata.get("tags", tuple()) 23 | 24 | if any(tag in self.keep_tags for tag in tags): 25 | # must check if cell.cell_type == 'code' 26 | # cell.execution_count = None 27 | # cell.outputs = [] 28 | cells.append(cell) 29 | 30 | nb.cells = cells 31 | 32 | nb.metadata.pop("celltoolbar", None) 33 | nb.metadata.pop("toc", None) 34 | 35 | return nb, resources 36 | 37 | 38 | c = get_config() # noqa 39 | c.Exporter.preprocessors = [ 40 | ExercisePreprocessor, 41 | "nbconvert.preprocessors.TagRemovePreprocessor", 42 | "nbconvert.preprocessors.ClearOutputPreprocessor", 43 | ] 44 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | 5 | # This works but currently only for a single slide 6 | # slides: 7 | # image: astefanutti/decktape:2.11 8 | # command: "-s 2048x1280 http://nginx/03-group-apply-slides.html slides-03.pdf" 9 | # volumes: 10 | # - "./build:/slides" 11 | # depends_on: 12 | # - nginx 13 | 14 | nginx: 15 | image: nginx:1.17 16 | 
ports: 17 | - "80:80" 18 | restart: always 19 | volumes: 20 | - "./build:/usr/share/nginx/html:ro" 21 | -------------------------------------------------------------------------------- /environment-dev.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - conda-merge 5 | - isort 6 | - jupyterlab 7 | - nbdime 8 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - conda-merge 5 | - isort 6 | - jupyterlab 7 | - nbdime 8 | - numpy 9 | - pandas=1.5.2 10 | - python=3.10.8 11 | name: progwd 12 | -------------------------------------------------------------------------------- /scripts/combine-envs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Merge the binder and dev dependencies into a single environment 4 | 5 | conda-merge binder/environment.yml environment-dev.yml > environment.yml 6 | --------------------------------------------------------------------------------
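For reference, `scripts/combine-envs.sh` resolves `binder/environment.yml` and
`environment-dev.yml` by relative path, so it is meant to be run from the
repository root; `conda-merge` itself is listed as a dependency in the
environment files above. A typical invocation:

```
$ bash scripts/combine-envs.sh
```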